Crawling Multiple Novels

The page to crawl:

http://www.tianyashuku.com/kehuan/

Libraries used: urllib2, re, bs4
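
Note that urllib2 exists only in Python 2, which is what this post targets. For reference, a minimal sketch of the same fetch under Python 3 would swap in urllib.request (the User-Agent string here is arbitrary):

    # Python 3 sketch only -- the rest of this post is Python 2
    from urllib.request import Request, urlopen

    req = Request('http://www.tianyashuku.com/kehuan/',
                  headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read().decode('utf-8')  # assumes the site serves UTF-8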

Fetch the page source (url and headers here are defined in the complete listing below):

    def getPage(self):
        # Fetch the page and decode it as UTF-8
        request=urllib2.Request(url,headers=headers)
        response=urllib2.urlopen(request)
        return response.read().decode('utf-8')
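
A quick self-contained check of this fetch, assuming the site is reachable and still serves UTF-8 (headers as in the complete listing):

    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request('http://www.tianyashuku.com/kehuan/', headers=headers)
    page = urllib2.urlopen(request).read().decode('utf-8')
    print page[:200]  # first 200 characters of the listing page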

Use a regular expression to get each novel's URL (that is, the novel's table-of-contents page):

    def geturl(self,page):
        # Get the novel/chapter URLs from the list items
        pattern = re.compile('<li><a href="(.*?)" title=',re.S)
        result = re.findall(pattern,page)
        return result
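
To see what the pattern captures, here is a quick test against a made-up fragment in the style of the site's markup (the exact HTML is an assumption; a target attribute appears to sit between href and title on the real pages, which is why start() below strips trailing junk from each match):

    # Hypothetical fragment of the listing page
    page = '<li><a href="/kehuan/1/" target="_blank" title="Example">Example</a></li>'
    pattern = re.compile('<li><a href="(.*?)" title=', re.S)
    print re.findall(pattern, page)
    # ['/kehuan/1/" target="_blank'] -- the lazy group stops at the first '" title=',
    # so the capture drags along '" target="_blank', removed later with re.sub
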
Get the novel's title from its table-of-contents page:

    def getTitle(self,page):
        # Extract the title from the <h1> tag
        pattern = re.compile('<h1>(.*?)</h1>',re.S)
        result = re.search(pattern,page)
        return result.group(1).strip()
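
A quick check against a hypothetical fragment (the <h1> placement is an assumption about the site's pages; the title is made up):

    page = u'<h1> 三体 </h1>'
    print re.search(re.compile('<h1>(.*?)</h1>', re.S), page).group(1).strip()
    # prints: 三体

Note that re.search returns None when no <h1> is present, so group(1) would raise; the TOC pages are assumed to always have one.
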
To get each chapter's URL, reuse the geturl method above.

Then use the resulting URLs to fetch the content of every chapter of each novel:

    def neirong(self,page):
        # Use bs4 to extract the chapter text
        soup=BeautifulSoup(page,"html.parser")
        text=soup.find_all("div",class_="neirong")
        for k in text:
            return k.get_text(strip=True)
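
A minimal check of the extraction, assuming the chapter text sits in a div with class "neirong" (the fragment is made up):

    html = '<div class="neirong"><p>First paragraph.</p><p>Second paragraph.</p></div>'
    soup = BeautifulSoup(html, "html.parser")
    for k in soup.find_all("div", class_="neirong"):
        print k.get_text(strip=True)
    # prints: First paragraph.Second paragraph.

Since neirong returns inside the loop, only the first matching div on a page is kept.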

Set the file name used to save the novel:

    def setFileTitle(self,title):
        if title is not None:
            fileName = re.sub(r'[\/:*?"<>|]','-',title) # replace characters that are illegal in file names
            self.file = open(fileName + ".txt","w+")
        else:
            self.file = open("xiaoshuo.txt","w+")
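
For example, a title containing a colon (illegal in Windows file names) is rewritten before use (the title here is made up):

    print re.sub(r'[\/:*?"<>|]', '-', u'三体:黑暗森林')
    # prints: 三体-黑暗森林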

The complete code:

# -*- coding:utf-8 -*-
import urllib2
import re
from bs4 import BeautifulSoup

class XiaoShuo:
    def __init__(self,url):
        self.url = url
    def geturl(self,page):
        # Get the novel/chapter URLs from the list items
        pattern = re.compile('<li><a href="(.*?)" title=',re.S)
        result = re.findall(pattern,page)
        return result
    def seturl(self,result):
        # Prefix each relative URL with the site root
        urls = []
        for k in result:
            urls.append("http://www.tianyashuku.com" + k)
        return urls
    def getPage(self):
        # Fetch the listing page
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        url=self.url
        request=urllib2.Request(url,headers=headers)
        response=urllib2.urlopen(request)
        return response.read().decode('utf-8')
    def getxiaoshuo(self,url):
        # Fetch a page (novel TOC or chapter) by URL
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        request=urllib2.Request(url,headers=headers)
        response=urllib2.urlopen(request)
        return response.read().decode('utf-8')
    def getTitle(self,page):
        # Extract the title from the <h1> tag
        pattern = re.compile('<h1>(.*?)</h1>',re.S)
        result = re.search(pattern,page)
        return result.group(1).strip()
    def setFileTitle(self,title):
        if title is not None:
            fileName = re.sub(r'[\/:*?"<>|]','-',title) # replace characters that are illegal in file names
            self.file = open(fileName + ".txt","w+")
        else:
            self.file = open("xiaoshuo.txt","w+")
    def neirong(self,page):
        # Use bs4 to extract the chapter text
        soup=BeautifulSoup(page,"html.parser")
        text=soup.find_all("div",class_="neirong")
        for k in text:
            return k.get_text(strip=True)
    def start(self):
        # Entry point: walk the listing page, then every novel, then every chapter
        page=self.getPage()
        URL=self.geturl(page) # novel TOC URLs, still carrying the regex junk
        print URL
        for m in URL:
            newurl1=re.sub('" target="_blank','',m) # strip the junk captured after the href
            xiaoshuourl="http://www.tianyashuku.com" +newurl1

            content=self.getxiaoshuo(xiaoshuourl)
            m1=self.geturl(content) # chapter URLs
            title=self.getTitle(content) # novel title
            for k in m1:
                newurl = "http://www.tianyashuku.com" +k
                print newurl
                xiaoshuo=self.getxiaoshuo(newurl)
                a=self.getTitle(xiaoshuo) # chapter title
                b=self.neirong(xiaoshuo) # chapter text
                chapter=a+'\n'+b+'\n'
                print chapter
                f1 = open(title + ".txt",'a') # append the chapter to the novel's file
                f1.write(chapter.encode('utf-8'))
                f1.close()


# url='http://www.51shucheng.net/kehuan/santi/santi1/174.html'
#
URL='http://www.tianyashuku.com/kehuan/'
html = XiaoShuo(URL)
html.start()
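
Run it with Python 2 (urllib2 does not exist under Python 3). One .txt file per novel is written to the working directory, with each chapter appended as it is fetched.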



