Web Scraping Practice: Scraping Novels
Scraping Multiple Novels
Target page:
http://www.tianyashuku.com/kehuan/
Libraries used: urllib2, re, bs4 (urllib2 ships only with Python 2, so the code below targets Python 2)
Fetching the page source:
def getPage(self):
    # send the request with a User-Agent header and decode the response as UTF-8
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(self.url, headers=headers)
    response = urllib2.urlopen(request)
    return response.read().decode('utf-8')
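To verify the fetch in isolation, the same request can be run outside the class (a minimal sketch; it assumes the site is reachable and still serves UTF-8 pages):
import urllib2
# standalone version of getPage; assumes the listing page is reachable
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
request = urllib2.Request('http://www.tianyashuku.com/kehuan/', headers=headers)
page = urllib2.urlopen(request).read().decode('utf-8')
print page[:200]  # show the start of the source to confirm the download worked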
Use a regular expression to get each novel's URL (that is, the novel's table-of-contents page):
def geturl(self, page):
    # extract the href of every novel/chapter link on the page
    pattern = re.compile('<li><a href="(.*?)" title=', re.S)
    result = re.findall(pattern, page)
    return result
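As a quick check of what the pattern captures, here it is run against a made-up listing fragment (the HTML below is hypothetical, not copied from the site):
import re
# hypothetical <li> markup shaped like the listing the pattern targets
sample = '<li><a href="/kehuan/1.html" title="Example Novel">Example Novel</a></li>'
pattern = re.compile('<li><a href="(.*?)" title=', re.S)
print re.findall(pattern, sample)  # prints: ['/kehuan/1.html']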
Get the novel's title from its table-of-contents page:
def getTitle(self, page):
    # grab the text inside the <h1> tag
    pattern = re.compile('<h1>(.*?)</h1>', re.S)
    result = re.search(pattern, page)
    return result.group(1).strip()
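For example, on a hypothetical page fragment the method would behave like this:
import re
# hypothetical table-of-contents fragment with the title in <h1>
sample = '<html><h1> Santi </h1></html>'
match = re.search(re.compile('<h1>(.*?)</h1>', re.S), sample)
print match.group(1).strip()  # prints: Santi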
The URL of each chapter is obtained with the same geturl method shown above.
Those URLs are then used to fetch the content of every chapter of every novel:
def neirong(self, page):
    # use bs4 to pull the novel text out of the div with class "neirong"
    soup = BeautifulSoup(page, 'html.parser')
    text = soup.find_all("div", class_="neirong")
    for k in text:
        return k.get_text(strip=True)  # return the text of the first matching div
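A small self-contained check of the same extraction, using hypothetical chapter markup (note that get_text(strip=True) joins the paragraphs without a separator):
from bs4 import BeautifulSoup
# hypothetical chapter markup using the div.neirong container the site uses
sample = '<div class="neirong"><p>First line.</p><p>Second line.</p></div>'
soup = BeautifulSoup(sample, 'html.parser')
for k in soup.find_all("div", class_="neirong"):
    print k.get_text(strip=True)  # prints: First line.Second line.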
Set the file name used to save the novel:
def setFileTitle(self, title):
    if title is not None:
        fileName = re.sub('[\/:*?"<>|]', '-', title)  # strip characters illegal in file names
        self.file = open(fileName + ".txt", "w+")
    else:
        self.file = open('xiaoshuo' + ".txt", "w+")
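The substitution maps every character Windows forbids in file names to a hyphen; with a hypothetical title:
import re
# hypothetical title containing ':' and '/', both illegal in Windows file names
title = 'Novel: Part 1/2'
fileName = re.sub('[\/:*?"<>|]', '-', title)
print fileName  # prints: Novel- Part 1-2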
The complete code:
# -*- coding:utf-8 -*-
import urllib2
import re
from bs4 import BeautifulSoup

class XiaoShuo:
    def __init__(self, url):
        self.url = url

    def geturl(self, page):
        # extract the relative URL of every novel/chapter link on the page
        pattern = re.compile('<li><a href="(.*?)" title=', re.S)
        result = re.findall(pattern, page)
        return result

    def seturl(self, result):
        # (not used by start) prepend the site root to each relative URL
        urls = []
        for k in result:
            urls.append("http://www.tianyashuku.com" + k)
        return urls

    def getPage(self):
        # fetch the listing page with a User-Agent header
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(self.url, headers=headers)
        response = urllib2.urlopen(request)
        return response.read().decode('utf-8')

    def getxiaoshuo(self, url):
        # fetch an arbitrary page (table of contents or chapter)
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        return response.read().decode('utf-8')

    def getTitle(self, page):
        # grab the text inside the <h1> tag
        pattern = re.compile('<h1>(.*?)</h1>', re.S)
        result = re.search(pattern, page)
        return result.group(1).strip()

    def setFileTitle(self, title):
        # (not used by start) open an output file with a sanitized name
        if title is not None:
            fileName = re.sub('[\/:*?"<>|]', '-', title)  # strip characters illegal in file names
            self.file = open(fileName + ".txt", "w+")
        else:
            self.file = open('xiaoshuo' + ".txt", "w+")

    def neirong(self, page):
        # use bs4 to pull the novel text out of the div with class "neirong"
        soup = BeautifulSoup(page, 'html.parser')
        text = soup.find_all("div", class_="neirong")
        for k in text:
            return k.get_text(strip=True)  # return the text of the first matching div

    def start(self):
        # entry point: walk the listing page, then every novel, then every chapter
        page = self.getPage()
        URL = self.geturl(page)  # relative URLs of the novels
        print URL
        for m in URL:
            # some hrefs carry a trailing '" target="_blank'; strip it
            newurl1 = re.sub('" target="_blank', '', m)
            xiaoshuourl = "http://www.tianyashuku.com" + newurl1
            content = self.getxiaoshuo(xiaoshuourl)
            m1 = self.geturl(content)  # relative URLs of the chapters
            title = self.getTitle(content)  # the novel's title
            for k in m1:
                newurl = "http://www.tianyashuku.com" + k
                print newurl
                xiaoshuo = self.getxiaoshuo(newurl)
                a = self.getTitle(xiaoshuo)  # chapter title
                b = self.neirong(xiaoshuo)  # chapter text
                if b is None:
                    continue  # skip chapters whose content div is missing
                chapter = a + '\n' + b + '\n'
                print chapter
                # append each chapter to one .txt file per novel
                f1 = open(title + ".txt", 'a')
                f1.write(chapter.encode('utf-8'))
                f1.close()

# url='http://www.51shucheng.net/kehuan/santi/santi1/174.html'
URL = 'http://www.tianyashuku.com/kehuan/'
html = XiaoShuo(URL)
html.start()
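Since urllib2 exists only in Python 2, the same fetch under Python 3 would go through urllib.request instead (a sketch of the equivalent call; porting the rest of the class would also require print() calls and encoding-aware open()):
# Python 3 sketch of the equivalent of getPage
import urllib.request

def get_page(url):
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')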