[Web Scraping] Case Study: Scraping the Douban Top 250 (Complete and Detailed)
1. Requirements
Scrape the Douban Top 250 movie list: each film's detail-page link, poster image link, Chinese and original titles, and related fields, then persist the data (to an Excel file and to a database).
2. Approach
Use the requests library to download each page, then parse it and extract the fields with bs4 and re.
Finally, write the data to Excel with xlwt and to MySQL with pymysql.
3. Implementation
Imports
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup  # HTML parsing
import re                      # regular expressions for text matching
import requests                # HTTP requests
import xlwt                    # Excel output
import pymysql.cursors         # MySQL access
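If any of these packages are missing, a one-time install from PyPI covers all four:

pip install requests beautifulsoup4 xlwt pymysql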
Main function
def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # Scrape the pages
    dataList = getDate(baseUrl)
    # Save the results
    savePath = "豆瓣top250.xls"
    saveData(savePath, dataList)
    saveDataToDb(dataList)
Fetching pages
# Fetch the HTML content of the given URL
def askUrl(url):
    # A browser-like User-Agent so Douban does not reject the request
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
    }
    html = ""
    try:
        r = requests.get(url=url, headers=headers, timeout=3)
        r.raise_for_status()  # treat 4xx/5xx responses as errors
        r.encoding = 'utf-8'
        html = r.text
    except requests.RequestException as e:
        # requests exceptions have no .code/.reason attributes (those belong
        # to urllib), so print the exception itself
        print(e)
    return html
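A quick sanity check for this function on its own (a sketch; it assumes Douban is reachable from your network):

# Sanity-check sketch: fetch the first page and confirm we got HTML back.
page = askUrl("https://movie.douban.com/top250?start=0")
print(len(page))    # 0 means the request failed and the error was printed above
print(page[:60])    # a normal response starts with an HTML doctype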
Parsing the data
def getDate(baseUrl):
    dataList = []
    for i in range(0, 10):  # 10 pages, 25 movies per page
        url = baseUrl + str(i*25)
        html = askUrl(url)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []  # all fields for one movie
            item = str(item)
            link = re.findall(r'<a href="(.*?)">', item)[0]  # detail-page link (non-greedy, or it over-matches)
            data.append(link)
            image = re.findall(r'<img.*src="(.*?)"', item)[0]  # poster link
            data.append(image)
            titles = re.findall(r'<span class="title">(.*?)</span>', item)  # titles
            data.append(titles[0])  # Chinese title
            if len(titles) == 2:  # original title, minus the leading " / " separator
                data.append(titles[1].replace("/", "").strip())
            else:
                data.append(" ")
            rate = re.findall(r'<span class="rating_num".*?>(.*?)</span>', item)[0]  # rating
            data.append(rate)
            judge = re.findall(r'<span>(\d*)人评价</span>', item)[0]  # number of ratings
            data.append(judge)
            inq = re.findall(r'<span class="inq">(.*?)</span>', item, re.S)  # one-line summary
            if len(inq) != 0:
                inq = inq[0].replace("。", "")  # drop the trailing full stop
                data.append(inq)
            else:
                data.append("")
            bd = re.findall(r'<p class="">(.*?)</p>', item, re.S)[0]  # director/cast/year block
            bd = re.sub('<br/>', " ", bd)
            bd = re.sub("/", " ", bd)
            bd = re.sub("\\n", " ", bd)
            bd = re.sub(r"\xa0", " ", bd)  # non-breaking spaces
            data.append(bd.strip())
            dataList.append(data)
    return dataList
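The parser can be exercised on its own before wiring up the storage steps (a sketch; the field order matters for the save functions below):

# Usage sketch: scrape all ten pages and inspect the first parsed row.
rows = getDate("https://movie.douban.com/top250?start=")
print(len(rows))    # expect 250 if every page parsed cleanly
print(rows[0])      # [link, image, cname, oname, rate, judge, inq, info]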
Saving to Excel
def saveData(savePath, dataList):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet("豆瓣top250", cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片英文名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        worksheet.write(0, i, col[i])  # header row
    for i in range(0, len(dataList)):  # not a hard-coded 250, in case a page failed to parse
        data = dataList[i]
        for j in range(0, 8):
            worksheet.write(i+1, j, data[j])
    workbook.save(savePath)
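To check the result without opening Excel, the workbook can be read back with xlrd (a sketch; it assumes the xlrd package, which still reads the old .xls format, is installed):

import xlrd

# Verification sketch: reopen the file and check the header and row count.
book = xlrd.open_workbook("豆瓣top250.xls")
sheet = book.sheet_by_index(0)
print(sheet.nrows)            # expect 251: one header row plus 250 movies
print(sheet.row_values(0))    # the eight column headers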
4. A simple database table
DROP TABLE IF EXISTS `top250`;
CREATE TABLE `top250` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `link` varchar(255) DEFAULT NULL,
  `image` varchar(255) DEFAULT NULL,
  `cname` varchar(255) DEFAULT NULL,
  `oname` varchar(255) DEFAULT NULL,
  `rate` varchar(255) DEFAULT NULL,
  `judge` varchar(255) DEFAULT NULL,
  `inq` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
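The same DDL can also be run from Python instead of a MySQL client (a sketch; it assumes a local MySQL server, an existing douban database, and the root/empty-password credentials used later in saveDataToDb):

import pymysql

# Setup sketch: create the table from Python using the schema above.
connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='', db='douban', charset='utf8')
cursor = connect.cursor()
cursor.execute("DROP TABLE IF EXISTS `top250`")
cursor.execute("""CREATE TABLE `top250` (
    `id` int(11) NOT NULL AUTO_INCREMENT,
    `link` varchar(255) DEFAULT NULL,
    `image` varchar(255) DEFAULT NULL,
    `cname` varchar(255) DEFAULT NULL,
    `oname` varchar(255) DEFAULT NULL,
    `rate` varchar(255) DEFAULT NULL,
    `judge` varchar(255) DEFAULT NULL,
    `inq` varchar(255) DEFAULT NULL,
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8""")
connect.commit()
cursor.close()
connect.close()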
Saving to the database
def saveDataToDb(dataList):
    # Connect once, rather than reconnecting for every row
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='',
        db='douban',
        charset='utf8'
    )
    # Get a cursor
    cursor = connect.cursor()
    # Parameterized query: pymysql quotes the values itself, so apostrophes
    # in a title or summary cannot break the statement
    sql = "INSERT INTO top250 (link, image, cname, oname, rate, judge, inq) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    for data in dataList:
        print(data)
        cursor.execute(sql, (data[0], data[1], data[2], data[3], data[4], data[5], data[6]))
    connect.commit()
    cursor.close()
    connect.close()
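A quick way to confirm the load worked (a sketch reusing the same connection settings):

# Verification sketch: count the rows after a full run; expect 250.
connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='', db='douban', charset='utf8')
cursor = connect.cursor()
cursor.execute("SELECT COUNT(*) FROM top250")
print(cursor.fetchone()[0])
cursor.close()
connect.close()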
5. Complete code
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup  # HTML parsing
import re                      # regular expressions for text matching
import requests                # HTTP requests
import xlwt                    # Excel output
def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # Scrape the pages
    dataList = getDate(baseUrl)
    # Save the results
    savePath = "豆瓣top250.xls"
    saveData(savePath, dataList)
    # saveDataToDb(dataList)
# Fetch the HTML content of the given URL
def askUrl(url):
    # A browser-like User-Agent so Douban does not reject the request
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
    }
    html = ""
    try:
        r = requests.get(url=url, headers=headers, timeout=3)
        r.raise_for_status()  # treat 4xx/5xx responses as errors
        r.encoding = 'utf-8'
        html = r.text
    except requests.RequestException as e:
        # requests exceptions have no .code/.reason attributes, so print the exception itself
        print(e)
    return html
def getDate(baseUrl):
    dataList = []
    for i in range(0, 10):  # 10 pages, 25 movies per page
        url = baseUrl + str(i*25)
        html = askUrl(url)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []  # all fields for one movie
            item = str(item)
            link = re.findall(r'<a href="(.*?)">', item)[0]  # detail-page link
            data.append(link)
            image = re.findall(r'<img.*src="(.*?)"', item)[0]  # poster link
            data.append(image)
            titles = re.findall(r'<span class="title">(.*?)</span>', item)  # titles
            data.append(titles[0])  # Chinese title
            if len(titles) == 2:  # original title, minus the leading " / " separator
                data.append(titles[1].replace("/", "").strip())
            else:
                data.append(" ")
            rate = re.findall(r'<span class="rating_num".*?>(.*?)</span>', item)[0]  # rating
            data.append(rate)
            judge = re.findall(r'<span>(\d*)人评价</span>', item)[0]  # number of ratings
            data.append(judge)
            inq = re.findall(r'<span class="inq">(.*?)</span>', item, re.S)  # one-line summary
            if len(inq) != 0:
                inq = inq[0].replace("。", "")  # drop the trailing full stop
                data.append(inq)
            else:
                data.append("")
            bd = re.findall(r'<p class="">(.*?)</p>', item, re.S)[0]  # director/cast/year block
            bd = re.sub('<br/>', " ", bd)
            bd = re.sub("/", " ", bd)
            bd = re.sub("\\n", " ", bd)
            bd = re.sub(r"\xa0", " ", bd)  # non-breaking spaces
            data.append(bd.strip())
            dataList.append(data)
    return dataList
def saveData(savePath, dataList):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet("豆瓣top250", cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片英文名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        worksheet.write(0, i, col[i])  # header row
    for i in range(0, len(dataList)):  # not a hard-coded 250, in case a page failed to parse
        data = dataList[i]
        for j in range(0, 8):
            worksheet.write(i+1, j, data[j])
    workbook.save(savePath)
if __name__ == "__main__":
    main()
6. Notes
Background reading: working with databases from Python.