Python网络爬虫(第十一章:Scrapy框架实战:爬取网易新闻标题和内容)
综合练习:爬取网易新闻标题和内容1.spider.py【完成数据的爬取解析】import scrapyfrom selenium import webdriverfrom wangyiPro.items import WangyiproItemclass WangyiSpider(scrapy.Spider):name = 'wangyi'# allowed_domains = ['www.xxx
·
综合练习:爬取网易新闻标题和内容
1.spider.py【完成数据的爬取解析】
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem
class WangyiSpider(scrapy.Spider):
    """Crawl news titles and article text from two sections of news.163.com.

    Flow:
      1. ``parse`` reads the home page and collects the URLs of the selected
         section pages into ``all_url``.
      2. ``parse_url`` reads each section page. The original author notes that
         the headline list on these pages is loaded dynamically, so the
         downloader middleware re-renders every URL found in ``all_url`` with
         the Selenium browser stored in ``driver``.
      3. ``parse_detail`` extracts the article text and yields the item.
    """

    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']

    # 0-based positions of the section entries to follow in the home-page
    # navigation list. Override in a subclass to crawl other sections.
    section_indexes = (3, 4)

    # URLs of the section pages. Kept as a class-level default for backward
    # compatibility; every instance gets its own fresh list in ``__init__``.
    all_url = []

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Selenium-driven Chrome browser.

        ``*args``/``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        spider arguments (``scrapy crawl wangyi -a key=value``) keep working.
        """
        super().__init__(*args, **kwargs)
        # Per-instance list so that URLs never leak between spider instances.
        self.all_url = []
        # NOTE(review): ``executable_path`` is deprecated in Selenium 4 and
        # rejected by recent releases; switch to a ``Service`` object once the
        # project's Selenium version is confirmed.
        self.driver = webdriver.Chrome(executable_path='./chromedriver.exe')

    def parse(self, response):
        """Parse the home page and request each selected section page."""
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        section_urls = []
        for index in self.section_indexes:
            # The navigation list may be shorter than expected if the page
            # layout changes; skip instead of raising IndexError.
            if index >= len(li_list):
                self.logger.warning('Section index %s not found on %s', index, response.url)
                continue
            title_url = li_list[index].xpath('./a/@href').extract_first()
            if not title_url:
                self.logger.warning('Section index %s has no link on %s', index, response.url)
                continue
            section_urls.append(response.urljoin(title_url))

        # Register the URLs before any request is yielded: the downloader
        # middleware decides what to re-render by looking them up in all_url.
        self.all_url.extend(section_urls)

        for url in section_urls:
            yield scrapy.Request(url=url, callback=self.parse_url)

    def parse_url(self, response):
        """Parse a section page: read each headline and request its article."""
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div[1]/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('./a/img/@alt').extract_first()
            item = WangyiproItem()
            item['title'] = title
            print('title:', title)

            # The link sits in one of two places depending on the entry layout.
            detail_url = (
                div.xpath('./a/@href').extract_first()
                or div.xpath('./div/div[1]/h3/a/@href').extract_first()
            )
            if not detail_url:
                # Request(url=None) would raise and abort the whole callback.
                self.logger.warning('No detail link for title %r on %s', title, response.url)
                continue

            # The partially filled item travels to parse_detail through meta.
            yield scrapy.Request(
                url=response.urljoin(detail_url),
                callback=self.parse_detail,
                meta={'item': item},
            )

    def parse_detail(self, response):
        """Parse an article page and yield the completed item.

        ``item['news']`` is the list of text nodes found under the content
        container (not joined into a single string).
        """
        item = response.meta['item']
        news = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        item['news'] = news
        print('news:', news)
        yield item

    def closed(self, spider):
        """Quit the browser when the spider closes.

        Scrapy calls this hook with the close reason as its single positional
        argument; the parameter name ``spider`` is kept for compatibility.
        """
        self.driver.quit()
2.middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from time import sleep
from scrapy.http import HtmlResponse
class WangyiproDownloaderMiddleware:
    """Downloader middleware that re-renders selected pages with Selenium.

    Responses whose request URL is listed in ``spider.all_url`` are replaced
    by a new ``HtmlResponse`` built from the browser's page source, so that
    dynamically loaded content is present. Every other response is passed
    through untouched.
    """

    # Seconds to wait after navigation so dynamically loaded content can
    # appear. Override on a subclass to tune it.
    render_wait = 3

    def process_response(self, request, response, spider):
        """Return a browser-rendered response for section pages.

        Args:
            request: the request that produced ``response``.
            response: the response downloaded by Scrapy.
            spider: the running spider; expected to expose ``driver`` (a
                Selenium browser) and ``all_url`` (URLs to re-render).

        Returns:
            A new ``HtmlResponse`` for URLs in ``spider.all_url``, otherwise
            the original ``response``.
        """
        # The middleware is enabled project-wide, so tolerate spiders that
        # define no browser or URL list instead of raising AttributeError.
        driver = getattr(spider, 'driver', None)
        section_urls = getattr(spider, 'all_url', ())
        if driver is None or request.url not in section_urls:
            return response

        # Load the page in the real browser and give its scripts time to run.
        driver.get(request.url)
        sleep(self.render_wait)
        page_text = driver.page_source  # includes the dynamically loaded data

        # Replace the original response with one carrying the rendered HTML.
        return HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)

    def process_exception(self, request, exception, spider):
        """Return None so the exception continues through normal handling."""
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        return None
3.items.py【明确爬取目标:新闻标题title和内容news】
import scrapy
class WangyiproItem(scrapy.Item):
    """Item holding one scraped news article."""

    # Headline of the article (filled in by the spider's section-page parser).
    title = scrapy.Field()

    # Article text (filled in by the spider's detail-page parser).
    news = scrapy.Field()
4.pipelines.py【存储数据】
class WangyiproPipeline:
    """Item pipeline placeholder: currently passes every item through as-is."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged.

        Args:
            item: the item yielded by the spider.
            spider: the spider that produced the item (unused).
        """
        return item
5.settings.py
1:
# Enable the project's item pipeline. The integer is the order value Scrapy
# uses to sequence pipelines when several are enabled.
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}
2:
# Enable the downloader middleware that swaps in browser-rendered responses
# for the section pages. The integer is the middleware's order value.
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
3:
# Placeholder: replace 'XXX' with your own User-Agent string.
USER_AGENT = 'XXX'

# Do not obey robots.txt rules.
ROBOTSTXT_OBEY = False

# Only show log messages of level ERROR.
LOG_LEVEL = 'ERROR'
开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!
更多推荐
已为社区贡献2条内容
所有评论(0)