Using Scrapy to crawl book information from 读书网 (dushu.com) and store it in MySQL
Overview
Goal: scrape the book data from dushu.com and store it in a database
Environment: Python 3.7, PyCharm
Dependencies: scrapy, pymysql
Steps
1. Install Scrapy:
pip install scrapy
2. Create the project: scrapy startproject scrapy_dushu
3. Change into the spiders directory: cd scrapy_dushu\scrapy_dushu\spiders
4. Generate the crawl spider: scrapy genspider -t crawl read www.dushu.com
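For orientation, after these commands the project should look roughly like the default layout that scrapy startproject generates (only the files edited below are annotated):

scrapy_dushu/
    scrapy.cfg
    scrapy_dushu/
        items.py        # step 5: item definition
        middlewares.py
        pipelines.py    # steps 8 and later: pipelines
        settings.py     # step 7 and later: pipelines and DB settings
        spiders/
            read.py     # step 6: the crawl spider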
5. items.py (the item class)

import scrapy


class ScrapyDushuItem(scrapy.Item):
    # Fields collected for each book
    name = scrapy.Field()    # book title
    src = scrapy.Field()     # cover image URL
    author = scrapy.Field()  # author
    intro = scrapy.Field()   # short introduction
6. spiders (the spider file)
- Note that the first page has its own URL (1107_1.html), matching the same pattern as the other list pages
read.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_dushu.items import ScrapyDushuItem


class ReadSpider(CrawlSpider):
    name = 'read'
    allowed_domains = ['www.dushu.com']
    # Category 1107 is the science and technology section
    start_urls = ['https://www.dushu.com/book/1107_1.html']

    # Pagination rule; with follow=False only the 13 page links visible on the first page are crawled
    rules = (
        Rule(LinkExtractor(allow=r'/book/1107_\d+\.html'), callback='parse_item', follow=False),
    )

    # Extract the book info from each listing page
    def parse_item(self, response):
        info_list = response.xpath('//div[@class="book-info"]')
        for info in info_list:
            src = info.xpath('.//img/@data-original').extract_first()
            name = info.xpath('.//img/@alt').extract_first()
            author = info.xpath('.//p[1]/text()').extract_first()
            intro = info.xpath('.//p[2]/text()').extract_first()
            book = ScrapyDushuItem(name=name, src=src, author=author, intro=intro)
            yield book
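At this point the spider can already be run from the project root with Scrapy's standard command; the optional -o flag simply dumps the scraped items to a file for a quick sanity check (check.json is just an example output name):

scrapy crawl read
scrapy crawl read -o check.json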
7. settings.py: enable the pipeline (the MysqlPipeline used for database storage is registered later, once it is written)

ITEM_PIPELINES = {
    'scrapy_dushu.pipelines.ScrapyDushuPipeline': 300,
}
8. pipelines.py: save the data to a local file

class ScrapyDushuPipeline:
    # Open the output file once when the spider starts
    def open_spider(self, spider):
        self.fp = open('book.csv', 'w', encoding='utf-8')

    # Write one comma-separated line per item
    def process_item(self, item, spider):
        self.fp.write(str(item['name']) + ',' + str(item['src']) + ',' + str(item['author']) + ',' + str(item['intro']) + '\n')
        return item

    # Close the file when the spider finishes
    def close_spider(self, spider):
        self.fp.close()
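As an alternative sketch (assuming Scrapy 2.1 or later), the same local dump could be produced with Scrapy's built-in feed exports instead of a custom pipeline, configured in settings.py; the file name here is illustrative:

# Built-in CSV feed export, replaces ScrapyDushuPipeline for a simple local dump
FEEDS = {
    'book_feed.csv': {
        'format': 'csv',
        'encoding': 'utf-8',
    },
}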
9. Link following
With follow=True the rule is re-applied on every crawled page, so all pages of the category are scraped, not just the ones linked from the first page.
read.py
Rule(LinkExtractor(allow=r'/book/1107_\d+\.html'), callback='parse_item', follow=True),
Storing the data in MySQL
Install pymysql:
pip install pymysql
(1) Create the database and table

CREATE DATABASE `spider01` DEFAULT CHARACTER SET = 'utf8';
USE `spider01`;
CREATE TABLE book(
    `id` INT PRIMARY KEY AUTO_INCREMENT,
    `name` VARCHAR(128),
    `src` VARCHAR(128),
    `author` VARCHAR(50),
    `intro` VARCHAR(128)
);
(2) Database connection settings in settings.py:
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = '123456'
DB_NAME = 'spider01'
DB_CHARSET = 'utf8'
(3) The MySQL pipeline
pipelines.py

from scrapy.utils.project import get_project_settings
import pymysql


class MysqlPipeline:
    # __init__ plays the same role as open_spider here:
    # it reads the connection parameters from settings and opens the connection
    def __init__(self):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.pwd = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    # Connect to MySQL and get a cursor
    def connect(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.pwd,
                                    db=self.name,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: the driver escapes quotes and special characters in the data
        sql = 'insert into book(name, src, author, intro) values(%s, %s, %s, %s)'
        self.cursor.execute(sql, (item['name'], item['src'], item['author'], item['intro']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the cursor before the connection
        self.cursor.close()
        self.conn.close()
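As a variant, a pipeline can also receive its settings through Scrapy's from_crawler hook instead of calling get_project_settings; a minimal sketch of the constructor part, using the same DB_* keys as above:

class MysqlPipeline:
    # Scrapy calls from_crawler with the running crawler, whose settings are already loaded
    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s['DB_HOST'], s['DB_PORT'], s['DB_USER'],
                   s['DB_PASSWORD'], s['DB_NAME'], s['DB_CHARSET'])

    def __init__(self, host, port, user, pwd, name, charset):
        self.host, self.port, self.user = host, port, user
        self.pwd, self.name, self.charset = pwd, name, charset
        self.connect()  # connect(), process_item() and close_spider() stay the same as above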
Also register the new pipeline in settings.py:

ITEM_PIPELINES = {
    'scrapy_dushu.pipelines.ScrapyDushuPipeline': 300,
    # a distinct priority so the file pipeline runs before the database pipeline
    'scrapy_dushu.pipelines.MysqlPipeline': 301,
}
Final results
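A quick way to check the stored rows directly in MySQL, for example:

USE spider01;
SELECT COUNT(*) FROM book;
SELECT name, author FROM book LIMIT 5;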