【python】课堂派爬虫禁止下载的ppt
注意:下面程序的方法可以不看,但要先在 main 函数里把账号、密码等参数改成你自己的。
先爬取pdf中所有的图片,下载到本地
注意:下面程序,方法可以不看,但是要先在main函数里把账号密码啥的改成你自己的。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/12/1 8:41
# @Author : Fw_022
import time
"""
获取网页源码
request--get
"""
def get(url):
import requests # 导入requests包
html = requests.get(url) # Get方式获取网页数据
text = html.text # 获得html(整个网页)的源码部分
return text
"""
F12,单击"Network"
对页面进行操作
开发者模式中,依次单击“Network”按钮和“XHR”按钮,找到目标数据。
将 Headers 中的 URL 复制出来
"""
def make_folder(folder_path):
    """Ensure *folder_path* exists on disk, creating parents as needed.

    :param folder_path: directory path to guarantee
    :return: the same *folder_path*, for convenient chaining
    """
    import os
    try:
        os.makedirs(folder_path)
    except FileExistsError:
        pass  # already there — nothing to do
    return folder_path
# NOTE: url must be the request URL copied from the Headers panel (Network tab),
# not the address of the page itself.
def post(url):
    """POST the Youdao-translate form to *url* and return the parsed JSON.

    :param url: endpoint URL taken from the browser's Network/Headers panel
    :return: the decoded JSON response as a dict (also printed, as before)
    """
    import json
    import requests
    # Form fields captured from a live browser session; salt/sign/ts are
    # stale sample values and may be rejected by the current service.
    form_data = {'i': "word", 'from': 'zh-CHS', 'to': 'en', 'smartresult': 'dict', 'client': 'fanyideskweb',
                 'salt': '15477056211258', 'sign': 'b3589f32c38bc9e3876a570b8a992604', 'ts': '1547705621125',
                 'bv': 'b33a2f3f9d09bde064c9275bcb33d94e', 'doctype': 'json', 'version': '2.1', 'keyfrom': 'fanyi.web',
                 'action': 'FY_BY_REALTIME', 'typoResult': 'false'}
    response = requests.post(url, data=form_data, timeout=10)  # submit the form
    content = json.loads(response.text)  # JSON string -> dict
    print(content)
    return content  # fix: the parsed result was computed but never returned
def beautifulsoup(url):
    """Fetch *url*, print every <img> tag found in the page, and return them.

    :param url: the page to fetch and parse
    :return: list of bs4 Tag objects matching the ``img`` selector
    """
    import requests
    from bs4 import BeautifulSoup
    html = requests.get(url, timeout=10)  # fix: bounded wait instead of none
    soup = BeautifulSoup(html.text, 'lxml')
    # soup.select() takes a CSS selector and returns a list:
    #   tag name  -> plain name, e.g. 'img'
    #   class     -> prefixed with '.'
    #   id        -> prefixed with '#'
    data = soup.select('img')
    for tag in data:
        print(tag)
    return data  # fix: the scraped tags were previously discarded
class Beautiful():
    """Selenium helper that logs in to the course site and downloads slide images.

    On construction it launches Chrome, fills in the login form, waits for the
    login to complete, then re-opens the target URL as an authenticated user.

    :param url: the page to open (serves as both login page and slide viewer)
    :param account: login account name
    :param password: login password
    :param driver_path: path to chromedriver; defaults to the standard Windows
        Chrome install location (generalized from the previous hard-coded path)
    """
    def __init__(self, url, account, password,
                 driver_path=r"C:\Program Files\Google\Chrome\Application\chromedriver.exe"):
        self.url = url
        self.web = None  # selenium WebDriver, created in start()
        self.account = account
        self.password = password
        self.driver_path = driver_path
        self.start()
        time.sleep(5)  # give the login request time to complete
        self.start2()
        time.sleep(3)  # let the slide page finish rendering

    def start(self):
        """Launch Chrome, open the URL and submit the login form."""
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        web = webdriver.Chrome(service=Service(self.driver_path))
        web.implicitly_wait(5)  # poll up to 5 s when locating elements
        web.get(self.url)
        self.web = web
        # XPaths of the account / password inputs on the login form
        self.input('/html/body/div/div[2]/div[2]/div[2]/div/div/form/div[1]/div/div/input', self.account)
        self.input('/html/body/div/div[2]/div[2]/div[2]/div/div/form/div[2]/div/div/input', self.password)
        print("登陆成功")

    def start2(self):
        """Re-open the target URL, now with an authenticated session."""
        self.web.get(self.url)
        print("到达网页")

    def get_form_xpath(self, xpath):
        """Return the first element matching *xpath*."""
        from selenium.webdriver.common.by import By
        data = self.web.find_element(By.XPATH, xpath)
        return data

    def get_from_class(self, class_name):
        """Return the first element with CSS class *class_name*."""
        from selenium.webdriver.common.by import By
        data = self.web.find_element(By.CLASS_NAME, class_name)
        return data

    def get_src(self, element):
        """Return the element's ``src`` attribute (None when absent)."""
        img = element.get_attribute('src')
        return img

    def save_img(self, save_path, img_url):
        """Download *img_url* and write the raw bytes to *save_path*.

        A None URL (image missing from the page) is silently skipped.
        """
        if img_url is None:  # fix: identity comparison for None, not ==
            return
        import requests
        img = requests.get(img_url, timeout=30)  # fix: bounded wait
        # fix: context manager guarantees the file handle is closed even if
        # the write raises
        with open(save_path, "wb") as file:
            file.write(img.content)

    def input(self, xpath, string):
        """Clear the input element located by *xpath* and type *string*."""
        data = self.get_form_xpath(xpath)
        data.clear()
        data.send_keys(string)
if __name__ == "__main__":
# 所有需要设置的参数:想要爬取的pdf的url,总页码数page,保存路径path,账号,密码
url = "xxx"
page = 45
path = make_folder("xxx")
account, password = "xxx", "xxx"
bea = Beautiful(url,account,password)
# 注意这里从1开始,因为i代表页码
for i in range(1, page + 1):
pic_xpath = "/html/body/div/div[2]/div[1]/div/div[1]/div[2]/img"
title_xpath = "/html/body/div/div[2]/div[1]/div/div[1]/div[1]/p"
page_input_xpath = "/html/body/div/div[2]/div[2]/aside/div[2]/div/input"
# 到达页面(输入页码后,点击任意一个地方,这里设置为点击标题)
bea.input(page_input_xpath, str(i))
bea.get_form_xpath(title_xpath).click()
# 获取图片
data = bea.get_form_xpath(pic_xpath)
img = bea.get_src(data)
print("===> 找到图片" + str(i))
# 保存图片
save = path + "\\" + str(i) + ".jpg"
bea.save_img(save, img)
print("保存到路径" + save + "\n")
将爬取的图片合成pdf
也是一样的,在main函数里修改自己的参数
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/12/1 14:16
# @Author : Fw_022
import img2pdf
import os
import re
def make_folder(folder_path):
    """Create *folder_path* (including parents) when it does not exist yet.

    :param folder_path: directory path to guarantee
    :return: the unchanged *folder_path*
    """
    import os
    if os.path.exists(folder_path):
        return folder_path  # nothing to create
    os.makedirs(folder_path)
    return folder_path
def from_photo_to_pdf(photo_path, width, height):
    """Merge every image in *photo_path* into one PDF at pdf/<photo_path>.pdf.

    :param photo_path: folder containing the page images; each file name must
        contain the page number (e.g. ``1.jpg`` ... ``45.jpg``)
    :param width: pdf page width in millimetres
    :param height: pdf page height in millimetres
    """
    # 0. Output goes to pdf/<folder name>.pdf. Only the *parent* directory is
    #    created — the original called make_folder on the full target name,
    #    which left a spurious empty directory named like the pdf file.
    save_path = os.path.join("pdf", photo_path) + ".pdf"
    make_folder(os.path.dirname(save_path))
    # 1. Sort pages numerically (1, 2, ..., 10), not lexicographically.
    #    The page number is extracted from the file name only, so digits in
    #    photo_path itself can no longer corrupt the ordering (previously
    #    re.findall ran on the full joined path).
    new_list = [os.path.join(photo_path, name) for name in os.listdir(photo_path)]
    new_list.sort(key=lambda p: int(re.findall(r'\d+', os.path.basename(p))[0]))
    print(new_list)
    # 2. Fix the physical page size of the pdf (millimetres -> points)
    a4inpt = (img2pdf.mm_to_pt(width), img2pdf.mm_to_pt(height))
    layout_fun = img2pdf.get_layout_fun(a4inpt)
    with open(save_path, 'wb') as f:
        f.write(img2pdf.convert(new_list, layout_fun=layout_fun))
if __name__ == '__main__':
    # Parameters: the image folder and the desired pdf page size (in mm)
    photo_path = "xxx"
    width, height = 720, 540
    from_photo_to_pdf(photo_path, width, height)
开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!
更多推荐
已为社区贡献6条内容
所有评论(0)