Python 之微信指数小程序数据抓取
Python 之微信指数小程序数据抓取
Fiddler安装和设置
安装
Fiddler 安装包可以从这里获取,如果失效了可以自己网上找一个安装。
链接:https://pan.baidu.com/s/10tYQ-uL6HMddkOcIKnWEKQ?pwd=d1io
然后就是点击安装就好了,没什么好多说的。
启用HTTPS捕获
进入软件界面,点击 Tools -> Options -> HTTPS 启用捕获 https 请求并解密。
证书信任
设置信任根证书,不然进行抓包捕获时,其他网页就访问不了了。
证书安装
有时候,如果证书安装不正确,可能导致抓取 https 失败。如果你发现上面已经设置以后,仍然抓取不到 https 的话,可以尝试使用工具重新生成证书。
可以下载 fiddlercertmaker.exe 自动生成证书,具体安装过程可参考:Fiddler死活抓不了HTTPS包解决办法_fiddler 抓包 itune 310 错误-CSDN博客
链接:https://pan.baidu.com/s/19G6aBHtxQU4ViSicWw2NOw?pwd=y3uh
设置自动转发
设置指定 url 自动转发到本地,我这里是自动把请求转发到了我本地一个 Flask 搭建的服务,设置好以后进行保存(转发地址记得和你服务的地址保持一致)。
设置自动转发 https://search.weixin.qq.com/cgi-bin/wxaweb/wxindexfluctuations 的目的主要是为了获取数据请求参数中的 openid 和 search_key,因为我需要这两个请求参数去构造新的 body。
Unmatched requests passthrough 一定要勾选上——也就是不影响其他未匹配的请求
开启捕获
可以从 File -> Capture Traffic 开启捕获,也可以用 F12 快捷键开启捕获,当左下角有 Capturing 字样时,表示捕获已开启。
然后就可以正常捕获抓取 https 请求了
数据抓取处理
搭建并启动本地服务
可以自己在本地简单写一个服务,用来接收 Fiddler 转发过来的请求并进行处理。我这里构造了两个 body,分别去获取 指数趋势 和 数据来源。
如果出现 Your proxy appears to only use HTTP and not HTTPS 报错,把代码中 forward_url 的协议由 https 修改成 http 即可。
# coding:utf-8
import csv
import datetime
import json
import os
import traceback
import pygal
from pygal.style import Style
import requests
import urllib3
from flask import Flask, request
# Flask application object; the /post_data route below is registered on it.
app = Flask(__name__)
# Raw key -> Chinese column title for the rows of the index-trend data.
# write_csv() emits this mapping as the CSV header row.
time_indexes_map = {
    "time": "日期",
    "score": "指数",
}

# Raw key -> Chinese label for the per-channel scores.
# Used as the CSV header row by write_csv() and as slice labels by draw_pie().
channel_scores_map = {
    "finder_score": "视频号",
    "live_score": "直播",
    "mpdoc_score": "公众号",
    "query_score": "搜一搜",
    "extlink_score": "网页",
    "ad_score": "其他",
    "total_score": "总计",
    # No Chinese meaning was found for this field yet, so it maps to its raw key.
    "score_exp": "score_exp",
}

# HTTP headers sent with every request to search.weixin.qq.com.
# A fixed "Content-Length": "182" entry is left out on purpose of being
# commented in the original capture; it is not part of this dict.
headers = {
    "Host": "search.weixin.qq.com",
    "Connection": "keep-alive",
    "xweb_xhr": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309092b) XWEB/9129",
    "Content-Type": "application/json",
    "Accept": "*/*",
    "Sec-Fetch-Site": "cross-site",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Dest": "empty",
    "Referer": "https://servicewechat.com/wxc026e7662ec26a3a/53/page-frame.html",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
}
# class ValueColors(pygal.style.Style):
# value_colors = ("#f6c443", "#ff6146", "#7c160", "#4fadf8", "#a9e87a", "#eda150")
class ResultHandler:
    """Render SVG charts and CSV files from fetched index data.

    Every output file is written into ``file_save_dir``; the directory is
    created when the handler is constructed.
    """

    def __init__(self, file_save_dir):
        """Store the output directory and make sure it exists."""
        self.file_save_dir = file_save_dir
        self._init_file_save_dir()

    def _init_file_save_dir(self):
        """Create the output directory (including parents) if it is missing."""
        os.makedirs(self.file_save_dir, exist_ok=True)

    def draw_line(self, title, time_indexes, last_day=7):
        """Plot the last ``last_day`` entries of ``time_indexes`` to ``line.svg``.

        Args:
            title: Legend label of the plotted series.
            time_indexes: List of dicts, each with a ``"time"`` and a
                ``"score"`` entry.
            last_day: Number of trailing entries to plot.
        """
        recent = time_indexes[-last_day:]
        date_chart = pygal.StackedLine(
            fill=True,
            interpolate='hermite',
            x_label_rotation=-20,
            style=pygal.style.LightGreenStyle,
        )
        # The first four characters of the time value are dropped for the
        # axis label (presumably the year of a YYYYMMDD value -- confirm).
        date_chart.x_labels = [str(entry["time"])[4:] for entry in recent]
        date_chart.add(title, [entry["score"] for entry in recent])
        file_path = os.path.join(self.file_save_dir, "line.svg")
        date_chart.render_to_file(file_path)

    @staticmethod
    def _sum_channel_scores(channel_scores):
        """Return per-key sums over ``channel_scores`` without mutating it.

        The result starts as a *copy* of the last entry, so the caller's
        dicts stay untouched and the key order of the last entry is kept.
        Keys missing from the last entry are appended as they are met.
        An empty input yields an empty dict.
        """
        if not channel_scores:
            return {}
        totals = dict(channel_scores[-1])
        for day_scores in channel_scores[:-1]:
            for key, score in day_scores.items():
                totals[key] = totals.get(key, 0) + score
        return totals

    def draw_pie(self, title, channel_scores, last_day=7):
        """Plot each channel's share over the last ``last_day`` entries to ``pie.svg``.

        The scores of the selected entries are summed per channel and each
        channel is drawn as a percentage of the summed ``"total_score"``.
        Nothing is rendered when there is no data or the total is zero.

        Args:
            title: Chart title.
            channel_scores: List of dicts mapping channel keys to scores;
                it is not modified.
            last_day: Number of trailing entries to aggregate.
        """
        # The colour used per channel can be sampled with pyautogui's
        # getpixel colour picker if matching colours are ever wanted:
        #   ad_score "#eda150", extlink_score "#a9e87a",
        #   finder_score "#f6c443", live_score "#ff6146",
        #   mpdoc_score "#7c160", query_score "#4fadf8"
        totals = self._sum_channel_scores(channel_scores[-last_day:])
        total_score = totals.get("total_score")
        if not total_score:
            # No entries, or a zero/missing total: percentages are undefined.
            print("no channel scores to draw, pie chart skipped")
            return

        pie_chart = pygal.Pie(inner_radius=0.5)
        pie_chart.title = title
        for key, score in totals.items():
            # These two keys are not channels of their own.
            if key in ("score_exp", "total_score"):
                continue
            percent = round(100 * score / total_score, 2)
            # Fall back to the raw key for channels without a known label.
            pie_chart.add(channel_scores_map.get(key, key), percent)
        file_path = os.path.join(self.file_save_dir, "pie.svg")
        pie_chart.render_to_file(file_path)

    def write_csv(self, title, rows: list):
        """Write ``rows`` to ``<title>_<YYYYMMDD>.csv`` in the output directory.

        When the keys of the first row match ``time_indexes_map`` or
        ``channel_scores_map`` exactly, that mapping defines the column
        order and its values are written as the header row. Otherwise the
        raw keys, sorted by length, are used as header.

        Args:
            title: Prefix of the file name.
            rows: List of dicts, one per CSV row.

        Returns:
            True if the file was written or ``rows`` is empty (no file is
            created in that case), False if writing failed.
        """
        if not rows:
            return True

        fieldnames = sorted(rows[0].keys(), key=len)
        file_name = title + "_" + datetime.datetime.now().strftime("%Y%m%d") + ".csv"
        file_path = os.path.join(self.file_save_dir, file_name)

        field_set = set(fieldnames)
        if field_set == set(time_indexes_map):
            header_map = time_indexes_map
        elif field_set == set(channel_scores_map):
            header_map = channel_scores_map
        else:
            header_map = None

        try:
            with open(file_path, 'w', newline='', encoding='utf-8') as f:
                if header_map is None:
                    writer = csv.DictWriter(f, fieldnames)
                    writer.writeheader()
                else:
                    writer = csv.DictWriter(f, list(header_map))
                    writer.writerow(header_map)
                writer.writerows(rows)
        except Exception as e:
            # Best-effort boundary: report the failure instead of raising.
            print(e)
            traceback.print_exc()
            return False
        return True
def _fetch_wxindex(forward_url, payload, timeout=10):
    """POST ``payload`` as JSON to ``forward_url`` and return the decoded JSON reply.

    Raises ``requests.RequestException`` on connection problems, timeouts
    and non-2xx replies, and ``ValueError`` if the reply is not valid JSON.
    """
    response = requests.post(forward_url, json=payload, headers=headers,
                             verify=False, timeout=timeout)
    response.raise_for_status()
    return response.json()


def _upstream_failure(cgi_name, exc):
    """Report a failed upstream call and build the 502 reply for it."""
    print(f"{cgi_name} failed: {exc!r}")
    return {"error": f"{cgi_name} request failed"}, 502


@app.route('/post_data', methods=['POST'])
def post():
    """Fetch one year of index data for the forwarded query and store the results.

    The JSON body must carry ``openid``, ``search_key`` and ``query``. With
    these, the index trend (``GetDefaultIndex``) and the per-channel scores
    (``GetMultiChannel``) are requested; each result is written to a CSV
    file and drawn as an SVG chart below ``./files/<YYYYMMDD>``.

    Returns:
        ``{}`` on success, a 400 reply when the body is not a JSON object
        or a required field is missing, and a 502 reply when an upstream
        request fails or its reply lacks the expected structure.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return {"error": "request body must be a JSON object"}, 400

    openid = data.get("openid")
    search_key = data.get("search_key")
    keyword = data.get("query")
    required = (("openid", openid), ("search_key", search_key), ("query", keyword))
    missing = [name for name, value in required if not value]
    if missing:
        return {"error": "missing field(s): " + ", ".join(missing)}, 400
    query = [keyword]

    # Take the clock once so the directory name and end_ymd cannot differ
    # when the request is handled around midnight.
    now = datetime.datetime.now()
    end_ymd = now.strftime("%Y%m%d")
    start_ymd = (now - datetime.timedelta(365)).strftime("%Y%m%d")
    result_handler = ResultHandler(f"./files/{end_ymd}")

    # verify=False is passed to requests; silence the warning it triggers.
    urllib3.disable_warnings()

    # Plain http is used on purpose: the https form of this URL can fail with
    # "Your proxy appears to only use HTTP and not HTTPS".
    # forward_url = 'https://search.weixin.qq.com/cgi-bin/wxaweb/wxindex'
    forward_url = 'http://search.weixin.qq.com/cgi-bin/wxaweb/wxindex'

    # Index trend
    trend_payload = {'openid': openid, 'search_key': search_key, 'cgi_name': 'GetDefaultIndex',
                     'query': query, 'compound_word': [], 'start_ymd': start_ymd, 'end_ymd': end_ymd}
    try:
        trend_data = _fetch_wxindex(forward_url, trend_payload)
        first_result = trend_data["content"]["resp_list"][0]
        title = first_result["query"]
        time_indexes = first_result["indexes"][0]["time_indexes"]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as exc:
        return _upstream_failure("GetDefaultIndex", exc)
    print(time_indexes[:2])
    title_indexes = title + "_指数趋势"
    result_handler.write_csv(title_indexes, time_indexes)
    result_handler.draw_line(title_indexes, time_indexes, 30)

    # Data sources (score per channel)
    channel_payload = {'openid': openid, 'search_key': search_key, 'cgi_name': 'GetMultiChannel',
                       'query': query, 'start_ymd': start_ymd, 'end_ymd': end_ymd}
    try:
        channel_data = _fetch_wxindex(forward_url, channel_payload)
        result_list = channel_data["content"]["result_list"]
        channel_scores = [item["channel_score"] for item in result_list]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as exc:
        return _upstream_failure("GetMultiChannel", exc)
    print(channel_scores[:2])
    title_scores = title + "_数据来源"
    # Persist the raw rows before charting, so the aggregation done for the
    # pie chart can never influence what ends up in the CSV file.
    result_handler.write_csv(title_scores, channel_scores)
    result_handler.draw_pie(title_scores, channel_scores, 30)
    return {}
# When run as a script, serve the app on the loopback interface with
# Flask's debug mode enabled.
if __name__ == '__main__':
    app.run(host="127.0.0.1", debug=True)
小程序搜索关键字
- 进入电脑端微信
- 搜索 微信指数 小程序
- 进入小程序,输入想要搜索的关键词(比如:和平精英)
数据图表展示
微信图表展示如下:
我们自己使用 pygal 画的图如下(svg 图用浏览器打开),对比发现,除了插值导致的光滑度不一样,图的整体走势是一致的:
开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!
更多推荐
所有评论(0)