Python Web Scraping: Collecting Information on Every App in the Huawei AppGallery
Before writing any code, let's analyze the site.
Target URL: https://appgallery.huawei.com/#/Apps
Data to collect: app name, package name, and developer company name.
Opening the F12 developer tools shows the page content is loaded dynamically.
Inspection reveals the data comes from a dynamically loaded GET request, so we can loop over the params to fetch every page. The code looks like this:
url = "https://web-drcn.hispace.dbankcloud.cn/uowap/index"
headers = {
    "User-Agent": "your UA string",
    "Referer": "your Referer"
}
for c in range(1, 17):
    param = {
        "method": "internal.getTabDetail",
        "serviceType": 20,
        "reqPageNum": c,   # page number (a bare {c} would send a set literal)
        "uri": f"{d}",     # category uri, taken from the uid list shown later
        "maxResults": 25,
        "zone": "",
        "locale": "zh"
    }
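As a quick check, here is a minimal sketch of sending that request and walking the JSON response; it reuses url, headers, and param from the snippet above, and the layoutData / dataList / name / package / appid keys are the same ones the complete script at the end relies on.

import requests

resp = requests.get(url=url, headers=headers, params=param).json()
# Each layout block carries a dataList of apps
for block in resp["layoutData"]:
    for app in block["dataList"]:
        print(app["name"], app["package"], app["appid"])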
With all of that data available, the next step is extraction.
At this point we notice the response does not contain the developer company name. It only appears in the dynamically loaded data after clicking through to an app's detail page, so we need to analyze that page as well.
Once the data is located, comparing URLs shows the detail request only adds an appid to the payload, and that appid is already present in the list page response, so we just need to build the detail URL from it.
The code looks like this:
url1 = "https://web-drcn.hispace.dbankcloud.cn/uowap/index"
headers1 = {
    "User-Agent": "your UA string",
    "Referer": "your Referer"
}
param1 = {
    "method": "internal.getTabDetail",
    "serviceType": 20,
    "reqPageNum": 1,
    "maxResults": 25,
    "uri": f"app|{appid}",
    "shareTo": "",
    "currentUrl": f"https%3A%2F%2Fappgallery.huawei.com%2F%23%2Fapp%2F{appid}",
    "accessId": "",
    "appid": f"{appid}",
    "zone": "",
    "locale": "zh"
}
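A minimal sketch of issuing this detail request and pulling out the developer name; it assumes appid was set (for example, to a value pulled from the list page response) before url1, headers1, and param1 from the snippet above were built, and the layoutData[3] index mirrors what the complete script below uses (it may shift if Huawei changes the page layout).

import requests

detail = requests.get(url=url1, headers=headers1, params=param1).json()
# The fourth layout block holds the developer info
developer = detail["layoutData"][3]["dataList"][0]["developer"]
print(developer)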
At this point we can fetch a full page of data for one category, but the other categories are still missing.
Clicking through the categories shows that only the uri in the payload changes, and that uri can also be obtained from the previous page, so we just keep constructing URLs and sending requests.
I used a brute-force approach: click through each category by hand, collect all the uris into a list, and then loop over it.
uid = [
"33ef450cbac34770a477cfa78db4cf8c","8e62cf6d238c4abdb892b400ff072f43","79bd417da03d470287c0c7c2ef8f2c96","84471de6a7524d4a9242903fbc9bbe8b",
"65696386add14dda8b7ee8a20be03aad","07e66002a01c442990ed5630aa460d48","c946b166e7c34dcab8a8960bf6979dd3","50151113bc4143d0aa7013843ff0ef32",
"ce87048699a64f5db8a90bca7bcf68fd","e3beb74372c44ee899709a038eabfc70","4d5e752fde6c4b33869058a69565171e","24305799357048a4a9585f4f4c05dc1e",
"2d2b18f338244b9db71d1ec30b257f1e","7e04648230ca4bbaa836fa8c027517ba","a29745005a8942b797d3d5ddf6bb1b48","d6566ca265754426b36cc6a12fa1e2cd",
"1f316fc086704f169e7a841341ed05c4","5e4425e03ae44a87a5293dc2d9ebcfde","3ae307aff6c541818f3f9c242f18fd85","3e28c821504e473c9f4990d78d235837",
"ee252e5e36524275b17d5bbee7ab08a5","43285bc8c9344cd2b973165ef8fc9aee"
]
For the saved data I only extract three fields here; if you need more, just add them at the extraction step.
Finally, the complete implementation:
import requests
import time
import csv

uid = [
    "33ef450cbac34770a477cfa78db4cf8c","8e62cf6d238c4abdb892b400ff072f43","79bd417da03d470287c0c7c2ef8f2c96","84471de6a7524d4a9242903fbc9bbe8b",
    "65696386add14dda8b7ee8a20be03aad","07e66002a01c442990ed5630aa460d48","c946b166e7c34dcab8a8960bf6979dd3","50151113bc4143d0aa7013843ff0ef32",
    "ce87048699a64f5db8a90bca7bcf68fd","e3beb74372c44ee899709a038eabfc70","4d5e752fde6c4b33869058a69565171e","24305799357048a4a9585f4f4c05dc1e",
    "2d2b18f338244b9db71d1ec30b257f1e","7e04648230ca4bbaa836fa8c027517ba","a29745005a8942b797d3d5ddf6bb1b48","d6566ca265754426b36cc6a12fa1e2cd",
    "1f316fc086704f169e7a841341ed05c4","5e4425e03ae44a87a5293dc2d9ebcfde","3ae307aff6c541818f3f9c242f18fd85","3e28c821504e473c9f4990d78d235837",
    "ee252e5e36524275b17d5bbee7ab08a5","43285bc8c9344cd2b973165ef8fc9aee"
]

# Write the CSV header once before the crawl starts
with open("huawei_appgallery.csv", "a", encoding="utf-8", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["app_name", "package_name", "developer"])

url = "https://web-drcn.hispace.dbankcloud.cn/uowap/index"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "Referer": "https://appgallery.huawei.com/"
}

for d in uid:                   # one category uri at a time
    for c in range(1, 16):      # page through the category
        param = {
            "method": "internal.getTabDetail",
            "serviceType": 20,
            "reqPageNum": c,    # page number, not a set literal
            "uri": d,
            "maxResults": 25,
            "zone": "",
            "locale": "zh"
        }
        resp = requests.get(url=url, headers=headers, params=param).json()
        for block in resp["layoutData"]:
            for bm in block["dataList"]:
                baoming = bm["package"]   # package name
                appid = bm["appid"]
                name = bm["name"]
                # bm["logSource"] is also available here but is not used further
                # Detail request: only the appid-related fields change
                param1 = {
                    "method": "internal.getTabDetail",
                    "serviceType": 20,
                    "reqPageNum": 1,
                    "maxResults": 25,
                    "uri": f"app|{appid}",
                    "shareTo": "",
                    "currentUrl": f"https%3A%2F%2Fappgallery.huawei.com%2F%23%2Fapp%2F{appid}",
                    "accessId": "",
                    "appid": appid,
                    "zone": "",
                    "locale": "zh"
                }
                resp1 = requests.get(url=url, headers=headers, params=param1).json()
                # layoutData[3] > dataList[0] > developer holds the company name
                kaifa = resp1["layoutData"][3]["dataList"][0]["developer"]
                with open("huawei_appgallery.csv", "a", encoding="utf-8", newline="") as f:
                    csv_writer = csv.writer(f)
                    csv_writer.writerow([name, baoming, kaifa])
                print(name + " saved")
                time.sleep(2)
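One detail request per app adds up to a lot of network calls, so wrapping the GET in a small helper with a timeout and a simple retry makes the crawl more resilient. This is just a sketch of one possible improvement, not part of the original script; the retry count and timeout are arbitrary values.

import time
import requests

def get_json(url, headers, params, retries=3, timeout=10):
    # Retry a GET a few times before giving up; both limits here are arbitrary
    for attempt in range(retries):
        try:
            resp = requests.get(url=url, headers=headers, params=params, timeout=timeout)
            resp.raise_for_status()
            return resp.json()
        except (requests.RequestException, ValueError):
            if attempt == retries - 1:
                raise
            time.sleep(2)

# Usage: replace the bare requests.get(...).json() calls in the loop, e.g.
# resp = get_json(url, headers, param)
# resp1 = get_json(url, headers, param1)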