Scraping data from all kinds of websites with Requests (a worked example)
1. The code first
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @TIME   : 2020/10/12 13:29
# @Author : Noob
# @File   : bases.py

import re

import requests
import xlrd
import xlwt
from bs4 import BeautifulSoup


class Bases:

    # Read the config once at class-definition time; every method
    # below indexes into these lines by position.
    fo = open("data.txt", "r", encoding="utf-8")
    lines = fo.readlines()

    # Show the manual on request
    def readExplain(self):
        x = input("Read the instructions? (y or n): ")
        if x == "y":
            ro = open("explain.txt", "r", encoding="utf-8")
            print(ro.read())
            ro.close()

    # Build the URL for the current page dynamically
    def getUrl(self, keywords, starts):
        lines = self.lines
        baseurl = lines[15].strip()  # URL template
        key = lines[17].strip()      # keyword placeholder in the template
        fw = lines[23].strip()       # marker just before the page number
        bw = lines[25].strip()       # marker just after the page number

        # Escape literal dots so the regex does not treat them as wildcards
        fwf = fw.replace(".", "\\.")
        bwf = bw.replace(".", "\\.")
        if fw != "":
            # Swap whatever sits between the two markers for the page number
            url = re.sub(fwf + "(.+?)" + bwf, fw + str(starts) + bw, baseurl)
            url = url.replace(key, keywords)
        else:
            url = baseurl.replace(key, keywords)
        if "$" in url:
            url = url[0:-1]  # drop the trailing "$" end-of-URL sentinel
        print("Current url: %s" % url)
        return url

    # Request headers, all read from data.txt
    def getHeader(self):
        lines = self.lines
        header = {
            "accept": lines[5].strip(),
            "accept-encoding": lines[7].strip(),
            "accept-language": lines[9].strip(),
            "cache-control": lines[11].strip(),
            "Connection": lines[13].strip(),
            "Upgrade-Insecure-Requests": lines[3].strip(),
            "User-Agent": lines[1].strip()
        }
        return header

    # Wrap a single request ("学霸" is just the original default keyword)
    def getContent(self, key="学霸", start=0):
        url = self.getUrl(key, start)
        try:
            assert "http" in url
        except AssertionError:
            return "Bad url, please start over!!!"
        else:
            res = requests.get(url, headers=self.getHeader())
            if res.status_code == 200:
                return res
            else:
                return "Request failed, status code: %d" % res.status_code, "error"

    # Fetch the full text across all result pages
    def getContents(self, key):
        lines = self.lines
        try:
            offset = int(lines[19])     # page-number step between pages
            j = int(lines[21].strip())  # first page number
        except ValueError as msg:
            print("Bad input data, please go back and check!!!", msg)
        else:
            words = lines[27].strip()  # sanity word a valid page must contain
            resText = ""
            while 1:
                res = self.getContent(key, j)
                # getContent returns a str or tuple on failure,
                # so check the type before touching the response
                if type(res) == str:
                    print(res)
                    break
                if type(res) == tuple:
                    print(res)
                    break
                res.encoding = "utf-8"  # avoid mojibake on Chinese pages
                if len(res.text) < 100:
                    break
                if words not in res.text:
                    break
                if str(j) not in res.url:  # unreliable for pageless or infinite-scroll sites
                    resText = resText + res.text
                    break
                resText = resText + res.text
                j = j + offset
            resText = resText.replace("<!DOCTYPE html>", "")
            resText = BeautifulSoup(resText, features="html.parser")
            eo = open("export.txt", "w", encoding="utf-8")
            eo.write(str(resText))
            eo.close()
            return resText

    # Filter the text down to data with the configured regexes
    def getFilter(self, key):
        lines = self.lines
        resText = str(self.getContents(key))

        counts = int(lines[29].strip())  # number of regex rules

        j = 31      # index of the first regex rule in data.txt
        datas = []  # one list of matches per rule

        for i in range(counts):
            pattern = lines[j].strip()
            datas.append(re.compile(pattern).findall(resText))
            j = j + 2

        # Alternative: dump the scraped data to a txt file instead
        # ao = open("abc.txt", "a", encoding="utf-8")
        # ao.write(ns[0] + " " + ns[1] + " " + ns[2] + " " + ns[3] + " ")  # column names as the header
        # for i in range(len(datas[0])):
        #     k = ""
        #     for j in range(len(datas)):
        #         k = k + datas[j][i] + " "
        #     ao.write(k + " ")
        # ao.close()
        return datas

    # Read the search keywords from Excel
    # (note: xlrd 2.x dropped .xlsx support, so this needs xlrd 1.2.0)
    def readExcel(self):
        xd = xlrd.open_workbook("ok.xlsx")
        sn = xd.sheet_by_index(0)
        coms = []
        for j in range(1, sn.nrows):  # row 0 is the header row
            com = sn.cell_value(j, 0)
            if com == "":
                break
            coms.append(com)
        return coms

    # Write the scraped data to Excel
    def writeExcel(self):
        data = self.readExcel()  # list of keywords
        datas = []               # 3-D list: datas[keyword][rule][match]
        for i in range(len(data)):
            data[i] = self.getFilter(data[i])
            datas.append(data[i])

        print(datas)

        # Create the workbook and sheet
        xt = xlwt.Workbook(encoding="gbk")
        sn = xt.add_sheet("what")

        # Build the header row: the rule labels in data.txt
        # carry the column names between # #
        lines = self.lines
        j = 0
        for idx, line in enumerate(lines):
            if "正则匹配规则" in line:  # "regex rule" marker
                n = re.compile(r"#(.+?)#").findall(line.strip())
                if len(n) > 0:
                    sn.write(0, j, n[0])  # row, column, value
                    j = j + 1

            # Column widths: the "0" placeholder is 256 units, so 20 chars = 256 * 20
            if "单元格宽度" in line:  # "cell width" marker
                widths = lines[idx + 1].strip().split("*")
                for k in range(len(widths)):
                    sn.col(k).width = 256 * int(widths[k])

        # Write the data rows
        count = 1  # current row
        for i in datas:
            for j in range(len(i[0])):   # matches per keyword
                for k in range(len(i)):  # rules (columns)
                    sn.write(count, k, i[k][j])  # mind the index order here
                count = count + 1

        return xt.save("ok.xls")  # xlwt can only write .xls, not .xlsx

    # Entry point
    def main(self):
        print("Run started")
        self.writeExcel()

    fo.close()  # runs at class-definition time, right after lines is read


if __name__ == "__main__":
    bs = Bases()
    bs.main()
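Everything above is driven by data.txt: even-indexed lines (0, 2, 4, …) are human-readable labels and odd-indexed lines hold the values the code reads by position. The post does not ship the file, so the layout below is reconstructed from the indices the code uses; every concrete value (the Baidu-style template, the patterns, the widths) is a hypothetical placeholder.

    0   User-Agent:
    1   Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
    2   Upgrade-Insecure-Requests:
    3   1
    4   accept:
    5   text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
    6   accept-encoding:
    7   gzip, deflate
    8   accept-language:
    9   zh-CN,zh;q=0.9
    10  cache-control:
    11  max-age=0
    12  Connection:
    13  keep-alive
    14  base url (end marked with $):
    15  https://www.baidu.com/s?wd=KEY&pn=0$
    16  keyword placeholder:
    17  KEY
    18  page offset:
    19  10
    20  start page:
    21  0
    22  marker before the page number (fw):
    23  pn=
    24  marker after the page number (bw):
    25  $
    26  sanity word a valid page must contain:
    27  百度
    28  number of regex rules:
    29  2
    30  正则匹配规则#title#
    31  <h3.*?>(.+?)</h3>
    32  正则匹配规则#link#
    33  href="(.+?)"
    34  单元格宽度
    35  40*60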
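With that layout, the pagination rewrite in getUrl behaves as follows. A minimal standalone sketch, using the same hypothetical template (build_url and its values are not part of the original script):

    import re

    baseurl = "https://www.baidu.com/s?wd=KEY&pn=0$"  # "$" marks the end of the URL
    key, fw, bw = "KEY", "pn=", "$"

    def build_url(keywords, starts):
        # Same idea as getUrl: swap whatever sits between fw and bw for the page number
        fwf, bwf = fw.replace(".", "\\."), bw.replace(".", "\\.")
        url = re.sub(fwf + "(.+?)" + bwf, fw + str(starts) + bw, baseurl)
        url = url.replace(key, keywords)
        return url[:-1] if "$" in url else url  # drop the "$" sentinel

    print(build_url("学霸", 10))  # https://www.baidu.com/s?wd=学霸&pn=10
    print(build_url("学霸", 20))  # https://www.baidu.com/s?wd=学霸&pn=20

To run the full script you also need the third-party packages it imports: pip install requests beautifulsoup4 xlwt, plus xlrd==1.2.0 for the .xlsx input, since xlrd 2.x removed .xlsx support (alternatively, rewrite readExcel with openpyxl).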