Python crawler: scraping ECVA paper titles, authors, abstracts, and keywords and storing them in a MySQL database
Website screenshot:
Source code:
```python
import re
import requests
import pymysql
from bs4 import BeautifulSoup
import lxml  # parser backend used by BeautifulSoup
import traceback
import time


def query(sql, *args):
    """
    Generic query helper.
    :param sql:
    :param args:
    :return: query result as a tuple of tuples: ((), (), ...)
    """
    conn, cursor = get_conn()
    cursor.execute(sql)
    res = cursor.fetchall()
    close_conn(conn, cursor)
    return res


def get_paper():
    # detail pages look like:
    # https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php
    url = "https://www.ecva.net/papers.php"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/90.0.4430.212 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    page_text = response.text
    # print(page_text)  # dump the page HTML
    soup = BeautifulSoup(page_text, "lxml")
    all_dt = soup.find_all("dt", class_="ptitle")
    print("dt:" + str(len(all_dt)))
    temp_res = []   # buffer for one paper
    res = []        # final result set
    link_res = []   # download / pdfinfo links
    for dt in all_dt:
        single_soup = BeautifulSoup(str(dt), "lxml")
        title = single_soup.find("a").text
        # store the title (drop the leading whitespace characters)
        temp_res.append(title[2:])
        # abstract and keywords are filled in later by get_further()
        # store the source link
        sourcelink = "https://www.ecva.net/" + single_soup.find("a")["href"]
        temp_res.append(sourcelink)
        res.append(temp_res)
        temp_res = []
    # scrape the authors and the pdf links
    all_dd = soup.find_all("dd")
    print("dd:" + str(len(all_dd)))
    flag = 0
    temp_link = []
    author = []  # flat list of author strings
    for item in all_dd:
        if flag % 2 == 0:
            # even <dd> entries hold the authors
            author.append(item)
        else:
            # odd <dd> entries hold the download and pdfinfo links
            linksoup = BeautifulSoup(str(item), "lxml")
            link_list = linksoup.find_all("a")
            for i in link_list:
                if i.get("href") is None:
                    temp_link.append("fakelink")
                else:
                    if "http" not in str(i.get("href")) and "papers" in str(i.get("href")):
                        temp_link.append("https://www.ecva.net/" + str(i.get("href")))
                    else:
                        temp_link.append(i.get("href"))
            print(temp_link)
            link_res.append(temp_link)
            temp_link = []
        flag = flag + 1
    # notes: keep using BeautifulSoup for download_text / pdfinfo_text,
    # store the authors, and prefix relative links with "https://www.ecva.net/"
    print("------------------------------")
    # merge author, download and pdfinfo into res
    for i in range(0, len(author)):
        # strip the surrounding <dd> tag and non-breaking spaces from the author string
        str_author = str(author[i])
        new_author = str_author.replace("<dd>", "")
        new_author = new_author.replace(" </dd>", "")
        new_author = new_author.replace("\xa0", "")
        res[i].append(new_author)
        # print("link_res:" + str(len(link_res)))
        if len(link_res[i]) == 2:
            res[i].append(link_res[i][0])  # download
            res[i].append(link_res[i][1])  # pdfinfo
        else:
            res[i].append(link_res[i][0])  # download
            res[i].append(link_res[i][2])  # pdfinfo
    print("----------------------")
    # for item in res:
    #     print(item)
    return res


#############################################################
# scrape the abstract and keywords as well
def get_further():
    res = get_paper()
    temp_res = []
    further_res = []
    sql = "SELECT pdfinfo FROM pdf;"
    db_res = query(sql)  # returns a tuple of tuples; use [0] to reach each value
    # request every link in the result set
    # the start index 1358 resumes an earlier interrupted run; use 0 to process everything
    for i in range(1358, len(db_res)):
        url = db_res[i][0]  # the pdfinfo url
        print(url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/91.0.4472.101 Safari/537.36"
        }
        try:
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"
            page_text = response.text
            # print(page_text)
            soup = BeautifulSoup(page_text, "lxml")

            abstract = soup.find("p", id="Par1").text
            # strip non-breaking spaces
            abstract = abstract.replace("\xa0", "")
            print("Abstract: " + abstract)
            keyword = soup.find_all("span", class_="Keyword")
            # print(keyword)
            # find_keyword = re.compile('<span class="Keyword">(.*?)</span>')
            keyword_str = ""
            for items in keyword:
                # collect the text of every keyword span
                keyword_str = keyword_str + items.get_text()
            print("Keywords: " + keyword_str)
            # the keyword spans end with \xa0; turn them into commas
            keyword_str = keyword_str.replace("\xa0", ",")
            # drop the trailing comma
            keyword_str = keyword_str[0:-1]
            # finally store the abstract and keywords
            temp_res.append(abstract)
            temp_res.append(keyword_str)
            further_res.append(temp_res)
            print(temp_res)
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            temp_res = []
        except:
            print("Invalid link!")
        try:
            # note: further_res is filled from index 0, so these lookups only
            # line up with i when the loop above starts at 0
            if len(further_res[i][0]) == 0:
                res[i].append("no abstract")
            else:
                res[i].append(further_res[i][0])
            if len(further_res[i][1]) == 0:
                res[i].append("no keyword")
            else:
                res[i].append(further_res[i][1])
            print(res[i])
            # insert into the database
            # insert_paper_1(res[i], i)
        except:
            print("IndexError: list index out of range")
    return


# open a database connection and create a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="paperinfo",
                           charset="utf8")
    # create the cursor (result sets are returned as tuples by default)
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connected, cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor


# close the cursor and the database connection
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1


def insert_paper_0():
    # insert title / sourcelink / author / download, leaving abstract and keyword empty
    conn, cursor = get_conn()
    res = get_paper()
    print(f"{time.asctime()} start inserting paper records")
    try:
        sql = ("insert into paper (title,sourcelink,author,download,abstract,keyword) "
               "values(%s,%s,%s,%s,%s,%s)")
        for item in res:
            print(item)
            # catch the exception so duplicate primary keys do not abort the run
            try:
                cursor.execute(sql, [item[0], item[1], item[2], item[3], "", ""])
            except pymysql.err.IntegrityError:
                print("Duplicate!")
                print("###########################")
        conn.commit()  # commit the transaction (update/delete/insert)
        print(f"{time.asctime()} finished inserting paper records")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return


#########################################
def insert_paper_1(res, count):
    # insert one complete record: res = [title, sourcelink, author, download, pdfinfo, abstract, keyword]
    conn, cursor = get_conn()
    print(f"{time.asctime()} start inserting paper record")
    try:
        sql = ("insert into paper (title,sourcelink,author,download,abstract,keyword) "
               "values(%s,%s,%s,%s,%s,%s)")
        print(res)
        # catch the exception so duplicate primary keys do not abort the run
        try:
            cursor.execute(sql, [res[0], res[1], res[2], res[3], res[5], res[6]])
        except pymysql.err.IntegrityError:
            print("Duplicate!")
            print("###########################")
        conn.commit()  # commit the transaction (update/delete/insert)
        print(f"{time.asctime()} finished inserting paper record no. " + str(count + 1))
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return


# insert the pdfinfo links separately
def insert_pdf():
    conn, cursor = get_conn()
    res = get_paper()
    print(f"{time.asctime()} start inserting pdfinfo records")
    try:
        sql = "insert into pdf (id,pdfinfo) values(%s,%s)"
        for item in res:
            print(item)
            # catch the exception so duplicate primary keys do not abort the run
            try:
                # id 0 lets MySQL assign the auto-increment value
                cursor.execute(sql, [0, item[4]])
            except pymysql.err.IntegrityError:
                print("Duplicate!")
                print("###########################")
        conn.commit()  # commit the transaction (update/delete/insert)
        print(f"{time.asctime()} finished inserting pdfinfo records")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return


if __name__ == "__main__":
    get_further()
    # insert_pdf()
```
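The post does not show the table definitions behind the `paperinfo` database. As a point of reference, below is a minimal sketch of a schema that matches the INSERT statements above. The column names come from the code; the column types, the primary key on `title`, and the auto-increment `id` on `pdf` are my assumptions (suggested by the `IntegrityError` handling and by passing `0` as `id`), so adjust them to your own setup.

```python
# Minimal, assumed schema for the crawler above: a `paper` table keyed on title
# and a `pdf` table holding the pdfinfo links that get_further() reads back.
import pymysql

DDL_STATEMENTS = [
    """
    CREATE TABLE IF NOT EXISTS paper (
        title      VARCHAR(255) NOT NULL,
        sourcelink VARCHAR(512),
        author     TEXT,
        download   VARCHAR(512),
        abstract   TEXT,
        keyword    TEXT,
        PRIMARY KEY (title)          -- duplicate titles raise IntegrityError, as the code expects
    ) CHARACTER SET utf8
    """,
    """
    CREATE TABLE IF NOT EXISTS pdf (
        id      INT AUTO_INCREMENT PRIMARY KEY,   -- inserting 0 lets MySQL assign the next id
        pdfinfo VARCHAR(512)
    ) CHARACTER SET utf8
    """,
]


def create_tables():
    # same connection settings as get_conn() in the crawler
    conn = pymysql.connect(host="127.0.0.1", user="root",
                           password="000429", db="paperinfo", charset="utf8")
    cursor = conn.cursor()
    try:
        for ddl in DDL_STATEMENTS:
            cursor.execute(ddl)
        conn.commit()
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    create_tables()
```

With the tables in place, the intended run order appears to be: `insert_paper_0()` to store titles, links, and authors, `insert_pdf()` to fill the `pdf` table, and finally `get_further()` (which reads `SELECT pdfinfo FROM pdf`) to fetch the abstracts and keywords from the linked pages.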