慕课网 (imooc) Crawler

"""

This demo crawls 慕课网 (coding.imooc.com): it scrapes the course-page information for all courses listed under some of the practical-course category pages for front end, back end, mobile development, cloud computing & big data, and databases.

  If the code needs improvement, please point it out, thanks.

"""

# author: Administrator
# date: 2021/04/30

import requests                                   # third-party HTTP downloader
import re                                         # regular expressions
import json                                       # serialise records for output
from requests.exceptions import RequestException  # exception handling for requests
from multiprocessing import Pool                  # process pool


def geturl(url):
    # Download a page and return its text, or None on any failure.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None


homeurl = "https://coding.imooc.com"
# Course URLs taken from the imooc listing pages, grouped by sub-category name: {stuname: [url, ...]}
stuname_dict_url = {}


def parse_one_classUrl(html, stuname):
    # Extract the course detail-page links from one listing page.
    pattern = re.compile('.*?<a target="_blank" href="(.*?)">', re.S)
    items = re.findall(pattern, html)
    # The hrefs are relative, so prepend the site root.
    items = [homeurl + i for i in items]
    stuname_dict_url[stuname] = items
    return stuname_dict_url


# Regex-match the data on one course page.
def parse_one_page(html, url, stuname):
    pattern = re.compile(
        '.*?<div class="title-box">.*?<h1>(.*?)</h1>'
        '.*?<span>难度</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>时长</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>学习人数</span>.*?<span class="nodistance">(.*?)</span>'
        '.*?<span>综合评分</span>.*?<span class="nodistance">(.*?)</span>',
        re.S)
    items = re.findall(pattern, html)
    if not items:
        return  # page layout did not match; nothing to yield
    # Append the source url and sub-category name to the matched tuple.
    tup_items = items[0] + (url, stuname)
    records = [tup_items]
    for item in records:
        # Format every record as a dictionary.
        yield {
            "title": item[0],
            "difficulty": item[1],
            "duration": item[2],
            "stu_number": item[3],
            "comprehensive_evaluation": item[4],
            "url": item[5],
            "stuname": item[6],
        }


# Fetch every listing page in a category dict and collect the course URLs.
def getClassurl(category_dict):
    for class_type in category_dict:
        for stuname in category_dict[class_type]:
            listing_html = geturl(category_dict[class_type][stuname])
            if listing_html is None:
                continue
            # Fills stuname_dict_url with {stuname: [course url, ...]}
            parse_one_classUrl(listing_html, stuname)
    return stuname_dict_url


# Append one JSON record per line to ../text/<name>.txt
def write_to_file(name, content):
    with open("../text/%s.txt" % name, "a", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


dict_qd = {"前端": {"vue.js": "https://coding.imooc.com/?c=vuejs", "HTML/CSS": "https://coding.imooc.com/?c=html", "JavaScript": "https://coding.imooc.com/?c=javascript", "Node.js": "https://coding.imooc.com/?c=nodejs"}}
dict_hd = {"后端": {"java": "https://coding.imooc.com/?c=java", "SpringBoot": "https://coding.imooc.com/?c=springboot", "SpringCloud": "https://coding.imooc.com/?c=springcloud"}}
dict_ydkf = {"移动开发": {"android": "https://coding.imooc.com/?c=android", "ios": "https://coding.imooc.com/?c=ios", "ReactNative": "https://coding.imooc.com/?c=reactnative"}}
dict_yun = {"云计算大数据": {"hadoop": "https://coding.imooc.com/?c=hadoop", "大数据": "https://coding.imooc.com/?c=bigdata", "Spark": "https://coding.imooc.com/?c=spark", "Docker": "https://coding.imooc.com/?c=docker"}}
dict_db = {"数据库": {"mysql": "https://coding.imooc.com/?c=mysql", "redis": "https://coding.imooc.com/?c=redis", "mongodb": "https://coding.imooc.com/?c=mongodb"}}


def main():
    pool = Pool(processes=5)
    # Course URLs for one category (here: databases).
    url_dict = pool.apply_async(getClassurl, (dict_db,)).get()
    for stuname in url_dict:
        for url in url_dict[stuname]:
            print(stuname, url)
            classhtml = pool.apply_async(geturl, (url,)).get()
            if classhtml is None:
                continue
            for item in parse_one_page(classhtml, url, stuname):
                write_to_file("dict_db", item)

    pool.close()
    pool.join()


if __name__ == "__main__":
    main()
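The docstring says the crawler covers five categories, but main() only processes dict_db. Below is a minimal sketch of one way to reuse the functions above to crawl all five category dicts and write each to its own file; the names ALL_CATEGORY_DICTS and crawl_all_categories are introduced here only for illustration and are not part of the original code.

ALL_CATEGORY_DICTS = {
    "dict_qd": dict_qd,
    "dict_hd": dict_hd,
    "dict_ydkf": dict_ydkf,
    "dict_yun": dict_yun,
    "dict_db": dict_db,
}

def crawl_all_categories():
    # Hypothetical helper: crawl every category dict and write each to its own file.
    for name, category_dict in ALL_CATEGORY_DICTS.items():
        # stuname_dict_url is shared module state, so clear it before each category
        # to avoid writing the previous category's courses into this file as well.
        stuname_dict_url.clear()
        url_dict = getClassurl(category_dict)
        for stuname in url_dict:
            for url in url_dict[stuname]:
                classhtml = geturl(url)
                if classhtml is None:
                    continue
                for item in parse_one_page(classhtml, url, stuname):
                    write_to_file(name, item)

This sketch calls geturl and getClassurl directly, so it runs sequentially; note that the original Pool usage is effectively sequential as well, because apply_async(...).get() blocks until each task finishes.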