慕课网爬虫

python学习网 2021-05-12 06:00:07
'''

本 demo 爬取慕课网实战课中前端、后端、移动开发、云计算大数据、数据库这几个分类的部分页面下的所有课程页面信息。

  代码如有需要改进之处,请指出,谢谢。

'''

# author: Administrator
# date: 2021/04/30

import json  # serialises every scraped record as one JSON line
import os  # builds the output path in a platform-independent way
import re  # regular expressions used to scrape the HTML
from multiprocessing import Pool  # downloads course pages in parallel
from typing import Dict, Iterator, List, Optional

import requests  # third-party HTTP client
from requests.exceptions import RequestException  # network error handling

# Seconds to wait for a server before giving up on a request; without a
# timeout a stalled connection would block a worker forever.
REQUEST_TIMEOUT = 10

# Directory (relative to the working directory) that receives the output.
OUTPUT_DIR = os.path.join('..', 'text')

homeurl = 'https://coding.imooc.com'

# Relative course links found on a category listing page.
_COURSE_LINK_RE = re.compile(r'<a target="_blank" href="(.*?)">', re.S)

# Title, difficulty, duration, learner count and overall score of a course.
_COURSE_PAGE_RE = re.compile(
    '<div class="title-box">.*?<h1>(.*?)</h1>'
    '.*?<span>难度</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>时长</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>学习人数</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>综合评分</span>.*?<span class="nodistance">(.*?)</span>',
    re.S,
)

# Maps a topic name to the absolute URLs of the courses listed for it.
stuname_dict_url = {}


def geturl(url: str, timeout: float = REQUEST_TIMEOUT) -> Optional[str]:
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page text, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid UTF-8.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        return response.content.decode("utf-8")
    except UnicodeDecodeError:
        return None


def parse_one_classUrl(html: Optional[str],
                       stuname: str) -> Dict[str, List[str]]:
    """Collect the course links of one listing page under *stuname*.

    Args:
        html: Text of a category listing page, or ``None`` if the download
            failed (an empty link list is recorded in that case).
        stuname: Topic name used as the key in ``stuname_dict_url``.

    Returns:
        The module-level ``stuname_dict_url`` mapping, updated in place.
    """
    links = _COURSE_LINK_RE.findall(html) if html else []
    # The hrefs are site-relative, so prefix them with the site root.
    stuname_dict_url[stuname] = [homeurl + link for link in links]
    return stuname_dict_url


def parse_one_page(html: Optional[str], url: str,
                   stuname: str) -> Iterator[Dict[str, str]]:
    """Yield the record scraped from one course page.

    Args:
        html: Text of the course page, or ``None`` if the download failed.
        url: Address the page was fetched from; copied into the record.
        stuname: Topic name the course belongs to; copied into the record.

    Yields:
        A single dict describing the course. Nothing is yielded when *html*
        is empty or does not contain the expected markup.
    """
    if not html:
        return
    match = _COURSE_PAGE_RE.search(html)
    if match is None:
        return
    title, difficulty, duration, stu_number, evaluation = match.groups()
    yield {
        'title': title,
        'difficulty': difficulty,
        'duration': duration,
        'stu_number': stu_number,
        'comprehensive_evaluation': evaluation,
        'url': url,
        'stuname': stuname,
    }


def getClassurl(dict: Dict[str, Dict[str, str]]) -> Dict[str, List[str]]:
    """Fetch every listing page in *dict* and gather its course URLs.

    Args:
        dict: Mapping of category name -> {topic name: listing page URL}.
            The parameter keeps its original name for caller compatibility
            even though it shadows the builtin.

    Returns:
        ``stuname_dict_url``, a mapping of topic name -> list of course URLs.
    """
    for topics in dict.values():
        for stuname, listing_url in topics.items():
            parse_one_classUrl(geturl(listing_url), stuname)
    return stuname_dict_url


def write_to_file(name: str, content: Dict[str, str]) -> None:
    """Append *content* as one JSON line to ``<OUTPUT_DIR>/<name>.txt``.

    Args:
        name: Base name of the output file, without extension.
        content: JSON-serialisable record to append.
    """
    # Create the target directory on first use instead of failing on open().
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    path = os.path.join(OUTPUT_DIR, '%s.txt' % name)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


# Category name -> {topic name: listing page URL}.
# NOTE(review): the key 'vus.js' looks like a typo for 'vue.js'; it is left
# unchanged because it ends up in the written records.
dict_qd = {'前端':{'vus.js':'https://coding.imooc.com/?c=vuejs','HTML/CSS':'https://coding.imooc.com/?c=html','JavaScript':'https://coding.imooc.com/?c=javascript','Node.js':'https://coding.imooc.com/?c=nodejs'}}
dict_hd = {'后端':{'java':'https://coding.imooc.com/?c=java','SpringBoot':'https://coding.imooc.com/?c=springboot','SpringCloud':'https://coding.imooc.com/?c=springcloud'}}
dict_ydkf = {'移动开发':{'android':'https://coding.imooc.com/?c=android','ios':'https://coding.imooc.com/?c=ios','Reactnative':'https://coding.imooc.com/?c=reactnative'}}
dict_yun = {'云计算大数据':{'hadoop':'https://coding.imooc.com/?c=hadoop','大数据':'https://coding.imooc.com/?c=bigdata','Spark':'https://coding.imooc.com/?c=spark','Docker':'https://coding.imooc.com/?c=docker'}}
dict_db = {'数据库':{'mysql':'https://coding.imooc.com/?c=mysql','redis':'https://coding.imooc.com/?c=redis','mongodb':'https://coding.imooc.com/?c=mongodb'}}


def main() -> None:
    """Crawl the database category and append its courses to ``dict_db.txt``."""
    # Listing pages are few, so they are fetched sequentially up front.
    url_dict = getClassurl(dict_db)
    jobs = [(stuname, url)
            for stuname, urls in url_dict.items()
            for url in urls]

    # Download all course pages in one batch so the workers actually run
    # concurrently; the context manager releases the pool even on errors.
    with Pool(processes=5) as pool:
        pages = pool.map(geturl, [url for _, url in jobs])

    for (stuname, url), classhtml in zip(jobs, pages):
        print(stuname, url)
        for item in parse_one_page(classhtml, url, stuname):
            write_to_file("dict_db", item)


if __name__ == '__main__':
    main()

 

阅读(44) 评论(0)