# author:Administrator
# date:2021/04/30
import json  # serialises each scraped record
import os  # portable output paths
import re  # regular expressions
from concurrent.futures import ThreadPoolExecutor  # concurrent page downloads
from multiprocessing import Pool  # multiprocessing support
from urllib.parse import urljoin  # resolves relative course links

import requests  # third-party HTTP client
from requests.exceptions import RequestException  # request error handling
def geturl(url, timeout=10):
    """Download *url* with HTTP GET and return the body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection blocks the whole crawl.

    Returns:
        The page text, or None when the request raises a
        ``RequestException`` (including a timeout) or the status code is
        not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.content.decode("utf-8")
# Course links taken from the listing pages: one inner list of hrefs per
# parsed page.
classUrl_list = []

# Compiled once. The pattern deliberately has no leading ".*?": findall()
# already scans forward, and a leading lazy wildcard under re.S makes the
# engine rescan the rest of the page from every start position once the
# last link has been found.
_CLASS_LINK_PATTERN = re.compile('<a target="_blank" href="(.*?)">', re.S)


def parse_one_classUrl(html):
    """Extract the course hrefs from one listing page into ``classUrl_list``.

    Args:
        html: Page source of a listing page, or None when the download
            failed.

    The hrefs found on the page are appended to ``classUrl_list`` as a
    single list (possibly empty). A None page is ignored so that one
    failed download does not abort the crawl.
    """
    if html is None:
        return
    classUrl_list.append(_CLASS_LINK_PATTERN.findall(html))
# Captures, in order: title, difficulty, duration, number of learners and
# overall rating from a course detail page. There is no leading ".*?":
# findall() scans forward by itself and the extra wildcard only adds
# backtracking on pages that do not match.
_COURSE_INFO_PATTERN = re.compile(
    '<div class="title-box">.*?<h1>(.*?)</h1>'
    '.*?<span>难度</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>时长</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>学习人数</span>.*?<span class="nodistance">(.*?)</span>'
    '.*?<span>综合评分</span>.*?<span class="nodistance">(.*?)</span>',
    re.S,
)


def parse_one_page(html, stu_name=None):
    """Yield one dict per course description found in *html*.

    Args:
        html: Page source of a course detail page, or None when the
            download failed.
        stu_name: Optional category label. When given, it is stored under
            the ``"stu_name"`` key as the first entry of every record.

    Yields:
        Dicts with the keys ``title``, ``difficulty``, ``duration``,
        ``stu_number`` and ``comprehensive_evaluation`` (preceded by
        ``stu_name`` when supplied). Nothing is yielded for a None page or
        a page that does not match the expected layout.
    """
    if html is None:
        return
    for title, difficulty, duration, stu_number, evaluation in (
        _COURSE_INFO_PATTERN.findall(html)
    ):
        record = {} if stu_name is None else {'stu_name': stu_name}
        record['title'] = title
        record['difficulty'] = difficulty
        record['duration'] = duration
        record['stu_number'] = stu_number
        record['comprehensive_evaluation'] = evaluation
        yield record
def getClassurl(dict):
    """Download every listing page named in *dict* and collect its course links.

    Args:
        dict: Mapping of category name to an inner mapping of topic name to
            listing-page URL. The parameter name shadows the builtin and is
            kept only so existing keyword callers keep working.

    Returns:
        The module-level ``classUrl_list``, which ``parse_one_classUrl``
        extends with the hrefs of each page it is given.
    """
    for topics in dict.values():
        for listing_url in topics.values():
            html = geturl(listing_url)
            if html is None:
                # Download failed: skip this listing page instead of
                # passing None to the regex parser.
                continue
            parse_one_classUrl(html)
    return classUrl_list
# The hrefs scraped from listing pages are usually site-relative, so they
# are resolved against the site root before being fetched.
Classurladd = []
homeurl='https://coding.imooc.com'


def getaddClassurl(list):
    """Resolve scraped hrefs against ``homeurl`` and collect the full URLs.

    Args:
        list: Iterable of per-page href lists. The parameter name shadows
            the builtin and is kept only for backward compatibility.

    Returns:
        The module-level ``Classurladd`` list, extended with one absolute
        URL per href. ``urljoin`` is used instead of string concatenation
        so that hrefs that are already absolute, or that lack a leading
        slash, still produce a valid URL.
    """
    for pagelist in list:
        Classurladd.extend(urljoin(homeurl, classurl) for classurl in pagelist)
    return Classurladd
def write_to_file(name,content):
    """Append *content* as one JSON line to ``../text/<name>.txt``.

    Args:
        name: Base name of the output file, without extension.
        content: JSON-serialisable object; written with
            ``ensure_ascii=False`` so non-ASCII text stays readable.

    The path is built with ``os.path.join`` so it also works where the
    separator is not a backslash, and the output directory is created on
    first use. The ``with`` block closes the file.
    """
    out_dir = os.path.join('..', 'text')
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, '%s.txt' % name)
    with open(out_path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# Listing pages to crawl, grouped by category. Each outer key is a category
# name and each inner mapping goes from topic name to listing-page URL.

# Front end. NOTE(review): 'vus.js' looks like a typo for 'vue.js'; left
# unchanged because it is runtime data.
dict_qd = {
    '前端': {
        'vus.js': 'https://coding.imooc.com/?c=vuejs',
        'HTML/CSS': 'https://coding.imooc.com/?c=html',
        'JavaScript': 'https://coding.imooc.com/?c=javascript',
        'Node.js': 'https://coding.imooc.com/?c=nodejs',
    },
}

# Back end.
dict_hd = {
    '后端': {
        'java': 'https://coding.imooc.com/?c=java',
        'SpringBoot': 'https://coding.imooc.com/?c=springboot',
        'SpringCloud': 'https://coding.imooc.com/?c=springcloud',
    },
}

# Mobile development.
dict_ydkf = {
    '移动开发': {
        'android': 'https://coding.imooc.com/?c=android',
        'ios': 'https://coding.imooc.com/?c=ios',
        'Reactnative': 'https://coding.imooc.com/?c=reactnative',
    },
}

# Cloud computing and big data.
dict_yun = {
    '云计算大数据': {
        'hadoop': 'https://coding.imooc.com/?c=hadoop',
        '大数据': 'https://coding.imooc.com/?c=bigdata',
        'Spark': 'https://coding.imooc.com/?c=spark',
        'Docker': 'https://coding.imooc.com/?c=docker',
    },
}

# Databases.
dict_db = {
    '数据库': {
        'mysql': 'https://coding.imooc.com/?c=mysql',
        'redis': 'https://coding.imooc.com/?c=redis',
        'mongodb': 'https://coding.imooc.com/?c=mongodb',
    },
}
def main():
    """Crawl the back-end category and append every course record to disk.

    The listing pages in ``dict_hd`` are parsed for course links, each link
    is completed to a full URL, and every course page is downloaded and
    parsed. Records are written through ``write_to_file`` under the name
    ``"dict_hd"``.

    The course pages are downloaded by a small thread pool because the
    work is network-bound; fetching them one after another was the main
    cause of the slow run time. ``executor.map`` returns results in input
    order and all writes happen on the calling thread, so the output order
    matches a sequential crawl.
    """
    url_list = getClassurl(dict_hd)
    course_urls = getaddClassurl(url_list)
    print(classUrl_list)
    # A modest worker count keeps the crawl fast without hammering the site.
    with ThreadPoolExecutor(max_workers=8) as executor:
        for classhtml in executor.map(geturl, course_urls):
            if classhtml is None:
                # Download failed: skip the page instead of handing None
                # to the regex parser.
                continue
            for item in parse_one_page(classhtml):
                write_to_file("dict_hd", item)


if __name__ == '__main__':
    main()
# 我自己写了一个爬取慕课网的 demo，但速度很慢，想请大神帮我改进一下；目前我是手动去改 dict 的值。
# 现在爬到的数据格式是：
# {"title": "Spring Cloud Alibaba 大型互联网领域多场景最佳实践", "difficulty": "中级", "duration": "15小时"}
# 想更改为：
# {"stu_name":"后端","title": "Spring Cloud Alibaba 大型互联网领域多场景最佳实践", "difficulty": "中级", "duration": "15小时"}
# 并且爬取的速度太慢了。
# 求求大神，路过帮孩子想想办法。