#encoding='gbk'
import requests
import re
url='https://mp.weixin.qq.com/s/1hdWEusrm0MJZexxC_OSaw'
head={"User-Agent": "Mozilla/5.0 "}
def get_text(response):
# 优先使用 chardet 预测的 encoding, 其次使用 http header 提供的 encoding
source_encoding = response.apparent_encoding or response.encoding
if source_encoding is None:
# 说明是二进制文件
raise Exception
elif source_encoding == 'GB2312':
source_encoding = 'GBK'
return response.content.decode(source_encoding, errors="ignore")
response=requests.get(url,headers=head)
html=get_text(response)
with open('E:/Python文件/爬虫/高一新教材/html.txt','w',encoding='GBK') as f:
f.write(html)
pattern = re.compile(r'data-src="(.*)"', flags=re.S)
image_url=pattern.findall(str(response.content.decode('gbk')))
print(str(image_url))
上面是我的代码,应该有很多的错误,还请指出
这是我要爬的网址:https://mp.weixin.qq.com/s/1hdWEusrm0MJZexxC_OSaw
如果有更好的方案,还请给出