尝试爬取新浪首页新闻到本地 程序报错
源码为:
import urllib.request,re
url="https://www.sina.com.cn/"
req=urllib.request.Request(url)
req.add_header("User-Agent","马赛克")
pat1='.*?'
data1=urllib.request.urlopen(req).read().decode("UTF-8","ignore")
allink=re.compile(pat1).findall(data1)
for i in range(0,len(allink)):
thislink=allink[i]
pat2=''
req2=urllib.request.Request(url)
req2.add_header("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:65.0) Gecko/20100101 Firefox/65.0")
thispage=urllib.request.urlopen(req2).read().decode("UTF-8","ignore")
isframe=re.compile(pat2).findall(thispage)
if len(isframe)==0:
urllib.request.urlretrieve(thislink,"data/"+str(i)+".html")
else:
flink=isframe[0]
urllib.request.urlretrieve(flink,"data/"+str(i)+".html")
报错信息:
Traceback (most recent call last):
File "/Users/tanzhouyan/Desktop/python/新闻爬虫.py", line 73, in
urllib.request.urlretrieve(thislink,"data/"+str(i)+".html")
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 247, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 548, in _open
'unknown_open', req)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/urllib/request.py", line 1387, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError:
在网上一直没有找到解决方法,谢谢大家~