我在学习爬虫时,尝试爬取电影天堂的 URL,但代码执行后一直报下面这个错误,请各位大神帮忙解决。
代码:
#!/usr/bin/python
# -*- encoding: utf-8 -*-
import random
import re
import time
from urllib import request
from urllib.parse import urljoin

from useragent import ua_list
class FilmSky(object):
    """Two-level crawler for a dytt8.net movie listing.

    The first-level (listing) page yields each film's name and the link to
    its detail page; the second-level (detail) page yields the download link.
    """

    # Root against which the (site-relative) detail links are resolved.
    BASE_URL = 'http://www.dytt8.net/'

    # Charset used when the HTTP response does not declare one.
    # NOTE(review): the site is presumed to serve GB2312/GBK pages; GB18030
    # is a superset of both -- confirm against a live response.
    FALLBACK_ENCODING = 'gb18030'

    # Listing page: captures (detail href, film name) per entry.
    LIST_PATTERN = re.compile(
        r'<table width="100%".*?>.*?height="26">.*?'
        r'<a href="(.*?)" class="ulink">(.*?)</a>',
        re.S,
    )

    # Detail page: captures the text of the first anchor inside the download
    # table.  "BORDER_BOTTOM" (the original spelling) is still accepted, and
    # the group is restricted to tag-free text so that it cannot swallow the
    # markup between the table tag and the anchor.
    DETAIL_PATTERN = re.compile(
        r'<table style="BORDER[-_]BOTTOM: .*?".*?>.*?>([^<>]*)</a>',
        re.S,
    )

    def __init__(self):
        # Listing URL template; "{}" is replaced by the page number.
        self.url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'

    def get_page(self, url):
        """Fetch *url* with a random User-Agent and return the decoded HTML.

        Shared by both crawl levels.  The body is decoded with the charset
        declared in the response headers, falling back to
        ``FALLBACK_ENCODING``; undecodable bytes are replaced rather than
        raising ``UnicodeDecodeError``.

        Raises:
            urllib.error.URLError: if the request cannot be completed.
        """
        req = request.Request(
            url=url,
            headers={'User-Agent': random.choice(ua_list)},
        )
        # The context manager closes the connection even if reading fails.
        with request.urlopen(req) as res:
            raw = res.read()
            charset = res.headers.get_content_charset() or self.FALLBACK_ENCODING
        return raw.decode(charset, errors='replace')

    def parse_page(self, html):
        """Extract every film on a listing page together with its download link.

        Args:
            html: Response body of a first-level (listing) page.

        Returns:
            A list with one dict per film, keyed by ``'名称:'`` (name) and
            ``'链接:'`` (download link, ``None`` when the detail page has
            none).  Each dict is also printed as it is produced.
        """
        results = []
        for href, name in self.LIST_PATTERN.findall(html):
            # urljoin avoids the "//" produced by naive concatenation and
            # leaves already-absolute links untouched.
            film_link = urljoin(self.BASE_URL, href)
            # Follow the detail link to obtain the download link.
            download_link = self.parse_two_html(film_link)
            d = {
                '名称:': name.strip(),
                '链接:': download_link
            }
            print(d)
            results.append(d)
        return results

    def parse_two_html(self, film_link):
        """Fetch a detail page and return its first download link.

        Returns ``None`` instead of raising ``IndexError`` when the page
        contains no matching download table.
        """
        two_html = self.get_page(film_link)
        match = self.DETAIL_PATTERN.search(two_html)
        if match is None:
            return None
        return match.group(1).strip()

    def main(self, first_page=1, last_page=1):
        """Crawl listing pages ``first_page``..``last_page`` (inclusive).

        The defaults reproduce the original behaviour of crawling page 1 only.
        """
        for page in range(first_page, last_page + 1):
            html = self.get_page(self.url.format(page))
            self.parse_page(html)
            # Random pause between pages to avoid hammering the server.
            time.sleep(random.randint(1, 3))
            print("第%d页面完成爬虫作业" % page)
if __name__ == '__main__':
    import ssl

    # NOTE(review): this swaps the default HTTPS context factory for the
    # unverified one, i.e. certificate verification is disabled for the
    # whole process.  Kept as in the original; reconsider before reuse.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Time the whole crawl and report the elapsed seconds.
    began_at = time.time()
    crawler = FilmSky()
    crawler.main()
    finished_at = time.time()
    elapsed = finished_at - began_at
    print("执行时间为:%.2f" % elapsed)
遇到的报错信息:
/Users/apple/PycharmProjects/spider/venv/bin/python /Users/apple/PycharmProjects/spider/day02/06_film_sky_two_urls.py
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1319, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1230, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1276, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1225, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1004, in _send_output
self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 944, in send
self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1399, in connect
self.sock = self._context.wrap_socket(self.sock,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
OSError: [Errno 0] Error
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/apple/PycharmProjects/spider/day02/06_film_sky_two_urls.py", line 78, in <module>
spider.main()
File "/Users/apple/PycharmProjects/spider/day02/06_film_sky_two_urls.py", line 67, in main
html = self.get_page(url)
File "/Users/apple/PycharmProjects/spider/day02/06_film_sky_two_urls.py", line 35, in get_page
res = request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
response = self.parent.error(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
result = self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1362, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1322, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 0] Error>
Process finished with exit code 1