#! /usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import re

def craw(url, page):
    html1 = urllib2.Request(url)
    response = urllib2.urlopen(html1).read()
    response = str(response)
    pat1 = '<img width="220" height="220" class="err-product" data-img="1" src="//.+?" />'
    imagelist = re.compile(pat1).findall(response)
    x = 1
    for imageurl in imagelist:
        imagename = "D:/手表/" + str(page) + str(x) + ".jpg"
        imagesurl = "http://" + imageurl
        try:
            urllib.urlretrieve(imagesurl, filename=imagename)
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                x += 1
            if hasattr(e, "reason"):
                x += 1
        x += 1

for i in range(2, 6):
    url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
    craw(url, i)
When I run it, it throws this error:
Traceback (most recent call last):
  File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 30, in <module>
    craw(url, i)
  File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 20, in craw
    urllib.urlretrieve(imagesurl, filename = imagename)
  File "C:\Python27\lib\urllib.py", line 91, in urlretrieve
    return _urlopener.retrieve(url, filename, reporthook, data)
  File "C:\Python27\lib\urllib.py", line 237, in retrieve
    fp = self.open(url, data)
  File "C:\Python27\lib\urllib.py", line 205, in open
    return getattr(self, name)(url)
  File "C:\Python27\lib\urllib.py", line 342, in open_http
    h.endheaders(data)
  File "C:\Python27\lib\httplib.py", line 951, in endheaders
    self._send_output(message_body)
  File "C:\Python27\lib\httplib.py", line 811, in _send_output
    self.send(msg)
  File "C:\Python27\lib\httplib.py", line 773, in send
    self.connect()
  File "C:\Python27\lib\httplib.py", line 754, in connect
    self.timeout, self.source_address)
  File "C:\Python27\lib\socket.py", line 553, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 11001] getaddrinfo failed
I ran into this problem while learning Python web scraping; I'd appreciate some help.
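For readers hitting the same trace: [Errno 11001] getaddrinfo failed means Windows could not resolve the host-name part of the URL via DNS, which happens here because pat1 matches the entire <img ...> tag rather than just the image address. A minimal sketch (Python 2, like the code above; the sample tag below is a made-up stand-in for a real match):

# -*- coding: utf-8 -*-
# Minimal sketch (Python 2). The sample tag is a made-up stand-in for what
# pat1 actually matches: the whole <img ...> tag, not just the image URL.
import urlparse

sample_match = '<img width="220" height="220" class="err-product" data-img="1" src="//img.example.com/watch.jpg" />'
imagesurl = "http://" + sample_match   # same concatenation as in craw()
print(imagesurl)                       # clearly not a usable URL

# Everything between "http://" and the next "/" is treated as the host name,
# so the DNS lookup fails with [Errno 11001] getaddrinfo failed.
print(urlparse.urlparse(imagesurl).netloc)

Printing the URL right before urllib.urlretrieve(...) inside the original craw() reveals the same thing.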
9 answers
大蛇王, 2017-12-17 17:16:
import urllib
import urllib2
import re

def craw(url, page):
    html1 = urllib2.Request(url)
    response = urllib2.urlopen(html1).read()
    response = str(response)
    pat1 = '"//.+?" />'
    imagelist = re.compile(pat1).findall(response)
    x = 1
    print(imagelist)
    for imageurl in imagelist:
        imagename = str(page) + str(x) + ".jpg"
        imagesurl = "http://" + imageurl[3:-4]
        print(imagesurl)
        try:
            urllib.urlretrieve(imagesurl, filename=imagename)
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                x += 1
            if hasattr(e, "reason"):
                x += 1
        x += 1

for i in range(2, 6):
    url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
    craw(url, i)

The regex is wrong: what it matches, once concatenated with "http://", is not a valid URL, so the request cannot be opened. When you run into problems like this, print the intermediate values step by step and you will quickly find where things go wrong.
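One concrete way to apply this advice is to put a capture group around just the //... part inside src="...", so re.findall() returns only the protocol-relative image URL and "http:" can be prepended directly. A minimal sketch (Python 2, run against a made-up sample tag; the real JD markup and attribute order may differ):

# Minimal sketch (Python 2): capture only the image URL, not the whole tag.
# The sample HTML below is made up; the real page's attributes may differ.
import re
import urllib

pat = r'<img width="220" height="220" class="err-product" data-img="1" src="(//.+?)"'
html = '<img width="220" height="220" class="err-product" data-img="1" src="//img.example.com/watch.jpg" />'

for x, match in enumerate(re.findall(pat, html), 1):
    imagesurl = "http:" + match        # match already starts with "//"
    print(imagesurl)                   # check the URL before any download
    # urllib.urlretrieve(imagesurl, str(x) + ".jpg")   # uncomment to save the file

Printing imagesurl before calling urllib.urlretrieve(), exactly as the answer suggests, confirms the URL is well formed before any network request is made.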
This answer was accepted by the asker as the best answer.