在学python爬虫时遇到个问题,求解答
 #! /usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import re

def craw(url, page):
    html1 = urllib2.Request(url)
    response = urllib2.urlopen(html1).read()
    response = str(response)
    pat1 = '<img width="220" height="220" class="err-product" data-img="1" src="//.+?" />'
    imagelist = re.compile(pat1).findall(response)
    x = 1
    for imageurl in imagelist:
                imagename = "D:/手表/" + str(page) + str(x) + ".jpg"
                imagesurl = "http://" + imageurl
                try:
                    urllib.urlretrieve(imagesurl, filename = imagename)
                except urllib2.URLError as e:
                       if hasattr(e, "code"):
                           x += 1
                       if hasattr(e, "reason"):
                           x += 1
                x += 1

for i in range(2,6):
    url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
    craw(url, i)

运行,开始报错:
Traceback (most recent call last):
  File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 30, in <module>
    craw(url, i)
  File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 20, in craw
    urllib.urlretrieve(imagesurl, filename = imagename)
  File "C:\Python27\lib\urllib.py", line 91, in urlretrieve
    return _urlopener.retrieve(url, filename, reporthook, data)
  File "C:\Python27\lib\urllib.py", line 237, in retrieve
    fp = self.open(url, data)
  File "C:\Python27\lib\urllib.py", line 205, in open
    return getattr(self, name)(url)
  File "C:\Python27\lib\urllib.py", line 342, in open_http
    h.endheaders(data)
  File "C:\Python27\lib\httplib.py", line 951, in endheaders
    self._send_output(message_body)
  File "C:\Python27\lib\httplib.py", line 811, in _send_output
    self.send(msg)
  File "C:\Python27\lib\httplib.py", line 773, in send
    self.connect()
  File "C:\Python27\lib\httplib.py", line 754, in connect
    self.timeout, self.source_address)
  File "C:\Python27\lib\socket.py", line 553, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 11001] getaddrinfo failed

9个回答

import urllib
import urllib2
import re

def craw(url, page):
html1 = urllib2.Request(url)
response = urllib2.urlopen(html1).read()
response = str(response)
pat1 = '"//.+?" />'
imagelist = re.compile(pat1).findall(response)
x = 1
print(imagelist)
for imageurl in imagelist:
imagename = str(page) + str(x) + ".jpg"
imagesurl = "http://" + imageurl[3:-4]
print(imagesurl)
try:
urllib.urlretrieve(imagesurl, filename = imagename)
except urllib2.URLError as e:
if hasattr(e, "code"):
x += 1
if hasattr(e, "reason"):
x += 1

            x += 1

for i in range(2,6):
url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
craw(url, i)

正则不对,匹配后的内容和http://拼接后不是正确的url,所以打不开,碰到这些问题逐步print出来就能找到哪里错了

Ericjohnn
Ericjohnn 谢谢你啊,已经解决了
2 年多之前 回复
Ericjohnn
Ericjohnn 的确是我正则表达式写错了,但是像你这样改的话,爬出来的东西就不仅仅是手表了
2 年多之前 回复

地址和端口号,估计你的地址有问题

先打印一下imagesurl,估计它的地址有问题。或者你的网络不能访问这个URL

1、你提取图片网址的正则表达式有问题,你得到的列表应该img标签的列表,而不是网址的列表,应该修改为:pat1 = '',加一个group分组
2、中文目录需要处理一下:imagename=unicode(imagename)
3、小问题,你的编码设置好像有点问题

timezhourenquan
timezhourenquan pat1 = '<img width="220" height="220" class="err-product" data-img="1" src="(//.+?)" />'他自动转为网址了
2 年多之前 回复

网络可能不可以访问这个URL

1、你提取图片网址的正则表达式有问题,你得到的列表应该img标签的列表,而不是网址的列表,应该修改为:pat1 = \'\',加一个group分组,我测试应该是可以的
2、中文目录需要处理一下:imagename=unicode(imagename)
3、小问题,你的编码设置好像有点问题

首先, python2里面中文字符串要带u开头, 你url就没弄好
其次, 为什么不用requests库, 原生的urllib 默认不带自动urlencode功能,中文可能没encode成百分号那种编码

本人刚学python 的爬虫,就去爬京东的手表图片练手,然后一直报错,代码,错误都在上面了,求解决呀

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问