Ericjohnn 2017-12-12 08:19 采纳率: 0%
浏览 1925
已采纳

在学python爬虫时遇到个问题,求解答

 #! /usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
import urllib2
import re

def craw(url, page):
    html1 = urllib2.Request(url)
    response = urllib2.urlopen(html1).read()
    response = str(response)
    pat1 = '<img width="220" height="220" class="err-product" data-img="1" src="//.+?" />'
    imagelist = re.compile(pat1).findall(response)
    x = 1
    for imageurl in imagelist:
                imagename = "D:/手表/" + str(page) + str(x) + ".jpg"
                imagesurl = "http://" + imageurl
                try:
                    urllib.urlretrieve(imagesurl, filename = imagename)
                except urllib2.URLError as e:
                       if hasattr(e, "code"):
                           x += 1
                       if hasattr(e, "reason"):
                           x += 1
                x += 1

for i in range(2,6):
    url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
    craw(url, i)

运行,开始报错:
Traceback (most recent call last):
  File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 30, in <module>
    craw(url, i)
  File "C:/Users/JIE/Desktop/py file/����ѵ��.py", line 20, in craw
    urllib.urlretrieve(imagesurl, filename = imagename)
  File "C:\Python27\lib\urllib.py", line 91, in urlretrieve
    return _urlopener.retrieve(url, filename, reporthook, data)
  File "C:\Python27\lib\urllib.py", line 237, in retrieve
    fp = self.open(url, data)
  File "C:\Python27\lib\urllib.py", line 205, in open
    return getattr(self, name)(url)
  File "C:\Python27\lib\urllib.py", line 342, in open_http
    h.endheaders(data)
  File "C:\Python27\lib\httplib.py", line 951, in endheaders
    self._send_output(message_body)
  File "C:\Python27\lib\httplib.py", line 811, in _send_output
    self.send(msg)
  File "C:\Python27\lib\httplib.py", line 773, in send
    self.connect()
  File "C:\Python27\lib\httplib.py", line 754, in connect
    self.timeout, self.source_address)
  File "C:\Python27\lib\socket.py", line 553, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 11001] getaddrinfo failed

  • 写回答

9条回答

  • 大蛇王 2017-12-18 01:16
    关注

    import urllib
    import urllib2
    import re

    def craw(url, page):
    html1 = urllib2.Request(url)
    response = urllib2.urlopen(html1).read()
    response = str(response)
    pat1 = '"//.+?" />'
    imagelist = re.compile(pat1).findall(response)
    x = 1
    print(imagelist)
    for imageurl in imagelist:
    imagename = str(page) + str(x) + ".jpg"
    imagesurl = "http://" + imageurl[3:-4]
    print(imagesurl)
    try:
    urllib.urlretrieve(imagesurl, filename = imagename)
    except urllib2.URLError as e:
    if hasattr(e, "code"):
    x += 1
    if hasattr(e, "reason"):
    x += 1

                x += 1
    

    for i in range(2,6):
    url = "https://search.jd.com/Search?keyword=手表%20男&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=1.def.0.V16&wq=手表&page=" + str(i)
    craw(url, i)

    正则不对,匹配后的内容和http://拼接后不是正确的url,所以打不开,碰到这些问题逐步print出来就能找到哪里错了

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(8条)

报告相同问题?

悬赏问题

  • ¥100 为什么这个恒流源电路不能恒流?
  • ¥15 有偿求跨组件数据流路径图
  • ¥15 写一个方法checkPerson,入参实体类Person,出参布尔值
  • ¥15 我想咨询一下路面纹理三维点云数据处理的一些问题,上传的坐标文件里是怎么对无序点进行编号的,以及xy坐标在处理的时候是进行整体模型分片处理的吗
  • ¥15 CSAPPattacklab
  • ¥15 一直显示正在等待HID—ISP
  • ¥15 Python turtle 画图
  • ¥15 stm32开发clion时遇到的编译问题
  • ¥15 lna设计 源简并电感型共源放大器
  • ¥15 如何用Labview在myRIO上做LCD显示?(语言-开发语言)