python 爬虫,爬取google搜索结果,爬一段时间就被禁掉了,怎么破?

已经进行了一些伪装,但是还是不够,希望有经验的哥们给个指点

def google_search(keyword,page):
proxy_handler1 = urllib2.ProxyHandler({"https":"https://..........1:8080"})
proxy_handler2 = urllib2.ProxyHandler({"https":"https://..........2:8080"})
proxys = [proxy_handler1,proxy_handler2]
proxy_choice = random.randint(0, 1)
proxy_handler = proxys[proxy_choice]
cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(proxy_handler, cookie_handler, urllib2.HTTPHandler)
urllib2.install_opener(opener)

user_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ \
    (KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94', 'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) \
    Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) \
    Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)']
index = random.randint(0,9)
user_agent = user_agents[index]
headers = {
    "User-Agent":user_agent, #"Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Referer":"https://www.google.com",
    "Host":"www.google.com",
    "Connection":"keep-alive",
    "Accept-Language":"en-US,en;q=0.5",
    #"Accept-Encoding":"gzip, deflate",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
}

url = 'https://www.google.com' + GOOGLE_POSTFIX + '/search?'
values = {
    'q':keyword,
    'start':page*10,
    'hl':'en'
}
data = urllib.urlencode(values)
req = urllib2.Request(url+data, headers=headers)
html = ''
try:
    rsp = urllib2.urlopen(req)
    html = rsp.read()
except urllib2.HTTPError, e:
    print 'The server couldn\'t fulfill the request.'
    print 'Error code: ', e.code
except urllib2.URLError, e:
    print 'We failed to reach a server.'
    print 'Reason: ', e.reason
except ssl.SSLError,e:
    print 'The read opertaion timed out'
except Exception,e:
    print Exception,e
else:
    pass

return html
查看全部
u014237228
矢寻life
2015/07/01 06:23
  • 爬虫
  • python
  • 点赞
  • 收藏
  • 回答
    私信

2个回复