已经进行了一些伪装，但是还是不够，希望有经验的哥们给个指点。
# Proxy endpoints to rotate between; one is picked at random per call.
_PROXY_URLS = (
    "https://..........1:8080",
    "https://..........2:8080",
)

# User-Agent strings to rotate between; one is picked at random per call.
# Long entries are split with adjacent-literal concatenation rather than a
# backslash inside the literal, because a backslash continuation embeds the
# next line's leading whitespace into the string.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ '
    '(KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) '
    'Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
)


def google_search(keyword, page):
    """Fetch one Google results page for *keyword* and return its raw HTML.

    A proxy and a User-Agent are chosen at random for every call, and a fresh
    cookie jar is used each time.

    Args:
        keyword: The search query, sent as the ``q`` parameter.
        page: Page number; the request's ``start`` offset is ``page * 10``.

    Returns:
        The response body as returned by ``read()``, or ``''`` if the request
        failed. Failures are reported on stdout and never raised.

    Side effects:
        Installs the opener built here as urllib2's process-wide default.
    """
    proxy_handler = urllib2.ProxyHandler({"https": random.choice(_PROXY_URLS)})
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    opener = urllib2.build_opener(proxy_handler, cookie_handler, urllib2.HTTPHandler)
    # Kept for backward compatibility: other code may rely on the global opener.
    urllib2.install_opener(opener)

    # GOOGLE_POSTFIX is a module-level name defined outside this block
    # (presumably a country-domain suffix -- TODO confirm).
    host = 'www.google.com' + GOOGLE_POSTFIX

    headers = {
        "User-Agent": random.choice(_USER_AGENTS),
        "Referer": "https://www.google.com",
        "Connection": "keep-alive",
        "Accept-Language": "en-US,en;q=0.5",
        # Accept-Encoding is deliberately not sent: the body is read as-is
        # below, with no gzip/deflate decoding step.
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    query = urllib.urlencode({
        'q': keyword,
        'start': page * 10,
        'hl': 'en',
    })
    req = urllib2.Request('https://' + host + '/search?' + query, headers=headers)
    # Host must match the host actually requested, and must not be copied
    # onto a redirect that targets a different host.
    req.add_unredirected_header("Host", host)

    html = ''
    try:
        rsp = opener.open(req)
        try:
            html = rsp.read()
        finally:
            # Release the connection even when read() raises.
            rsp.close()
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be caught first.
        print("The server couldn't fulfill the request.")
        print('Error code:  %s' % (e.code,))
    except urllib2.URLError as e:
        print('We failed to reach a server.')
        print('Reason:  %s' % (e.reason,))
    except ssl.SSLError as e:
        print('The read opertaion timed out')
    except Exception as e:
        # Best-effort boundary: report the error and fall through to return ''.
        print('%s %s' % (Exception, e))
    return html