2 collinswangruiqi collinswangruiqi 于 2016.09.11 20:41 提问

使用python爬虫,人工填写验证码总是报验证码错误

用爬虫获取验证码图片并下载到本地,打开图片后人工输入验证码再提交,但每次都提示验证码错误。我发现提交的 URL 编码参数中字典项的顺序不一样,会跟这个有关吗?还是有其他什么问题?

 #-*- coding:utf8 -*-
import cStringIO
import cookielib
import random
import re
import urllib
import urllib2
from time import localtime, strftime, time

import win32api
import win32con
import win32gui
from PIL import Image
#import time


def get_secret_number():
    """Download the CAPTCHA image, display it, and return what the user types.

    The image is fetched from a hard-coded sojump.com ``BotDetectCaptcha.ashx``
    URL, saved as ``E:/temp/1.bmp`` and opened with ``Image.show()``.  The
    function then blocks on ``raw_input()``, asks every visible top-level
    window whose title contains ``'tmp'`` to close, and returns the typed text.

    Returns:
        The line entered by the user, exactly as typed (no validation).

    Raises:
        urllib2.URLError: if the image request fails.
        IOError: if the payload is not a readable image or the BMP file
            cannot be written (for example when ``E:/temp`` does not exist).
    """
    def handle_window(hwnd, extra):
        # EnumWindows callback: post WM_CLOSE to visible windows titled '*tmp*'.
        # Presumably this targets the preview opened by Image.show(), which
        # displays a temporary file -- verify on the target machine, because
        # any other visible window with 'tmp' in its title is closed as well.
        if win32gui.IsWindowVisible(hwnd) and 'tmp' in win32gui.GetWindowText(hwnd):
            win32gui.PostMessage(hwnd, win32con.WM_CLOSE, 0, 0)

    # NOTE(review): the `t` and `d` query values are fixed snapshots copied
    # from one browser session; confirm the server still accepts them.
    getCode_url = "http://www.sojump.com/BotDetectCaptcha.ashx?activity=4738641&get=image&c=DesignerInitializedCaptcha&t=ba800caa26be43d28d05e3cad930cd44&d=1471997781489"
    header = {
        'Host': "www.sojump.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
        'Cache-Control': "max-age=0",
    }
    request = urllib2.Request(getCode_url, headers=header)

    # Close the HTTP response explicitly; it was previously left open until
    # garbage collection.
    response = urllib2.urlopen(request)
    try:
        res = response.read()
    finally:
        response.close()

    # Convert whatever format the server sent into a BMP on disk.
    image = Image.open(cStringIO.StringIO(res))
    image.save('E:/temp/1.bmp', 'BMP')
    image.close()

    # Re-open the saved copy and hand it to the default image viewer.
    l_image = Image.open('E:/temp/1.bmp')
    l_image.show()

    # Parenthesised single-argument form prints identically on Python 2.
    print('Please input what you saw!')

    m = raw_input()

    # Tidy up the viewer window(s) once the code has been entered.
    win32gui.EnumWindows(handle_window, None)
    return m

def gen_uri_param(curID, rn):
    """Build the query-string parameters for the answer submission.

    Calls ``get_secret_number()`` first, so this function blocks until the
    user has typed the CAPTCHA.  The typed text is sent under both
    ``validate_text`` and ``btuserinput``.

    Args:
        curID: questionnaire id, stored under ``'curID'``.
        rn: random number token, stored under ``'rn'``.

    Returns:
        A dict of string keys to values, ready for ``urllib.urlencode``.
    """
    captcha_text = get_secret_number()

    # Millisecond timestamp is taken before the formatted start time.
    millis = str(int(time() * 1000))
    started_at = strftime("%Y/%m/%d %H:%M:%S", localtime())

    return {
        'submittype': '1',
        't': millis,
        'validate_text': captcha_text,
        'starttime': started_at,
        'rn': rn,
        'curID': curID,
        'btuserinput': captcha_text,
        # NOTE(review): this key ends in lowercase "ld", not "Id" -- confirm
        # the spelling the server expects.
        'btcaptchald': 'DesignerInitializedCaptcha',
        # NOTE(review): sent as the literal 'd'; confirm whether the server
        # expects the CAPTCHA instance token here instead.
        'btinstanceId': 'd',
    }

def gen_post_string(answer):
    """Serialise answer pairs into the ``submitdata`` payload string.

    Each item contributes ``"<item[0]>$<item[1]>"`` (both converted with
    ``str``), and the pieces are joined with ``}``.

    Args:
        answer: iterable of indexable pairs, e.g. ``[(1, 1), (2, 3)]``.

    Returns:
        The joined string, or ``''`` when ``answer`` is empty.
    """
    return '}'.join(
        str(item[0]) + '$' + str(item[1]) for item in answer
    )


# Script body: load the questionnaire page, scrape the tokens it embeds,
# build the answer payload, ask the user for the CAPTCHA and submit.

jq_url = "http://www.sojump.com/jq/4738641.aspx"
jq_base = "http://www.sojump.com/jq/{}.aspx"
uri_base = "http://www.sojump.com/handler/processjq.ashx?{}"

# Route every urllib2.urlopen() call in this process through one cookie jar so
# that the questionnaire page, the CAPTCHA image request and the final
# submission all carry the same cookies.  Without it each request is sent
# cookie-less; presumably the server then cannot match the typed code to the
# image it issued -- TODO confirm against the site's session handling.
urllib2.install_opener(
    urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar())))

response = urllib2.urlopen(jq_url)
try:
    text = response.read()
    final_url = response.geturl()  # URL after any redirects
finally:
    response.close()

# Tokens scraped from the page; raw strings keep the regex escapes literal.
rndnum = re.search(r'rndnum="([0-9.]+)"', text).group(1)
curID = re.search(r'(\d+)\.aspx', final_url).group(1)
# The highest "divN" on the page is taken as the number of questions.
jq_sum = int(re.findall(r'div(\d+)', text)[-1])

# zip() stops at the shorter sequence, so at most len(answer_list) questions
# receive an answer even when the page has more.
answer_list = [1, 1, 1, 1, 1, 1, 1, 1]
answer = zip(range(1, jq_sum + 1), answer_list)

post_data = urllib.urlencode({'submitdata': gen_post_string(answer)})
# gen_uri_param() prompts for the CAPTCHA, so this line blocks on user input.
get_data = urllib.urlencode(gen_uri_param(curID, rndnum))
print(get_data)

request_url = uri_base.format(get_data)

# No 'method' entry here: it would only add a meaningless "Method" HTTP
# header.  urllib2 issues a POST automatically because a body is supplied.
header = {
    'Host': "www.sojump.com",
    'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
    'Cache-Control': "max-age=0",
}

request = urllib2.Request(request_url, post_data, headers=header)
result = urllib2.urlopen(request)
try:
    print(result.read())
finally:
    result.close()

1个回答

caozhy
caozhy   Ds   Rxr 2016.09.11 23:20

代码就不看了。用 Fiddler 调试一下,看看有没有多次请求验证码图片:每请求一次图片,验证码就会改变。

Csdn user default icon
上传中...
上传图片
插入图片