python使用多进程下载网页,结果下载到的内容全部相同,是什么原因?
 import aiohttp
import asyncio
import time
import multiprocessing as mp
import requests
from bs4 import BeautifulSoup
import socket
import re
import pprint

# NOTE: the original symptom — every "page" downloaded identical content —
# was caused by the plain-HTTP URL being redirected by the server to a
# different page (see the accepted answer below).  Use HTTPS directly.
header = 'https://osu.ppy.sh/'
middle = 'p/pp/?'
mode = 'm=3'  # game mode: 0=standard 1=taiko 2=ctb 3=mania
url = header + middle + mode + '&'
page = [1, 3]  # first page .. last page (inclusive)
badRequest = {}  # pageNum -> HTTP status code for failed requests
htmls = {}  # currently unused page cache
colls = {}  # parsed rows keyed by overall rank; intended MongoDB document shape:
# collection: {"_id":"1", "Rank":"1", "Player Name":"Jakads", "Accuracy":"97.59%",
#              "Play Count":"", "Performance":"17288pp"}

def getPages(pageNum):  # fetch one ranking page (with a short politeness delay)
    """Download one ranking page.

    Returns the HTML text on success (also mirrored to tmp.txt via writez),
    the HTTP status code after the retries are exhausted, or None when the
    request raised an exception.
    """
    global url
    page_url = url + 'page=' + str(pageNum)
    try:
        print('开始get网页,pageNum=', pageNum)
        res = requests.get(url=page_url, timeout=10)
        print(page_url)
        time.sleep(.1)  # be polite to the server between requests
        # Retry up to 3 times while the status is not 200.
        # BUG fixed: the original assigned the new Response object to
        # res.status_code (instead of rebinding res) and returned
        # unconditionally on the first loop iteration, so it never
        # actually retried.
        count = 0
        while res.status_code != 200 and count < 3:
            res = requests.get(url=page_url, timeout=10)
            print('restart get')
            count += 1
        if res.status_code == 200:
            writez(res.text)
            return res.text
        print('pageNum : ', pageNum, '返回码 : ', res.status_code)
        return res.status_code
    except Exception as e:
        # Best-effort: report and signal failure to the caller with None.
        print(e)
        return None

def findTags(html, startNum):
    """Parse every ranking <table> in html and store one record per row
    into the global colls dict, keyed by consecutive rank numbers
    starting at startNum.
    """
    # Column order of the ranking table; index == position of the <td>.
    fields = ("rank", "Player Name", "Accuracy", "Play Count",
              "Performance", "SS", "S", "A")
    soup = BeautifulSoup(html, features='lxml')
    # BUG fixed: the original reset this counter per <table>, so rows of a
    # second table silently overwrote the first table's keys in colls.
    sec = 0  # running row counter across all tables
    for table in soup.findAll('table'):
        for tr in table.tbody.findAll('tr'):
            # BUG fixed: the original shadowed the builtin `dict` and
            # raised NameError (or stored a stale record) for a <tr>
            # that contains no <td>.
            record = {}
            for td_sec, td in enumerate(tr.findAll('td')):
                if td_sec < len(fields):
                    record[fields[td_sec]] = td.get_text().strip()
            if record:  # skip rows without any <td>
                colls[str(startNum + sec)] = record
                sec += 1

def writez(msg):
    """Overwrite tmp.txt with msg, encoded as UTF-8."""
    out_file = open('tmp.txt', 'w', encoding='utf-8')
    try:
        out_file.write(msg)
    finally:
        out_file.close()

if __name__ == '__main__':
    startTime = time.time()
    # Fan the page downloads out over a process pool (one task per page).
    pool = mp.Pool()
    jobs = [pool.apply_async(getPages, args=(pageNum,))
            for pageNum in range(page[0], page[1] + 1)]
    pool.close()
    pool.join()
    results = [job.get() for job in jobs]

    # BUG fixed: getPages returns an int status code on HTTP failure and
    # None on exception; the original passed those straight into findTags,
    # which would crash BeautifulSoup.  Parse only actual HTML strings.
    startNum = 1
    for html in results:
        if isinstance(html, str):
            findTags(html, startNum)
        startNum += 50  # each ranking page holds 50 players

    pprint.pprint(colls)
    print('花费时间 : ', time.time() - startTime, 's')
    print('ok')






1个回答

问题解决了:原因是原网站把请求重定向到了一个新页面,程序一直加载到错误的页面,所以每次读取到的内容都相同。

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问