pikechuz 2018-11-20 07:48 采纳率: 0%
浏览 2140

python使用asyncio+aiohttp加载速度过快 如何进行限速

import aiohttp
import asyncio
import time
import multiprocessing as mp
import requests
from bs4 import BeautifulSoup
import socket
import re
import pprint
import os
import pymongo

# Base ranking URL; the page number and a '#scores' anchor are appended per request.
url = 'https://osu.ppy.sh/rankings/mania/performance?page='#+pageNum+'#scores'
page = [1, 5] # [first page, last page] to crawl, inclusive
badRequest = {} # pageNum -> HTTP status code for failed fetches
htmls=[]
colls={}  # overall rank number (str) -> player-stats dict; filled by findTags
headers={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding':'gb2312,utf-8',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection':'Keep-alive'
}
# Document shape stored in MongoDB, e.g.:
# {"_id":"1", "rank":"1", "Player Name":"Jakads", "Accuracy":"97.59%",
#  "Play Count":"...", "Performance":"17288pp"}

async def getPages(pageNum):
    """Fetch one ranking page, throttled, retrying up to 3 times on non-200.

    Returns {str(pageNum): html_text} on success,
    {str(pageNum): status_code} after retries are exhausted,
    or None if an exception occurred.
    """
    global url
    # limit=4 caps concurrent connections — but only if the connector is
    # actually passed to the session. The original created it and never
    # used it, which is why the rate limit appeared to have no effect.
    # Headers are also set on the session (per-request headers= was not
    # being applied reliably, per the accepted answer).
    conn = aiohttp.TCPConnector(limit=4)
    target = url + str(pageNum) + '#scores'
    async with aiohttp.ClientSession(connector=conn, headers=headers) as session:
        try:
            print('开始get网页,pageNum=', pageNum)
            async with session.get(target, timeout=10) as res:
                print(target)
                await asyncio.sleep(5)  # crude throttle: hold each task ~5 s
                txt = await res.text()
                resCode = res.status
            # Retry up to 3 more times on a non-200 response; `async with`
            # ensures each retry's response is properly released.
            count = 0
            while resCode != 200 and count <= 3:
                async with session.get(target, timeout=10) as res:
                    resCode = res.status
                    txt = await res.text()
                print('restart get')
                count += 1
            if resCode == 200:
                print(str(pageNum) + ' done')
                return {str(pageNum): txt}
            print('pageNum : ', pageNum, '返回码 : ', resCode)
            return {str(pageNum): resCode}
        except Exception as e:
            print(e)
            return None

def findTags(html, startNum):
    """Parse every ranking table in *html* and store one dict per player row
    into the module-level ``colls``, keyed by overall rank number starting at
    *startNum* (reset per table, matching the original behaviour).

    Each row dict maps column names ("rank", "Player Name", ...) to the
    stripped cell text; cells beyond the 8 known columns are ignored.
    """
    # Column order of the osu! ranking table. Indexing by position replaces
    # the original 8-branch elif chain and avoids shadowing builtin `dict`.
    field_names = ("rank", "Player Name", "Accuracy", "Play Count",
                   "Performance", "SS", "S", "A")
    soup = BeautifulSoup(html, features='lxml')
    for table in soup.findAll('table'):
        sec = 0  # row position within this table
        for tr in table.tbody.findAll('tr'):
            # Fresh dict per row: the original reused the previous row's dict
            # (or raised NameError) when a row had no <td> cells.
            row = {}
            for td_sec, td in enumerate(tr.findAll('td')):
                if td_sec < len(field_names):
                    row[field_names[td_sec]] = td.get_text().strip()
            colls[str(startNum + sec)] = row
            sec += 1

def writez(col):
    """Dump every field of every player dict in *col* to tmp.txt,
    one "field : value" line per field.

    Opening in 'w' mode truncates any existing file, replacing the
    original's remove-then-append dance in a single step.
    """
    with open('tmp.txt', 'w', encoding='utf-8') as f:
        for player in col.values():
            for field, value in player.items():
                f.write(field + " : " + value + '\n')

def mongoConnection():
    """Connect to the local MongoDB and return the ``osu.rank`` collection."""
    client = pymongo.MongoClient('127.0.0.1', 27017)
    return client.osu.rank

def mongoCreateIndex(connect):
    """Create a unique ascending index on ``rank`` and return its name."""
    return connect.create_index([('rank', pymongo.ASCENDING)], unique=True)

def mongoInsert(col, connect):
    """Bulk-insert every player dict in *col* into *connect*, using each
    dict's key in *col* as the MongoDB ``_id``. Returns the insert result.

    Note: mutates the dicts in *col* in place by adding ``_id`` (same as
    the original implementation).
    """
    docs = []
    for doc_id, player in col.items():
        player["_id"] = doc_id
        docs.append(player)
    return connect.insert_many(docs)

def mongoCheckDuplicate(col,connect):
    # Delete any pre-existing documents that would collide with the batch in
    # *col* before re-inserting it.
    # NOTE(review): the inner loop matches the "rank" field against EVERY
    # field value of each player dict (name, accuracy, play count, ...), not
    # just the rank itself — presumably only v["rank"] was intended; verify
    # before relying on this.
    for k,v in col.items():
        for k2,v2 in v.items():
            dictz={"rank":v2}
            result=connect.find_one(dictz)
            if(result!=None):
                res=connect.delete_one(dictz)
    print('check Duplicate ok')

if __name__ == '__main__':
    # Bug fix: the original guard was `if name=='__main__':` — `name` is
    # undefined at module level, so the script raised NameError on run.

    startTime = time.time()

    loop = asyncio.get_event_loop()

    tasks = []
    results = {}  # page number (str) -> page HTML (or status-code string on failure)

    # Schedule one fetch task per page. (The original also built an unused
    # TCPConnector here; the connector belongs inside getPages' session.)
    for pageNum in range(page[0], page[1] + 1):
        tasks.append(asyncio.ensure_future(getPages(pageNum)))

    # asyncio.wait returns a (done, pending) pair; iterate both sets.
    finished = loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

    for future_set in finished:
        for fut in future_set:
            if fut.result() is not None:
                for k, v in fut.result().items():
                    results[str(k)] = str(v)

    osu = mongoConnection()

    # Parse each fetched page; every ranking page lists 50 players.
    startNum = 1
    for h in range(page[0], page[1] + 1):
        # Guard: a page whose fetch raised returns None and never lands in
        # `results`; the original indexed unconditionally and could KeyError.
        if str(h) in results:
            findTags(results[str(h)], startNum)
        startNum += 50

    # Remove documents already in the DB that would duplicate the new batch.
    mongoCheckDuplicate(colls, osu)

    try:
        res = mongoInsert(colls, osu)
        print('insert res:', res)
    except Exception as e:
        print(e)

    # A unique index on "rank" can be created via mongoCreateIndex(osu).

    print('花费时间 : ', time.time() - startTime, 's')
    print('ok')

代码如上,,当我使用session.get()时返回码一直为403,换requests.get()就能正常获取网页..初步怀疑是之前爬的太快了被封号了。。但是为什么用requests还能获取呢?有什么办法限速吗 (我用过await asyncio.sleep(),aiohttp.TCPConnector(limit=4))并没有很好的效果。

  • 写回答

1条回答 默认 最新

  • lyhsdy 2018-11-21 01:24
    关注

    session.get用headers=headers是没有加入headers的,我也不知道为什么,session.get的话用update headers的方式就可以了

    
    s=requests.session()
    s.headers.update(headers)
    s.get(url=url +str(pageNum)+'#scores',timeout=10)
    
    评论

报告相同问题?

悬赏问题

  • ¥50 树莓派安卓APK系统签名
  • ¥15 maple软件,用solve求反函数出现rootof,怎么办?
  • ¥65 汇编语言除法溢出问题
  • ¥15 Visual Studio问题
  • ¥15 state显示变量是字符串形式,但是仍然红色,无法引用,并显示类型不匹配
  • ¥20 求一个html代码,有偿
  • ¥100 关于使用MATLAB中copularnd函数的问题
  • ¥20 在虚拟机的pycharm上
  • ¥15 jupyterthemes 设置完毕后没有效果
  • ¥15 matlab图像高斯低通滤波