import aiohttp
import asyncio
import time
import multiprocessing as mp
import requests
from bs4 import BeautifulSoup
import socket
import re
import pprint
import os
import pymongo
# Module-level configuration and shared state for the rankings scraper.
# Base URL; getPages() appends the page number and the '#scores' anchor.
url = 'https://osu.ppy.sh/rankings/mania/performance?page='#+pageNum+'#scores'
page = [1, 5] # [start page, end page], inclusive
badRequest = {} # pageNum:resCode -- NOTE(review): never populated in this file; confirm intent
htmls=[] # NOTE(review): unused in this file; confirm intent
colls={} # global rank-number (str) -> scraped-record dict, filled by findTags()
headers={'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding':'gb2312,utf-8',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection':'Keep-alive'
}
# Document shape stored in MongoDB, e.g.:
# {"_id":"1", "Rank":"1", "Player Name":"Jakads", "Accuracy":"97.59%",
#  "Play Count":"...", "Performance":"17288pp"}
async def getPages(pageNum):
    """Fetch one rankings page, retrying up to 3 times on non-200 status.

    Returns {str(pageNum): html_text} on success,
            {str(pageNum): status_code} after exhausting retries,
            None if the request raised (timeout, connection error, ...).

    Sleeps 5 s inside the request as crude rate limiting.
    """
    full_url = url + str(pageNum) + '#scores'
    async with aiohttp.ClientSession() as session:
        try:
            print('开始get网页,pageNum=', pageNum)
            async with session.get(full_url, headers=headers, timeout=10) as res:
                print(full_url)
                await asyncio.sleep(5)  # throttle before consuming the body
                txt = await res.text()
                resCode = res.status
            # Retry on non-200; the original re-issued session.get() without a
            # context manager, leaking the response object each time.
            count = 0
            while resCode != 200 and count <= 3:
                async with session.get(full_url, headers=headers, timeout=10) as res:
                    resCode = res.status
                    txt = await res.text()
                print('restart get')
                count += 1
            if resCode == 200:
                print(str(pageNum) + ' done')
                return {str(pageNum): txt}
            print('pageNum : ', pageNum, '返回码 : ', resCode)
            return {str(pageNum): resCode}
        except Exception as e:
            print(e)
            return None
def findTags(html, startNum):
    """Parse the rankings table(s) in `html` into the global `colls` dict.

    Each <tr> becomes one record keyed by its absolute rank number
    (startNum, startNum+1, ...). Columns map positionally to FIELDS;
    extra columns are ignored.

    Fixes: the original shadowed the builtin `dict` and raised NameError
    for a row with no <td> cells (the record variable was never bound).
    """
    FIELDS = ("rank", "Player Name", "Accuracy", "Play Count",
              "Performance", "SS", "S", "A")
    soup = BeautifulSoup(html, features='lxml')
    for table in soup.findAll('table'):
        sec = 0  # row offset within this table
        for tr in table.tbody.findAll('tr'):
            record = {}
            for td_sec, td in enumerate(tr.findAll('td')):
                if td_sec < len(FIELDS):
                    record[FIELDS[td_sec]] = td.get_text().strip()
            colls[str(startNum + sec)] = record
            sec += 1
def writez(col):
    """Dump scraped records to tmp.txt (UTF-8), one 'field : value' line each.

    Opens with mode 'w', which truncates an existing file in one step —
    replaces the original delete-then-append sequence (racy and unnecessary).
    """
    with open('tmp.txt', 'w', encoding='utf-8') as f:
        for record in col.values():
            for field, value in record.items():
                f.write(field + " : " + value + '\n')
def mongoConnection():
    """Connect to the local MongoDB and return the `osu.rank` collection."""
    client = pymongo.MongoClient('127.0.0.1', 27017)
    return client.osu.rank
def mongoCreateIndex(connect):
    """Create a unique ascending index on 'rank'; return the index name."""
    return connect.create_index([('rank', pymongo.ASCENDING)], unique=True)
def mongoInsert(col, connect):
    """Bulk-insert every record in `col`, using its key as the Mongo _id.

    Note: intentionally mutates the caller's record dicts by adding "_id",
    matching the original behavior.
    """
    payload = []
    for doc_id, doc in col.items():
        doc["_id"] = doc_id
        payload.append(doc)
    return connect.insert_many(payload)
def mongoCheckDuplicate(col, connect):
    """Delete any existing document whose 'rank' matches a scraped record.

    Bug fix: the original built {"rank": value} for EVERY field value of each
    record (player name, play count, ...), so a document could be deleted
    whenever its rank happened to equal e.g. another record's play count.
    Only each record's own "rank" field is queried now.
    """
    for record in col.values():
        rank = record.get("rank")
        if rank is None:
            continue  # malformed record; nothing to match on
        query = {"rank": rank}
        if connect.find_one(query) is not None:
            connect.delete_one(query)
    print('check Duplicate ok')
if __name__ == '__main__':  # fix: was `name`, a NameError at runtime
    startTime = time.time()
    loop = asyncio.get_event_loop()
    # Fan out one fetch task per requested page.
    tasks = [asyncio.ensure_future(getPages(p))
             for p in range(page[0], page[1] + 1)]
    done, _pending = loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    # Collect results; getPages returns None on hard failure.
    results = {}
    for fut in done:
        if fut.result() is not None:
            for k, v in fut.result().items():
                results[str(k)] = str(v)
    osu = mongoConnection()
    # Parse each fetched page; 50 players per page.
    startNum = 1
    for h in range(page[0], page[1] + 1):
        if str(h) in results:  # skip pages whose fetch failed outright
            findTags(results[str(h)], startNum)
        startNum += 50
    # Drop stale documents with the same rank, then insert the fresh batch.
    mongoCheckDuplicate(colls, osu)
    try:
        res = mongoInsert(colls, osu)
        print('insert res:', res)
    except Exception as e:
        print(e)
    # index creation (run once if needed):
    # try:
    #     res = mongoCreateIndex(osu)
    #     print('index res:', res)
    # except Exception as e:
    #     print(e)
    print('花费时间 : ', time.time() - startTime, 's')
    print('ok')
# NOTE (author's question, converted to a comment so the file parses):
# "With session.get() the status code is always 403, but requests.get() fetches
# the same page fine — I suspect I was crawling too fast and got blocked. Why
# does requests still work, and how can I rate-limit properly? I already tried
# await asyncio.sleep() and aiohttp.TCPConnector(limit=4) without much effect."