import aiohttp
import asyncio
import time
import multiprocessing as mp
import requests
from bs4 import BeautifulSoup
import socket
import re
import pprint
# URL pieces for the osu! performance (pp) ranking pages.
header = 'http://osu.ppy.sh/'
middle = 'p/pp/?'
mode = 'm=3' # game mode: 0=standard 1=taiko 2=ctb 3=mania
url = header + middle + mode + '&'
page = [1, 3] # [start page, end page] (inclusive range to crawl)
badRequest = {} # pageNum:resCode
htmls={}
colls={}
#way store in mongoDB : collection: {"_id":"1", "Rank":"1","Player Name":"Jakads","Accuracy":"97.59%","Play Count":""
#"Performance":"17288pp"}
def getPages(pageNum):
    """Fetch one ranking page, retrying up to 3 times on a non-200 response.

    Returns the page HTML (str) on success, the last HTTP status code (int)
    on persistent failure, or None when the request itself raised
    (timeout, connection error, ...).
    """
    global url
    pageUrl = url + 'page=' + str(pageNum)
    try:
        print('开始get网页,pageNum=', pageNum)
        res = requests.get(url=pageUrl, timeout=10)
        print(pageUrl)
        time.sleep(.1)  # small delay to be polite to the server
        # Retry up to 3 times while the response is not 200.
        count = 0
        while (res.status_code != 200 and count <= 3):
            # BUG FIX: the original assigned the new Response object to
            # res.status_code, so the loop never saw a fresh status.
            res = requests.get(url=pageUrl, timeout=10)
            print('restart get')
            count += 1
        if (res.status_code == 200):
            return res.text
        else:
            # Previously unreachable (dead code after an earlier return);
            # restored so persistent failures are reported.
            print('pageNum : ', pageNum, '返回码 : ', res.status_code)
            return res.status_code
    except Exception as e:
        # Network-level failure: report it and signal with None.
        print(e)
        return None
def findTags(html, startNum):
    """Parse one ranking page and store a dict per player row into the
    global `colls`, keyed by overall rank as a string.

    html: page markup returned by getPages()
    startNum: overall rank of the first row on this page (pages hold 50 rows)
    """
    # Column meanings, in cell order; index == position of <td> in the row.
    fields = ("rank", "Player Name", "Accuracy", "Play Count",
              "Performance", "SS", "S", "A")
    soup = BeautifulSoup(html, features='lxml')
    # Row counter spans ALL tables on the page so colls keys never collide
    # (the original reset it per table, overwriting earlier entries).
    row_offset = 0
    for table in soup.findAll('table'):
        for tr in table.tbody.findAll('tr'):
            # Fresh record per row; avoids shadowing the builtin `dict`
            # and the UnboundLocalError the original hit when a row had
            # no first cell.
            record = {}
            for cell_idx, td in enumerate(tr.findAll('td')):
                if cell_idx < len(fields):  # extra cells ignored, as before
                    record[fields[cell_idx]] = td.get_text().strip()
            colls[str(startNum + row_offset)] = record
            row_offset += 1
def writez(msg):
    """Dump *msg* to tmp.txt (UTF-8), replacing any previous contents."""
    out = open('tmp.txt', 'w', encoding='utf-8')
    try:
        out.write(msg)
    finally:
        out.close()
if __name__ == '__main__':
    startTime = time.time()
    # Fetch all pages concurrently in worker processes.
    pool = mp.Pool()
    jobs = [pool.apply_async(getPages, args=(pageNum,))
            for pageNum in range(page[0], page[1] + 1)]
    pool.close()
    pool.join()
    results = [j.get() for j in jobs]
    # Each page holds 50 players; overall rank continues across pages.
    for idx, html in enumerate(results):
        startNum = 1 + idx * 50
        if isinstance(html, str):
            findTags(html, startNum)
        else:
            # getPages returned an int status code or None — record the
            # failure instead of crashing BeautifulSoup on non-HTML input.
            badRequest[page[0] + idx] = html
    pprint.pprint(colls)
    print('花费时间 : ', time.time() - startTime, 's')
    print('ok')