问题遇到的现象和发生背景
我想分析各游戏分类(益智类、冒险类等)分别占了多少比例,
或者统计各个游戏平台的游戏数量,打算用 Hive 来分析。
分析完成后,应该怎样把结果导入到数据库?
就是想分析游戏分类(益智类 冒险之类)占了多少
或者游戏平台各个数量 用hive分析
分析完后怎么导入到数据库
你之前采集的数据有错误,建议改用 Excel 存储,这样数据更完整;先用 pandas 读取 Excel 中的数据并完成分析,再用 Hive 将结果存储至数据库。目前不清楚你要把数据分析成什么格式。下面是在你之前的源代码基础上优化后的版本,你可以自己看看,再整理出需要的格式。
import pandas as pd
import requests
# Module-level slot that main() rebinds (via ``global data``) to the empty
# DataFrame it creates; it stays None until main() has run.
data = None
def getOnepage(url, timeout=10):
    """Fetch *url* with browser-like headers and return the decoded JSON body.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed from JSON (not raw text).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # NOTE(review): this Cookie embeds a logged-in session; it should be
    # loaded from configuration or the environment rather than committed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Cookie': 'bid=NX8M2NI7rfg; douban-fav-remind=1; __yadk_uid=TEJSv3vlFpxrnShgBGXWW51qExiqLCiD; __gads=ID=35ed214a9a0f04f3-229b40ae8cc5003e:T=1609826897:RT=1609826897:S=ALNI_MbjrOjiMxJC6bra_BWqa1z6LwJvFA; ll="118267"; viewed="1007305"; gr_user_id=38f71f7e-49a3-4463-a3e9-dd3f77625dad; _ga=GA1.2.1355032985.1609826898; _vwo_uuid_v2=D6A8A09C6232AC6A436C3775284DBE348|dadd79d0f01552308d196454329600a7; dbcl2="247324733:YcdnDsCblB0"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.24732; ck=MMTN; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1637395735%2C%22https%3A%2F%2Fwww.gameres.com%2F%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1355032985.1609826898.1637309781.1637395736.17; __utmc=30149280; __utmz=30149280.1637395736.17.14.utmcsr=gameres.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; _pk_id.100001.8cb4=b87f2f239f7dd7c8.1609826897.15.1637395743.1637309781.; __utmb=30149280.4.10.1637395736'
    }
    # Send the request and wait (at most `timeout` seconds) for the response.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
def parseOnepage(res):
    """Parse one page of search results into a list of flat row dicts.

    Args:
        res: Decoded JSON for one page. Its ``'games'`` entry is read as a
            list of dicts with the keys ``title``, ``star``, ``rating``,
            ``platforms``, ``n_ratings``, ``genres`` and ``review``.

    Returns:
        A list with one dict per game. Keys are inserted in the order
        ``name, star, rating, platforms, n_ratings, genres, content``;
        string values have every ``'\\r\\n'`` sequence removed and
        ``n_ratings`` is converted with ``str()``. A missing or empty
        ``'games'`` entry yields an empty list.

    Raises:
        KeyError: If a game lacks one of the fields other than ``review``.
    """
    rows = []
    # Tolerate a page without a 'games' list instead of raising KeyError.
    for game in res.get('games') or []:
        # A missing or null review would otherwise raise here; store empty
        # content instead (assumes the API can omit the review — TODO confirm).
        review = game.get('review') or {}
        rows.append({
            'name': game['title'].replace('\r\n', ''),
            'star': game['star'].replace('\r\n', ''),
            'rating': game['rating'].replace('\r\n', ''),
            'platforms': game['platforms'].replace('\r\n', ''),
            'n_ratings': str(game['n_ratings']),
            'genres': game['genres'].replace('\r\n', ''),
            'content': (review.get('content') or '').replace('\r\n', ''),
        })
    return rows
def savaData(item):
    """Append the parsed rows in *item* to ``douban.xlsx``.

    The workbook is read, all rows are appended in one batch and the file is
    written back without the index column. Building the batch once avoids
    growing the DataFrame row by row with ``.loc``.

    Args:
        item: List of row dicts. Their values are written positionally, so
            they must be in the same order as the workbook's columns.

    Raises:
        ValueError: If a row's value count does not match the column count.
    """
    print(item)
    if not item:
        # Nothing to append; skip the read/write round trip.
        return
    df = pd.read_excel('douban.xlsx')
    # Materialise each dict view as a list so values map to columns by position.
    new_rows = pd.DataFrame([list(row.values()) for row in item], columns=df.columns)
    # Skip concat for a header-only workbook; concat with an empty frame is
    # unnecessary there.
    combined = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
    combined.to_excel('douban.xlsx', index=False)
def main():
    """Create a header-only ``douban.xlsx`` and fill it from the search API.

    The module-level ``data`` is rebound to the empty DataFrame that defines
    the column layout. The search endpoint is then requested with the
    ``more`` query parameter set to 1 through 99, and each parsed page is
    appended to the workbook.
    """
    global data
    data = pd.DataFrame(
        columns=['name', 'star', 'rating', 'platforms', 'n_ratings', 'genres', 'content']
    )
    data.to_excel('douban.xlsx', index=False)
    base_url = "https://www.douban.com/j/ilmen/game/search?genres=&platforms=&q=&sort=rating&more="
    for page in range(1, 100):
        games = parseOnepage(getOnepage(base_url + str(page)))
        savaData(games)
if __name__ == '__main__':
    # Script entry point: run the crawl only when executed directly.
    main()
下面的版本会过滤掉评分为 0 的数据:
import pandas as pd
import requests
# Module-level slot that main() rebinds (via ``global data``) to the empty
# DataFrame it creates; it stays None until main() has run.
data = None
def getOnepage(url, timeout=10):
    """Fetch *url* with browser-like headers and return the decoded JSON body.

    Args:
        url: Address of the JSON endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed from JSON (not raw text).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # NOTE(review): this Cookie embeds a logged-in session; it should be
    # loaded from configuration or the environment rather than committed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Cookie': 'bid=NX8M2NI7rfg; douban-fav-remind=1; __yadk_uid=TEJSv3vlFpxrnShgBGXWW51qExiqLCiD; __gads=ID=35ed214a9a0f04f3-229b40ae8cc5003e:T=1609826897:RT=1609826897:S=ALNI_MbjrOjiMxJC6bra_BWqa1z6LwJvFA; ll="118267"; viewed="1007305"; gr_user_id=38f71f7e-49a3-4463-a3e9-dd3f77625dad; _ga=GA1.2.1355032985.1609826898; _vwo_uuid_v2=D6A8A09C6232AC6A436C3775284DBE348|dadd79d0f01552308d196454329600a7; dbcl2="247324733:YcdnDsCblB0"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.24732; ck=MMTN; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1637395735%2C%22https%3A%2F%2Fwww.gameres.com%2F%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1355032985.1609826898.1637309781.1637395736.17; __utmc=30149280; __utmz=30149280.1637395736.17.14.utmcsr=gameres.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; _pk_id.100001.8cb4=b87f2f239f7dd7c8.1609826897.15.1637395743.1637309781.; __utmb=30149280.4.10.1637395736'
    }
    # Send the request and wait (at most `timeout` seconds) for the response.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
def _is_zero_rating(rating):
    """Return True when *rating* is numerically zero (e.g. '0' or '0.0')."""
    try:
        return float(rating) == 0
    except (TypeError, ValueError):
        # Non-numeric ratings (such as an empty string) are not treated as zero.
        return False


def parseOnepage(res):
    """Parse one page of search results, dropping games rated zero.

    Args:
        res: Decoded JSON for one page. Its ``'games'`` entry is read as a
            list of dicts with the keys ``title``, ``star``, ``rating``,
            ``platforms``, ``n_ratings``, ``genres`` and ``review``.

    Returns:
        A list with one dict per kept game. Keys are inserted in the order
        ``name, star, rating, platforms, n_ratings, genres, content``;
        string values have every ``'\\r\\n'`` sequence removed and
        ``n_ratings`` is converted with ``str()``. Games whose rating is
        numerically zero are skipped. A missing or empty ``'games'`` entry
        yields an empty list.

    Raises:
        KeyError: If a kept game lacks one of the fields other than
            ``review``.
    """
    rows = []
    # Tolerate a page without a 'games' list instead of raising KeyError.
    for game in res.get('games') or []:
        rating = game['rating'].replace('\r\n', '')
        # Compare numerically so '0.0' is filtered as well as the literal '0'.
        if _is_zero_rating(rating):
            continue
        # A missing or null review would otherwise raise here; store empty
        # content instead (assumes the API can omit the review — TODO confirm).
        review = game.get('review') or {}
        rows.append({
            'name': game['title'].replace('\r\n', ''),
            'star': game['star'].replace('\r\n', ''),
            'rating': rating,
            'platforms': game['platforms'].replace('\r\n', ''),
            'n_ratings': str(game['n_ratings']),
            'genres': game['genres'].replace('\r\n', ''),
            'content': (review.get('content') or '').replace('\r\n', ''),
        })
    return rows
def savaData(item):
    """Append the parsed rows in *item* to ``douban.xlsx``.

    The workbook is read, all rows are appended in one batch and the file is
    written back without the index column. Building the batch once avoids
    growing the DataFrame row by row with ``.loc``.

    Args:
        item: List of row dicts. Their values are written positionally, so
            they must be in the same order as the workbook's columns.

    Raises:
        ValueError: If a row's value count does not match the column count.
    """
    print(item)
    if not item:
        # Nothing to append; skip the read/write round trip.
        return
    df = pd.read_excel('douban.xlsx')
    # Materialise each dict view as a list so values map to columns by position.
    new_rows = pd.DataFrame([list(row.values()) for row in item], columns=df.columns)
    # Skip concat for a header-only workbook; concat with an empty frame is
    # unnecessary there.
    combined = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
    combined.to_excel('douban.xlsx', index=False)
def main():
    """Create a header-only ``douban.xlsx`` and fill it from the search API.

    The module-level ``data`` is rebound to the empty DataFrame that defines
    the column layout. The search endpoint is then requested with the
    ``more`` query parameter set to 1 through 99, and each parsed page is
    appended to the workbook.
    """
    global data
    data = pd.DataFrame(
        columns=['name', 'star', 'rating', 'platforms', 'n_ratings', 'genres', 'content']
    )
    data.to_excel('douban.xlsx', index=False)
    base_url = "https://www.douban.com/j/ilmen/game/search?genres=&platforms=&q=&sort=rating&more="
    for page in range(1, 100):
        games = parseOnepage(getOnepage(base_url + str(page)))
        savaData(games)
if __name__ == '__main__':
    # Script entry point: run the crawl only when executed directly.
    main()