1这是爬取英雄联盟所有英雄和技能,并保存为json格式
from selenium import webdriver
from lxml import etree
import requests, json
driver = webdriver.Chrome()
base_url = 'https://lol.qq.com/data/info-heros.shtml'
driver.get(base_url)
html = etree.HTML(driver.page_source)
# //*[@id="jSearchHeroDiv"]/li[1]/a
# //*[@id="jSearchHeroDiv"]/li[2]/a
hero_url_list = html.xpath('.//ul[@id="jSearchHeroDiv"]/li/a/@href')
hero_list = [] # 存放所有英雄的列表
for hero_url in hero_url_list:
id = hero_url.split('=')[-1]
# print(id)
detail_url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/' + id + '.js'
# print(detail_url)
headers = {
'Referer': 'https://lol.qq.com/data/info-defail.shtml?id =4',
'Sec-Fetch-Mode': 'cors',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
response = requests.get(detail_url, headers=headers)
n = json.loads(response.text)
# print(n)
hero = [] # 存放单个英雄
item_name = {}
item_name['英雄名字'] = n['hero']['name'] + ' ' + n['hero']['title']
hero.append(item_name)
for i in n['spells']: # 技能
item_skill = {}
item_skill['技能名字'] = i['name']
item_skill['技能描述'] = i['description']
hero.append(item_skill)
hero_list.append(hero)
# print(hero_list)
with open('hero.json', 'w') as file:
json.dump(hero_list, file)
# 将json解码后打印
with open('hero.json', 'r') as file:
hero_list = json.load(file)
for heroo in hero_list:
print(heroo)
2这是将json格式的数据读取出来并保存到excel中
import pandas as pd
import json
columns = ['英雄', '技能1', '被动', '技能2', '技能3', '技能4']
df = []
with open('hero.json', 'r', encoding='utf-8') as f:
data_dict = json.load(f)
for key in data_dict[0]:
df.append(key)
print(df)
df = pd.read_json('hero.json', orient='records', encoding='utf-8') # 读取json数据
print(df)
d = pd.DataFrame(df, columns=columns)
d.to_excel('英雄联盟英雄详情.xlsx', index=False)
print(d)
以上为两个.py文件
但是运行第二个程序后,print(d)的结果中间是nan(空),请问在座的各位有人懂这方面的知识吗?