爬虫需要获取多页数据,但翻页时网页的 URL 不会刷新,该怎么办
# -*- coding:utf-8 -*-
from selenium.webdriver.common.by import By
import re
import lxml.html
import matplotlib.pyplot as plt
from selenium import webdriver
import pandas as pd
import time
from selenium import webdriver
# NOTE(review): the original call was ``driver.get()`` with no argument, which
# raises TypeError. The ``hy-*`` class names used below look like Huya markup,
# so the Huya home page is presumed here -- confirm before relying on it.
START_URL = "https://www.huya.com/"
EXCEL_PATH = r'D:\a62.xlsx'
ROW_LIMIT = 100  # collection stops as soon as more than this many rows exist
STEP_WAIT_SECONDS = 2  # fixed pause after each navigation step
COLUMNS = ['标题', '主播', '人气']


def fetch_listing_html(url=START_URL, wait_seconds=STEP_WAIT_SECONDS):
    """Drive Chrome to the listing page and return its rendered HTML.

    Opens *url*, clicks the navigation link, clicks the ``data-gid="1"`` entry,
    switches to the newest window handle and reads its page source.

    Args:
        url: Address of the start page.
        wait_seconds: Seconds to sleep after each navigation step.

    Returns:
        The page source of the last window handle, as a string.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if one of the two
            elements to click cannot be found.
    """
    driver = webdriver.Chrome()  # create the driver object
    try:
        driver.get(url)  # open the page
        time.sleep(wait_seconds)
        driver.find_element(
            By.XPATH, '//a[@class="hy-nav-link hy-nav-link-ext clickstat"]'
        ).click()
        time.sleep(wait_seconds)
        driver.find_element(By.XPATH, '//li[@data-gid="1"]').click()
        time.sleep(wait_seconds)
        # Switch to the last window handle so page_source is read from the
        # newly opened page rather than the one the clicks were made on.
        driver.switch_to.window(driver.window_handles[-1])
        return driver.page_source
    finally:
        # Always close the browser, including when a lookup above fails.
        driver.quit()


def first_or_none(nodes):
    """Return the first item of an XPath result list, or None if it is empty."""
    return nodes[0] if nodes else None


def extract_rooms(html, row_limit=ROW_LIMIT):
    """Parse the listing HTML into a DataFrame of title / streamer / popularity.

    Items that lack any of the three fields are skipped instead of aborting
    the whole run with an IndexError.

    Args:
        html: Page source containing ``li.game-live-item`` elements.
        row_limit: Stop once more than this many rows have been collected.

    Returns:
        A DataFrame with the columns in ``COLUMNS``; popularity is still the
        raw text taken from the page.
    """
    tree = lxml.html.fromstring(html)  # lxml element tree
    rows = []
    for item in tree.xpath('//li[@class="game-live-item"]'):
        title = first_or_none(item.xpath('./a/@title'))
        nick = first_or_none(item.xpath('./span/span/i[@class="nick"]'))
        popularity = first_or_none(
            item.xpath('./span/span[2]/i[@class="js-num"]/text()')
        )
        if title is None or nick is None or popularity is None:
            continue
        rows.append({
            '标题': title,
            # The nick is read with string() rather than text(), as the
            # original note on this code indicated.
            '主播': nick.xpath("string()"),
            '人气': popularity,
        })
        if len(rows) > row_limit:
            break
    # Build the frame once; concatenating per row copies the frame each time.
    # Explicit columns keep the later column lookups valid when nothing matched.
    return pd.DataFrame(rows, columns=COLUMNS)


def popularity_to_number(text):
    """Convert a popularity label such as "1.5万" or "8000" to a float.

    A trailing "万" multiplies the number by 10000. The scaled value is rounded
    to a whole number so binary-float noise (e.g. 1.1 * 10000 giving
    11000.000000000002) does not end up in the spreadsheet.

    Raises:
        ValueError: if the numeric part cannot be parsed.
    """
    text = text.strip()
    if text.endswith("万"):
        return float(round(float(text[:-1]) * 10000))
    return float(text)


def save_and_reload(df, path=EXCEL_PATH):
    """Write *df* to the Excel file at *path* and return it read back."""
    df.to_excel(path, index=False)
    return pd.read_excel(path)


def plot_popularity(df):
    """Show a bar chart of popularity per streamer."""
    plt.figure(figsize=(12, 8), dpi=150)
    # SimHei is selected so the Chinese streamer names on the axis can render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.bar(x='主播', height='人气', data=df)
    plt.xticks(rotation=90, font={'size': 6})
    plt.yticks(font={'size': 8})
    # Print plain numbers on the y axis instead of scientific notation.
    plt.gca().yaxis.get_major_formatter().set_scientific(False)
    plt.tight_layout()
    plt.show()


def main():
    """Scrape the listing, save it to Excel and plot popularity by streamer."""
    html = fetch_listing_html()
    df = extract_rooms(html)
    print(df)
    df['人气'] = df['人气'].apply(popularity_to_number)
    df = save_and_reload(df)
    plot_popularity(df)


if __name__ == "__main__":
    main()