The code still has a bug: it cannot fetch the full content of the topic, and each crawl returns only 10 posts. Please help fix and improve it.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import os

# Use a raw string so backslashes in the Windows path are not treated as escape sequences
chrome_driver_path = r"F:\死磕\stance分析\chromedriver.exe"
# Selenium 4 removed the executable_path argument; pass a Service object instead
driver = webdriver.Chrome(service=Service(chrome_driver_path))
weibo_url = "https://s.weibo.com/weibo?q=%23chatgpt%23"
driver.get(weibo_url)
# Wait for the login link to become clickable (the visible link text on Weibo is 登录)
wait = WebDriverWait(driver, 10)
login_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[text()='登录']")))
# Click the login link
login_button.click()
# Wait for the login to complete; adjust this delay to match how long logging in actually takes
time.sleep(10)
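# A minimal alternative to the fixed sleep, assuming posts render as
# <p class="txt"> nodes (the same selector the parsing step below relies on):
# block until at least one post is present before continuing.
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "p.txt")))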
# Switch back to the main document (only needed if a frame was entered earlier)
driver.switch_to.default_content()
# Simulate scrolling to load more content
scroll_pause_time = 2.0
scroll_limit = 20  # scroll 20 times; see the alternative sketch after this loop
scrolls = 0
while scrolls < scroll_limit:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pause_time)
    scrolls += 1
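# Alternative sketch: stop scrolling once the page height stops growing,
# instead of after a fixed count. This is the generic infinite-scroll
# pattern, not anything Weibo-specific; left commented out so it does not
# double-scroll alongside the loop above.
# last_height = driver.execute_script("return document.body.scrollHeight")
# while True:
#     driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#     time.sleep(scroll_pause_time)
#     new_height = driver.execute_script("return document.body.scrollHeight")
#     if new_height == last_height:
#         break  # the last scroll loaded no new content
#     last_height = new_height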
# Parse the page content with Beautiful Soup (imported at the top)
page_source = driver.page_source
soup = BeautifulSoup(page_source, "html.parser")
# Find the post bodies
posts = soup.find_all("p", class_="txt")
# Directory to save results into (raw string for the Windows path)
save_dir = r"F:\死磕\stance分析\结果"
# Create the directory if it does not exist yet
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
# Save each post to its own .txt file
for i, post in enumerate(posts):
    post_text = post.get_text(strip=True)
    file_path = os.path.join(save_dir, f"weibo_{i + 1}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(post_text)
    print(f"Saved post {i + 1} to {file_path}")
# Close the browser
driver.quit()
print("Crawling and saving complete!")