
# 求解: 完全不明白为啥会提取信息失败。(Question from original author: "Help — I don't understand why extracting the book info keeps failing.")
import time
import random
import requests
from lxml import etree
import csv
def header_x():
    """Build request headers with a randomly chosen browser User-Agent.

    Returns:
        dict: a headers mapping containing a single "User-Agent" key.
    """
    ua_pool = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    )
    return {"User-Agent": random.choice(ua_pool)}
def fetch_books(session, category_ids, pages=5, base_url="https://www.bookuu.com"):
    """Crawl book listings for each category and return extracted rows.

    Args:
        session: a requests.Session (may carry login cookies).
        category_ids: iterable of category id strings to crawl.
        pages: number of listing pages to fetch per category.
        base_url: site root, prepended to relative detail-page links.
            (BUGFIX: the original referenced an undefined global ``base_url``,
            raising NameError for every item — the silent "提取书籍信息失败"
            the author observed.)

    Returns:
        list[list[str]]: rows of [name, author, price, publisher,
        pub_date, sales, stock].
    """
    books = []
    for cid in category_ids:
        for page in range(1, pages + 1):
            url = f"https://www.bookuu.com/search.php?cid={cid}&page={page}"
            headers = header_x()
            # Timeout added for consistency with the detail-page request below;
            # without it a stalled connection would hang the crawl forever.
            response = session.get(url, headers=headers, timeout=10)
            if response.status_code != 200:
                print(f"Failed to fetch page {page} for category {cid}, status code: {response.status_code}")
                continue
            html = etree.HTML(response.text)
            lis = html.xpath("//div[contains(@class, 'wd-640')]")  # loose XPath: one div per book card
            for li in lis:
                try:
                    # Basic listing fields.
                    name = li.xpath(".//a/@title")[0].strip()
                    author_div = li.xpath(".//div[contains(@class, 'wd-30p')]/span/text()")
                    author = author_div[0].strip() if author_div else "未知"
                    # Price.
                    price = li.xpath(".//div[@class='lh-30']/span/text()")[0].strip()
                    # Publisher and publication date (label div followed by value div).
                    publisher_info = li.xpath(".//div[contains(text(), '出版社')]/following-sibling::div/text()")
                    publisher = publisher_info[0].strip() if publisher_info else "未知"
                    pub_date_info = li.xpath(".//div[contains(text(), '出版时间')]/following-sibling::div/text()")
                    pub_date = pub_date_info[0].strip() if pub_date_info else "未知"
                    # Follow the relative link to the book's detail page.
                    son_path = li.xpath(".//a/@href")[0]
                    son_url = base_url + son_path
                    resp_son = session.get(son_url, headers=header_x(), timeout=10)
                    html_son = etree.HTML(resp_son.text)
                    # Sales count and stock from the detail-page table.
                    sales = html_son.xpath("//td[contains(text(), '销量')]/following-sibling::td/text()")
                    stock = html_son.xpath("//td[contains(text(), '库存')]/following-sibling::td/text()")
                    sales = sales[0].strip() if sales else "0"
                    stock = stock[0].strip() if stock else "无"
                    books.append([name, author, price, publisher, pub_date, sales, stock])
                # A single Exception clause suffices; the original tuple
                # (IndexError, Exception) was redundant since Exception
                # already covers IndexError. Keep best-effort semantics:
                # log and move on to the next card.
                except Exception as e:
                    print(f"提取书籍信息失败: {e}")
                    continue
            # Polite random delay between page fetches.
            time.sleep(random.randint(1, 3))
    return books
def save_to_csv(books, filename):
    """Write the scraped rows to *filename* as UTF-8 CSV with a header row.

    Args:
        books: iterable of row lists in the order
            [书名, 作者, 价格, 出版社, 出版日期, 销量, 库存].
        filename: destination CSV path (overwritten if it exists).
    """
    header = ['书名', '作者', '价格', '出版社', '出版日期', '销量', '库存']
    with open(filename, mode='w', encoding='utf-8', newline='') as fh:
        out = csv.writer(fh)
        out.writerow(header)
        for row in books:
            out.writerow(row)
if __name__ == "__main__":
    # Category IDs to crawl (example set). The original list contained
    # duplicates ('103701', '103708', '1026', '1048' each appeared twice),
    # which caused the same category to be crawled again — dedupe while
    # preserving the original order.
    category_ids = list(dict.fromkeys([
        '1018', '1019', '101802', '1020', '1021', '1022', '1023', '103713', '103714', '103715', '103716', '103701',
        '103702', '103703', '103704', '103705', '103706', '103707', '103708', '103708', '103711', '103701',
        '1038', '1039', '1040', '1042', '1043', '1044', '1045', '1024', '1025', '1026', '1041',
        '1026', '1027', '1028', '1029', '1030', '1032', '1033', '1034', '1035', '1017', '1048', '1048',
        '1049', '1050', '1051', '1052', '1053', '1054', '1055', '1056', '1057', '1058', '1011',
    ]))
    pages = 50  # listing pages to crawl per category

    # Shared session so cookies (e.g. a login cookie) persist across requests.
    session = requests.Session()
    # NOTE(review): placeholder — replace with a real logged-in cookie value.
    session.cookies.set("cookie", "your_cookie_value_here")

    # Crawl, then persist the results.
    books = fetch_books(session, category_ids, pages)
    save_to_csv(books, "books.csv")