George、卟壞 2025-03-03 21:54 采纳率: 16.7%
浏览 7
已结题

爬虫报错,求解,急!

img


求解,完全不明白为啥会提取信息失败。


import time
import random
import requests
from lxml import etree
import csv

def header_x():
    """Build HTTP request headers with a randomly chosen desktop User-Agent.

    Returns:
        dict: a headers mapping containing a single "User-Agent" key,
        picked at random from a small pool of browser UA strings so the
        crawler's fingerprint varies between requests.
    """
    ua_pool = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    )
    return {"User-Agent": random.choice(ua_pool)}

def fetch_books(session, category_ids, pages=5):
    """Crawl book listings from bookuu.com for the given categories.

    Args:
        session: a requests.Session carrying any login cookies.
        category_ids: iterable of category-id strings to crawl.
        pages: number of listing pages to fetch per category (default 5).

    Returns:
        list of rows: [name, author, price, publisher, pub_date, sales, stock].
    """
    # BUG FIX: `base_url` was referenced below but never defined anywhere in
    # the file, so every item raised NameError inside the try-block and was
    # reported as "提取书籍信息失败" — this was the cause of the extraction failures.
    base_url = "https://www.bookuu.com"
    books = []
    for cid in category_ids:
        for page in range(1, pages + 1):
            url = f"{base_url}/search.php?cid={cid}&page={page}"
            try:
                # timeout added so a stalled listing request cannot hang the crawl
                response = session.get(url, headers=header_x(), timeout=10)
            except requests.RequestException as e:
                print(f"Request failed for category {cid} page {page}: {e}")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch page {page} for category {cid}, status code: {response.status_code}")
                continue

            html = etree.HTML(response.text)
            lis = html.xpath("//div[contains(@class, 'wd-640')]")  # loose match on listing items

            for li in lis:
                try:
                    # Basic fields from the listing entry.
                    name = li.xpath(".//a/@title")[0].strip()
                    author_div = li.xpath(".//div[contains(@class, 'wd-30p')]/span/text()")
                    author = author_div[0].strip() if author_div else "未知"

                    price = li.xpath(".//div[@class='lh-30']/span/text()")[0].strip()

                    # Publisher / publication date: label div followed by value div.
                    publisher_info = li.xpath(".//div[contains(text(), '出版社')]/following-sibling::div/text()")
                    publisher = publisher_info[0].strip() if publisher_info else "未知"
                    pub_date_info = li.xpath(".//div[contains(text(), '出版时间')]/following-sibling::div/text()")
                    pub_date = pub_date_info[0].strip() if pub_date_info else "未知"

                    # Detail page for sales/stock; href may already be absolute.
                    son_path = li.xpath(".//a/@href")[0]
                    son_url = son_path if son_path.startswith("http") else base_url + son_path
                    resp_son = session.get(son_url, headers=header_x(), timeout=10)
                    html_son = etree.HTML(resp_son.text)

                    sales = html_son.xpath("//td[contains(text(), '销量')]/following-sibling::td/text()")
                    stock = html_son.xpath("//td[contains(text(), '库存')]/following-sibling::td/text()")
                    sales = sales[0].strip() if sales else "0"
                    stock = stock[0].strip() if stock else "无"

                    books.append([name, author, price, publisher, pub_date, sales, stock])
                # FIX: `except (IndexError, Exception)` was redundant — Exception
                # already covers IndexError. Per-item failures are logged and skipped.
                except Exception as e:
                    print(f"提取书籍信息失败: {e}")
                    continue

            # Polite random delay between listing pages to avoid hammering the site.
            time.sleep(random.randint(1, 3))
    return books

def save_to_csv(books, filename):
    """Write scraped book rows to a CSV file, prepending a Chinese header row.

    Args:
        books: list of rows [name, author, price, publisher, pub_date, sales, stock].
        filename: destination CSV path; overwritten if it already exists.
    """
    header = ['书名', '作者', '价格', '出版社', '出版日期', '销量', '库存']
    with open(filename, mode='w', encoding='utf-8', newline='') as fh:
        csv.writer(fh).writerows([header, *books])

if __name__ == "__main__":
    # Category ids to crawl. FIX: the original list contained duplicates
    # ('103708', '103701', '1026', '1048' each appeared twice), so those
    # categories were crawled twice — dedupe while preserving order.
    raw_category_ids = ['1018', '1019', '101802', '1020', '1021', '1022', '1023', '103713', '103714', '103715', '103716', '103701',
                        '103702', '103703', '103704', '103705', '103706', '103707', '103708', '103708', '103711', '103701',
                        '1038', '1039', '1040', '1042', '1043', '1044', '1045', '1024', '1025', '1026', '1041',
                        '1026', '1027', '1028', '1029', '1030', '1032', '1033', '1034', '1035', '1017', '1048', '1048',
                        '1049', '1050', '1051', '1052', '1053', '1054', '1055', '1056', '1057', '1058', '1011',
                        ]  # 示例分类 ID
    category_ids = list(dict.fromkeys(raw_category_ids))
    pages = 50  # listing pages to crawl per category

    # One session so cookies persist across all requests.
    session = requests.Session()

    # TODO: replace the placeholder with a real logged-in cookie value.
    session.cookies.set("cookie", "your_cookie_value_here")

    # Crawl, then persist the results.
    books = fetch_books(session, category_ids, pages)
    save_to_csv(books, "books.csv")
  • 写回答

4条回答 默认 最新

  • George、卟壞 2025-03-03 22:02
    关注

    如果有代码变成汉字,就刷新一下或重新进入。

    评论

报告相同问题?

问题事件

  • 已结题 (查看结题原因) 3月4日
  • 创建了问题 3月3日