George、卟壞 2025-03-04 14:08 采纳率: 16.7%
浏览 16
已结题

爬虫,与史诗级难题,求解!

代码能运行,也没有报错,但爬取不到相关信息,求解?

img


import csv
import random
import time
from urllib.parse import urljoin

import requests
from lxml import etree

def header_x():
    """Build request headers with a randomly picked browser User-Agent.

    Rotating the UA per request makes the crawler look less like a bot.

    Returns:
        dict: headers containing a single ``User-Agent`` entry.
    """
    ua_pool = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    )
    return {"User-Agent": random.choice(ua_pool)}

def fetch_books(session, category_ids, pages=49):
    """Scrape book listings from bookuu.com search pages.

    For each category id, walks search-result pages 1..pages, extracts the
    per-book fields from the listing, then fetches each book's detail page
    for sales/stock numbers.

    Args:
        session: a requests.Session (should already carry login cookies).
        category_ids: iterable of category id strings.
        pages: number of result pages to walk per category.

    Returns:
        list[list[str]]: rows of
        [name, author, price, publisher, pub_date, sales, stock].
    """
    books = []
    base_url = "https://www.bookuu.com/"  # hrefs on the site are protocol-relative, e.g. "//www.bookuu.com/detail.php?id=..."
    for cid in category_ids:
        for page in range(1, pages + 1):
            url = f"https://www.bookuu.com/search.php?cid={cid}&page={page}"
            try:
                # Timeout added so a hung connection can't stall the whole crawl.
                response = session.get(url, headers=header_x(), timeout=10)
            except requests.RequestException as e:
                print(f"Request error on page {page} for category {cid}: {e}")
                continue
            if response.status_code != 200:
                print(f"Failed to fetch page {page} for category {cid}, status code: {response.status_code}")
                continue

            html = etree.HTML(response.text)
            lis = html.xpath("//div[contains(@class, 'wd-640')]")  # 更宽松的XPath
            if not lis:
                # This is the likely cause of "runs but scrapes nothing":
                # the XPath matches no node (page layout differs, or the page
                # is rendered by JavaScript). Surface it instead of looping silently.
                print(f"No listing containers on page {page} for category {cid} — "
                      f"verify the XPath against the raw HTML (response length {len(response.text)})")

            for li in lis:
                try:
                    # 提取基础信息
                    name = li.xpath(".//a/text()")[0].strip()
                    author_div = li.xpath(".//div[contains(@class, 'wd-30p')]/span/text()")
                    author = author_div[0].strip() if author_div else "未知"

                    # 提取价格
                    price = li.xpath(".//div[@class='lh-30']/span/text()")[0].strip()

                    # 出版社和出版时间(优化后的XPath)
                    publisher_info = li.xpath(".//div[contains(text(), '出版社')]/following-sibling::div/text()")
                    publisher = publisher_info[0].strip() if publisher_info else "未知"
                    pub_date_info = li.xpath(".//div[contains(text(), '出版时间')]/following-sibling::div/text()")
                    pub_date = pub_date_info[0].strip() if pub_date_info else "未知"

                    # 处理子页面
                    son_path = li.xpath(".//a/@href")[0]
                    # BUG FIX: hrefs are protocol-relative ("//www.bookuu.com/...");
                    # naive string concatenation produced a malformed URL like
                    # "https://www.bookuu.com///www.bookuu.com/detail.php?...".
                    # urljoin resolves relative, absolute and protocol-relative
                    # hrefs correctly.
                    son_url = urljoin(base_url, son_path)
                    resp_son = session.get(son_url, headers=header_x(), timeout=10)
                    html_son = etree.HTML(resp_son.text)

                    # 销量和库存
                    sales = html_son.xpath("//td[contains(text(), '销量')]/following-sibling::td/text()")
                    stock = html_son.xpath("//td[contains(text(), '库存')]/following-sibling::td/text()")
                    sales = sales[0].strip() if sales else "0"
                    stock = stock[0].strip() if stock else "无"

                    books.append([name, author, price, publisher, pub_date, sales, stock])
                except Exception as e:
                    # `(IndexError, Exception)` was redundant — Exception already
                    # covers IndexError. One item failing must not abort the page.
                    print(f"提取书籍信息失败: {e}")
                    continue

            # Polite random delay between listing pages to avoid being throttled.
            time.sleep(random.randint(1, 3))
    return books

def save_to_csv(books, filename):
    """Write scraped book rows to *filename* as UTF-8 CSV.

    The first row is a fixed Chinese header matching the order of the
    fields produced by fetch_books.
    """
    header = ['书名', '作者', '价格', '出版社', '出版日期', '销量', '库存']
    with open(filename, mode='w', encoding='utf-8', newline='') as fh:
        csv.writer(fh).writerows([header, *books])

if __name__ == "__main__":
    # 定义分类 ID 列表 (示例分类 ID)
    # dict.fromkeys de-duplicates while preserving order — the original list
    # repeated '103708', '103701', '1026' and '1048', which would crawl the
    # same categories twice.
    category_ids = list(dict.fromkeys([
        '1018', '1019', '101802', '1020', '1021', '1022', '1023', '103713', '103714', '103715', '103716', '103701',
        '103702', '103703', '103704', '103705', '103706', '103707', '103708', '103708', '103711', '103701',
        '1038', '1039', '1040', '1042', '1043', '1044', '1045', '1024', '1025', '1026', '1041',
        '1026', '1027', '1028', '1029', '1030', '1032', '1033', '1034', '1035', '1017', '1048', '1048',
        '1049', '1050', '1051', '1052', '1053', '1054', '1055', '1056', '1057', '1058', '1011',
    ]))
    pages = 50  # 每个分类爬取的页数

    # 创建会话
    session = requests.Session()

    # BUG FIX: session.cookies.set("cookie", "<whole string>") created a single
    # cookie literally NAMED "cookie" whose value was the entire header string,
    # so the server never received the real login cookies (another likely cause
    # of "no data scraped"). Sending the raw string as the Cookie header
    # transmits every name=value pair as captured from the browser.
    session.headers["Cookie"] = "parent_qimo_sid_92464560-3f16-11e9-8a25-8d8585556f17=842fa102-a4ef-4247-b504-7856a955c2d4; accessId=92464560-3f16-11e9-8a25-8d8585556f17; pv_id=ce979e0a977dca375ff535c9c9e9176c; _uab_collina=174098147261023766843709; wwwsid=d32f327bda1b54a31c8b6d0feb1bb4fd; tfstk=gAVoQ6AzN8kSJWr92ol5B8tvGsWxNUGIYkdKvXnF3moX23d88vq30u9JeyS7xWm400E8eye08uaUxv_SvDo3vyyRk1CTPzGITXjO61FcLspTx4kr4r7E5VTPkYWIMVaZTGIOMdncuOlU2-1DQs4qc2ArY0rEgjun0BuUY0k20V0tTDrUYxRq540eaLuygKoj8XoUYX70ubKCU0NUgSS49ps5pJq_i4DobrX6wQOszhnUzqOeT00oEZ4rmBRUi-p4YToPCEnIw0arrk16Nbk3L7h4ai5E_RZ0tAlGpaMz--qIwS7eUcy82vFSn3JrSbmoQ7kAsZljL-Vmw7SBCrUm4AcYHts-pbquCcMPhiZ3o0EUZx-lDDeTk7k0bid0Av2UwjPP01jyBKJNnmdIuwF2dpMrlqm9V2RfrJVFnm7cod7IUqgSXZbDdpMrlqmOoZvNOYujPcC..; PHPSESSID=4ingct6o5llmvvsbn9ssakcl83; href=https%3A%2F%2Fwww.bookuu.com%2Findex.php; pageViewNum=234"

    # 爬取书籍信息
    books = fetch_books(session, category_ids, pages)

    # 保存到 CSV 文件
    # BUG FIX: the old f"'{category_ids}' + '.csv' " literally embedded the
    # repr of the whole ID list (plus stray quotes) in the filename. Use a
    # plain fixed name.
    save_to_csv(books, "books.csv")
  • 写回答

3条回答 默认 最新

  • 雪山青木 2025-03-04 15:08
    关注

    从代码看可能是xpath解析list的问题

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(2条)

报告相同问题?

问题事件

  • 已结题 (查看结题原因) 3月4日
  • 已采纳回答 3月4日
  • 创建了问题 3月4日