George、卟壞 2025-03-03 15:17 采纳率: 16.7%
浏览 5
已结题

急,求解(python爬虫)


import requests
from lxml import etree
import csv
import time
import random

def header_x():
    """Return HTTP request headers with a randomly chosen User-Agent.

    Rotating the User-Agent makes successive requests look like they
    come from different browsers.
    """
    ua_pool = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    )
    return {"User-Agent": random.choice(ua_pool)}

# NOTE(review): this sleep runs exactly once, at import time, before any
# request is made — it does NOT throttle individual requests (per-request
# delays live inside fetch_books). Pauses 5-8 seconds at startup.
time.sleep(random.randint(5, 8))

def fetch_books(category, pages=51):
    """
    Scrape book listings for one category from bookuu.com.

    :param category: numeric category id used in the search URL, e.g. '1018'
    :param pages: number of result pages to fetch, default 51
    :return: list of rows [name, author, price, publisher, publish_time, sales, stock]
    """
    from urllib.parse import urljoin  # resolve relative detail-page links

    books = []
    for page in range(1, pages + 1):
        url = f'https://www.bookuu.com/search.php?cid={category}&page={page}'
        headers = header_x()
        # timeout stops a dead connection from hanging the whole crawl;
        # raise_for_status surfaces 4xx/5xx instead of parsing an error page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html = etree.HTML(response.text)
        lis = html.xpath("//div[@class='wd-640 fl']")
        # NOTE(review): if `lis` is empty the page markup has changed or the
        # content is rendered by JavaScript — verify the class name against
        # the raw HTML (response.text), not the browser's DOM.

        for li in lis:
            try:
                # Book title from the anchor's title attribute
                name = li.xpath("./a/@title")[0]
                # The span text is '/'-joined metadata; [0] takes the first
                # field (author) after stripping whitespace
                author = li.xpath(".//div[@class='lh-30 fs-12']/div[@class='wd-30p fl to-hd mr-10']/span/text()")[0].strip().split('/')[0]
                # Price text
                intro_1 = li.xpath(".//div[@class='lh-30']/span/text()")[0].strip()
                # Third-from-last and second-from-last '/'-separated fields
                publisher = li.xpath(".//div[@class='lh-30 fs-12']/div[@class='wd-30p fl to-hd cl-9 mr-10']/span/text()")[0].strip().split('/')[-3]
                publish_time = li.xpath(".//div[@class='lh-30 fs-12']/div[@class='wd-30p fl to-hd cl-9']/span/text()")[0].strip().split('/')[-2]
                # FIX: hrefs on listing pages are often relative; passing them
                # straight to requests.get() fails silently. urljoin makes the
                # URL absolute against the current page.
                son_url = urljoin(url, li.xpath("./a/@href")[0])

                # Fetch the detail page for sales / stock figures
                resp_son = requests.get(son_url, headers=headers, timeout=10)
                html_son = etree.HTML(resp_son.text)
                # NOTE(review): '/tbody/' only matches if <tbody> exists in the
                # raw HTML (browsers insert it; servers often don't) — if these
                # return nothing, try dropping 'tbody/' from the XPath.
                score = html_son.xpath("//table[@class='lh-30 mt-10']/tbody/tr/td/text()")[0].strip()
                intro = ''.join(html_son.xpath("//table[@class='lh-30 mt-10']/tbody/tr/td[@class='cl-3 clearfix']/span/text()"))

                books.append([name, author, intro_1, publisher, publish_time, score, intro])
            except IndexError:
                # A missing field: skip this book
                continue
            finally:
                # FIX: the sleep was skipped by `continue`, so failed rows
                # caused rapid-fire requests; `finally` throttles every pass.
                time.sleep(random.randint(3, 5))
    return books

def save_to_csv(books, category):
    """
    Write scraped book rows to '<category>.csv' in the working directory.

    :param books: list of rows [name, author, price, publisher, date, sales, stock]
    :param category: category id, used as the CSV file name
    """
    # FIX: utf-8-sig writes a BOM so Excel detects the encoding and renders
    # the Chinese header correctly; plain utf-8 shows mojibake when the file
    # is double-clicked in Excel.
    with open(f'{category}.csv', mode='w', encoding='utf-8-sig', newline='') as file:
        writer = csv.writer(file)
        # Header row
        writer.writerow(['书名', '作者', '价格', '出版社', '出版日期', '销量', '库存'])
        # Data rows
        writer.writerows(books)

if __name__ == "__main__":
    # '1018' is a bookuu.com category id; output is written to '1018.csv'
    category = '1018'
    books = fetch_books(category)
    save_to_csv(books, category)

求各位大能,帮我看看怎么修改才能爬到数据。运行没报错,但也没爬到相关数据,提取信息那块有问题,不知道怎么改,还有就是“[0].strip().split('/')[0]”这个是什么意思?

  • 写回答

3条回答 默认 最新

  • 软件技术NINI 2025-03-03 17:01
    关注
    
    import requests
    from lxml import etree
    import csv
    import time
    import random
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    
    def header_x():
        """Build request headers carrying a randomly selected User-Agent."""
        pool = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            # more User-Agent strings can be added here...
        ]
        chosen = random.choice(pool)
        return {"User-Agent": chosen}
    
    def requests_retry_session(retries=3, backoff_factor=1):
        """Return a requests.Session that retries transient failures.

        Connect/read errors and 5xx responses are retried up to *retries*
        times with exponential backoff, for both http and https.
        """
        retry_policy = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=[500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_policy)
        session = requests.Session()
        for scheme in ('http://', 'https://'):
            session.mount(scheme, adapter)
        return session
    
    def fetch_books(category, pages=51):
        """
        Scrape book listings for one bookuu.com category via a retrying session.

        :param category: numeric category id used in the search URL
        :param pages: number of result pages to fetch, default 51
        :return: list of rows [name, author, price, publisher, pub_date, sales, stock]
        """
        from urllib.parse import urljoin  # resolve relative detail-page links

        books = []
        session = requests_retry_session()

        for i in range(1, pages + 1):
            url = f'https://www.bookuu.com/search.php?cid={category}&page={i}'
            try:
                response = session.get(url, headers=header_x(), timeout=10)
                response.raise_for_status()
                html = etree.HTML(response.text)
                lis = html.xpath("//div[contains(@class, 'wd-640')]")  # looser XPath

                for li in lis:
                    try:
                        # Basic fields; fall back to placeholders when absent
                        name = li.xpath(".//a/@title")[0].strip()
                        author_div = li.xpath(".//div[contains(@class, 'wd-30p')]/span/text()")
                        author = author_div[0].strip() if author_div else "未知"

                        price = li.xpath(".//div[@class='lh-30']/span/text()")[0].strip()

                        # Publisher / publish date via text-anchored XPath
                        publisher_info = li.xpath(".//div[contains(text(), '出版社')]/following-sibling::div/text()")
                        publisher = publisher_info[0].strip() if publisher_info else "未知"
                        pub_date_info = li.xpath(".//div[contains(text(), '出版时间')]/following-sibling::div/text()")
                        pub_date = pub_date_info[0].strip() if pub_date_info else "未知"

                        # FIX: string-concatenating base_url + href breaks when the
                        # href is already absolute; urljoin handles both forms.
                        son_url = urljoin(url, li.xpath(".//a/@href")[0])
                        resp_son = session.get(son_url, headers=header_x(), timeout=10)
                        html_son = etree.HTML(resp_son.text)

                        # Sales and stock from the detail page
                        sales = html_son.xpath("//td[contains(text(), '销量')]/following-sibling::td/text()")
                        stock = html_son.xpath("//td[contains(text(), '库存')]/following-sibling::td/text()")
                        sales = sales[0].strip() if sales else "0"
                        stock = stock[0].strip() if stock else "无"

                        books.append([name, author, price, publisher, pub_date, sales, stock])
                    except Exception as e:
                        # FIX: `(IndexError, Exception)` was redundant — Exception
                        # already covers IndexError.
                        print(f"提取书籍信息失败: {e}")
                        continue
                    finally:
                        # FIX: `continue` used to skip this sleep, so failed rows
                        # triggered rapid-fire requests; `finally` always throttles.
                        time.sleep(random.uniform(1, 3))
            except requests.exceptions.RequestException as e:
                print(f"请求失败: {url}, 错误: {e}")
                continue

        return books
    
    def save_to_csv(books, category):
        """Dump the collected rows to '<category>.csv' (utf-8 with BOM for Excel)."""
        header = ['书名', '作者', '价格', '出版社', '出版日期', '销量', '库存']
        out_path = f'{category}.csv'
        with open(out_path, 'w', encoding='utf-8-sig', newline='') as handle:
            csv.writer(handle).writerows([header] + list(books))
    
    if __name__ == "__main__":
        # bookuu.com category id; results are written to '1018.csv'
        category = '1018'
        books = fetch_books(category, pages=5)  # fewer pages while testing
        save_to_csv(books, category)
    
    评论

报告相同问题?

问题事件

  • 已结题 (查看结题原因) 3月4日
  • 创建了问题 3月3日