import requests
from lxml import etree
import csv
import time
import random
def header_x():
    """Return HTTP headers with a randomly chosen User-Agent.

    Rotating the User-Agent makes successive requests look like they come
    from different browsers, which helps avoid simple bot blocking.

    :return: dict suitable for the ``headers=`` argument of ``requests.get``
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    ]
    return {"User-Agent": random.choice(user_agents)}
# NOTE: the original module-level time.sleep(random.randint(5, 8)) was removed.
# It executed exactly once, at import time, so it only delayed startup by 5-8
# seconds and did not throttle requests at all; the real per-page throttling
# lives inside fetch_books.
def fetch_books(category, pages=51):
    """Scrape book listings for one category from bookuu.com.

    :param category: category id used in the search URL, e.g. '1018'
    :param pages: number of listing pages to fetch (default 51)
    :return: list of rows ``[name, author, price, publisher,
             publish_date, sales, stock]``
    """
    books = []
    for page in range(1, pages + 1):
        url = f'https://www.bookuu.com/search.php?cid={category}&page={page}'
        headers = header_x()
        try:
            # timeout keeps one stalled connection from hanging the crawl
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f'page {page}: request failed: {exc}')
            continue
        # Trust the charset sniffed from the body so Chinese text is decoded
        # correctly before the XPath queries run.
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        lis = [] if html is None else html.xpath("//div[@class='wd-640 fl']")
        if not lis:
            # Nothing matched: either the selector no longer fits the live
            # page, or the site served an anti-bot page. Surface it instead
            # of silently producing an empty CSV (the reported symptom).
            print(f'page {page}: no book entries matched the XPath; '
                  'verify the selectors against the live page HTML')
        for li in lis:
            try:
                # Book title.
                name = li.xpath("./a/@title")[0]
                # "[0].strip().split('/')[0]" means: take the FIRST matched
                # text node, trim surrounding whitespace, split it on '/',
                # and keep the first piece (here, the author name).
                author = li.xpath(".//div[@class='lh-30 fs-12']/div[@class='wd-30p fl to-hd mr-10']/span/text()")[0].strip().split('/')[0]
                # Price.
                intro_1 = li.xpath(".//div[@class='lh-30']/span/text()")[0].strip()
                # Publisher: third field counted from the end.
                publisher = li.xpath(".//div[@class='lh-30 fs-12']/div[@class='wd-30p fl to-hd cl-9 mr-10']/span/text()")[0].strip().split('/')[-3]
                # Publish date: second field counted from the end.
                publish_time = li.xpath(".//div[@class='lh-30 fs-12']/div[@class='wd-30p fl to-hd cl-9']/span/text()")[0].strip().split('/')[-2]
                # urljoin handles relative hrefs such as '/book/123.html',
                # which a plain requests.get(son_url) would reject.
                son_url = urljoin(url, li.xpath("./a/@href")[0])
                resp_son = requests.get(son_url, headers=headers, timeout=15)
                resp_son.encoding = resp_son.apparent_encoding
                html_son = etree.HTML(resp_son.text)
                if html_son is None:
                    continue
                # NOTE(review): '/tbody/' usually exists only in the browser
                # DOM, not in the raw HTML that lxml parses — if these two
                # queries return nothing, try removing 'tbody/' from them.
                # Sales figure from the detail-page info table.
                score = html_son.xpath("//table[@class='lh-30 mt-10']/tbody/tr/td/text()")[0].strip()
                # Stock description.
                intro = ''.join(html_son.xpath("//table[@class='lh-30 mt-10']/tbody/tr/td[@class='cl-3 clearfix']/span/text()"))
                books.append([name, author, intro_1, publisher, publish_time, score, intro])
            except (IndexError, requests.RequestException):
                # A missing field or a failed detail request: skip this book.
                continue
        # Pause 3-5 seconds between listing pages to stay polite.
        time.sleep(random.randint(3, 5))
    return books
def save_to_csv(books, category):
    """Persist scraped book rows to ``<category>.csv`` in UTF-8.

    :param books: iterable of 7-element rows produced by ``fetch_books``
    :param category: category id, reused as the CSV file name stem
    """
    header = ['书名', '作者', '价格', '出版社', '出版日期', '销量', '库存']
    with open(f'{category}.csv', mode='w', encoding='utf-8', newline='') as fh:
        # Header first, then every data row in one call.
        csv.writer(fh).writerows([header, *books])
if __name__ == "__main__":
category = '1018'
books = fetch_books(category)
save_to_csv(books, category)
# 原帖提问:运行没报错,但也没爬到相关数据,提取信息那块有问题,不知道怎么改。
# 解答 "[0].strip().split('/')[0]" 的含义:XPath 返回的是一个列表,[0] 取列表中
# 第一个匹配到的文本节点;.strip() 去掉首尾空白字符;.split('/') 按 '/' 把字符串
# 切成若干段;最后的 [0] 取其中第一段(通常就是作者名)。同理 [-3]、[-2] 分别取
# 倒数第三段和倒数第二段。
# 抓不到数据的常见原因:XPath 与当前页面实际结构不符(尤其是 lxml 解析的原始
# HTML 中往往没有 tbody 节点,而浏览器开发者工具里会显示有)、页面改版、或被
# 反爬返回了空白/验证页,需要对照 response.text 逐条核对选择器。