




# Question: why aren't sales (销量) and stock (库存) being scraped? (Bare text kept as a comment — it was a SyntaxError.)
import csv
import random
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import requests
from lxml import etree
def header_x():
    """Return request headers carrying a randomly picked desktop User-Agent."""
    ua_pool = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:3.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.5.2',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    )
    # A rotating UA makes successive requests look less like a single bot.
    return {"User-Agent": random.choice(ua_pool)}
def fetch_books(session, category_id, page):
    """Scrape one listing page of a category on bookuu.com.

    Args:
        session: requests.Session carrying the site's cookies.
        category_id: category id string used in the search URL.
        page: 1-based page number to fetch.

    Returns:
        List of [name, author, price, publisher, pub_date, sales, stock]
        rows; empty list on request failure or non-200 status.
    """
    books = []
    base_url = "https://www.bookuu.com/"
    url = f"https://www.bookuu.com/search.php?cid={category_id}&page={page}"
    headers = header_x()
    # Bug fix: the original comment claimed a timeout was passed to
    # session.get(), but it was not -- a hung request would block its
    # worker thread forever.
    try:
        response = session.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"Request failed for page {page} of category {category_id}: {e}")
        return books
    if response.status_code != 200:
        print(f"Failed to fetch page {page} for category {category_id}, status code: {response.status_code}")
        return books
    html = etree.HTML(response.text)
    # Each listing item lives in a div whose class contains 'wd-640'.
    lis = html.xpath("//div[contains(@class, 'wd-640')]")
    for li in lis:
        try:
            # Book title.
            name = li.xpath(".//a/text()")[0].strip()
            # Author (second span of the 'wd-30p' info block).
            author_div = li.xpath(".//div[contains(@class, 'wd-30p')]/span[2]/text()")
            author = author_div[0].strip() if author_div else "未知"
            # Price.
            price = li.xpath(".//div[@class='lh-30']/span/text()")[0].strip()
            # Publisher.  NOTE(review): the publisher and pub-date XPaths
            # below are identical, so pub_date always duplicates publisher;
            # the pub-date expression probably needs a different span
            # index -- confirm against the live page markup.
            publisher_info = li.xpath(".//div[contains(@class , 'wd-30p')]/span[contains(@style,'color: #212121;')]/text()")
            publisher = publisher_info[0].strip() if publisher_info else "未知"
            pub_date_info = li.xpath(".//div[contains(@class , 'wd-30p')]/span[contains(@style,'color: #212121;')]/text()")
            pub_date = pub_date_info[0].strip() if pub_date_info else "未知"
            # Follow the item's detail page.  urljoin handles relative and
            # absolute hrefs alike; plain concatenation produced broken
            # URLs whenever the href was already absolute or root-relative.
            son_path = li.xpath("./a/@href")[0]
            son_url = urljoin(base_url, son_path)
            resp_son = session.get(son_url, headers=header_x(), timeout=10)
            html_son = etree.HTML(resp_son.text)
            # Sales.  Bug fix (hedged): the original searched the *listing*
            # element for span.cl-3, which appears to live on the detail
            # page (the detail page was fetched but never used for sales),
            # so sales always fell back to "0" -- the likely cause of the
            # "sales/stock not scraped" symptom.  Try the detail page
            # first, fall back to the original listing lookup.
            sales = html_son.xpath("//span[@class='cl-3']/text()") or li.xpath(".//span[@class='cl-3']/text()")
            sales = sales[0].strip() if sales else "0"
            # Stock.  Guard the split: when the span text lacks the
            # "库存:" prefix the original [1] index raised IndexError and
            # threw away the whole book row.
            stock_info = html_son.xpath(".//span[@id='www_goods_stores']/text()")
            if stock_info and "库存:" in stock_info[0]:
                stock = stock_info[0].split("库存:")[1].strip()
            else:
                stock = "无"
            books.append([name, author, price, publisher, pub_date, sales, stock])
        except Exception as e:  # original (IndexError, Exception) was redundant
            print(f"提取书籍信息失败: {e}")
            continue
    return books
def save_to_csv(books, filename):
    """Write scraped book rows to *filename* as UTF-8 CSV, header row first."""
    header = ['书名', '作者', '价格', '出版社', '出版日期', '销量', '库存']
    with open(filename, mode='w', encoding='utf-8', newline='') as out:
        csv.writer(out).writerows([header, *books])
def main():
    """Crawl the configured categories concurrently and save results to books.csv."""
    category_ids = ['1018', ]  # sample category IDs (full list commented at end of file)
    pages = 5  # pages to crawl per category
    session = requests.Session()
    # Bug fix: the original used session.cookies.set("cookie", "<whole
    # string>"), which sends a SINGLE cookie literally named "cookie"
    # whose value is the entire string -- the server never receives
    # PHPSESSID, wwwsid, etc., so the session was effectively anonymous.
    # Sending the raw string as the Cookie header preserves every
    # name=value pair exactly as the browser would.
    cookie_str = "parent_qimo_sid_92464560-3f16-11e9-8a25-8d8585556f17=842fa102-a4ef-4247-b504-7856a955c2d4; accessId=92464560-3f16-11e9-8a25-8d8585556f17; pv_id=ce979e0a977dca375ff535c9c9e9176c; _uab_collina=174098147261023766843709; wwwsid=d32f327bda1b54a31c8b6d0feb1bb4fd; tfstk=gAVoQ6AzN8kSJWr92ol5B8tvGsWxNUGIYkdKvXnF3moX23d88vq30u9JeyS7xWm400E8eye08uaUxv_SvDo3vyyRk1CTPzGITXjO61FcLspTx4kr4r7E5VTPkYWIMVaZTGIOMdncuOlU2-1DQs4qc2ArY0rEgjun0BuUY0k20V0tTDrUYxRq540eaLuygKoj8XoUYX70ubKCU0NUgSS49ps5pJq_i4DobrX6wQOszhnUzqOeT00oEZ4rmBRUi-p4YToPCEnIw0arrk16Nbk3L7h4ai5E_RZ0tAlGpaMz--qIwS7eUcy82vFSn3JrSbmoQ7kAsZljL-Vmw7SBCrUm4AcYHts-pbquCcMPhiZ3o0EUZx-lDDeTk7k0bid0Av2UwjPP01jyBKJNnmdIuwF2dpMrlqm9V2RfrJVFnm7cod7IUqgSXZbDdpMrlqmOoZvNOYujPcC..; PHPSESSID=4ingct6o5llmvvsbn9ssakcl83; href=https%3A%2F%2Fwww.bookuu.com%2Findex.php; qimo_seosource_0=%E7%AB%99%E5%86%85; qimo_seokeywords_0=; qimo_seosource_92464560-3f16-11e9-8a25-8d8585556f17=%E7%AB%99%E5%86%85; qimo_seokeywords_92464560-3f16-11e9-8a25-8d8585556f17=; qimo_xstKeywords_92464560-3f16-11e9-8a25-8d8585556f17=; pageViewNum=238"  # 替换为实际的 Cookie
    session.headers["Cookie"] = cookie_str
    all_books = []
    # Fan the (category, page) requests out over a small thread pool;
    # fetching is I/O-bound so threads overlap the network waits.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(fetch_books, session, cid, page)
            for cid in category_ids
            for page in range(1, pages + 1)
        ]
        for future in futures:
            all_books.extend(future.result())
    save_to_csv(all_books, "books.csv")
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    main()
# '101802', '1020', '1021', '1022', '1023', '103713', '103714', '103715', '103716', '103701',
# '103702', '103703', '103704', '103705', '103706', '103707', '103708', '103708', '103711', '103701',
# '1038', '1039', '1040', '1042', '1043', '1044', '1045', '1024', '1025', '1026', '1041',
# '1026', '1027', '1028', '1029', '1030', '1032', '1033', '1034', '1035', '1017', '1048', '1048',
# '1049', '1050', '1051', '1052', '1053', '1054', '1055', '1056', '1057', '1058', '1011',