为什么会出现“抓取数据时发生错误: get_mooncake_data() missing 1 required positional argument: 'driver'”这个问题?应该怎样修改,才能得到可以正确运行的爬虫代码?
1.0 scraper.py
import os
import pickle
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import logging
# Configure logging once for the scraper module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Machine-specific path and account credentials. Each value can be supplied
# through an environment variable so that real secrets do not have to be
# written into the source file; the fallbacks are the previous literals.
# The raw string keeps the Windows backslashes literal.
GECKODRIVER_PATH = os.environ.get('GECKODRIVER_PATH', r'D:\Mozilla Firefox\geckodriver.exe')
USERNAME = os.environ.get('TAOBAO_USERNAME', '')  # Taobao account name
PASSWORD = os.environ.get('TAOBAO_PASSWORD', '')  # Taobao account password
def save_cookies(driver, filename='cookies.pkl'):
    """Persist the cookies of the current browser session with pickle.

    Args:
        driver: Object exposing ``get_cookies()`` (the Selenium WebDriver).
        filename: Destination path; an existing file is overwritten.
    """
    # Ask the driver first: opening the file in 'wb' mode truncates it, so a
    # failing driver call must not happen while the file is already open or
    # the previously saved cookies would be lost.
    cookies = driver.get_cookies()
    with open(filename, 'wb') as f:
        pickle.dump(cookies, f)
def load_cookies(driver, filename='cookies.pkl'):
    """Load pickled cookies from ``filename`` into the driver and refresh.

    Args:
        driver: Object exposing ``add_cookie(cookie)`` and ``refresh()``.
        filename: Path of a cookie file written with pickle.

    Returns:
        True when at least one cookie was added and the page was refreshed;
        False when the file is missing, unreadable, empty, or a cookie is
        rejected by the driver (the failure is logged).
    """
    # NOTE(review): Selenium presumably only accepts cookies for the site that
    # is currently open, so the caller should navigate before calling this.
    try:
        # NOTE(security): pickle can run arbitrary code while loading; only
        # read cookie files that this program wrote itself.
        with open(filename, 'rb') as f:
            cookies = pickle.load(f)
        if not cookies:
            # An empty cookie list cannot carry a login session; report
            # failure so the caller falls back to a fresh login.
            return False
        for cookie in cookies:
            driver.add_cookie(cookie)
        driver.refresh()
        return True
    except Exception as e:
        # Deliberately broad: any failure here just means "no usable cookies".
        logging.error(f"加载 cookies 失败: {e}")
        return False
def setup_driver():
    """Start a Firefox WebDriver configured for scraping.

    Returns:
        The started ``webdriver.Firefox`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()
    # Ask Firefox not to advertise that it is driven by automation.
    options.set_preference('dom.webdriver.enabled', False)
    # Firefox does not understand Chrome's --ignore-certificate-errors and
    # --incognito switches; use the Firefox equivalents instead.
    options.accept_insecure_certs = True
    options.add_argument('-private')
    service = Service(GECKODRIVER_PATH)
    driver = webdriver.Firefox(service=service, options=options)
    # NOTE(review): this patches navigator.webdriver only in the document that
    # is loaded right now; presumably it does not survive the next navigation
    # - confirm whether it is still needed next to the preference above.
    driver.execute_script('Object.defineProperty(navigator, "webdriver", {get: () => undefined})')
    return driver
def login(driver):
    """Sign in on the Taobao login page and store the session cookies.

    Fills the first text input and the password input with ``USERNAME`` and
    ``PASSWORD``, clicks the login button, waits for the post-login marker
    element, then calls ``save_cookies``. Every wait uses a 30 second timeout.

    Args:
        driver: Selenium WebDriver used for the whole login flow.
    """
    text_input = (By.XPATH, '//input[@type="text"]')
    wait = WebDriverWait(driver, 30)

    driver.get('https://login.taobao.com/member/login.jhtml')
    wait.until(EC.presence_of_element_located(text_input))

    # Locate both fields before typing so a missing field fails early.
    user_box = driver.find_element(*text_input)
    pass_box = driver.find_element(By.XPATH, '//input[@type="password"]')
    for box, value in ((user_box, USERNAME), (pass_box, PASSWORD)):
        box.send_keys(value)

    submit = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//button[text()="登录"]'))
    )
    submit.click()

    # Presence of this element is used as the sign that login succeeded;
    # adjust the class name if the logged-in page changes.
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'site-nav-menu-hd')))

    save_cookies(driver)
    logging.info("登录成功并已保存 cookies.")
def get_mooncake_data(driver=None):
    """Scrape title and price of the Taobao search results for mooncakes.

    Args:
        driver: Selenium WebDriver to use. When omitted, a driver is created
            with ``setup_driver()`` and quit again before returning, so the
            function can also be called without arguments.

    Returns:
        A list of ``{'title': str, 'price': str}`` dicts, one per result item
        that could be parsed. Items that fail to parse are logged and skipped.
    """
    owns_driver = driver is None
    if owns_driver:
        driver = setup_driver()
    try:
        return _collect_mooncake_items(driver)
    finally:
        # Only close a browser that was opened here; a caller-supplied driver
        # stays under the caller's control.
        if owns_driver:
            driver.quit()


def _collect_mooncake_items(driver):
    """Open the search page, make sure a session exists, and parse the items."""
    url = "https://s.taobao.com/search?q=%E6%9C%88%E9%A5%BC"
    driver.get(url)
    # Try previously saved cookies first; fall back to an interactive login.
    if not load_cookies(driver):
        logging.info("没有找到有效的 Cookies,正在进行自动登录...")
        login(driver)
        # login() navigates to the login page, so go back to the search
        # results before waiting for items. The fresh session is already
        # active, so the saved cookies do not need to be loaded again.
        driver.get(url)
    # Wait until at least one result item is present.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'item'))
    )
    mooncake_data = []
    items = driver.find_elements(By.CLASS_NAME, 'item')
    logging.info(f"Found {len(items)} items.")
    for i, item in enumerate(items):
        try:
            title = item.find_element(By.CSS_SELECTOR, '.title')
            price = item.find_element(By.CLASS_NAME, 'price')
            mooncake_data.append({
                'title': title.text.strip(),
                'price': price.text.strip(),
            })
        except Exception as e:
            # Best effort: one malformed item must not abort the whole scrape.
            logging.error(f"错误: {e} 在处理 item {i+1} 时")
    return mooncake_data
def main():
    """Scrape once with a fresh browser and print every record."""
    browser = setup_driver()
    try:
        # Cookie loading is handled inside get_mooncake_data, not here.
        records = get_mooncake_data(browser)
        for record in records:
            print(record)
    finally:
        # Always close the browser, even when scraping failed.
        browser.quit()


if __name__ == "__main__":
    main()
2.0 main.py
from PyQt5.QtWidgets import QApplication, QWidget
import sys
from gui import MooncakeApp # 确保你有这个类
if __name__ == "__main__":
    # The QApplication object has to exist before any widget is created.
    qt_app = QApplication(sys.argv)
    main_window = MooncakeApp()
    main_window.show()
    # exec_() runs the PyQt5 event loop; its return value becomes the
    # process exit status.
    exit_status = qt_app.exec_()
    sys.exit(exit_status)
3.0 gui.py
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QPushButton, QLabel, QTableWidget, QTableWidgetItem
import sys
from scraper import get_mooncake_data
from visualizer import visualize_data
class MooncakeApp(QWidget):
    """Main window with a scrape button, a result table and a chart button."""

    # Column header paired with the record key shown in that column.
    _COLUMNS = (
        ('标题', 'title'),
        ('价格', 'price'),
        ('销量', 'sales_volume'),
        ('店铺', 'shop_name'),
        ('评分', 'rating'),
    )

    def __init__(self):
        """Build the widgets and connect the button signals."""
        super().__init__()
        self.setWindowTitle("月饼销售数据爬虫")
        self.setGeometry(100, 100, 800, 600)
        # NOTE(review): this attribute name hides the inherited
        # QWidget.layout() accessor; kept as-is for backward compatibility.
        self.layout = QVBoxLayout()
        self.label = QLabel("月饼销售数据", self)
        self.layout.addWidget(self.label)
        self.btn_scrape = QPushButton("抓取数据", self)
        self.btn_scrape.clicked.connect(self.scrape_data)
        self.layout.addWidget(self.btn_scrape)
        self.table = QTableWidget(self)
        self.layout.addWidget(self.table)
        self.btn_visualize = QPushButton("展示可视化图", self)
        self.btn_visualize.clicked.connect(visualize_data)
        self.layout.addWidget(self.btn_visualize)
        self.setLayout(self.layout)

    def scrape_data(self):
        """Scrape with a temporary browser session and show the result.

        Errors are printed instead of raised so a failed scrape does not
        take the window down.
        """
        # Imported locally so this method does not depend on the module's
        # import list also naming setup_driver.
        from scraper import setup_driver
        try:
            # get_mooncake_data needs a WebDriver; calling it without one is
            # what raised "missing 1 required positional argument: 'driver'".
            driver = setup_driver()
            try:
                data = get_mooncake_data(driver)
            finally:
                # Close the browser whether or not scraping succeeded.
                driver.quit()
            if data:
                self.show_data_in_table(data)
            else:
                print("没有抓取到数据")
        except Exception as e:
            print(f"抓取数据时发生错误: {e}")

    def show_data_in_table(self, data):
        """Fill the table with ``data``.

        Args:
            data: Sequence of rows. A dict row is mapped to columns by key
                (missing keys give an empty cell); any other iterable row is
                written positionally.
        """
        headers = [header for header, _ in self._COLUMNS]
        self.table.setRowCount(len(data))
        self.table.setColumnCount(len(headers))
        self.table.setHorizontalHeaderLabels(headers)
        for i, row in enumerate(data):
            if isinstance(row, dict):
                # Iterating a dict yields its keys, so look the values up
                # explicitly in column order.
                cells = [row.get(key, '') for _, key in self._COLUMNS]
            else:
                cells = list(row)[:len(headers)]
            for j, cell in enumerate(cells):
                self.table.setItem(i, j, QTableWidgetItem(str(cell)))
if __name__ == "__main__":
    # Allows running this module directly to open the window.
    qt_app = QApplication(sys.argv)
    main_window = MooncakeApp()
    main_window.show()
    exit_status = qt_app.exec_()
    sys.exit(exit_status)
4.0 visualizer.py
import matplotlib.pyplot as plt
import pandas as pd
import mysql.connector
def visualize_data():
    """Show a scatter plot of price against sales volume read from MySQL.

    Reads ``title``, ``price`` and ``sales_volume`` from the
    ``mooncake_sales`` table of the ``taobao_sales`` database.
    """
    # NOTE(review): this password differs from the one used by the other
    # database code in this project - confirm which value is correct.
    conn = mysql.connector.connect(
        host="localhost", user="root", password="1ot", database="taobao_sales"
    )
    try:
        query = "SELECT title, price, sales_volume FROM mooncake_sales"
        df = pd.read_sql(query, conn)
    finally:
        # Close the connection even when the query fails, so it is not leaked.
        conn.close()
    # Plot sales volume against price.
    plt.figure(figsize=(10, 6))
    plt.scatter(df['price'], df['sales_volume'], c='blue', alpha=0.5)
    plt.title('月饼价格与销量关系图')
    plt.xlabel('价格 (¥)')
    plt.ylabel('销量')
    plt.show()
5.0 config.py
# Sample configuration file.
# Connection settings for the MySQL database.
DB_CONFIG = dict(
    host='localhost',
    user='root',
    password='root',
    database='taobao_sales',
)

# User-Agent string for HTTP requests.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
6.0 db_handler.py
import mysql.connector
# Order of the values expected by the INSERT statement below.
_MOONCAKE_COLUMNS = ('title', 'price', 'sales_volume', 'shop_name', 'rating')


def _as_insert_params(record):
    """Return ``record`` as a tuple ordered like ``_MOONCAKE_COLUMNS``."""
    if isinstance(record, dict):
        # Positional %s placeholders cannot bind a dict; pick the values by
        # key. Missing keys become None (assumes the columns accept NULL -
        # TODO confirm against the table definition).
        return tuple(record.get(column) for column in _MOONCAKE_COLUMNS)
    return tuple(record)


def save_to_mysql(mooncake_data):
    """Insert scraped records into the ``mooncake_sales`` table.

    Args:
        mooncake_data: Iterable of records. Each record is either a dict
            keyed by column name (``title``, ``price``, ``sales_volume``,
            ``shop_name``, ``rating``) or a sequence of five values in that
            order.

    Returns:
        None. With no records the function returns without connecting.
    """
    rows = [_as_insert_params(record) for record in mooncake_data]
    if not rows:
        return
    conn = mysql.connector.connect(
        host="localhost", user="root", password="root", database="taobao_sales"
    )
    try:
        cursor = conn.cursor()
        try:
            for row in rows:
                cursor.execute(
                    """
                    INSERT INTO mooncake_sales (title, price, sales_volume, shop_name, rating)
                    VALUES (%s, %s, %s, %s, %s)
                    """,
                    row,
                )
            # Commit once after all rows so the batch is stored together.
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Release the connection even when an insert fails.
        conn.close()