import csv
import re
import time
from datetime import datetime, timedelta

import requests
import urllib3

# Suppress InsecureRequestWarning, since the requests below are made with verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class XiaomiCarSafetyCrawler:
    def __init__(self):
        # Request headers; the Cookie must be replaced with your own from a logged-in weibo.com session
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
            'Cookie': 'SCF=AkOpnqaTAgfNhJckiG6t5SgoeTg341BVJvXaT54-ua6mudRJ4pNBFP-QcEkSdpuYumtUkhvQJIdTMW1t6s745cE.; SUB=_2A25ELRlrDeRhGe9O61EQ9ifPzzSIHXVnQxSjrDV8PUNbmtANLXjNkW9NdPXGVgHgEe-abE8JLbplTzaRF-FidrHo; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh.J8y_l7yKo6i-2cVy5pXr5JpX5KzhUgL.Fo.7ehepSo.0Shn2dJLoIEBLxKML1K.LB.BLxKML1K2L1-eLxK-LBKBL1K-LxKqL1KnL1-qt; ALF=02_1766913595; SINAGLOBAL=9296333081626.326.1764321612821; _s_tentry=www.weibo.com; Apache=322539749715.7223.1764480589922; ULV=1764480589924:3:3:1:322539749715.7223.1764480589922:1764414571441',  # must be replaced with your own Cookie
            'Referer': 'https://s.weibo.com/',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'X-Requested-With': 'XMLHttpRequest'
        }
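        # How to obtain the Cookie (a general approach, not specific to this script):
        # log in to https://weibo.com in a browser, open DevTools -> Network, load
        # https://s.weibo.com, and copy the Cookie request header of any request there.
        # A missing or expired Cookie is a common reason the search request stops returning JSON.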
        # Weibo search endpoint
        self.search_url = 'https://s.weibo.com/ajax/searchscroll'
        # Accumulates every record scraped across all pages
        self.all_data = []

    def format_time(self, time_str):
        """Normalize Weibo's relative publish-time strings to 'YYYY-MM-DD HH:MM:SS'."""
        try:
            if '刚刚' in time_str:  # "just now"
                return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            elif '分钟前' in time_str:  # "N minutes ago"
                minutes = int(time_str.replace('分钟前', ''))
                return (datetime.now() - timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M:%S')
            elif '小时前' in time_str:  # "N hours ago"
                hours = int(time_str.replace('小时前', ''))
                return (datetime.now() - timedelta(hours=hours)).strftime('%Y-%m-%d %H:%M:%S')
            elif '昨天' in time_str:  # "yesterday HH:MM"
                t = time_str.replace('昨天 ', '')
                return (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d') + ' ' + t
            elif len(time_str) == 5:  # e.g. "12:30" (today)
                return datetime.now().strftime('%Y-%m-%d') + ' ' + time_str
            elif len(time_str) == 11:  # e.g. "05-10 12:30" (11 characters, not 8)
                return datetime.now().strftime('%Y-') + time_str
            else:
                return time_str
        except Exception:
            # Fall back to the raw string if parsing fails
            return time_str
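
    # Illustrative examples, based purely on the branches above:
    #   format_time('5分钟前')     -> current time minus 5 minutes
    #   format_time('昨天 08:30')  -> "<yesterday's date> 08:30"
    #   format_time('05-10 12:30') -> "<current year>-05-10 12:30"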

    def fetch_page(self, keyword, page):
        """Fetch a single page of search results."""
        params = {
            'q': keyword,
            'page': page,
            'cate': 'weball',
            'is_search': '1',
            'is_newpage': '1',
            'scroll_id': f'{int(time.time() * 1000)}'  # millisecond timestamp used as scroll_id
        }
        try:
            response = requests.get(
                self.search_url,
                headers=self.headers,
                params=params,
                timeout=15,
                verify=False
            )
            # Check the HTTP status code
            if response.status_code != 200:
                print(f"Request failed, status code: {response.status_code}")
                return None
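            # Added guard (an assumption about the failure mode, not part of the original
            # logic): with an invalid or expired Cookie, Weibo often answers with an HTML
            # login/verification page instead of JSON, and response.json() then raises.
            if 'json' not in response.headers.get('Content-Type', '').lower():
                print(f"Page {page} did not return JSON; the Cookie may be invalid or expired")
                return None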
            return response.json()
        except Exception as e:
            print(f"Error fetching page {page}: {str(e)}")
            return None

    def parse_data(self, json_data):
        """Parse the JSON payload and extract the fields we need."""
        parsed = []
        # Extract the list of cards
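        # Note: the 'data' -> 'cards' -> 'mblog' layout assumed below matches the
        # mobile-API style payload (m.weibo.cn container API); whether the s.weibo.com
        # endpoint used above returns the same shape is an assumption of this script.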
        cards = json_data.get('data', {}).get('cards', [])
        for card in cards:
            mblog = card.get('mblog')
            if not mblog:
                continue
            # Author information
            user = mblog.get('user', {})
            # Strip HTML tags from the post text
            content = re.sub(r'<[^>]+>', '', mblog.get('text', ''))
            # Build one record
            item = {
                '用户名': user.get('screen_name', '未知用户'),
                '发布时间': self.format_time(mblog.get('created_at', '')),
                '发布地点': mblog.get('region_name', ''),  # publishing location
                '转发数': mblog.get('reposts_count', 0),
                '评论数': mblog.get('comments_count', 0),
                '点赞数': mblog.get('attitudes_count', 0),
                '内容': content.strip()
            }
            parsed.append(item)
        return parsed

    def save_to_csv(self, filename='小米汽车安全微博数据.csv'):
        """Save the collected records to a CSV file."""
        if not self.all_data:
            print("No data to save")
            return
        # Column order for the CSV header
        headers = ['用户名', '发布时间', '发布地点', '转发数', '评论数', '点赞数', '内容']
        with open(filename, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            writer.writerows(self.all_data)
        print(f"Data saved to {filename}, {len(self.all_data)} records in total")

    def run(self, keyword='小米汽车安全', pages=5):
        """Main crawler loop."""
        print(f"Start crawling keyword: {keyword}, {pages} pages in total")
        for page in range(1, pages + 1):
            print(f"Crawling page {page}...")
            # Fetch the page payload
            json_data = self.fetch_page(keyword, page)
            if not json_data:
                print(f"Page {page} returned no usable data, stopping")
                break
            # Parse the payload
            page_data = self.parse_data(json_data)
            if not page_data:
                print(f"No records parsed from page {page}, stopping")
                break
            # Accumulate the records
            self.all_data.extend(page_data)
            print(f"Page {page} done, {len(page_data)} records collected")
            # Pause between requests to reduce the chance of being rate-limited
            time.sleep(3)
        # Persist everything to CSV
        self.save_to_csv()
        return self.all_data


if __name__ == "__main__":
    # Create the crawler instance
    crawler = XiaomiCarSafetyCrawler()
    # Run the crawler for 5 pages of results
    # Note: replace the Cookie in the code with your own first
    crawler.run(pages=5)

Why does running this go wrong? Please point out the cause and give the corrected code and a fix.