import os
import csv
import re
import time
import random

import requests
from bs4 import BeautifulSoup

# Path of the current working directory
current_folder = os.getcwd()


def crawl_and_save_links():
    # Collected [link, text] results
    results = []
    # Regular expression matching http/https links
    link_pattern = re.compile(r'https?://[^\s]+')
    # Walk the CSV files in the current folder
    for file in os.listdir(current_folder):
        if file.endswith('.csv'):
            file_path = os.path.join(current_folder, file)
            with open(file_path, 'r') as csv_file:
                csv_reader = csv.reader(csv_file)
                for row in csv_reader:
                    for cell in row:
                        match = link_pattern.search(cell)
                        if match:
                            link = match.group()
                            try:
                                response = requests.get(link, timeout=10)
                                if response.status_code == 200:
                                    soup = BeautifulSoup(response.text, 'html.parser')
                                    text = soup.get_text()
                                    results.append([link, text])
                            except requests.exceptions.RequestException as e:
                                print(f"Error crawling {link}: {e}")
                            # Random delay between requests to avoid hammering servers
                            time.sleep(random.uniform(1, 3))
    # Save the results to a new CSV file
    with open('crawled_results.csv', 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(['Link', 'Text content'])
        for result in results:
            csv_writer.writerow(result)


if __name__ == '__main__':
    crawl_and_save_links()
One remaining problem: the text saved to the results file comes out garbled (mojibake) and contains unnecessary whitespace.
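These two symptoms usually have separate causes. The mojibake typically means requests decoded the page with the wrong charset (it trusts the HTTP headers, which many sites misreport), or that the CSV was opened in Excel, which expects a BOM on UTF-8 files; the stray whitespace is get_text() reproducing the page's layout. Below is a minimal sketch of the relevant fixes, assuming the pages declare their real encoding in the HTML body and the results file is viewed in Excel; fetch_clean_text is a hypothetical helper, not part of the original script:

import re
import requests
from bs4 import BeautifulSoup

def fetch_clean_text(link):
    """Fetch a page and return its visible text with encoding and whitespace cleaned up."""
    response = requests.get(link, timeout=10)
    # requests decodes the body using the HTTP Content-Type header; apparent_encoding
    # sniffs the bytes instead, which usually fixes mojibake on misconfigured sites
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    # strip=True trims each text fragment; separator=' ' joins fragments with one space
    text = soup.get_text(separator=' ', strip=True)
    # collapse any remaining runs of whitespace (tabs, newlines) into single spaces
    return re.sub(r'\s+', ' ', text)

# When writing the results, 'utf-8-sig' prepends a BOM so Excel detects UTF-8:
# with open('crawled_results.csv', 'w', newline='', encoding='utf-8-sig') as output_file:
#     ...

If the input CSVs themselves contain non-ASCII text, passing an explicit encoding to open() when reading them avoids the same problem on the input side.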