The URL filtering in the following code does not work correctly. Please help optimize it so that every URL in `content` is removed.
```python
import pymysql
import pymysql.cursors
from bs4 import BeautifulSoup
import csv
import re
import os

# Database connection settings
config = {
    'host': 'localhost',
    'user': 'root',
    'password': '',
    'database': '',
    'charset': 'utf8mb4',  # assumed; utf8mb4 is the usual choice for Chinese text
    'cursorclass': pymysql.cursors.DictCursor
}

# Connect to the database
connection = pymysql.connect(**config)

def fix_urls(text):
    # Regexes that repair common malformed URL prefixes
    url_patterns = [
        (r'http: (\S+)', r'http://\1'),      # "http: host": missing "//" after the colon
        (r'http//(\S+)', r'http://\1'),      # "http//host": missing the colon
        (r'http:/(?!/)(\S+)', r'http://\1')  # "http:/host": only one "/" after the colon
    ]
    for pattern, replacement in url_patterns:
        text = re.sub(pattern, replacement, text)
    return text
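
# Example of what fix_urls repairs (hypothetical input):
#   fix_urls('see http: example.com and http//example.cn')
#   -> 'see http://example.com and http://example.cn'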

try:
    with connection.cursor() as cursor:
        # SQL query
        sql = "SELECT id, content FROM zhengwu_copy LIMIT 200000"
        cursor.execute(sql)

        # Prepare the CSV output file
        csv_file = open('C:/Users/lvdon/Desktop/output.csv', 'w', newline='', encoding='utf-8')
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['id', 'content'])
        count = 0
        file_count = 0

        # Compile the link-matching regex
        link_regex = re.compile(
            r'http[s]?://(?:www\.)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
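        # Why URLs slip through: the alternation above accepts only the listed
        # characters, so a URL containing e.g. "#" or "~" is matched only up to
        # that character and the rest is left in the text. Also, "$-_" is a
        # character *range* (0x24-0x5F) that happens to include "<" and ">",
        # so a match can run into adjacent HTML markup.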
        unwanted_patterns = [
            r'首页/通知公告', r'点击播放视频', r'相关文档:', r'附件下载:',
            r'\(此件公开发布\)',  # parentheses escaped so they match literally
            r'关联文件:', r'【我要纠错】', r'【打印本页】'
        ]
        unwanted_regexes = [re.compile(pattern) for pattern in unwanted_patterns]

        while True:
            result = cursor.fetchmany(10000)
            if not result:
                break
            for row in result:
                try:
                    if row['content']:
                        # If content is actually a path to a local file, read the file
                        if os.path.isfile(row['content']):
                            with open(row['content'], 'r', encoding='utf-8') as file:
                                html_content = file.read()
                        else:
                            html_content = row['content']
                        # Normalize malformed URL prefixes
                        html_content = fix_urls(html_content)
                        # Remove specific characters and boilerplate strings
                        cleaned_html_content = html_content.replace('■', '').replace('▌', '')
                        for regex in unwanted_regexes:
                            cleaned_html_content = regex.sub('', cleaned_html_content)
                        # Strip links before removing the HTML tags
                        cleaned_html_content = link_regex.sub('', cleaned_html_content)
                        # Use BeautifulSoup to strip the remaining HTML tags
                        soup = BeautifulSoup(cleaned_html_content, 'html.parser')
                        # Drop <img> and <video> tags entirely
                        for tag in soup(['img', 'video']):
                            tag.decompose()
                        # Extract the cleaned plain text
                        cleaned_content = soup.get_text()
                        csv_writer.writerow([row['id'], cleaned_content])
                        count += 1
                except Exception as e:
                    print(f"Error processing row {row['id']}: {e}")
            # Roll over to a new CSV file every 100000 rows
            if count >= 100000:
                csv_file.close()
                file_count += 1
                csv_file = open(f'C:/Users/lvdon/Desktop/output_{file_count + 1}.csv', 'w',
                                newline='', encoding='utf-8')
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(['id', 'content'])
                count = 0
finally:
    connection.close()
    csv_file.close()
    print("Export finished!")
```
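
For reference, the replacement I am considering: instead of the alternation, use one character class built from the characters RFC 3986 allows in URLs, and also catch bare `www.` hosts. This is a minimal sketch, not a finished fix; `URL_CHARS`, `strip_urls`, and the sample string are hypothetical names/data of mine, not part of the original code. The substitution would go in the same place as the original `link_regex.sub`, after `fix_urls`.

```python
import re

# Characters RFC 3986 permits in a URL: unreserved (A-Za-z0-9 - . _ ~),
# gen-delims (: / ? # [ ] @), sub-delims (! $ & ' ( ) * + , ; =),
# plus "%" for percent-escapes.
URL_CHARS = r"[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]"

# Match http/https URLs as well as bare "www." hosts in one pass.
link_regex = re.compile(r"(?:https?://|www\.)" + URL_CHARS + "+", re.IGNORECASE)

def strip_urls(text):
    """Remove every URL-like token from the text."""
    return link_regex.sub('', text)

# Hypothetical check: query strings, fragments, and "~" are removed too,
# and the match stops at whitespace or the first non-URL character.
sample = '详见 https://example.com/a?b=1#c 或 www.example.cn查询'
print(strip_urls(sample))  # -> '详见  或 查询'
```

Since the goal is deletion rather than extraction, it seems acceptable that this also consumes punctuation glued to the end of a URL (e.g. a trailing full stop); the match stops at whitespace and at non-ASCII characters, so the surrounding Chinese text is preserved.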