在爬虫时出现attributeerror 在尝试过多次方法也未能得到解决 希望各位能帮忙看下代码
代码如下:
```python
import requests
from bs4 import BeautifulSoup
import urllib.request
import xml.etree.ElementTree as ET
import time
import urllib.parse
import socket
User_Agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56'
headers = {'User-Agent': User_Agent}
# 得到所有新闻rul链接
def get_url_list_025(url):
init_page = requests.get(url, headers=headers).content
re = urllib.request.Request(url, headers=headers)
try:
response = urllib.request.urlopen(re, timeout=10)
html = response.read().decode("utf-8") # 获得html文档
except:
print('爬取失败:', url)
news_list = []
root = 'https://china.huanqiu.com'
f1 = open('国内要闻_国内新闻_环球网.html', 'r', encoding='utf-8') # 从文件中读取HTML文档
soup = BeautifulSoup(f1, "html.parser") # 解析HTML文档
f1.close()
news_list1 = soup.find('div', class_='m-recommend-con') # 先找到大的div标签
items = news_list1.find_all('li') # 找到所有‘li’标签
for i, item in enumerate(items):
if len(item) == 0:
continue
a = item.find('a') # 找到链接标签
title = a.find('h4').string # 获得新闻标题
url = a.get('href') # 获得新闻链接
if root in url:
url = url[len(root):]
date_time = a.find('span', class_="time").string # 获得新闻时间
date_time = str(date_time) + ':00'
new_txt = [date_time, root+url, title] # 组成一个列表形式
news_list.append(new_txt) # 添加到列表
return news_list
def crwal_url_list_025(url_list):
doc_dir_path ='data/news/' # 文档存储路径
doc_encoding ='utf-8' # 编码格式
n = 1
for i, news in enumerate(url_list):
print('爬取数量:%d/%d' % (i, len(url_list))) # 打印爬取进度
re = urllib.request.Request(news[1], headers=headers)
try:
response = urllib.request.urlopen(re, timeout=10)
html = response.read() # 获得html文档
except socket.timeout as err: # 超时异常
print('超时')
print(err)
print('休息10s')
time.sleep(60)
continue # 继续
except Exception as C: # 其他类异常
print('<%s, %s, %s>' % (type(C), C.reason, news[1]))
print('休息5s')
time.sleep(5)
continue
soup = BeautifulSoup(html, "html.parser") # 解析HTML文档
# print(soup.prettify())
for each in soup('script'): # 移除soup中script标签
each.extract()
try:
ps = soup.find('div', class_='l-con clear').find('article').find_all('p') # # 找到所有正文部分中p标签
except Exception as C:
print('%s, %s' % (type(C), news[1]))
continue
txt = ''
for each in ps: # 每次读取一个P段落
p = each.get_text().strip() #去除首尾的空格
if p == '': # 跳过空段落(因为正文是段与段之间是空段落隔开的)
continue
txt += '\t' + p + '\n' # 格式控制(每个p段落后有换行)
ze = soup.find('div', class_='l-sign').find('p', class_="edit-peo").get_text() # 获取责任编辑人
txt += ze
txt = txt.replace(' ', '') # 空格用逗号隔开
if len(txt) < 150: # 去掉新闻词数小于150的
continue
doc = ET.Element("doc")
ET.SubElement(doc, 'id').text = "%d" % n # 添加id号
ET.SubElement(doc, "url").text = news[1] # 添加url链接
ET.SubElement(doc, "title").text = news[2] # 添加标题
ET.SubElement(doc, "datetime").text = news[0] # 添加新闻时间
ET.SubElement(doc, "body").text = txt # 添加正文部分
tree = ET.ElementTree(doc) # 生成ElementTree对象
tree.write((doc_dir_path + "%d.xml" % n), encoding=doc_encoding, xml_declaration=True) # 写入文件
n += 1
if n % 1000 == 0: # 爬取了1000个新闻后睡眠20秒
print("休息10秒")
time.sleep(10)
if __name__ == "__main__":
# url = 'https://china.huanqiu.com/focus'
url = 'https://china.huanqiu.com'
url_list = get_url_list_025(url)
print('爬取%d个新闻' % len(url_list))
crwal_url_list_025(url_list)
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/612380972966162.png "#left")
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/507080972966150.png "#left")