import requests
import re
import pymysql
import time
def baidu(company):
ur1 = 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=' + company
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36 Edg/96.0.1054.29'}
res = requests.get(ur1, headers=headers, timeout=10).text
p_title = 'data-click="{.*?}"><!--s-text-->(.*?)<!--/s-text--></a>'
p_href = '<h3 class="news-title_1YtI1"><a href="(.*?)" target="_blank" class="news-title-font_1xS-F"'
p_date = '<span class="c-color-gray2 c-font-normal" aria-label="发布于:.*?">(.*?)</span>'
p_info = '<span class="c-color-gray c-font-normal c-gap-right" aria-label="新闻来源:.*?">(.*?)</span>'
href = re.findall(p_href, res, re.S)
title = re.findall(p_title, res, re.S)
date = re.findall(p_date, res, re.S)
info = re.findall(p_info, res, re.S)
for i in range(len(title)):
title[i] = title[i].strip()
title[i] = re.sub('<.*?>', '', title[i])
title[i] = re.sub('[...]', '', title[i])
title[i] = re.sub('&.*?', '', title[i])
title[i] = re.sub('#', '', title[i])
title[i] = re.sub('#', '', title[i])
href[i] = href[i].strip()
date[i] = date[i].strip()
date[i] = re.sub('月', '-', date[i])
date[i] = re.sub('日', '', date[i])
if ('小时' in date[i]) or ('分钟' in date[i]):
date[i] = time.strftime("%Y-%m-%d")
else:
date[i] = date[i]
# print(str(i+1)+'.'+title[i])
# print(href[i])
# print(company+'该条舆情评分为'+str(score[i]))
#舆情0数据评分4.0
score = []
keywords = ['违约', '诉讼', '兑付', '投诉']
for i in range(len(title)):
num = 0
try:
article = requests.get(href[i],headers = headers,timeout =10).text
except:
article = '单个新闻爬取失败'
try:
article = article.encode('ISO-8859-1').decode('utf-8')
except:
try:
article = article.encode('ISO-8859-1').decode('gbk')
except:
article = article
p_article = '<p>(.*?)</p>'
article_main = re.findall(p_article,article)
article = ''.join(article_main)
for k in keywords:
if (k in article) or (k in title[i]):
num -= 5
if ('违约' in article):
num -= 10
score.append(num)
company_re = company[0]+'.{0,5}'+company[-1]
if len(re.findall(company_re,company))<1:
title[i]=''
href[i]=''
date[i]=''
info[i]=''
score[i]=''
while''in title:
title.remove('')
while''in href:
href.remove('')
while''in date:
date.remove('')
while''in info:
info.remove('')
while''in score:
score.remove('')
for i in range(len(title)):
print(str(i+1)+'.'+title[i]+'('+date[i]+' '+info[i]+')')
print(href[i])
print(company+'该条新闻的舆情评分为'+ str(score[i]))
#入数据库
for i in range(len(date)):
db = pymysql.connect(host='localhost', port=3306, user='root', password='', database='pachongnew',charset='utf8')
cur = db.cursor()
sql_1 = 'SELECT * FROM article WHERE company = %s'
cur.execute(sql_1, company)
data_all = cur.fetchall()
title_all = []
for j in range(len(data_all)):
title_all.append(data_all[j][1])
if title[i] not in title_all:
sql_2 = 'INSERT INTO article(company,title,href,date,info,score) VALUES (%s,%s,%s,%s,%s,%s)'
cur.execute(sql_2,(company,title[i],href[i],date[i],info[i],score[i]))
db.commit()
cur.close()
db.close()
print('-------------------------------------------------------')
baidu('腾讯')
#各种公司
# companys = ['阿里巴巴', '京东', '华能信托', '腾讯','百度集团']
# for i in companys:
# try:
# baidu(i)
# print(i + '百度新闻爬取成功')
# except:
# print(i + '百度新闻爬取失败')

为啥会出现索引超出范围,是哪个地方出问题了
- 写回答
- 好问题 0 提建议
- 关注问题
- 邀请回答
-
5条回答 默认 最新
- CSDN专家-文盲老顾 2021-11-29 11:51关注
title = re.findall(p_title, res, re.S) date = re.findall(p_date, res, re.S) title[i] = re.sub('#', '', title[i]) href[i] = href[i].strip() date[i] = date[i].strip()
你的 title 匹配的到数量和 date 匹配到的数量不一致
['<em>腾讯</em>首页', '...市一年半GMV累计破亿,距离下一个“螺蛳粉”还有多远 - <em>腾讯</em>...', '劳动力短缺达70年之最,美国一些地方开始瞄上了“童工”? - <em>腾讯</em>...', '助力数字政府建设,<em>腾讯</em>数字政务全面升级', '“复出在即 即将迎来完全体的勇士”汤普森被下放至发展联盟', '<em>腾讯</em>新闻', '中国科技三巨头百度,阿里巴巴和<em>腾讯</em>如何在2020年取得成功', '<em>腾讯</em>,搬起石头砸自己的脚!', '马化腾也没料到,工信部动真格了,勒令<em>腾讯</em>旗下所有APP停更', '<em>腾讯</em>游戏无法投资每个王信文'] ['今天', '今天', '34分钟前', '3小时前', '46分钟前', '昨天22:51', '昨天21:50', '4天前', '昨天14:36']
百度搜索,部分结果上没有日期的,你需要自己适配
本回答被题主选为最佳回答 , 对您是否有帮助呢?解决评论 打赏 举报无用 1