import requests
import re
# Browser-like request headers passed to requests.get() when querying Sogou.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3941.4 Safari/537.36'
    ),
}
def sou_gou(company):
    """Fetch Sogou news results for *company* and append them to the report.

    The result page is downloaded, link/title/info fragments are extracted
    with regular expressions, HTML tags are stripped from the title and info
    text, and one numbered entry per result is appended to the report file.

    Args:
        company: Search keyword (company name) sent as the ``query`` parameter.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        OSError: If the report file cannot be opened or written.
    """
    # NOTE(review): in the reviewed copy of this file the three patterns had
    # lost their HTML markup (only the `.*?` / `(.*?)` skeleton survived and
    # the string literals were left unterminated). The tags below are a
    # reconstruction that fits that skeleton -- confirm them against the
    # current Sogou result-page markup before relying on the output.
    href_pattern = r'<h3 class="vrTitle">.*?<a href="(.*?)"'
    title_pattern = r'<h3 class="vrTitle">.*?<a href=.*?>(.*?)</a>'
    info_pattern = r'<p class="news-from">(.*?)</p>'
    tag_pattern = '<.*?>'

    # `params` lets requests URL-encode the keyword (safe for '&', '#', spaces);
    # the timeout keeps a stalled connection from hanging the whole crawl.
    res = requests.get(
        'https://news.sogou.com/news',
        params={'query': company, 'shid': 'djt1'},
        headers=headers,
        timeout=10,
    ).text

    hrefs = re.findall(href_pattern, res, re.S)
    titles = re.findall(title_pattern, res, re.S)
    infos = re.findall(info_pattern, res, re.S)

    lines = [company + '数据挖掘completed!' + '\n' + '\n']
    # zip() stops at the shortest list, so a pattern that matched fewer times
    # than the others can no longer cause an IndexError.
    for number, (href, title, info) in enumerate(zip(hrefs, titles, infos), 1):
        title = re.sub(tag_pattern, '', title.strip())
        info = re.sub(tag_pattern, '', info)

        # The info text is "<source> <date>".
        # NOTE(review): split on a plain space as in the original; if the raw
        # HTML separates the two fields with `&nbsp;`, adjust -- confirm.
        parts = info.split(' ')
        source = parts[0].strip()
        date = parts[1].strip() if len(parts) > 1 else ''

        lines.append(str(number) + '.' + title + '(' + date + '-' + source + ')' + '\n')
        lines.append(href + '\n')
    lines.append('-------------------' + '\n' + '\n')

    # Raw string: '\p' and '\搜' are not valid escape sequences.
    # The context manager closes the file even if a write fails, and the
    # explicit encoding avoids UnicodeEncodeError under a non-UTF-8 locale.
    report_path = r'E:\python数据爬取\搜狗数据挖掘报告.txt'
    with open(report_path, 'a', encoding='utf-8') as report:
        report.writelines(lines)
# Companies whose Sogou news results are collected, in crawl order.
company_name = ['华能信托', '阿里巴巴', '万科集团', '百度', '腾讯', '京东']

# Crawl each company exactly once. The stray unterminated `p_info = '` line
# and the second, identical loop that followed were removed: the former is a
# syntax error and the latter would append every report section twice.
for name in company_name:
    sou_gou(name)
    print(name + '搜狗新闻爬取成功')