使用 pymysql 模块正常 读取和存储
#! /usr/bin/env python
#-*- encoding: utf-8 -*-
import requests
import re
import pymysql
import time
import mysql.connector # mysql 8.0 连接模块
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
def baidu(company):
# 1.获取网页源代码(参考2.3、3.1、3.4节)
url = 'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=' + company # 其中设置rtt=4则为按时间排序,如果rtt=1则为按焦点排序
res = requests.get(url, headers=headers, timeout=10).text
# 2.编写正则提炼内容(参考3.1节)
p_href = '<h3 class="news-title_1YtI1">.*?<a href="(.*?)"'
p_title = '<a href=.*?class="news-title-font_1xS-F".*?aria-label="(.*?)".*?</a>'
p_info = '<span class="c-color-gray c-font-normal c-gap-right".*?>(.*?)</span>'
p_date = '<span class="c-color-gray2 c-font-normal".*?>(.*?)</span>'
href = re.findall(p_href, res, re.S)
title = re.findall(p_title, res, re.S)
info = re.findall(p_info, res, re.S)
source = info
date = re.findall(p_date, res, re.S)
# 3.数据清洗(参考3.1节)
for i in range(len(title)):
title[i] = title[i].strip()
title[i] = re.sub('标题:', '', title[i])
score = []
# 6.将数据存入数据库及数据去重(参考4.4节和5.1节)
for i in range(len(title)):
print(title[i])
db = mysql.connector.connect(host='localhost', port=3308, user='root', password='x', database='pachong',
charset='utf8')
cur = db.cursor() # 获取会话指针,用来调用SQL语句
# 6.1 查询数据,为之后的数据去重做准备
sql_1 = 'SELECT * FROM article WHERE company =%s'
cur.execute(sql_1, company)
data_all = cur.fetchall()
print(data_all)
title_all = []
for j in range(len(data_all)):
title_all.append(data_all[j][1])
# 6.2 判断数据是否在原数据库中,不在的话才进行数据存储
if title[i] not in title_all:
sql_2 = 'INSERT INTO article(company,title,href,source,date,score) VALUES (%s,%s,%s,%s,%s,%s)' # 编写SQL语句
print(title[i])
print(source[i])
# 执行SQL语句
cur.execute(sql_2, (company, title[i], href[i], source[i], date[i], score[i]))
db.commit() # 当改变表结构后,更新数据表的操作
cur.close() # 关闭会话指针
db.close() # 关闭数据库链接
print('------------------------------------') # 分割符
def main():
# 7.批量爬取多家公司(参考3.2节)
companys = ['华能信托', '阿里巴巴', '百度集团']
for company in companys:
try:
baidu(company)
print(company + '数据爬取并存入数据库成功')
except:
print(company + '数据爬取并存入数据库失败')
main()