import requests, re
from lxml import etree
import time
import random
import pymysql
from fake_useragent import UserAgent
from multiprocessing import Pool
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# CNKI patent-category filter batches. Each string is one comma-separated
# group of NaviCode category codes submitted as a single search request
# (batched so each query stays within the site's result-size limits).
navicode_list = [
'A001,A002,A003,A004,A005,A006,A007','A008,A009,A010,A011,A012,A013',
'F,G,H,J','B014_1,B014_2', 'B014_31,B014_32,B014_33,B014_34,B014_35,B014_36,B014_37',
'B014_38,B014_39,B014_3A','B014_4',
'B014_5,B014_6,B014_7,B014_8','B015_1,B015_3,B015_4,B015_5,B015_6,B015_7,B015_8',
'B015_2', 'B016_11', 'B016_12',
'B016_3,B016_5,B016_6,B016_7,B016_8', 'B016_4',
'B016_21,B016_22,B016_23,B016_24,B016_26,B016_27,B016_28,B016_29', 'B016_25', 'B017,B018,B019',
'B020_1',
'B020_2,B020_3,B020_4,B020_5,B020_6,B020_7,B020_8,B020_9,B020_A,B020_B,B020_C',
'B021,B023,B025','B022_1,B022_2,B022_3,B022_4,B022_5', 'B022_6,B022_7', 'B022_8,B022_B,B022_C','B022_9',
'B022_A','B024_3',
'B024_7', 'B024_1,B024_2,B024_4,B024_5,B024_6','B024_A',
'B024_B,B024_E','B024_C',
'B024_D','B024_8,B024_9','B026', 'B027_1,B027_2,B027_3,B027_4', 'B027_5,B027_6',
'C028_1,C028_2,C028_4,C028_5,C028_6,C028_7,C028_8','C028_9','C028_38',
'C028_31,C028_32,C028_33,C028_34,C028_35',
'C028_36,C028_37,C028_39,C028_3A,C028_3B,C028_3C',
'C029_1,C029_2,C029_3,C029_4,C029_6,C029_7', 'C029_8,C029_9,C029_B,C029_C,C029_D',
'C029_51,C029_52,C029_53,C029_54,C029_55','C029_56,C029_57,C029_58,C029_59',
'C029_A1,C029_A3','C029_A2', 'C030_1,C030_2,C030_3,C030_4,C030_5',
'C030_6,C030_7,C030_8,C030_9,C030_A',
'C030_B,C030_C,C030_D,C030_E,C030_F,C030_G,C030_H,C030_I', 'C031,C032,C033,C034',
'C035_1,C035_2,C035_3,C035_4,C035_5,C035_6,C035_7,C035_8',
'C035_9,C035_A,C035_B,C035_C,C035_D,C035_E', 'C036,C037,C040,C041',
'C038_1,C038_21,C038_22,C038_23,C038_24,C038_3',
'C038_25,C038_26,C038_27,C038_28,C038_29', 'C039', 'C042',
'D043,D044,D045,D046,D047', 'D044,D045,D046,D047',
'I135_1,I135_2,I135_3,I135_4,I135_7,I135_8', 'I135_6',
'I135_522,I135_523,I135_524', 'I135_521',
'I138_1,I138_2,I138_3,I138_4,I138_5,I138_6,I138_7,I138_8,I138_9,I138_A,I138_B',
'I138_C12,I138_C13,I138_C14,I138_C2', 'I138_C11',
'I136_87,I136_88','I136_84,I136_85,I136_86',
'I136_81,I136_82,I136_83',
'I137_3,I137_4,I137_5','I137_1,I137_2',
'I139,I140,I141,I142,I143,I144',
'I136_1,I136_2,I136_3,I136_4,I136_5,I136_6,I136_7',
'I136_9,I136_A,I136_B,I136_C,I136_D,I136_E,I136_F,I136_G'
]
# Publication months ('YYYY-MM') to query; only 2020-04 is currently active.
# The commented-out entries are earlier months kept for future runs.
time_list = ['2020-04',
# '2021-04','2021-03','2021-01','2021-02',
# '2020-12','2020-11','2020-10','2020-09','2020-08','2020-07','2020-06','2020-05','2020-04','2020-03','2020-02','2020-01',
# '2019-12','2019-11','2019-10','2019-09','2019-08','2019-07','2019-06','2019-05','2019-04','2019-03','2019-02','2019-01',
# '2018-12','2018-11','2018-10','2018-09','2018-08','2018-07','2018-06','2018-05','2018-04','2018-03','2018-02','2018-01',
# '2017-12','2017-11','2017-10','2017-09','2017-08','2017-07','2017-06','2017-05','2017-04','2017-03','2017-02','2017-01',
# '2016-12','2016-11','2016-10','2016-09','2016-08','2016-07','2016-06','2016-05','2016-04','2016-03','2016-02','2016-01',
# '2015-12','2015-11','2015-10','2015-09','2015-08','2015-07','2015-06','2015-05','2015-04','2015-03','2015-02','2015-01',
# '2014-12','2014-11','2014-10','2014-09','2014-08','2014-07','2014-06','2014-05','2014-04','2014-03','2014-02','2014-01',
]
# MySQL persistence is currently disabled; results are only printed.
# db = pymysql.connect(host='',user='root',password='',database='industrynav')
# cursor = db.cursor()
# Timestamp formatted like JavaScript's Date().toString(), which the CNKI
# search endpoint expects in its '__' cache-busting parameter.
times = time.strftime('%a %b %d %Y %H:%M:%S') + ' GMT+0800 (中国标准时间)'
t = random.randint(1,2)  # NOTE(review): unused — presumably a leftover sleep interval; confirm before removing
# Outer loops: one CNKI search per (month, NaviCode batch) combination.
for time1 in time_list:
for LB in navicode_list:
print(time1,LB)
# Query parameters for the SearchHandler request that primes the session.
params1 = (
('action', ''),
('NaviCode', LB), # category filter: the current NaviCode batch
('ua', '1.21'),
('PageName', 'ASP.brief_result_aspx'),
('DbPrefix', 'SCPD'),
('DbCatalog', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
('ConfigFile', 'SCPD.xml'),
('db_opt', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
('db_value', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
('date_gkr_from', time1), # publication-date filter: from this month
('date_gkr_to', time1), # publication-date filter: to the same month
('his', '0'),
('__', times),
)
def get_cookie():
    """Create a requests session primed with CNKI search cookies.

    Sends the search request (module globals ``headers``/``params1``) so
    the server associates a result set with this session's cookies; the
    response body itself is ignored.

    Returns:
        requests.Session | None: the primed session, or ``None`` when the
        request fails (callers must handle ``None``).
    """
    session = requests.session()
    try:
        session.get('https://epub.cnki.net/kns/request/SearchHandler.ashx',
                    headers=headers, params=params1, timeout=30)
        return session
    except requests.RequestException:
        # Network/HTTP failure: return None explicitly instead of the
        # original bare `except: pass`, which hid every error (including
        # programming mistakes) and fell through to an implicit None.
        return None
def max11():
    """Return the total number of result pages for the current query.

    Fetches the first listing page and parses the pager text, which looks
    like "浏览1/37" — the denominator is the page count.

    Returns:
        int: total page count, or 0 when the session could not be created,
        the request failed, or the pager text is missing. (The original
        returned ``None`` on failure, which made the caller crash on
        ``max1 + 1``; 0 lets ``range(1, max1 + 1)`` simply be empty.)
    """
    session = get_cookie()
    if session is None:
        return 0
    params = (
        ('curpage', ''),  # blank: request the first page of the result set
        ('RecordsPerPage', '50'),
        ('QueryID', '20'),
        ('ID', ''),
        ('turnpage', '1'),
        ('tpagemode', 'L'),
        ('dbPrefix', 'SCPD'),
        ('Fields', ''),
        ('DisplayMode', 'listmode'),
        ('SortType', "(公开日, 'DATE')desc"),
        ('PageName', 'ASP.brief_result_aspx'),
    )
    try:
        response = session.get('https://epub.cnki.net/kns/brief/brief.aspx',
                               headers=headers, params=params, timeout=30)
        selector = etree.HTML(response.text)
        page_info = selector.xpath(
            '//*[@id="J_ORDER"]/tr[2]/td/table/tr/td[2]/div/span[1]')[0].text
        return int(re.compile('浏览.*/(.*)').findall(page_info)[0])
    except (requests.RequestException, IndexError, ValueError, TypeError, AttributeError):
        # Request failed or the pager element/text was absent or malformed.
        return 0
def _fetch_detail(detail_url):
    """GET *detail_url*, retrying once after a 5s pause.

    Returns the response text, or ``None`` when both attempts fail
    (the original crashed if the unguarded retry also failed).
    """
    for attempt in range(2):
        try:
            return requests.get(url=detail_url, headers=headers, timeout=30).text
        except requests.RequestException:
            if attempt == 0:
                time.sleep(5)
    return None


def _clean_detail_html(main_info):
    """Collapse whitespace/markup artifacts of a detail page into one
    blob so the single-line field regexes in :func:`_parse_detail` match."""
    a = str(str(main_info).split(' ')).replace(' ', '').replace(',', '').replace('\r\n', '').replace('\'', '')
    return str(a.replace('</span><pclass="funds">', '').replace('</h5>\\r\\n<divclass="abstract-text">', ''))


def _parse_detail(b):
    """Extract the 15 patent fields from cleaned detail-page text *b*.

    Returns a tuple in print order: (title, type, publication number,
    publication date, application number, application date, applicant,
    address, main class code, class codes, province code, abstract,
    agency, agent, inventors).
    """
    title = ''.join(re.compile('<title>(.*?)-中国知网').findall(b)[0])  # patent title
    leixing = ''.join(re.compile('>专利类型:(.*?)<').findall(b)[0])  # patent type
    # Publication number: application publication first, else grant publication.
    sqgb = ''.join(re.compile('>申请公布号:(.*?)<').findall(b))
    gb_id = sqgb if sqgb else ''.join(re.compile('授权公布号:(.*?)<').findall(b))
    # Publication date: public announcement first, else grant announcement.
    gkr = ''.join(re.compile('>公开公告日:(.*?)<').findall(b))
    gb_time = gkr if gkr else ''.join(re.compile('授权公告日:(.*?)<').findall(b))
    sq_id = ''.join(re.compile('>申请\(专利\)号:(.*?)<').findall(b))  # application number
    sq_time = ''.join(re.compile('>申请日:(.*?)<').findall(b))  # application date
    # Applicant: linked form first, else the plain-text fallback.
    sqr = ''.join(re.compile('申请人:.*?">(.*?)<').findall(b))
    sq_person = sqr if sqr else ''.join(re.compile('申请人:(.*?)</p>').findall(b))
    addr = ''.join(re.compile('>地址:(.*?)<').findall(b))  # address
    cl = ''.join(re.compile('>主分类号:(.*?)<').findall(b))  # main class code
    flh = ''.join(re.compile('>分类号:(.*?)<').findall(b))  # class codes
    gsdm = ''.join(re.compile('>国省代码:(.*?)<').findall(b))  # province code
    abstracts = ''.join(re.compile('>摘要:(.*?)<').findall(b))  # abstract
    # Inventors: labelled field first, else the 'au<name><digits>' fallback.
    The_inventor = ''.join(re.compile('>发明人:(.*?)<', re.S).findall(b))
    fmr = The_inventor if The_inventor else ''.join(
        re.compile('au([\u4e00-\u9fa5a-zA-Z. ]+)\d+', re.S).findall(b))
    try:
        patent_agency = ''.join(re.compile('>代理机构:(.*?)<').findall(b))  # agency
        agent = ''.join(re.compile('">代理人:(.*?)<').findall(b))  # agent
    except Exception:
        patent_agency = agent = ''
    return (title, leixing, gb_id, gb_time, sq_id, sq_time, sq_person, addr,
            cl, flh, gsdm, abstracts, patent_agency, agent, fmr)


def get_list_info():
    """Walk every listing page of the current CNKI query, fetch each
    patent's detail page and print the parsed fields.

    Uses module globals: ``headers`` (HTTP headers) and ``LB``/``time1``
    (current NaviCode batch and month, for progress logging only).
    Results are printed; the MySQL insert remains disabled, as in the
    commented-out original.
    """
    max1 = max11()
    if not max1:
        # Query failed or no pager info; the original crashed here on
        # ``None + 1`` when max11() fell through its bare except.
        return
    for i in range(1, max1 + 1):
        session = get_cookie()
        if session is None:
            continue  # could not establish a search session for this page
        params = (
            ('curpage', i),  # page number being requested
            ('RecordsPerPage', '50'),
            ('QueryID', '20'),
            ('ID', ''),
            ('turnpage', '1'),
            ('tpagemode', 'L'),
            ('dbPrefix', 'SCPD'),
            ('Fields', ''),
            ('DisplayMode', 'listmode'),
            ('SortType', "(公开日, 'DATE')desc"),
            ('PageName', 'ASP.brief_result_aspx'),
        )
        # Defined before the try so a failed request yields an empty page
        # instead of a NameError in the detail loop (bug in the original).
        urls_info = []
        try:
            response = session.get('https://epub.cnki.net/kns/brief/brief.aspx',
                                   headers=headers, params=params, timeout=30)
            urls_info = re.compile(
                "<a class='fz14' href='/kns/detail/detail.aspx(.*?)'").findall(response.text)
            selector = etree.HTML(response.text)
            page_info = selector.xpath(
                '//*[@id="J_ORDER"]/tr[2]/td/table/tr/td[2]/div/span[1]')[0].text
            print(page_info)
            now_page = int(re.compile('浏览(.*?)/').findall(page_info)[0])
            print("当前获取第{}页数据".format(now_page), "数目", len(urls_info))
            print(LB, time1)
        except Exception:
            # Listing page unavailable/unparseable: skip the progress log
            # but still process whatever urls_info holds (possibly none).
            pass
        for url in urls_info:
            detail_url = 'https://kns.cnki.net/kcms/detail/detail.aspx?' + url  # detail page URL
            print(detail_url)
            html = _fetch_detail(detail_url)
            if html is None:
                continue  # both attempts failed; skip this patent
            # Same positional print as the original 15-field print call.
            print(*_parse_detail(_clean_detail_html(html)))
if __name__ == '__main__':
# Random desktop User-Agent per run (fake_useragent fetches its UA list).
ua = UserAgent().random
headers = {
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'User-Agent': ua,
'Referer': 'https://epub.cnki.net/kns/brief/result.aspx?dbprefix=SCPD',
}
# NOTE(review): get_list_info() only prints and returns None, so this
# assignment is vestigial — urls_info is always None here.
urls_info = get_list_info()
# db.close()
# Forum thread appended to this paste (cleaned of web-page chrome):
# Question: 麻烦大佬帮忙把程序改成多线程 — "Please help convert this
# program to multithreading."
# Answer (CSDN expert 黄老师, 2021-06-04): To use multiple threads, divide
# the same workload among them — e.g. a task that loops 10 times can be
# split across 5 threads with each thread looping twice. Partition the
# task into slices and assign one slice to each thread.