import logging
import random
import time
from typing import Any, Union

import requests
from bs4 import BeautifulSoup
def get_html(url, soup=None):
    """Download *url* and return its parsed HTML tree.

    Args:
        url: Address of the page to fetch.
        soup: Ignored. Kept (now optional) only so existing callers that
            pass a placeholder second argument keep working.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` when the request fails, times out after 5 seconds, or the
        server answers with an HTTP error status.
    """
    # requests expects a mapping of header name -> value. The previous
    # literal was a one-element set holding "name: value" as one string,
    # which raised AttributeError: 'set' object has no attribute 'items'.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/92.0.4515.107 Safari/537.36 Edg/92.0.902.55"
        ),
    }
    try:
        # A single request that carries both the headers and the timeout.
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Still best-effort (the caller gets None), but no longer silent.
        logging.getLogger(__name__).warning("Failed to fetch %s: %s", url, exc)
        return None
    return BeautifulSoup(response.content, 'html.parser')
def xlfirstline(soup):
    """Return the text of every ``<th>`` cell in the page's ``jc_a`` div.

    Args:
        soup: Parsed page exposing ``find``; may be ``None``.

    Returns:
        A list with the text of each ``<th>`` found inside the first
        ``<div class="jc_a">``, in the order ``find_all`` yields them.
        Empty when *soup* is ``None`` or the page has no such div.
    """
    if soup is None:
        return []
    # find() yields a single tag (or None). The result list of find_all()
    # cannot itself be searched, so the lookup starts from one container.
    container = soup.find('div', class_='jc_a')
    if container is None:
        return []
    # get_text has to be called; appending the bound method itself would
    # put non-string objects into the returned row.
    return [cell.get_text() for cell in container.find_all('th')]
def get_content(soup):
    """Return the text of every ``<td>`` cell in the page's ``jc_a`` div.

    Args:
        soup: Parsed page exposing ``find``; may be ``None``.

    Returns:
        A flat list with the text of each ``<td>`` found inside the first
        ``<div class="jc_a">``, in the order ``find_all`` yields them.
        Empty when *soup* is ``None`` or the page has no such div.
    """
    if soup is None:
        return []
    container = soup.find('div', class_='jc_a')
    if container is None:
        # find() returns None when nothing matches; report "no cells"
        # instead of failing with AttributeError on the next lookup.
        return []
    return [cell.get_text() for cell in container.find_all('td')]
def prt_ret(
    get_result,
    path=r'C:\Users\DELL\Desktop\国家社科基金项目数据库.txt',
    columns=20,
    encoding='utf-8',
):
    """Append *get_result* to a text file as tab-separated rows.

    Every cell is followed by a tab and every row by a newline, so full
    rows keep the same layout as before. A final row with fewer than
    *columns* cells is written as-is instead of raising ``IndexError``.
    The input sequence is left unmodified.

    Args:
        get_result: Flat sequence of cell strings, in row-major order.
        path: File to append to; it is created when missing.
        columns: Number of cells per output row.
        encoding: Text encoding used for the file.

    Raises:
        ValueError: If *columns* is smaller than 1.
        TypeError: If a cell is not a string.
    """
    if columns < 1:
        raise ValueError("columns must be a positive integer")
    # Work on a copy and slice it: this avoids both draining the caller's
    # list and the quadratic cost of repeated pop(0).
    cells = list(get_result)
    with open(path, 'a', encoding=encoding) as f:
        for start in range(0, len(cells), columns):
            row = cells[start:start + columns]
            f.write('\t'.join(row) + '\t\n')
def main(pages=3):
    """Download the listing pages and append their cells to the output file.

    The ``<th>`` row of page 1 is written first, followed by the ``<td>``
    cells of pages 1 through *pages*.

    Args:
        pages: Number of listing pages to download, starting at page 1.
    """
    base_url = 'http://fz.people.com.cn/skygb/sk/index.php/Index/index?&p='

    # The header row is extracted once, from the first page.
    soup = get_html(base_url + '1', None)
    if soup is not None:
        prt_ret(xlfirstline(soup))

    for page in range(1, pages + 1):
        # Random 3-10 second pause before each request (presumably to stay
        # polite towards the server).
        time.sleep(random.randint(3, 10))
        soup = get_html(base_url + str(page), None)
        if soup is None:
            # get_html returns None when the download failed; skip the
            # page rather than crash on the missing tree.
            continue
        prt_ret(get_content(soup))
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
line 67, in <module>
main()
line 53, in main
soup = get_html(url, None)
line 11, in get_html
html = requests.get(url, headers=header).content
line 75, in get
return request('get', url, params=params, **kwargs)
line 61, in request
return session.request(method=method, url=url, **kwargs)
line 528, in request
prep = self.prepare_request(req)
line 456, in prepare_request
p.prepare(
line 317, in prepare
self.prepare_headers(headers)
line 449, in prepare_headers
for header in headers.items():
AttributeError: 'set' object has no attribute 'items'