```python
import logging
import random
import time

import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection error, timeout, or an HTTP error status). Callers
        must check for ``None`` before using the result.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Cookie": "your cookie",
    }
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Treat 4xx/5xx answers as failures rather than parsing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure instead of hiding it, but stay best-effort.
        logging.getLogger(__name__).warning("Failed to fetch %s: %s", url, exc)
        return None
    # ``response.content`` is the raw body as bytes; take it exactly once
    # (bytes have no ``.content`` attribute) and let the parser decode it.
    return BeautifulSoup(response.content, 'html.parser')
def xlfirstline(soup):
    """Return the header-cell texts of the table inside ``div.jc_a``.

    Finds the first ``<div class="jc_a">`` in *soup*, takes the first
    ``<tr>`` inside it, and collects the text of every ``<th>`` in that row.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API, or ``None``
            when the download failed.

    Returns:
        A list of header strings in document order. The list is empty when
        *soup* is ``None`` or the expected div or row is not present.
    """
    if soup is None:
        return []
    container = soup.find('div', class_='jc_a')
    if container is None:
        return []
    header_row = container.find('tr')
    if header_row is None:
        return []
    return [cell.get_text() for cell in header_row.find_all('th')]
def get_content(soup):
    """Return the text of every data cell inside ``div.jc_a``.

    Finds the first ``<div class="jc_a">`` in *soup* and collects the text
    of every ``<td>`` below it as one flat list (rows are not separated).

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find`` API, or ``None``
            when the download failed.

    Returns:
        A flat list of cell strings in document order. The list is empty
        when *soup* is ``None`` or the expected div is not present.
    """
    if soup is None:
        return []
    container = soup.find('div', class_='jc_a')
    if container is None:
        return []
    return [cell.get_text() for cell in container.find_all('td')]
def prt_ret(get_result, path=r'/Users/cenxinxiu/Desktop/国家社科基金项目库.txt', columns=20):
    """Append cell texts to a tab-separated text file, *columns* per line.

    Every cell is written followed by a tab, and each group of *columns*
    cells is terminated by a newline. A final group shorter than *columns*
    is written as a shorter line instead of raising ``IndexError``.

    Args:
        get_result: Iterable of cell strings in row-major order. It is not
            modified.
        path: File to append to; it is created if it does not exist.
        columns: Number of cells per output line.

    Raises:
        ValueError: If *columns* is smaller than 1.
    """
    if columns < 1:
        raise ValueError("columns must be a positive integer")
    cells = list(get_result)
    # Explicit UTF-8 so non-ASCII text is written the same way on every platform.
    with open(path, 'a', encoding='utf-8') as f:
        for start in range(0, len(cells), columns):
            row = cells[start:start + columns]
            f.write(''.join(cell + '\t' for cell in row) + '\n')
def main():
    """Scrape the project table and append it to the output file.

    Fetches the index page and writes its header row, then fetches pages
    1 to 3 and writes their data cells. A page that cannot be downloaded is
    logged and skipped; if the index page itself fails, nothing is written.
    """
    log = logging.getLogger(__name__)
    base_url = 'http://fz.people.com.cn/skygb/sk/index.php/index/index/'

    soup = get_html(base_url)
    if soup is None:
        log.error("Could not download %s; nothing was written", base_url)
        return
    prt_ret(xlfirstline(soup))

    for page in range(1, 4):
        # Random 5-10 s pause before each request (presumably to avoid
        # hammering the server; confirm the intended rate).
        time.sleep(random.randint(5, 10))
        soup = get_html(base_url + str(page))
        if soup is None:
            log.warning("Skipping page %d: download failed", page)
            continue
        prt_ret(get_content(soup))


if __name__ == "__main__":
    main()
```

```text
Traceback (most recent call last):
line 64, in <module>
main()
line 51, in main
xlfirstline(soup)
line 21, in xlfirstline
get_jc_a = soup.find('div', class_='jc_a')
AttributeError: 'NoneType' object has no attribute 'find'
```