# Problem report: when crawling the page, the first table row (the header/title
# row) could not be retrieved — the scraper skipped it entirely.
import csv
import os
import requests
from bs4 import BeautifulSoup
allUniv = []
csvUniv = []
ranking = 11
def getHTMLText(url):
    """Fetch *url* and return its decoded body, or "" on any request failure.

    Parameters:
        url: the page URL to download.
    Returns:
        The response text decoded as gb2312, or the empty string when the
        request fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The target page is encoded in gb2312 (Simplified Chinese);
        # requests cannot always detect this from the headers.
        r.encoding = 'gb2312'
        return r.text
    # Narrowed from a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit; RequestException covers timeouts, connection errors
    # and the HTTPError raised by raise_for_status().
    except requests.RequestException:
        return ""
def fillUnivList(soup):
    """Extract every table row from *soup* into the module globals.

    Appends one list of cell strings per row to `allUniv`, and each cell
    value individually to `csvUniv`.

    Fix for the reported bug: the header row of an HTML table uses <th>
    cells, not <td>, so searching only for 'td' returned an empty list for
    the title row and it was skipped. Searching for both tags keeps it.
    """
    for tr in soup.find_all('tr'):
        # <th> included so the header (first) row is no longer skipped.
        cells = tr.find_all(['td', 'th'])
        if len(cells) == 0:
            continue
        singleUniv = []
        for cell in cells:
            # get_text() works even when the cell contains nested tags,
            # where `.string` would be None.
            text = cell.get_text(strip=True)
            singleUniv.append(text)
            csvUniv.append(text)
        allUniv.append(singleUniv)
def writercsv(book, num, table):
    """Write the first *num* parsed rows from `allUniv` to CSV file *book*.

    When the file does not yet exist it is created and *table* is written
    as the header row first; otherwise the rows are appended.

    Fixes over the original:
    - The create branch wrote `csvUniv[i]` (a single cell string) per row;
      csv.writerow() on a bare string emits one column per CHARACTER.
      Both branches now write whole rows from `allUniv`.
    - The `table` header parameter was accepted but never used; it is now
      written when the file is first created.
    - The create branch omitted encoding='utf-8' while the append branch
      set it; both paths now use utf-8 consistently.
    """
    exists = os.path.isfile(book)
    mode = 'a' if exists else 'w'
    with open(book, mode, newline='', encoding='utf-8') as f:
        csv_write = csv.writer(f, dialect='excel')
        if not exists:
            csv_write.writerow(table)  # header only on first creation
        for i in range(num):
            csv_write.writerow(allUniv[i])
def printUnivList(num):
    """Print the first *num* rows of `allUniv` as an aligned table.

    chr(12288) is the fullwidth (CJK) space, used as the fill character so
    Chinese text in the second column lines up correctly.
    """
    pad = chr(12288)
    row_fmt = "{1:^5}\t{2:{0}^11}\t{3:^50}\t{4:^8}\t{5:^7}\t"
    for idx in range(num):
        row = allUniv[idx]
        print(row_fmt.format(pad, row[0], row[1], row[2], row[3], row[4]))
# CSV header row. The original had a missing comma between "国家/地区" and
# "得分", so implicit string concatenation fused them into one 4th element
# ("国家/地区得分") and the header had only 4 columns instead of 5.
table = ["排名", "学校中文名称", "学校英文名称", "国家/地区", "得分"]
# Output CSV file name.
book = "2034.csv"
def main():
    """Download the ranking page, parse its table, then print and export it."""
    url = 'https://www.igo.cn/zt/University_Rankings/?utm_source=source-baidu&tm_medium=xtjy22&utm_term=JS-TY-%E6%8E%92%E5%90%8D&utm_content=QS&tm_campaign=2021%E5%B9%B4%E5%BA%A6QS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D&bd_vid=7602746426293878947'
    page_html = getHTMLText(url)
    page_soup = BeautifulSoup(page_html, "html.parser")
    fillUnivList(page_soup)
    printUnivList(ranking)
    writercsv(book, ranking, table)
# Guard the entry point so importing this module does not fire the crawler.
if __name__ == "__main__":
    main()
# (The crawled web page's content was attached below this line.)