鬼890
2021-12-14 09:11
采纳率: 33.3%
浏览 26
已结题

用爬虫爬取网页,表格第一行内容无法获得

用爬虫爬取网页,表格第一行内容无法获得,直接跳过了标题

import csv
import os
import requests
from bs4 import BeautifulSoup
allUniv = []
csvUniv = []
ranking = 11

def getHTMLText(url):
     try:
         r = requests.get(url, timeout=30)
         r.raise_for_status()
         r.encoding = 'gb2312'
         return r.text
     except:
         return ""     
def fillUnivList(soup):
     data = soup.find_all('tr')
     for tr in data:
         ltd = tr.find_all('td')
         if len(ltd)==0:
             continue
         singleUniv = []
         for td in ltd:
             singleUniv.append(td.string)
         csvUniv.append(td.string)
         allUniv.append(singleUniv)   
         
def writercsv(book,num,table):
     if os.path.isfile(book):
         with open(book,'a',newline='',encoding='utf-8')as f:
             csv_write=csv.writer(f,dialect='excel')
             for i in range(num):
                 u=allUniv[i]
                 csv_write.writerow(u)
     else:
          with open(book,'w',newline='')as f:
             csv_write=csv.writer(f,dialect='excel')

             for i in range(num):
                 u=csvUniv[i]
                 csv_write.writerow(u)  
                 
def printUnivList(num):
    
    for i in range(num):
        u=allUniv[i]
        print("{1:^5}\t{2:{0}^11}\t{3:^50}\t{4:^8}\t{5:^7}\t".format(chr(12288),u[0],u[1],u[2],u[3],u[4]))
        
table=["排名","学校中文名称","学校英文名称","国家/地区""得分"]
book="2034.csv"
def main():
     url = 'https://www.igo.cn/zt/University_Rankings/?utm_source=source-baidu&tm_medium=xtjy22&utm_term=JS-TY-%E6%8E%92%E5%90%8D&utm_content=QS&tm_campaign=2021%E5%B9%B4%E5%BA%A6QS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D&bd_vid=7602746426293878947'
     html = getHTMLText(url)
     soup = BeautifulSoup(html, "html.parser")
     fillUnivList(soup)
     printUnivList(ranking)
     writercsv(book,ranking,table)
     
main()

这是网页内容:

img

1条回答 默认 最新

相关推荐 更多相似问题