I scraped another website with exactly the same approach and it returned data fine, but on this site I get nothing: the console prints 0 items. I checked the XPath nodes and they look correct to me. (See the diagnostic sketch after the code.)
import requests
from fake_useragent import UserAgent
from lxml import etree
import pandas as pd


class driver:
    def __init__(self):
        self.url = 'https://www.ncss.cn/student/jobs/index.html'  # target URL
        self.headers = {'User-Agent': UserAgent().random}  # request headers with a randomly generated User-Agent
        self.df = pd.DataFrame(columns=["职位", "薪资", "公司名", "学历", "专业", "其它"])  # empty DataFrame to hold the scraped data

    def get_html(self, url):
        res = requests.get(url, headers=self.headers)
        return res.text

    def parse_html(self, html):
        html = etree.HTML(html)
        # all job entries under the list container
        items = html.xpath('//div[@class="job-list-box"]/div')
        # two lists for the extracted fields
        titles = []
        texts = []
        print(len(items))  # this prints 0
        for li in items:
            title = li.xpath('./div/ul/h5/a/text()')[0]
            titles.append(title)
            text = li.xpath('./div/ul/li/text()')[0]
            texts.append(text)
        temp_df = pd.DataFrame({
            "职位": titles,
            "详情": texts
        })
        self.df = pd.concat([self.df, temp_df], ignore_index=True)
        print(self.df)

    def run(self, pages=1):
        for page in range(1, pages + 1):
            url = f"{self.url}?start={10 * (page - 1)}&filter="
            html = self.get_html(url)
            self.parse_html(html)
        self.df.to_excel("../tmp/爬虫数据.xlsx", index=False)


if __name__ == '__main__':
    spider = driver()
    spider.run()
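
The 0 in the console means the XPath matched nothing in the HTML that requests received, which usually indicates the job list is filled in by JavaScript after the page loads rather than being present in the static page source. A minimal check (a hypothetical snippet, not part of the script above) is to fetch the page once and look for the container class in the raw response:

import requests
from fake_useragent import UserAgent

url = 'https://www.ncss.cn/student/jobs/index.html'
headers = {'User-Agent': UserAgent().random}
res = requests.get(url, headers=headers)

# If the class name is absent from the raw HTML, the list is rendered by
# JavaScript/XHR after page load, so lxml has nothing to match.
print(res.status_code)
print('job-list-box' in res.text)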
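
If that check shows the markup is not in the static HTML, one option is to let a real browser render the page and hand the rendered source to the existing parse_html method. This is only a sketch, assuming Selenium and a working local ChromeDriver are available; the wait time and the XPath expressions would still need to be verified against the rendered DOM. Alternatively, the XHR request that actually returns the job data can be located in the browser's Network tab and called directly.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

options = Options()
options.add_argument('--headless')           # run without opening a browser window
browser = webdriver.Chrome(options=options)  # assumes a local ChromeDriver is installed

browser.get('https://www.ncss.cn/student/jobs/index.html')
time.sleep(3)                 # crude wait for the JS-rendered job list; an explicit wait is better
html = browser.page_source    # rendered HTML, which can be passed to parse_html()
browser.quit()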