# 最下方的薪资一栏不知道为什么 XPath 语法总是报错,提示我写得不对,可是我反复检查了,好像也没什么问题。
'''
网站https://www.shixiseng.com/interns?page=1&type=intern&keyword=%E9%A1%B9%E7%9B%AE%E7%BB%8F%E7%90%86&area&months&days&degree&official&enterprise&salary=-0&publishTime&sortType&city=%E5%85%A8%E5%9B%BD&internExtend
薪资、所在地、单位名称、所属行业、公司规模、福利标签、备注信息、企业性质的内容
'''
import requests
from lxml import etree
import pprint
import re
import urllib.request
import csv
# Scrape page 1 of the shixiseng.com internship search results and, for every
# listing, print its location, company name, industry, welfare tags and
# remarks, then follow the listing's detail link and print the salary.

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}

# Search URL: keyword is the URL-encoded "项目经理" (project manager), city is
# the URL-encoded "全国" (nationwide).
# NOTE: the query parameter is `&degree`; the pasted URL had it corrupted to
# `°ree` because the `&deg` prefix was rendered as a degree sign.
url = 'https://www.shixiseng.com/interns?page=1&type=intern&keyword=%E9%A1%B9%E7%9B%AE%E7%BB%8F%E7%90%86&area&months&days&degree&official&enterprise&salary=-0&publishTime&sortType&city=%E5%85%A8%E5%9B%BD&internExtend'

# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Fetch the search-results page source and parse it into an element tree.
data = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
ele = etree.HTML(data)

# One <div searchtype="intern"> per listing on the results page.
div = ele.xpath('//div[@searchtype="intern"]')

for move in div:
    # Each xpath() call below is relative to the current listing card and
    # returns a list of matches (empty when the node is absent).

    # Location
    location = move.xpath('./div[1]/div[1]/p[2]/span[1]/text()')
    print(location)

    # Company name
    company = move.xpath('./div[1]/div[2]/p[1]/a[1]/@title')
    print(company)

    # Industry
    industry = move.xpath('./div[1]/div[2]/p[2]/span[1]/text()')
    print(industry)

    # Welfare tags
    welfare = move.xpath('./div[2]/div[1]/span/text()')
    print(welfare)

    # Remarks
    notes = move.xpath('./div[2]/div[2]/span/text()')
    print(notes)

    # Salary: read from the listing's detail page, reached through the link
    # on the job title.
    href_list = move.xpath('./div[1]/div[1]/p[1]/a[1]/@href')
    for href in href_list:
        data_href = requests.get(
            href, headers=headers, timeout=REQUEST_TIMEOUT
        ).text
        ele_href = etree.HTML(data_href)
        # The expression previously ended with a stray `"]` after `text()`,
        # which is not valid XPath and made lxml reject the whole expression.
        salary = ele_href.xpath('//div[@class="job_msg"]/span[1]/text()')
        print(salary)

        # TODO: company nature and company size are not extracted yet.
        # Candidate (unverified) absolute XPaths on the detail page:
        # nature = ele_href.xpath('/html/body/div[1]/div/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/text()')
        # scale = ele_href.xpath('/html/body/div[1]/div/div/div[2]/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[3]/text()')