m0_53693041
2021-12-18 19:23
采纳率: 85.7%
浏览 61
已结题

python爬虫翻页爬取的数据是第一页的重复数据


import pandas as pd
import chardet
from lxml import etree
import requests
import re
import time
import warnings
warnings.filterwarnings("ignore")

def get_CI(url):
    url = 'https://www.shixi.com/search/index?key=%E5%A4%A7%E6%95%B0%E6%8D%AE&districts=&education=0&full_opportunity=0&stage=0&practice_days=0&nature=0&trades=&lang=zh_cn'
    dic = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57'}
    # 移除感叹号
    requests.packages.urllib3.disable_warnings()

    resp = requests.get(url,headers=dic,verify=False)

    resp.encoding = chardet.detect(resp.content)['encoding']
    et = etree.HTML(resp.text)

    # 公司
    company_list = et.xpath('//div[@class="job-pannel-list"]//div[@class="job-pannel-one"]//a/text()')
    company_list = [company_list[i].strip() for i in range(len(company_list)) if i % 2 != 0]

    # 岗位
    job_list = et.xpath('//div[@class="job-pannel-list"]//div[@class="job-pannel-one"]//a/text()')
    job_list = [job_list[i].strip() for i in range(len(job_list)) if i % 2 == 0]

    # 地址
    address_list = et.xpath('//div[@class="job-pannel-two"]//a/text()')

    # 学历
    degree_list = et.xpath('//div[@class="job-pannel-list"]//dd[@class="job-des"]/span/text()')

    # 薪资
    salary_list = et.xpath('//div[@class="job-pannel-two"]//div[@class="company-info-des"]//text()')
    salary_list = [i.strip() for i in salary_list]

    # 时间
    time_list = et.xpath('//div[@class="job-pannel-two"]//span[@class="job-time"]/text()')

    # 获取二级界面
    deep_url_list = et.xpath('//div[@class="job-pannel-list"]//dt/a/@href')
    x = "https://www.shixi.com"
    deep_url_list = [x + i for i in deep_url_list]

    demand_list = []


    for deep_url in deep_url_list:
        rqg = requests.get(deep_url, headers=dic, verify=False)
        rqg.encoding = chardet.detect(rqg.content)['encoding']
        html = etree.HTML(rqg.text)

        discribe =html.xpath('//div[@class="container-fluid"]//div[@class="work_b"]/text()')
        demand_list.append(discribe)

    data = {'公司名': company_list, '岗位名': job_list, '地址': address_list, "学历": degree_list,

                         '薪资': salary_list, '时间': time_list, '岗位需求量': demand_list}

    df = pd.DataFrame.from_dict(data, orient='index')
    return (df)

x = "https://www.shixi.com/search/index?key=%E5%A4%A7%E6%95%B0%E6%8D%AE&page="
url_list = [x + str(i) for i in range(1,4)]
res = pd.DataFrame(columns=['公司名', '岗位名', '地址', "学历", '薪资', '时间', '岗位需求量'])
# 翻页
for url in url_list:
    res0 = get_CI(url)
    res = pd.concat([res, res0])
    time.sleep(2)

res.to_csv('a.csv', encoding='utf_8_sig')


爬出来的是第一页的重复数据

img


请问怎么解决?

  • 写回答
  • 好问题 提建议
  • 追加酬金
  • 关注问题
  • 邀请回答

1条回答 默认 最新

  • C_Code_P 2021-12-18 23:46
    最佳回答

    爬下一页就好了

    评论
    解决 无用
    打赏 举报

相关推荐 更多相似问题