Boss直聘岗位招聘分析 数据爬取失败

期末作业是爬取Boss直聘岗位相关数据。我结合所学知识和网上的代码写了下面的爬虫,但生成的CSV文件始终是空的,找不到问题所在,还望大家帮忙看看问题出在哪儿,谢谢大家!

import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
cookie="__zp__pub__=; lastCity=101110100; __zp_stoken__=ddfaaC1oxcwcya3oOXUNWPSV5Vn9mfABYVFAhHlNZG1cvMgZMNnVnHHNKSzcYaQtJQSADE3tSZTopfT5ka30GRSlaU3c6ckVVaR4eBiMvDT8aR38lBkcNWAI8UVctTitNAxlGbCBbZz9gTSU%3D; t=Oh8LmQ5pyMOhjqah; wt=Oh8LmQ5pyMOhjqah; sid=sem_pz_bdpc_dasou_title; __c=1591769829; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDdiHC088qh0KZEgsZ9X8KX00000ZoOx7C00000UkfexZ.THdBULP1doZA80K85yF9pywd0ZnquAu9rjTdnj6snj0YrHc4mfKd5Hmkwbnsn1RzfbmLn1mvfRPArRf4wjnsfRcvf1wAn1bd0ADqI1YhUyPGujY1n1f1PWTsnHckFMKzUvwGujYkP6K-5y9YIZK1rBtEILILQMGCpgKGUB4WUvYE5LPGujd1uydxTZGxmhwsmdqbmgPEINqYpgw_ufKWThnqn1nYrHD%26tpl%3Dtpl_11534_22672_17382%26l%3D1518141306%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253DBOSS%2525E7%25259B%2525B4%2525E8%252581%252598%2525E2%252580%252594%2525E2%252580%252594%2525E6%252589%2525BE%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E6%252588%252591%2525E8%2525A6%252581%2525E8%2525B7%25259F%2525E8%252580%252581%2525E6%25259D%2525BF%2525E8%2525B0%252588%2525EF%2525BC%252581%2526xp%253Did(%252522m3343670121_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D140%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3DBoss%25E7%259B%25B4%25E8%2581%2598%26oq%3DBoss%25E7%259B%25B4%25E8%2581%2598%26rqlang%3Dcn&g=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&friend_source=0&friend_source=0; _bl_uid=I3k1mb9y8d8y37ngjsvq4eevzRaj; __zp_seo_uuid__=8792ec29-03f0-439e-86f2-1b86c2c55784; __a=40025213.1591065226.1591691731.1591769829.57.7.3.3"
def get_one_page(url):
    """Download one search-result page and return its HTML text.

    The module-level ``cookie`` string is sent as the ``Cookie`` request
    header. Previously the cookie was defined but never attached to the
    request; without it the site reportedly serves a placeholder page
    instead of the job list, which leaves the output CSV empty.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        including a timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
        'Cookie': cookie,
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
result_all = [] # Module-level accumulator: parse_one_page appends one row (a list of fields) per job card.
def parse_one_page(html):
    """Parse one result page and append a row per job card to ``result_all``.

    Args:
        html: Page markup as returned by ``get_one_page``. May be ``None``
            or empty when the download failed; such pages are skipped.

    Returns:
        None. Rows are appended to the module-level ``result_all`` list.
    """
    # get_one_page returns None on failure; handing that to BeautifulSoup
    # fails and would abort the whole crawl, so skip the page instead.
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    # Every job card on the page is a <div class="job-primary">.
    for card in soup.find_all('div', class_='job-primary'):
        result_all.append(parse_one_company(card))

def parse_one_company(comp):
    """Extract the four text fields of one job card.

    Args:
        comp: Element for a single job card. It must contain a
            ``div.info-company`` holding a ``<p>`` and a ``div.info-primary``
            holding a ``<div>``, a ``<span>`` and a ``<p>``.

    Returns:
        A list ``[company description, job name, salary, requirement]`` built
        from the ``.text`` of those elements.

    Raises:
        AttributeError: If any of the expected elements is missing
            (``find`` returns ``None``).
    """
    company_block = comp.find('div', class_='info-company')
    company_text = company_block.find('p').text
    primary_block = comp.find('div', class_='info-primary')
    # The first <div>, <span> and <p> hold job name, salary and requirement.
    job_fields = [primary_block.find(tag).text for tag in ('div', 'span', 'p')]
    return [company_text] + job_fields

def parse_all_page(num, offset):
    """Fetch and parse one result page of the data-analyst search for a city.

    Args:
        num: City selector; ``str(num)`` must be '1' (Guangzhou),
            '2' (Shenzhen), '3' (Beijing) or '4' (Shanghai).
        offset: Page number, used for both the ``page`` and ``ka`` query
            parameters.

    Raises:
        KeyError: If ``str(num)`` is not one of the four known selectors.
    """
    page = str(offset)
    city_codes = {
        '1': '101280100',  # Guangzhou
        '2': '101280600',  # Shenzhen
        '3': '101010100',  # Beijing
        '4': '101020100',  # Shanghai
    }
    code = city_codes[str(num)]
    # The city code appears twice in the path: /c<code>/h_<code>/.
    url = ('https://www.zhipin.com/c' + code + '/h_' + code
           + '/?query=数据分析师&page=' + page + '&ka=page-' + page)
    parse_one_page(get_one_page(url))

if __name__ == '__main__':
    # Script entry point: crawl pages 1-10 for each of the four cities, then
    # append the collected rows to a CSV file.
    import os

    for j in range(1, 5):
        for i in range(1, 11):
            parse_all_page(j, i)
    output_path = 'Bosszhiping_four_city.csv'
    file = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
    # The file is opened in append mode, so write the header row only when the
    # file is new or empty; otherwise every re-run adds another header line in
    # the middle of the data.
    write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    # encoding='utf_8_sig' avoids garbled characters when the saved CSV is opened.
    file.to_csv(output_path, mode='a', index=True, header=write_header, encoding='utf_8_sig')


``

1个回答

你确定爬到了页面的数据吗?我用你的代码测试的时候,发现爬到的都是“请稍后”的提示页面。

爬取Boss直聘需要在请求里带上自己的cookies,而且它的cookies每次都会变,这需要你自己想办法通过它的js文件获取。下面的代码实现了爬取并存储到csv文件;运行前请先更换为自己的cookie,并删除目录里原有的csv文件。

import requests
import csv
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
from requests.exceptions import RequestException



def get_one_page(url):
    """Download one search-result page and return its decoded HTML.

    The request carries a hard-coded browser User-Agent and session cookie.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200, otherwise ``None`` (non-200 status or any
        ``RequestException``, including a timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    # Hard-coded session cookie; the site rotates it, so replace it with a
    # fresh value before running.
    cookies = "__zp__pub__=; _uab_collina=159178398373799871604155; lastCity=100010000; __c=1591783980; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1591783980; __l=l=%2Fwww.zhipin.com%2Fc101280100%2Fh_101280100%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588&r=&friend_source=0&friend_source=0; __a=942990.1591783980..1591783980.8.1.8.8; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1591834512; __zp_stoken__=226aaABFRYnVuOBpEfiQLP1IsSndBHmsQQFh2eBYxWmU7UHwcXAAhW1wdHWB%2BIVo0HmQDGF1EawMDHH0Edz8%2BIXY2XRwMFQNge0YlZzI2CHdFAzBuZVQVbTVzdV4tDzp%2FXF1GZwY%2FSHRLBg0%3D"
    cook_dict = {}
    for pair in cookies.split('; '):
        # Split on the FIRST '=' only: values such as "__l=l=...&r=&..."
        # contain '=' themselves and were truncated by split('=')[1].
        name, _, value = pair.partition('=')
        if name:
            cook_dict[name] = value
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, cookies=cook_dict, timeout=10)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.content.decode('utf-8')


result_all = []  # Module-level accumulator: parse_one_page appends one row (a list of fields) per job card.


def parse_one_page(html):
    """Collect every job card on one result page into ``result_all``.

    Args:
        html: Page markup as returned by ``get_one_page``. A failed download
            yields ``None``; ``None`` or empty markup is ignored.

    Returns:
        None. One row per job card is appended to the module-level
        ``result_all`` list.
    """
    if not html:
        # Nothing was downloaded (get_one_page returned None); parsing it
        # would fail and stop the crawl, so leave result_all untouched.
        return
    soup = BeautifulSoup(html, 'lxml')
    job_cards = soup.find_all('div', class_='job-primary')
    result_all.extend(parse_one_company(card) for card in job_cards)


def parse_one_company(comp):
    """Extract the four text fields of one job card.

    Args:
        comp: Element for a single job card. It must contain a
            ``div.info-company`` with an ``<a>``, a ``div.job-title`` with an
            ``<a>``, and a ``div`` with class ``job-limit clearfix`` holding
            a ``<span>`` and a ``<p>``.

    Returns:
        A list ``[company, job name, salary, requirement]`` built from the
        ``.text`` of those elements.

    Raises:
        AttributeError: If any of the expected elements is missing
            (``find`` returns ``None``).
    """
    def _text_of(parent, tag):
        # Text of the first <tag> element found under ``parent``.
        return parent.find(tag).text

    company = _text_of(comp.find('div', class_='info-company'), 'a')
    title = _text_of(comp.find('div', class_='job-title'), 'a')
    # Salary (<span>) and requirement (<p>) share the same container.
    limit_block = comp.find('div', class_='job-limit clearfix')
    pay = _text_of(limit_block, 'span')
    needs = _text_of(limit_block, 'p')
    return [company, title, pay, needs]


def parse_all_page(num, offset):
    """Download and parse one page of the data-analyst search for one city.

    Args:
        num: City selector; ``str(num)`` must be '1' (Guangzhou),
            '2' (Shenzhen), '3' (Beijing) or '4' (Shanghai).
        offset: Page number, inserted into both the ``page`` and ``ka``
            query parameters.

    Raises:
        KeyError: If ``str(num)`` is not one of the four known selectors.
    """
    page = str(offset)
    # The city code appears twice in the path: /c<code>/h_<code>/.
    template = 'https://www.zhipin.com/c{city}/h_{city}/?query=数据分析师&page={page}&ka=page-{page}'
    cities = (
        ('1', '101280100'),  # Guangzhou
        ('2', '101280600'),  # Shenzhen
        ('3', '101010100'),  # Beijing
        ('4', '101020100'),  # Shanghai
    )
    targets = {key: template.format(city=code, page=page) for key, code in cities}
    page_html = get_one_page(targets[str(num)])
    parse_one_page(page_html)

if __name__ == '__main__':
    # Script entry point: crawl pages 1-10 for each of the four cities, print
    # the collected rows, then append them to a CSV file.
    import os

    for j in range(1, 5):
        for i in range(1, 11):
            parse_all_page(j, i)
    print(result_all)
    output_path = 'Bosszhiping_four_city.csv'
    file = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
    # The file is opened in append mode, so write the header row only when the
    # file is new or empty; otherwise every re-run adds another header line in
    # the middle of the data (previously the old CSV had to be deleted by hand).
    write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    # encoding='utf_8_sig' avoids garbled characters when the saved CSV is opened.
    file.to_csv(output_path, mode='a', index=True, header=write_header, encoding='utf_8_sig')




qq_36369061
DataSheep 回复Lynnzzz: 你可以试试我刚刚发出来的代码
3 个月之前 回复
Lynnzzz
Lynnzzz 没有爬到 CSV文档中是空白
3 个月之前 回复
Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问
相关内容推荐