Lynnzzz 2020-06-10 15:37

Boss直聘 job-posting analysis: scraping returns no data

My final project is to scrape job-posting data from Boss直聘. I put the code below together from what we covered in class plus examples found online, but the CSV file it writes is still empty and I can't find the problem. Could anyone take a look and point out where it goes wrong? Thanks!

```python
import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
cookie="__zp__pub__=; lastCity=101110100; __zp_stoken__=ddfaaC1oxcwcya3oOXUNWPSV5Vn9mfABYVFAhHlNZG1cvMgZMNnVnHHNKSzcYaQtJQSADE3tSZTopfT5ka30GRSlaU3c6ckVVaR4eBiMvDT8aR38lBkcNWAI8UVctTitNAxlGbCBbZz9gTSU%3D; t=Oh8LmQ5pyMOhjqah; wt=Oh8LmQ5pyMOhjqah; sid=sem_pz_bdpc_dasou_title; __c=1591769829; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDdiHC088qh0KZEgsZ9X8KX00000ZoOx7C00000UkfexZ.THdBULP1doZA80K85yF9pywd0ZnquAu9rjTdnj6snj0YrHc4mfKd5Hmkwbnsn1RzfbmLn1mvfRPArRf4wjnsfRcvf1wAn1bd0ADqI1YhUyPGujY1n1f1PWTsnHckFMKzUvwGujYkP6K-5y9YIZK1rBtEILILQMGCpgKGUB4WUvYE5LPGujd1uydxTZGxmhwsmdqbmgPEINqYpgw_ufKWThnqn1nYrHD%26tpl%3Dtpl_11534_22672_17382%26l%3D1518141306%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253DBOSS%2525E7%25259B%2525B4%2525E8%252581%252598%2525E2%252580%252594%2525E2%252580%252594%2525E6%252589%2525BE%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E6%252588%252591%2525E8%2525A6%252581%2525E8%2525B7%25259F%2525E8%252580%252581%2525E6%25259D%2525BF%2525E8%2525B0%252588%2525EF%2525BC%252581%2526xp%253Did(%252522m3343670121_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D140%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3DBoss%25E7%259B%25B4%25E8%2581%2598%26oq%3DBoss%25E7%259B%25B4%25E8%2581%2598%26rqlang%3Dcn&g=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&friend_source=0&friend_source=0; _bl_uid=I3k1mb9y8d8y37ngjsvq4eevzRaj; __zp_seo_uuid__=8792ec29-03f0-439e-86f2-1b86c2c55784; __a=40025213.1591065226.1591691731.1591769829.57.7.3.3"
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
        html = requests.get(url, headers=headers)
        if html.status_code == 200:
            return html.text
        return None
    except RequestException:
        return None
result_all = [] # accumulates one parsed row per job posting
def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')
    companies = soup.find_all('div', 'job-primary', True)
    for com in companies:
        res = parse_one_company(com)
        result_all.append(res)

def parse_one_company(comp):
    result = []
    company_soup = comp.find('div', class_='info-company')
    com_desc = company_soup.find('p').text
    primary_soup = comp.find('div', class_='info-primary')
    job_name = primary_soup.find('div').text
    salary = primary_soup.find('span').text
    requirement = primary_soup.find('p').text
    result.append(com_desc)
    result.append(job_name)
    result.append(salary)
    result.append(requirement)
    return result

def parse_all_page(num, offset):
    url1 = 'https://www.zhipin.com/c101280100/h_101280100/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset) # Guangzhou
    url2 = 'https://www.zhipin.com/c101280600/h_101280600/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset) # Shenzhen
    url3 = 'https://www.zhipin.com/c101010100/h_101010100/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset) # Beijing
    url4 = 'https://www.zhipin.com/c101020100/h_101020100/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset) # Shanghai
    urldict = {'1':url1, '2':url2, '3':url3, '4':url4}
    html = get_one_page(urldict[str(num)])
    parse_one_page(html)

if __name__ == '__main__':
    for j in range(1, 5):
        for i in range(1,11):
            parse_all_page(j, i)
    file = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
    # encoding='utf_8_sig' prevents mojibake when the CSV is opened in Excel
    file.to_csv('Bosszhiping_four_city.csv', mode='a', index=True, encoding='utf_8_sig')


```
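For anyone debugging the same symptom: before blaming the CSV step, it helps to look at what `get_one_page` actually returns. Below is a minimal diagnostic sketch against the code above (the URL is one of the four built in `parse_all_page`; printing the first 500 characters is just for eyeballing the response):

```python
# Quick check: fetch one listing page and inspect what actually came back.
# Uses the get_one_page() defined above.
from bs4 import BeautifulSoup

html = get_one_page('https://www.zhipin.com/c101280100/h_101280100/?query=数据分析师&page=1&ka=page-1')
if html is None:
    print('request failed or returned a non-200 status')
else:
    print(html[:500])  # an anti-bot / "please wait" page here means cookies are required
    soup = BeautifulSoup(html, 'lxml')
    print('job-primary blocks found:', len(soup.find_all('div', 'job-primary')))
```

If the block count printed here is 0, the parsing functions never append anything and the DataFrame (and therefore the CSV) comes out empty, which is exactly the symptom described.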

2 answers

  • 编程理想国 2020-06-10 18:56

    Are you sure you actually scraped any page data? When I tested the code, every page I got back was the "please wait" interstitial.

    Scraping Boss直聘 requires sending your own cookies with the request, and the site rotates them on every visit, so you will have to work out how to obtain fresh ones from its JS yourself. The code below does the scraping and writes the results to a CSV file. Before running it, swap in your own cookie string and delete any existing CSV file in your directory (the script opens the file in append mode, so old rows would otherwise pile up).

    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    from requests.exceptions import RequestException


    def get_one_page(url):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
            }
            cookies = "__zp__pub__=; _uab_collina=159178398373799871604155; lastCity=100010000; __c=1591783980; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1591783980; __l=l=%2Fwww.zhipin.com%2Fc101280100%2Fh_101280100%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588&r=&friend_source=0&friend_source=0; __a=942990.1591783980..1591783980.8.1.8.8; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1591834512; __zp_stoken__=226aaABFRYnVuOBpEfiQLP1IsSndBHmsQQFh2eBYxWmU7UHwcXAAhW1wdHWB%2BIVo0HmQDGF1EawMDHH0Edz8%2BIXY2XRwMFQNge0YlZzI2CHdFAzBuZVQVbTVzdV4tDzp%2FXF1GZwY%2FSHRLBg0%3D"
            # Split only on the first '=': some cookie values (e.g. __l) contain
            # '=' themselves and would otherwise be cut short.
            cook_dict = {c.split('=', 1)[0]: c.split('=', 1)[1] for c in cookies.split('; ')}
            html = requests.get(url, headers=headers, cookies=cook_dict)
            if html.status_code == 200:
                return html.content.decode('utf-8')
            return None
        except RequestException:
            return None
    
    
    result_all = []  # accumulates one parsed row per job posting
    
    
    def parse_one_page(html):
        soup = BeautifulSoup(html, 'lxml')
        companies = soup.find_all('div', 'job-primary', True)
        for com in companies:
            res = parse_one_company(com)
            result_all.append(res)
    
    
    def parse_one_company(comp):
        result = []
        company_soup = comp.find('div', class_='info-company')
        com_desc = company_soup.find('a').text
        job_soup = comp.find('div', class_='job-title')
        job_name = job_soup.find('a').text
        salary_soup = comp.find('div', class_='job-limit clearfix')
        salary = salary_soup.find('span').text
        requirement = salary_soup.find('p').text
        result.append(com_desc)
        result.append(job_name)
        result.append(salary)
        result.append(requirement)
        return result
    
    
    def parse_all_page(num, offset):
        url1 = 'https://www.zhipin.com/c101280100/h_101280100/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(
            offset)  # Guangzhou
        url2 = 'https://www.zhipin.com/c101280600/h_101280600/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(
            offset)  # Shenzhen
        url3 = 'https://www.zhipin.com/c101010100/h_101010100/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(
            offset)  # Beijing
        url4 = 'https://www.zhipin.com/c101020100/h_101020100/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(
            offset)  # Shanghai
        urldict = {'1': url1, '2': url2, '3': url3, '4': url4}
        html = get_one_page(urldict[str(num)])
        parse_one_page(html)
    
    
    if __name__ == '__main__':
        for j in range(1, 5):
            for i in range(1, 11):
                parse_all_page(j, i)
        print(result_all)
        file = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
        # encoding='utf_8_sig' prevents mojibake when the CSV is opened in Excel
        file.to_csv('Bosszhiping_four_city.csv', mode='a', index=True, encoding='utf_8_sig')
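
    A usage note on the script above: __zp_stoken__ rotates, so the hard-coded cookie string goes stale quickly and has to be re-copied from the browser before each run. Below is a minimal sketch of a gentler main loop, assuming the functions above are already defined; the time.sleep pacing is an addition of mine, not part of the original answer, and the 2-second pause is an arbitrary choice.

```python
import time

# Drives parse_all_page() with a pause between requests so the server is
# less likely to hand back the anti-bot "please wait" page.
if __name__ == '__main__':
    for j in range(1, 5):        # the four city URL sets
        for i in range(1, 11):   # pages 1-10
            parse_all_page(j, i)
            time.sleep(2)        # assumption: 2s pacing; tune as needed
    print('collected', len(result_all), 'rows')
```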
