期末作业是爬取Boss直聘岗位相关数据,前期结合所学和网上的代码,但CSV文件中仍是空的,找不到问题,还望大家帮忙看看问题出在哪儿,谢谢大家!
import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
cookie="__zp__pub__=; lastCity=101110100; __zp_stoken__=ddfaaC1oxcwcya3oOXUNWPSV5Vn9mfABYVFAhHlNZG1cvMgZMNnVnHHNKSzcYaQtJQSADE3tSZTopfT5ka30GRSlaU3c6ckVVaR4eBiMvDT8aR38lBkcNWAI8UVctTitNAxlGbCBbZz9gTSU%3D; t=Oh8LmQ5pyMOhjqah; wt=Oh8LmQ5pyMOhjqah; sid=sem_pz_bdpc_dasou_title; __c=1591769829; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDdiHC088qh0KZEgsZ9X8KX00000ZoOx7C00000UkfexZ.THdBULP1doZA80K85yF9pywd0ZnquAu9rjTdnj6snj0YrHc4mfKd5Hmkwbnsn1RzfbmLn1mvfRPArRf4wjnsfRcvf1wAn1bd0ADqI1YhUyPGujY1n1f1PWTsnHckFMKzUvwGujYkP6K-5y9YIZK1rBtEILILQMGCpgKGUB4WUvYE5LPGujd1uydxTZGxmhwsmdqbmgPEINqYpgw_ufKWThnqn1nYrHD%26tpl%3Dtpl_11534_22672_17382%26l%3D1518141306%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253DBOSS%2525E7%25259B%2525B4%2525E8%252581%252598%2525E2%252580%252594%2525E2%252580%252594%2525E6%252589%2525BE%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E6%252588%252591%2525E8%2525A6%252581%2525E8%2525B7%25259F%2525E8%252580%252581%2525E6%25259D%2525BF%2525E8%2525B0%252588%2525EF%2525BC%252581%2526xp%253Did(%252522m3343670121_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D140%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3DBoss%25E7%259B%25B4%25E8%2581%2598%26oq%3DBoss%25E7%259B%25B4%25E8%2581%2598%26rqlang%3Dcn&g=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&friend_source=0&friend_source=0; _bl_uid=I3k1mb9y8d8y37ngjsvq4eevzRaj; __zp_seo_uuid__=8792ec29-03f0-439e-86f2-1b86c2c55784; __a=40025213.1591065226.1591691731.1591769829.57.7.3.3"
def get_one_page(url):
    """Fetch one search-result page and return its HTML text, or None on failure.

    Bug fix for the empty-CSV problem: the module-level ``cookie`` was defined
    but never sent, so Boss直聘's anti-bot check answered with a verification
    page containing no ``job-primary`` nodes — parsing then produced nothing.
    Attaching the cookie (plus a browser User-Agent) makes the site return the
    real listing page.

    Returns the response body for HTTP 200, otherwise None (including on any
    network error, which callers must tolerate).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/65.0.3325.162 Safari/537.36',
            # Without this header the site serves an anti-bot placeholder page.
            'cookie': cookie,
        }
        # timeout keeps one stalled request from hanging the whole crawl.
        html = requests.get(url, headers=headers, timeout=10)
        if html.status_code == 200:
            return html.text
        return None
    except RequestException:
        return None
result_all = []  # accumulates one [company info, job title, salary, requirements] row per scraped posting
def parse_one_page(html):
    """Parse one result page and append each job posting to ``result_all``.

    Bug fix: ``get_one_page`` returns None on any request failure, and
    ``BeautifulSoup(None, 'lxml')`` raises TypeError — a single failed page
    would previously crash the whole crawl before anything was written to the
    CSV. Failed/empty pages are now skipped.
    """
    if not html:
        return  # download failed — skip this page instead of crashing
    soup = BeautifulSoup(html, 'lxml')
    # 'job-primary' is the class of each job card; recursive search (True).
    companies = soup.find_all('div', 'job-primary', True)
    for com in companies:
        result_all.append(parse_one_company(com))
def parse_one_company(comp):
    """Extract one job card's fields from a 'job-primary' node.

    Returns a 4-element list:
    [company description, job title, salary, requirement line].
    """
    company = comp.find('div', class_='info-company')
    primary = comp.find('div', class_='info-primary')
    return [
        company.find('p').text,    # company description paragraph
        primary.find('div').text,  # job title
        primary.find('span').text, # salary range
        primary.find('p').text,    # requirement line (location/experience/degree)
    ]
def parse_all_page(num, offset):
    """Download and parse one search-result page for one of four cities.

    num: city selector — 1=Guangzhou, 2=Shenzhen, 3=Beijing, 4=Shanghai.
    offset: 1-based page number within that city's results.
    """
    city_codes = {
        '1': '101280100',  # Guangzhou
        '2': '101280600',  # Shenzhen
        '3': '101010100',  # Beijing
        '4': '101020100',  # Shanghai
    }
    code = city_codes[str(num)]
    url = ('https://www.zhipin.com/c{0}/h_{0}/'
           '?query=数据分析师&page={1}&ka=page-{1}').format(code, offset)
    parse_one_page(get_one_page(url))
if __name__ == '__main__':
    # Crawl four cities, ten result pages each (page numbers 1..10).
    for city in range(1, 5):
        for page in range(1, 11):
            parse_all_page(city, page)
    frame = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
    # encoding='utf_8_sig' prevents mojibake when the CSV is opened in Excel.
    frame.to_csv('Bosszhiping_four_city.csv', mode='a', index=True, encoding='utf_8_sig')