需要构建请求信息,包括 url、headers、proxies 等信息,并利用 requests 库或者 selenium 库进行访问。
怎么解析网页,并将所有信息保存为 csv 文件?
关于python的问题
- 写回答
- 好问题 0 提建议
- 关注问题
- 邀请回答
-
2条回答 默认 最新
# Answer posted by 小杰911 on 2023-06-10 14:21.
"""Scrape Java developer job postings from 51job's mobile search into a CSV file.

For listing pages 1-10 the script collects the detail-page links, downloads
every detail page, extracts eight fields with XPath and appends one CSV row
per posting to ``OUTPUT_PATH``.
"""
import csv
import os

import requests
from lxml import etree

OUTPUT_PATH = 'java工程师招聘信息数据表.csv'

# Column titles, in the same order as the rows built by _parse_detail().
CSV_HEADER = ['招聘岗位', '薪水', '工作地点', '学历', '发布时间', '公司名称', '公司地址', '任职详细信息']

# Listing URL for the fixed keyword / job-area query; only the page number varies.
LIST_URL = (
    'https://msearch.51job.com/job_list.php'
    '?keyword=Java%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88'
    '&keywordtype=2&jobarea=020000&fromapp=&pageno={page}'
)

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the crawl indefinitely.
REQUEST_TIMEOUT = 10

# NOTE(review): the cookie below is a hard-coded browser session and has
# presumably expired; replace it with a fresh one before running.
# Adjacent literals are concatenated into one header value.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.41',
    'Referer': 'https://m.51job.com/',
    'Cookie': (
        'guid=9005afff64eb7f1f3f879ec0436f1583; '
        'nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; '
        'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229005afff64eb7f1f3f879ec0436f1583%22'
        '%2C%22first_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22'
        '%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22'
        '%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22'
        '%2C%22%24latest_referrer%22%3A%22%22%7D'
        '%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgxODE0ODNkN2ViNjUtMGI1ZmRlNDliZWFlZTE4LTI2MDIxYjUxLTEzNTA3MjgtMTgxODE0ODNkN2Y0NjciLCIkaWRlbnRpdHlfbG9naW5faWQiOiI5MDA1YWZmZjY0ZWI3ZjFmM2Y4NzllYzA0MzZmMTU4MyJ9%22'
        '%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%229005afff64eb7f1f3f879ec0436f1583%22%7D'
        '%2C%22%24device_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22%7D; '
        '_uab_collina=165573142584412596676913; '
        'partner=51jobhtml5; '
        'search=jobarea%7E%60000000%7C%21'
        'ord_field%7E%600%7C%21'
        'recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'
        'recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'
        'recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'
        'recentSearch3%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'
        'recentSearch4%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; '
        'msearchhistory=020000%2CJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%2C2%2C%2C%2C; '
        'm_search=areacode%3D020000%26%7C%26keyword%3DJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88; '
        'acw_tc=76b20ff916558203895284101e604b838909c0a4b9c6f6fbc1969a4d2dbe6d; '
        'SECKEY_ABVK=xgzxoSyn2rtXh5MMsu8vphF1pw/gDr23aCF3+p/Ki2c%3D; '
        'BMAP_SECKEY=2mBlqVQnvkTyj7CDYCSepCV66Z1JJFHnlxN1HBYtIwhr14tDfacPvwpRsh-FttecZsXYmsTaSLo4-NkE4QJbdsk64mkhlzL2Ezr_kzAwhhqC0PaFJqgR6OiihpJ14g93nuM-6VNoqCkYBFCDJ7SRQtIOUh9eKYILo6glUsfuGO8B5yK7TgM1g4D6YaaYHOS8; '
        'acw_sc__v2=62b1d0733f8741b0e9094a2a23d15f489ac3336d; '
        'ssxmod_itna=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb3qGNdAoDZDiqAPGhDC84Ix27RkD3h0EsDIoweDgC4xhxEGlMeTevQipyeDHxY=DU=CiKbD4+KGwD0eG+DD4DWeqAoDexGpc2pXKGWD4qDODWKDX2akDiPCDmR8pxGClxDCUAPDwx0CLovF=4YpDiyiROhPLxG1F40HiGfxLxOfL8G1RPSAozbO43YDvxDkDUKDo2PpDB+kBpYNQCRDWjuDYWb3qOx45xRi=Q2isx2rqZxxLQexQWUPKYaKTjqKKDDWilty4D===; '
        'ssxmod_itna2=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb4A=TGOj7DBTgq7pxLhBaHGFj+g6fk6LHNYq8O7APDgemv=GY6hBBj7QrIR3cj3A31LFj2YDzf7u27eq1F9LxsZ97BWggIlgSXwUsY8MpcmsSh9HY2wbmoebQAANZxBhum+GKtWCF4AirtAmzA5za=Grz8D7qGIb3PqeenpvFW5HW+ee2AGoDQIEDjKD+OfxEnDTV7hx4D=='
    ),
}

# XPath of the detail-page links on a listing page.
DETAIL_LINK_XPATH = '//div[@class="list"]/a/@rel'

# XPaths of the single-valued detail fields, in CSV column order.
FIELD_XPATHS = [
    '//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="jname"]/text()',  # job title
    '//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="sal"]/text()',  # salary
    '//div[@id="pageContent"]/div[1]/div/div[2]/div/span[1]/text()',  # work location
    '//div[@id="pageContent"]/div[1]/div/div[2]/div/span[2]/text()',  # education
    '//div[@id="pageContent"]/div[1]/div/div[2]/span/text()',  # publish time
    '//div[@id="pageContent"]/div[2]/div/div[2]/h3/text()',  # company name
    '//div[@id="pageContent"]/div[1]/a/span/text()',  # company address
]

# XPath of every text node of the job description article.
DESCRIPTION_XPATH = '//div[@id="pageContent"]/div[3]/div[2]/article//text()'


def _fetch_html(url):
    """Download *url* with the shared headers and return the parsed lxml tree.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def _first_or_empty(nodes):
    """Return the first XPath result, or '' when nothing matched.

    A posting that lacks one field therefore yields an empty cell instead of
    an IndexError that would abort the whole crawl.
    """
    return nodes[0] if nodes else ''


def _parse_detail(tree):
    """Extract one CSV row (eight values, CSV_HEADER order) from a detail page."""
    row = [_first_or_empty(tree.xpath(xpath)) for xpath in FIELD_XPATHS]
    # The description spans many text nodes: join them and drop non-breaking spaces.
    description = ",".join(tree.xpath(DESCRIPTION_XPATH)).replace('\xa0', '')
    row.append(description)
    return row


def main():
    """Crawl listing pages 1-10 and append every posting to OUTPUT_PATH."""
    # The file is opened in append mode, so only emit the header when the
    # file is new or empty; otherwise re-runs would insert duplicate headers.
    needs_header = not os.path.exists(OUTPUT_PATH) or os.path.getsize(OUTPUT_PATH) == 0
    scraped = 0
    # `with` guarantees the file is closed even if a request or parse fails.
    with open(OUTPUT_PATH, 'a', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(CSV_HEADER)
        for page in range(1, 11):
            listing = _fetch_html(LIST_URL.format(page=page))
            # Collect the detail-page links so each posting can be scraped in full.
            for detail_url in listing.xpath(DETAIL_LINK_XPATH):
                row = _parse_detail(_fetch_html(detail_url))
                scraped += 1
                print(f'正在爬取第{scraped}条招聘信息')
                print(row)
                writer.writerow(row)
            print(f'第{page}页爬取结束!!!')


if __name__ == '__main__':
    main()
本回答被题主选为最佳回答 , 对您是否有帮助呢?评论 打赏 举报解决 1无用