import requests
from lxml import etree
import csv
# Request configuration shared by every fetch in this script.
headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                   " (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"),
}
# NOTE(review): requests normally expects 'http://host:port' for proxy
# entries — this bare IP has no scheme or port; confirm it actually routes.
proxy = {'http': '58.220.95.90'}
def get_detail_url(url):
    """Fetch one search-result page and return the job-detail links.

    :param url: absolute URL of a listing page.
    :return: list of href strings found under ``div.job-content h3 a``
        (may be empty if the page has no matches).
    """
    page = requests.get(url, headers=headers, proxies=proxy)
    doc = etree.HTML(page.text)
    return doc.xpath('//div[@class="job-content"]//h3//a/@href')
def parse_detail_url(url):
    """Scrape a single job-detail page into a dict.

    Keys are Chinese on purpose — they double as the CSV header written by
    ``spider()``: 职位 (title), 用人公司 (company), 工资 (salary),
    地址 (address), 描述 (description).

    Fix over the original: each field used ``xpath(...)[0]``, so a single
    page missing one element raised IndexError and killed the whole crawl.
    Missing fields now come back as '' instead.

    :param url: absolute URL of a job-detail page.
    :return: dict with the five keys above; values are '' when absent.
    """
    response = requests.get(url, headers=headers, proxies=proxy)
    # If the site serves GBK instead of UTF-8, switch to:
    # text = response.content.decode('gbk')
    html = etree.HTML(response.text)

    def _first(expr):
        # xpath() returns a list; take the first hit, or '' when no match.
        nodes = html.xpath(expr)
        return nodes[0] if nodes else ''

    carrers = {}
    carrers['职位'] = _first('//div[@class="title-info"]//h1/text()')
    carrers['用人公司'] = _first("//div[@class='title-info']//h3/a/text()")
    carrers['工资'] = _first('//div[@class="job-title-left"]//p[@class="job-item-title"]/text()')
    carrers['地址'] = _first('//div[@class="job-title-left"]//p[@class="basic-infor"]/text()')
    carrers['描述'] = _first("//div[@class='content content-word']/text()")
    return carrers
def spider():
    """Crawl pages 1-9 of the Liepin 'Python' search and write result.csv.

    Fixes over the original:
      * The URL contained a mojibake '°radeFlag' — the HTML entity '&deg'
        had been rendered as '°'; restored to the intended '&degradeFlag'.
      * ``open(..., newline='')`` — required by the csv module, otherwise
        every row is followed by a blank line on Windows.
      * ``print(zhaopin)`` ran inside the inner loop, re-printing the whole
        accumulated list once per scraped job; moved to a single final dump.
      * 'filednames' typo renamed to 'fieldnames' (local only).
    """
    base_url = (
        'https://www.liepin.com/zhaopin/?init=-1'
        '&headckid=8abc72d8e99a221e&dqs=&fromSearchBtn=2'
        '&imscid=R000000035&ckid=3b940b77623f2371'
        '&degradeFlag=0&key=Python'
        '&siTag=p_XzVCa5J0EfySMbVjghcw~fA9rXquZc5IkJpXC-Ycixw'
        '&d_sfrom=search_unknown'
        '&d_ckId=d19babb6ec73fe05edac4894af70bb1e'
        '&d_curPage=1&d_pageSize=40'
        '&d_headId=ebc370a67b2c5d86e72b13b82b6e90c7&curPage={}'
    )
    zhaopin = []
    for page in range(1, 10):
        url = base_url.format(page)
        for detail_url in get_detail_url(url):
            zhaopin.append(parse_detail_url(detail_url))
    print(zhaopin)
    fieldnames = ['职位', '用人公司', '工资', '地址', '描述']
    with open('result.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        writer.writerows(zhaopin)
# NOTE(review): removed a stray line that was pasted Chinese forum prose fused
# onto a 'for' header ("before this spot it was 9 and printed; changed to 10
# it stopped printing; changed back to 9, still nothing") — it was not valid
# Python and made the whole file a SyntaxError.
# Also fixed the entry guard: 'name' (NameError) -> '__name__'.
if __name__ == '__main__':
    spider()