爬取拉勾网源代码如下
import csv
import re
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class lagouSpitder(object):
    """Scrape python job listings from lagou.com with Selenium and save them to CSV.

    Usage: ``lagouSpitder().run()`` — crawls every listing page, visits each
    job-detail page in a second tab, accumulates one dict per job in
    ``self.positions``, and finally writes everything to ``positions.csv``.
    """

    # Path to the local chromedriver binary (adjust for your machine).
    driver_path = r'C:\Users\哥斯拉\AppData\Local\Google\Chrome\Application\chromedriver.exe'

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=lagouSpitder.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='
        self.positions = []  # accumulated job dicts, written out by save_to_csv()

    def run(self):
        """Main loop: walk every listing page, then persist results and quit."""
        self.driver.get(self.url)
        while True:
            # Wait for the "next page" button so we know the list has rendered.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, '//span[@action="next"]'))
            )
            source = self.driver.page_source  # one fully-rendered listing page
            self.parse_list_page(source)
            # Modern form of the deprecated find_element_by_xpath().
            next_btn = self.driver.find_element(By.XPATH, '//span[@action="next"]')
            # On the last page the button carries the "disabled" class: stop.
            if "pager_next pager_next_disabled" in next_btn.get_attribute('class'):
                break
            next_btn.click()
            time.sleep(1)
        self.save_to_csv()   # fix: results were never saved before
        self.driver.quit()   # fix: the browser was never closed

    def parse_list_page(self, source):
        """Extract every job-detail URL from one listing page and visit each."""
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            # Method name keeps the original spelling for backward compatibility.
            self.request_detall_page(link)
            time.sleep(2)  # be polite: pause between detail requests

    def request_detall_page(self, url):
        """Open the detail page in a new tab, parse it, close it, switch back."""
        self.driver.execute_script("window.open('%s')" % url)
        # switch_to_window() is deprecated; switch_to.window() is the current API.
        self.driver.switch_to.window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.pares_detail_page(source)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def pares_detail_page(self, source):
        """Parse one job-detail page into a flat dict of strings and store it."""
        html = etree.HTML(source)
        position_name = html.xpath('//span[@class="position-head-wrap-position-name"]/text()')[0]
        salary = html.xpath('//span[@class="salary"]/text()')[0]
        temptation = html.xpath('//dd[@class="job-advantage"]/p/text()')[0]
        # Join the text fragments so the CSV cell is one readable string,
        # not a Python list repr (the original stored the raw list).
        job_description = ''.join(html.xpath('//div[@class="job-detail"]//text()')).strip()
        # NOTE(review): index 3 assumes a fixed markup layout — confirm it still holds.
        work_address = html.xpath('//div[@class="work_addr"]//text()')[3]
        company_name = html.xpath('//h3[@class="fl"]/em/text()')[0]
        # Same list→string fix for the company info column.
        company_info = ' '.join(t.strip() for t in html.xpath('//li/h4/text()'))
        position = {
            '职位名字': position_name,
            '薪水': salary,
            '职位诱惑': temptation,
            '职位详情': job_description,
            '工作地址': work_address,
            '公司名字': company_name,
            '公司基本信息': company_info,
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)

    def save_to_csv(self, filename='positions.csv'):
        """Write all collected positions to *filename* as CSV.

        utf-8-sig adds a BOM so Excel displays the Chinese headers correctly.
        Does nothing when no positions were collected.
        """
        if not self.positions:
            return
        fieldnames = list(self.positions[0].keys())
        with open(filename, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.positions)
def main():
    """Build the spider and start crawling."""
    spider = lagouSpitder()
    spider.run()


if __name__ == '__main__':
    main()
以上代码已经能抓取到职位信息(存在 self.positions 列表里),请问如何把这些数据保存为 CSV 文件?麻烦帮忙完善一下代码,本人不是很懂,谢谢!