# 以下为一段模拟浏览器爬虫的代码,我有15个url,每个都是不一样的,可是我获取的all_content全部是重复,一样的数据。
# (Bug: the loop always switched to window_handles[1] — the same first tab — so every iteration scraped the same page; fixed below.)
import random
from selenium import webdriver
from lxml import etree
import json
import time

# Raw string so backslashes in the Windows path can never be read as
# escape sequences (e.g. a path containing \t or \n would break otherwise).
driver_path = r'D:\geckodriver.exe'
driver = webdriver.Firefox(executable_path=driver_path)

# Open the Lagou search-result page for "Java" jobs, all cities.
url = "https://www.lagou.com/wn/jobs?kd=Java&city=%E5%85%A8%E5%9B%BD/"
driver.get(url)
# Give the page a moment to finish rendering before reading its HTML;
# otherwise the __NEXT_DATA__ script tag may not be in page_source yet.
time.sleep(3)

# The job list is embedded as JSON inside the Next.js bootstrap script.
html = etree.HTML(driver.page_source)
script = html.xpath('//script[@id="__NEXT_DATA__"]/text()')[0]
data = json.loads(script)
result = (data['props']['pageProps']['initData']
              ['content']['positionResult']['result'])
# Collect the numeric id of every job posting on this page.
results = [job.get('positionId') for job in result]
"""以上的代码都是用于导入库,和获取url而写的,可以先忽略不看,
看下面的代码,我是打算通过urls遍历得到每个url的数据,可是得出来都是重复的
"""
urls=[]
for result2 in results:
url='https://www.lagou.com/wn/jobs/'+str(result2)+'.html'
urls.append(url)
all_contents = []
# BUG FIX: the original code switched to driver.window_handles[1] on every
# iteration. New tabs are appended to window_handles, so index 1 is always
# the FIRST tab that was ever opened — every iteration re-parsed that same
# page, which is why all_contents held identical records. We now switch to
# the newest handle ([-1]), wait for the page to load, and close the tab
# afterwards so handles do not accumulate.
for url in urls:
    driver.execute_script("window.open('{}')".format(url))
    driver.switch_to.window(driver.window_handles[-1])
    # Wait for the detail page to render before reading its source.
    time.sleep(random.uniform(2, 4))
    html = etree.HTML(driver.page_source)

    job = html.xpath('//span/span[@class="position-head-wrap-position-name"]/text()')
    salary = html.xpath('//span/span[@class="salary"]/text()')

    # Clean the "city / experience / education" header fragments:
    # drop blanks and lone separators, strip '/' from the rest.
    others = ''
    for piece in html.xpath('//dd[@class="job_request"]/h3//text()'):
        piece = piece.strip()
        if not piece or piece == '/':
            continue
        others += piece.replace('/', '') + ' '

    # Concatenate the full job-description text, same cleanup rules.
    singles = ''
    for piece in html.xpath('//dd/div[@class="job-detail"]//text()'):
        piece = piece.strip()
        if not piece or piece == '/':
            continue
        singles += piece.replace('/', '')

    all_contents.append({
        "job": job,
        "salary": salary,
        "others": others,
        "singles": singles,
    })

    # Close the detail tab and return to the original search tab so the
    # next window.open() call has a stable starting point.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

# Print the collected records once, after all pages have been scraped.
print(all_contents)