各位大佬好:
刚接触python,想用webdriver实现爬取京东商品信息,举个例子:在京东首页搜索 手机 会显示许多商品结果,我想在搜索出来的基础(商品价钱,商品链接,商品店铺链接)上根据这个“商品店铺链接”进一步爬取这个店铺里面的信息, 请问我的思路是否可行(不一定非要用webdriver),如果可行能否指导下怎么进入这个商品店铺链接 悬赏可以修改这是我的代码:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# noinspection SyntaxError
def get_good(driver):
try:
# 通过JS控制滚轮滑动获取所有商品信息
js_code = '''
window.scrollTo(0,5000);
'''
driver.execute_script(js_code) # 执行js代码
# 等待数据加载
time.sleep(2)
# 3、查找所有商品div
# good_div = driver.find_element_by_id('J_goodsList')
good_list = driver.find_elements_by_class_name('gl-item')
n = 1
for good in good_list:
# 根据属性选择器查找
# 商品链接
good_url = good.find_element_by_css_selector(
'.p-img a').get_attribute('href')
# 商品名称
good_name = good.find_element_by_css_selector(
'.p-name em').text.replace("\n", "--")
# 商品价格
good_price = good.find_element_by_class_name(
'p-price').text.replace("\n", ":")
# 评价人数
good_commit = good.find_element_by_class_name(
'p-commit').text.replace("\n", " ")
# 店铺连接
#try:
shop_url = good.find_element_by_css_selector(
'.p-shop a').get_attribute('href')
#except OSError:
pass
# 店铺名称
#try:
shop_name = good.find_element_by_css_selector(
'.p-shop a').get_attribute('title')
#except OSError:
pass
good_content = f'''
商品链接: {good_url}
商品名称: {good_name}
商品价格: {good_price}
评价人数: {good_commit}
店铺名称: {shop_url}
店铺连接: {shop_name}
\n
'''
#qianjian = '旗舰店'
#if qianjian in shop_name:
# print('旗舰店不要')
#else:
print(good_content)
with open('jd.txt', 'a', encoding='utf-8') as f:
f.write(good_content)
next_tag = driver.find_element_by_class_name('pn-next')
next_tag.click()
# except AttributeError:
# pass
# raise Exception("a must not be zero")
time.sleep(2)
# 递归调用函数
get_good(driver)
time.sleep(10)
finally:
driver.close()
if __name__ == '__main__':
good_name = input('请输入爬取商品信息:').strip()
driver = webdriver.Firefox()
#driver.implicitly_wait(10)
# 1、往京东主页发送请求
driver.get('https://www.jd.com/')
# 2、输入商品名称,并回车搜索
input_tag = driver.find_element_by_id('key')
input_tag.send_keys(good_name)
input_tag.send_keys(Keys.ENTER)
time.sleep(2)
get_good(driver)
ps:环境:python3.7 需要引入下浏览器驱动,我用的火狐(45.0.2)引入的驱动是:geckodriver-v0.15.0