每次写爬虫的时候,用for in 写循环的结果总是不理想,提取出了节点列表,在提取信息的时候总是会出现问题
import os
import requests
from selenium import webdriver #导入webdriver模块
import time
from lxml import etree
url='https://shop149668360.taobao.com/category-1191801314.htm?spm=a1z10.5-c.w4002-22681847748.61.465d5c8f0dh86q&_ksTS=1659077354304_134&callback=jsonp135&input_charset=gbk&mid=w-22681847748-0&wid=22681847748&path=%2Fcategory-1191801314.htm&search=y&catName=%C0%AD%C9%EC%B5%AF%BB%C9&scene=taobao_shop&catId=1191801314&pageNo=1#anchor'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
'cookie': 'cna=n29qG1ayvS8CAXjrPrkLPEYS; t=95a6aaddbb3b982ba6bff21e21e1a82f; sgcookie=E1005IBFHd2%2Ba9zk9ToHYY7uoomUqSo2MenADQh000aADhIBvH8KI2fIJmbc3dJOhIk3j6H%2BmHXj%2BOATXVvLt5jT87gjU4d99rq90CvSEVJVKcw%3D; uc3=lg2=UIHiLt3xD8xYTw%3D%3D&nk2=F5RHo3%2Bchnwovg%3D%3D&vt3=F8dCv4Jjgeh1e7zPFGw%3D&id2=UNX8g2IwxK%2Fnog%3D%3D; lgc=tb22421930; uc4=nk4=0%40FY4MsTXATnUZNB%2FZTM%2FgjtVhHlE9&id4=0%40UgJ5MLXCY0y6abgKbGhYehJ0jGdz; tracknick=tb22421930; _cc_=WqG3DMC9EA%3D%3D; thw=cn; enc=OK%2FpAK1erq88qjMHjWKgm9DeOwIZSDi8ItyHS9Ku49yjMectsp38SatVdN0k5ukbNYy5QgQnhey8G7wPj277ew%3D%3D; xlly_s=1; _m_h5_tk=b938cb534abcb6427e11f4673a0fce73_1659346008765; _m_h5_tk_enc=8377ec7fabd6e2d0eccf19ecfdcc0d0e; mt=ci=-1_0; uc1=cookie14=UoexOz%2FfS1JcqQ%3D%3D; _tb_token_=501133f5357e5; cookie2=1c616690780d906e68deca0698bf5ad4; pnm_cku822=; isg=BOLiWFUY9NpOAuiC3XQmOXplM2hEM-ZNc4qp-Cx7DtUA_4J5FMM2XWh5LzsDb17l; tfstk=cBsPBui_yuEy0w_NuntUdXeJh_-RZUjhfzppr5Ohqdu8DLLliZlpnfiAw6kqtUf..; l=eBOEgfuVL5CkONefBOfwourza77OSIRAguPzaNbMiOCP965p5hnFW6xuccL9C3GVh6DwR3Wrj_IwBeYBqnvYbeZ7a6PlPaMmn',
'Referer': 'https://shop149668360.taobao.com/category-1192388203.htm?spm=a1z10.1-c.w5001-22681847674.7.34e04d1ekuitHF&search=y&catName=%D1%B9%CB%F5%B5%AF%BB%C9&scene=taobao_shop'
}
proxies = {
'http': 'http://127.0.0.1:8888',
'https': 'http://127.0.0.1:8888',
}
browser = webdriver.Chrome()
browser.get(url)
#压簧系列
try:
element = browser.find_element_by_xpath('//*[@id="sufei-dialog-close"]')
element.click()
except:
time.sleep(2)
ite ={}
a = browser.page_source
a2 = etree.HTML(a)
pro_list = a2.xpath('//*[@class="item4line1"]/dl')
print(pro_list)
if not os.path.exists('img'):
os.mkdir('img')
for pro in pro_list:
pic = pro.xpath('//dt/a/img/@src')[0]
pro_n = pro.xpath('//dd/a/text()')[0]
ite['图片地址'] = pic
ite['名字'] = pro_n
url = 'https:' + str(ite['图片地址'])
str2 = url.replace('[', '')
str3 = str2.replace(']', '')
str4 = str2.replace('\'', '')
print(str3)
r = requests.get(str4, headers=headers)
filename = str(ite['名字'])
f = open('img/' + filename + '.png', 'ab')
f.write(r.content)
f.close()
print(ite)
[<Element dl at 0x1138ccb4288>, <Element dl at 0x1138ccb4048>, <Element dl at 0x1138ccb4308>, <Element dl at 0x1138ccb4248>, <Element dl at 0x1138ccb41c8>, <Element dl at 0x1138ccb4188>, <Element dl at 0x1138ccb4208>, <Element dl at 0x1138ccb43c8>, <Element dl at 0x1138ccb45c8>, <Element dl at 0x1138ccb4108>, <Element dl at 0x1138ccb42c8>, <Element dl at 0x1138ccb44c8>, <Element dl at 0x1138ccb4648>, <Element dl at 0x1138ccb4688>, <Element dl at 0x1138ccb46c8>, <Element dl at 0x1138ccb4708>, <Element dl at 0x1138ccb4748>, <Element dl at 0x1138ccb4788>, <Element dl at 0x1138ccb47c8>, <Element dl at 0x1138ccb4808>]
https://img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg
{'图片地址': '//img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg', '名字': '弹簧压簧大全强力小五金压缩1.5压力300长不锈钢软机械回位弹黄钢'}
https://img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg
{'图片地址': '//img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg', '名字': '弹簧压簧大全强力小五金压缩1.5压力300长不锈钢软机械回位弹黄钢'}
https://img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg
{'图片地址': '//img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg', '名字': '弹簧压簧大全强力小五金压缩1.5压力300长不锈钢软机械回位弹黄钢'}
https://img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg
{'图片地址': '//img.alicdn.com/bao/uploaded/i1/1609787275/O1CN01ansOM723c07i1c3uH_!!1609787275.jpg_180x180.jpg', '名字': '弹簧压簧大全强力小五金压缩1.5压力300长不锈钢软机械回位弹黄钢'}
后面都是输出一样的
尝试过好多种更改,不然就是全部爬取到一起来,不然就是爬取不出东西
不知道到底哪里出现了错误,之前试过爬取豆瓣的也是这样。
求解!谢谢!