问题遇到的现象和发生背景 当我爬取小量页面的时候就没问题,可页面多起来的时候就会出现str' object has no attribute 'get_text'问题,应该如何改下列代码,然后爬取多点页面数据,不出问题呢?十分感谢
import requests
import time
from bs4 import BeautifulSoup
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53'
}
def get_info(url):
res=requests.get(url,headers=headers)
soup=BeautifulSoup(res.text,'lxml')
name=soup.select('#proName > a') if len(soup.select('#proName > a'))>0 else '无'
price=soup.select('body > div.product-con > div.product-con > div.fl.pro-left > div.cell-con > div.cell-price > span.red')if len(soup.select('body > div.product-con > div.product-con > div.fl.pro-left > div.cell-con > div.cell-price > span.red'))>0 else '无'
operator=soup.select('#cell-con-table > ul:nth-child(2) > li:nth-child(1) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(2) > li:nth-child(1) > div.right > p'))>0 else '无'
fast=soup.select('#cell-con-table > ul:nth-child(4) > li:nth-child(6) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(4) > li:nth-child(6) > div.right > p'))>0 else '无'
size=soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(1) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(1) > div.right > p'))>0 else '无'
texture=soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(2) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(2) > div.right > p'))>0 else '无'
resolution=soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(3) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(3) > div.right > p'))>0 else '无'
pixel=soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(4) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(6) > li:nth-child(4) > div.right > p'))>0 else '无'
system=soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(1) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(1) > div.right > p'))>0 else '无'
cpu=soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(2) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(2) > div.right > p'))>0 else '无'
memory=soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(7) > div.right > p > a')if len(soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(7) > div.right > p > a'))>0 else '无'
capacity=soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(9) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(8) > li:nth-child(9) > div.right > p'))>0 else '无'
sensor=soup.select('#cell-con-table > ul:nth-child(10) > li:nth-child(1) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(10) > li:nth-child(1) > div.right > p'))>0 else '无'
rearcamera=soup.select('#cell-con-table > ul:nth-child(10) > li:nth-child(2) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(10) > li:nth-child(2) > div.right > p'))>0 else '无'
frontcamera=soup.select('#cell-con-table > ul:nth-child(10) > li:nth-child(3) > div.right > p')if len(soup.select('#cell-con-table > ul:nth-child(10) > li:nth-child(3) > div.right > p'))>0 else '无'
for name,price,operator,fast,size,texture,resolution,pixel,system,cpu,memory,capacity,sensor,rearcamera,frontcamera in zip(name,price,operator,fast,size,texture,resolution,pixel,system,cpu,memory,capacity,sensor,rearcamera,frontcamera):
data={
'name':name.get_text().strip(),
'price': price.get_text().split('\n')[-1].strip(),
'operator': operator.get_text().split('\n')[1].strip(),
'fast':fast.get_text().strip(),
'size':size.get_text().strip(),
'texture':texture.get_text().strip(),
'resolution':resolution.get_text().strip(),
'pixel':pixel.get_text().strip(),
'system':system.get_text().strip(),
'cpu':cpu.get_text().strip(),
'memory': memory.get_text().strip(),
'capacity': capacity.get_text().strip(),
'sensor':sensor.get_text().strip(),
'rearcamera': rearcamera.get_text().strip(),
'frontcamera':frontcamera.get_text().strip(),
}
print(data)
if __name__ == '__main__':
urls=['https://product.cnmo.com/1626/162{}/param.shtml'.format(("%04d" % i)) for i in range(5700,5732)]
for url in urls:
get_info(url)
time.sleep(1)