import requests
from lxml import etree
from time import sleep
from fake_useragent import UserAgent
import pandas as pd
name_all = []
min_price_all = []
mean_price_all = []
max_price_all = []
guige_all = []
unit_all = []
data_all = []
for i in range(5):
print(i)
UA = UserAgent().edge
url_base='http://www.xinfadi.com.cn/marketanalysis/0/list/{}.shtml'
url = 'http://www.xinfadi.com.cn/marketanalysis/0/list/{}.shtml'.format(i+1)
headers={
'User-Agent':UA
}
response=requests.get(url,headers=headers)
sleep(3)
# print(response.text)
# print(response.encoding)
e = etree.HTML(response.text)
name = e.xpath('''//table[@class='hq_table']/tr[position()>1]/td[1]/text()''')
min_price = e.xpath('''//table[@class='hq_table']/tr[position()>1]/td[2]/text()''')
mean_price = e.xpath('''//table[@class='hq_table']/tr[position()>1]/td[3]/text()''')
max_price = e.xpath('''//table[@class='hq_table']/tr[position()>1]/td[4]/text()''')
guige = e.xpath('''//table[@class='hq_table']/tr[position()>1]/td[5]/text()''')
unit = e.xpath('''//table[@class='hq_table']/tr[position()>1]/td[6]/text()''')
data = e.xpath('''//table[@class='hq_table']/tr[position()>1]/td[7]/text()''')
name_all = name_all+name
min_price_all = min_price_all+min_price
mean_price_all = mean_price_all+mean_price
max_price_all=max_price_all+max_price
guige_all = guige_all+guige
unit_all = unit_all+unit
data_all=data_all+data
if i % 300 == 0:
all_info = {
'名称': name_all,
'最低价格': min_price_all,
'平均价格': mean_price_all,
'最高价格': max_price_all,
'规格': guige_all,
'单位':unit_all,
'日期':data_all
}
outdata = pd.DataFrame(all_info)
print(outdata)
#outdata.to_csv('C:\\Users\geng\Desktop\蔬菜价格数据\新发地市场价格.csv', encoding='GBK')
上面这个是在网上找到的一段爬取蔬菜价格的代码,只能爬取第一页的数据,无法实现翻页
import requests #用来发送请求
from bs4 import BeautifulSoup#用来解析网页
import time#导入时间隔
import codecs
with open('新发地果蔬价格.csv','wb+') as cf:
cf.write(codecs.BOM_UTF8)
for i in range(5):#爬取第一页到第100页的数据
f = open("新发地果蔬价格.csv",mode='a',encoding="utf8")#写人文件新发地果蔬文件名,格式是csv.字符编码,a是文字
resp = requests.get(f"http://www.xinfadi.com.cn/marketanalysis/0/list/1.shtml")
print(resp)#200: ok, 302:重定向, 404:页面丢失,500:服务器错误
page_one = BeautifulSoup(resp.text, "html.parser")#通过html来,把请求的网页打印出来
#找到表格数据(table)
table = page_one.find("table",attrs={"class":"hq_table"}) #表格特征是class和hq_table
#找到所有表格的所有tr
trs = table.find_all("tr")[1:]#获取表格的内容,通过列表来,第0个元素不要了,[第一行,第二行,到三行......
for tr in trs:#tr是每一行内容,在所有的行中,遍历每一列的内容
tds = tr.find_all("td")#tds是每一列的内容,[第一列,第二列,第三列......]
name = tds[0].text.strip()#从第0项元素开始取值,取文本形式的内容数据,strip去掉左右两边所有的空格
lowest = tds[1].text.strip()#获取到数据,去掉左右空格
avg = tds[2].text.strip()#获取平均价数据,去掉左右空格
highest = tds[3].text.strip()#获取最高价,去掉左右空格
fenlei = tds[4].text.strip()#获取分类,去掉左右的空格
danwei = tds[5].text.strip()#获取单位数据,去掉左右的空格
date = tds[6].text.strip()#获取日期数据,去掉左右空ge
f.write(f"{name},{lowest},{avg},{highest},{fenlei},{danwei},{date}\n")#把每种果蔬的名字.日期,价格写入,且换行
#print(f"{name},{lowest},{avg},{highest},{fenlei},{danwei},{date}")
print(f"一个页面完事了{i}")
time.sleep(1)#防止服务器蹦了,间隔一秒钟
这是另一段代码,功能相同,也是无法翻页爬取,请问怎么才能翻页爬取内容