from pandas import DataFrame as DF
import requests
from bs4 import BeautifulSoup as BS
url = 'https://wh.lianjia.com/ershoufang/wuchang/'
urls = ['https://wh.lianjia.com/ershoufang/wuchang/pg{}'.format(i) for i in range(1, 101)]
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}
html = requests.get(url,headers=headers)
bs_house = BS(html.text,'html.parser')
house_data = DF(columns=['标题','小区','地区','户型','面积','朝向','装修情况','楼层','楼型','总价','单价'])
for url in urls:
    html = requests.get(url, headers=headers)
    title = bs_house.findAll('div', {'class': 'title'})
    position_info = bs_house.findAll('div', {'class': 'positionInfo'})
    house_info = bs_house.findAll('div', {'class': 'houseInfo'})
    total_price = bs_house.findAll('div', {'class': 'totalPrice totalPrice2'})
    unit_price = bs_house.findAll('div', {'class': 'unitPrice'})
    for i, j, k, a, b in zip(title, position_info, house_info, total_price, unit_price):
        data = {
            '标题': i.text.strip(),
            '小区': j.text.split('-')[0].strip(),
            '地区': j.text.split('-')[1].strip(),
            '户型': k.text.split('|')[0].strip(),
            '面积': k.text.split('|')[1].strip(),
            '朝向': k.text.split('|')[2].strip(),
            '装修情况': k.text.split('|')[3].strip(),
            '楼层': k.text.split('|')[4].strip(),
            '楼型': k.text.split('|')[5].strip(),
            '总价': a.text.strip(),
            '单价': b.text.strip()
        }
        house_data = house_data._append(data, ignore_index=True)
house_data.to_excel(r'E:\secondhandhouse\wuchang_secondhand_house.xlsx')
Why does this keep scraping the first page over and over? Can anyone help me fix it?
Quoted from the free WeChat mini program: 皆我百晓生
The problem in your code is in the HTML-parsing step.
The bs_house variable only ever holds the HTML of the initial URL ('https://wh.lianjia.com/ershoufang/wuchang/'). Inside the loop you do request each paginated URL, but the newly fetched HTML is never handed to BeautifulSoup, so every iteration searches the same page-1 parse tree. To fix this, create a new BeautifulSoup object inside each loop iteration and pass it that page's HTML.
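To see the failure mode in isolation, here is a stripped-down sketch of your loop (reusing the imports, urls, and headers from your script; nothing new is introduced):

# Parsed ONCE, before the loop -- the only HTML BeautifulSoup ever sees
soup = BS(requests.get('https://wh.lianjia.com/ershoufang/wuchang/', headers=headers).text, 'html.parser')
for url in urls:
    requests.get(url, headers=headers)        # page i is downloaded, but the response is discarded
    soup.findAll('div', {'class': 'title'})   # still queries the page-1 soup every time

Here is the modified code: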
from pandas import DataFrame as DF
import requests
from bs4 import BeautifulSoup as BS

urls = ['https://wh.lianjia.com/ershoufang/wuchang/pg{}'.format(i) for i in range(1, 101)]
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}

# Collect one dict per listing and build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0, and repeated appends are slow anyway)
rows = []

# Iterate through the URLs and scrape the data
for url in urls:
    # Get the HTML content for each URL
    response = requests.get(url, headers=headers)
    # Parse THIS page's HTML with BeautifulSoup -- the crucial fix
    bs_house = BS(response.text, 'html.parser')
    # Scrape the listing fields from the current page
    title = bs_house.findAll('div', {'class': 'title'})
    position_info = bs_house.findAll('div', {'class': 'positionInfo'})
    house_info = bs_house.findAll('div', {'class': 'houseInfo'})
    total_price = bs_house.findAll('div', {'class': 'totalPrice totalPrice2'})
    unit_price = bs_house.findAll('div', {'class': 'unitPrice'})
    # Assemble one record per listing card
    for i, j, k, a, b in zip(title, position_info, house_info, total_price, unit_price):
        rows.append({
            '标题': i.text.strip(),
            '小区': j.text.split('-')[0].strip(),
            '地区': j.text.split('-')[1].strip(),
            '户型': k.text.split('|')[0].strip(),
            '面积': k.text.split('|')[1].strip(),
            '朝向': k.text.split('|')[2].strip(),
            '装修情况': k.text.split('|')[3].strip(),
            '楼层': k.text.split('|')[4].strip(),
            '楼型': k.text.split('|')[5].strip(),
            '总价': a.text.strip(),
            '单价': b.text.strip()
        })

# Save the DataFrame to an Excel file
house_data = DF(rows, columns=['标题','小区','地区','户型','面积','朝向','装修情况','楼层','楼型','总价','单价'])
house_data.to_excel(r'E:\secondhandhouse\wuchang_secondhand_house.xlsx')

This revised version parses each page's HTML inside the loop, so it no longer re-reads the same page; the data from all the pages is extracted and written to the Excel file.
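One further hardening step worth considering (a sketch of my own, not something the fix above requires): the site may answer with an error or verification page, and a listing card can carry fewer houseInfo fields than the six indexed above, which would raise an IndexError. A defensive drop-in for the loop body, reusing the names already defined, could look like this:

for url in urls:
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        continue  # skip pages that were blocked or failed to load
    bs_house = BS(response.text, 'html.parser')
    cards = zip(
        bs_house.findAll('div', {'class': 'title'}),
        bs_house.findAll('div', {'class': 'positionInfo'}),
        bs_house.findAll('div', {'class': 'houseInfo'}),
        bs_house.findAll('div', {'class': 'totalPrice totalPrice2'}),
        bs_house.findAll('div', {'class': 'unitPrice'})
    )
    for i, j, k, a, b in cards:
        fields = [s.strip() for s in k.text.split('|')]
        if len(fields) < 6:
            continue  # houseInfo had fewer fields than the six indexed below
        rows.append({
            '标题': i.text.strip(),
            '小区': j.text.split('-')[0].strip(),
            '地区': j.text.split('-')[1].strip(),
            '户型': fields[0], '面积': fields[1], '朝向': fields[2],
            '装修情况': fields[3], '楼层': fields[4], '楼型': fields[5],
            '总价': a.text.strip(),
            '单价': b.text.strip()
        })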
This answer was accepted by the asker as the best answer.