The code is as follows:
import re
import requests
import time
import json
import pandas as pd
import openpyxl  # needed by pandas for writing .xlsx
from bs4 import BeautifulSoup

head = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
starturl_list = ['https://cs.lianjia.com/chengjiao/']

# Get the URL of each district page
def get_cityurls(url):
    request = requests.get(url, headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text, 'html.parser')
    cityurls = []
    prenews = soup.select('div.position>dl>dd>div>div>a')
    pre_news = ''.join([str(i) for i in prenews])
    # Capture the district slug between /chengjiao/ and the closing quote;
    # a capture group avoids the lstrip/rstrip character-set pitfall
    nameslist = re.findall('/chengjiao/([a-zA-Z0-9]+)/" t', pre_news)
    for name in nameslist:
        cityurls.append(url + '{}/'.format(name))
    return cityurls

# Get the URL of every result page for a district
def get_pageurls(url):
    request = requests.get(url, headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text, 'html.parser')
    # The total page count is stored as JSON in the page-data attribute
    totalnum = json.loads(soup.find('div', {'class': "page-box house-lst-page-box"}).get('page-data'))['totalPage'] + 1
    pageurls_list = [url]  # page 1 is the bare district URL
    for num in range(2, totalnum):  # pages 2 .. totalPage
        pageurls_list.append(url + 'pg{}/'.format(num))
    return pageurls_list

# Get the URL of every listing on one result page
def get_eachurls(url):
    eachurl_list = []
    request = requests.get(url, headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text, 'html.parser')
    address_a = soup.select('li > div.info > div.title > a')
    for i in address_a:
        eachurl_list.append(i['href'])
    return eachurl_list

# Parse one listing page
def news_ershoufang(url):
    data_all = []
    res = requests.get(url, headers=head)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    pre_data = soup.select('div.content > ul > li')
    pre_datanews = ''.join([str(i) for i in pre_data])
    # City
    data_all.append('长沙')
    # Floor plan (rooms/halls/kitchens/baths)
    shi = re.findall(u"房屋户型</span>([\d\u4e00-\u9fa5]+)", pre_datanews)
    if len(shi) == 0:
        data_all.append('None')
    else:
        data_all.append(shi[0])

data_pageurls = []
data_eachurls = []
alldata = []
city_list = get_cityurls(starturl_list[0])

# Get the URL of every result page
m = 1
for i in city_list:
    try:
        a = get_pageurls(i)
        data_pageurls.extend(a)
        print('Got page URLs for district {}'.format(m))
    except:
        print('Failed to get page URLs for district {}'.format(m))
    m += 1

# Get the URL of every individual listing
n = 1
for i in data_pageurls:
    try:
        b = get_eachurls(i)
        data_eachurls.extend(b)
        print('Got listing URLs from page {}'.format(n))
    except:
        print('Failed to get listing URLs from page {}'.format(n))
    n += 1

# Get the details of every listing
r = 1
for i in data_eachurls:
    try:
        c = news_ershoufang(i)
        alldata.append(c)
        print('Got info for listing {}'.format(r))
    except:
        print('Failed to get info for listing {}'.format(r))
        time.sleep(5)
    r += 1

df = pd.DataFrame(alldata)
df.columns = ['房屋户型']
df.to_excel('长沙.xlsx')

The scraping itself produces results, but the exported Excel file is empty.

幻灰龙 2021-03-12 08:14

Direct answer:
- Return the data at the end of the function `news_ershoufang`: add `return data_all` as its last line
- Fix the Excel column names; there are two columns in total: `df.columns = ['城市', '房屋户型']`
- If there are other fields you want to extract, it works the same way: adjust the parsing inside `news_ershoufang`, and add the matching column name to the export (see the sketch after this list)
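
As an illustration, here is a minimal sketch of those two fixes applied to the question's code; the parsing lines are copied from the question, and only the `return` and the column list are changed:

```python
def news_ershoufang(url):
    data_all = []
    res = requests.get(url, headers=head)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    pre_data = soup.select('div.content > ul > li')
    pre_datanews = ''.join([str(i) for i in pre_data])
    data_all.append('长沙')  # city
    shi = re.findall(u"房屋户型</span>([\d\u4e00-\u9fa5]+)", pre_datanews)
    data_all.append(shi[0] if shi else 'None')  # floor plan
    return data_all  # the missing line: without it the function returns None

# ... and at the end of the script:
df = pd.DataFrame(alldata)
df.columns = ['城市', '房屋户型']  # one column name per field appended above
df.to_excel('长沙.xlsx')
```

Every additional field extracted in `news_ershoufang` then needs exactly one more `append` and one more entry in `df.columns`, in the same order.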
Beyond that, a few extra log statements would have let you diagnose this yourself. The last log prints `c`, which is None, so you would suspect something is wrong inside `news_ershoufang`; looking at that function, the internal `data_all` is never `return`ed. That gives you one step of the fix; run again, diagnose the next problem, and repeat:
- `print('Got page URLs for district {}: {}'.format(m, a))`
- `print('Got listing URLs from page {}: {}'.format(n, b))`
- `print('Got info for listing {} ({}): {}'.format(r, i, c))`
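
Applied to the last loop of the question's script, the third log would look like the sketch below; binding the exception (`as e`) is an addition here, so that failures surface as well:

```python
r = 1
for i in data_eachurls:
    try:
        c = news_ershoufang(i)
        alldata.append(c)
        # With the unfixed function this prints "... : None" every time,
        # which points the suspicion straight at news_ershoufang
        print('Got info for listing {} ({}): {}'.format(r, i, c))
    except Exception as e:
        print('Failed to get info for listing {}: {}'.format(r, e))
        time.sleep(5)
    r += 1
```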
This answer was accepted by the asker as the best answer.