CaiMH436
2021-03-12 02:16
采纳率: 100%
浏览 133
已采纳

爬取有结果,但是导出excel文件是空的

代码如下
import re
import requests
import time
import pandas as pd
import json
import time
import openpyxl


from bs4 import BeautifulSoup

head = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
starturl_list = ['https://cs.lianjia.com/chengjiao/']

#获取县级市的url
def get_cityurls(url):
    request = requests.get(url,headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text,'html.parser')
    cityurls = []
    prenews = soup.select('div.position>dl>dd>div>div>a')
    pre_news =  ''.join([str(i) for i in prenews])
    nameslist = re.findall("/chengjiao/[a-zA-Z0-9]+/. t",pre_news)
    namesliststrip = [i.lstrip('/chengjiao/').rstrip('" t')  for i in nameslist]
    k = len(namesliststrip)
    i = 0
    for i in range(0,1):
        newcity = url + '{}'.format(namesliststrip[i])
        cityurls.append(newcity)
        i += 1
    return cityurls

#获取二手房每一页的url
def get_pageurls(url):
    request = requests.get(url,headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text,'html.parser')
    totalnum = json.loads(soup.find('div',{'class':"page-box house-lst-page-box"}).get('page-data'))['totalPage']+1
    pageurls_list = []
    pageurls_list.append(url)
    for num in range(0,0):
        newurl = url + 'pg{}/'.format(num)
        pageurls_list.append(newurl)
    return pageurls_list

#获取每一页的二手房url
def get_eachurls(url):
    eachurl_list = []
    request = requests.get(url,headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text,'html.parser')
    address_a = soup.select('li > div.info > div.title>a')
    for i in address_a:
        eachurl_list.append(i['href'])
    return eachurl_list


def news_ershoufang(url):
    data_all = []
    res = requests.get(url, headers=head)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    pre_data = soup.select('div.content > ul > li')
    pre_datanews = ''.join([str(i) for i in pre_data])
    # 城市
    data_all.append('长沙')

    # 室厅厨卫
    shi = re.findall(u"房屋户型</span>[\d\u4e00-\u9fa5]+", pre_datanews)
    if len(shi) == 0:
        data_all.append('None')
    else:
        shi_news = shi[0].lstrip('房屋户型</span>')
        data_all.append(shi_news)

data_pageurls = []
a = []
data_eachurls = []
alldata = []


city_list = get_cityurls(starturl_list[0])

#得到每页的url
m = 1
for i in city_list:
    try:
        a = get_pageurls(i)
        data_pageurls.extend(a)
        print('得到第{}页网址成功'.format(m))
    except:
        print('得到第{}页网址不成功'.format(m))
    m +=1

#得到每个房子信息的url
n = 1
for i in data_pageurls:
    try:
        b = get_eachurls(i)
        data_eachurls.extend(b)
        print('得到第{}个房子网址成功'.format(n))
    except:
        print('得到第{}个房子网址不成功'.format(n))
    n +=1

#得到每户房子信息
r = 1
for i in data_eachurls:
    try:
        c = news_ershoufang(i)
        alldata.append(c)
        print('得到第{}户房子信息成功'.format(r),[0])
    except:
        print('得到第{}户房子信息不成功'.format(r))
        time.sleep(5)
    r +=1

df = pd.DataFrame(alldata)
df.columns = ['房屋户型']
df.to_excel('长沙.xlsx')
  • 写回答
  • 好问题 提建议
  • 关注问题
  • 收藏
  • 邀请回答

2条回答 默认 最新

  • 幻灰龙 2021-03-12 08:14
    已采纳

    直接答案:

    1. 在函数 `news_ershoufang` 最后一行返回下数据 `return data_all`
    2. 修改下excel的列名,一共两列 `df.columns = ['城市', '房屋户型']`
    3. 如果有其他信息需要提取,同样的方式,一个是修改 `news_ershoufang` 内的解析,另一个是添加导出的列名

    另外,其实你多加几个日志就可以诊断出来,最后一个日志打印出来的 `c` 是 None,那么你就要怀疑`news_ershoufang` 函数内是否有什么问题,你再去看下那个函数,内部的 `data_all` 并没有 `return`,从而获得一步的解决,再执行下,诊断进一步的问题,如此反复

    • print('得到第{}页网址成功:{}'.format(m, a))
    • print('得到第{}个房子网址成功:{}'.format(n, b))
    • print('得到第{}户房子({})信息成功:{}'.format(r, i, c))
    已采纳该答案
    评论
    解决 无用
    打赏 举报
  • SoftwareTeacher 2021-03-24 16:33

    如果回答解决了问题, 请点击 ‘采纳’ 

    评论
    解决 无用
    打赏 举报

相关推荐 更多相似问题