CaiMH436
2021-03-12 02:16
采纳率: 100%
浏览 257
已采纳

爬取有结果,但是导出excel文件是空的

代码如下
import re
import requests
import time
import pandas as pd
import json
import time
import openpyxl


from bs4 import BeautifulSoup

head = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
starturl_list = ['https://cs.lianjia.com/chengjiao/']

#获取县级市的url
def get_cityurls(url):
    request = requests.get(url,headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text,'html.parser')
    cityurls = []
    prenews = soup.select('div.position>dl>dd>div>div>a')
    pre_news =  ''.join([str(i) for i in prenews])
    nameslist = re.findall("/chengjiao/[a-zA-Z0-9]+/. t",pre_news)
    namesliststrip = [i.lstrip('/chengjiao/').rstrip('" t')  for i in nameslist]
    k = len(namesliststrip)
    i = 0
    for i in range(0,1):
        newcity = url + '{}'.format(namesliststrip[i])
        cityurls.append(newcity)
        i += 1
    return cityurls

#获取二手房每一页的url
def get_pageurls(url):
    request = requests.get(url,headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text,'html.parser')
    totalnum = json.loads(soup.find('div',{'class':"page-box house-lst-page-box"}).get('page-data'))['totalPage']+1
    pageurls_list = []
    pageurls_list.append(url)
    for num in range(0,0):
        newurl = url + 'pg{}/'.format(num)
        pageurls_list.append(newurl)
    return pageurls_list

#获取每一页的二手房url
def get_eachurls(url):
    eachurl_list = []
    request = requests.get(url,headers=head)
    request.encoding = 'utf-8'
    soup = BeautifulSoup(request.text,'html.parser')
    address_a = soup.select('li > div.info > div.title>a')
    for i in address_a:
        eachurl_list.append(i['href'])
    return eachurl_list


def news_ershoufang(url):
    data_all = []
    res = requests.get(url, headers=head)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    pre_data = soup.select('div.content > ul > li')
    pre_datanews = ''.join([str(i) for i in pre_data])
    # 城市
    data_all.append('长沙')

    # 室厅厨卫
    shi = re.findall(u"房屋户型</span>[\d\u4e00-\u9fa5]+", pre_datanews)
    if len(shi) == 0:
        data_all.append('None')
    else:
        shi_news = shi[0].lstrip('房屋户型</span>')
        data_all.append(shi_news)

data_pageurls = []
a = []
data_eachurls = []
alldata = []


city_list = get_cityurls(starturl_list[0])

#得到每页的url
m = 1
for i in city_list:
    try:
        a = get_pageurls(i)
        data_pageurls.extend(a)
        print('得到第{}页网址成功'.format(m))
    except:
        print('得到第{}页网址不成功'.format(m))
    m +=1

#得到每个房子信息的url
n = 1
for i in data_pageurls:
    try:
        b = get_eachurls(i)
        data_eachurls.extend(b)
        print('得到第{}个房子网址成功'.format(n))
    except:
        print('得到第{}个房子网址不成功'.format(n))
    n +=1

#得到每户房子信息
r = 1
for i in data_eachurls:
    try:
        c = news_ershoufang(i)
        alldata.append(c)
        print('得到第{}户房子信息成功'.format(r),[0])
    except:
        print('得到第{}户房子信息不成功'.format(r))
        time.sleep(5)
    r +=1

df = pd.DataFrame(alldata)
df.columns = ['房屋户型']
df.to_excel('长沙.xlsx')
  • 写回答
  • 好问题 提建议
  • 追加酬金
  • 关注问题
  • 邀请回答

2条回答 默认 最新

相关推荐 更多相似问题