代码如下 import re import requests import time import pandas as pd import json import time import openpyxl from bs4 import BeautifulSoup head = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'} starturl_list = ['https://cs.lianjia.com/chengjiao/'] #获取县级市的url def get_cityurls(url): request = requests.get(url,headers=head) request.encoding = 'utf-8' soup = BeautifulSoup(request.text,'html.parser') cityurls = [] prenews = soup.select('div.position>dl>dd>div>div>a') pre_news = ''.join([str(i) for i in prenews]) nameslist = re.findall("/chengjiao/[a-zA-Z0-9]+/. t",pre_news) namesliststrip = [i.lstrip('/chengjiao/').rstrip('" t') for i in nameslist] k = len(namesliststrip) i = 0 for i in range(0,1): newcity = url + '{}'.format(namesliststrip[i]) cityurls.append(newcity) i += 1 return cityurls #获取二手房每一页的url def get_pageurls(url): request = requests.get(url,headers=head) request.encoding = 'utf-8' soup = BeautifulSoup(request.text,'html.parser') totalnum = json.loads(soup.find('div',{'class':"page-box house-lst-page-box"}).get('page-data'))['totalPage']+1 pageurls_list = [] pageurls_list.append(url) for num in range(0,0): newurl = url + 'pg{}/'.format(num) pageurls_list.append(newurl) return pageurls_list #获取每一页的二手房url def get_eachurls(url): eachurl_list = [] request = requests.get(url,headers=head) request.encoding = 'utf-8' soup = BeautifulSoup(request.text,'html.parser') address_a = soup.select('li > div.info > div.title>a') for i in address_a: eachurl_list.append(i['href']) return eachurl_list def news_ershoufang(url): data_all = [] res = requests.get(url, headers=head) res.encoding = 'utf-8' soup = BeautifulSoup(res.text, 'html.parser') pre_data = soup.select('div.content > ul > li') pre_datanews = ''.join([str(i) for i in pre_data]) # 城市 data_all.append('长沙') # 室厅厨卫 shi = re.findall(u"房屋户型</span>[\d\u4e00-\u9fa5]+", pre_datanews) if len(shi) == 0: data_all.append('None') else: shi_news = shi[0].lstrip('房屋户型</span>') data_all.append(shi_news) data_pageurls = [] a = [] data_eachurls = [] alldata = [] city_list = get_cityurls(starturl_list[0]) #得到每页的url m = 1 for i in city_list: try: a = get_pageurls(i) data_pageurls.extend(a) print('得到第{}页网址成功'.format(m)) except: print('得到第{}页网址不成功'.format(m)) m +=1 #得到每个房子信息的url n = 1 for i in data_pageurls: try: b = get_eachurls(i) data_eachurls.extend(b) print('得到第{}个房子网址成功'.format(n)) except: print('得到第{}个房子网址不成功'.format(n)) n +=1 #得到每户房子信息 r = 1 for i in data_eachurls: try: c = news_ershoufang(i) alldata.append(c) print('得到第{}户房子信息成功'.format(r),[0]) except: print('得到第{}户房子信息不成功'.format(r)) time.sleep(5) r +=1 df = pd.DataFrame(alldata) df.columns = ['房屋户型'] df.to_excel('长沙.xlsx')
爬取有结果,但是导出excel文件是空的
- 写回答
- 好问题 0 提建议
- 追加酬金
- 关注问题
- 邀请回答
-
2条回答 默认 最新
- 幻灰龙 2021-03-12 08:14关注
直接答案:
- 在函数 `news_ershoufang` 最后一行返回下数据 `return data_all`
- 修改下excel的列名,一共两列 `df.columns = ['城市', '房屋户型']`
- 如果有其他信息需要提取,同样的方式,一个是修改 `news_ershoufang` 内的解析,另一个是添加导出的列名
另外,其实你多加几个日志就可以诊断出来,最后一个日志打印出来的 `c` 是 None,那么你就要怀疑`news_ershoufang` 函数内是否有什么问题,你再去看下那个函数,内部的 `data_all` 并没有 `return`,从而获得一步的解决,再执行下,诊断进一步的问题,如此反复:
- print('得到第{}页网址成功:{}'.format(m, a))
- print('得到第{}个房子网址成功:{}'.format(n, b))
- print('得到第{}户房子({})信息成功:{}'.format(r, i, c))
本回答被题主选为最佳回答 , 对您是否有帮助呢?解决评论 打赏 举报无用 1