yubt007 · 2017-07-28 14:44

Scraping housing data from Fang.com (房天下) with Python


For my thesis I need second-hand housing (二手房) data from Fang.com, so I adapted 竹间为简's rental-scraping code from Jianshu to crawl the listings.
The modified code is below:


from bs4 import BeautifulSoup
import urllib.request
import gzip
import inspect
import re
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool
import datetime
from time import sleep

# Fetch a page and decompress the gzip-encoded response body
def read_zip_url(url):
    fails = 0
    while fails < 5:
        try:
            content = urllib.request.urlopen(url).read()
            content = gzip.decompress(content).decode("gb18030")  # the pages are gb2312-encoded; gb18030 is a superset, so decode with it
            break
        except:
            fails += 1
            print(inspect.stack()[1][3] + ' raised an error')  # report the calling function's name
    # note: if all five attempts fail, content is never assigned and the next line raises
    soup = BeautifulSoup(content, "lxml")
    return soup

# Level 1: split the listings by district
starturl = "http://esf.km.fang.com/house/i31/"
soup = read_zip_url(starturl)
area_first_soup = soup.find_all('div', id='list_D02_10')[0].find_all('a')
del area_first_soup[-2]  # drop the two anchor tags that are not district links
del area_first_soup[0]
area_first = []
for a in area_first_soup:
    area_first.append("http://esf.km.fang.com" + a.get('href'))

# Level 2: split each district by price range
area_second = []
for url in area_first:
    soup = read_zip_url(url)
    area_second_soup = soup.find_all('li', id='list_D02_11')[0].find_all('a')
    del area_second_soup[0]
    for a in area_second_soup:
        area_second.append("http://esf.km.fang.com" + a.get('href'))

# Level 3: split each price range by house type (threaded)
area_third = []
def area_third_func(li):
    soup = read_zip_url(li)
    area_third_soup = soup.find_all('li', id='list_D02_13')[0].find_all('a')
    del area_third_soup[0]
    for a in area_third_soup:
        area_third.append("http://esf.km.fang.com" + a.get('href'))

pool = ThreadPool(4)
pool.map(area_third_func, area_second)
pool.close()
pool.join()

# Level 4: expand every category page into its numbered result pages
area_fourth = []
def area_fourth_func(li):
    soup = read_zip_url(li)
    if soup.find(text=re.compile("很抱歉")) is None:  # "很抱歉" appears when a category has no listings; skip those
        pagenum1 = soup.find_all('span', class_='txt')[0].get_text()
        pagenum = int(re.findall(r'\d+', pagenum1)[0])
        splitted = li.split('-')
        for j in range(1, pagenum + 1):
            new_url = (splitted[0] + '{0}' + splitted[1] + '{0}' + splitted[2] + '{0}' + 'i3{1}').format('-', j)
            area_fourth.append(new_url)

pool = ThreadPool(4)
pool.map(area_fourth_func, area_third)
pool.close()
pool.join()

# Collect the individual listing links from every result page
finalinks = []
def get_links(li):
    soup = read_zip_url(li)
    urlist = soup.select('a[href^="/chushou/"]')
    for a in urlist:
        href = 'http://esf.km.fang.com' + a.get('href')
        if href not in finalinks:
            finalinks.append(href)
    sleep(0.1)

pool = ThreadPool(4)
pool.map(get_links, area_fourth)
pool.close()
pool.join()

# Deduplicate and save the links, stamped with today's date
today = datetime.date.today().strftime("%Y%m%d")
finalinks = pd.DataFrame(finalinks)
finalinks = finalinks.drop_duplicates()
finalinks.to_csv('sf_links' + today + '.csv')
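
One part I am not sure about is read_zip_url: it assumes every response body is gzip-compressed and silently swallows any exception, so when a page comes back uncompressed (or the request is blocked) the function ends up handing unset content to BeautifulSoup. Below is a rough sketch of a more defensive fetcher that could be swapped in; the fetch_soup name, the User-Agent value, the timeout and the gzip magic-byte check are my own assumptions, not something taken from the 竹间为简 original.

# Sketch only: a more defensive replacement for read_zip_url (same idea: URL in, BeautifulSoup out)
import gzip
import urllib.request
from time import sleep
from bs4 import BeautifulSoup

def fetch_soup(url, retries=5):
    headers = {
        "User-Agent": "Mozilla/5.0",   # assumed value; some servers reject urllib's default agent
        "Accept-Encoding": "gzip",
    }
    last_error = None
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=headers)
            raw = urllib.request.urlopen(req, timeout=30).read()
            # only gunzip when the body really is gzip data (magic bytes 1f 8b)
            if raw[:2] == b"\x1f\x8b":
                raw = gzip.decompress(raw)
            html = raw.decode("gb18030", errors="replace")  # gb18030 covers gb2312/gbk pages
            return BeautifulSoup(html, "lxml")
        except Exception as exc:
            last_error = exc               # keep the error instead of swallowing it
            sleep(1 + attempt)             # simple backoff before retrying
    raise RuntimeError("failed to fetch " + url) from last_error

Because this version raises instead of returning a half-built soup, whichever stage breaks first shows up immediately in the traceback.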


Any guidance would be greatly appreciated.


2 answers

  • devmiao, 2017-07-30 13:46

    Check whether there is a bug in the code. (One way to probe each stage in isolation is sketched below.)
  • zy841958835 (cloudyzhao), 2017-07-31 01:40
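
Following devmiao's suggestion, one quick way to look for the bug is to run each stage by hand on a single URL before involving the thread pools. This is only a sketch: it assumes read_zip_url from the question is already defined in the same session, and the element ids are simply the ones the original code searches for.

# Sketch: probe each stage of the pipeline in isolation (assumes read_zip_url is defined)
starturl = "http://esf.km.fang.com/house/i31/"
soup = read_zip_url(starturl)

# Stage 1: is the district block (div#list_D02_10) still on the page?
district_block = soup.find_all('div', id='list_D02_10')
print("district block found:", len(district_block) > 0)

# Stage 2: does a district page still contain the price block (li#list_D02_11)?
if district_block:
    # index 1 mirrors the original code's `del area_first_soup[0]`
    first_district = "http://esf.km.fang.com" + district_block[0].find_all('a')[1].get('href')
    district_soup = read_zip_url(first_district)
    print("price block found:", len(district_soup.find_all('li', id='list_D02_11')) > 0)

# Stage 3: do any listing links with the /chushou/ prefix appear on the start page?
print("listing links on start page:", len(soup.select('a[href^="/chushou/"]')))

If any of these checks comes back empty, the corresponding id or link pattern may have changed on the site since the original script was written, and every later stage would then be iterating over empty lists.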
