由于论文需要,想要爬取房天下的二手房,于是修改简书上"竹间为简"的租房代码进行爬取。
修改后的代码为
import datetime
import gzip
import inspect
import re
import sqlite3
import urllib
import urllib.request  # needed explicitly: `import urllib` alone does not expose urllib.request in Python 3
from multiprocessing.dummy import Pool as ThreadPool
from random import choice
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
# --- Fetch and gunzip one page ---
def read_zip_url(url):
    """Download *url*, decompress the gzipped body and return a BeautifulSoup tree.

    Retries up to 5 times on any fetch/decompress/decode failure.

    Args:
        url: absolute URL of a fang.com page served gzip-compressed.

    Returns:
        BeautifulSoup: the parsed page (lxml parser).

    Raises:
        RuntimeError: if all 5 attempts fail.  (The original code fell
        through to an unbound `content` and crashed with NameError.)
    """
    for _attempt in range(5):
        try:
            raw = urllib.request.urlopen(url).read()
            # Pages declare gb2312; gb18030 is its superset and decodes safely.
            text = gzip.decompress(raw).decode("gb18030")
            return BeautifulSoup(text, "lxml")
        except Exception:  # was a bare `except:` — don't swallow KeyboardInterrupt/SystemExit
            print(inspect.stack()[1][3] + ' occurred error')
    raise RuntimeError('failed to fetch %s after 5 attempts' % url)
# --- Split the listings by district ---
starturl = "http://esf.km.fang.com/house/i31/"
start_soup = read_zip_url(starturl)
district_anchors = start_soup.find_all('div', id='list_D02_10')[0].find_all('a')
# Drop the second-to-last anchor and the leading "all districts" anchor.
del district_anchors[-2]
del district_anchors[0]
area_first = ["http://esf.km.fang.com" + a.get('href') for a in district_anchors]
# --- Split each district page further by price band ---
# Fix: the original nested loop reused `i` for both the outer district URL and
# the inner anchor, shadowing the loop variable — renamed for clarity/safety.
area_second = []
for district_url in area_first:
    soup = read_zip_url(district_url)
    price_anchors = soup.find_all('li', id='list_D02_11')[0].find_all('a')
    del price_anchors[0]  # first anchor is the "all prices" link
    for anchor in price_anchors:
        area_second.append("http://esf.km.fang.com" + anchor.get('href'))
# --- Split each price-band page by dwelling type (4 worker threads) ---
area_third = []

def area_third_func(li):
    """Append the dwelling-type sub-links of page *li* to the shared area_third list."""
    page = read_zip_url(li)
    anchors = page.find_all('li', id='list_D02_13')[0].find_all('a')
    # anchors[0] is the "all types" link — skip it.
    area_third.extend("http://esf.km.fang.com" + a.get('href') for a in anchors[1:])

pool = ThreadPool(4)
pool.map(area_third_func, area_second)
pool.close()
pool.join()
# --- Expand each category page into one URL per result page ---
area_fourth = []

def area_fourth_func(li):
    """Read the page count of category URL *li* and enqueue one URL per result page.

    Fixes over the original: `is None` instead of `== None`; dropped the
    redundant `int(pagenum)` re-cast; the page URL is no longer assembled by
    running the full concatenated URL through str.format (which would break on
    any brace character in the URL).
    """
    soup = read_zip_url(li)
    # "很抱歉" ("sorry") on the page marks an empty result set — skip it.
    if soup.find(text=re.compile("很抱歉")) is None:
        page_label = soup.find_all('span', class_='txt')[0].get_text()
        pagenum = int(re.findall(r'\d+', page_label)[0])
        parts = li.split('-')
        for page in range(1, pagenum + 1):
            area_fourth.append('{}-{}-{}-i3{}'.format(parts[0], parts[1], parts[2], page))

pool = ThreadPool(4)
pool.map(area_fourth_func, area_third)
pool.close()
pool.join()
# --- Harvest individual listing links from every result page ---
finalinks = []
_seen_links = set()  # O(1) dedup; the original did an O(n) `not in list` scan per link

def get_links(li):
    """Collect /chushou/ listing links from page *li* into finalinks, deduplicated.

    finalinks stays a list (the downstream pandas code consumes it); the
    companion set makes the membership test constant-time and narrows the
    check-then-append race window between worker threads.
    """
    soup = read_zip_url(li)
    for anchor in soup.select('a[href^="/chushou/"]'):
        href = 'http://esf.km.fang.com' + anchor.get('href')
        if href not in _seen_links:
            _seen_links.add(href)
            finalinks.append(href)
    sleep(0.1)  # small pause to be polite to the server

pool = ThreadPool(4)
pool.map(get_links, area_fourth)
pool.close()
pool.join()
# --- Persist the deduplicated links to a dated CSV file ---
today = datetime.date.today().strftime("%Y%m%d")
links_frame = pd.DataFrame(finalinks).drop_duplicates()
links_frame.to_csv('sf_links' + today + '.csv')
希望各位大大指导,感激不尽