python房天下爬虫问题

def __init__(self):
    """Set up the start URLs, request headers/cookies and output column names."""
    # Positional slice 30..44 of the 'https' column of the city spreadsheet.
    city_sheet = pd.read_excel('xinfangcitymatch.xlsx')
    self.start_urls = city_sheet['https'][30:45]
    # Filled by run_spider as {city name: [listing URLs already scraped]}.
    self.quchong = {}

    # NOTE(review): this stores the RequestsCookieJar class itself, not an
    # instance (no call parentheses) -- confirm whether that is intended.
    self.cookies = RequestsCookieJar

    # NOTE(review): "ox64" looks like a typo for "x64"; kept as-is.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; ox64) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36')
    self.headers = {
        'User-Agent': user_agent,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6',
    }
    # Cookies sent with every request made by get_html / get_html_two.
    self.cookies1 = dict(
        global_cookie='2a8hamrvwdz0punlkee3ifojo26jyqn1d1y',
        unique_cookie='U_2a8hamrvwdz0punlkee3ifojo26jyqn1d1y*1',
        city='www',
    )
    # Header row written to each new workbook, in column order.
    self.excel_head = [
        'date', 'city', 'perprice', 'xiaqu', 'fangwu', 'zhuangxiu', 'huanxian',
        'zhandi', 'jianmian', 'rongji', 'lvhua', 'chewei', 'wuye',
    ]
    # Date stamp used in output file names and in the 'date' column.
    self.today_str = datetime.now().strftime('%Y-%m-%d')
def decode_html(self, content):
    """Decode a raw response body to text.

    Strict UTF-8 is tried first; only when the bytes are not valid UTF-8 is
    the body decoded as gb2312 (ignoring undecodable bytes). Decoding a UTF-8
    page as gb2312 does not fail, it silently yields wrong characters (the
    UTF-8 bytes of "洛阳" come out as "娲"), so the order matters.

    :param content: response body as bytes
    :return: decoded text
    """
    try:
        return content.decode('utf-8')
    except UnicodeDecodeError:
        return content.decode('gb2312', errors='ignore')


def get_html(self, url, max_retries=100):
    """Fetch ``url`` and return its decoded HTML, retrying on failure.

    The request is retried (with a one-second pause) on network errors and on
    any non-200 status, up to ``max_retries`` attempts. A loop is used rather
    than self-recursion so a long outage cannot end in ``RecursionError``.

    :param url: page to fetch; also sent as the Referer header
    :param max_retries: maximum number of attempts before giving up
    :return: decoded page text, or ``None`` when every attempt failed
    """
    self.headers['Referer'] = url
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=self.headers, timeout=5,
                                    allow_redirects=False, cookies=self.cookies1)
        except requests.RequestException:
            print(url)
            print('网络不好,正在重新爬取')
            time.sleep(1)
            continue

        status = response.status_code
        if status == 200:
            return self.decode_html(response.content)
        if status == 302:
            # Redirects are not followed; the site answers 302 when access
            # is restricted.
            print(url)
            print('访问受限')
        else:
            print(status)
        time.sleep(1)
    return None

def get_html_two(self, url, max_retries=100):
    """Fetch ``url`` and return its body decoded as UTF-8, retrying on failure.

    Every retry stays in this function, so a page that needed a retry is still
    decoded as UTF-8. (Previously the retry paths delegated to ``get_html``,
    which returned a gb2312-decoded body instead.) Retries are a bounded loop
    with a one-second pause between attempts.

    :param url: page to fetch; also sent as the Referer header
    :param max_retries: maximum number of attempts before giving up
    :return: decoded page text, or ``None`` when every attempt failed
    """
    self.headers['Referer'] = url
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=self.headers, timeout=5,
                                    allow_redirects=False, cookies=self.cookies1)
        except requests.RequestException:
            print(url)
            print('网络不好,正在重新爬取')
            time.sleep(1)
            continue

        status = response.status_code
        if status == 200:
            return response.content.decode('utf8', errors='ignore')
        if status == 302:
            # Redirects are not followed; the site answers 302 when access
            # is restricted.
            print(url)
            print('访问受限')
        else:
            print(status)
        time.sleep(1)
    return None

def parse(self, current_city_url, html, city_name):
    """Scrape all listing pages of one city into today's workbook.

    Starting from the already-fetched first listing page ``html``, every page
    from 1 to the last page number is processed, including the final one. For
    each listing the detail page and its "more details" page are fetched and
    one row is written. At most 1000 rows are written per call.

    :param current_city_url: city base URL, used to build ``house/s/b9<N>/``
    :param html: decoded HTML of the first listing page
    :param city_name: city name, used in the file name, the rows and as the
        key into ``self.quchong``
    :return: None
    """
    file_name = f'C:/Users/DELL/Desktop/新房/{city_name}{self.today_str}房天下新房.xlsx'
    if not os.path.exists(file_name):
        # New file: create it with just the header row.
        wb = openpyxl.Workbook()
        ws = wb.worksheets[0]
        self.save_to_excel(ws, 0, self.excel_head)
        wb.save(file_name)
    row_count = 1
    html_eles = etree.HTML(html)
    # The "last page" link carries the total page count.
    total_href = html_eles.xpath('//a[@class="last"]/@href')
    if not total_href:
        print(city_name + '解析分页方式有误')
        return
    # e.g. ".../house/s/b912/" -> 12
    total_pn = int(total_href[0].split('/')[-2].replace('b9', ''))
    # URLs already scraped for this city; created on demand so that calling
    # parse() directly does not skip every listing on a KeyError.
    seen_urls = self.quchong.setdefault(city_name, [])

    def district_from(texts):
        # Text inside the first [...] of the first node, or '无'.
        found = re.findall(r'\[(.*?)\]', texts[0].strip()) if texts else []
        return found[0] if found else '无'

    def labelled_texts(root, label):
        # Text nodes of the <div> that follows the <div> whose text is `label`.
        return root.xpath(f'//div[text()="{label}"]/following-sibling::div[1]/text()')

    for pn in range(1, total_pn + 1):
        wb = openpyxl.load_workbook(file_name)
        ws = wb.worksheets[0]
        print(f'-----------{city_name}第{pn}/{total_pn}页---------------')
        # All listings on the current page.
        house_eles = html_eles.xpath('//div[@id="newhouse_loupai_list"]/ul/li')

        for house_ele in house_eles:
            house_href = house_ele.xpath('.//div[@class="nlcd_name"]/a/@href')
            if not house_href:
                # No listing link: treated as an advertisement and skipped.
                continue
            house_url = 'https:' + house_href[0]  # href is protocol-relative
            if house_url in seen_urls:
                continue
            try:
                # Average price (perprice)
                perprice = ''.join(
                    field.replace('/O', '/㎡')
                    for field in house_ele.xpath('.//div[@class="nhouse_price"]//text()')
                    if field.isdigit() or '元' in field
                )
                perprice = perprice if perprice else '价格待定'
                # District (xiaqu): address link first, grey span as fallback.
                xiaqu = district_from(house_ele.xpath('.//div[@class="address"]/a/text()'))
                if xiaqu == '无':
                    xiaqu = district_from(house_ele.xpath('.//span[@class="sngrey"]/text()'))
                # Floor area (fangwu)
                fangwu = re.sub('\t|\n|－', '', house_ele.xpath('.//div[@class="house_type clearfix"]//text()')[-1])
                fangwu = fangwu if fangwu else '无'

                # Second-level page; if the city name is missing from the
                # text, fetch again through the UTF-8 variant.
                detail_html = self.get_html(house_url)
                if city_name not in detail_html:
                    detail_html = self.get_html_two(house_url)

                # Link to the "more details" (third-level) page.
                more_info_url = etree.HTML(detail_html).xpath('//a[text()="更多详细信息>>"]/@href')[0]
                if '//' not in more_info_url:
                    continue
                info_html = self.get_html('https:' + more_info_url)
                if city_name not in info_html:
                    info_html = self.get_html_two('https:' + more_info_url)
                info_ele = etree.HTML(info_html)

                # Decoration status (zhuangxiu): optional, empty when absent.
                zhuangxiu_texts = labelled_texts(info_ele, '装修状况：')
                zhuangxiu = re.sub(r'[\n\t ]', '', zhuangxiu_texts[0]) if zhuangxiu_texts else ''
                # Ring-road position (huanxian): optional, '无' when absent.
                huanxian_texts = labelled_texts(info_ele, '环线位置：')
                huanxian = re.sub(r'[\n\t ]', '', huanxian_texts[0]) if huanxian_texts else '无'
                # Land area, building area, plot ratio, greening rate,
                # parking spaces, property fee. These are required: a missing
                # one raises IndexError and the listing is skipped.
                zhandi = labelled_texts(info_ele, '占地面积：')[0]
                jianmian = labelled_texts(info_ele, '建筑面积：')[0]
                guihua_info = info_ele.xpath('//h3[text()="小区规划"]/following-sibling::ul[1]/li')
                rongji = guihua_info[2].xpath('./div[2]/text()')[0].strip('\xa0 ')
                lvhua = guihua_info[3].xpath('./div[2]/text()')[0]
                chewei = guihua_info[4].xpath('./div[2]/text()')[0]
                wuye = guihua_info[8].xpath('./div[2]/text()')[0].replace('/O', '/㎡')
            except Exception as e:
                # Best effort: one malformed listing must not stop the city,
                # but say why it was skipped instead of dropping it silently.
                print(f'跳过楼盘:{house_url} {e!r}')
                continue

            if row_count > 1000:
                wb.save(file_name)
                return
            print(perprice, xiaqu, fangwu, zhuangxiu, huanxian, zhandi, jianmian, rongji, lvhua, chewei, wuye)
            print(f'正在爬取:{city_name}-->第{row_count}条新房信息', )
            self.save_to_excel(ws, row_count, [self.today_str, city_name, perprice, xiaqu, fangwu, zhuangxiu, huanxian, zhandi, jianmian, rongji, lvhua, chewei, wuye])
            row_count += 1
            seen_urls.append(house_url)  # remember it for de-duplication

        wb.save(file_name)
        if pn == total_pn:
            break  # last page has been processed; nothing more to fetch
        next_url = current_city_url + 'house/s/b9' + str(pn + 1) + '/'
        html = self.get_html(next_url)
        if not html:
            return
        html_eles = etree.HTML(html)


def run_spider(self, city_url_list):
    """Scrape every city in ``city_url_list``, one after another.

    For each city the first listing page is fetched, the city name is read
    from it, a fresh de-duplication list is registered under that name in
    ``self.quchong`` and ``parse`` is run. A failure in one city is reported
    (URL plus the exception) and the loop moves on to the next city.

    :param city_url_list: iterable of city base URLs
    :return: None
    """
    # Compiled once: the pattern is the same for every city.
    city_name_pattern = re.compile('class="s4Box"><a href="#">(.*?)</a>')
    for city_url in city_url_list:
        current_city_url = city_url
        if 'house/s/' not in city_url:
            city_url = city_url + 'house/s/'
        try:
            html = self.get_html(city_url)
            names = city_name_pattern.findall(html) if html else []
            if not names:
                print(city_url)
                print('未能解析城市名')
                continue
            city_name = names[0]
            self.quchong[city_name] = []  # {'city': [listing URLs]} for de-duplication
            self.parse(current_city_url, html, city_name)
        except Exception as e:
            # Keep going with the remaining cities, but show what went wrong.
            print(city_url)
            print(f'爬取失败: {e!r}')

# Split a sequence into n parts (used to hand out work to multiple threads).
def div_list(self, ls, n):
    """Split ``ls`` into ``n`` consecutive parts.

    With fewer items than parts, each item gets its own one-element list and
    the remainder is padded with empty lists. Otherwise the first ``n - 1``
    parts each hold ``int(len(ls) / n)`` items and the last part takes
    everything that is left.

    :param ls: sliceable sequence to split
    :param n: number of parts
    :return: list of ``n`` parts
    """
    size = int(len(ls) / n)
    if not size:
        singles = [[item] for item in ls]
        padding = [[] for _ in range(0, n - len(singles))]
        return singles + padding
    parts = [ls[size * k:size * (k + 1)] for k in range(0, n - 1)]
    parts.append(ls[size * (n - 1):len(ls)])
    return parts

def save_to_excel(self, ws, row_count, data):
    """Write ``data`` into one worksheet row, one value per column.

    :param ws: worksheet object exposing ``cell(row=, column=, value=)``
    :param row_count: zero-based row index; the row written is ``row_count + 1``
    :param data: values to write, starting at column 1
    """
    target_row = row_count + 1  # worksheet rows and columns are 1-based
    for column, value in enumerate(data, start=1):
        ws.cell(row=target_row, column=column, value=value)






# 单线程
# for city_url in spider.start_urls[:1]:
#     spider.run_spider([city_url])
# spider.run_spider(['https://qhd.newhouse.fang.com/'])

原本应该是“洛阳”的城市名，抓取后变成了“娲”，不知道该如何解决。

写回答
好问题 0 提建议
关注问题
分享
邀请回答
编辑收藏删除结题
收藏举报

1条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
忍气吞声埋头苦干 2021-07-19 16:16
关注
用 xpath 直接在页面取城市名吧

//div[@class="s4Box"]/a[@href="#"]/text()

你这个代码可读性太差啦。。。

本回答被题主选为最佳回答 , 对您是否有帮助呢?

解决 1
无用
评论打赏
分享
举报

评论

按下Enter换行，Ctrl+Enter发表内容

报告相同问题？

关注问题

python爬虫房天下商品房数据
2018-07-23 20:45

利用python编写了一个爬虫代码，爬取房天下商品房信息，可以更改链接地址，爬取其他信息
python爬虫20个案例
2018-03-25 07:34

讲诉python爬虫的20个案例。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。
python爬虫数据可视化分析大作业.zip
2020-06-12 15:39

在本项目中，"python爬虫数据可视化分析大作业.zip" 是一个综合性的学习资源，主要涉及了Python编程中的两个重要领域：网络爬虫（Web Scraping）和数据可视化（Data Visualization）。通过这个作业，我们可以深入...
python爬取房天下信息
2018-10-20 22:55

小白入手python爬取房天下数据，初识爬虫，加深对python的理解，上手更快。
Python 万能代码模版：爬虫代码篇
2021-09-14 15:27

AI悦创Python一对一辅导的博客但今天的 Python 课程是个例外，因为今天讲的 **Python 技能，不需要你懂计算机原理，也不需要你理解复杂的编程模式。**即使是非开发人员，只要替换链接、文件，就可以轻松完成。并且这些几个实用技巧，简直是 ...
Python爬虫详解（一看就懂）
2022-06-21 22:07

练习时长两年半的Programmer的博客爬虫简单的来说就是用程序获取网络上数据这个过程的一种名称。如果要获取网络上数据，我们要给爬虫一个网址（程序中通常叫URL），爬虫发送一个HTTP请求给目标网页的服务器，服务器返回数据给客户端（也就是我们的...
Python爬虫系列（一）——手把手教你写Python爬虫
2021-10-23 15:47

纸照片的博客这一部分我写在另一篇文章了，Python爬虫批量下载百度图片–点击跳转 5. 后言爬虫还是比较好入门的，这得益于成熟的爬虫工具。爬虫可以满足自己的个性化搜索需求，大家赶快动手试试吧。（如果觉得文章还不错的话...
python专利爬虫下载
2017-04-24 01:07

1. **Python爬虫**：Python是一种流行的编程语言，因其简洁的语法和丰富的库支持，在数据抓取方面非常受欢迎。常见的Python爬虫库包括requests（用于发送HTTP请求）、BeautifulSoup（解析HTML和XML文档）以及Scrapy...
python自动化爬虫实战
2023-05-09 09:52

吴明_yst的博客偶尔的一次复习一下爬虫
分享Python7个爬虫小案例（附源码）
2023-03-17 14:13

墨子琦的博客在这篇文章中，我们将分享7个Python爬虫的小案例，帮助大家更好地学习和了解Python爬虫的基础知识。
没有解决我的问题, 去提问

问题事件

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
已采纳回答 7月19日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
创建了问题 7月19日

python房天下爬虫问题

1条回答 默认 最新

问题事件

1条回答默认最新