# -*- coding: utf-8 -*-
import scrapy
import re
import os
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.project import get_project_settings
class HuanqiuSpider(CrawlSpider):
    """Crawl the tech.huanqiu.com 'internet' section and scrape article pages.

    NOTE(review): this answers the question previously left as a stray line at
    the end of the file ("why don't allowed_domains and rules filter the last
    two results?"): ``allowed_domains`` is enforced by OffsiteMiddleware, which
    only filters *extracted* links — it never filters ``start_urls``, and it
    does not stop the downloader from following HTTP redirects off-site. If
    off-site pages show up in the results, inspect
    ``response.meta.get('redirect_urls')`` on those responses — they most
    likely arrived via a redirect from an allowed URL. Likewise ``rules`` only
    constrains links found on crawled pages, not redirect targets.
    """

    name = 'huanqiu'  # spider name
    allowed_domains = ['tech.huanqiu.com']
    start_urls = ['https://tech.huanqiu.com/internet']

    rules = (
        # Follow only links whose URL contains "article"; do not recurse
        # further from the article pages themselves (follow=False).
        Rule(LinkExtractor(allow=(r"article",)), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract title, URL and body text from one article page.

        :param response: the downloaded article page (scrapy Response).
        :yields: a dict with ``title``, ``url`` and ``content`` keys when the
                 page contains article paragraphs; yields nothing otherwise.
        """
        title = response.xpath("//div[@class='t-container-title']/h3/text()").extract_first()
        url = response.url
        self.logger.info('parsed article %r (%s)', title, url)

        p_list = response.xpath('//article/section/p')
        if p_list:
            # BUG FIX: extract_first() returns None for a <p> whose first
            # child is not a text node (e.g. an image-only paragraph), which
            # made '\n'.join(...) raise TypeError. Join every text node of
            # each paragraph instead, so such paragraphs become ''.
            content = '\n'.join(
                ''.join(p.xpath('.//text()').extract()) for p in p_list
            )
            # Previously the scraped data was only printed and discarded;
            # yield it so pipelines / feed exports actually receive it.
            yield {'title': title, 'url': url, 'content': content}