想要获取所有子网页的标题,但为什么只有一小部分标题获取成功,而大部分都为空?(所有子网页的 url 都可以正常得到)
代码如下
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from tutorial.items import TutorialItem
class herbSpider(scrapy.Spider):
    """Crawl the pharmnet.com.cn herbal-knowledge listing pages.

    ``parse`` handles a listing page: it schedules every article link found
    on it plus the numbered listing pages. ``parse_name`` handles an article
    page and yields one ``TutorialItem`` with its title and URL.
    """

    name = 'herbalism'
    # Scrapy reads the attribute ``allowed_domains``; the previous spelling
    # (``allowed_domins``) was silently ignored. The value must also cover the
    # host actually crawled: ``www.pharmnet.com.cn`` is a subdomain of
    # ``pharmnet.com.cn``, not of ``pharmnet.com``.
    # NOTE(review): if article links point at another domain, add it here,
    # otherwise the off-site filter will drop those requests.
    allowed_domains = ['pharmnet.com.cn']
    start_urls = ['http://www.pharmnet.com.cn/tcm/knowledge/ycrs/']

    # XPath candidates for the article title, tried in order. The first is
    # the original, layout-specific selector; the others are fallbacks for
    # pages whose markup differs (presumably the cause of the empty titles --
    # confirm against a failing page with ``scrapy shell <url>``).
    _TITLE_XPATHS = (
        '//font[@color="#300901"]/h1/text()',
        '//h1//text()',
        '//title/text()',
    )

    def parse(self, response):
        """Yield requests for each article link and for listing pages 1-9.

        Args:
            response: the downloaded listing page.

        Yields:
            ``Request`` objects for article pages (handled by ``parse_name``)
            and for further listing pages (handled by ``parse``).
        """
        hrefs = response.xpath(
            '//td[@height="22"]/a[@target="_blank"]/@href'
        ).extract()
        for href in hrefs:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which ``Request`` would otherwise reject.
            yield Request(response.urljoin(href), callback=self.parse_name)

        # Every listing page re-yields the same pagination URLs; Scrapy's
        # default duplicate filter discards the repeats.
        for i in range(1, 10):
            page_url = 'http://www.pharmnet.com.cn/tcm/knowledge/ycrs/index{}.html'.format(i)
            yield Request(page_url, callback=self.parse)

    def parse_name(self, response):
        """Yield a ``TutorialItem`` holding the page title and URL.

        Args:
            response: the downloaded article page.

        Yields:
            One item whose ``title`` is the list of text nodes matched by the
            first XPath in ``_TITLE_XPATHS`` that finds non-blank text (an
            empty list if none does) and whose ``link`` is the response URL.
        """
        title = []
        for xpath in self._TITLE_XPATHS:
            candidate = response.xpath(xpath).extract()
            if any(text.strip() for text in candidate):
                title = candidate
                break
            if candidate and not title:
                # Keep a blank-only match from the earliest selector as a
                # last resort, so the original result is never lost.
                title = candidate

        items = TutorialItem()
        items['title'] = title
        items['link'] = response.url
        yield items
运行结果: