以下是代码,发现response.url一直是“http://book.douban.com/top250”,没有继续跟进去,求大神帮忙解决 不胜感激
books.py
!/usr/bin/pyhon
-*- coding: utf-8 -*-
coding=utf-8
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors import LinkExtractor
from douban.items import DoubanItem
class BooksSpider(CrawlSpider):
name = "BooksSpider"
allowed_domains = ["book.douban.com"]
start_urls = [
"http://book.douban.com/top250"
]
rules = (
Rule(LinkExtractor(allow=
r'https://book.douban.com/top250\?start=\d+'),callback="parse"),
Rule(LinkExtractor(allow=
r'https://book.douban.com/subject/\d+'),callback="parse"),
)
def parse(self, response):
sel = Selector(response=response)
item = DoubanItem()
item['name'] = sel.xpath("//h1")[0].extract().strip()
try:
contents = sel.xpath("//div[@id='link-report']/p//text()").extract()
item['content_desc'] = "\n".join(content for content in contents)
except:
item['content_desc'] = " "
try:
profiles = sel.xpath("//div[@class='related_info']/div[@class='indent']")[1].xpath("//div[@class='intro']/p/text()").extract()
item['author_profile'] = "\n".join(profile for profile in profiles)
except:
item['author_profile'] = " "
datas = response.xpath("//div[@id='info']//text()").extract()
datas = [data.strip() for data in datas]
datas = [data for data in datas if data !='']
for data in datas:
if u"作者" in data:
item["author"] = datas[datas.index(data)+1]
elif u":" not in data:
item["author"] = datas[datas.index(data)+2]
elif u"出版社:" in data:
item["press"] = datas[datas.index(data)+1]
elif u"出版年:" in data:
item["date"] = datas[datas.index(data)+1]
elif u"页数:" in data:
item["page"] = datas[datas.index(data)+1]
elif u"定价:" in data:
item["price"] = datas[datas.index(data)+1]
elif u"ISBN:" in data:
item["ISBN"] = datas[datas.index(data)+1]
print item
return item