想爬取游戏信息和评论。评论和信息是分开的,所以用了两个方法。
在 parse 中,用 yield 进入第二个方法以及回调自身都没有问题;
但在 parse_two 中,用 yield 回调自身就不行:不报错,只是请求没有被执行。
def parse(self, response):
    """Parse a game-list page.

    Yields one request per game link found on the page, pointing at that
    game's ``/review`` URL and handled by ``parse_two``.  If the pagination
    bar has a link in its last ``<li>``, also yields a request for it that
    is handled by this method again.

    :param response: the downloaded list page.
    """
    selector = scrapy.Selector(response)
    game_links = selector.xpath('//div[@class="app-item-caption"]/a[@class="item-caption-title flex-text-overflow"]/@href').extract()
    for game_link in game_links:
        # urljoin resolves a relative href against the page URL; Request
        # rejects URLs without a scheme.
        review_url = response.urljoin(game_link + '/review')
        yield scrapy.http.Request(review_url, callback=self.parse_two)

    # Next page of the game list.
    next_links = selector.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()
    if next_links:
        next_url = response.urljoin(next_links[0])
        yield scrapy.http.Request(next_url, callback=self.parse)
def parse_two(self, response):
    """Parse one page of a game's reviews.

    Yields a ``TaptapItem`` for every review block on the page, then, if
    the pagination bar has a link in its last ``<li>``, a request for it
    that is handled by this method again.

    :param response: the downloaded review page; its URL must contain
        ``/app/<digits>``, from which the game id is read.
    :raises ValueError: if no game id can be found in ``response.url``.
    """
    # Read the id from the URL path instead of a fixed character slice, so
    # ids of any length and either URL scheme are handled.
    id_match = re.search(r'/app/(\d+)', response.url)
    if id_match is None:
        raise ValueError('no game id found in URL: %s' % response.url)
    game_id = int(id_match.group(1))

    selector = scrapy.Selector(response)
    review_times = selector.xpath('//a[@class="text-header-time"]/span/@data-dynamic-time').extract()
    review_blocks = selector.xpath('//div[@class="review-item-text"]/div[@class="item-text-body"]').extract()
    review_authors = selector.xpath('//span[@class="taptap-user"]/a/text()').extract()

    # Process the reviews.  Authors and times are matched to reviews purely
    # by position in their respective lists.
    # NOTE(review): a per-day review counter was sketched here in the
    # original but was commented out; it is not implemented.
    for review_no, review_html in enumerate(review_blocks, 1):
        index = review_no - 1
        # Keep only the text inside <p>...</p> and concatenate it.
        review_text = ''.join(re.findall('<p>(.*?)</p>', review_html, re.S))
        # A shorter author/time list must not raise IndexError: that would
        # abort this generator before the next-page request is yielded.
        author = review_authors[index] if index < len(review_authors) else None
        review_time = review_times[index] if index < len(review_times) else None

        item = TaptapItem()
        item['Review_GID'] = game_id
        item['Review_content'] = review_text
        item['Review_Author'] = author
        # The key spelling 'Reivew_Time' is kept as-is; the item's fields
        # are declared outside this block.
        item['Reivew_Time'] = review_time
        yield item

        print('评论%d:' % review_no)
        print(review_time)
        print(review_text)

    # Next page of reviews.
    next_links = selector.xpath('//ul[@class="pagination"]/li[last()]/a/@href').extract()
    if next_links:
        next_url = response.urljoin(next_links[0])
        yield scrapy.http.Request(next_url, callback=self.parse_two)