# -*- coding: utf-8 -*-
import scrapy,re
from bs4 import BeautifulSoup
from scrapy_redis.spiders import RedisSpider
from urllib import parse
class TtttSpider(RedisSpider):
    """Distributed link-crawling spider for chinanews.com.

    Start URLs are consumed from the Redis list named by ``redis_key``
    (scrapy_redis). Every fetched page is scanned for anchor tags, and
    each usable link is resolved to an absolute URL and followed with
    :meth:`next_parse` as the callback.
    """
    name = 'tttt'
    allowed_domains = ['chinanews.com']
    redis_key = "tttt"

    def parse(self, response):
        """Extract followable <a> links from *response* and schedule requests.

        Skips anchors that have no text, no ``href`` attribute, or whose
        href is a no-op placeholder (``javascript:;`` or ``#``). Relative
        hrefs are resolved against the page URL with ``urljoin``.

        Yields:
            scrapy.Request: one request per accepted link, carrying a
            ``{url: link_text}`` dict in ``meta["item"]``.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        for item in soup.find_all('a'):
            # Fetch href ONCE with .get() and test it FIRST: the original
            # code subscripted item['href'] before checking item.get('href'),
            # raising KeyError on <a> tags that carry no href attribute.
            href = item.get('href')
            if item.string is None or not href or href in ('javascript:;', '#'):
                continue
            url = parse.urljoin(response.url, href)
            index = {url: item.string}
            # Debug prints replaced with the spider's built-in logger so
            # output respects Scrapy's LOG_LEVEL setting.
            self.logger.debug("queueing %s meta=%r", url, index)
            yield scrapy.Request(url, callback=self.next_parse, meta={"item": index})

    def next_parse(self, response):
        """Callback for followed links; currently a placeholder marker only."""
        print("11111111")
# NOTE (translated): no error is raised immediately — parse() is invoked, but
# execution does not continue past it to the next callback; then an error occurs.