MySpider里面是这样的:
class MySpider(scrapy.Spider):
    """Spider that fetches the bilibili home page and collects bangumi links.

    The single seed request is tagged with ``meta['PhantomJS'] = True``.
    NOTE(review): presumably a downloader middleware reads this flag to
    decide whether to render the page with PhantomJS -- confirm against the
    middleware implementation.
    """

    name = 'BAIScrapy'

    def start_requests(self):
        """Yield the one seed request for the bilibili home page."""
        print('开始')
        url = 'https://www.bilibili.com/'
        # dont_filter=True keeps the duplicate filter from dropping the request.
        request = scrapy.Request(url=url, callback=self.parse, dont_filter=True)
        request.meta['PhantomJS'] = True
        yield request

    def parse(self, response):
        """Extract every bangumi play link on the page and yield it as an item.

        Yields:
            A ``BilibiliAnimeInfoScrapyItem`` whose ``links`` field is the
            list of ``href`` values that contain
            ``www.bilibili.com/bangumi/play/``.
        """
        print('Emmm...')
        item = BilibiliAnimeInfoScrapyItem()
        # The pattern has no groups, so ``.re`` returns the whole match. The
        # surrounding ``\S*`` makes that the complete href instead of just
        # the literal fragment, and the dots are escaped so they only match
        # real dots.
        item['links'] = response.css('a::attr("href")').re(
            r'\S*www\.bilibili\.com/bangumi/play/\S*'
        )
        # Without this yield the item never reaches the item pipelines.
        yield item
middlewares里面是这样的:
def process_request(self, request, spider):
    """Render flagged requests with PhantomJS and return the resulting page.

    Scrapy calls the downloader-middleware hook named ``process_request``;
    the previous spelling (``process_reqeust``) was never invoked.

    Args:
        request: The request being processed; ``request.meta['PhantomJS']``
            must be truthy for the browser to be used.
        spider: The spider that issued the request (unused).

    Returns:
        ``None`` when the request is not flagged, so Scrapy continues with
        its normal download; otherwise an ``HtmlResponse`` built from the
        rendered page source.

    Raises:
        Whatever ``WebDriverWait.until`` raises if the ``bili_bangumi``
        element does not appear within 10 seconds.
    """
    if not request.meta.get('PhantomJS'):
        # Not flagged for browser rendering: let the default downloader run.
        return None

    print('进入selenium')
    driver = webdriver.PhantomJS()
    try:
        driver.get(request.url)
        # Block until the bangumi section exists in the DOM.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'bili_bangumi'))
        )
        # Must be read BEFORE quit(); the session is gone afterwards.
        page_source = driver.page_source
    finally:
        # Always release the browser process, even if the wait times out.
        driver.quit()

    # The hook must *return* the response; a ``yield`` here would turn the
    # method into a generator, which Scrapy does not accept from this hook.
    return HtmlResponse(
        url=request.url,
        encoding='utf-8',
        body=page_source,
        request=request,
    )


# Backward-compatible alias for the original misspelled name.
process_reqeust = process_request
settings里面是这样的:
# User-Agent string assigned to Scrapy's USER_AGENT setting.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/535.1 (KHTML, like Gecko) '
    'Chrome/14.0.835.163 Safari/535.1'
)

# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Enable the project's downloader middleware; 543 is its ordering value.
DOWNLOADER_MIDDLEWARES = {
    'bilibili_anime_info_scrapy.middlewares'
    '.BilibiliAnimeInfoScrapyDownloaderMiddleware': 543,
}