主
import scrapy
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from tencent.items import TencentItem
class QqnewsSpider(scrapy.Spider):
    """Spider for news.qq.com that owns a Selenium-driven Chrome browser.

    A Chrome instance is started when the spider is constructed and stored on
    ``self.browser``; it is shut down again when the ``spider_closed`` signal
    fires.
    """

    name = 'qqnews'
    allowed_domains = ['news.qq.com']
    start_urls = ['https://news.qq.com/']

    # Path to the chromedriver binary handed to Selenium. Kept as a class
    # attribute so it can be overridden (subclass, or a spider argument such
    # as ``-a chromedriver_path=...``) without editing ``__init__``.
    chromedriver_path = "/Users/bloodhound/WorkProject/Behind/Full-crawl/chromedriver"

    def __init__(self, **kwargs):
        """Create the spider and launch the Chrome browser it uses.

        Args:
            **kwargs: Spider arguments; forwarded unchanged to
                ``scrapy.Spider.__init__``.
        """
        # Forward the spider arguments instead of dropping them, and do it
        # first so that any attribute they set is visible below.
        super(QqnewsSpider, self).__init__(**kwargs)

        options = webdriver.ChromeOptions()
        # Chrome content-setting value 2 means "block": skip image downloads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        # options.add_argument("--headless")
        self.browser = webdriver.Chrome(
            chrome_options=options,
            executable_path=self.chromedriver_path,
        )

        # NOTE(review): scrapy.xlib.pydispatch is deprecated in newer Scrapy
        # releases; the documented alternative is crawler.signals.connect()
        # inside from_crawler(). Confirm against the Scrapy version in use.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Quit the browser when the spider shuts down.

        Args:
            spider: The spider instance delivered with the signal (unused).
        """
        # Close the browser when the crawler exits.
        print("spider closed")
        self.browser.quit()
settings.py
# ---------------------------------------------------------------------------
# Project identity and spider discovery
# ---------------------------------------------------------------------------
BOT_NAME = "tencent"

SPIDER_MODULES = ["tencent.spiders"]
NEWSPIDER_MODULE = "tencent.spiders"

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
LOG_LEVEL = "WARNING"

# ---------------------------------------------------------------------------
# Crawl behaviour
# ---------------------------------------------------------------------------
# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False

# A desktop Chrome user-agent string is sent instead of a bot-identifying one
# (the template's 'tencent (+http://www.yourdomain.com)' value is not used).
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/74.0.3729.169 Safari/537.36"
)

# ---------------------------------------------------------------------------
# Components
# ---------------------------------------------------------------------------
# JSPageMiddleware is registered with order value 1.
# NOTE(review): presumably it renders pages with the spider's Selenium
# browser - confirm in tencent/middlewares.py.
DOWNLOADER_MIDDLEWARES = {
    "tencent.middlewares.JSPageMiddleware": 1,
}

# Item pipeline registered with order value 300.
ITEM_PIPELINES = {
    "tencent.pipelines.TencentPipeline": 300,
}