Spider file
import scrapy
from toutiao.items import ToutiaoItem

class TestSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['weixin.sogou.com', 'mp.weixin.qq.com']
    start_urls = [
        "https://weixin.sogou.com/",
        "https://weixin.sogou.com/pcindex/pc/pc_0/1.html"
    ]

    def parse(self, response):
        # Take only the first article link under the img-box container.
        node_list = response.xpath('//ul/li/div[@class="img-box"]/a/@href').extract()[:1]
        for node in node_list:
            item = ToutiaoItem()
            item['url'] = response.url
            item['title'] = 'test title'
            item['content'] = 'test1 content'
            yield item
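For reference, the spider imports ToutiaoItem and assigns url, title, and content, so items.py presumably looks something like the minimal sketch below (inferred from that usage; your actual class may define more fields):

# items.py -- minimal sketch inferred from the fields the spider assigns
import scrapy

class ToutiaoItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()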
Pipelines file
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class ToutiaoPipeline(object):
    def __init__(self):
        # Open the MySQL connection once when the pipeline is instantiated.
        self.connect = pymysql.connect(host='127.0.0.1', user='test', password='123456', db='bin_xinwen', port=3306)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # Debug markers to confirm the pipeline receives items.
        print(11111111111111111111111)
        print(item['url'])
        print(22222222222222222222222)
        # Note: in the original, .format() was called on the result of execute(),
        # and commit() was called on the cursor; both are fixed below.
        # self.cursor.execute('insert into articles(url,title,content) VALUES ("{}","{}","{}")'.format(item['url'], item['title'], item['content']))
        # self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
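If the commented-out insert is re-enabled, a parameterized query is the safer pymysql pattern (it also sidesteps the misplaced .format call entirely). A sketch of process_item written that way, assuming the articles(url, title, content) table from the comment exists:

# Sketch only: parameterized insert, letting pymysql escape the values.
def process_item(self, item, spider):
    sql = 'INSERT INTO articles (url, title, content) VALUES (%s, %s, %s)'
    self.cursor.execute(sql, (item['url'], item['title'], item['content']))
    self.connect.commit()  # commit() lives on the connection, not the cursor
    return item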
settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'toutiao'

SPIDER_MODULES = ['toutiao.spiders']
NEWSPIDER_MODULE = 'toutiao.spiders'

ROBOTSTXT_OBEY = False

SPIDER_MIDDLEWARES = {
    'toutiao.middlewares.ToutiaoSpiderMiddleware': 543,
}

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'toutiao.pipelines.ToutiaoPipeline': 100,
}
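With these settings in place, the spider is run from the project root with scrapy crawl test (matching the spider's name attribute).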
Output printed during execution
2019-05-23 17:43:07 [scrapy.middleware] INFO: Enabled item pipelines:
['toutiao.pipelines.ToutiaoPipeline']