weixin_46373697
方兔叽
Acceptance rate: 33.3%
2020-03-04 16:49

Scrapy saves to MySQL, but queries return no data

1. Problem description

I am using the Scrapy framework to crawl a site and store the scraped data in a MySQL database. The run finishes without any errors, but when I query the database afterwards, there is no data.
(The project follows this blogger's code:
https://www.cnblogs.com/fromlantianwei/p/10607956.html)

2. Screenshots

  1. Scrapy project:

(screenshot)

  2. Database creation:

(screenshot)

3. Related code

Scrapy project code:

(1) tencent spider file

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
import re
from copy import deepcopy

from ScrapyPro3.items import ScrapyPro3Item


class tencentSpider(scrapy.Spider):
    name = 'tencent'

    allowed_domains = []
    start_urls = [
        'http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw=%E6%A1%82%E6%9E%97%E7%94%B5%E5%AD%90%E7%A7%91%E6%8A%80%E5%A4%A7%E5%AD%A6%E5%8C%97%E6%B5%B7%E6%A0%A1%E5%8C%BA&pn=26140',
        ]

    def parse(self, response):  # parse the listing page
        item = ScrapyPro3Item()

        all_elements = response.xpath(".//div[@class='i']")
        # print(all_elements)

        for all_element in all_elements:
            content = all_element.xpath("./a/text()").extract_first()
            content = "".join(content.split())
            change = re.compile(r'[\d]+.')
            content = change.sub('', content)
            item['comment'] = content

            person = all_element.xpath("./p/text()").extract_first()
            person = "".join(person.split())
            # strip the like/reply counts
            change2 = re.compile(r'点[\d]+回[\d]+')
            person = change2.sub('', person)
            # pick out the date
            change3 = re.compile(r'[\d]?[\d]?-[\d][\d](?=)')
            date = change3.findall(person)

            # if the post is from today, pick out the time instead
            change4 = re.compile(r'[\d]?[\d]?:[\d][\d](?=)')
            time = change4.findall(person)

            person = change3.sub('', person)
            person = change4.sub('', person)

            if time == []:
                item['time'] = date
            else:
                item['time'] = time

            item['name'] = person

            # add the password and is_active fields
            item['is_active'] = '1'
            item['password'] = '123456'

            print(item)
            yield item

        # next page (commented out)
        """next_url = 'http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/' + parse.unquote(
            response.xpath(".//div[@class='bc p']/a/@href").extract_first())

        print(next_url)
        yield scrapy.Request(
            next_url,
            callback=self.parse,

        )"""

(2) items file

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyPro3Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    comment = scrapy.Field()
    time = scrapy.Field()
    name = scrapy.Field()
    password = scrapy.Field()
    is_active = scrapy.Field()

(3) pipelines file

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

"""class Scrapypro3Pipeline(object):
    def process_item(self, item, spider):
        return item"""

import pymysql
from twisted.enterprise import adbapi


class Scrapypro3Pipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):  # fixed method name, called by Scrapy; settings values are available here
        """
        Set up the database connection pool.
        :param settings: Scrapy settings
        :return: pipeline instance
        """
        adbparams = dict(
            host='localhost',
            db='mu_ke',
            user='root',
            password='root',
            cursorclass=pymysql.cursors.DictCursor  # use a dict cursor
        )
        # build a ConnectionPool, connecting with pymysql (or MySQLdb)
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # return an instance of the pipeline
        return cls(dbpool)

    def process_item(self, item, spider):
        """
        Use Twisted to run the MySQL insert asynchronously: the connection pool
        executes the actual SQL and returns a Deferred.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)  # which method to run and with what data
        # add error handling
        query.addCallback(self.handle_error)  # handle exceptions

    def do_insert(self, cursor, item):
        # run the insert; no explicit commit is needed, Twisted commits automatically
        insert_sql = """
        insert into login_person(name,password,is_active,comment,time) VALUES(%s,%s,%s,%s,%s)
                    """
        cursor.execute(insert_sql, (item['name'], item['password'], item['is_active'], item['comment'],
                                    item['time']))

    def handle_error(self, failure):
        if failure:
            # print the error message
            print(failure)
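
For debugging, one variation worth trying (a sketch, not part of the original blog code) is to register handle_error as an errback instead of a callback, so an exception raised inside do_insert actually reaches it, and to print each item as it arrives:

# sketch: inside Scrapypro3Pipeline, a more talkative process_item for debugging
def process_item(self, item, spider):
    print(item)  # confirm the pipeline receives items at all
    query = self.dbpool.runInteraction(self.do_insert, item)
    # an errback fires only when do_insert raises, so SQL errors become visible
    query.addErrback(self.handle_error)
    return item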

(4) settings file


# -*- coding: utf-8 -*-

# Scrapy settings for ScrapyPro3 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ScrapyPro3'

SPIDER_MODULES = ['ScrapyPro3.spiders']
NEWSPIDER_MODULE = 'ScrapyPro3.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'mu_ke'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ScrapyPro3.middlewares.ScrapyPro3SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ScrapyPro3.middlewares.ScrapyPro3DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'ScrapyPro3.pipelines.Scrapypro3Pipeline': 200,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
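
One thing to note: settings.py defines MYSQL_HOST / MYSQL_DBNAME / MYSQL_USER / MYSQL_PASSWD, but the pipeline above hardcodes its own connection values and never reads them. A sketch of a from_settings that uses the settings instead (reusing the pymysql and adbapi imports from pipelines.py; an illustration, not the original code):

# sketch: inside Scrapypro3Pipeline, build the pool from the MYSQL_* settings
@classmethod
def from_settings(cls, settings):
    adbparams = dict(
        host=settings['MYSQL_HOST'],
        db=settings['MYSQL_DBNAME'],
        user=settings['MYSQL_USER'],
        password=settings['MYSQL_PASSWD'],
        cursorclass=pymysql.cursors.DictCursor,
    )
    return cls(adbapi.ConnectionPool('pymysql', **adbparams))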


(5) start file (runs the spider)

from scrapy import cmdline
cmdline.execute(["scrapy","crawl","tencent"])



Database creation SQL:


create database mu_ke;
CREATE TABLE login_person (
  id int(10) NOT NULL AUTO_INCREMENT,
  name varchar(100) DEFAULT NULL,
  passsword varchar(100) DEFAULT NULL,
  is_active varchar(100) DEFAULT NULL,
  comment varchar(100) DEFAULT NULL,
  time varchar(100) DEFAULT NULL,
  PRIMARY KEY (id)
) ENGINE=InnoDB AUTO_INCREMENT=1181 DEFAULT CHARSET=utf8;
select count(name) from login_person;  # the count comes back as 0
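
To narrow things down, the same INSERT can be run by hand with plain pymysql, outside Scrapy and Twisted, so that any MySQL error is raised directly instead of being swallowed. A minimal sketch (same local mu_ke database and root/root credentials as above; the inserted values are placeholders):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root', db='mu_ke')
try:
    with conn.cursor() as cursor:
        cursor.execute(
            "insert into login_person(name,password,is_active,comment,time) "
            "VALUES(%s,%s,%s,%s,%s)",
            ('test', '123456', '1', 'hello', '12:00'),
        )
    conn.commit()  # pymysql does not autocommit by default
finally:
    conn.close()

If this raises an error, the table definition and the INSERT statement disagree; if it succeeds and the row appears, the problem is upstream in the spider or pipeline.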

After running the code and querying the database, the count shows 0 rows. What is wrong here?

(1) The crawl runs and finishes normally.

(2) Environment:

PyCharm 2019.3

Python 3.8

MySQL 8.0 (Workbench 8.0)

(3) The database connection shows no data.

1 answer

  • qq_43656607 放风喽 1 year ago

    Inside the pipelines file, print the item to see whether you are actually getting any data.
    After the database connection succeeds, print some data from inside the database to check whether the connection really works (a minimal sketch of that check follows below).
    Most likely you never got any data, so nothing was written.
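
    A minimal sketch of the second check (connect with the same credentials as the pipeline and read the row count back):

    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='root', db='mu_ke')
    with conn.cursor() as cursor:
        cursor.execute("select count(*) from login_person")
        print(cursor.fetchone())  # (0,) means the connection works but nothing was inserted
    conn.close()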
