weixin_38238983
Ex_treme
采纳率0%
2017-04-09 05:10

非常简单的scrapy代码但就是不清楚到底哪里出问题了,高手帮忙看看吧!

5

News_spider文件

-*- coding: utf-8 -*-

import scrapy
import re

from scrapy import Selector
from News.items import NewsItem

class NewsSpiderSpider(scrapy.Spider):
name = "news_spider"
allowed_domains = ["http://18.92.0.1"]
start_urls = ['http://18.92.0.1/contents/7/121174.html']

def parse_detail(self, response):
    sel = Selector(response)

    items = []
    item = NewsItem()

    item['title'] = sel.css('.div_bt::text').extract()[0]

    characters = sel.css('.div_zz::text').extract()[0].replace("\xa0","")

    pattern = re.compile('[:].*[ ]')
    result = pattern.search(characters)
    item['post'] = result.group().replace(":","").strip()

    pattern = re.compile('[ ][^发]*')
    result = pattern.search(characters)
    item['approver'] = result.group()

    pattern = re.compile('[201].{9}')
    result = pattern.search(characters)
    item['date_of_publication'] = result.group()

    pattern = re.compile('([0-9]+)$')
    result = pattern.search(characters)
    item['browse_times'] = result.group()

    content = sel.css('.xwnr').extract()[0]
    pattern = re.compile('[\u4e00-\u9fa5]|[,、。“”]')
    result = pattern.findall(content)
    item['content'] = ''.join(result).replace("仿宋"," ").replace("宋体"," ").replace("楷体"," ")



    item['img1_url'] = sel.xpath('//*[@id="newpic"]/div[1]/div[1]/img/@src').extract()[0]
    item['img1_name'] = sel.xpath('//*[@id="newpic"]/div[1]/div[2]/text()').extract()[0]

    item['img2_url'] = sel.xpath('//*[@id="newpic"]/div[2]/div[1]/img/@src').extract()[0]
    item['img2_name'] = sel.xpath('//*[@id="newpic"]/div[2]/div[2]').extract()[0]

    item['img3_url'] = sel.xpath('//*[@id="newpic"]/div[3]/div[1]/img/@src').extract()[0]
    item['img3_name'] = sel.xpath('//*[@id="newpic"]/div[3]/div[2]/text()').extract()[0]

    item['img4_url'] = sel.xpath('//*[@id="newpic"]/div[4]/div[1]/img/@src').extract()[0]
    item['img4_name'] = sel.xpath('//*[@id="newpic"]/div[4]/div[2]/text()').extract()[0]

    item['img5_url'] = sel.xpath('//*[@id="newpic"]/div[5]/div[1]/img/@src').extract()[0]
    item['img5_name'] = sel.xpath('//*[@id="newpic"]/div[5]/div[2]/text()').extract()[0]

    item['img6_url'] = sel.xpath('//*[@id="newpic"]/div[6]/div[1]/img/@src').extract()[0]
    item['img6_name'] = sel.xpath('//*[@id="newpic"]/div[6]/div[2]/text()').extract()[0]

    characters = sel.xpath('/html/body/div/div[2]/div[4]/div[4]/text()').extract()[0].replace("\xa0","")

    pattern = re.compile('[:].*?[ ]')
    result = pattern.search(characters)
    item['company'] = result.group().replace(":", "").strip()

    pattern = re.compile('[ ][^联]*')
    result = pattern.search(characters)
    item['writer_photography'] = result.group()

    pattern = re.compile('(([0-9]|[-])+)$')
    result = pattern.search(characters)
    item['tel'] = result.group()

    items.append(item)

items文件
return items
import scrapy

class NewsItem(scrapy.Item):

title = scrapy.Field()
post = scrapy.Field()
approver = scrapy.Field()
date_of_publication = scrapy.Field()
browse_times = scrapy.Field()
content = scrapy.Field()
img1_url = scrapy.Field()
img1_name = scrapy.Field()
img2_url = scrapy.Field()
img2_name = scrapy.Field()
img3_url = scrapy.Field()
img3_name = scrapy.Field()
img4_url = scrapy.Field()
img4_name = scrapy.Field()
img5_url = scrapy.Field()
img5_name = scrapy.Field()
img6_url = scrapy.Field()
img6_name = scrapy.Field()
company = scrapy.Field()
writer_photography = scrapy.Field()
tel = scrapy.Field()

pipelines文件
import MySQLdb
import MySQLdb.cursors

class NewsPipeline(object):
def process_item(self, item, spider):
return item

class MysqlPipeline(object):
def init(self):
self.conn = MySQLdb.connect('192.168.254.129','root','root','news',charset="utf8",use_unicode=True)
self.cursor = self.conn.cursor()

def process_item(self, item, spider):
    insert_sql = "insert into news_table(title,post,approver,date_of_publication,browse_times,content,img1_url,img1_name,img2_url,img2_name,img3_url,img3_name,img4_url,img4_name,img5_url,img5_name,img6_url,img6_name,company,writer_photography,tel)VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    self.cursor.execute(insert_sql,(item['title'],item['post'],item['approver'],item['date_of_publication'],item['browse_times'],item['content'],item['img1_url'],item['img1_name'],item['img1_url'],item['img1_name'],item['img2_url'],item['img2_name'],item['img3_url'],item['img3_name'],item['img4_url'],item['img4_name'],item['img5_url'],item['img5_name'],item['img6_url'],item['img6_name'],item['company'],item['writer_photography'],item['tel']))
    self.conn.commit()

setting文件
BOT_NAME = 'News'

SPIDER_MODULES = ['News.spiders']
NEWSPIDER_MODULE = 'News.spiders'
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True
ITEM_PIPELINES = {
#'News.pipelines.NewsPipeline': 300,
'News.pipelines.MysqlPipeline': 300,
}

/usr/bin/python3.5 /home/pzs/PycharmProjects/News/main.py
2017-04-08 11:00:12 [scrapy.utils.log] INFO: Scrapy 1.3.3 started (bot: News)
2017-04-08 11:00:12 [scrapy.utils.log] INFO: Overridden settings: {'BOT_NAME': 'News', 'SPIDER_MODULES': ['News.spiders'], 'NEWSPIDER_MODULE': 'News.spiders'}
2017-04-08 11:00:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.logstats.LogStats']
2017-04-08 11:00:12 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-04-08 11:00:12 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-04-08 11:00:12 [scrapy.middleware] INFO: Enabled item pipelines:
['News.pipelines.MysqlPipeline']
2017-04-08 11:00:12 [scrapy.core.engine] INFO: Spider opened
2017-04-08 11:00:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-04-08 11:00:12 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-04-08 11:00:13 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None)
2017-04-08 11:00:13 [scrapy.core.scraper] ERROR: Spider error processing (referer: None)
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/twisted/internet/defer.py", line 653, in runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/local/lib/python3.5/dist-packages/scrapy/spiders/
_init__.py", line 76, in parse
raise NotImplementedError
NotImplementedError
2017-04-08 11:00:13 [scrapy.core.engine] INFO: Closing spider (finished)
2017-04-08 11:00:13 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 229,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 16609,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 4, 8, 18, 0, 13, 938637),
'log_count/DEBUG': 2,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/NotImplementedError': 1,
'start_time': datetime.datetime(2017, 4, 8, 18, 0, 12, 917719)}
2017-04-08 11:00:13 [scrapy.core.engine] INFO: Spider closed (finished)

Process finished with exit code 0
直接运行会弹出NotImplementedError错误,单步调试也看不出到底哪里出了问题

  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 复制链接分享
  • 邀请回答

4条回答

  • Lzq_1010 Lzq_1010 2年前

    如果能力还没到,就先模仿自动生成的代码来

    点赞 评论 复制链接分享
  • YouYuAi vdbrcxby 4年前

    你必须实现 scrapy.Spider 的parse方法

    点赞 评论 复制链接分享
  • ifillbad jjzhouke 2年前

    allowed_domains = ["http://18.92.0.1"]
    前面不能加http://

    点赞 评论 复制链接分享
  • zhangbinbinz 602437897 4年前

    def parse(self, response):
    用这个response来处理应答

    点赞 评论 复制链接分享

相关推荐