新入门Django,现在已经写好了一个Python爬虫,直接用Python跑测试没问题,
------在Django项目中加入了一个新的爬虫app,用model创建了表格,和展示爬虫的html
------但是runserver, 以后查看db.sqlite3里面对应的表已经创建,但是里面没有存爬到的内容,
------ 请大神们指教该怎么办, 代码如下
Spider.py, 爬虫并存入model.py 创建的**Website**表
#!/usr/bin/python
# -*- coding: utf-8 -*-
# import data into mysql(sqlite3), must have these four lines defination:
import os
# # 我所创建的project名称为learn_spider;里面的app名称为website
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "blogproject.settings")
# import django
# django.setup()
# urllib2 package: open resource by URL; re package: use regular expression to filter the objects
import urllib.request, re
import urllib.parse
# BeautifulSoup: abstract data clearly from html/xml files
from bs4 import BeautifulSoup
# import tables from models.py
from .models import Website
# urlopen()方法需要加read()才可视源代码,其中decode("utf-8")表示以utf-8编码解析原网页,这个编码格式是根据网页源代码中<head>标签下的<meta charset="utf-8">来决定的。
ul = "https://baike.baidu.com/item/Python"
req = urllib.request.Request(ul)
html_python = urllib.request.urlopen(req).read().decode("utf-8")
#html_python = urllib.request.urlopen('https://baike.baidu.com/item/Python').read().decode("utf-8")
soup_python = BeautifulSoup(html_python, "html.parser")
# print soup
#这里用到了正则表达式进行筛选
item_list = soup_python.find_all('a', href=re.compile("item"))
for each in item_list:
print (each.string)
# use quote to replace special characters in string(escape encode method)
urls = "https://baike.baidu.com/item/" + urllib.parse.quote(each.string.encode("utf-8"))
print (urls)
html = urllib.request.urlopen(urls).read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
if soup.find('div', 'lemma-summary') == None:
text = "None"
else:
text = soup.find('div', 'lemma-summary').get_text()
print (text)
Website.objects.get_or_create(name=each.string, url=urls, text=text)
text_python = soup_python.find('div', 'lemma-summary').text
Website.objects.get_or_create(name="Python", url="https://baike.baidu.com/item/Python", text=text_python)
model.py 创建Website 表用于存储爬到的内容
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import models
# Create your models here.
class Website(models.Model):
name = models.CharField(max_length=100)
url = models.CharField(max_length=100)
text = models.TextField()
def __unicode__(self):
return self.name
view.py 提取表中已爬取的内容
from __future__ import unicode_literals
from django.shortcuts import render
# Create your views here.
from .models import Website
def show(request):
# 这里直接通过QuerySet API获取所有的object,默认返回类型为tuple(元组)
queryset = Website.objects.all()
# 传入三个渲染参数
return render(request, 'news/nws.html', {'QuerySet': queryset})