"""Flask app wrapping a three-level novel-site spider (daomubiji.com)."""
# Imports grouped per convention: stdlib first, then third-party.
import random
import time

import requests
from flask import Flask, Blueprint
from lxml import etree

# WSGI application object; routes are registered against it below.
app = Flask(__name__)
class Spider:
    """Scrapes novel chapters from daomubiji.com.

    The site is a three-level hierarchy: index page (book menus) ->
    book page (chapter list) -> chapter page (body text).
    """

    def __init__(self):
        # Entry page listing the top-level book menus.
        self.index_url = "https://www.daomubiji.com/"

    def get_html(self, url):
        """Helper 1: fetch *url* and return its HTML text."""
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'}
        # NOTE(review): verify=False disables TLS certificate checks —
        # presumably a workaround for the site's certificate; confirm.
        html = requests.get(url=url, headers=headers, verify=False).text
        return html

    def xfunc(self, html, x):
        """Helper 2: parse *html* and return nodes matching XPath *x*."""
        eobj = etree.HTML(html)
        r_list = eobj.xpath(x)
        return r_list

    def parse_html(self):
        """Crawl logic: extract top-level titles/links, then descend."""
        first_html = self.get_html(url=self.index_url)
        first_x = '//li[contains(@id,"menu-item-20")]'
        li_list = self.xfunc(first_html, first_x)
        for li in li_list:
            # Top-level (book) title and link.
            item1 = {}
            item1["parent_title"] = li.xpath('./a/text()')[0]
            item1["parent_href"] = li.xpath('./a/@href')[0]
            # Descend into the second-level (chapter list) page.
            self.parse_two_html(item1)

    def parse_two_html(self, item1):
        """Second-level page: chapter titles and chapter links."""
        two_url = item1["parent_href"]
        two_html = self.get_html(two_url)
        two_x = '//article'
        art_list = self.xfunc(two_html, two_x)
        for art in art_list:
            item2 = {}
            item2['parent_title'] = item1["parent_title"]
            item2['parent_href'] = item1["parent_href"]
            item2["son_title"] = art.xpath('./a/text()')[0]
            item2["son_href"] = art.xpath('./a/@href')[0]
            # Descend into the third-level (chapter body) page.
            self.parse_three_html(item2)
            # Throttle request rate to be polite to the server.
            time.sleep(random.uniform(0, 1))

    def parse_three_html(self, item2):
        """Third-level page: extract the chapter body text."""
        three_url = item2["son_href"]
        three_html = self.get_html(three_url)
        three_x = '//article/p/text()'
        p_list = self.xfunc(three_html, three_x)
        novel_content = '\n'.join(p_list)
        # Replace full-width ideographic spaces with ASCII spaces.
        item2['novel_content'] = novel_content.replace('\u3000', ' ')
        item_list = [item2['parent_title'], item2['parent_href'],
                     item2['son_title'], item2['son_href'],
                     item2['novel_content']]
        return item_list

    def carw(self):
        """Run a full crawl (name kept as-is for caller compatibility)."""
        self.parse_html()


# FIX: @app.route must decorate a view FUNCTION that returns a response,
# not a class. The original code decorated `class Spider`, so Flask
# registered the class object itself as the view; requesting '/' then
# fails because Flask cannot turn a Spider instance into an HTTP response.
@app.route('/')
def crawl_view():
    """HTTP trigger: run the spider, then report completion."""
    Spider().carw()
    return 'crawl finished'
if __name__ == '__main__':
    # The original file had TWO `__main__` guards; one guard suffices.
    # Run one full crawl up front, then start the Flask dev server.
    spider = Spider()
    spider.carw()
    app.run()
# Q (translated): "The class in the middle is my spider program — why
# can't it run inside Flask?"
# A: `@app.route('/')` must decorate a view *function* that returns a
# response, not a class. Decorating the class registers the class itself
# as the view, so Flask cannot convert the result into an HTTP response.
# Wrap the crawl in a function instead:
#     @app.route('/')
#     def run():
#         Spider().carw()
#         return 'done'