from flask import Flask,Blueprint
# Create the Flask blueprint object; it is registered under the name 'main'.
user_bp=Blueprint('main',__name__)
# from main import Spider
import requests
import time
from lxml import etree
import random
import urllib3
# Silence urllib3's InsecureRequestWarning: Spider.get_html issues its
# requests with verify=False, which would otherwise warn on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class Spider:
    """Crawler that lists the top-level section links of the index page.

    The crawl starts from ``index_url`` and prints one ``[title, href]``
    pair per matching menu entry.
    """

    # Browser-like user agent sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'

    def __init__(self):
        self.index_url = "https://www.daomubiji.com/"

    def get_html(self, url, timeout=10):
        """Helper 1: request ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Returns:
            The decoded response body.

        Raises:
            requests.exceptions.RequestException: On connection failures,
                including ``Timeout`` when ``timeout`` elapses.
        """
        headers = {'user-agent': self.USER_AGENT}
        # verify=False skips TLS certificate verification; the matching
        # urllib3 warning is disabled at module import time.
        response = requests.get(
            url=url,
            headers=headers,
            verify=False,
            timeout=timeout,
        )
        return response.text

    def xfunc(self, html, x):
        """Helper 2: parse ``html`` and evaluate the XPath expression ``x``.

        Args:
            html: HTML document as a string.
            x: XPath expression to evaluate against the parsed document.

        Returns:
            The list of XPath matches; an empty list for an empty document.
        """
        if not html:
            # Nothing to parse; an empty result keeps callers' loops safe.
            return []
        document = etree.HTML(html)
        return document.xpath(x)

    def parse_html(self):
        """Crawl logic: print ``[title, href]`` for each top-level menu item."""
        first_html = self.get_html(url=self.index_url)
        first_x = '//li[contains(@id,"menu-item-20")]'
        li_list = self.xfunc(first_html, first_x)
        for li in li_list:
            # Extract the section title and section link.
            # NOTE(review): assumes each matched <li> wraps a direct <a>
            # child carrying the title text and the href — confirm against
            # the live page markup. Elements cannot be subscripted with
            # string keys, so the values are read from the <a> child.
            anchor = li.find("a")
            if anchor is None:
                # Menu entry without a link: nothing to report.
                continue
            title = (anchor.text or "").strip()
            href = (anchor.get("href") or "").strip()
            print([title, href])

    def carw(self):
        """Entry point: run the crawl (method name kept for existing callers)."""
        self.parse_html()
if __name__ == '__main__':
    # Script entry point: build a crawler and run it once.
    Spider().carw()