思路:从搜索界面的elements获取搜索结果目标网页id,得到url列表
问题:1、直接用urllib.request.urlopen获取的html并非elements(“获取网页源代码”和elements不一样)
2、运行如下代码后报错(仅运行了部分代码块)(结果见代码后图片)
3、报错内容显示它甚至打开了同文件夹中其他python代码
期望的最终结果中的网址来自f12-network-response,800ms左右
# coding = utf-8
import selenium
import requests
from lxml import html
from selenium.webdriver import Chrome
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
import time
import re
import sqlite3
import urllib.error
import urllib.request
from selenium import webdriver
import os
from shutil import copy, rmtree
import random
import xlwt
from bs4 import BeautifulSoup
# chromedriver = r"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe"
# # 设置浏览器
# os.environ["webdriver.chrome.driver"] = chromedriver
# browser = webdriver.Chrome(chromedriver)
# Configure a headless Chrome so the page's JavaScript runs and we obtain the
# rendered DOM (what DevTools "Elements" shows) — urllib only returns the raw,
# unrendered source, which is why it differs from Elements.
opt = Options()
opt.add_argument('--headless')
opt.add_argument('--disable-gpu')
web = Chrome(options=opt)
url = r'https://www.xuexi.cn/dc12897105c8c496d783c5e4d3b680a2/9a75e290b9cf8cb8fb529a6e503db78d.html?page=1&query=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&program_id=1&search_source=6&_t=1644671322317'
try:
    web.get(url)
    text = web.page_source  # rendered HTML of the page after JS execution
    etree = html.etree
    tree = etree.HTML(text)
    # lxml query: returns a list of Element objects (possibly empty if the
    # results are injected after page_source was captured).
    dd = tree.xpath('//*[@id="home"]/div/div[3]/div/div[2]/div[1]/div[1]/a')
    # BUG FIX: find_elements_by_xpath returns a LIST of WebElements; a list has
    # no .text attribute, which raised AttributeError in the original code.
    # Collect the text of each matched element instead.
    ee = [el.text for el in web.find_elements_by_xpath('//*[@id="home"]/div/div[3]/div/div[2]/div[1]/div[1]/a')]
    print(dd)
    print(ee)
finally:
    # Always release the browser, otherwise every run leaks a Chrome process.
    web.quit()
期望的最终结果(代码+结果):
# coding = utf-8
import re
import sqlite3
import urllib.error
import urllib.request
import xlwt
from bs4 import BeautifulSoup
def extract_page_ids(raw):
    """Return the deduplicated list of candidate article ids in *raw*.

    An id is a run of 15 consecutive digits in the search-API JSON response.
    The regex is a raw string: the original ``"\\d{15}"`` literal relied on an
    invalid string escape (DeprecationWarning on modern Python).

    Note: order of the returned list is arbitrary (set-based dedup), matching
    the original behaviour.
    """
    return list(set(re.findall(r"\d{15}", raw)))


def main():
    """Fetch one page of search results and print each detail-page URL."""
    # Hit the JSON search API directly (taken from DevTools Network tab) —
    # no JS rendering needed, unlike scraping the HTML search page.
    # `with` ensures the HTTP response is closed (the original leaked it).
    with urllib.request.urlopen(
            r"https://search.xuexi.cn/api/search?size=15&hid=StX393CuVYfmJlIVNmTTKKCh7vIUw7Au&page=1&query=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&program_id=1&pub_time=0&_t=1644718290581") as response:
        body = response.read().decode('utf-8')
    ids = extract_page_ids(body)
    baseurl = 'https://www.xuexi.cn/lgpage/detail/index.html'  # detail-page root URL
    for item in ids:
        # Each detail page is addressed as ?id=<id>&item_id=<id>.
        url = baseurl + '?id=' + item + '&item_id=' + item
        print(url)


if __name__ == "__main__":
    main()