在漫画网站的目录部分点进去一个章节返回正常的页面
获取这个章节的href拼接成url之后，打开这个链接看到的却是另外一个页面，所有图片都加载不出来。
用爬虫去获取页面的html也和原来不一样
漫画的链接是http://www.qiman6.com/12693/
爬虫获得的图片链接是'http://www.qiman6.com/images/loading_bak.png'之类的
正确的链接是这样的"https://p.pstatp.com/origin/pgc-image/308d526996464348a134c29399e65ca2"
我的源码:
import requests
from lxml import etree
from selenium import webdriver
from pyquery import PyQuery as pq
import time
import pyperclip
# Reference XPaths for the first two page images on a chapter page:
# //*[@id="page_01"]/div/img
# //*[@id="page_02"]/div/img

# Kept for a requests-based fallback; currently only Selenium drives the pages.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}

BASE_URL = 'http://www.qiman6.com'


def _execute_local_js(browser, filename):
    """Run a local JavaScript file inside the currently loaded page.

    The site lazy-loads chapter images (every <img> starts as a
    loading_bak.png placeholder), so the site's own loader scripts are
    re-executed here to force the real image URLs into the DOM.
    The file is opened with ``with`` so the handle is always closed.
    """
    with open(filename, encoding='utf-8') as f:
        browser.execute_script(f.read())


def _collect_chapter_links(browser):
    """Open the comic index page and return (titles, hrefs) for all chapters."""
    browser.get(f'{BASE_URL}/12693/')
    # Expand the full chapter list, then toggle the list ordering.
    browser.find_element_by_xpath('//*[@id="chapterlistload"]/div[2]/span').click()
    time.sleep(1)
    browser.find_element_by_xpath('//*[@id="chapterList"]/div[2]/a').click()
    time.sleep(1)
    # Parse the rendered DOM directly; the former PyQuery round-trip
    # (etree.HTML(str(pq(...)))) parsed the same document twice.
    html = etree.HTML(browser.page_source)
    links = html.xpath('//*[@id="chapter-list1"]/a/@href')
    titles = html.xpath('//*[@id="chapter-list1"]//a/text()')
    return titles, links


def main():
    """Scrape every chapter page and print the resolved image URLs."""
    browser = webdriver.Chrome()
    try:
        titles, links = _collect_chapter_links(browser)
        print(titles)
        print(len(titles))
        print(links)
        for index, href in enumerate(links):
            browser.get(f'{BASE_URL}{href}')
            # Scroll to the bottom so the lazy loader considers every image visible.
            browser.execute_script('var q=document.documentElement.scrollTop=100000')
            # Re-run the site's loader scripts to swap placeholders for real URLs.
            for script_file in ('jquery.min.js', 'lazyloadimg2.js', 'q2.js', 'qmw.js'):
                _execute_local_js(browser, script_file)
            imgs = browser.find_elements_by_xpath('//*[@id="mainView"]/ul//img')
            print([img.get_attribute('src') for img in imgs])
            time.sleep(1)
    finally:
        # The original script never quit the driver, leaving Chrome running.
        browser.quit()


if __name__ == '__main__':
    main()