Python爬虫爬取m3u8流媒体加密视频并定位iframe
无法导入By以致无法定位ifame
1、先解析网页并定位iframe
from selenium.webdriver.chrome.options import Options #(这个需要前提引入的,就想时间等待一样)
import requests
import threading
from bs4 import BeautifulSoup
import re
from selenium import webdriver
import selenium.webdriver.common.by import By
import os
import warnings
warnings.simplefilter('ignore',ResourceWarning)
url = 'http://www.zhirongedu.com/vodplay/46967-1-1.html'
# 开启无头模式提取视频m3u8路径地址
ch_options = Options()
ch_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=ch_options)
videoAnalysis = driver.get(url)
videoName = driver.title # 获取当前页面title值
print(videoName)
def find_all_iframes(driver, name='WorkAreaFrame1'):
iframes = driver.find_elements(by=By.XPATH, value = "//iframe")
for index, iframe in enumerate(iframes):
if iframe.get_attribute('name') == name:
driver.switch_to.frame(iframe)
# 需要执行的操作写在这里
driver.switch_to.parent_frame()
return
else:
driver.switch_to.frame(index)
find_all_iframes(driver, name)
driver.switch_to.parent_frame()
videoSrcAll = find_all_iframes(driver, name='WorkAreaFrame1')
print(videoSrcAll)
driver.close()
2、找到流媒体m3u8真实链接
# 链接分割取出真正的m3u8
videoSrc = str(url).split('=')[1] # 取出“=”分割的右半部分
print(videoSrc) # 输出我们所需的url
html = requests.get(videoSrc).text
print(html) # 输出我们所需的url的内容
#html=r.text#储存到的信息以text呈现
soup=BeautifulSoup(html,'html.parser')#解析数据
getShortSrc = html.split('\n')[2] # 获取文件第三行内容
print(getShortSrc)
noindexurl = url.replace('index.m3u8', '\s') # 删除url的index.m3u8
trueUrl = noindexurl + getShortSrc # 得到m3u8二次链接(真正的m3u8链接)
print(trueUrl)
3、以text文本呈现
res1 = requests.get(trueUrl).text
print(res1)
4、列出js列表
tslist = re.findall('EXTINF:(.*),\n(.*)\n#', res1) # 得到每一个ts视频名称
newlist = []
for i in tslist:
newlist.append(i[int]) # 将ts视频名称添加到列表中
print(newlist) # 输出列表
noindextrueurl = trueUrl.replace('index.m3u8', '\s') # 删除trueurl的index.m3u8
tslisturl = [] # 构造链接空列表
for i in newlist:
tsurl = noindextrueurl + i
tslisturl.append(tsurl)
print(tslisturl) # 输出列表
5、解密下载并储存
def downloadvideo(tslisturl, videoName):
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
}
path = './' + videoName.strip() + ".mp4"
videolen = len(tslisturl)
for i in tslisturl:
print("视频下载中...剩余" + str(videolen) + "个ts视频未下载!")
videolen = videolen - 1
r = requests.get(i, header).content
videoName = i[-9:-3] + '.ts'
if os.path.exists(folder ):
continue
with open(folder, 'ab+') as f:
f.write(r.content)
tslisturl = f.readlines()
f.close()
count=0
for i in tslisturl:
if "#" not in i:
i = i.replace("\n","\s")
n = i[-7:]
threading.Thread(target=downloadvideo, args=(noindextrueurl+""+i,"cdzj2/"+str(n),)).start()
downloadvideo(noindextrueurl+""+i,"cdzj2/"+str(count)+".ts")
print("视频下载完毕!")
download_file = "G:\Desktop"
folder = os.path.exists(download_file + '/' + path)
if not folder:
os.makedirs(download_file + '/' + path)
folder = download_file + '/' + path
定位iframe运行结果
File "d:\mypycode\TEST\.vscode\film1.py", line 7
import selenium.webdriver.common.by import By
^^^^^^
SyntaxError: invalid syntax
```