Problem description and background
The program runs rather slowly and I would like to speed it up. It would be great if someone could also help wrap it up more cleanly (a rough sketch of the kind of speed-up I am after follows the code below).
Relevant code
```python
# Multi-threaded avatar scraper
# Wrapped into functions; runs correctly end to end
import requests
from lxml import etree
import os
import threading
import time
def get_respose_text(url):
    # Fetch the page at `url` and return the response text
    global session
    session = requests.sessions.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
    }
    response = session.get(url=url, headers=headers)
    response.encoding = "utf-8"
    return response.text
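# Speed-up note (my suggestion, not part of the original logic): get_respose_text creates a
# brand-new Session on every call, which throws away the connection pool each time. Building
# one Session at module level and reusing it for every page and image request would let
# requests keep TCP/TLS connections open; a sketch of this is at the end of the post.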
def get_Date(response, a_urls_xpath, a_names_xpath):
    # Scrape the page and return a dict of {name: sub-link}
    html = etree.HTML(response)
    a_urls = html.xpath(a_urls_xpath)    # hrefs of the <a> tags
    a_names = html.xpath(a_names_xpath)  # the <a> elements themselves (for their text)
    new_a_urls = {}
    for (a_url, a_name) in zip(a_urls, a_names):
        a_name = a_name.text
        a_url = index_url + a_url
        new_a_urls[a_name] = a_url  # map each name to its sub-link
    return new_a_urls
def save_date(src, s_path):
    # Download and save a single image
    src = "https:" + src
    src_name = src[-17:-13]  # file name taken from a fixed slice of the URL
    src_path = s_path + '\\%s.%s' % (src_name, src.split('.')[-1])
    src_date = session.get(src)
    with open(src_path, 'wb') as f:
        f.write(src_date.content)
    print("Downloading >>> %s" % src_path)
def url_name_replace(url_name):
    # Strip whitespace and punctuation (half- and full-width) that are not valid,
    # or are awkward, in Windows directory names
    for ch in (' ', '/', ':', '：', '?', '？', ',', '·', '‘', '”', '|', '"', '<', '>', '.'):
        url_name = url_name.replace(ch, '')
    return url_name
def multi_thread(srcs, s_path):
    # Multi-threaded download: one thread per image
    threads = []
    for src in srcs:
        threads.append(
            threading.Thread(target=save_date, args=(src, s_path))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
def single_thread(srcs, s_path):
    # Single-threaded download
    for src in srcs:
        save_date(src, s_path)
def main():
    global index_url
    index_url = "https://www.woyaogexing.com"
    # 1. Fetch the home page
    index_respose = get_respose_text(index_url)
    # 2. Scrape the category links from the home page response
    urls = get_Date(index_respose,
                    '//*[@id="indexMain"]/div[1]/div[1]/div[3]/div/a/@href',
                    '//*[@id="indexMain"]/div[1]/div[1]/div[3]/div/a')
    print(urls)
    for i, n in urls.items():
        # 3. Create the first-level directory
        i = url_name_replace(i)
        save_path = str(os.getcwd()) + r"\%s" % i
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        print("Saving data under %s" % save_path)
        # Scrape the data on this first-level page
        list2_respose = get_respose_text(n)
        urls_1 = get_Date(list2_respose,
                          '//*[@id="main"]/div[3]/div[1]/div[2]/div[*]/a[2]/@href',
                          '//*[@id="main"]/div[3]/div[1]/div[2]/div[*]/a[2]')
        # print(urls_1)
        for j, k in urls_1.items():
            # Create the second-level directory
            j = url_name_replace(j)
            save_path1 = save_path + r"\%s" % j
            if not os.path.exists(save_path1):
                os.mkdir(save_path1)
            # Scrape the image data on the third-level album page
            list3_respose = get_respose_text(k)
            html = etree.HTML(list3_respose)
            image_srcs = html.xpath('//*[@id="main"]/div[3]/div[1]/div[1]/ul/li[*]/a/img/@src')
            # Save with multiple threads
            multi_thread(image_srcs, save_path1)
            # Save with a single thread
            # single_thread(image_srcs, save_path1)
if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print("Total running time: %.2f seconds" % (end - start))
# Multi-threaded total running time: 59.41753435134888 seconds
# Single-threaded total running time: 96.5646162033081 seconds
```
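For reference, here is a minimal sketch of the kind of speed-up I have in mind, written against the helpers above: one shared requests.Session so connections are reused, plus a bounded concurrent.futures.ThreadPoolExecutor instead of one thread per image. The names download_one / download_all and the worker count of 8 are placeholders of my own, not part of the script above, so please treat it as a sketch rather than a finished replacement.

```python
# Sketch only: download_one / download_all and max_workers=8 are placeholder choices
import os
from concurrent.futures import ThreadPoolExecutor

import requests

# One shared session so every request reuses the same connection pool
shared_session = requests.Session()
shared_session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
})

def download_one(src, s_path):
    # Same steps as save_date above, but going through the shared session
    src = "https:" + src
    src_name = src[-17:-13]
    src_path = os.path.join(s_path, "%s.%s" % (src_name, src.split('.')[-1]))
    resp = shared_session.get(src)
    with open(src_path, 'wb') as f:
        f.write(resp.content)
    print("Downloading >>> %s" % src_path)

def download_all(srcs, s_path, max_workers=8):
    # A bounded thread pool instead of one thread per image
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for src in srcs:
            pool.submit(download_one, src, s_path)
    # leaving the with-block waits for every submitted download to finish
```

In main(), the call multi_thread(image_srcs, save_path1) would then become download_all(image_srcs, save_path1); how much this actually helps depends on the site and the network, so the timings in the comments above would need to be re-measured.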