小弟最近在做网络爬虫,已经开了三贴了哈哈哈。这次的问题是我想在下载图片到本地时进行多线程下载以提高速度,但是我写的多线程每次都会回到程序最开始处,也就是让输入爬虫网站和深度的那里,有大佬能解答下吗
import time
import re
import os
import requests
from multiprocessing.pool import Pool
from multiprocessing import cpu_count
start_time=time.time()
url_website=input('Please type the URL:')
deep_number=input("Please specify the deep you want to reach: ")
html_name='http://'
link_list=[url_website]
list=[]
def split_website(url_website):
re_website=re.findall('//.*',url_website)
string_website="".join(re_website)
path_website=re.sub('//','',string_website)
return path_website
host_name=split_website(url_website)
host_name_list=host_name.split('/')
host_name=host_name_list[0]
deep=int(deep_number)
def save_image(iter,list_split):
iter = "http://" + list_split[0] + iter
im_string = ''.join(iter)
im_list = im_string.split('/')
im_name = im_list[-1]
print(im_name)
exc = False
try:
imgs = requests.get(iter)
except:
exc = True
pass
if not exc:
print('write')
image_file = open(im_name, 'wb')
image_file.write(imgs.content)
image_file.close()
while deep>=0:
print(deep)
print(link_list,'before foor loop')
for element in link_list:
print(element)
res=requests.get(element)
html_process=open('html_test.html','wb')
html_process.write(res.content)
html_process.close()
html_read=open('html_test.html','r',encoding='UTF-8')
read_content=html_read.read()
urls=re.findall("<a.*?href=.*?<\/a>",read_content)
print(urls)
image = re.findall('img.*?src="(.+?)"',read_content)
print(image)
path_website = split_website(element)
split_list = path_website.split('/')
os.chdir(os.path.split(os.path.realpath(__file__))[0])
print(link_list,'before 2 foor loop')
for i in range(len(split_list)):
dir_name = split_list[i]
folder_name = dir_name
if not os.path.exists(folder_name):
os.mkdir(folder_name)
os.chdir(folder_name)
if i == (len(split_list) - 1):
## _** for im_iter in image:
## pool=Pool(5)
## pool.map(save_image,[im_iter,split_list])
## pool.close()_**
print(link_list,'before 3 for loop')
for url in urls:
url_string="".join(url)
url_href_list=url_string.split("\"")
url_href_list[1]=html_name+host_name+url_href_list[1]
nick_name = re.findall('>.*?<', url)
if (''.join(nick_name))!='>Back<':
list.append(url_href_list[1])
print(list,'this is back up list')
print(link_list,'Before removing')
print(link_list,'After removing')
print(list)
link_list=list
list=[]
print(deep)
deep=deep-1
end_time=time.time()
print('time used: ',end_time-start_time)
加粗斜体那是小弟写的多线程,但奇怪的是每次它都会回到最开始叫我输入网址的地方并且出现5次。如何避免这个问题只让下图片那多线程呢,求大佬解答