import os
import random
import re
import time

import requests
if __name__ == "__main__":
    # Create a folder to save the downloaded images in
    if not os.path.exists('./123'):
        os.mkdir('./123')
    # 1. Specify the target URL
    url = 'https://www.qiushibaike.com/imgrank/'
    # UA spoofing: wrap a User-Agent string in a headers dict
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81'
    }
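    # Pool of User-Agent strings to rotate through on retries, so repeated
    # attempts do not all carry the same browser fingerprint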
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    ]
    # Send a request for the specified URL and crawl the whole page
    page_text = requests.get(url=url, headers=headers).text
    # Focused crawling: extract every image URL on the page with a regex;
    # re.S lets '.' match newlines, so the pattern can span the multi-line div
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    # print(img_src_list)
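    # The scraped src attributes are protocol-relative ("//host/path"),
    # which is why a scheme is prepended before each download below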
    retry_limit = 5
    for src in img_src_list:
        # Concatenate a complete image URL
        src = 'http:' + src
        # Derive the file name and storage path up front, so both are
        # available for the failure message even if the request itself fails
        img_name = src.split('/')[-1]
        img_path = './123/' + img_name
        retry_time = 0
        while True:
            try:
                # Pick a random User-Agent for this attempt
                headers['User-Agent'] = random.choice(user_agent_list)
                # Request the image's binary data; the timeout keeps a
                # stalled connection from hanging the loop forever
                img_data = requests.get(url=src, headers=headers, timeout=10).content
                with open(img_path, "wb") as fp:
                    fp.write(img_data)
                print(src)
                print(img_path, 'downloaded successfully!!!')
                break
            except requests.RequestException:
                print("Access denied; waiting before reconnecting")
                time.sleep(5)
                retry_time += 1
                if retry_time >= retry_limit:
                    print(src)
                    print(img_path, "download failed")
                    break