# NOTE: When scraping, it is best to include request headers.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re,requests
from urllib.request import urlretrieve
# Target page and shared request headers for all requests to meishij.net.
url = 'https://www.meishij.net/zuofa/zhuduji_7.html'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    # Fixed typo: the HTTP header is spelled "Referer" (RFC 7231), not "Refer" —
    # with the misspelled key the server never saw a referrer.
    "Referer": 'https://www.meishij.net/',
    "Host": "www.meishij.net",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
}
def download_jpg(url):
    """Download every recipe image found on the given meishij.net page.

    Fetches the page HTML, extracts all ``st-cn.meishij.net`` ``.jpg`` URLs,
    and saves each image into the current working directory under its
    original file name. Prints each image URL, a success message, and the
    total count.

    :param url: URL of the meishij.net recipe page to scrape.
    :raises requests.HTTPError: if the page request returns an error status.
    """
    # Timeout so a stalled connection cannot hang the script forever;
    # raise_for_status so an error page is not silently parsed for images.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    html = response.content.decode('utf-8')
    # Raw string: otherwise '\.' is an invalid escape sequence in a normal
    # string literal (DeprecationWarning now, SyntaxError in future Python).
    pattern = r'src="(https://st-cn\.meishij\.net/.+?\.jpg)"'
    image_urls = re.findall(pattern, html)
    count = 0
    for image_url in image_urls:
        print(image_url)
        # Use the last path segment of the URL as the local file name.
        name = image_url.rsplit('/', 1)[1]
        urlretrieve(image_url, name)
        count += 1
    print("download successfully")
    print(count)
if __name__ == '__main__':
    # Script entry point: scrape one fixed recipe page.
    page_url = 'https://www.meishij.net/zuofa/zhuduji_7.html'
    download_jpg(page_url)