不建议用 re(正则表达式)解析页面,建议改用 bs4 库(BeautifulSoup)解析,代码如下:
import requests
from bs4 import BeautifulSoup as bs
#import re
def build_image_entry(page_url, tag):
    """Return ``"<absolute image URL>,<alt text>"`` for one ``<img>`` tag.

    Args:
        page_url: URL of the page the tag was found on; used as the base
            when resolving the tag's ``src``.
        tag: Any object with a mapping-style ``get`` (a bs4 ``Tag`` or a
            plain ``dict``) exposing the ``src`` and ``alt`` attributes.

    Returns:
        The formatted entry, or ``None`` when the tag has no ``src``.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    src = tag.get('src')
    if not src:
        return None
    # urljoin resolves protocol-relative ("//host/a.jpg") and relative
    # sources against the page URL and leaves absolute URLs untouched,
    # whereas blindly prefixing "https:" would corrupt the latter.
    return urljoin(page_url, src) + "," + (tag.get('alt') or "")


if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'}
    url = 'https://www.qiushibaike.com/imgrank/'

    # Fetch the whole page behind the URL. The timeout keeps a stalled
    # server from hanging the script forever, and raise_for_status stops
    # us from parsing an HTTP error page as if it were the listing.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    page_text = response.text

    # Parse the page and pick out every image inside a "thumb" container.
    soup = bs(page_text, 'lxml')
    img_list = []
    for tag in soup.select('div.thumb img'):
        entry = build_image_entry(url, tag)
        if entry is not None:  # skip <img> tags that carry no src
            img_list.append(entry)
    print(img_list)
输出结果是一个列表,其中每一项为"图片链接地址,alt 文本"形式的字符串。