2301_80039833 · 2024-04-15 17:14 · Acceptance rate: 0%
Views: 2

XPath crawler for scraping 站长素材 (sc.chinaz.com)

import urllib.request
from lxml import etree

def create_request(page):
    # Page 1 has no numeric suffix; later pages append "_<page>" to the URL
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/shanshuitupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/shanshuitupian_' + str(page) + '.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0',
        'Cookie': 'cz_statistics_visitor=0feda378-4c8f-a225-7a91-755c228c51fb;_clck=1jn4eo3%7C2%7Cfkx%7C0%7C1565;__gads=ID=b9ca106bba83c1c6:T=1713111545:RT=1713111545:S=ALNI_MYXYNBp9CApMOsDFxHnXQoocoLPXQ;__gpi=UID=00000debfd2f75d9:T=1713111545:RT=1713111545:S=ALNI_MaWelFfZUAHAN3ee1wOLzGOFB05KA;__eoi=ID=bd66e9b7043bae01:T=1713111545:RT=1713111545:S=AA-Afjak04XLL-3r2QGac86IgZ5W;_clsk=xn6u0n%7C1713112855925%7C3%7C1%7Ch.clarity.ms%2Fcollect;Hm_lvt_398913ed58c9e7dfe9695953fb7b6799=1713111487,1713115404,1713154098;Hm_lpvt_398913ed58c9e7dfe9695953fb7b6799=1713154098'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request


def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(content):
    tree = etree.HTML(content)
    # The site lazy-loads images, so the real URL lives in data-original, not src
    name_list = tree.xpath('//div[@class="item masonry-brick"]//img/@alt')
    pic_list = tree.xpath('//div[@class="item masonry-brick"]//img/@data-original')
    for i in range(len(name_list)):
        name = name_list[i]
        pic = pic_list[i]
        url = 'https:' + pic  # data-original is protocol-relative ("//...")
        urllib.request.urlretrieve(url=url, filename=name + '.jpg')



if __name__ == '__main__':
    start_page = int(input('Enter start page: '))
    end_page = int(input('Enter end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(content)


The script runs without errors, but no images get downloaded.


4 answers

  • 专家-赤兔 [online] · Quality creator, programming frameworks · 2024-04-15 17:14

    Quoted from the free WeChat mini program: 皆我百晓生

    After analyzing your code, the problem appears to be in the image-download step. urllib.request.urlretrieve() does not send the custom headers you attached to the page request; it goes out with urllib's default User-Agent, and the site's security settings can silently reject such requests, so nothing is saved even though no exception is raised. To work around this, we can replace urllib.request with the requests library, which makes it easy to attach headers to every request and handles cases like this more gracefully.
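
    If you would rather keep urllib, a minimal sketch of the same fix: urlretrieve() goes through the globally installed opener, so you can attach headers there (the User-Agent value below is illustrative):

    import urllib.request

    # urlretrieve() uses the globally installed opener, so headers added
    # here are sent with the image downloads as well.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/123.0.0.0 Safari/537.36')]
    urllib.request.install_opener(opener)
    # After this, the original urllib.request.urlretrieve(url, filename)
    # calls send the header above.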

    Here is the modified code:

    import requests
    from bs4 import BeautifulSoup
    import os

    def create_headers():
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0',
            'Cookie': 'cz_statistics_visitor=0feda378-4c8f-a225-7a91-755c228c51fb;_clck=1jn4eo3%7C2%7Cfkx%7C0%7C1565;__gads=ID=b9ca106bba83c1c6:T=1713111545:RT=1713111545:S=ALNI_MYXYNBp9CApMOsDFxHnXQoocoLPXQ;__gpi=UID=00000debfd2f75d9:T=1713111545:RT=1713111545:S=ALNI_MaWelFfZUAHAN3ee1wOLzGOFB05KA;__eoi=ID=bd66e9b7043bae01:T=1713111545:RT=1713111545:S=AA-Afjak04XLL-3r2QGac86IgZ5W;_clsk=xn6u0n%7C1713112855925%7C3%7C1%7Ch.clarity.ms%2Fcollect;Hm_lvt_398913ed58c9e7dfe9695953fb7b6799=1713111487,1713115404,1713154098;Hm_lpvt_398913ed58c9e7dfe9695953fb7b6799=1713154098'
        }
        return headers

    def download_image(url, filename):
        # stream=True lets us write the image to disk in chunks
        response = requests.get(url, headers=create_headers(), stream=True)
        if response.status_code == 200:
            with open(filename + '.jpg', 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)

    def get_content_and_download_images(page):
        # Page 1 has no numeric suffix in the URL
        if page == 1:
            url = 'https://sc.chinaz.com/tupian/shanshuitupian.html'
        else:
            url = 'https://sc.chinaz.com/tupian/shanshuitupian_' + str(page) + '.html'
        headers = create_headers()
        response = requests.get(url, headers=headers)

        soup = BeautifulSoup(response.text, 'lxml')
        item_elements = soup.select('.item.masonry-brick img')

        for i, element in enumerate(item_elements):
            data_url = element['data-original']
            # data-original is protocol-relative ("//..."), so add the scheme
            if data_url.startswith('//'):
                data_url = 'https:' + data_url
            # Unique name per page and per image, saved under images/
            filename = os.path.join('images', f'image_{page}_{i}')
            download_image(data_url, filename)

    if __name__ == '__main__':
        start_page = int(input('Enter start page: '))
        end_page = int(input('Enter end page: '))

        # Create the output directory for the images if it does not exist
        if not os.path.exists('images'):
            os.makedirs('images')

        for page in range(start_page, end_page + 1):
            print(f'Downloading page {page}...')
            get_content_and_download_images(page)
            print(f'Page {page} done.')

    This code makes the following changes:

    1. Uses the requests library instead of urllib.request for network requests.
    2. Adds a create_headers() function to manage the request headers.
    3. Adds a download_image() function dedicated to downloading a single image; it streams the response (stream=True) and writes it in 1024-byte chunks, so images are not held in memory all at once.
    4. In get_content_and_download_images(), parses the HTML with BeautifulSoup, extracts the image URLs, and calls download_image() for each one; filenames are unique per page and land in the images/ directory.
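
    One caveat worth checking if the selector comes back empty: on some pages the masonry-brick class is added by JavaScript after the page renders, in which case it never appears in the raw HTML that requests fetches, and a selector requiring it matches nothing (which would also explain the original symptom of "no error, no downloads"). A quick diagnostic sketch, assuming the listing URL above:

    import requests
    from bs4 import BeautifulSoup

    url = 'https://sc.chinaz.com/tupian/shanshuitupian.html'
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
    soup = BeautifulSoup(html, 'lxml')

    # Compare how many <img> tags each selector finds in the *raw* HTML.
    # If the stricter selector finds 0, relax it to '.item img'.
    print(len(soup.select('.item.masonry-brick img')))
    print(len(soup.select('.item img')))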

    Make sure the beautifulsoup4 and requests libraries are installed before running this code. If they are not, install them with:

    pip install beautifulsoup4 requests
    

    Now you can enter the start and end page numbers at the prompts and the program will download the images. If you run into any errors, let me know the exact error message so we can troubleshoot further.

