PyCUDA报错 PyCUDA WARNING: a clean-up operation failed


PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
Traceback (most recent call last):
  File "mapHE4b.py", line 102, in merge_diamond_images_pycuda
    image_gpu = cuda.mem_alloc(image.nbytes)
pycuda._driver.LogicError: cuMemAlloc failed: an illegal memory access was encountered

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "mapHE4b.py", line 153, in <module>
  File "mapHE4b.py", line 135, in merge_diamond_images_pycuda
    except cuda.CudaError as e:
AttributeError: module 'pycuda.driver' has no attribute 'CudaError'
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuModuleUnload failed: an illegal memory access was encountered



import subprocess
import sys
import time
import os
import numpy as np
import cv2
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

def check_and_install_package(package_name, import_name=None):
    """Make sure a package is importable, installing it with pip if it is not.

    Args:
        package_name: Distribution name handed to ``pip install``.
        import_name: Module name to try importing. Defaults to
            ``package_name`` for packages whose two names are the same.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    import importlib

    if import_name is None:
        import_name = package_name
    try:
        importlib.import_module(import_name)
    except ImportError:
        print(f"{package_name} not found, installing...")
        # Run pip through the current interpreter so the package lands in the
        # same environment this script is running in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

# Ensure necessary packages are installed

# CUDA kernel for image merging.
# One thread handles one pixel of the source image: it alpha-blends the three
# colour channels onto the canvas at (x + x_offset, y + y_offset) and keeps the
# larger of the two alpha values. Both buffers are addressed as 4 bytes per
# pixel; pixels that fall outside the canvas are skipped.
cuda_code = """
__global__ void merge_images(unsigned char* canvas, const unsigned char* image,
                             int canvas_width, int canvas_height,
                             int image_width, int image_height,
                             int x_offset, int y_offset) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= image_width || y >= image_height) {
        return;
    }
    int canvas_x = x + x_offset;
    int canvas_y = y + y_offset;
    if (canvas_x < 0 || canvas_x >= canvas_width ||
        canvas_y < 0 || canvas_y >= canvas_height) {
        return;
    }
    // Byte offsets are computed in size_t: width * height * 4 no longer fits
    // in a 32-bit int once the canvas exceeds 2 GiB, and a wrapped offset
    // makes the kernel touch memory outside the buffers.
    size_t canvas_idx = ((size_t)canvas_y * canvas_width + canvas_x) * 4;
    size_t image_idx = ((size_t)y * image_width + x) * 4;
    float alpha = image[image_idx + 3] / 255.0f;
    for (int c = 0; c < 3; c++) {
        canvas[canvas_idx + c] = (unsigned char)(
            alpha * image[image_idx + c] +
            (1 - alpha) * canvas[canvas_idx + c]
        );
    }
    canvas[canvas_idx + 3] = max(canvas[canvas_idx + 3], image[image_idx + 3]);
}
"""

# Compile the CUDA kernel
# This runs at module level: the source held in `cuda_code` is compiled with
# SourceModule, and the kernel named "merge_images" is looked up so it can be
# called from Python.
mod = SourceModule(cuda_code)
merge_images = mod.get_function("merge_images")

def _parse_tile_coords(filename, map_id):
    """Return ``(n, m)`` for a ``{map_id}_{n}_{m}.png`` file name, else ``None``."""
    parts = filename.split('_')
    if len(parts) != 3 or parts[0] != str(map_id) or not filename.endswith('.png'):
        return None
    try:
        return int(parts[1]), int(parts[2].split('.')[0])
    except ValueError:
        return None


def merge_diamond_images_pycuda(map_id):
    """Merge the tiles of one map into ``{map_id}_all.png`` using the GPU.

    Tiles are the files named ``{map_id}_{N}_{M}.png`` in the current working
    directory. Each tile is uploaded to the GPU and blended onto a shared
    BGRA canvas by the ``merge_images`` kernel; the finished canvas is copied
    back to the host and written with ``cv2.imwrite``.

    Args:
        map_id: Map identifier that forms the first ``_``-separated part of
            the tile file names.

    Returns:
        None. Progress and errors are reported with ``print``; CUDA and other
        exceptions raised while merging are caught and not re-raised.
    """
    start_time = time.time()

    # Collect the tiles of this map keyed by their (N, M) coordinates. The
    # whole first name component must equal map_id, so map "1" does not pick
    # up the tiles of map "10". Sorting keeps the choice deterministic if two
    # names parse to the same coordinates.
    tiles = {}
    for name in sorted(os.listdir('.')):
        coords = _parse_tile_coords(name, map_id)
        if coords is not None:
            tiles[coords] = name

    # If there are no valid image files, skip processing
    if not tiles:
        print("No valid image files found, skipping processing")
        return

    # Extract unique N and M values
    n_values = sorted({n for n, _ in tiles})
    m_values = sorted({m for _, m in tiles})
    sN = len(n_values)
    sM = len(m_values)

    # Calculate overall canvas size
    canvas_width = int((sN + sM) / 2 * 6144)
    canvas_height = int((sN + sM) / 2 * 3072)
    canvas_bytes = canvas_width * canvas_height * 4

    canvas = None
    try:
        # Create a zero-filled canvas on the GPU
        canvas = cuda.mem_alloc(canvas_bytes)
        cuda.memset_d8(canvas, 0, canvas_bytes)

        block_dim = (32, 32, 1)
        # Draw diamond images, highest N and M first, so that tiles with
        # lower indices are blended on top of them.
        for n_idx in reversed(range(sN)):
            for m_idx in reversed(range(sM)):
                filename = tiles.get((n_values[n_idx], m_values[m_idx]))
                if filename is None:
                    continue
                image = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
                if image is None:
                    # cv2.imread signals an unreadable file by returning None.
                    print(f"Could not read {filename}, skipping")
                    continue
                # The kernel reads 4 bytes per pixel, so bring every tile to BGRA.
                if image.ndim == 2:
                    image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGRA)
                elif image.shape[2] == 3:
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2BGRA)

                x = (n_idx + m_idx) * 3072 - 768
                y = (sN - 1 - n_idx + m_idx) * 1536 - 792

                # Allocate memory on GPU for the image
                image_gpu = cuda.mem_alloc(image.nbytes)
                cuda.memcpy_htod(image_gpu, image)

                # One thread per tile pixel, rounded up to whole blocks
                grid_dim = ((image.shape[1] + block_dim[0] - 1) // block_dim[0],
                            (image.shape[0] + block_dim[1] - 1) // block_dim[1])
                merge_images(canvas, image_gpu,
                             np.int32(canvas_width), np.int32(canvas_height),
                             np.int32(image.shape[1]), np.int32(image.shape[0]),
                             np.int32(x), np.int32(y),
                             block=block_dim, grid=grid_dim)
                # Wait for the kernel so that a fault is reported here and
                # not by a later, unrelated allocation or copy.
                cuda.Context.synchronize()

                # Free GPU memory for the image right away instead of waiting
                # for the allocation object to be garbage-collected.
                image_gpu.free()

        # Copy result back to CPU; the buffer is fully overwritten by the copy
        result = np.empty((canvas_height, canvas_width, 4), dtype=np.uint8)
        cuda.memcpy_dtoh(result, canvas)

        # Save the result
        output_filename = f"{map_id}_all.png"
        cv2.imwrite(output_filename, result)

        elapsed_time = time.time() - start_time
        print(f"Merge complete, saved as {output_filename}")
        print(f"Processing time: {elapsed_time:.2f} seconds")
    except cuda.MemoryError:
        print("Insufficient GPU memory. Try using smaller images or increasing GPU memory.")
    except cuda.Error as e:
        # pycuda.driver has no `CudaError`; `Error` is the base class of the
        # exceptions raised by the driver wrapper.
        print(f"CUDA error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Free GPU memory
        if canvas is not None:
            try:
                canvas.free()
            except cuda.Error as e:
                # Freeing can itself fail once the context is broken; report
                # it rather than let it replace the original error.
                print(f"Failed to free canvas memory: {e}")

# Get all unique map_ids: the part of every .png file name before the first "_"
map_ids = sorted({f.split('_')[0] for f in os.listdir('.') if f.endswith('.png')})

# Process all map_ids or a specified map_id
map_id = None  # Process images for all map_ids; set a value to merge one map only
if map_id:
    merge_diamond_images_pycuda(map_id)
else:
    for current_id in map_ids:
        merge_diamond_images_pycuda(current_id)


    为了解决这个问题,我们需要检查 image 和 image_gpu 是否已经被正确地初始化和分配。如果它们被错误地初始化或分配,那么每次调用 merge_images() 内核都会失败,并导致 PyCUDA 抛出异常。我们可以通过以下方式检查 image:

    # NOTE(review): this excerpt is incomplete -- the `try:` that pairs with
    # the `except TypeError:` below and the body of that handler are not shown.
    if 'image' not in locals():
            # NOTE(review): if this runs inside a function, writing into the
            # dict returned by locals() is not a reliable way to create the
            # variable `image`; a plain assignment would be -- confirm intent.
            # randint's upper bound is exclusive, so (0, 255) yields 0..254,
            # and the array built here is 2-D with shape
            # (canvas_width, canvas_height).
            locals()['image'] = np.random.randint(0, 255, (canvas_width, canvas_height)).astype(np.uint8)
        except TypeError:

    同样,我们也需要确保 image_gpu 已经正确地初始化和分配。我们可以通过以下方式检查:

    # NOTE(review): this excerpt is incomplete -- the `try:` that pairs with
    # the `except TypeError:` below and the body of that handler are not shown.
    if 'image_gpu' not in locals():
            # NOTE(review): this stores a host NumPy array under the name
            # `image_gpu`, whereas the script in the question obtains
            # `image_gpu` from cuda.mem_alloc(); presumably a device
            # allocation is what the kernel needs -- confirm before using.
            # As above, writing into locals() may not bind the name when this
            # runs inside a function.
            locals()['image_gpu'] = np.random.randint(0, 255, (canvas_width, canvas_height)).astype(np.uint8)
        except TypeError:

    在上面的代码中,numpy.random.randint(a, b, size) 是一个生成随机整数的函数,取值范围为 [a, b),即不包含上界 b。在这个例子中,randint(0, 255, ...) 生成 0 到 254 的整数;若要覆盖 uint8 的所有可能值,上界应写为 256。请注意,数组的大小(size)应该足够大,以避免因为生成的数据不足以填充 image_gpu 所需内存而失败。另外,生成的值是整数而不是浮点数,并通过 astype(np.uint8) 转换为表示每个像素颜色值的 8 位无符号整数。


    1. 将 PyCUDA 加载器更改为默认加载器,例如使用 pip 安装的已安装版本。
    2. 使用支持 GPU 加速的库(如启用了 CUDA 的 OpenCV)代替仅在 CPU 上运行的库(如 PIL 和 scikit-image)。
    3. 在代码中添加适当的错误处理和日志记录机制,以便在出现问题时能够快速定位问题。



