我是一个码盲设计师,最近发现使用CUDA比用CPU合成图片快10倍不止,
用GPT制作的PyCUDA合成图片脚本时报错:
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
Traceback (most recent call last):
File "mapHE4b.py", line 102, in merge_diamond_images_pycuda
image_gpu = cuda.mem_alloc(image.nbytes)
pycuda._driver.LogicError: cuMemAlloc failed: an illegal memory access was encountered
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "mapHE4b.py", line 153, in <module>
merge_diamond_images_pycuda(map_id)
File "mapHE4b.py", line 135, in merge_diamond_images_pycuda
except cuda.CudaError as e:
AttributeError: module 'pycuda.driver' has no attribute 'CudaError'
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuModuleUnload failed: an illegal memory access was encountered
合成图片超过3?时才会报错,图片少的时候合成没问题,硬件资源远未占满。
以下是脚本内容
import subprocess
import sys
import time
import os
import numpy as np
import cv2
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
def check_and_install_package(package_name, import_name=None):
if import_name is None:
import_name = package_name
try:
__import__(import_name)
except ImportError:
print(f"{package_name} not found, installing...")
subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
# Ensure necessary packages are installed
check_and_install_package('setuptools')
check_and_install_package('pycuda')
check_and_install_package('opencv-python')
# CUDA kernel for image merging
cuda_code = """
__global__ void merge_images(unsigned char* canvas, const unsigned char* image,
int canvas_width, int canvas_height,
int image_width, int image_height,
int x_offset, int y_offset) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < image_width && y < image_height) {
int canvas_x = x + x_offset;
int canvas_y = y + y_offset;
if (canvas_x >= 0 && canvas_x < canvas_width && canvas_y >= 0 && canvas_y < canvas_height) {
int canvas_idx = (canvas_y * canvas_width + canvas_x) * 4;
int image_idx = (y * image_width + x) * 4;
float alpha = image[image_idx + 3] / 255.0f;
for (int c = 0; c < 3; c++) {
canvas[canvas_idx + c] = (unsigned char)(
alpha * image[image_idx + c] +
(1 - alpha) * canvas[canvas_idx + c]
);
}
canvas[canvas_idx + 3] = max(canvas[canvas_idx + 3], image[image_idx + 3]);
}
}
}
"""
# Compile the CUDA kernel
mod = SourceModule(cuda_code)
merge_images = mod.get_function("merge_images")
def merge_diamond_images_pycuda(map_id):
start_time = time.time()
# Get all png files in the current directory
png_files = [f for f in os.listdir('.') if f.endswith('.png') and f.startswith(str(map_id))]
# Filter out files that do not match the format
valid_files = [f for f in png_files if len(f.split('_')) == 3]
# Extract unique N and M values
n_values = sorted(set(int(f.split('_')[1]) for f in valid_files))
m_values = sorted(set(int(f.split('_')[2].split('.')[0]) for f in valid_files))
sN = len(n_values)
sM = len(m_values)
# Calculate overall canvas size
canvas_width = int((sN + sM) / 2 * 6144)
canvas_height = int((sN + sM) / 2 * 3072)
# If there are no valid image files, skip processing
if not valid_files:
print(f"No valid image files found, skipping processing")
return
try:
# Create canvas
canvas = cuda.mem_alloc(canvas_width * canvas_height * 4)
cuda.memset_d8(canvas, 0, canvas_width * canvas_height * 4)
# Draw diamond images
for n in reversed(n_values):
for m in reversed(m_values):
filename = f"{map_id}_{n}_{m}.png"
if filename in valid_files:
image = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
if image.shape[2] == 3:
image = cv2.cvtColor(image, cv2.COLOR_BGR2BGRA)
x = (n_values.index(n) + m_values.index(m)) * 3072 - 768
y = (sN - 1 - n_values.index(n) + m_values.index(m)) * 1536 - 792
# Allocate memory on GPU for the image
image_gpu = cuda.mem_alloc(image.nbytes)
cuda.memcpy_htod(image_gpu, image)
# Set up grid and block dimensions
block_dim = (32, 32, 1)
grid_dim = ((image.shape[1] + block_dim[0] - 1) // block_dim[0],
(image.shape[0] + block_dim[1] - 1) // block_dim[1])
# Call CUDA kernel
merge_images(canvas, image_gpu, np.int32(canvas_width), np.int32(canvas_height),
np.int32(image.shape[1]), np.int32(image.shape[0]),
np.int32(x), np.int32(y),
block=block_dim, grid=grid_dim)
# Free GPU memory for the image
image_gpu.free()
# Copy result back to CPU
result = np.zeros((canvas_height, canvas_width, 4), dtype=np.uint8)
cuda.memcpy_dtoh(result, canvas)
# Save the result
output_filename = f"{map_id}_all.png"
cv2.imwrite(output_filename, result)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Merge complete, saved as {output_filename}")
print(f"Processing time: {elapsed_time:.2f} seconds")
except cuda.MemoryError:
print("Insufficient GPU memory. Try using smaller images or increasing GPU memory.")
except cuda.CudaError as e:
print(f"CUDA error: {e}")
except Exception as e:
print(f"An error occurred: {e}")
finally:
# Free GPU memory
if 'canvas' in locals():
canvas.free()
# Get all unique map_ids
map_ids = sorted(set(f.split('_')[0] for f in os.listdir('.') if f.endswith('.png')))
# Process all map_ids or a specified map_id
map_id = None # Process images for all map_ids
if map_id:
merge_diamond_images_pycuda(map_id)
else:
for map_id in map_ids:
merge_diamond_images_pycuda(map_id)
我让GPT读了很多相关的错误解决方式,但是并没有返回正确的结果。
~~