pyopencl内核代码报错
主机代码
buffer_ptrs = np.zeros(self.num_ptrs, dtype=np.int32)
cl.enqueue_copy(queue, buffer_ptrs, np.array([mem.value for mem in self.fg_img_ptr_lis]))
img_info = np.array(self.img_info, dtype=np.int32)
img_info = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=img_info.tobytes())
self.program.overlay(queue, (1232, 688), local_size, self.bg_img.buf, buffer_ptrs, img_info, np.int32(self.num_ptrs)).wait()
self.fg_img_ptr_lis = []
self.img_info = []
self.num_ptrs = 0
内核代码
__kernel void overlay(__global float4 *bg, __global float4 *ptrs, __global int *img_info, int num_ptrs) {
int x = get_global_id(0);
int y = get_global_id(1);
int bg_index = (y * 1232 + x);
for (int i = 0; i < num_ptrs; i++) {
int left = img_info[i * 4];
int top = img_info[i * 4 + 1];
int width = img_info[i * 4 + 2];
int height = img_info[i * 4 + 3];
if (x >= left && x < left + width && y >= top && y < top + height) {
int img_index = ((y - top) * width + (x - left));
printf("img_index: %d\\n", img_index);
__global float4 *img = (__global float4 *)((float4*)ptrs)[i];
// 打印img
float4 bg_pixel = bg[bg_index];
float4 img_pixel = img[img_index];
printf("img_pixel: %f, %f, %f, %f\\n", img_pixel.x, img_pixel.y, img_pixel.z, img_pixel.w);
float alpha = img_pixel.w / 255.0f;
float inv_alpha = 1.0f - alpha;
bg[bg_index] = (float4)(
bg_pixel.x * inv_alpha + img_pixel.x * alpha,
bg_pixel.y * inv_alpha + img_pixel.y * alpha,
bg_pixel.z * inv_alpha + img_pixel.z * alpha,
bg_pixel.w
);
}
}
}
主要问题出在ptrs的传参上,ptrs是一个数组,数组内部包含了100个buffer类型的数据
报错内容:
Build on <pyopencl.Device 'GeForce GT 740M' on 'NVIDIA CUDA' at 0x1eabfa5a1f0>:
<kernel>:13:65: error: Explicit cast from address space "global" to address space "private" is not allowed
__global float4 *img = (__global float4 *)((float4*)ptrs)[i];
^~~~
<kernel>:13:36: error: invalid conversion between vector type 'float4' and scalar type '__global float4 *'
__global float4 *img = (__global float4 *)((float4*)ptrs)[i];
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~