自己写的核函数,功能是实现YUV格式的图像转RGB格式的图像,基本上是纯数学计算,感觉执行速度太慢了,不知如何优化?
第一个核函数代码如下:
__global__ void WholeUyvyToRgbKernel(uint8_t *yuv_buffer, uint8_t *rgb_buffer) {
int bSize = blockDim.z * blockDim.y * blockDim.x;
int bIndex = blockIdx.z * gridDim.x * gridDim.y + \
blockIdx.y * gridDim.x + \
blockIdx.x;
int tIndex = threadIdx.z * blockDim.x * blockDim.y + \
threadIdx.y * blockDim.x + \
threadIdx.x;
int index = bIndex * bSize + tIndex;
//printf("block idx: %3d, thread idx in block: %3d, thread idx, %3d\n",
// bIndex, tIndex, index);
uint8_t *yuv_start_ptr = yuv_buffer + index * 4;
uint8_t *rgb_start_ptr = rgb_buffer + index * 6;
int U = *(yuv_start_ptr) - 128;
int Y1 = *(yuv_start_ptr + 1);
int V = *(yuv_start_ptr + 2) - 128;
int Y2 = *(yuv_start_ptr + 3);
int R1 = Y1 + 1.4075 * V;
int G1 = Y1 - 0.3455 * U - 0.7169 * V;
int B1 = Y1 + 1.779 * U;
int R2 = Y2 + 1.4075 * V;
int G2 = Y2 - 0.3455 * U - 0.7169 * V;
int B2 = Y2 + 1.779 * U;
*(rgb_start_ptr + 0) = min(max(R1, 0), 255);
*(rgb_start_ptr + 1) = min(max(G1, 0), 255);
*(rgb_start_ptr + 2) = min(max(B1, 0), 255);
*(rgb_start_ptr + 3) = min(max(R2, 0), 255);
*(rgb_start_ptr + 4) = min(max(G2, 0), 255);
*(rgb_start_ptr + 5) = min(max(B2, 0), 255);
}
第一个核函数调用代码:
dim3 block_size_dim(32, 32, 1);
dim3 grid_size_dim = dim3(2616, 1, 1); // 实际执行中都是固定宽高的图像
WholeUyvyToRgbKernel <<< grid_size_dim, block_size_dim, 0, stream >>>(WholeUyvyBufferDevice, WholeRgbBufferDevice);
第二个核函数代码如下:
__global__ void SubYuvToRgbKernel(uint8_t *sub_buffer_device, int sub_width) {
int position = blockDim.x * blockIdx.x + threadIdx.x;
int col_index = position % sub_width;
int row_index = position / sub_width;
int sub_rgb_index = row_index * sub_width + col_index;
auto y = sub_buffer_device[sub_rgb_index * 3 + 0];
auto u = sub_buffer_device[sub_rgb_index * 3 + 1] - 128;
auto v = sub_buffer_device[sub_rgb_index * 3 + 2] - 128;
auto r = static_cast<int>(y + 1.4075 * v);
auto g = static_cast<int>(y - 0.3455 * u - 0.7169 * v);
auto b = static_cast<int>(y + 1.779 * u);
sub_buffer_device[sub_rgb_index * 3 + 0] = static_cast<unsigned char>(min(max(r, 0), 255));
sub_buffer_device[sub_rgb_index * 3 + 1] = static_cast<unsigned char>(min(max(g, 0), 255));
sub_buffer_device[sub_rgb_index * 3 + 2] = static_cast<unsigned char>(min(max(b, 0), 255));
}
第二个核函数调用代码:
int threads = (std::min)(src_ width, src_height); // 两个变量 src_ width, src_height 的取值范围均大于64小于512,一般在128左右
int blocks = (std::max)(src_width, src_height);
SubYuvToRgbKernel <<<blocks, threads, 0, stream >>>(SubBufferDevice, src_width);
输入的uint8_t *类型的参数,都是从内存使用cudaMemCpy到显存的,好像是在Global Memory上,没做其他处理。可否指点一下,如何加速? 先行谢过。