核函数定义:
/**
 * Element-wise first stage of the local binary fit computation.
 *
 * For every element at flat index pos:
 *     Hu_gpu[pos] = 0.5 * (1 + (2 / PI) * atan(phi_gpu[pos] / epsilon))
 *     I_gpu[pos]  = Hu_gpu[pos] * img_gpu[pos]
 *
 * Expected launch layout (this is what the indexing below assumes):
 *     blockDim.x = width   -> threadIdx.x is the x coordinate
 *     gridDim.x  = height  -> blockIdx.x  is the y coordinate
 *     gridDim.y  = depth   -> blockIdx.y  is the z coordinate
 * Because one block covers one full row, width must not exceed the device's
 * maximum number of threads per block. No shared memory is used.
 *
 * @param Hu_gpu   output, width*height*depth floats
 * @param I_gpu    output, width*height*depth floats
 * @param img_gpu  input,  width*height*depth floats
 * @param phi_gpu  input,  width*height*depth floats
 * @param epsilon  divisor applied to phi before atan (must be non-zero)
 * @param width    extent in x
 * @param height   extent in y
 * @param depth    extent in z
 */
__global__
void local_binary_fit_part1_kernel(
    float *Hu_gpu,
    float *I_gpu,
    const float *img_gpu,
    const float *phi_gpu,
    const float epsilon,
    const int width,
    const int height,
    const int depth
)
{
    const int x = threadIdx.x;
    const int y = blockIdx.x;
    const int z = blockIdx.y;

    // Guard against launch configurations larger than the data extents so
    // surplus threads never touch memory outside the buffers.
    if (x >= width || y >= height || z >= depth) {
        return;
    }

    // Compute the flat index in size_t: width*height*depth may not fit in int.
    const size_t pos = (size_t)x + ((size_t)y + (size_t)z * (size_t)height) * (size_t)width;

    // Keep the whole expression in single precision (float literals, atanf)
    // instead of silently promoting to double.
    const float hu = 0.5f * (1.0f + (2.0f / (float)PI) * atanf(phi_gpu[pos] / epsilon));

    Hu_gpu[pos] = hu;
    I_gpu[pos] = hu * img_gpu[pos];
}
在 C 函数中调用该核函数:
// Split the volume into two slabs along z and process each slab on its own
// stream: upload phi and img for the slab, then run the kernel on that slab.
// NOTE(review): the copies can only overlap with kernel execution if phi and
// img are page-locked (pinned) host buffers -- confirm how they are allocated.
// NOTE(review): return codes of the CUDA calls below are still ignored; add
// checking (including cudaGetLastError() after each launch) according to the
// enclosing function's error-handling convention.
cudaStream_t stream0, stream1;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);

int div1 = depth / 2;       // z slices handled on stream0
int div2 = depth - div1;    // z slices handled on stream1
dim3 grid1(height, div1, 1);
dim3 grid2(height, div2, 1);

// Do all extent arithmetic in size_t so large volumes do not overflow int
// before the multiplication by sizeof(float).
const size_t sliceElems = (size_t)width * (size_t)height;
const size_t slab1Elems = (size_t)div1 * sliceElems;
const size_t slab2Elems = (size_t)div2 * sliceElems;
const size_t slab1Bytes = slab1Elems * sizeof(float);
const size_t slab2Bytes = slab2Elems * sizeof(float);

// Uploads are issued in the same order as before; within each stream the
// copies are ordered ahead of that stream's kernel launch.
cudaMemcpyAsync(phi_gpu, phi, slab1Bytes, cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(phi_gpu + slab1Elems, phi + slab1Elems, slab2Bytes, cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(img_gpu, img, slab1Bytes, cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(img_gpu + slab1Elems, img + slab1Elems, slab2Bytes, cudaMemcpyHostToDevice, stream1);

// A grid dimension of 0 is an invalid launch configuration (this happens for
// the first slab when depth == 1), so skip launches for empty slabs.
// Each launch is given the depth of its own slab because its pointers are
// already offset to the start of that slab.
// The block size is width, so width must not exceed the device's maximum
// number of threads per block.
if (div1 > 0) {
    local_binary_fit_part1_kernel<<<grid1, width, 0, stream0>>>(
        Hu_gpu, I_gpu, img_gpu, phi_gpu,
        epsilon, width, height, div1);
}
if (div2 > 0) {
    local_binary_fit_part1_kernel<<<grid2, width, 0, stream1>>>(
        Hu_gpu + slab1Elems, I_gpu + slab1Elems,
        img_gpu + slab1Elems, phi_gpu + slab1Elems,
        epsilon, width, height, div2);
}

// Wait for both slabs to finish before releasing the streams.
cudaStreamSynchronize(stream0);
cudaStreamSynchronize(stream1);
cudaStreamDestroy(stream0);
cudaStreamDestroy(stream1);
编译时出现如下错误:
error : unrecognized token
如果把核函数调用改成(去掉共享内存大小和流这两个参数):
local_binary_fit_part1_kernel <<<grid1, width>>>(Hu_gpu, I_gpu, img_gpu, phi_gpu, epsilon, width, height, depth);
则能通过编译。这是什么原因?