__global__ void gpuRecursiveReduce (int *g_idata, int *g_odata,
unsigned int isize)
{
// set thread ID
unsigned int tid = threadIdx.x;
// convert global data pointer to the local pointer of this block
int *idata = g_idata + blockIdx.x * blockDim.x;
int *odata = &g_odata[blockIdx.x];
// stop condition
if (isize == 2 && tid == 0)
{
g_odata[blockIdx.x] = idata[0] + idata[1];
return;
}
// nested invocation
int istride = isize >> 1;
if(istride > 1 && tid < istride)
{
// in place reduction
idata[tid] += idata[tid + istride];
}
// sync at block level
__syncthreads();
// nested invocation to generate child grids
if(tid == 0)
{
gpuRecursiveReduce<<<1, istride>>>(idata, odata, istride);
// sync all child grids launched in this block
cudaDeviceSynchronize();
}
// sync at block level again
__syncthreads();
}
// main from here
int main(int argc, char **argv)
{
。。。。。
gpuRecursiveReduce<<<grid, block>>>(d_idata, d_odata, block.x);
。。。。。。
}
make
nvcc -O2 -arch=sm_86 -o nestedHelloWorld nestedHelloWorld.cu -lcudadevrt --relocatable-device-code true
nvcc -O2 -arch=sm_86 -o nestedReduce nestedReduce.cu -lcudadevrt --relocatable-device-code true
nestedReduce.cu(94): error: calling a host function("cudaDeviceSynchronize") from a global function("gpuRecursiveReduce") is not allowed
nestedReduce.cu(94): error: identifier "cudaDeviceSynchronize" is undefined in device code
2 errors detected in the compilation of "nestedReduce.cu".
make: *** [Makefile:8:nestedReduce] 错误 1
一本书里的示例代码,请问这里为啥有错?应该如何修改才行。gpuRecursiveReduce 既被host端调用,同时进入device里后,在device里递归调用,所以又被device调用,只能定义成__global__,还能怎么改呢?