刚开始接触CUDA,C++也不熟悉,想把一个循环用GPU并行加速。
有这么一个循环:
for (int i = 0; i < nbint; i++)
{
_theta = (0.5 + i)*delta;
for (int j = 0; j < nbinp; j++)
{
_phi = (0.5 + j)*delta;
_l = sin(_theta* PI / 180.0)*cos(_phi* PI / 180.0);
_m = sin(_theta* PI / 180.0)*sin(_phi* PI / 180.0);
_n = cos(_theta* PI / 180.0);
for (int k = 0; k < hits; k++)
{
_k = -(_l*x[k] + _m*y[k] + _n*z[k]);
t[k] = a[k] - _k;
}
sort(t,0,hits-1)
}
}
变幻完成后对这nbint*nbinp个数组排序。我想并行处理这么多数组。下面是我的核函数部分:
__global__ void boot(float *theta, float *phi, float *h, float *t)
{
using namespace std;
int nbint = 2, nbinp = 2, hits = 20, delta = 5;
float l, m, n = 0;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < nbinp*nbint)
{
for (int i = 0; i < nbint; i++)
{
*theta = (0.5 + i)*delta;
for (int j = 0; j < nbinp; j++)
{
*phi = (0.5 + j)*delta;
l = sin((*theta)* PI / 180.0)*cos((*phi)* PI / 180.0);
m = sin((*theta)* PI / 180.0)*sin((*phi)* PI / 180.0);
n = cos((*theta)* PI / 180.0);
for (int k = 0; k < hits; k++)
{
float *x = new float[k];
float *y = new float[k];
float *z = new float[k];
float *a = new float[k];
*h = -(l*x[k] + m*y[k] + n*z[k]);
t[k] = a[k] - *h;
}
GPUqsort << < 1, 1 >> >(t, 0, hits-1);
}
}
}
}
结果不对。。请大神们给点指导~~~