// GPU-side state for this routine: a single device int, the cuBLAS handle,
// and CUDA events intended for timing the cuBLAS / CULA phases.
void *device_numofduanyuan;
cublasHandle_t handler;
cudaEvent_t cublas_start,cublas_stop,cula_start,cula_stop;
clock_t begin,end;
// Host CPU-time stamp taken before any GPU setup.
begin=clock();
cudaEventCreate(&cublas_start);
cudaEventCreate(&cublas_stop);
cudaEventCreate(&cula_start);
cudaEventCreate(&cula_stop);
// Create the cuBLAS context. `handler` is passed to cublasSnrm2 further down
// and was previously used without ever being initialized.
// NOTE(review): confirm a matching cublasDestroy(handler) exists at the end of
// this function, and consider checking the returned cublasStatus_t.
cublasCreate(&handler);
// Allocate one int on the device and zero it.
cudaMalloc((void **)&device_numofduanyuan,sizeof(int));
cudaMemset(device_numofduanyuan,0,sizeof(int));
if (type==3||type==0||type==1||type==2)
{
// Earlier raw-buffer version, kept for reference:
//float *matVt,*matMt,*matOMEGA;
//matVt=(float *)malloc(sizeof(float)*bands*width); //matVt: one image row of data across all bands
//matMt=(float *)malloc(sizeof(float)*height*width);
//matOMEGA=(float *)malloc(sizeof(float)*bands*numofduanyuan);

// Host staging matrices, initialized with (bands, width), (height, width)
// and (bands, numofduanyuan) respectively.
matrix_f matVt,matMt,matOMEGA;
Init_fmatrix(matVt,bands,width);
Init_fmatrix(matMt,height,width);
Init_fmatrix(matOMEGA,bands,numofduanyuan);
// The raw input buffer is read as floats.
float *temp_data=(float *)host_data;
float *dev_matVt,*dev_matMt,*dev_matOMEGA;
// dev_matVt receives sizeof(float)*width*bands bytes per upload in the row
// loop below, so it must be sized for bands*width floats (it was previously
// allocated with only `bands` floats, which the upload overran).
cudaMalloc((void**)&dev_matVt,sizeof(float)*bands*width);
cudaMalloc((void**)&dev_matMt,sizeof(float)*height*width);
cudaMalloc((void**)&dev_matOMEGA,sizeof(float)*bands*numofduanyuan);
#pragma unroll
for (int i=0;i<height;i++)
{
// Gather image row `i` from every band of the input into matVt
// (one matVt row per band, one column per pixel in the image row).
for (int band=0;band<bands;++band)
{
// Offset of row i inside this band's width*height plane of temp_data.
const int rowStart=i*width+band*(width*height);
for (int col=0;col<width;++col)
{
matVt.mat[band][col]=temp_data[rowStart+col];
}
}
// Upload the row block gathered above (width*bands floats) to dev_matVt.
// NOTE(review): the source is matVt.mat, which is indexed as mat[j][k] above;
// if mat is an array of row pointers rather than one contiguous buffer, this
// copies pointer values instead of pixel data -- confirm matrix_f's layout.
// NOTE(review): verify dev_matVt's allocation is at least width*bands floats.
cudaMemcpy(dev_matVt,matVt.mat,sizeof(float)*width*bands,cudaMemcpyHostToDevice);
// Upload matMt (height*width floats) to dev_matMt.
// NOTE(review): matMt is not written anywhere in the visible part of this
// loop, so this transfer looks loop-invariant -- confirm whether it can be
// done once before the loop. The same layout question as above applies to
// matMt.mat.
cudaMemcpy(dev_matMt,matMt.mat,sizeof(float)*height*width,cudaMemcpyHostToDevice);
// Euclidean norm over `bands` elements of dev_matVt, result written through
// dev_matMt.
// NOTE(review): incx is passed as sizeof(float); cuBLAS documents incx as a
// stride in elements, not bytes, so this reads every 4th float -- confirm the
// intended stride (1, or width for a per-pixel spectrum?).
// NOTE(review): the result pointer is device memory, which presumably needs
// CUBLAS_POINTER_MODE_DEVICE set on the handle -- confirm. `handler` must be
// a created cuBLAS handle at this point, and the returned status is ignored.
cublasSnrm2(handler,bands,dev_matVt,sizeof(float),dev_matMt);