对于VisualProfiler的使用,新建NewSession,在File处选择已经编译好后出现的.exe文件,
然后点next,点finish后出现如图所示错误:
问题描述为:The application being profiled returned a non-zero return code.
百度翻译为:正在被配置的应用程序返回非零返回代码。
通过网上搜寻,可能的解决办法是:
1.内存没释放完全
2.主函数末尾需要添加cudaThreadExit();
3.主函数末尾添加cudaFree(0);
如果上述解决办法,解决了,我是不会来这里提问的。
问题就是没有解决!!!
其中,我用一段HelloWorld.cu测试,可以使用VisualProfiler
//使用CUDA的第一个独立编程。简要使用CUDA。
//功能:从主机向设备传入数组a、b,计算a、b数组的和,传入数组c中,再传回主机。
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include "device_launch_parameters.h"
#define N 200000
__global__ void Add(int *d_a, int *d_b, int *d_c);
int main()
{
int *a = (int*)malloc(sizeof(int)*N);
int *b = (int*)malloc(sizeof(int)*N);
int *c = (int*)malloc(sizeof(int)*N);
//CPU计算
for (int i = 0; i < N; i++)
{
a[i] = i;
b[i] = i;
c[i] = a[i] + b[i];
}
printf("CPU : c[N-1] = %d\n", c[N - 1]);
for (int i = 0; i < N; i++)
{
c[i] = 0;
}
printf("c[N-1] = %d\n", c[N - 1]);
int *d_a;
int *d_b;
int *d_c;
cudaMalloc((void**)&d_a, sizeof(int)*N);
cudaMalloc((void**)&d_b, sizeof(int)*N);
cudaMalloc((void**)&d_c, sizeof(int)*N);
dim3 block_add((N + 256 - 1) / 256, 1);
dim3 thread_add(256, 1);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_a, a, sizeof(int)*N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(int)*N, cudaMemcpyHostToDevice);
Add << <block_add, thread_add >> > (d_a, d_b, d_c);
cudaMemcpy(c, d_c, sizeof(int)*N, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float tm; //创建时间变量
cudaEventElapsedTime(&tm, start, stop);//把事件的时间记录下来
cudaEventDestroy(start); //销毁事件
cudaEventDestroy(stop); //销毁事件
printf("GPU Elapsed time:%.6f ms.\n", tm); //屏幕显示时间
printf("GPU : c[N-1] = %d", c[N - 1]);
free(a);
free(b);
free(c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
__global__ void Add(int *d_a, int *d_b, int *d_c)
{
const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
if (tidx < N)
d_c[tidx] = d_a[tidx] + d_b[tidx];
}
但是,使用我需要完成任务后,进行调试编译完成之后,不可以使用VisualProfiler
非常奇怪。这是我下面的一段代码:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#define C 3e8
#define pi 3.141592653589793
#define uchar unsigned char
float B = 100e6; //带宽
float fc = 10e9; //载频
float Fs = 140e6; //采样频率
float PRF = 500; //脉冲重复周期
float Rs = 10e3; //最近斜距
float Tp = 20e-6; //脉冲时宽
float H = 4e3; //平台高度
int Nan = 8192; //方位向采样点数
int Nrn = 4096; //距离向采样点数
int Nz;
float vx = 70.0, vy = 0.0, vz = 0.0, angle_equal = 0;//俯冲参数
float BeamWide_azimuth = 3.0 / 180 * pi; //雷达波束宽度
int data_nrn_new = 512, data_nan_new = 512;//从原图中截取的图像大小
float DeltaR = C / 2 / Fs; //距离采样间隔
float x_interval = DeltaR;
float y_interval = DeltaR;
float theta = 3.0 / 180 * pi;
__global__ void pos(float *d_pos_x, float *d_pos_y, float *d_pos_z, int nan,
float PRF, float vx, float vy, float vz, float Rs, float angle_equal, float H);
int main()
{
//雷达坐标
float* pos_x;
pos_x = (float*)malloc(sizeof(float)*Nan);
float* pos_y;
pos_y = (float*)malloc(sizeof(float)*Nan);
float* pos_z;
pos_z = (float*)malloc(sizeof(float)*Nan);
float *d_pos_x; //平台坐标x
cudaMalloc((void **)&d_pos_x, Nan * sizeof(float));
float *d_pos_y; //平台坐标y
cudaMalloc((void **)&d_pos_y, Nan * sizeof(float));
float *d_pos_z; //平台坐标z
cudaMalloc((void **)&d_pos_z, Nan * sizeof(float));
printf("分配空间完毕\n");
//线程分配
dim3 blocks_Pos((Nan + 512 - 1) / 512, 1);//雷达位置线程
dim3 threads_Pos(512, 1);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
pos<< <blocks_Pos, threads_Pos >> >(d_pos_x, d_pos_y, d_pos_z, Nan, PRF, vx, vy, vz, Rs, angle_equal, H);
printf("核函数计算完毕\n");
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float dt;
cudaEventElapsedTime(&dt, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("time: %.2f ms\n", dt);
cudaMemcpy(pos_x, d_pos_x, sizeof(float)*Nan, cudaMemcpyHostToDevice);
cudaMemcpy(pos_y, d_pos_y, sizeof(float)*Nan, cudaMemcpyHostToDevice);
cudaMemcpy(pos_z, d_pos_z, sizeof(float)*Nan, cudaMemcpyHostToDevice);
printf("数据传输完毕\n");
cudaFree(d_pos_x);
cudaFree(d_pos_y);
cudaFree(d_pos_z);
return 1;
}
//计算传感器实时位置
__global__ void pos(float *d_pos_x, float *d_pos_y, float *d_pos_z, int nan,
float PRF, float vx, float vy, float vz, float Rs, float angle_equal, float H)
{
const int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid<nan)
{
d_pos_x[tid] = (tid - nan / 2.0) / PRF*vx;
d_pos_y[tid] = (tid - nan / 2.0) / PRF*vy - float(sqrt(pow(double(Rs*cos(angle_equal)), 2.0) - pow(double(H), 2.0)));
d_pos_z[tid] = H;
}
}
恳请哪位大神可以帮助我看下出现了什么问题