m0_63027072 2023-07-11 14:37 采纳率: 0%
浏览 73

cuda运算时报错:an illegal memory access was encountered

在使用cuda计算时,在DeviceToHost代码处报错:" an illegal memory access was encountered "。

float Forward_Solution_t(double* lattice, double* lattice_result_pointer, unsigned int size, unsigned int ponit, unsigned int velue ,
                        double alpha[], double a[], double d[])
{
    // CUDA事件对象
    cudaEvent_t start, end;
    // 创建CUDA事件
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    // 记录开始时间
    cudaEventRecord(start, 0);

    //将数据分层处理
    unsigned int tier_c;
    if (ponit < 2000) {
        tier_c = ponit;
    }
    else {
        tier_c = 2000;//每层处理2000组数据 tier_c*velue
    }
    int data = tier_c*velue;//每层处理的数据量
    
    //DH参数
    double* d_alpha;
    double*    d_a;
    double*    d_d;
    //copy data
    double* lattice_tier_in;//临时数组(输入)
    double* lattice_tier_out;//临时数组(输出)
    double* data_in;//临时数组(传入)
    double* data_out;//临时数组(传出)
    unsigned int tier;//层数
    double* data_temp;//临时数组(存储数据)
    //分配线程块
    int gridSize;//需要块数量
    int    blockSize;//需要线程数
    unsigned int thread = 800;//每个块分配的线程
    //
    bool tier_bool = false;

    //分配内存
        data = tier_c*velue;//每层处理的数据量
        lattice_tier_in = new double[data];
        lattice_tier_out = new double[data];
        data_temp = new double[data];
        //分配设备内存
        cudaMalloc((void**)&data_in, data * sizeof(double));
        cudaMalloc((void**)&data_out, data * sizeof(double));
        cudaMalloc((void**)&d_alpha, velue * sizeof(double));
        cudaMalloc((void**)&d_a, velue * sizeof(double));
        cudaMalloc((void**)&d_d, velue * sizeof(double));

        //分配线程块
        if (tier_c>thread) {
            gridSize = (tier_c + thread - 1) / thread;
            blockSize = thread;
        }
        else {
            blockSize = tier_c;
            gridSize = 1;

        }

    //循环处理数据
    int loop_v = (ponit + tier_c - 1) / tier_c;//循环变量
    for (tier = 0; tier < loop_v; tier++) {
        tier_bool = (tier == (ponit / tier_c));//是否为最后一层
        if (tier_bool) {
            //释放内存
            delete[] lattice_tier_in;
            lattice_tier_in = nullptr;
            delete[] lattice_tier_out;
            lattice_tier_out = nullptr;
            delete[] data_temp;
            //device
            cudaFree(data_in);
            data_in = nullptr;
            cudaFree(data_out);
            data_out = nullptr;
            //最后一层
            tier_c = ponit%tier_c;//最后一层的c组数据
            data = tier_c*velue;//最后一层的数据量
            lattice_tier_in = new double[data];
            lattice_tier_out = new double[data];
            data_temp = new double[data];
            //分配设备内存
            cudaMalloc((void**)&data_in, data * sizeof(double));
            cudaMalloc((void**)&data_out, data * sizeof(double));
            cudaMalloc((void**)&d_alpha, velue * sizeof(double));
            cudaMalloc((void**)&d_a, velue * sizeof(double));
            cudaMalloc((void**)&d_d, velue * sizeof(double));

            //分配线程块
            if (tier_c>thread) {
                gridSize = (tier_c + thread - 1) / thread;
                blockSize = thread;
            }
            else {
                blockSize = tier_c;
                gridSize = 1;

            }
        }

        //初始化计算数组(输入)
        for (int i = 0; i < data; i++) {
            lattice_tier_in[i] = lattice[tier*2000*6 + i];
        }
        //执行调用
        
            
            //HostToDevice
            HANDLE_ERROR(cudaMemcpy(data_in, lattice_tier_in, data * sizeof(double), cudaMemcpyHostToDevice));
            HANDLE_ERROR(cudaMemcpy(data_out, lattice_tier_out, data * sizeof(double), cudaMemcpyHostToDevice));
            HANDLE_ERROR(cudaMemcpy(d_alpha, alpha, velue * sizeof(double), cudaMemcpyHostToDevice));
            HANDLE_ERROR(cudaMemcpy(d_a, a, velue * sizeof(double), cudaMemcpyHostToDevice));
            HANDLE_ERROR(cudaMemcpy(d_d, d, velue * sizeof(double), cudaMemcpyHostToDevice));

            
            
            //开启并行主函数
            Forward_Solution_kernel << <gridSize, blockSize >> >(data_in, data_out, tier_c, d_alpha, d_a , d_d);
            //最近一次核函数异常
            cudaError_t error = cudaGetLastError();
            printf("CUDA error: %s %d\n", cudaGetErrorString(error),tier);
            // 等待设备上的所有任务完成
            cudaDeviceSynchronize();
            //cudaMemcpyDeviceToHost
            HANDLE_ERROR(cudaMemcpy(lattice_tier_out, data_out, data * sizeof(double), cudaMemcpyDeviceToHost));

            //存储处理后的数据
            for (int i = 0; i < data; i++) {
                data_temp[i] = lattice_tier_out[i];
            }
            for (int i = 0; i < data; i++) {
                lattice_result_pointer[tier*data + i] = data_temp[i];
            }
        
    }//循环结束

    //释放内存
    delete[] lattice_tier_in;
    lattice_tier_in = nullptr;
    delete[] lattice_tier_out;
    lattice_tier_out = nullptr;
    delete[] data_temp;
    //host
    /*cudaFreeHost(lattice_tier_in);
    cudaFreeHost(lattice_tier_out);*/
    //device
    cudaFree(data_in);
    data_in = nullptr;
    cudaFree(data_out);
    data_out = nullptr;


     // 记录结束时间
    cudaEventRecord(end, 0);
    cudaEventSynchronize(end);
    // 计算时间差(以毫秒为单位)
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, end);

    // 释放CUDA事件
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    return elapsedTime;
}


数据量只能增大到5000,再增加就会报错。一直在找内存的问题,写循环分组也是为了找问题,最开始只能运算2000组数据,没想到分组还是会报错。
以下是10000组数据时当前代码的报错信息

img

其中474与458--->462为:

//cudaMemcpyDeviceToHost
HANDLE_ERROR(cudaMemcpy(lattice_tier_out, data_out, data * sizeof(double), cudaMemcpyDeviceToHost));
//HostToDevice
HANDLE_ERROR(cudaMemcpy(data_in, lattice_tier_in, data * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(data_out, lattice_tier_out, data * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_alpha, alpha, velue * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_a, a, velue * sizeof(double), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(d_d, d, velue * sizeof(double), cudaMemcpyHostToDevice));

前两次循环能导出结果且是正确的,第三次出了问题
作为初学者想知道对此问题的原因和解决办法

  • 写回答

1条回答 默认 最新

  • 赵4老师 2023-07-11 16:33
    关注
    评论

报告相同问题?

问题事件

  • 创建了问题 7月11日

悬赏问题

  • ¥15 三分类机器学习模型可视化分析
  • ¥15 本地测试网站127.0.0.1 已拒绝连接,如何解决?(标签-ubuntu)
  • ¥50 Qt在release捕获异常并跟踪堆栈(有Demo,跑一下环境再回答)
  • ¥30 python,LLM 文本提炼
  • ¥15 关于将inet引入的相关问题
  • ¥15 关于一个倒计时的操作和显示设计
  • ¥15 提问STK的问题,哪位航天领域的同学会啊
  • ¥15 苹果系统的mac m1芯片的笔记本使用ce修改器使用不了
  • ¥15 单相逆变的电压电流双闭环中进行低通滤波PID算法改进
  • ¥15 关于#java#的问题,请各位专家解答!