Ck_Zheng 2021-07-09 22:13

Problem when computing with cusparseSpMV: why is the third element of Ax zero?

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include <iostream>
using namespace std;

// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

int main()
{
    // CSR format of matrix A and vector b (host)
    double* h_valA;
    int* h_csrRowPtrA;
    int* h_csrColIndA;
    double* h_b;
    int n, nnzA;
    n = 3; nnzA = 5;
    
    h_valA = (double*)malloc(nnzA * sizeof(double));
    h_csrRowPtrA = (int*)malloc((n + 1) * sizeof(int));
    h_csrColIndA = (int*)malloc(nnzA * sizeof(int));
    h_b = (double*)malloc(n * sizeof(double));

    h_valA[0] = 3.0;
    h_valA[1] = 2.0;
    h_valA[2] = 2.0;
    h_valA[3] = 2.0;
    h_valA[4] = 1.0;

    h_csrRowPtrA[0] = 0;
    h_csrRowPtrA[1] = 2;
    h_csrRowPtrA[2] = 3;
    h_csrRowPtrA[3] = 5;

    h_csrColIndA[0] = 0;
    h_csrColIndA[1] = 2;
    h_csrColIndA[2] = 1;
    h_csrColIndA[3] = 0;
    h_csrColIndA[4] = 2;

    h_b[0] = 3.5; 
    h_b[1] = 1.5; 
    h_b[2] = 2.0;
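    // Dense view of the system encoded above:
    //   A = | 3 0 2 |    b = | 3.5 |    exact solution x = | 0.50 |
    //       | 0 2 0 |        | 1.5 |                       | 0.75 |
    //       | 2 0 1 |        | 2.0 |                       | 1.00 |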

    //CSR format of matrix A and Vector b (device)
    double* valA;
    int* csrRowPtrA;
    int* csrColIndA;
    double* b;

    cudaMalloc((void**)&valA, nnzA * sizeof(double));
    cudaMalloc((void**)&csrRowPtrA, (n + 1) * sizeof(int));
    cudaMalloc((void**)&csrColIndA, nnzA * sizeof(int));
    cudaMalloc((void**)&b, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");

    cudaMemcpy(valA, h_valA, (size_t)(nnzA * sizeof(double)), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (size_t)((n + 1) * sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndA, h_csrColIndA, (size_t)(nnzA * sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(b, h_b, (size_t)(n * sizeof(double)), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    //Initialize cuSPARSE
    cusparseHandle_t handle;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    cusparseStatus_t status;

    cusparseMatDescr_t descrA;
    status = cusparseCreateMatDescr(&descrA);
    CUSPARSE_CHECK(status);

    cusparseMatDescr_t  descr_L;
    status = cusparseCreateMatDescr(&descr_L);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
    CUSPARSE_CHECK(status);

    cusparseMatDescr_t  descr_U;
    status = cusparseCreateMatDescr(&descr_U);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatIndexBase(descr_U, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
    CUSPARSE_CHECK(status);
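    // descr_L and descr_U view the factored values as a unit-diagonal
    // lower triangle and a non-unit upper triangle, respectively, for
    // the two csrsv2 triangular solves below.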

    // Query buffer sizes and allocate one shared work buffer
    csrilu02Info_t info_A = 0; CUSPARSE_CHECK(cusparseCreateCsrilu02Info(&info_A));
    csrsv2Info_t info_L = 0; CUSPARSE_CHECK(cusparseCreateCsrsv2Info(&info_L));
    csrsv2Info_t info_U = 0; CUSPARSE_CHECK(cusparseCreateCsrsv2Info(&info_U));

    int pBufferSize_A; int pBufferSize_L; int pBufferSize_U;
    status = cusparseDcsrilu02_bufferSize(handle, n, nnzA, descrA, valA, csrRowPtrA, csrColIndA, info_A, &pBufferSize_A);
    CUSPARSE_CHECK(status);
    status = cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_L, valA, csrRowPtrA, csrColIndA, info_L, &pBufferSize_L);
    CUSPARSE_CHECK(status);
    status = cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_U, valA, csrRowPtrA, csrColIndA, info_U, &pBufferSize_U);
    CUSPARSE_CHECK(status);

    int pBufferSize = max(pBufferSize_A, max(pBufferSize_L, pBufferSize_U));
    void* pBuffer = 0; 
    cudaMalloc((void**)&pBuffer, pBufferSize);
    cudaCheckErrors("cudaMalloc fail");

    // Analysis phase for the incomplete LU factorization and the two triangular solves
    int structural_zero;
    status = cusparseDcsrilu02_analysis(handle, n, nnzA, descrA, valA, csrRowPtrA, csrColIndA, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);
    status = cusparseXcsrilu02_zeroPivot(handle, info_A, &structural_zero);
    if (CUSPARSE_STATUS_ZERO_PIVOT == status)
    {
        printf("A(%d,%d) is missing\n", structural_zero, structural_zero);
    }
    else
    {
        CUSPARSE_CHECK(status);
    }
    status = cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_L, valA, csrRowPtrA, csrColIndA, info_L, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);
    status = cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_U, valA, csrRowPtrA, csrColIndA, info_U, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);

    // A = L * U
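    // NOTE: cusparseDcsrilu02 factorizes in place. After this call, valA
    // holds the packed L and U factors, not the original values of A.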
    int numerical_zero;
    status = cusparseDcsrilu02(handle, n, nnzA, descrA, valA, csrRowPtrA, csrColIndA, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);
    status = cusparseXcsrilu02_zeroPivot(handle, info_A, &numerical_zero);
    if (CUSPARSE_STATUS_ZERO_PIVOT == status)
    {
        printf("U(%d,%d) is zero\n", numerical_zero, numerical_zero);
    }
    else
    {
        CUSPARSE_CHECK(status);
    }

    // Solve L * z = b for z (forward substitution)
    double* d_z;  
    cudaMalloc(&d_z, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");

    const double alpha = 1.0;
    status = cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, &alpha, descr_L, valA, csrRowPtrA, csrColIndA, info_L, b, d_z, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);

    // Solve U * x = z for x (backward substitution)
    double* d_x;
    cudaMalloc((void**)&d_x, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");
    status = cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, &alpha, descr_U, valA, csrRowPtrA, csrColIndA, info_U, d_z, d_x, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);

    // Ax = A * x
    const double beta = 0.0;
    double* d_Ax;
    cudaMalloc((void**)&d_Ax, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecAx;
    void*   dBuffer    = NULL;
    size_t  bufferSize = 0;
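    // NOTE: matA is built from valA, which at this point contains the ILU
    // factors (see the note above), not the original matrix A.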
    status = cusparseCreateCsr(&matA, n, n, nnzA, csrRowPtrA, csrColIndA, valA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
    CUSPARSE_CHECK(status);
    status = cusparseCreateDnVec(&vecX, n, d_x, CUDA_R_64F);
    CUSPARSE_CHECK(status);
    status = cusparseCreateDnVec(&vecAx, n, d_Ax, CUDA_R_64F);
    CUSPARSE_CHECK(status);
    status = cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize);
    CUSPARSE_CHECK(status);
    cudaMalloc(&dBuffer, bufferSize);
    cudaCheckErrors("cudaMalloc fail");
    status = cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, dBuffer);
    CUSPARSE_CHECK(status);

    // Copy the results back from device to host
    double* h_x = (double*)malloc(n * sizeof(double));
    cudaMemcpy(h_x, d_x, n * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");
    printf("Final result\n");
    for (int k = 0; k < n; k++)
    {
        printf("x[%i] = %f\n", k, h_x[k]);
    }
    double* h_Ax = (double*)malloc(n * sizeof(double));
    cudaMemcpy(h_Ax, d_Ax, n * sizeof(double), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");
    printf("A*x (should reproduce b)\n");
    for (int k = 0; k < n; k++)
    {
        printf("h_Ax[%i] = %f\n", k, h_Ax[k]);
    }

    if (h_valA) free(h_valA);
    if (h_csrRowPtrA) free(h_csrRowPtrA);
    if (h_csrColIndA) free(h_csrColIndA);
    if (h_b) free(h_b);
    if (h_x) free(h_x);
    if (h_Ax) free(h_Ax);
    if (valA) cudaFree(valA);
    if (csrRowPtrA) cudaFree(csrRowPtrA);
    if (csrColIndA) cudaFree(csrColIndA);
    if (b) cudaFree(b);
    if (d_z) cudaFree(d_z);
    if (d_x) cudaFree(d_x);
    if (pBuffer) cudaFree(pBuffer);
    if (dBuffer) cudaFree(dBuffer);
    if (d_Ax) cudaFree(d_Ax);
    CUSPARSE_CHECK(cusparseDestroy(handle));
    CUSPARSE_CHECK(cusparseDestroyMatDescr(descrA));
    CUSPARSE_CHECK(cusparseDestroyMatDescr(descr_L));
    CUSPARSE_CHECK(cusparseDestroyMatDescr(descr_U));
    CUSPARSE_CHECK(cusparseDestroyCsrilu02Info(info_A));
    CUSPARSE_CHECK(cusparseDestroyCsrsv2Info(info_L));
    CUSPARSE_CHECK(cusparseDestroyCsrsv2Info(info_U));
    CUSPARSE_CHECK(cusparseDestroySpMat(matA));
    CUSPARSE_CHECK(cusparseDestroyDnVec(vecX));
    CUSPARSE_CHECK(cusparseDestroyDnVec(vecAx));

    return 0;
}

[image: screenshot of the program's console output]


1 answer

  • 「已注销」 2023-03-15 18:19

    Referring to GPT and my own thinking:

    This code computes an incomplete LU factorization of the CSR matrix and then multiplies the solution back with the generic SpMV API. The catch is that cusparseDcsrilu02 factorizes in place: after that call, the device array valA no longer holds A's values but the packed L and U factors (L with an implicit unit diagonal). matA is then created from this overwritten valA, so cusparseSpMV multiplies x by the factored storage, not by the original A. Concretely, after factorization valA = [3, 2, 2, 2/3, -1/3] and the triangular solves give x = [0.5, 0.75, 1]; reading the stored values as a general matrix, row 2 yields (2/3)*0.5 + (-1/3)*1 = 0, which is exactly the zero printed for h_Ax[2]. The fix is to keep an untouched copy of A's values and build matA from that copy; then A*x reproduces b = [3.5, 1.5, 2].
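    A minimal sketch of that fix (the helper array name valA0 is mine, not from the original post): copy A's values before the factorization and point the SpMV descriptor at the copy.

    // Keep a pristine copy of A's values: cusparseDcsrilu02 below
    // factorizes in place and overwrites valA with the L and U factors.
    double* valA0;  // hypothetical name, not in the original code
    cudaMalloc((void**)&valA0, nnzA * sizeof(double));
    cudaMemcpy(valA0, valA, nnzA * sizeof(double), cudaMemcpyDeviceToDevice);

    // ... run cusparseDcsrilu02 and the two cusparseDcsrsv2_solve calls on valA as before ...

    // Build the SpMV matrix from the untouched copy, so Ax really is A * x:
    status = cusparseCreateCsr(&matA, n, n, nnzA, csrRowPtrA, csrColIndA, valA0,
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
    CUSPARSE_CHECK(status);

    With this change h_Ax should print 3.5, 1.5, 2.0, matching b: for this sparsity pattern ILU(0) introduces no dropped fill-in, so the factorization is exact and x solves Ax = b up to rounding.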



