#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse.h>
// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, LINE); exit(-1);}}
#define cudaCheckErrors(msg)
do {
cudaError_t __err = cudaGetLastError();
if (__err != cudaSuccess) {
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",
msg, cudaGetErrorString(__err),
FILE, LINE);
fprintf(stderr, "*** FAILED - ABORTING\n");
exit(1);
}
} while (0)
int main()
{
const int n = 3;
const int nnzA = 5;
const double h_valA[nnzA] = { 3.0, 2.0, 2.0, 2.0, 1.0 };
const int h_csrColIndA[nnzA] = { 0, 2, 1, 0, 2 };
const int h_csrRowPtrA[n+1] = { 0, 2, 3, 5 };
const double h_b[n] = { 3.5, 1.5, 2.0 };
int y[n] = { 0.0, 0.0, 0.0 };
//CSR format of matrix A and Vector b (device)
double* valA;
int* csrRowPtrA;
int* csrColIndA;
double* b;
double* Y;
cudaMalloc((void**)&valA, nnzA * sizeof(double));
cudaMalloc((void**)&csrRowPtrA, (n + 1) * sizeof(int));
cudaMalloc((void**)&csrColIndA, nnzA * sizeof(int));
cudaMalloc((void**)&b, n * sizeof(double));
cudaMalloc((void**)&Y, n * sizeof(double));
cudaCheckErrors("cudaMalloc fail");
cudaMemcpy(valA, h_valA, (size_t)(nnzA * sizeof(double)), cudaMemcpyHostToDevice);
cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (size_t)((n + 1) * sizeof(int)), cudaMemcpyHostToDevice);
cudaMemcpy(csrColIndA, h_csrColIndA, (size_t)(nnzA * sizeof(int)), cudaMemcpyHostToDevice);
cudaMemcpy(b, h_b, (size_t)(n * sizeof(double)), cudaMemcpyHostToDevice);
cudaMemcpy(Y, y, (size_t)(n * sizeof(double)), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
float alpha = 1.0f;
float beta = 0.0f;
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnVecDescr_t vecX, vecY;
void* dBuffer = NULL;
size_t bufferSize = 0;
CUSPARSE_CHECK(cusparseCreate(&handle));
CUSPARSE_CHECK(cusparseCreateCsr(&matA, n, n, nnzA, csrRowPtrA, csrColIndA, valA,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
CUSPARSE_CHECK(cusparseCreateDnVec(&vecX, n, b, CUDA_R_32F));
CUSPARSE_CHECK(cusparseCreateDnVec(&vecY, n, Y, CUDA_R_32F));
CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize));
cudaMalloc(&dBuffer, bufferSize);
cudaCheckErrors("cudaMalloc fail");
CUSPARSE_CHECK(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, dBuffer));
CUSPARSE_CHECK(cusparseDestroySpMat(matA));
CUSPARSE_CHECK(cusparseDestroyDnVec(vecX));
CUSPARSE_CHECK(cusparseDestroyDnVec(vecY));
CUSPARSE_CHECK(cusparseDestroy(handle));
cudaMemcpy(y, Y, n * sizeof(double), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
for (int i = 0; i < n; i++)
{
printf("x[%i] = %f\n", i, y[i]);
}
cudaFree(dBuffer);
cudaFree(csrRowPtrA);
cudaFree(csrColIndA);
cudaFree(valA);
cudaFree(Y);
cudaFree(b);
return 0;
}