CUDA CGBN 算法优化

正在优化一段CUDA算法，用到了CGBN库，想把四则运算中原本针对单个数的加法运算，改为对数组（数列）中的每一对元素并行运算。
CGBN参考文献：https://github.com/NVlabs/CGBN

最基本的a+b=c的代码中，c的值一直为0

我尝试将a，b都设置为随机数但是c的结果始终不变
代码如下


void cgbn_add_func(mp a[], mp b[], mp c[], int n) {



  instance_t          *gpuInstances;
  cgbn_error_report_t *report;
  int i;
  size_t bits = mpz_sizeinbase(a[1], 2);
  uint32_t count = bits/TPI;
  instance_t *instances = (instance_t *)malloc(sizeof(instance_t)*count);

  //配列に変更する
  for(int index=0;index<count;index++) {
    from_mpz(instances[index].a._limbs, count, a[index]);//配列に変更する a[index]
    from_mpz(instances[index].b._limbs, count, b[index]);//配列に変更する b[index]

  }



  cudaSetDevice(0);
  cudaMalloc((void **)&gpuInstances, sizeof(instance_t)*count);
  cudaMemcpy(gpuInstances, instances, sizeof(instance_t)*count, cudaMemcpyHostToDevice);
  cgbn_error_report_alloc(&report);

  kernel_add<<<(count+3)/4, 128>>>(report, gpuInstances, count);

  cudaDeviceSynchronize();//計算が終わるまで待っています
  

  cudaMemcpy(instances, gpuInstances, sizeof(instance_t)*count, cudaMemcpyDeviceToHost);//GPUからCPUにコピーしています
  print_words(instances[0].a._limbs, count);
  print_words(instances[0].b._limbs, count);
  print_words(instances[0].sum._limbs, count);
  
  for(int index=0;index<count;index++) {
    to_mpz(c[index], instances[index].sum._limbs, count);
    printf("ccc");
    mpz_out_str(stdout, 10, c[index]);
    printf("\n");
  }
   for (i = 0; i < 9; i++) {
    printf("a");
    mpz_out_str(stdout, 10, a[i]);
    printf("\n");
    printf("b");
    mpz_out_str(stdout, 10, b[i]);
    printf("\n");
    printf("c");
    mpz_out_str(stdout, 10, c[i]);
    printf("\n");
  }
  // clean up
  free(instances);
  cudaFree(gpuInstances);
  cgbn_error_report_free(report);


}


 extern "C" int func() {


  gmp_randstate_t  state;
  gmp_randinit_default(state);
  //mpz_t         a,b,c;//高精度整数
  mpz_t a[9],b[9],c[9];
  int i;

  for (i = 0; i < 9; i++) {
    mpz_inits(a[i], b[i], c[i], NULL);
  }
  //mpz_inits(a[i], b[i], c[i], NULL);
  //mpz_inits(a, b, c, NULL);
  //a,b,cを配列にする
  
  for (i = 0; i < 9; i++) {
  mpz_urandomb(a[i], state, 3);
  mpz_urandomb(b[i], state, 3);
    }
  //mpz_urandomb(a, state, BITS);
  //mpz_urandomb(b, state, BITS);
  cgbn_add_func(a, b, c, 9);

  for (i = 0; i < 9; ++i) {
   mpz_clears(a[i],b[i],c[i],NULL);
  }

  
  return 0;
}

源码如下


```c++
/***

Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

***/


#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <cuda.h>
#include <gmp.h>
#include "cgbn/cgbn.h"


/************************************************************************************************
 *  This example performs component-wise addition of two arrays of 1024-bit bignums.
 *
 *  The example uses a number of utility functions and macros:
 *
 *    random_words(uint32_t *words, uint32_t count)
 *       fills words[0 .. count-1] with random data
 *
 *    add_words(uint32_t *r, uint32_t *a, uint32_t *b, uint32_t count) 
 *       sets bignums r = a+b, where r, a, and b are count words in length
 *
 *    compare_words(uint32_t *a, uint32_t *b, uint32_t count)
 *       compare bignums a and b, where a and b are count words in length.
 *       return 1 if a>b, 0 if a==b, and -1 if b>a
 *    
 *    CUDA_CHECK(call) is a macro that checks a CUDA result for an error,
 *    if an error is present, it prints out the error, call, file and line.
 *
 *    CGBN_CHECK(report) is a macro that checks if a CGBN error has occurred.
 *    if so, it prints out the error, and instance information
 *
 ************************************************************************************************/
 
// IMPORTANT:  DO NOT DEFINE TPI OR BITS BEFORE INCLUDING CGBN
// TPI:  divisor kernel_add applies to the global thread index to obtain an
//       instance number, i.e. TPI consecutive threads share one instance.
//       Also the template argument of cgbn_context_t.
// BITS: width in bits of every bignum; template argument of cgbn_mem_t and
//       cgbn_env_t.
#define TPI 32
#define BITS 1024


// Short alias for GMP's mpz_t, used in the cgbn_add_func signature.
typedef mpz_t mp;

// Declare the instance type
// One instance is a single addition problem: kernel_add loads a and b and
// stores their sum into sum.  The host fills a and b through from_mpz and
// reads sum back through to_mpz; arrays of this struct are copied verbatim
// between host and device with cudaMemcpy.
typedef struct {
  cgbn_mem_t<BITS> a;     // first operand (input)
  cgbn_mem_t<BITS> b;     // second operand (input)
  cgbn_mem_t<BITS> sum;   // a + b, written by kernel_add (output)
} instance_t;


// Prints the count words of x to stdout as one hexadecimal number: eight
// upper-case hex digits per word, highest index first, then a newline.
void print_words(uint32_t *x, uint32_t count) {
  int pos = count-1;

  // walk from the last word down to word 0
  while(pos >= 0) {
    printf("%08X", x[pos]);
    pos--;
  }
  printf("\n");
}

// Writes value into words[0 .. count-1] as 32-bit words, least significant
// word first, zero-filling the words the value does not reach.  Prints a
// message and exits the process when value needs more than count*32 bits.
void from_mpz(uint32_t *words, uint32_t count, mpz_t value) {
  size_t filled;
  const size_t needed_bits = mpz_sizeinbase(value, 2);

  if(needed_bits > count*32) {
    fprintf(stderr, "from_mpz failed -- result does not fit\n");
    exit(1);
  }

  // mpz_export reports in filled how many words it produced
  mpz_export(words, &filled, -1, sizeof(uint32_t), 0, 0, value);

  // pad the remaining high-order words with zeros
  for(; filled < count; filled++)
    words[filled] = 0;
}

// Sets r to the integer held in x[0 .. count-1]; the words are handed to
// mpz_import with the same order/endian/nails settings from_mpz uses for
// mpz_export.
void to_mpz(mpz_t r, uint32_t *x, uint32_t count) {
  const int    word_order = -1;   // least significant word first
  const int    endianness = 0;    // native byte order within each word
  const size_t nail_bits  = 0;    // every bit of each word is used

  mpz_import(r, count, word_order, sizeof(uint32_t), endianness, nail_bits, x);
}


// helpful typedefs for the kernel
// context_t: CGBN context parameterised by TPI; kernel_add constructs one per
//            thread from the error report and the instance number.
// env_t:     CGBN environment for BITS-bit arithmetic built from a context_t;
//            env_t::cgbn_t is the bignum type the kernel operates on.
typedef cgbn_context_t<TPI>         context_t;
typedef cgbn_env_t<context_t, BITS> env_t;

// Addition kernel: instances[i].sum = instances[i].a + instances[i].b.
//
//   report    : CGBN error report handed to the context constructor
//   instances : device array of count instance_t records
//   count     : number of valid records in instances
//
// Launch layout: one-dimensional grid and blocks.  The global thread index
// divided by TPI selects the instance, so the launch must supply at least
// count*TPI threads; threads whose instance number is >= count exit at once.
__global__ void kernel_add(cgbn_error_report_t *report, instance_t *instances, uint32_t count) {
  // every group of TPI consecutive threads maps to the same instance number
  const int32_t which = (blockIdx.x*blockDim.x + threadIdx.x)/TPI;

  if(which>=count)
    return;

  instance_t *mine = instances + which;

  context_t      bn_context(cgbn_report_monitor, report, which);   // per-thread CGBN context
  env_t          bn_env(bn_context.env<env_t>());                  // environment for BITS-bit math
  env_t::cgbn_t  lhs, rhs, total;                                  // operands and result

  cgbn_load(bn_env, lhs, &(mine->a));        // fetch this instance's a
  cgbn_load(bn_env, rhs, &(mine->b));        // fetch this instance's b
  cgbn_add(bn_env, total, lhs, rhs);         // total = lhs + rhs
  cgbn_store(bn_env, &(mine->sum), total);   // write the result back to sum
}

void cgbn_add_func(mp a[], mp b[], mp c[], int n) {



  instance_t          *gpuInstances;
  cgbn_error_report_t *report;
  int i;
  size_t bits = mpz_sizeinbase(a[1], 2);
  uint32_t count = bits/TPI;
  instance_t *instances = (instance_t *)malloc(sizeof(instance_t)*count);

  //配列に変更する
  for(int index=0;index<count;index++) {
    from_mpz(instances[index].a._limbs, count, a[index]);//配列に変更する a[index]
    from_mpz(instances[index].b._limbs, count, b[index]);//配列に変更する b[index]

  }



  cudaSetDevice(0);
  cudaMalloc((void **)&gpuInstances, sizeof(instance_t)*count);
  cudaMemcpy(gpuInstances, instances, sizeof(instance_t)*count, cudaMemcpyHostToDevice);
  cgbn_error_report_alloc(&report);

  kernel_add<<<(count+3)/4, 128>>>(report, gpuInstances, count);

  cudaDeviceSynchronize();//計算が終わるまで待っています
  

  cudaMemcpy(instances, gpuInstances, sizeof(instance_t)*count, cudaMemcpyDeviceToHost);//GPUからCPUにコピーしています
  print_words(instances[0].a._limbs, count);
  print_words(instances[0].b._limbs, count);
  print_words(instances[0].sum._limbs, count);
  
  for(int index=0;index<count;index++) {
    to_mpz(c[index], instances[index].sum._limbs, count);
    printf("ccc");
    mpz_out_str(stdout, 10, c[index]);
    printf("\n");
  }
   for (i = 0; i < 9; i++) {
    printf("a");
    mpz_out_str(stdout, 10, a[i]);
    printf("\n");
    printf("b");
    mpz_out_str(stdout, 10, b[i]);
    printf("\n");
    printf("c");
    mpz_out_str(stdout, 10, c[i]);
    printf("\n");
  }
  // clean up
  free(instances);
  cudaFree(gpuInstances);
  cgbn_error_report_free(report);


}


 extern "C" int func() {


  gmp_randstate_t  state;
  gmp_randinit_default(state);
  //mpz_t         a,b,c;//高精度整数
  mpz_t a[9],b[9],c[9];
  int i;

  for (i = 0; i < 9; i++) {
    mpz_inits(a[i], b[i], c[i], NULL);
  }
  //mpz_inits(a[i], b[i], c[i], NULL);
  //mpz_inits(a, b, c, NULL);
  //a,b,cを配列にする
  
  for (i = 0; i < 9; i++) {
  mpz_urandomb(a[i], state, 3);
  mpz_urandomb(b[i], state, 3);
    }
  //mpz_urandomb(a, state, BITS);
  //mpz_urandomb(b, state, BITS);
  cgbn_add_func(a, b, c, 9);

  for (i = 0; i < 9; ++i) {
   mpz_clears(a[i],b[i],c[i],NULL);
  }

  
  return 0;
}

想通过修改的方式优化完成这个算法

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

1条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
歇歇 2022-04-09 22:52
关注
Debug调试，应该是没有执行到，或者你看看每一步数据的变化

解决
无用 1
评论打赏
分享
举报

评论

按下Enter换行，Ctrl+Enter发表内容

报告相同问题？

关注问题

cuda编程数据传输 c++ 有问必答
2022-07-13 15:45

回答 2 已采纳可以看一下下面这篇文章： CUDA编程记之一基本使用及线程、同步、存储器_辜易的博客-CSDN博客_cuda 线程同步欢迎大家一起来观摩我
cuda编程困惑cudaMemcopy
2017-10-18 06:30

回答 2 已采纳 cudaMemcopy是CPU和GPU两者共同作用的结果。 GPU编程就是八股文，共分三个步骤： 1、在启动GPU计算前使用cudaMemcopy将计算机内存的数据拷贝入GPU内存中， 2、启动
安装了cuda,为啥VS2013里面没有cuda模块？
2017-06-10 16:27

回答 2 已采纳后来重装cuda8.0时发现，应该先安装VS2013然后才安装cuda8.0，因为安装cuda8.0时，它会配置VS2013，如果安装顺序相反，则新建项目的时候，没有nvidia模块。
CUDA粒子群优化算法
2012-07-09 16:16

CUDA平台加速粒子群优化算法（pso），自己实现，结构清晰，加速比可达到10倍左右，适合初学者，因为还有一定的优化空间。
YOLOv5算法报错 python 有问必答算法
2021-12-27 17:14

回答 2 已采纳 CUDA out of memory 提示你显存不够，吧batchsize减小或者imgz减小
Nvida GEforce GTX 1650支不支持CUDA编程吗？ c++
2019-07-17 11:27

回答 6 已采纳经过自己测试（可以在visual studio上编写CUDA程序），确实是支持的。
在cuda编程中，一个核函数最多可以用多少个线程？ c++ c语言有问必答
2022-04-24 20:17

回答 2 已采纳调用核函数的时候，可以有多个block，，每个block所能容纳的最大线程数也是有限的。其实在硬件上，每个block里面的所有thread会共用一个处理器核心，而且它们共享的shared memory
《GPU并行计算与CUDA编程》课程视频和代码
2023-06-18 13:22

《GPU并行计算与CUDA编程》课程视频和代码
vscode写cuda代码，如何像写c++那样写完可以一键格式化代码(format)? c++ c语言 ide 开发语言
2020-12-01 10:06

回答 2 已采纳 https://blog.csdn.net/qq_35333978/article/details/110201655
Golang调用CUDA库 c++
2016-03-02 21:47

回答 1 已采纳 It appears, at least in this case, that the go import of C is expecting the function to be provid
cuda11.7安装 pytorch
2023-03-10 17:35

回答 2 已采纳这是因为 PyTorch 版本所依赖的 CUDA 版本不匹配导致的。is_valible() 是 PyTorch 1.7 的一个函数，但是你使用的 CUDA 版本可能过老了。解决方法有两种：升级
cuda 作业排序算法 sort
2015-05-24 16:32

中科院 cuda 作业排序算法 sort
关于opencv+cuda混合编程遇到结果图横向重复四个块的问题！(语言-c++) c++ opencv 有问必答计算机视觉
2021-12-21 14:17

回答 4 已采纳你好，我是有问必答小助手，非常抱歉，本次您提出的有问必答问题，技术专家团超时未为您做出解答本次提问扣除的有问必答次数，将会以问答VIP体验卡（1次有问必答机会、商城购买实体图书享受95折优惠）的形式为
NVIDIA CUDA并行编程语言及其矢量相加实例——一文带你快速入门
2023-10-10 19:36

张小殊.的博客 CUDA的软件堆栈由以下三层构成：如图1所示，CUDA的核心是CUDA C语言，它包含对C语言的最小扩展集和一个运行时库，使用这些扩展和运行时库的源文件必须通过nvcc编译器进行编译。图1 CUDA软件栈示意图CUDA C语言编译...
cuda编程（一）基础
2022-05-04 16:22

姜大大的博客的博客 cuda支持的编程语言：c/c++/python/fortran/java… 1、CUDA并行计算基础异构计算 CUDA 安装 CUDA 程序的编写 CUDA 程序编译利用NVProf查看程序执行情况 gpu不是单独的在计算机中完成任务，而是通过协助cpu和...
没有解决我的问题, 去提问

问题事件

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
系统已结题 4月10日
关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
创建了问题 4月2日

悬赏问题

¥20 怎么用dlib库的算法识别小麦病虫害
¥15 华为ensp模拟器中S5700交换机在配置过程中老是反复重启
¥15 java写代码遇到问题，求帮助
¥15 uniapp uview http 如何实现统一的请求异常信息提示？
¥15 有了解d3和topogram.js库的吗？有偿请教
¥100 任意维数的K均值聚类
¥15 stamps做sbas-insar，时序沉降图怎么画
¥15 买了个传感器，根据商家发的代码和步骤使用但是代码报错了不会改，有没有人可以看看
¥15 关于#Java#的问题，如何解决？
¥15 加热介质是液体，换热器壳侧导热系数和总的导热系数怎么算

CUDA CGBN 算法优化

1条回答 默认 最新

问题事件

悬赏问题

1条回答默认最新