Strawberry Bubblegum 2022-04-02 14:40 采纳率: 25%
浏览 121
已结题

CUDA CGBN 算法优化

正在优化一段CUDA的算法,用到了CGNB的库,想要实现四则运算中从单一的加法运算变为数列的形式并行运算。
CGBN参考文献:https://github.com/NVlabs/CGBN

最基本的a+b=c的代码中,c的值一直为0

我尝试将a,b都设置为随机数但是c的结果始终不变
代码如下


void cgbn_add_func(mp a[], mp b[], mp c[], int n) {



  instance_t          *gpuInstances;
  cgbn_error_report_t *report;
  int i;
  size_t bits = mpz_sizeinbase(a[1], 2);
  uint32_t count = bits/TPI;
  instance_t *instances = (instance_t *)malloc(sizeof(instance_t)*count);

  //配列に変更する
  for(int index=0;index<count;index++) {
    from_mpz(instances[index].a._limbs, count, a[index]);//配列に変更する a[index]
    from_mpz(instances[index].b._limbs, count, b[index]);//配列に変更する b[index]

  }



  cudaSetDevice(0);
  cudaMalloc((void **)&gpuInstances, sizeof(instance_t)*count);
  cudaMemcpy(gpuInstances, instances, sizeof(instance_t)*count, cudaMemcpyHostToDevice);
  cgbn_error_report_alloc(&report);

  kernel_add<<<(count+3)/4, 128>>>(report, gpuInstances, count);

  cudaDeviceSynchronize();//計算が終わるまで待っています
  

  cudaMemcpy(instances, gpuInstances, sizeof(instance_t)*count, cudaMemcpyDeviceToHost);//GPUからCPUにコピーしています
  print_words(instances[0].a._limbs, count);
  print_words(instances[0].b._limbs, count);
  print_words(instances[0].sum._limbs, count);
  
  for(int index=0;index<count;index++) {
    to_mpz(c[index], instances[index].sum._limbs, count);
    printf("ccc");
    mpz_out_str(stdout, 10, c[index]);
    printf("\n");
  }
   for (i = 0; i < 9; i++) {
    printf("a");
    mpz_out_str(stdout, 10, a[i]);
    printf("\n");
    printf("b");
    mpz_out_str(stdout, 10, b[i]);
    printf("\n");
    printf("c");
    mpz_out_str(stdout, 10, c[i]);
    printf("\n");
  }
  // clean up
  free(instances);
  cudaFree(gpuInstances);
  cgbn_error_report_free(report);


}


 extern "C" int func() {


  gmp_randstate_t  state;
  gmp_randinit_default(state);
  //mpz_t         a,b,c;//高精度整数
  mpz_t a[9],b[9],c[9];
  int i;

  for (i = 0; i < 9; i++) {
    mpz_inits(a[i], b[i], c[i], NULL);
  }
  //mpz_inits(a[i], b[i], c[i], NULL);
  //mpz_inits(a, b, c, NULL);
  //a,b,cを配列にする
  
  for (i = 0; i < 9; i++) {
  mpz_urandomb(a[i], state, 3);
  mpz_urandomb(b[i], state, 3);
    }
  //mpz_urandomb(a, state, BITS);
  //mpz_urandomb(b, state, BITS);
  cgbn_add_func(a, b, c, 9);

  for (i = 0; i < 9; ++i) {
   mpz_clears(a[i],b[i],c[i],NULL);
  }

  
  return 0;
}

源码如下


```c++
/***

Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

***/


#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <cuda.h>
#include <gmp.h>
#include "cgbn/cgbn.h"


/************************************************************************************************
 *  This example performs component-wise addition of two arrays of 1024-bit bignums.
 *
 *  The example uses a number of utility functions and macros:
 *
 *    random_words(uint32_t *words, uint32_t count)
 *       fills words[0 .. count-1] with random data
 *
 *    add_words(uint32_t *r, uint32_t *a, uint32_t *b, uint32_t count) 
 *       sets bignums r = a+b, where r, a, and b are count words in length
 *
 *    compare_words(uint32_t *a, uint32_t *b, uint32_t count)
 *       compare bignums a and b, where a and b are count words in length.
 *       return 1 if a>b, 0 if a==b, and -1 if b>a
 *    
 *    CUDA_CHECK(call) is a macro that checks a CUDA result for an error,
 *    if an error is present, it prints out the error, call, file and line.
 *
 *    CGBN_CHECK(report) is a macro that checks if a CGBN error has occurred.
 *    if so, it prints out the error, and instance information
 *
 ************************************************************************************************/
 
// IMPORTANT:  DO NOT DEFINE TPI OR BITS BEFORE INCLUDING CGBN
#define TPI 32
#define BITS 1024


typedef mpz_t mp;

// Declare the instance type
typedef struct {
  cgbn_mem_t<BITS> a;
  cgbn_mem_t<BITS> b;
  cgbn_mem_t<BITS> sum;
} instance_t;


void print_words(uint32_t *x, uint32_t count) {
  int index;

  for(index=count-1;index>=0;index--)
    printf("%08X", x[index]);
  printf("\n");
}

void from_mpz(uint32_t *words, uint32_t count, mpz_t value) {
  size_t written;

  if(mpz_sizeinbase(value, 2)>count*32) {
    fprintf(stderr, "from_mpz failed -- result does not fit\n");
    exit(1);
  }

  mpz_export(words, &written, -1, sizeof(uint32_t), 0, 0, value);
  while(written<count)
    words[written++]=0;
}

void to_mpz(mpz_t r, uint32_t *x, uint32_t count) {
  mpz_import(r, count, -1, sizeof(uint32_t), 0, 0, x);
}


// helpful typedefs for the kernel
typedef cgbn_context_t<TPI>         context_t;
typedef cgbn_env_t<context_t, BITS> env_t;

// the actual kernel
__global__ void kernel_add(cgbn_error_report_t *report, instance_t *instances, uint32_t count) {
  int32_t instance;
  
  // decode an instance number from the blockIdx and threadIdx
  instance=(blockIdx.x*blockDim.x + threadIdx.x)/TPI;
  if(instance>=count)
    return;

  context_t      bn_context(cgbn_report_monitor, report, instance);   // construct a context
  env_t          bn_env(bn_context.env<env_t>());                     // construct an environment for 1024-bit math
  env_t::cgbn_t  a, b, r;                                             // define a, b, r as 1024-bit bignums

  cgbn_load(bn_env, a, &(instances[instance].a));      // load my instance's a value
  cgbn_load(bn_env, b, &(instances[instance].b));      // load my instance's b value
  cgbn_add(bn_env, r, a, b);                           // r=a+b
  cgbn_store(bn_env, &(instances[instance].sum), r);   // store r into sum
}

void cgbn_add_func(mp a[], mp b[], mp c[], int n) {



  instance_t          *gpuInstances;
  cgbn_error_report_t *report;
  int i;
  size_t bits = mpz_sizeinbase(a[1], 2);
  uint32_t count = bits/TPI;
  instance_t *instances = (instance_t *)malloc(sizeof(instance_t)*count);

  //配列に変更する
  for(int index=0;index<count;index++) {
    from_mpz(instances[index].a._limbs, count, a[index]);//配列に変更する a[index]
    from_mpz(instances[index].b._limbs, count, b[index]);//配列に変更する b[index]

  }



  cudaSetDevice(0);
  cudaMalloc((void **)&gpuInstances, sizeof(instance_t)*count);
  cudaMemcpy(gpuInstances, instances, sizeof(instance_t)*count, cudaMemcpyHostToDevice);
  cgbn_error_report_alloc(&report);

  kernel_add<<<(count+3)/4, 128>>>(report, gpuInstances, count);

  cudaDeviceSynchronize();//計算が終わるまで待っています
  

  cudaMemcpy(instances, gpuInstances, sizeof(instance_t)*count, cudaMemcpyDeviceToHost);//GPUからCPUにコピーしています
  print_words(instances[0].a._limbs, count);
  print_words(instances[0].b._limbs, count);
  print_words(instances[0].sum._limbs, count);
  
  for(int index=0;index<count;index++) {
    to_mpz(c[index], instances[index].sum._limbs, count);
    printf("ccc");
    mpz_out_str(stdout, 10, c[index]);
    printf("\n");
  }
   for (i = 0; i < 9; i++) {
    printf("a");
    mpz_out_str(stdout, 10, a[i]);
    printf("\n");
    printf("b");
    mpz_out_str(stdout, 10, b[i]);
    printf("\n");
    printf("c");
    mpz_out_str(stdout, 10, c[i]);
    printf("\n");
  }
  // clean up
  free(instances);
  cudaFree(gpuInstances);
  cgbn_error_report_free(report);


}


 extern "C" int func() {


  gmp_randstate_t  state;
  gmp_randinit_default(state);
  //mpz_t         a,b,c;//高精度整数
  mpz_t a[9],b[9],c[9];
  int i;

  for (i = 0; i < 9; i++) {
    mpz_inits(a[i], b[i], c[i], NULL);
  }
  //mpz_inits(a[i], b[i], c[i], NULL);
  //mpz_inits(a, b, c, NULL);
  //a,b,cを配列にする
  
  for (i = 0; i < 9; i++) {
  mpz_urandomb(a[i], state, 3);
  mpz_urandomb(b[i], state, 3);
    }
  //mpz_urandomb(a, state, BITS);
  //mpz_urandomb(b, state, BITS);
  cgbn_add_func(a, b, c, 9);

  for (i = 0; i < 9; ++i) {
   mpz_clears(a[i],b[i],c[i],NULL);
  }

  
  return 0;
}




想通过修改的方式优化完成这个算法

  • 写回答

1条回答 默认 最新

  • 歇歇 2022-04-09 22:52
    关注

    Debug调试,应该是没有执行到,或者你看看每一步数据的变化

    评论

报告相同问题?

问题事件

  • 系统已结题 4月10日
  • 创建了问题 4月2日

悬赏问题

  • ¥20 怎么用dlib库的算法识别小麦病虫害
  • ¥15 华为ensp模拟器中S5700交换机在配置过程中老是反复重启
  • ¥15 java写代码遇到问题,求帮助
  • ¥15 uniapp uview http 如何实现统一的请求异常信息提示?
  • ¥15 有了解d3和topogram.js库的吗?有偿请教
  • ¥100 任意维数的K均值聚类
  • ¥15 stamps做sbas-insar,时序沉降图怎么画
  • ¥15 买了个传感器,根据商家发的代码和步骤使用但是代码报错了不会改,有没有人可以看看
  • ¥15 关于#Java#的问题,如何解决?
  • ¥15 加热介质是液体,换热器壳侧导热系数和总的导热系数怎么算