正在优化一段CUDA的算法,用到了CGNB的库,想要实现四则运算中从单一的加法运算变为数列的形式并行运算。
CGBN参考文献:https://github.com/NVlabs/CGBN
最基本的a+b=c的代码中,c的值一直为0
我尝试将a,b都设置为随机数但是c的结果始终不变
代码如下
void cgbn_add_func(mp a[], mp b[], mp c[], int n) {
instance_t *gpuInstances;
cgbn_error_report_t *report;
int i;
size_t bits = mpz_sizeinbase(a[1], 2);
uint32_t count = bits/TPI;
instance_t *instances = (instance_t *)malloc(sizeof(instance_t)*count);
//配列に変更する
for(int index=0;index<count;index++) {
from_mpz(instances[index].a._limbs, count, a[index]);//配列に変更する a[index]
from_mpz(instances[index].b._limbs, count, b[index]);//配列に変更する b[index]
}
cudaSetDevice(0);
cudaMalloc((void **)&gpuInstances, sizeof(instance_t)*count);
cudaMemcpy(gpuInstances, instances, sizeof(instance_t)*count, cudaMemcpyHostToDevice);
cgbn_error_report_alloc(&report);
kernel_add<<<(count+3)/4, 128>>>(report, gpuInstances, count);
cudaDeviceSynchronize();//計算が終わるまで待っています
cudaMemcpy(instances, gpuInstances, sizeof(instance_t)*count, cudaMemcpyDeviceToHost);//GPUからCPUにコピーしています
print_words(instances[0].a._limbs, count);
print_words(instances[0].b._limbs, count);
print_words(instances[0].sum._limbs, count);
for(int index=0;index<count;index++) {
to_mpz(c[index], instances[index].sum._limbs, count);
printf("ccc");
mpz_out_str(stdout, 10, c[index]);
printf("\n");
}
for (i = 0; i < 9; i++) {
printf("a");
mpz_out_str(stdout, 10, a[i]);
printf("\n");
printf("b");
mpz_out_str(stdout, 10, b[i]);
printf("\n");
printf("c");
mpz_out_str(stdout, 10, c[i]);
printf("\n");
}
// clean up
free(instances);
cudaFree(gpuInstances);
cgbn_error_report_free(report);
}
extern "C" int func() {
gmp_randstate_t state;
gmp_randinit_default(state);
//mpz_t a,b,c;//高精度整数
mpz_t a[9],b[9],c[9];
int i;
for (i = 0; i < 9; i++) {
mpz_inits(a[i], b[i], c[i], NULL);
}
//mpz_inits(a[i], b[i], c[i], NULL);
//mpz_inits(a, b, c, NULL);
//a,b,cを配列にする
for (i = 0; i < 9; i++) {
mpz_urandomb(a[i], state, 3);
mpz_urandomb(b[i], state, 3);
}
//mpz_urandomb(a, state, BITS);
//mpz_urandomb(b, state, BITS);
cgbn_add_func(a, b, c, 9);
for (i = 0; i < 9; ++i) {
mpz_clears(a[i],b[i],c[i],NULL);
}
return 0;
}
源码如下
```c++
/***
Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
***/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <cuda.h>
#include <gmp.h>
#include "cgbn/cgbn.h"
/************************************************************************************************
* This example performs component-wise addition of two arrays of 1024-bit bignums.
*
* The example uses a number of utility functions and macros:
*
* random_words(uint32_t *words, uint32_t count)
* fills words[0 .. count-1] with random data
*
* add_words(uint32_t *r, uint32_t *a, uint32_t *b, uint32_t count)
* sets bignums r = a+b, where r, a, and b are count words in length
*
* compare_words(uint32_t *a, uint32_t *b, uint32_t count)
* compare bignums a and b, where a and b are count words in length.
* return 1 if a>b, 0 if a==b, and -1 if b>a
*
* CUDA_CHECK(call) is a macro that checks a CUDA result for an error,
* if an error is present, it prints out the error, call, file and line.
*
* CGBN_CHECK(report) is a macro that checks if a CGBN error has occurred.
* if so, it prints out the error, and instance information
*
************************************************************************************************/
// IMPORTANT: DO NOT DEFINE TPI OR BITS BEFORE INCLUDING CGBN
#define TPI 32
#define BITS 1024
typedef mpz_t mp;
// Declare the instance type
typedef struct {
cgbn_mem_t<BITS> a;
cgbn_mem_t<BITS> b;
cgbn_mem_t<BITS> sum;
} instance_t;
void print_words(uint32_t *x, uint32_t count) {
int index;
for(index=count-1;index>=0;index--)
printf("%08X", x[index]);
printf("\n");
}
void from_mpz(uint32_t *words, uint32_t count, mpz_t value) {
size_t written;
if(mpz_sizeinbase(value, 2)>count*32) {
fprintf(stderr, "from_mpz failed -- result does not fit\n");
exit(1);
}
mpz_export(words, &written, -1, sizeof(uint32_t), 0, 0, value);
while(written<count)
words[written++]=0;
}
void to_mpz(mpz_t r, uint32_t *x, uint32_t count) {
mpz_import(r, count, -1, sizeof(uint32_t), 0, 0, x);
}
// helpful typedefs for the kernel
typedef cgbn_context_t<TPI> context_t;
typedef cgbn_env_t<context_t, BITS> env_t;
// the actual kernel
__global__ void kernel_add(cgbn_error_report_t *report, instance_t *instances, uint32_t count) {
int32_t instance;
// decode an instance number from the blockIdx and threadIdx
instance=(blockIdx.x*blockDim.x + threadIdx.x)/TPI;
if(instance>=count)
return;
context_t bn_context(cgbn_report_monitor, report, instance); // construct a context
env_t bn_env(bn_context.env<env_t>()); // construct an environment for 1024-bit math
env_t::cgbn_t a, b, r; // define a, b, r as 1024-bit bignums
cgbn_load(bn_env, a, &(instances[instance].a)); // load my instance's a value
cgbn_load(bn_env, b, &(instances[instance].b)); // load my instance's b value
cgbn_add(bn_env, r, a, b); // r=a+b
cgbn_store(bn_env, &(instances[instance].sum), r); // store r into sum
}
void cgbn_add_func(mp a[], mp b[], mp c[], int n) {
instance_t *gpuInstances;
cgbn_error_report_t *report;
int i;
size_t bits = mpz_sizeinbase(a[1], 2);
uint32_t count = bits/TPI;
instance_t *instances = (instance_t *)malloc(sizeof(instance_t)*count);
//配列に変更する
for(int index=0;index<count;index++) {
from_mpz(instances[index].a._limbs, count, a[index]);//配列に変更する a[index]
from_mpz(instances[index].b._limbs, count, b[index]);//配列に変更する b[index]
}
cudaSetDevice(0);
cudaMalloc((void **)&gpuInstances, sizeof(instance_t)*count);
cudaMemcpy(gpuInstances, instances, sizeof(instance_t)*count, cudaMemcpyHostToDevice);
cgbn_error_report_alloc(&report);
kernel_add<<<(count+3)/4, 128>>>(report, gpuInstances, count);
cudaDeviceSynchronize();//計算が終わるまで待っています
cudaMemcpy(instances, gpuInstances, sizeof(instance_t)*count, cudaMemcpyDeviceToHost);//GPUからCPUにコピーしています
print_words(instances[0].a._limbs, count);
print_words(instances[0].b._limbs, count);
print_words(instances[0].sum._limbs, count);
for(int index=0;index<count;index++) {
to_mpz(c[index], instances[index].sum._limbs, count);
printf("ccc");
mpz_out_str(stdout, 10, c[index]);
printf("\n");
}
for (i = 0; i < 9; i++) {
printf("a");
mpz_out_str(stdout, 10, a[i]);
printf("\n");
printf("b");
mpz_out_str(stdout, 10, b[i]);
printf("\n");
printf("c");
mpz_out_str(stdout, 10, c[i]);
printf("\n");
}
// clean up
free(instances);
cudaFree(gpuInstances);
cgbn_error_report_free(report);
}
extern "C" int func() {
gmp_randstate_t state;
gmp_randinit_default(state);
//mpz_t a,b,c;//高精度整数
mpz_t a[9],b[9],c[9];
int i;
for (i = 0; i < 9; i++) {
mpz_inits(a[i], b[i], c[i], NULL);
}
//mpz_inits(a[i], b[i], c[i], NULL);
//mpz_inits(a, b, c, NULL);
//a,b,cを配列にする
for (i = 0; i < 9; i++) {
mpz_urandomb(a[i], state, 3);
mpz_urandomb(b[i], state, 3);
}
//mpz_urandomb(a, state, BITS);
//mpz_urandomb(b, state, BITS);
cgbn_add_func(a, b, c, 9);
for (i = 0; i < 9; ++i) {
mpz_clears(a[i],b[i],c[i],NULL);
}
return 0;
}
想通过修改的方式优化完成这个算法