该函数底层是cpu在拷贝还是gpu在拷贝,考入考出耗时这么大,gpu加速计算时是怎么加速的,原理是什么,程序应该怎么设计?
2条回答 默认 最新
- hello_hi_hi 2017-10-18 09:46关注
cudaMemcpy是CPU和GPU两者共同作用的结果。
GPU编程就是八股文,共分三个步骤:
1、在启动GPU计算前使用cudaMemcpy将计算机内存的数据拷贝入GPU内存中,
2、启动GPU计算
3、GPU计算结束后使用cudaMemcpy将GPU内存中的计算结果返回CPU内存中。
由于数据交换通过PCI-E接口进行,只要数据量不是太大且接口数据带宽足够,耗时不会很大!
GPU加速主要适用于高并行度、高计算量、低内存占用的项目。
1、高并行度——充分利用GPU中众多处理器的特点,根据不同的GPU可用成千上万个线程并行处理;
2、高计算量——可以抵消cudaMemcpy带来的影响
3、低内存占用——尽量使用局部内存,减少对显存的使用,从而减少由于访问显存而带来的时延。
下面是一个完整的例子:
CudaSample.h
#ifndef HEADER_THREADTEST_H
#define HEADER_THREADTEST_H

#ifdef __cplusplus
extern "C" {
#endif

/* CUDA device property data structure: host-side snapshot of the
 * cudaDeviceProp fields that InitCUDA() reports. deviceNo == -1 means
 * no CUDA device has been selected. */
struct DevicdProp
{
    int deviceNo;              /* selected CUDA device index, -1 = none */
    char name[256];
    size_t totalGlobalMem;
    size_t sharedMemPerBlock;
    int regsPerBlock;
    int warpSize;
    size_t memPitch;
    int maxThreadsPerBlock;
    int maxThreadsDim[3];
    int maxGridSize[3];
    size_t totalConstMem;
    int major;                 /* compute capability, major revision */
    int minor;                 /* compute capability, minor revision */
    int clockRate;
    size_t textureAlignment;
    int deviceOverlap;
    int multiProcessorCount;
};

/* thread number is equal to 2 multiprocessor's thread number */
/* NOTE(review): the scraped page had fused these two #define lines into one,
 * which is not valid C; restored as separate directives. */
#define ThreadNumPerBlock 64
#define BlockNum 4
/* Parenthesized so the macro expands safely inside larger expressions
 * (e.g. sizeof or division). Value: 4 * 64 = 256. */
#define TotalThread (BlockNum * ThreadNumPerBlock)

/* Self defined structure: three parallel arrays, one slot per GPU thread.
 * The sample computes A[i] = B[i] + C[i]. */
struct MYSTRUCT
{
    unsigned int A[TotalThread], B[TotalThread], C[TotalThread];
};

/* Init CUDA: select the first usable device, fill *lpDevicdProp with its
 * properties, print a capability report; returns false if no device. */
bool InitCUDA(DevicdProp *lpDevicdProp);
/* Cuda Sample: runs the A = B + C kernel over all TotalThread slots of
 * *lpHostStruct; blocks until results are back in host memory. */
void CudaSample(MYSTRUCT* lpHostStruct);

#ifdef __cplusplus
}
#endif

#endif /* HEADER_THREADTEST_H */

CudaCuda.cu
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include "CudaSample.h"
/************************************************************************/
/* Init CUDA                                                            */
/************************************************************************/

/* Scratch copy of the properties of the device currently being examined. */
cudaDeviceProp deviceProp;

#if DEVICE_EMULATION
/* Emulation build: no real device to probe. The original stub returned
 * true while leaving *lpDevicdProp uninitialized, although callers read
 * deviceNo; mark device 0 as selected so the struct is consistent. */
bool InitCUDA(DevicdProp *lpDevicdProp)
{
    lpDevicdProp->deviceNo = 0;
    return true;
}
#else
/* Enumerate CUDA devices, copy the first usable device's properties into
 * *lpDevicdProp, print a capability report for every device found, and
 * make the chosen device current with cudaSetDevice().
 * Returns false when no CUDA-capable device exists. */
bool InitCUDA(DevicdProp *lpDevicdProp)
{
    int count = 0;
    int i = 0;

    /* Set no CUDA device is selected */
    lpDevicdProp->deviceNo = -1;

    cudaGetDeviceCount(&count);
    if(count == 0)
    {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    else
        printf("\n\nThere are maybe %d devices supporting CUDA\n", count);

    for(i = 0; i < count; i++)
    {
        if(cudaGetDeviceProperties(&deviceProp, i) != cudaSuccess)
        {
            printf("\nDevice %d: Property cannot be get.\n", i);
            continue;
        }
        /* This function call returns 9999 for both major & minor fields,
           if no CUDA capable devices are present */
        if(deviceProp.major == 9999 && deviceProp.minor == 9999)
        {
            printf("\nDevice %d: Do not supporting CUDA.\n", i);
            continue;
        }
        if(lpDevicdProp->deviceNo == -1)
        {
            /* First usable device wins: snapshot all reported fields. */
            lpDevicdProp->deviceNo = i;
            memcpy(lpDevicdProp->name, deviceProp.name, sizeof(lpDevicdProp->name));
            lpDevicdProp->totalGlobalMem = deviceProp.totalGlobalMem;
            lpDevicdProp->sharedMemPerBlock = deviceProp.sharedMemPerBlock;
            lpDevicdProp->regsPerBlock = deviceProp.regsPerBlock;
            lpDevicdProp->warpSize = deviceProp.warpSize;
            lpDevicdProp->memPitch = deviceProp.memPitch;
            lpDevicdProp->maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
            lpDevicdProp->maxThreadsDim[0] = deviceProp.maxThreadsDim[0];
            lpDevicdProp->maxThreadsDim[1] = deviceProp.maxThreadsDim[1];
            lpDevicdProp->maxThreadsDim[2] = deviceProp.maxThreadsDim[2];
            lpDevicdProp->maxGridSize[0] = deviceProp.maxGridSize[0];
            lpDevicdProp->maxGridSize[1] = deviceProp.maxGridSize[1];
            lpDevicdProp->maxGridSize[2] = deviceProp.maxGridSize[2];
            lpDevicdProp->totalConstMem = deviceProp.totalConstMem;
            lpDevicdProp->major = deviceProp.major;
            lpDevicdProp->minor = deviceProp.minor;
            lpDevicdProp->clockRate = deviceProp.clockRate;
            lpDevicdProp->textureAlignment = deviceProp.textureAlignment;
            lpDevicdProp->deviceOverlap = deviceProp.deviceOverlap;
            lpDevicdProp->multiProcessorCount = deviceProp.multiProcessorCount;
        }

        /* Capability report for this device. */
        printf("\nDevice %d: \"%s\"\n", i, deviceProp.name);
        printf(" CUDA Capability Major revision number: %d\n", deviceProp.major);
        printf(" CUDA Capability Minor revision number: %d\n", deviceProp.minor);
        printf(" Total amount of global memory: %u bytes\n", (unsigned int)(deviceProp.totalGlobalMem));
#if CUDART_VERSION >= 2000
        printf(" Number of multiprocessors: %d\n", deviceProp.multiProcessorCount);
        /* 8 cores per SM is a pre-Fermi assumption baked into this sample. */
        printf(" Number of cores: %d\n", 8 * deviceProp.multiProcessorCount);
#endif
        printf(" Total amount of constant memory: %u bytes\n", (unsigned int)(deviceProp.totalConstMem));
        printf(" Total amount of shared memory per block: %u bytes\n", (unsigned int)(deviceProp.sharedMemPerBlock));
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %u bytes\n", (unsigned int)(deviceProp.memPitch));
        printf(" Texture alignment: %u bytes\n", (unsigned int)(deviceProp.textureAlignment));
        printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
        printf(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
               "Default (multiple host threads can use this device simultaneously)" :
               deviceProp.computeMode == cudaComputeModeExclusive ?
               "Exclusive (only one host thread at a time can use this device)" :
               deviceProp.computeMode == cudaComputeModeProhibited ?
               "Prohibited (no host thread can use this device)" :
               "Unknown");
#endif
    }

    i = lpDevicdProp->deviceNo;
    if(i == -1)
    {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    cudaSetDevice(i);
    printf("CUDA Device No. used = %d.\n", i);
    printf("CUDA initialized.\n");
    return true;
}
/************************************************************************/
/* Example kernel                                                       */
/************************************************************************/
/* Element-wise A[idx] = B[idx] + C[idx], one thread per array slot.
 * Expected launch configuration: <<<BlockNum, ThreadNumPerBlock>>>; the
 * gridDim/blockDim guards make the kernel a no-op under any other
 * configuration, and the idx bound check guards the array access.
 * NOTE(review): the scraped page had stripped the underscores from the
 * __global__ qualifier ("global static void"); restored so this compiles. */
__global__ static void CudaCalc(MYSTRUCT* lpMyStruct)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if(gridDim.x != BlockNum) return;
    if(blockDim.x != ThreadNumPerBlock) return;
    if(idx < TotalThread)
        lpMyStruct->A[idx] = lpMyStruct->B[idx] + lpMyStruct->C[idx];
}
/* Host wrapper: copy *lpHostStruct to device memory, run CudaCalc over all
 * TotalThread elements, copy the result (including A) back, and free the
 * device buffer. Blocks until the result is back in host memory. */
void CudaSample(MYSTRUCT* lpHostStruct)
{
    MYSTRUCT *lpDeviceStruct = 0;

    if(cudaMalloc((void**) &lpDeviceStruct, sizeof(MYSTRUCT)) != cudaSuccess)
    {
        fprintf(stderr, "CudaSample: cudaMalloc failed.\n");
        return;
    }
    cudaMemcpy(lpDeviceStruct, lpHostStruct, sizeof(MYSTRUCT), cudaMemcpyHostToDevice);

    CudaCalc<<<BlockNum, ThreadNumPerBlock, 0>>>(lpDeviceStruct);
    /* Launch-configuration errors only surface via cudaGetLastError(). */
    if(cudaGetLastError() != cudaSuccess)
        fprintf(stderr, "CudaSample: kernel launch failed.\n");

    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
     * modern equivalent and also surfaces asynchronous kernel errors. */
    cudaDeviceSynchronize();
    cudaMemcpy(lpHostStruct, lpDeviceStruct, sizeof(MYSTRUCT), cudaMemcpyDeviceToHost);
    cudaFree(lpDeviceStruct);
}
#endif
CudaSample.cpp
#include <stdio.h>
#include <stdlib.h>
#include "CudaSample.h"
/* Driver: initialize CUDA, fill B[i] = C[i] = i, run the GPU sample, then
 * verify A[i] == B[i] + C[i] against the CPU reference.
 * Returns 0 on success, 1 on init failure or result mismatch.
 * (The original read each result into a dead variable J and kept an unused
 * local N; replaced with an actual verification loop.) */
int main(int argc, char* argv[])
{
    DevicdProp DevicdProps;
    MYSTRUCT MyStruct;
    unsigned int I;
    unsigned int errors = 0;

    if(!InitCUDA(&DevicdProps))
        return 1;

    /* Inputs: B[i] = C[i] = i, so the expected GPU result is A[i] = 2*i. */
    for(I = 0; I < TotalThread; I++)
    {
        MyStruct.B[I] = I;
        MyStruct.C[I] = I;
    }

    CudaSample(&MyStruct);

    /* Compare the GPU output with the CPU reference. */
    for(I = 0; I < TotalThread; I++)
        if(MyStruct.A[I] != MyStruct.B[I] + MyStruct.C[I])
            errors++;
    printf("CudaSample: %u of %u results wrong.\n", errors, (unsigned int)TotalThread);

    return errors == 0 ? 0 : 1;
}
这个例子基本涵盖了用CUDA进行GPU编程的必要步骤(尚缺根据不同GPU选择不同的线程数以及在CPU中启用多线程进行多GPU并行计算)
其计算为
for(I=0; I<TotalThread; I++)
{
MyStruct.B[I] = I;
MyStruct.C[I] = I;
MyStruct.A[I] = MyStruct.B[I] + MyStruct.C[I];
}
其中MyStruct.A[I] = MyStruct.B[I] + MyStruct.C[I]在GPU中计算,每个线程算一个,总线程数为256个。本回答被题主选为最佳回答 , 对您是否有帮助呢?解决 无用评论 打赏 举报
悬赏问题
- ¥15 谁有desed数据集呀
- ¥20 手写数字识别运行c仿真时,程序报错错误代码sim211-100
- ¥15 关于#hadoop#的问题
- ¥15 (标签-Python|关键词-socket)
- ¥15 keil里为什么main.c定义的函数在it.c调用不了
- ¥50 切换TabTip键盘的输入法
- ¥15 可否在不同线程中调用封装数据库操作的类
- ¥15 微带串馈天线阵列每个阵元宽度计算
- ¥15 keil的map文件中Image component sizes各项意思
- ¥20 求个正点原子stm32f407开发版的贪吃蛇游戏