qq_37476710
qq_37476710
2017-10-18 06:30
采纳率: 100%
浏览 1.3k
已采纳

cuda编程困惑cudaMemcopy

该函数底层是cpu在拷贝还是gpu在拷贝,考入考出耗时这么大,gpu加速计算时是怎么加速的,原理是什么,程序应该怎么设计?

  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 邀请回答

2条回答 默认 最新

  • hello_hi_hi
    hello_hi_hi 2017-10-18 09:46
    已采纳

    cudaMemcpy(注意正确拼写是 cudaMemcpy,没有字母 o)是CPU和GPU两者共同作用的结果:由CPU发起调用,数据经PCI-E总线搬运。
    GPU编程就是八股文,共分三个步骤:
    1、在启动GPU计算前使用cudaMemcpy将计算机内存的数据拷贝入GPU内存中,
    2、启动GPU计算
    3、GPU计算结束后使用cudaMemcpy将GPU内存中的计算结果返回CPU内存中。
    由于数据交换通过PCI-E接口进行,只要数据量不是太大且接口数据带宽足够,耗时不会很大!
    GPU加速主要适用于高并行度、高计算量、低内存占用的项目。
    1、高并行度——充分利用GPU中众多处理器的特点,根据不同的GPU可用成千上万个线程并行处理;
    2、高计算量——可以抵消cudaMemcpy带来的影响
    3、低内存占用——尽量使用局部内存,减少对显存的使用,从而减少由于访问显存而带来的时延。
    下面是一个完整的例子:
    CudaSample.h
    #ifndef HEADER_THREADTEST_H
    #define HEADER_THREADTEST_H
    #ifdef __cplusplus
    extern "C" {
    #endif

    //CUDA device property data structure
    // Host-side snapshot of the subset of CUDA's cudaDeviceProp fields that
    // this sample reports.  Filled in by InitCUDA() for the first usable
    // device.  NOTE(review): "DevicdProp" looks like a typo for "DeviceProp";
    // kept as-is because callers reference this exact name.
    struct DevicdProp
    {
    int deviceNo;                 // index passed to cudaSetDevice(); -1 = no device selected
    char name[256];               // ASCII device name copied from cudaDeviceProp::name
    size_t totalGlobalMem;        // global memory size, in bytes
    size_t sharedMemPerBlock;     // shared memory available per block, in bytes
    int regsPerBlock;             // 32-bit registers available per block
    int warpSize;                 // threads per warp (32 on all current hardware)
    size_t memPitch;              // maximum pitch allowed by cudaMallocPitch, in bytes
    int maxThreadsPerBlock;
    int maxThreadsDim[3];         // maximum block dimensions (x, y, z)
    int maxGridSize[3];           // maximum grid dimensions (x, y, z)
    size_t totalConstMem;         // constant memory size, in bytes
    int major;                    // compute capability, major revision
    int minor;                    // compute capability, minor revision
    int clockRate;                // clock frequency in kilohertz
    size_t textureAlignment;      // alignment requirement for textures, in bytes
    int deviceOverlap;            // nonzero if copy and kernel execution can overlap
    int multiProcessorCount;      // number of multiprocessors on the device
    };

    //thread number is equal to 2 multiprocessor's thread number
    // Threads launched per block (two warps of 32).
    #define ThreadNumPerBlock 64

    // Number of thread blocks launched by the sample kernel.
    #define BlockNum 4

    // Total worker-thread count (256).  Parenthesized so the macro expands
    // safely inside larger expressions: without parentheses, an expression
    // like `x / TotalThread` would expand to `x / BlockNum * ThreadNumPerBlock`
    // and silently compute the wrong value.
    #define TotalThread (BlockNum * ThreadNumPerBlock)

    //Self defined structure
    // Plain-old-data payload moved between host and device in a single
    // cudaMemcpy.  B and C are the inputs; A receives the per-element sum
    // computed on the GPU (one element per launched thread).
    struct MYSTRUCT
    {
    unsigned int A[TotalThread], B[TotalThread], C[TotalThread];
    };

    //Init CUDA
    // Enumerates CUDA devices, fills *lpDevicdProp with the first usable
    // device's properties and selects it; returns false when no usable
    // CUDA device exists.
    bool InitCUDA(DevicdProp *lpDevicdProp);

    //Cuda Sample
    // Copies *lpHostStruct to the GPU, launches the A = B + C kernel, and
    // copies the result back into *lpHostStruct.
    void CudaSample(MYSTRUCT* lpHostStruct);

    #ifdef __cplusplus
    }
    #endif
    #endif

    CudaCuda.cu
    // NOTE(review): the forum markup ate the angle-bracket header names in
    // the original post; restored per the author's follow-up answer, plus
    // <string.h>, which is required for the memcpy() call in InitCUDA().
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include <cuda_runtime.h>

    #include "CudaSample.h"

    /************************************************************************/
    /* Init CUDA                                                            */
    /************************************************************************/
    // Scratch buffer reused while enumerating devices; after InitCUDA()
    // returns it holds the properties of the device queried last.
    cudaDeviceProp deviceProp;

    #if DEVICE_EMULATION

    // Emulation build: skip device enumeration entirely and report success.
    // NOTE(review): *lpDevicdProp is left untouched here, including
    // deviceNo — callers must not rely on its contents in emulation mode.
    bool InitCUDA(DevicdProp *lpDevicdProp){return true;}

    #else
    // Enumerate all CUDA devices, print their properties, record the first
    // usable one in *lpDevicdProp and select it with cudaSetDevice().
    // Returns true on success; false when no CUDA-capable device exists or
    // the selected device cannot be activated.
    bool InitCUDA(DevicdProp *lpDevicdProp)
    {
        int count = 0;
        int i = 0;

        // Mark "no device selected yet"; filled in below.
        lpDevicdProp->deviceNo = -1;

        cudaGetDeviceCount(&count);

        if(count == 0)
        {
            fprintf(stderr, "There is no device.\n");
            return false;
        }
        else
            printf("\n\nThere are maybe %d devices supporting CUDA\n", count);

        for(i = 0; i < count; i++)
        {
            if(cudaGetDeviceProperties(&deviceProp, i) != cudaSuccess)
            {
                printf("\nDevice %d: Property cannot be get.\n", i);
                continue;
            }

            // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if(deviceProp.major == 9999 && deviceProp.minor == 9999)
            {
                printf("\nDevice %d: Do not supporting CUDA.\n", i);
                continue;
            }

            // Keep the first usable device; later ones are only listed.
            if(lpDevicdProp->deviceNo == -1)
            {
                lpDevicdProp->deviceNo = i;
                // sizeof() instead of a hard-coded 256 keeps the copy in
                // sync with the declared size of DevicdProp::name.
                memcpy(lpDevicdProp->name, deviceProp.name, sizeof(lpDevicdProp->name));
                lpDevicdProp->totalGlobalMem        = deviceProp.totalGlobalMem;
                lpDevicdProp->sharedMemPerBlock     = deviceProp.sharedMemPerBlock;
                lpDevicdProp->regsPerBlock          = deviceProp.regsPerBlock;
                lpDevicdProp->warpSize              = deviceProp.warpSize;
                lpDevicdProp->memPitch              = deviceProp.memPitch;
                lpDevicdProp->maxThreadsPerBlock    = deviceProp.maxThreadsPerBlock;
                lpDevicdProp->maxThreadsDim[0]      = deviceProp.maxThreadsDim[0];
                lpDevicdProp->maxThreadsDim[1]      = deviceProp.maxThreadsDim[1];
                lpDevicdProp->maxThreadsDim[2]      = deviceProp.maxThreadsDim[2];
                lpDevicdProp->maxGridSize[0]        = deviceProp.maxGridSize[0];
                lpDevicdProp->maxGridSize[1]        = deviceProp.maxGridSize[1];
                lpDevicdProp->maxGridSize[2]        = deviceProp.maxGridSize[2];
                lpDevicdProp->totalConstMem         = deviceProp.totalConstMem;
                lpDevicdProp->major                 = deviceProp.major;
                lpDevicdProp->minor                 = deviceProp.minor;
                lpDevicdProp->clockRate             = deviceProp.clockRate;
                lpDevicdProp->textureAlignment      = deviceProp.textureAlignment;
                lpDevicdProp->deviceOverlap         = deviceProp.deviceOverlap;
                lpDevicdProp->multiProcessorCount   = deviceProp.multiProcessorCount;
            }

            printf("\nDevice %d: \"%s\"\n", i, deviceProp.name);
            printf("  CUDA Capability Major revision number:         %d\n", deviceProp.major);
            printf("  CUDA Capability Minor revision number:         %d\n", deviceProp.minor);
            printf("  Total amount of global memory:                 %u bytes\n", (unsigned int)(deviceProp.totalGlobalMem));

    #if CUDART_VERSION >= 2000
            printf(" Number of multiprocessors: %d\n", deviceProp.multiProcessorCount);
            // NOTE(review): "8 cores per multiprocessor" is only correct for
            // compute capability 1.x hardware.
            printf(" Number of cores: %d\n", 8 * deviceProp.multiProcessorCount);
    #endif
            printf(" Total amount of constant memory: %u bytes\n", (unsigned int)(deviceProp.totalConstMem));
            printf(" Total amount of shared memory per block: %u bytes\n", (unsigned int)(deviceProp.sharedMemPerBlock));
            printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
            printf(" Warp size: %d\n", deviceProp.warpSize);
            printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
            printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
                   deviceProp.maxThreadsDim[0],
                   deviceProp.maxThreadsDim[1],
                   deviceProp.maxThreadsDim[2]);
            printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
                   deviceProp.maxGridSize[0],
                   deviceProp.maxGridSize[1],
                   deviceProp.maxGridSize[2]);
            printf(" Maximum memory pitch: %u bytes\n", (unsigned int)(deviceProp.memPitch));
            printf(" Texture alignment: %u bytes\n", (unsigned int)(deviceProp.textureAlignment));
            printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
    #if CUDART_VERSION >= 2000
            printf(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
    #endif
    #if CUDART_VERSION >= 2020
            printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
            printf(" Integrated: %s\n", deviceProp.integrated ? "Yes" : "No");
            printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
            printf(" Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
                   "Default (multiple host threads can use this device simultaneously)" :
                   deviceProp.computeMode == cudaComputeModeExclusive ?
                   "Exclusive (only one host thread at a time can use this device)" :
                   deviceProp.computeMode == cudaComputeModeProhibited ?
                   "Prohibited (no host thread can use this device)" :
                   "Unknown");
    #endif
        }

        i = lpDevicdProp->deviceNo;
        if(i == -1)
        {
            fprintf(stderr, "There is no device supporting CUDA.\n");
            return false;
        }

        // The original code ignored this return value; an unnoticed failure
        // here would make every subsequent CUDA call fail mysteriously.
        if(cudaSetDevice(i) != cudaSuccess)
        {
            fprintf(stderr, "cudaSetDevice(%d) failed.\n", i);
            return false;
        }
        printf("CUDA Device No. used = %d.\n", i);

        printf("CUDA initialized.\n");
        return true;
    }

    /************************************************************************/
    /* Example kernel: element-wise A[i] = B[i] + C[i]                      */
    /************************************************************************/
    // Expected launch configuration: <<<BlockNum, ThreadNumPerBlock>>>,
    // one thread per array element; lpMyStruct must be a device pointer.
    // NOTE(review): the forum markup stripped the underscores from
    // "__global__" in the original post; restored here (the code cannot
    // compile without them).
    __global__ static void CudaCalc(MYSTRUCT* lpMyStruct)
    {
        // Flat global thread index across the whole grid.
        int idx = threadIdx.x + blockIdx.x * blockDim.x;

        // Defensive guards: do nothing if the launch configuration does not
        // match the compile-time constants this kernel was written against.
        if(gridDim.x != BlockNum)
            return;
        if(blockDim.x != ThreadNumPerBlock)
            return;

        // Bounds check before touching the arrays.
        if(idx < TotalThread)
            lpMyStruct->A[idx] = lpMyStruct->B[idx] + lpMyStruct->C[idx];
    }

    // Host wrapper demonstrating the canonical three-step GPU pattern:
    // 1) copy inputs host->device, 2) launch the kernel, 3) copy the
    // result device->host.  On success lpHostStruct->A holds B + C.
    void CudaSample(MYSTRUCT* lpHostStruct)
    {
        MYSTRUCT    *lpDeviceStruct = 0;

        // Step 1: allocate device memory and copy the inputs (B, C) in.
        if(cudaMalloc((void**) &lpDeviceStruct, sizeof(MYSTRUCT)) != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed\n");
            return;
        }
        cudaMemcpy(lpDeviceStruct, lpHostStruct, sizeof(MYSTRUCT), cudaMemcpyHostToDevice);

        // Step 2: launch the kernel (asynchronous with respect to the host).
        CudaCalc<<<BlockNum, ThreadNumPerBlock, 0>>>(lpDeviceStruct);

        // Launch-configuration errors only surface via cudaGetLastError();
        // the launch statement itself reports nothing.
        if(cudaGetLastError() != cudaSuccess)
            fprintf(stderr, "Kernel launch failed\n");

        // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is
        // the equivalent modern call and also surfaces async kernel errors.
        cudaDeviceSynchronize();

        // Step 3: copy the result (A) back and release device memory.
        cudaMemcpy(lpHostStruct, lpDeviceStruct, sizeof(MYSTRUCT), cudaMemcpyDeviceToHost);
        cudaFree(lpDeviceStruct);
    }

    #endif

    CudaSample.cpp

    // NOTE(review): the forum markup ate the angle-bracket header names in
    // the original post; restored per the author's follow-up answer.
    #include <stdio.h>
    #include <stdlib.h>

    #include "CudaSample.h"

    int main(int argc, char* argv[])
    {
    DevicdProp DevicdProps;
    MYSTRUCT MyStruct;
    unsigned int I, J, N;

    if(!InitCUDA(&DevicdProps))
        return 1;
    
    for(I=0; I<TotalThread; I++)
    {
        MyStruct.B[I] = I;
        MyStruct.C[I] = I;
    }
    
    CudaSample(&MyStruct);
    
    for(I=0; I<TotalThread; I++)
        J = MyStruct.A[I];
    
    return 0;
    

    }

    这个例子基本涵盖了用CUDA进行GPU编程的必要步骤(尚缺根据不同GPU选择不同的线程数以及在CPU中启用多线程进行多GPU并行计算)
    其计算为
    for(I=0; I<TotalThread; I++)
    {
    MyStruct.B[I] = I;
    MyStruct.C[I] = I;
    MyStruct.A[I] = MyStruct.B[I] + MyStruct.C[I];
    }
    其中MyStruct.A[I] = MyStruct.B[I] + MyStruct.C[I]在GPU中计算,每个线程算一个,总线程数为256个。

    点赞 评论
  • hello_hi_hi
    hello_hi_hi 2017-10-18 09:51

    补充:上面代码中被网页吞掉的头文件依次为:
    #include <stdio.h>
    #include <stdlib.h>

    #include <cuda_runtime.h>

    点赞 评论

相关推荐