问题遇到的现象和发生背景
我有一块开发板使用的是rk3399这款瑞芯微的芯片,带有一个mali-T860的GPU,烧录了ubuntu18.04系统,系统中安装并可以使用OpenCL。
我是用OpenCL测试了GPU带宽性能,发现Device2Host和Host2Device这两个方向的带宽是4000MB/s,而Device2Device这个方向的带宽是2000MB/s。
请问为何Device2Host和Host2Device这两个方向的带宽会比Device2Device这个方向的带宽多一倍?个人认为因为mali-gpu和cpu是共享内存的,难道不应该是三个方向都差不多吗?
问题相关代码,请勿粘贴截图
///////////////////////////////////////////////////////////////////////////////
// Test the bandwidth of a HOST-to-DEVICE memcopy of a specific size.
//
// memSize : number of bytes to transfer per iteration
// accMode : DIRECT  -> use clEnqueueWriteBuffer
//           MAPPED  -> map the device buffer and memcpy into it
// memMode : PINNED  -> host data lives in a CL_MEM_ALLOC_HOST_PTR buffer
//           other   -> host data is a plain malloc'd buffer
//
// Returns the measured bandwidth in MB/s (MiB-based: 1 << 20).
///////////////////////////////////////////////////////////////////////////////
double testHostToDeviceTransfer(unsigned int memSize, accessMode accMode, memoryMode memMode)
{
double elapsedTimeInSec = 0.0;
double bandwidthInMBs = 0.0;
unsigned char* h_data = NULL;
cl_mem cmPinnedData = NULL;
cl_mem cmDevData = NULL;
cl_int ciErrNum = CL_SUCCESS;
// Allocate and init host memory, pinned or conventional
if(memMode == PINNED)
{
// Create a host buffer (driver may place it in pinned/host-visible memory)
cmPinnedData = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, memSize, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// Get a mapped pointer so the host can fill the buffer
h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_WRITE, 0, memSize, 0, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//initialize with a repeating byte pattern
for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_data[i] = (unsigned char)(i & 0xff);
}
// unmap and make data in the host buffer valid
ciErrNum = clEnqueueUnmapMemObject(cqCommandQueue, cmPinnedData, (void*)h_data, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
h_data = NULL; // buffer is unmapped
}
else
{
// standard host alloc
h_data = (unsigned char *)malloc(memSize);
//initialize with a repeating byte pattern
for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_data[i] = (unsigned char)(i & 0xff);
}
}
// allocate device memory (the destination of the timed transfers)
cmDevData = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// Sync queue to host, start timer 0, and copy data from Host to GPU
clFinish(cqCommandQueue);
shrDeltaT(0);
if(accMode == DIRECT)
{
if(memMode == PINNED)
{
// Get a mapped pointer to use as the host-side source
h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_READ, 0, memSize, 0, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
}
// DIRECT: API access to device buffer (writes queued non-blocking, drained once below)
for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevData, CL_FALSE, 0, memSize, h_data, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
}
ciErrNum = clFinish(cqCommandQueue);
oclCheckError(ciErrNum, CL_SUCCESS);
}
else
{
// MAPPED: mapped pointers to device buffer and conventional pointer access
void* dm_idata = clEnqueueMapBuffer(cqCommandQueue, cmDevData, CL_TRUE, CL_MAP_WRITE, 0, memSize, 0, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
if(memMode == PINNED )
{
h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_READ, 0, memSize, 0, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
}
for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
memcpy(dm_idata, h_data, memSize);
}
ciErrNum = clEnqueueUnmapMemObject(cqCommandQueue, cmDevData, dm_idata, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
}
//get the the elapsed time in seconds
elapsedTimeInSec = shrDeltaT(0);
//calculate bandwidth in MB/s
bandwidthInMBs = ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20));
//clean up memory
if(cmDevData)clReleaseMemObject(cmDevData);
if(cmPinnedData)
{
// Both DIRECT and MAPPED pinned paths leave h_data mapped at this point;
// guard anyway so an unmapped/NULL pointer is never passed to the API.
if(h_data)
{
clEnqueueUnmapMemObject(cqCommandQueue, cmPinnedData, (void*)h_data, 0, NULL, NULL);
}
clReleaseMemObject(cmPinnedData);
}
else
{
// FIX: non-pinned path malloc'd h_data; the original leaked it on every call.
free(h_data);
}
h_data = NULL;
return bandwidthInMBs;
}
///////////////////////////////////////////////////////////////////////////////
// Test the bandwidth of a DEVICE-to-DEVICE memcopy of a specific size.
//
// memSize : number of bytes copied per iteration via clEnqueueCopyBuffer.
// Returns the bandwidth in MB/s (MiB-based). NOTE: the result is multiplied
// by 2 because each copy both reads and writes device memory (see below).
///////////////////////////////////////////////////////////////////////////////
double testDeviceToDeviceTransfer(unsigned int memSize)
{
double elapsedTimeInSec = 0.0;
double bandwidthInMBs = 0.0;
unsigned char* h_idata = NULL;
cl_int ciErrNum = CL_SUCCESS;
//allocate host memory (staging buffer used only to seed the device input)
h_idata = (unsigned char *)malloc( memSize );
//initialize the memory with a repeating byte pattern
for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_idata[i] = (unsigned char) (i & 0xff);
}
// allocate device input and output memory and initialize the device input memory
cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, memSize, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, memSize, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// blocking write (CL_TRUE) so the seed transfer completes before timing begins
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, d_idata, CL_TRUE, 0, memSize, h_idata, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
// Sync queue to host, start timer 0, and copy data from one GPU buffer to another GPU bufffer
clFinish(cqCommandQueue);
shrDeltaT(0);
for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
ciErrNum = clEnqueueCopyBuffer(cqCommandQueue, d_idata, d_odata, 0, 0, memSize, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
}
// Sync with GPU so all queued copies are included in the measured interval
clFinish(cqCommandQueue);
//get the the elapsed time in seconds
elapsedTimeInSec = shrDeltaT(0);
// Calculate bandwidth in MB/s
// This is for kernels that read and write GMEM simultaneously
// Obtained Throughput for unidirectional block copies will be 1/2 of this #
// (on a unified-memory SoC like RK3399 both read and write hit the same DRAM,
//  so the unidirectional D2D figure can appear ~half of H2D/D2H)
bandwidthInMBs = 2.0 * ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20));
//clean up memory on host and device
free(h_idata);
clReleaseMemObject(d_idata);
clReleaseMemObject(d_odata);
return bandwidthInMBs;
}