新手刚接触CUDA C,这段中值滤波的代码一直跑不通,如果读取一张小图片(2790,2560)没有问题,如果读取一张大图片(5580,5120),就会导致黑屏+没有结果。
我的显卡是NVIDIA Geforce 820M
#include <iostream>
#include "ImMedFilter.h"
#define DIM_GRID 128
#define DIM_BLOCK 16
using namespace std;
__device__ void mid(unsigned char* data, int size, int midIndex)
{
unsigned char temp = 0;
for (int i = 0; i <= midIndex; i++)
{
for (int j = i+1; j < size; j++)
{
if (data[j] > data[i])
{
temp = data[i];
data[i] = data[j];
data[j] = temp;
}
}
}
}
__global__ void global_ZhongSmooth(unsigned char *dev_src, unsigned char *dev_dst, int width , int height, int bytesPerLine, int size, int markSize)
{
int x = threadIdx.x + blockIdx.x*blockDim.x;
int y = threadIdx.y + blockIdx.y*blockDim.y;
int step_x = blockDim.x * gridDim.x;
int step_y = blockDim.y * gridDim.y;
for (int t_y = y; t_y < height - markSize; t_y = t_y + step_y)
{
if (t_y < markSize || t_y > height-markSize-1)
{
continue;
}
for (int t_x = x; t_x < width - markSize; t_x = t_x + step_x)
{
if (t_x < markSize || t_x > width-markSize-1)
{
continue;
}
unsigned int index = t_x + t_y*width;
if( index < size )
{
unsigned char len = (markSize*2+1)*(markSize*2+1);
unsigned char midIndex = len/2;
// 创建保存中值区域的数组
// size = (marksize * 2 + 1) ^ 2;
// 滤波大小:3、5、7、9、...、29、31
unsigned char*a = NULL;
switch (markSize)
{
case 1:
unsigned char c[9];
a = c;
break;
case 2:
unsigned char d[25];
a = d;
break;
case 3:
unsigned char e[49];
a = e;
break;
case 4:
unsigned char f[81];
a = f;
break;
case 5:
unsigned char g[121];
a = g;
break;
case 6:
unsigned char h[169];
a = h;
break;
case 7:
unsigned char v[225];
a = v;
break;
case 8:
unsigned char w[289];
a = w;
break;
case 9:
unsigned char x[361];
a = x;
break;
case 10:
unsigned char z[441];
a = z;
break;
case 11:
unsigned char u[529];
a = u;
break;
case 12:
unsigned char q[625];
a = q;
break;
case 13:
unsigned char i[729];
a = i;
break;
case 14:
unsigned char o[841];
a = o;
break;
default:
return;
}
unsigned char k = 0;
for (int i = -markSize; i <= markSize; i++)
{
for (int j = -markSize; j <= markSize; j++)
{
a[k++] = dev_src[t_x+j+(t_y+i)*width];
}
}
mid(a, len, midIndex);
dev_dst[index] = a[midIndex];
}
}
}
}
extern "C" void ImMedfilter(const unsigned char *host_src, unsigned char *host_dst, int width, int height, int bytesPerLine, int markSize)
{
int dataSize = bytesPerLine * height;
unsigned char *dev_src = NULL;
unsigned char *dev_dst = NULL;
cudaMalloc((void**) &dev_src, dataSize);
cudaMalloc((void**) &dev_dst, dataSize);
cudaMemset(dev_dst, 0, dataSize);
cudaMemcpy(dev_src, host_src, dataSize, cudaMemcpyHostToDevice);
dim3 blocks(DIM_BLOCK, DIM_BLOCK);
dim3 grids(DIM_GRID, DIM_GRID);
global_ZhongSmooth<<<grids, blocks>>>(dev_src, dev_dst, width, height, bytesPerLine, dataSize, markSize);
cudaMemcpy(host_dst, dev_dst, dataSize, cudaMemcpyDeviceToHost);
cudaFree(dev_src);
cudaFree(dev_dst);
}