使用带有cuda的最近邻域调整图像大小

时间:2018-06-26 08:28:20

标签: opencv image-processing cuda

我正在实现最近邻域内核函数以调整输入图像的大小。但是结果是错误的,我不知道。

这是输入图像

enter image description here

结果错误。

enter image description here

我使用opencv读取输入图像。

cv::Mat image = cv::imread("/home/tumh/test.jpg");
unsigned char* data = image.data;
int outH, outW;
float *out_data_host = test(data, image.rows, image.cols, outH, outW);
cv::Mat out_image(outH, outW, CV_32FC3);
memcpy(out_image.data, out_data_host, outH * outW * 3 * sizeof(float));

float* test(unsigned char* in_data_host, const int &inH, const int &inW, int &outH, int &outW) {
// get the output size
int im_size_min = std::min(inW, inH);
int im_size_max = std::max(inW, inH);

float scale_factor = static_cast<float>(640) / im_size_min;
float im_scale_x = std::floor(inW * scale_factor / 64) * 64 / inW;
float im_scale_y = std::floor(inH * scale_factor / 64) * 64 / inH;

outW = inW * im_scale_x;
outH = inH * im_scale_y;

int channel = 3;

unsigned char* in_data_dev;
CUDA_CHECK(cudaMalloc(&in_data_dev,  sizeof(unsigned char) * channel * inH * inW));
CUDA_CHECK(cudaMemcpy(in_data_dev, in_data_host, 1 * sizeof(unsigned char) * channel * inH * inW, cudaMemcpyHostToDevice));

// image pre process
const float2 scale = make_float2( im_scale_x, im_scale_y);
float * out_buffer = NULL;
CUDA_CHECK(cudaMalloc(&out_buffer,  sizeof(float) * channel * outH * outW));
float *out_data_host = new float[sizeof(float) * channel * outH * outW];
const dim3 threads(32, 32);
const dim3 block(iDivUp(outW, threads.x), iDivUp(outW, threads.y));
gpuPreImageNet<<<block, threads>>>(scale, in_data_dev, inW, out_buffer, outW, outH);
CUDA_CHECK(cudaFree(in_data_dev));
CUDA_CHECK(cudaMemcpy(out_data_host, out_buffer, sizeof(float) * channel * outH * outW, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(out_buffer));
return out_data_host;
}

这是调整内核大小的功能

__global__ void gpuPreImageNet( float2 scale, unsigned char* input, int iWidth, float* output, int oWidth, int oHeight )
{
 const int x = blockIdx.x * blockDim.x + threadIdx.x;
 const int y = blockIdx.y * blockDim.y + threadIdx.y;
 const int n = oWidth * oHeight;
 int channel = 3;

 if( x >= oWidth || y >= oHeight )
   return;

 const int dx = ((float)x * scale.x);
 const int dy = ((float)y * scale.y);

 const unsigned char* px  = input +  dy * iWidth * channel + dx * channel ;

 const float3 bgr = make_float3(*(px + 0),  *(px + 1), *(px + 2));

 output[channel * y * oWidth + channel * x + 0] = bgr.x;
 output[channel * y * oWidth + channel * x + 1] = bgr.y;
 output[channel * y * oWidth + channel * x + 2] = bgr.z;
}

大部分实现来自https://github.com/soulsheng/ResizeNN/blob/master/resizeCUDA/resizeNN.cu

有什么主意吗?

2 个答案:

答案 0 :(得分:2)

也许您正在观察未初始化的内存问题。

据我了解您的代码,out_data_host分配太大

unmatched-changes-changelog.xml

应该是

new float[sizeof(float) * channel * outH * outW];

然后 out_buffer 未初始化,在cudaMalloc行之后添加一个cudaMemset。

为澄清您的代码,由于您已经在使用OpenCV加载图像,所以为什么不使用opencv调整图像大小呢?

new float[channel * outH * outW]

答案 1 :(得分:0)

我花了大约两天时间才找到解决该问题的方法。基本上,我正在为我的项目构建基于GPU的图像预处理管道。这是自定义的Cuda Kernel。 对于“灰度图像调整大小”,请从3-> 1更改通道,它应该可以工作。

__global__ void resize_kernel( real* pIn, real* pOut, int widthIn, int heightIn, int widthOut, int heightOut)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    int channel = 3;

    if( i < heightOut && j < widthOut )
    {
        int iIn = i * heightIn / heightOut;
        int jIn = j * widthIn / widthOut;
        for(int c = 0; c < channel; c++)
            pOut[ (i*widthOut + j)*channel + c ] = pIn[ (iIn*widthIn + jIn)*channel + c ];
    }
}