I developed a Bayer demosaicing routine using CUDA4, but it is slower than a single-threaded CPU implementation on a 16-core GTS250. The block size is set to (16,16) and image dimensions are multiples of 16. Changing these parameters doesn’t seem to speed it up.
Am I making some obvious mistakes?
— Calling Routine —
uchar4 *d_output;
size_t num_bytes;
cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource);
// Perform conversion and store result in PBO for display
kernel_wrapper(imageWidth, imageHeight, blockSize, gridSize, d_output);
cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
— CUDA Kernel —
texture<uchar, 2, cudaReadModeElementType> tex;
cudaArray *d_imageArray = 0;
__global__ void demosaicKernel(uchar4 *d_output, uint width, uint height) {
uint x = blockIdx.x * blockDim.x + threadIdx.x;
uint y = blockIdx.y * blockDim.y + threadIdx.y;
uint i = y * width + x;
if (x < width && y < height) {
if (y % 2) {
if (x % 2) {
d_output[i].x = (tex2D(tex, x + 1, y) + tex2D(tex, x - 1, y)) / 2;
d_output[i].y = tex2D(tex, x, y);
d_output[i].z = (tex2D(tex, x, y + 1) + tex2D(tex, x, y - 1)) / 2;
} else {
d_output[i].x = tex2D(tex, x, y);
d_output[i].y = (tex2D(tex, x + 1, y) + tex2D(tex, x - 1, y) + tex2D(tex, x, y + 1) + tex2D(tex, x, y - 1)) / 4;
d_output[i].z = (tex2D(tex, x + 1, y + 1) + tex2D(tex, x + 1, y - 1) + tex2D(tex, x - 1, y + 1) + tex2D(tex, x - 1, y - 1)) / 4;
}
} else {
if (x % 2) {
d_output[i].y = (tex2D(tex, x + 1, y + 1) + tex2D(tex, x + 1, y - 1) + tex2D(tex, x - 1, y + 1) + tex2D(tex, x - 1, y - 1)) / 4;
d_output[i].z = tex2D(tex, x, y);
d_output[i].y = (tex2D(tex, x + 1, y) + tex2D(tex, x - 1, y) + tex2D(tex, x, y + 1) + tex2D(tex, x, y - 1)) / 4;
} else {
d_output[i].x = (tex2D(tex, x, y + 1) + tex2D(tex, x, y - 1)) / 2;
d_output[i].y = tex2D(tex, x, y);
d_output[i].z = (tex2D(tex, x + 1, y) + tex2D(tex, x - 1, y)) / 2;
}
}
}
}
— Texture Initialization —
void initializeTexture(int width, int height, uchar *imageData) {
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaMallocArray(&d_imageArray, &channelDesc, width, height);
size_t size = width * height * sizeof(uchar);
cudaMemcpyToArray(d_imageArray, 0, 0, imageData, size, cudaMemcpyHostToDevice);
free(imageData);
// Configure texture settings
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
cudaBindTextureToArray(tex, d_imageArray);
}