I am trying to convert rgb to gray-scale image with cuda gpu. The first step of converting rg2grayscale goes well. But when i trying to convert grayImage to smoothImage by applying filters, the programm compiles but give runtime error an illegal memory access was encountered(CUDA error no.=700) on line 160 CHECKCUDAERROR(cudaMemcpy(smoothImage, d3, size, cudaMemcpyDeviceToHost))
.
For further debugging i used cuda-memcheck it points the error as Invalid __global__ read of size 4
========= at 0x00000248 in /content/drive/My Drive/gpu/sequential/co.cu:77:triangularSmooth(unsigned char*, unsigned char*, int, int, float const *)
========= by thread (0,10,0) in block (20,0,0), Address 0x55e0d2eb83a0 is out of bounds
#include <Timer.hpp>
#include <iostream>
#include <iomanip>
#include <iostream>
#include <iomanip>
#include <cstring>
#include "CImg.h"
#define CHECKCUDAERROR(err) {if (cudaSuccess != err) {fprintf(stderr, "CUDA ERROR: %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err, __LINE__, __FILE__); exit(EXIT_FAILURE); }}
using LOFAR::NSTimer;
using std::cout;
using std::cerr;
using std::endl;
using std::fixed;
using std::setprecision;
using cimg_library::CImg;
// Constants
const bool displayImages = false;
const bool saveAllImages = false;
const unsigned int HISTOGRAM_SIZE = 256;
const unsigned int BAR_WIDTH = 4;
const unsigned int CONTRAST_THRESHOLD = 80;
const float filter[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 3.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
unsigned char *d1 , *d2, *d3;
double realtime()
{
struct timeval tp;
struct timezone tzp;
gettimeofday(&tp, &tzp);
return tp.tv_sec + tp.tv_usec * 1e-6;
}
__global__ void rgb2gray(unsigned char *inputImage, unsigned char *grayImage, const int width,
const int height)
{
int col = threadIdx.x + blockIdx.x * blockDim.x;
int row = threadIdx.y + blockIdx.y * blockDim.y;
if (col < width && row < height)
{
float grayPix = 0.0f;
float r = static_cast< float >(inputImage[(row * width) + col]);
float g = static_cast< float >(inputImage[(width * height) + (row * width) + col]);
float b = static_cast< float >(inputImage[(2 * width * height) + (row * width) + col]);
grayPix = (0.3f * r) + (0.59f * g) + (0.11f * b);
grayImage[(row * width) + col] = static_cast< unsigned char >(grayPix);
}
}
__global__ void triangularSmooth(unsigned char *grayImage, unsigned char *smoothImage,
const int width, const int height, const float *filter)
{
int col = threadIdx.x + blockIdx.x * blockDim.x;
int row = threadIdx.y + blockIdx.y * blockDim.y;
if (col < width && row < height)
{
unsigned int filterItem = 0;
float filterSum = 0.0f;
float smoothPix = 0.0f;
for ( int fy = row - 2; fy < row + 3; fy++ ) {
for ( int fx = col - 2; fx < col + 3; fx++ ) {
if ( ((fy < 0) || (fy >= height)) || ((fx < 0) || (fx >= width)) ) {
filterItem++;
continue;
}
smoothPix += grayImage[(fy * width) + fx] * filter[filterItem];
filterSum += filter[filterItem];
filterItem++;
}
}
smoothPix /= filterSum;
smoothImage[(row * width) + col] = static_cast< unsigned char >(smoothPix);
}
}
void rgb2gray_pl(unsigned char *inputImage, unsigned char *grayImage, const int width, const int height) {
// Initialize device pointers.
size_t size = width * height * sizeof(unsigned char);
double cudamalloc_time = realtime();
// Allocate device memory.
CHECKCUDAERROR(cudaMalloc(&d1, 3*size));
CHECKCUDAERROR(cudaMalloc(&d2, size));
cout << fixed << setprecision(6);
cout << "cudaMalloc: \t\t" << realtime() - cudamalloc_time << " seconds." << endl;
double cudamemcpy_d1 = realtime();
// Transfer from host to device.
CHECKCUDAERROR(cudaMemcpy(d1, inputImage, 3*size, cudaMemcpyHostToDevice));
cout << fixed << setprecision(6);
cout << "cudaMemcpy_host_to_device: \t\t" << realtime() - cudamemcpy_d1 << " seconds." << endl;
double kernel_time = realtime();
//define block and grid dimensions
const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
const dim3 dimBlock(16, 16);
//execute cuda kernel
rgb2gray<<<dimGrid, dimBlock>>>(d1, d2, width, height);
CHECKCUDAERROR(cudaPeekAtLastError());
cout << fixed << setprecision(6);
cout << "kernel: \t\t" << realtime() - kernel_time << " seconds." << endl;
double cudamemcpy_d2 = realtime();
//copy computed gray data array from device to host
CHECKCUDAERROR(cudaMemcpy(grayImage, d2, size, cudaMemcpyDeviceToHost));
cout << fixed << setprecision(6);
cout << "cudaMemcpy_device_to_host: \t\t" << realtime() - cudamemcpy_d2 << " seconds." << endl;
}
void smooth_pl(unsigned char *grayImage, unsigned char *smoothImage, const int width, const int height) {
size_t size = width * height * sizeof(unsigned char);
double malloc_time = realtime();
CHECKCUDAERROR(cudaMalloc(&d3, size));
cout << fixed << setprecision(6);
cout << "triangular_smooth_malloc: \t\t" << realtime() - malloc_time << " seconds." << endl;
double t_s_kernel_time = realtime();
//execute cuda kernel
const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
const dim3 dimBlock(16, 16);
triangularSmooth<<<dimGrid, dimBlock>>>(d2, d3, width, height, filter);
CHECKCUDAERROR(cudaPeekAtLastError());
cout << fixed << setprecision(6);
cout << "triangular_smooth_kernel: \t\t" << realtime() - t_s_kernel_time << " seconds." << endl;
//copy computed smooth data array from device to host
CHECKCUDAERROR(cudaMemcpy(smoothImage, d3, size, cudaMemcpyDeviceToHost));
double cuda_free = realtime();
CHECKCUDAERROR(cudaFree(d1));
CHECKCUDAERROR(cudaFree(d2));
CHECKCUDAERROR(cudaFree(d3));
cout << fixed << setprecision(6);
cout << "cudaFree: \t\t" << realtime() - cuda_free << " seconds." << endl;
}
int main(int argc, char *argv[])
{
//NSTimer total = NSTimer("total", false, false);
//double prev_time;
if ( argc != 2 ) {
cerr << "Usage: " << argv[0] << " <filename>" << endl;
cout << fixed << setprecision(6);
return 1;
}
// Load the input image
CImg< unsigned char > inputImage = CImg< unsigned char >(argv[1]);
if ( displayImages ) {
inputImage.display("Input Image");
}
if ( inputImage.spectrum() != 3 ) {
//cerr << "The input must be a color image." << endl;
//return 1;
}
double total_time = realtime();
CImg<unsigned char> grayImage = CImg<unsigned char>(inputImage.width(), inputImage.height(), 1, 1);
CImg< unsigned char > smoothImage = CImg< unsigned char >(grayImage.width(), grayImage.height(), 1, 1);
rgb2gray_pl(inputImage.data(), grayImage.data(), inputImage.width(), inputImage.height());
cout << fixed << setprecision(6);
cout << "Total: \t\t" << realtime() - total_time << " seconds." << endl;
grayImage.save("./grayscale.bmp");
smooth_pl(grayImage.data(),smoothImage.data(), grayImage.width(), grayImage.height());
smoothImage.save("./smooth.bmp");
//allocate and initialize memory on device
return 0;
}
User contributions licensed under CC BY-SA 3.0