an illegal memory access was encountered(CUDA error no.=700)

Question

an illegal memory access was encountered(CUDA error no.=700)

I am trying to convert rgb to gray-scale image with cuda gpu. The first step of converting rg2grayscale goes well. But when i trying to convert grayImage to smoothImage by applying filters, the programm compiles but give runtime error an illegal memory access was encountered(CUDA error no.=700) on line 160 CHECKCUDAERROR(cudaMemcpy(smoothImage, d3, size, cudaMemcpyDeviceToHost)). For further debugging i used cuda-memcheck it points the error as Invalid __global__ read of size 4 ========= at 0x00000248 in /content/drive/My Drive/gpu/sequential/co.cu:77:triangularSmooth(unsigned char*, unsigned char*, int, int, float const *) ========= by thread (0,10,0) in block (20,0,0), Address 0x55e0d2eb83a0 is out of bounds

#include <Timer.hpp>
#include <iostream>
#include <iomanip>
#include <iostream>
#include <iomanip>
#include <cstring>
#include "CImg.h"

#define CHECKCUDAERROR(err)     {if (cudaSuccess != err) {fprintf(stderr, "CUDA ERROR: %s(CUDA error no.=%d). Line no. %d in file %s\n", cudaGetErrorString(err), err,  __LINE__, __FILE__); exit(EXIT_FAILURE); }}

using LOFAR::NSTimer;
using std::cout;
using std::cerr;
using std::endl;
using std::fixed;
using std::setprecision;
using cimg_library::CImg;

// Constants
const bool displayImages = false;
const bool saveAllImages = false;
const unsigned int HISTOGRAM_SIZE = 256;
const unsigned int BAR_WIDTH = 4;
const unsigned int CONTRAST_THRESHOLD = 80;
const float filter[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 3.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};

unsigned char *d1 , *d2, *d3;

double realtime()
{
    struct timeval tp;
    struct timezone tzp;
    gettimeofday(&tp, &tzp);
    return tp.tv_sec + tp.tv_usec * 1e-6;
}
__global__ void rgb2gray(unsigned char *inputImage, unsigned char *grayImage, const int width, 
const int height) 
{

    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col < width && row < height)
    {

            float grayPix = 0.0f;
            float r = static_cast< float >(inputImage[(row * width) + col]);
            float g = static_cast< float >(inputImage[(width * height) + (row * width) + col]);
            float b = static_cast< float >(inputImage[(2 * width * height) + (row * width) + col]);

            grayPix = (0.3f * r) + (0.59f * g) + (0.11f * b);

            grayImage[(row * width) + col] = static_cast< unsigned char >(grayPix);
        }
    }

__global__ void triangularSmooth(unsigned char *grayImage, unsigned char *smoothImage, 
                      const int width, const int height, const float *filter) 
{

    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col < width && row < height)
        {
            unsigned int filterItem = 0;
            float filterSum = 0.0f;
            float smoothPix = 0.0f;

            for ( int fy = row - 2; fy < row + 3; fy++ ) {
                for ( int fx = col - 2; fx < col + 3; fx++ ) {
                    if ( ((fy < 0) || (fy >= height)) || ((fx < 0) || (fx >= width)) ) {
                        filterItem++;
                        continue;
                    }

                    smoothPix += grayImage[(fy * width) + fx] * filter[filterItem];
                    filterSum += filter[filterItem];
                    filterItem++;
                }
            }

            smoothPix /= filterSum;
            smoothImage[(row * width) + col] = static_cast< unsigned char >(smoothPix);

        }

}
void rgb2gray_pl(unsigned char *inputImage, unsigned char *grayImage, const int width, const int height) {

    // Initialize device pointers.
    size_t size = width * height * sizeof(unsigned char);

    double cudamalloc_time = realtime();

    // Allocate device memory.
    CHECKCUDAERROR(cudaMalloc(&d1, 3*size));
    CHECKCUDAERROR(cudaMalloc(&d2, size));

    cout << fixed << setprecision(6);
    cout << "cudaMalloc: \t\t" << realtime() - cudamalloc_time << " seconds." << endl;

    double cudamemcpy_d1 = realtime();

    // Transfer from host to device.
    CHECKCUDAERROR(cudaMemcpy(d1, inputImage, 3*size, cudaMemcpyHostToDevice));

    cout << fixed << setprecision(6);
    cout << "cudaMemcpy_host_to_device: \t\t" << realtime() - cudamemcpy_d1 << " seconds." << endl;

    double kernel_time = realtime();

    //define block and grid dimensions
    const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
    const dim3 dimBlock(16, 16);

    //execute cuda kernel
    rgb2gray<<<dimGrid, dimBlock>>>(d1, d2, width, height);
    CHECKCUDAERROR(cudaPeekAtLastError());
    cout << fixed << setprecision(6);
    cout << "kernel: \t\t" << realtime() - kernel_time << " seconds." << endl;

    double cudamemcpy_d2 = realtime();

    //copy computed gray data array from device to host
    CHECKCUDAERROR(cudaMemcpy(grayImage, d2, size, cudaMemcpyDeviceToHost));

    cout << fixed << setprecision(6);
    cout << "cudaMemcpy_device_to_host: \t\t" << realtime() - cudamemcpy_d2 << " seconds." << endl;



}

void smooth_pl(unsigned char *grayImage, unsigned char *smoothImage, const int width, const int height) {

    size_t size = width * height * sizeof(unsigned char);

    double malloc_time = realtime();

    CHECKCUDAERROR(cudaMalloc(&d3, size));

     cout << fixed << setprecision(6);
    cout << "triangular_smooth_malloc: \t\t" << realtime() - malloc_time << " seconds." << endl;

    double t_s_kernel_time = realtime();


    //execute cuda kernel
    const dim3 dimGrid((int)ceil(((width +16) /16)), (int)ceil(((height + 16) /16)));
    const dim3 dimBlock(16, 16);

    triangularSmooth<<<dimGrid, dimBlock>>>(d2, d3, width, height, filter);
    CHECKCUDAERROR(cudaPeekAtLastError());
    cout << fixed << setprecision(6);
    cout << "triangular_smooth_kernel: \t\t" << realtime() - t_s_kernel_time << " seconds." << endl;


    //copy computed smooth data array from device to host
    CHECKCUDAERROR(cudaMemcpy(smoothImage, d3, size, cudaMemcpyDeviceToHost));


    double cuda_free = realtime();

    CHECKCUDAERROR(cudaFree(d1));
    CHECKCUDAERROR(cudaFree(d2));
    CHECKCUDAERROR(cudaFree(d3));


    cout << fixed << setprecision(6);
    cout << "cudaFree: \t\t" << realtime() - cuda_free << " seconds." << endl;
}

int main(int argc, char *argv[]) 
{
    //NSTimer total = NSTimer("total", false, false);
        //double prev_time;

    if ( argc != 2 ) {
        cerr << "Usage: " << argv[0] << " <filename>" << endl;        
        cout << fixed << setprecision(6);
        return 1;
    }

    // Load the input image
    CImg< unsigned char > inputImage = CImg< unsigned char >(argv[1]);
    if ( displayImages ) {
        inputImage.display("Input Image");
    }
    if ( inputImage.spectrum() != 3 ) {
        //cerr << "The input must be a color image." << endl;
        //return 1;
    }
    double total_time = realtime();


    CImg<unsigned char> grayImage = CImg<unsigned char>(inputImage.width(), inputImage.height(), 1, 1);
    CImg< unsigned char > smoothImage = CImg< unsigned char >(grayImage.width(), grayImage.height(), 1, 1);

    rgb2gray_pl(inputImage.data(), grayImage.data(), inputImage.width(), inputImage.height());

        cout << fixed << setprecision(6);
    cout << "Total: \t\t" << realtime() - total_time << " seconds." << endl;
    grayImage.save("./grayscale.bmp");

    smooth_pl(grayImage.data(),smoothImage.data(), grayImage.width(),  grayImage.height());

    smoothImage.save("./smooth.bmp");


    //allocate and initialize memory on device

    return 0;
}

c++

cuda

asked on Stack Overflow May 18, 2020 by

FarhanIoTDeveloper • edited May 20, 2020 by

FarhanIoTDeveloper

0 Answers

Nobody has answered this question yet.

User contributions licensed under CC BY-SA 3.0