How to create a temporary 2D variable for cuda kernel

Question

How to create a temporary 2D variable for cuda kernel

My Ix and Iy declared in the CUDA global kernel will cause illegal memory access encounters due to unknown reasons. This is the code:

#include "opencv2/opencv.hpp"
#include "opencv2/highgui.hpp"
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>

// Cuda
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


#define CHECK_FINAL_RESULT
//#define CHECK_LOADING_DATA
using namespace std;
const int TSIZEX = 32;
const int TSIZEY = 256;
const int ft_size = 1;

// Mathematical algorithms
#define isl_min(x,y)        ((x) < (y) ? (x) : (y))         // compare value x is lesser than y, if correct use x, if wrong use y
#define isl_max(x,y)        ((x) > (y) ? (x) : (y))         // comapre value x is larger than y, if correct use y, if wrong use x

__device__ float cudafilter2sq(float A[16][34], float B[34][258], int i, int j);
__global__ void cudapipeline_harris(int  C, int  R, float* img, float* harris);

__device__ float cudafilter2sq(float A[34][258], float B[34][258], int i, int j) {

    return (A[i - 1][j - 1] * B[i - 1][j - 1] +
        A[i - 1][j] * B[i - 1][j] +
        A[i - 1][j + 1] * B[i - 1][j + 1] +
        A[i][j - 1] * B[i][j - 1] +
        A[i][j] * B[i][j] +
        A[i][j + 1] * B[i][j + 1] +
        A[i + 1][j - 1] * B[i + 1][j - 1] +
        A[i + 1][j] * B[i + 1][j] +
        A[i + 1][j + 1] * B[i + 1][j + 1]);
}



__global__ void cudapipeline_harris(int  C, int  R, float* img, float* harris) {

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;
    int idz = threadIdx.z + blockIdx.z * blockDim.z;

    float Ix[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];
    float Iy[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];

    for (int Ti = idx; Ti < (float)(R / TSIZEX); Ti += gridDim.x * blockDim.x)
    //if (Ti < (R / TSIZEX))
    {
        //for (int Tj = 0; Tj <= (float)(C / TSIZEY); Tj++)
        for (int Tj = idy; Tj < (float)(C/ TSIZEY); Tj += gridDim.y * blockDim.y)
        {
            int bot0, top0, right0, left0;
            int height, width;

            bot0 = isl_min(isl_max(Ti * TSIZEX, ft_size), R - ft_size);
            top0 = isl_min((Ti + 1) * TSIZEX, R - ft_size);
            left0 = isl_min(isl_max(Tj * TSIZEY, ft_size), C - ft_size);
            right0 = isl_min((Tj + 1) * TSIZEY, C - ft_size);

            width = right0 - left0;
            height = top0 - bot0;


            for (int i = bot0; i <= top0; i++)
            {
                for (int j = left0; j <= right0; j++)
                {
                    //printf("Ix : %d ", i - bot0);
                    Ix[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
                        img[(i + 1) * C + j - 1] * 0.0833333333333f +
                        img[(i + 1) * C + j] * 0.166666666667f +
                        img[(i - 1) * C + j] * -0.166666666667f +
                        img[(i - 1) * C + j + 1] * -0.0833333333333f +
                        img[(i + 1) * C + j + 1] * 0.0833333333333f;

                    Iy[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
                        img[(i - 1) * C + j + 1] * 0.0833333333333f +
                        img[i * C + j - 1] * -0.166666666667f +
                        img[i * C + j + 1] * 0.166666666667f +
                        img[(i + 1) * C + j - 1] * -0.0833333333333f +
                        img[(i + 1) * C + j + 1] * 0.0833333333333f;

                }
            }

           // for (int i = idy + bot0;i < (float)top0; i += gridDim.y * blockDim.y)
            for (int i = bot0; i < top0; i++)
            {
                for (int j = left0; j < right0; j++)
                {
                    int newI = i - bot0;
                    int newJ = j - left0;

                    harris[((i)*C + (j))] = cudafilter2sq(Ix, Ix, newI, newJ) * cudafilter2sq(Iy, Iy, newI, newJ) -
                        cudafilter2sq(Ix, Iy, newI, newJ) * cudafilter2sq(Ix, Iy, newI, newJ) -
                        (0.04f * (cudafilter2sq(Ix, Ix, newI, newJ) + cudafilter2sq(Iy, Iy, newI, newJ))) *
                        (cudafilter2sq(Ix, Ix, newI, newJ) + cudafilter2sq(Iy, Iy, newI, newJ));
                }
            }

        }
    }


}


int main(int argc, char** argv)
{
    int i, j, run;                // looping variables
    int R, C, nruns;              // height, width and number of loops runs
    double begin, end;            // each loop start time and end time
    double init, finish;          // total loop start time and end time
    double stime, avgt;           // time used and total avgt time
    cv::Mat image, loaded_data;
    cv::Scalar sc;
    cv::Size size;

    float* t_res;
    float* t_data;

    // Might be unused depending on preprocessor macro definitions
    (void)t_res;
    (void)t_data;
    (void)loaded_data;

    float* data;
    float* res;

    if (argc != 3)
    {
        printf("Does not set the NRuns and image needed\n");
        return -1;
    }

    image = cv::imread(argv[1], 1);   // read image from command line argument [1]

    if (!image.data)
    {
        printf("No image data ! Are you sure %s is an image ?\n", argv[1]);
        return -1;
    }

    // Convert image input to grayscale floating point
    cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
    size = image.size();
    C = size.width;
    R = size.height;

    printf("Values settings :\n");
    printf("-------------------\n");
    printf("Image Used : %s [%i, %i] \n", argv[1], R, C);

    res = (float*)calloc(R * C, sizeof(*res));

    if (res == NULL)
    {
        printf("Error while allocating result table of size %ld B\n",
            (sizeof(*res) * C * R));
        return -1;
    }

    data = (float*)malloc(R * C * sizeof(float));
    for (i = 0; i < R; i++) {
        for (j = 0; j < C; j++) {
            sc = image.at<uchar>(i, j);
            data[i * C + j] = (float)sc.val[0] / 255;
        }
    }

    // Parallel Running Test
    printf("\n\n-----------------------------------\n");
    printf("Cuda\n");
    printf("-----------------------------------\n");
    res = (float*)calloc(R * C, sizeof(*res));                // reset resources value

    dim3 grid(2,2,2);
    dim3 block(16,16,1);

    // Data required to pass to device
    float* img, * harris;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaMalloc((void**)&img, R * C * sizeof(*img));
    cudaMalloc((void**)&harris, R * C * sizeof(*harris));


    cudaMemcpy(img, data, C * R * sizeof(*data), cudaMemcpyHostToDevice);   // pass image value to the GPU

    cudaEventRecord(start);
    cudapipeline_harris << < grid, block >> > (C, R, img, harris);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    //cudapipeline_harris << < 1, 8 >> > (C, R, img, harris);

   cudaDeviceSynchronize();


    cudaMemcpy(res, harris, C * R * sizeof(*harris), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("CUDA ERROR : %s", cudaGetErrorString(err));
        exit(-1);
    }

    printf("Total time   :  \t %f ms\n", milliseconds);


#ifdef CHECK_FINAL_RESULT
    // Serial Show input
    cv::namedWindow("Input", cv::WINDOW_NORMAL);
    cv::imshow("Input", image);
    image.release();
    // Parallel Show output
    cv::Mat imres = cv::Mat(R, C, CV_32F, res);
    cv::namedWindow("Parallel Output", cv::WINDOW_NORMAL);
    cv::imshow("Parallel Output", imres * 65535.0);
    imres.release();
#endif

    cudaFree(harris);
    cudaFree(img);

    free(data);
    free(res);
    return 0;
}

This is the error shown:

CUDA ERROR : an illegal memory access was encountered

**CUDA ERROR : unspecified launch failure========= Invalid __global__ read of size 4
=========     at 0x000002d0 in C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:383:cudafilter2sq(float[258]*, float[258]*, int, int)
=========     by thread (15,1,0) in block (0,0,1)
=========     Address 0x2c6f5fee774 is out of bounds
=========     Device Frame:C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:453:cudapipeline_harris(int, int, float*, float*) (cudapipeline_harris(int, int, float*, float*) : 0x2130)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x81dcd]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x82167]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x8686e]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ba]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x176ea9]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0xe97c2) [0x307342]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x361bd]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x365e1]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x368c4]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuLaunchKernel + 0x234) [0x20d954]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8dba]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8c66]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll (cudaLaunchKernel + 0x1c4) [0x29024]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x1f) [0x516f]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__device_stub__Z19cudapipeline_harrisiiPfS_ + 0x22e) [0x4fbe]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (cudapipeline_harris + 0x41) [0x44c1]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x577) [0x4a47]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (invoke_main + 0x39) [0xfa79]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main_seh + 0x12e) [0xf95e]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main + 0xe) [0xf81e]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (mainCRTStartup + 0x9) [0xfb09]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17bd4]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6ce51]
=========**

multidimensional-array

cuda

global-variables

asked on Stack Overflow Sep 4, 2020 by

Jia Yih Ang • edited Sep 4, 2020 by

Robert Crovella

1 Answer

The debug process here is fairly straightforward. Your CUDA error output is pointing to an out-of-range access error in cudafilter2sq as indicated here:

Invalid global read of size 4 ========= at ...cuda/main.cu:383:cudafilter2sq
... Address ... is out of bounds

Looking at cudafilter2sq, ask yourself the question "how could one of those accesses be out of range?" Since that function is fairly simple, the answer is, "if one of the indexes ( computed from i or j) is out of range for A/Ix or B/Iy. Then you just test those computed indexes against the known possible ranges (0-33, 0-257).

It should be quite evident that cudafilter2sq requires an i value greater than 0, otherwise i-1 will index out of range. But you are not satisfying this requirement. Add:

 #include <assert.h>

and then add:

 assert(i > 0);

to the very beginning of cudafilter2sq. Then run your code with the memory checking feature enabled (as you are already doing). You will hit these device asserts, indicating you are indexing out-of-range. You have the same problem with j.

When I add the following code to the beginning of cudafilter2sq:

if (i < 1) i = 1;  if (j < 1) j = 1;

your code runs without error for me. It should be fairly evident that if your cudapipeline_harris kernel for-loop starts with:

 ...int i = bot0;...

then:

 int newI = i - bot0;

can produce a zero value for newI (and likewise for newJ). So this seems to be the "source" of the indexing problem. I assume you can fix it from here.

Also, note that your forward declaration for cudafilter2sq:

__device__ float cudafilter2sq(float A[16][34], float B[34][258], int i, int j);

doesn't match the definition

__device__ float cudafilter2sq(float A[34][258], float B[34][258], int i, int j)

answered on Stack Overflow Sep 4, 2020 by

Robert Crovella • edited Sep 4, 2020 by

Robert Crovella

User contributions licensed under CC BY-SA 3.0