My Ix
and Iy
declared in the CUDA global kernel will cause illegal memory access encounters due to unknown reasons. This is the code:
#include "opencv2/opencv.hpp"
#include "opencv2/highgui.hpp"
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
// Cuda
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CHECK_FINAL_RESULT
//#define CHECK_LOADING_DATA
using namespace std;
const int TSIZEX = 32;
const int TSIZEY = 256;
const int ft_size = 1;
// Mathematical algorithms
#define isl_min(x,y) ((x) < (y) ? (x) : (y)) // compare value x is lesser than y, if correct use x, if wrong use y
#define isl_max(x,y) ((x) > (y) ? (x) : (y)) // comapre value x is larger than y, if correct use y, if wrong use x
__device__ float cudafilter2sq(float A[16][34], float B[34][258], int i, int j);
__global__ void cudapipeline_harris(int C, int R, float* img, float* harris);
__device__ float cudafilter2sq(float A[34][258], float B[34][258], int i, int j) {
return (A[i - 1][j - 1] * B[i - 1][j - 1] +
A[i - 1][j] * B[i - 1][j] +
A[i - 1][j + 1] * B[i - 1][j + 1] +
A[i][j - 1] * B[i][j - 1] +
A[i][j] * B[i][j] +
A[i][j + 1] * B[i][j + 1] +
A[i + 1][j - 1] * B[i + 1][j - 1] +
A[i + 1][j] * B[i + 1][j] +
A[i + 1][j + 1] * B[i + 1][j + 1]);
}
__global__ void cudapipeline_harris(int C, int R, float* img, float* harris) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
int idz = threadIdx.z + blockIdx.z * blockDim.z;
float Ix[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];
float Iy[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];
for (int Ti = idx; Ti < (float)(R / TSIZEX); Ti += gridDim.x * blockDim.x)
//if (Ti < (R / TSIZEX))
{
//for (int Tj = 0; Tj <= (float)(C / TSIZEY); Tj++)
for (int Tj = idy; Tj < (float)(C/ TSIZEY); Tj += gridDim.y * blockDim.y)
{
int bot0, top0, right0, left0;
int height, width;
bot0 = isl_min(isl_max(Ti * TSIZEX, ft_size), R - ft_size);
top0 = isl_min((Ti + 1) * TSIZEX, R - ft_size);
left0 = isl_min(isl_max(Tj * TSIZEY, ft_size), C - ft_size);
right0 = isl_min((Tj + 1) * TSIZEY, C - ft_size);
width = right0 - left0;
height = top0 - bot0;
for (int i = bot0; i <= top0; i++)
{
for (int j = left0; j <= right0; j++)
{
//printf("Ix : %d ", i - bot0);
Ix[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
img[(i + 1) * C + j - 1] * 0.0833333333333f +
img[(i + 1) * C + j] * 0.166666666667f +
img[(i - 1) * C + j] * -0.166666666667f +
img[(i - 1) * C + j + 1] * -0.0833333333333f +
img[(i + 1) * C + j + 1] * 0.0833333333333f;
Iy[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
img[(i - 1) * C + j + 1] * 0.0833333333333f +
img[i * C + j - 1] * -0.166666666667f +
img[i * C + j + 1] * 0.166666666667f +
img[(i + 1) * C + j - 1] * -0.0833333333333f +
img[(i + 1) * C + j + 1] * 0.0833333333333f;
}
}
// for (int i = idy + bot0;i < (float)top0; i += gridDim.y * blockDim.y)
for (int i = bot0; i < top0; i++)
{
for (int j = left0; j < right0; j++)
{
int newI = i - bot0;
int newJ = j - left0;
harris[((i)*C + (j))] = cudafilter2sq(Ix, Ix, newI, newJ) * cudafilter2sq(Iy, Iy, newI, newJ) -
cudafilter2sq(Ix, Iy, newI, newJ) * cudafilter2sq(Ix, Iy, newI, newJ) -
(0.04f * (cudafilter2sq(Ix, Ix, newI, newJ) + cudafilter2sq(Iy, Iy, newI, newJ))) *
(cudafilter2sq(Ix, Ix, newI, newJ) + cudafilter2sq(Iy, Iy, newI, newJ));
}
}
}
}
}
int main(int argc, char** argv)
{
int i, j, run; // looping variables
int R, C, nruns; // height, width and number of loops runs
double begin, end; // each loop start time and end time
double init, finish; // total loop start time and end time
double stime, avgt; // time used and total avgt time
cv::Mat image, loaded_data;
cv::Scalar sc;
cv::Size size;
float* t_res;
float* t_data;
// Might be unused depending on preprocessor macro definitions
(void)t_res;
(void)t_data;
(void)loaded_data;
float* data;
float* res;
if (argc != 3)
{
printf("Does not set the NRuns and image needed\n");
return -1;
}
image = cv::imread(argv[1], 1); // read image from command line argument [1]
if (!image.data)
{
printf("No image data ! Are you sure %s is an image ?\n", argv[1]);
return -1;
}
// Convert image input to grayscale floating point
cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
size = image.size();
C = size.width;
R = size.height;
printf("Values settings :\n");
printf("-------------------\n");
printf("Image Used : %s [%i, %i] \n", argv[1], R, C);
res = (float*)calloc(R * C, sizeof(*res));
if (res == NULL)
{
printf("Error while allocating result table of size %ld B\n",
(sizeof(*res) * C * R));
return -1;
}
data = (float*)malloc(R * C * sizeof(float));
for (i = 0; i < R; i++) {
for (j = 0; j < C; j++) {
sc = image.at<uchar>(i, j);
data[i * C + j] = (float)sc.val[0] / 255;
}
}
// Parallel Running Test
printf("\n\n-----------------------------------\n");
printf("Cuda\n");
printf("-----------------------------------\n");
res = (float*)calloc(R * C, sizeof(*res)); // reset resources value
dim3 grid(2,2,2);
dim3 block(16,16,1);
// Data required to pass to device
float* img, * harris;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaMalloc((void**)&img, R * C * sizeof(*img));
cudaMalloc((void**)&harris, R * C * sizeof(*harris));
cudaMemcpy(img, data, C * R * sizeof(*data), cudaMemcpyHostToDevice); // pass image value to the GPU
cudaEventRecord(start);
cudapipeline_harris << < grid, block >> > (C, R, img, harris);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
//cudapipeline_harris << < 1, 8 >> > (C, R, img, harris);
cudaDeviceSynchronize();
cudaMemcpy(res, harris, C * R * sizeof(*harris), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("CUDA ERROR : %s", cudaGetErrorString(err));
exit(-1);
}
printf("Total time : \t %f ms\n", milliseconds);
#ifdef CHECK_FINAL_RESULT
// Serial Show input
cv::namedWindow("Input", cv::WINDOW_NORMAL);
cv::imshow("Input", image);
image.release();
// Parallel Show output
cv::Mat imres = cv::Mat(R, C, CV_32F, res);
cv::namedWindow("Parallel Output", cv::WINDOW_NORMAL);
cv::imshow("Parallel Output", imres * 65535.0);
imres.release();
#endif
cudaFree(harris);
cudaFree(img);
free(data);
free(res);
return 0;
}
This is the error shown:
CUDA ERROR : an illegal memory access was encountered
**CUDA ERROR : unspecified launch failure========= Invalid __global__ read of size 4
========= at 0x000002d0 in C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:383:cudafilter2sq(float[258]*, float[258]*, int, int)
========= by thread (15,1,0) in block (0,0,1)
========= Address 0x2c6f5fee774 is out of bounds
========= Device Frame:C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:453:cudapipeline_harris(int, int, float*, float*) (cudapipeline_harris(int, int, float*, float*) : 0x2130)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x81dcd]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x82167]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x8686e]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ba]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x176ea9]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0xe97c2) [0x307342]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x361bd]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x365e1]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x368c4]
========= Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuLaunchKernel + 0x234) [0x20d954]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8dba]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8c66]
========= Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll (cudaLaunchKernel + 0x1c4) [0x29024]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x1f) [0x516f]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__device_stub__Z19cudapipeline_harrisiiPfS_ + 0x22e) [0x4fbe]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (cudapipeline_harris + 0x41) [0x44c1]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x577) [0x4a47]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (invoke_main + 0x39) [0xfa79]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main_seh + 0x12e) [0xf95e]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main + 0xe) [0xf81e]
========= Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (mainCRTStartup + 0x9) [0xfb09]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17bd4]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6ce51]
=========**
The debug process here is fairly straightforward. Your CUDA error output is pointing to an out-of-range access error in cudafilter2sq
as indicated here:
Invalid global read of size 4 ========= at ...cuda/main.cu:383:cudafilter2sq
... Address ... is out of bounds
Looking at cudafilter2sq
, ask yourself the question "how could one of those accesses be out of range?" Since that function is fairly simple, the answer is, "if one of the indexes ( computed from i
or j
) is out of range for A
/Ix
or B
/Iy
. Then you just test those computed indexes against the known possible ranges (0-33, 0-257).
It should be quite evident that cudafilter2sq
requires an i
value greater than 0, otherwise i-1
will index out of range. But you are not satisfying this requirement. Add:
#include <assert.h>
and then add:
assert(i > 0);
to the very beginning of cudafilter2sq
. Then run your code with the memory checking feature enabled (as you are already doing). You will hit these device asserts, indicating you are indexing out-of-range. You have the same problem with j
.
When I add the following code to the beginning of cudafilter2sq
:
if (i < 1) i = 1; if (j < 1) j = 1;
your code runs without error for me. It should be fairly evident that if your cudapipeline_harris
kernel for-loop starts with:
...int i = bot0;...
then:
int newI = i - bot0;
can produce a zero value for newI
(and likewise for newJ
). So this seems to be the "source" of the indexing problem. I assume you can fix it from here.
Also, note that your forward declaration for cudafilter2sq
:
__device__ float cudafilter2sq(float A[16][34], float B[34][258], int i, int j);
doesn't match the definition
__device__ float cudafilter2sq(float A[34][258], float B[34][258], int i, int j)
User contributions licensed under CC BY-SA 3.0