cudaMalloc and cudaMemcpy not working on kernel call


I have an array, already initialized, that I am trying to use in each thread of the kernel call (each thread uses a different part of the array, so there are no dependencies). I allocate memory for the array on the device with cudaMalloc and copy the array from host to device with cudaMemcpy.

I pass the pointer returned by cudaMalloc to the kernel call to be used by each thread.

int SIZE = 100;

int* data = new int[SIZE];
int* d_data = 0;

cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
for (int i = 0; i < SIZE; i++)
    data[i] = i;

cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );

This code was taken from here. The kernel is launched like this:

kernel<<<blocks, threads>>> (results, d_data);

I keep track of the results from each thread using the struct Result. The following kernel works without errors:

__global__ void mainKernel(Result res[], int* data){
   int x = data[0];
}

But when I assign that value to res:

__global__ void mainKernel(Result res[], int* data){
   int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;

   int x = data[0];

   res[threadId].x = x;
}

An error is raised:

cudaSafeCall() Runtime API error in file , line 355 : an illegal memory access was encountered.

The same error appears with any operation that uses that pointer:

__global__ void mainKernel(Result res[], int* data){
   int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;

   int x = data[0];

   if (x > 10)
      res[threadId].x = 5;
}

There is no problem with the definition of res. Assigning any other value to res[threadId].x does not give me any error.
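For instance, a kernel along these lines, which never reads from data, runs without error:

__global__ void mainKernel(Result res[], int* data){
   int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;

   // no read from data: writing a constant to res works fine
   res[threadId].x = 5;
}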

This is the output of running cuda-memcheck:

========= Invalid __global__ read of size 4
========= at 0x00000150 in mainKernel(Result*, int*)
========= by thread (86,0,0) in block (49,0,0)
========= Address 0x13024c0000 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x150d6d]
========= Host Frame:./out [0x2cc4b]
========= Host Frame:./out [0x46c23]
========= Host Frame:./out [0x3e37]
========= Host Frame:./out [0x3ca1]
========= Host Frame:./out [0x3cd6]
========= Host Frame:./out [0x39e9]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:./out [0x31b9]

EDIT:

This is an example of the full code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <iostream>
#include <assert.h>

typedef struct    
{
   int x,y,z;
} Result;

__global__ void mainKernel(Result  pResults[], int* dataimage)
{

   int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;

   int xVal = dataimage[0];
   if (xVal > 10)
       pResults[threadId].x = 5;

}

int main (int argc, char** argv)
{

   int NUM_THREADS = 5*5;

   int SIZE = 100;

   int* data = new int[SIZE];
   int* d_data = 0;

   cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
   for (int i = 0; i < SIZE; i++)
       data[i] = i;

   cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );

   unsigned int GPU_ID = 1;  // not actually :-)
   // unsigned int GPU_ID =  cutGetMaxGflopsDeviceId() ;
   cudaSetDevice(GPU_ID); 

   Result * results_GPU = 0;
   cutilSafeCall( cudaMalloc( &results_GPU,  NUM_THREADS * sizeof(Result)) );

   Result * results_CPU = 0;
   cutilSafeCall( cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(Result)) );

   mainKernel<<<5,5>>> ( results_GPU, d_data );

   cudaThreadSynchronize(); 

   cutilSafeCall( cudaMemcpy(results_CPU, results_GPU, NUM_THREADS * sizeof(Result),cudaMemcpyDeviceToHost) );

   cutilSafeCall(cudaFree(results_GPU));
   cutilSafeCall(cudaFreeHost(results_CPU));
   cudaThreadExit();

 } // ()
asked on Stack Overflow Feb 22, 2016 by Moreau23 • edited Jan 6, 2018 by tomix86

1 Answer


Your problem lies in this sequence of calls:

   cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
   for (int i = 0; i < SIZE; i++)
       data[i] = i;

   cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );

   unsigned int GPU_ID = 1;
   cudaSetDevice(GPU_ID); 

   Result * results_GPU = 0;
   cutilSafeCall( cudaMalloc( &results_GPU,  NUM_THREADS * sizeof(Result)) );

   Result * results_CPU = 0;
   cutilSafeCall( cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(Result)) );

   mainKernel<<<5,5>>> ( results_GPU, d_data );

What is effectively happening is that you are allocating d_data and running your kernel on different GPUs, and d_data is not valid on the GPU you are launching the kernel on.

In detail, because you call cudaMalloc for d_data before cudaSetDevice, you are allocating d_data on the default device, and then explicitly allocating results_GPU and running the kernel on device 1. Clearly device 1 and the default device are not the same GPU (device enumeration starts at 0 in the runtime API, so the default device is device 0).
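If you want to confirm this on your machine, cudaPointerGetAttributes can report which device an allocation belongs to. A minimal sketch (the helper name is just for illustration, and it assumes a platform with unified virtual addressing):

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative helper, not part of the original code: prints the device a
// pointer was allocated on, or the runtime error if the query fails.
void reportPointerDevice(const void* ptr, const char* name)
{
    cudaPointerAttributes attr;
    cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
    if (err == cudaSuccess)
        printf("%s was allocated on device %d\n", name, attr.device);
    else
        printf("%s: %s\n", name, cudaGetErrorString(err));
}

Calling something like reportPointerDevice(d_data, "d_data") after the cudaSetDevice call should show d_data sitting on the default device rather than on GPU_ID.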

If you change the code like this:

   unsigned int GPU_ID = 1;
   cutilSafeCall(cudaSetDevice(GPU_ID)); 

   cutilSafeCall( cudaMalloc(&d_data, SIZE * sizeof(int)) );
   for (int i = 0; i < SIZE; i++)
       data[i] = i;

   cutilSafeCall( cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice) );

   Result * results_GPU = 0;
   cutilSafeCall( cudaMalloc( &results_GPU,  NUM_THREADS * sizeof(Result)) );

   Result * results_CPU = 0;
   cutilSafeCall( cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(Result)) );

   mainKernel<<<5,5>>> ( results_GPU, d_data );

i.e. if you select the non-default device before any allocations are made, the problem should disappear. The reason this doesn't happen with your very simple kernel:

__global__ void mainKernel(Result res[], int* data){
   int x = data[0];
}

is simply that the CUDA compiler performs very aggressive optimisations by default, and because the result of the read of data[0] isn't actually used, the entire read can be optimised away, leaving you with an empty stub kernel which doesn't do anything. Only when the result of the load from memory is used in a memory write will the code not be optimised away during compilation. You can confirm this yourself by disassembling the code emitted by the compiler (for example with cuobjdump -sass), if you are curious.

Note that there are ways to make this work on multi-GPU systems which support it, via peer-to-peer access, but that must be explicitly configured in your code for that facility to be used, roughly as sketched below.
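For reference, a minimal sketch of that configuration, using cudaDeviceCanAccessPeer and cudaDeviceEnablePeerAccess (the device numbers are assumptions for illustration, and the calls fail on hardware or topologies that do not support peer access):

// Sketch only: allow kernels running on runDevice to dereference memory
// that was allocated on allocDevice.
int allocDevice = 0;   // device the allocation was made on (assumed)
int runDevice   = 1;   // device the kernel is launched on (assumed)

int canAccess = 0;
cudaDeviceCanAccessPeer(&canAccess, runDevice, allocDevice);

if (canAccess) {
    cudaSetDevice(runDevice);
    cudaDeviceEnablePeerAccess(allocDevice, 0);   // flags must be 0
    // Kernels launched on runDevice can now read allocations made on allocDevice.
}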

answered on Stack Overflow Feb 22, 2016 by talonmies • edited Jun 4, 2019 by talonmies

User contributions licensed under CC BY-SA 3.0