Invalid __local__ read when printf-ing built-in CUDA variables with dynamic parallelism under cuda-memcheck

I'm getting errors when trying to printf some built-in CUDA variables, for example threadIdx or blockIdx, from a __device__ function called from a nested kernel. The most surprising thing is that it occurs only when I compile with debug flags, and only when I run the code under cuda-memcheck; otherwise everything works as expected.
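Distilled, the pattern is a non-inlined __device__ function that reads threadIdx/blockIdx inside a printf and is called from a device-launched (child) kernel; roughly this shape (a sketch only, with illustrative names; the full reproducer follows):

__device__ void leaf()                  // stays non-inlined under -O0 -G
{
    printf("tid: %d\n", threadIdx.x);   // the built-in read that memcheck flags
}

__global__ void child()  { leaf(); }
__global__ void parent() { child<<<1, 32>>>(); }   // device-side (nested) launch

Here is the full test code to reproduce the error: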

#include <cstdio>
#include <assert.h>

static constexpr int POINTS_NUM     = 1 << 20;
static constexpr int DIM            = 2;

__host__ __device__ __forceinline__ cudaError_t Debug(
    cudaError_t     error,
    const char*     filename,
    int             line)
{
    if (error)
    {
    #if (__CUDA_ARCH__ == 0)    // host path (__CUDA_ARCH__ is undefined on the host pass)
        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n",
            error, filename, line, cudaGetErrorString(error));
        fflush(stderr);
    #elif (__CUDA_ARCH__ >= 200)    // device path (device-side printf needs cc 2.0+)
        printf("CUDA error %d [block (%3d,%3d,%3d) thread (%3d,%3d,%3d), %s, %d]\n",
            error, blockIdx.z, blockIdx.y, blockIdx.x,
            threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
    #endif
    }
    return error;
}

/**
 * @brief Macros for error checking.     
 */
#ifndef devCheckCall
    #define devCheckCall(e) if ( Debug((e), __FILE__, __LINE__) ) { assert(0); }
#endif

#ifndef checkCudaErrors
    #define checkCudaErrors(e) if ( Debug((e), __FILE__, __LINE__) ) { cudaDeviceReset(); exit(1); }
#endif

/**
 * Without __forceinline__, cuda-memcheck reports errors when this function
 * tries to print built-in variables such as threadIdx or blockIdx.
 */
__device__ /*__forceinline__*/ void foo(float val)
{
    float val2 = val + (threadIdx.x << 2) * 0.3f;
    printf("bid: %d tid: %d, val: %f, val2: %f\n",
        blockIdx.x, threadIdx.x, val, val2);
    // printf("val: %f, val2: %f\n",
    //     val, val2);
}

__global__ void kernelProxy2(
    float const * __restrict__  in,
    int                         pointNum,
    int                         tileId,
    int                         depth)
{
    if (depth == 3)
    {
        if (threadIdx.x == 0)
        {
            printf("[bid: %d, tid: %d] depth : %d, tileId: %d, pointNum: %d, offset: %d\n",
                blockIdx.x, threadIdx.x, depth, tileId, pointNum, pointNum * tileId);
        }
        foo(in[pointNum * tileId + threadIdx.x]);
        return;
    }

    if (threadIdx.x == 0 || threadIdx.x == 1)
    {
        int offset = POINTS_NUM >> depth;
        printf("bid: %d, tid: %d, depth: %d, offset %d, tileId: %d\n",
            blockIdx.x, threadIdx.x, depth, offset * tileId, tileId);

        cudaStream_t stream;
        devCheckCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

        kernelProxy2<<<1, 32, 0, stream>>>(
            in, offset, tileId * 2 + threadIdx.x, depth+1);
        devCheckCall(cudaPeekAtLastError());
        devCheckCall(cudaStreamDestroy(stream));
    }

}

__global__ void kernelProxy(
    float const * __restrict__  in,
    int                         depth)
{
    if (threadIdx.x == 0 || threadIdx.x == 1)
    {
        cudaStream_t stream;
        devCheckCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

        int offset = POINTS_NUM >> depth;
        kernelProxy2<<<1, 32, 0, stream>>>(
            in, offset, threadIdx.x, depth+1);

        devCheckCall(cudaPeekAtLastError());
        devCheckCall(cudaStreamDestroy(stream));
    }
}

int main(void)
{

    float *h_data = new float[POINTS_NUM * DIM];
    float *d_in;

    checkCudaErrors(cudaSetDevice(0));

    checkCudaErrors(cudaMalloc(&d_in, POINTS_NUM * DIM * sizeof(float)));

    for (int k = 0; k < POINTS_NUM; ++k)
    {
        for (int d = 0; d < DIM; ++d)
        {
            h_data[k * DIM + d] = k * 0.1f + d * 0.03f;
        }
    }

    checkCudaErrors(cudaMemcpy(d_in, h_data, POINTS_NUM * DIM * sizeof(float),
        cudaMemcpyHostToDevice));

    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 12));

    kernelProxy<<<1, 32>>>(d_in, 1);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // Release resources before resetting the device.
    checkCudaErrors(cudaFree(d_in));
    delete[] h_data;

    checkCudaErrors(cudaDeviceReset());

    return 0;
}

I compile it with the following command (for a debug build):

nvcc -gencode=arch=compute_35,code=\"sm_35,compute_35\"  -rdc=true -lcudadevrt -O0 -G -g -lineinfo -Xcompiler -rdynamic -Xcompiler -Wall -Xcompiler -Wextra -Xcudafe -#  -Xcompiler -ffloat-store -ccbin=/usr/bin/g++-4.9 -std=c++11 -m64 -o foo foo.cu

For a release build, just replace -O0 -G -g -lineinfo -Xcompiler -rdynamic with -O3. The code was tested on a modified Debian Linux system with kernel version 4.2.6, NVIDIA driver 352.21 and CUDA SDK 7.5.17, on a GeForce GTX TITAN card.
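Spelled out, the release command after that substitution is:

nvcc -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt -O3 -Xcompiler -Wall -Xcompiler -Wextra -Xcudafe -# -Xcompiler -ffloat-store -ccbin=/usr/bin/g++-4.9 -std=c++11 -m64 -o foo foo.cu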

I'm wondering: is this a bug, or am I missing something?

[EDIT]: The errors I'm getting are:

cuda-memcheck ./foo
========= CUDA-MEMCHECK
========= Invalid __local__ read of size 4
=========     at 0x000000f8 in /home/.../tests/foo.cu:47:foo(float)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x00fffdd4 is out of bounds

There are many lines like the one above, and at the end:

bid: 0, tid: 0, depth: 2, offset 0, tileId: 0
bid: 0, tid: 1, depth: 2, offset 0, tileId: 0
bid: 0, tid: 0, depth: 2, offset 262144, tileId: 1
bid: 0, tid: 1, depth: 2, offset 262144, tileId: 1
[bid: 0, tid: 0] depth : 3, tileId: 1, pointNum: 262144, offset: 262144
[bid: 0, tid: 0] depth : 3, tileId: 3, pointNum: 262144, offset: 786432
[bid: 0, tid: 0] depth : 3, tileId: 2, pointNum: 262144, offset: 524288
[bid: 0, tid: 0] depth : 3, tileId: 0, pointNum: 262144, offset: 0
CUDA error 4 [foo.cu, 130]: unspecified launch failure
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2f1ae3]
=========     Host Frame:./foo [0x3ba26]
=========     Host Frame:./foo (main + 0x42b) [0x3323]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20870]
=========     Host Frame:./foo (_start + 0x29) [0x2c89]
=========
========= ERROR SUMMARY: 129 errors

The correct behaviour would be for the messages to print without any errors.
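For what it's worth, as the comment above foo notes, uncommenting __forceinline__ makes the reports disappear here. An untested variation (a sketch only, not verified to change anything) is to read the built-ins in the kernel itself and pass them to the non-inlined function as plain arguments, so that foo never reads threadIdx/blockIdx directly:

__device__ void foo(float val, int bid, int tid)   // built-ins passed in explicitly
{
    float val2 = val + (tid << 2) * 0.3f;
    printf("bid: %d tid: %d, val: %f, val2: %f\n", bid, tid, val, val2);
}

// The call site in kernelProxy2 would then become:
// foo(in[pointNum * tileId + threadIdx.x], blockIdx.x, threadIdx.x);

To see which instruction sits at the reported PC 0x000000f8 in foo(float), dumping the SASS with cuobjdump -sass ./foo and searching for foo should work; under -G variables are kept in local memory, which is presumably where the flagged __local__ read comes from.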

Tags: c++, linux, cuda, dynamic-parallelism
asked on Stack Overflow Jun 2, 2016 by hurin • edited Jun 11, 2016 by hurin

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0