I'm getting errors when trying to printf
some built-in CUDA variables, for example threadIdx
or blockIdx
, from a __device__
function called from a nested kernel. The most surprising thing is that it occurs only when I compile with debug flags and only when I run the code under cuda-memcheck. Otherwise things work as expected. Here is test code to reproduce the error:
#include <iostream>
#include <assert.h>
// Total number of input points (2^20) and number of components stored per point.
static constexpr int POINTS_NUM = 1 << 20;
static constexpr int DIM = 2;
/**
 * @brief Reports a non-success CUDA status code and passes it through unchanged.
 *
 * Callable from both host and device code. On the host the message goes to
 * stderr (including the runtime's error string); in device code (SM 2.0+,
 * where device printf is available) it is emitted via printf together with
 * the reporting block/thread coordinates.
 *
 * @param error    status code returned by a CUDA API call
 * @param filename source file of the failing call (pass __FILE__)
 * @param line     source line of the failing call (pass __LINE__)
 * @return the same status code, so the call composes inside a condition
 */
__host__ __device__ __forceinline__ cudaError_t Debug(
    cudaError_t error,
    const char* filename,
    int line)
{
    if (cudaSuccess != error)
    {
#if (__CUDA_ARCH__ == 0)
        // Host compilation pass: __CUDA_ARCH__ is undefined and the
        // preprocessor evaluates it as 0, so this branch is the host path.
        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n",
                error, filename, line, cudaGetErrorString(error));
        fflush(stderr);
#elif (__CUDA_ARCH__ >= 200)
        // Device path: device-side printf requires compute capability 2.0+.
        printf("CUDA error %d [block (%3d,%3d,%3d) thread (%3d,%3d,%3d), %s, %d]\n",
               error, blockIdx.z, blockIdx.y, blockIdx.x,
               threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
#endif
    }
    return error;
}
/**
 * @brief Error-checking macros.
 *
 * devCheckCall:    for use in device code — traps via assert(0) on failure.
 * checkCudaErrors: for use in host code — resets the device and exits.
 *
 * Both are wrapped in do { } while (0) so each expands to a single
 * statement. The original bare `if (...) { ... }` form has the classic
 * dangling-else defect: `if (cond) devCheckCall(e); else ...` would bind
 * the `else` to the macro's hidden `if` and change control flow.
 */
#ifndef devCheckCall
#define devCheckCall(e) do { if ( Debug((e), __FILE__, __LINE__) ) { assert(0); } } while (0)
#endif
#ifndef checkCudaErrors
#define checkCudaErrors(e) do { if ( Debug((e), __FILE__, __LINE__) ) { cudaDeviceReset(); exit(1); } } while (0)
#endif
/**
 * @brief Device helper: derives a second value from the input and prints
 *        both together with the calling thread's block/thread indices.
 *
 * NOTE(review): without __forceinline__, cuda-memcheck reported invalid
 * __local__ reads when the built-in variables (threadIdx, blockIdx) were
 * printed from here — this function is the repro point of the question.
 */
__device__ /*__forceinline__*/ void foo(float val)
{
    const float derived = val + (threadIdx.x << 2) * 0.3f;
    printf("bid: %d tid: %d, val: %f, val2: %f\n",
           blockIdx.x, threadIdx.x, val, derived);
}
/**
 * Recursive child kernel (CUDA dynamic parallelism).
 *
 * Expected launch: one block of 32 threads. Each level halves the tile
 * size (POINTS_NUM >> depth) and threads 0 and 1 each launch one child
 * grid on a private non-blocking stream; depth == 3 is the base case.
 *
 * @param in       device buffer of input values
 * @param pointNum number of points in this tile
 * @param tileId   index of this tile within the current level
 * @param depth    current recursion depth (terminates at 3)
 */
__global__ void kernelProxy2(
float const * __restrict__ in,
int pointNum,
int tileId,
int depth)
{
// Base case: every thread of the block reads one element of its tile
// (offset pointNum * tileId) and hands it to foo() for printing.
if (depth == 3)
{
if (threadIdx.x == 0)
{
printf("[bid: %d, tid: %d] depth : %d, tileId: %d, pointNum: %d, offset: %d\n",
blockIdx.x, threadIdx.x, depth, tileId, pointNum, pointNum * tileId);
}
foo(in[pointNum * tileId + threadIdx.x]);
return;
}
// Recursive case: lanes 0 and 1 each spawn one child grid. Child tileIds
// interleave as tileId*2 + lane, so tiles stay disjoint across branches.
if (threadIdx.x == 0 || threadIdx.x == 1)
{
int offset = POINTS_NUM >> depth;
printf("bid: %d, tid: %d, depth: %d, offset %d, tileId: %d\n",
blockIdx.x, threadIdx.x, depth, offset * tileId, tileId);
cudaStream_t stream;
// Each launching thread uses its own non-blocking stream so children
// do not serialize on the device-side NULL stream.
devCheckCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
kernelProxy2<<<1, 32, 0, stream>>>(
in, offset, tileId * 2 + threadIdx.x, depth+1);
// Device-side launches report configuration errors via peek-at-last-error.
devCheckCall(cudaPeekAtLastError());
// NOTE(review): destroying the stream immediately after launch is legal —
// queued child work still runs to completion — but no device-side sync is
// performed here; confirm this ordering is intended for the repro.
devCheckCall(cudaStreamDestroy(stream));
}
}
/**
 * @brief Top-level parent kernel: lanes 0 and 1 each launch one
 *        kernelProxy2 child grid on a private non-blocking stream.
 *
 * Expected launch: a single block of 32 threads.
 *
 * @param in    device buffer of input values
 * @param depth starting recursion depth for the children (depth+1)
 */
__global__ void kernelProxy(
float const * __restrict__ in,
int depth)
{
    // Only the first two lanes spawn children; all other threads exit.
    if (threadIdx.x > 1)
        return;

    const int offset = POINTS_NUM >> depth;

    cudaStream_t childStream;
    devCheckCall(cudaStreamCreateWithFlags(&childStream, cudaStreamNonBlocking));
    kernelProxy2<<<1, 32, 0, childStream>>>(in, offset, threadIdx.x, depth+1);
    devCheckCall(cudaPeekAtLastError());
    devCheckCall(cudaStreamDestroy(childStream));
}
/**
 * @brief Host driver: fills a host buffer, uploads it, raises the
 *        device-runtime sync-depth limit for the nested launches, and runs
 *        the parent kernel.
 *
 * Fixes over the original: the host buffer was leaked (new[] without
 * delete[]) and the device buffer was never cudaFree'd explicitly
 * (cudaDeviceReset reclaimed it only as a side effect).
 */
int main(void)
{
    float *h_data = new float[POINTS_NUM * DIM];
    float *d_in;
    checkCudaErrors(cudaSetDevice(0));
    checkCudaErrors(cudaMalloc(&d_in, POINTS_NUM * DIM * sizeof(float)));
    // Initialize host data: each value encodes its point index and dimension.
    for (int k = 0; k < POINTS_NUM; ++k)
    {
        for (int d = 0; d < DIM; ++d)
        {
            h_data[k * DIM + d] = k * 0.1f + d * 0.03f;
        }
    }
    checkCudaErrors(cudaMemcpy(d_in, h_data, POINTS_NUM * DIM * sizeof(float),
                               cudaMemcpyHostToDevice));
    delete[] h_data;  // fix: host buffer is no longer needed and was leaked
    // The nested launches synchronize up to 3 levels deep; raise the
    // device-runtime limit so device-side sync can nest that far.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 12));
    kernelProxy<<<1, 32>>>(d_in, 1);
    checkCudaErrors(cudaGetLastError());       // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());  // async execution errors
    checkCudaErrors(cudaFree(d_in));           // fix: free before resetting
    checkCudaErrors(cudaDeviceReset());
    return 0;
}
I compile it with following command (for debug):
nvcc -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt -O0 -G -g -lineinfo -Xcompiler -rdynamic -Xcompiler -Wall -Xcompiler -Wextra -Xcudafe -# -Xcompiler -ffloat-store -ccbin=/usr/bin/g++-4.9 -std=c++11 -m64 -o foo foo.cu
or for release just replace -O0 -G -g -lineinfo -Xcompiler -rdynamic
with only -O3
. The code was tested on a modified Debian Linux system with kernel version 4.2.6, Nvidia driver version 352.21, and CUDA SDK V7.5.17 with a GeForce GTX TITAN card.
I'm wondering: is this a bug, or am I missing something?
[EDIT]: The errors I'm getting are:
cuda-memcheck ./foo
========= CUDA-MEMCHECK
========= Invalid __local__ read of size 4
========= at 0x000000f8 in /home/.../tests/foo.cu:47:foo(float)
========= by thread (31,0,0) in block (0,0,0)
========= Address 0x00fffdd4 is out of bounds
There are many lines like above. And in the end:
bid: 0, tid: 0, depth: 2, offset 0, tileId: 0
bid: 0, tid: 1, depth: 2, offset 0, tileId: 0
bid: 0, tid: 0, depth: 2, offset 262144, tileId: 1
bid: 0, tid: 1, depth: 2, offset 262144, tileId: 1
[bid: 0, tid: 0] depth : 3, tileId: 1, pointNum: 262144, offset: 262144
[bid: 0, tid: 0] depth : 3, tileId: 3, pointNum: 262144, offset: 786432
[bid: 0, tid: 0] depth : 3, tileId: 2, pointNum: 262144, offset: 524288
[bid: 0, tid: 0] depth : 3, tileId: 0, pointNum: 262144, offset: 0
CUDA error 4 [foo.cu, 130]: unspecified launch failure
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2f1ae3]
========= Host Frame:./foo [0x3ba26]
========= Host Frame:./foo (main + 0x42b) [0x3323]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20870]
========= Host Frame:./foo (_start + 0x29) [0x2c89]
=========
========= ERROR SUMMARY: 129 errors
Correct behaviour is printing messages without any errors.
User contributions licensed under CC BY-SA 3.0