Is this a bug in CUDA? (illegal memory access was encountered)


I am using the following CUDA kernel:

__global__
void sum_worker(int *data, int *sum_ptr)
{
        __shared__ int block_sum;
        int idx = threadIdx.x;
        int thread_sum = 0;

        if (threadIdx.x == 0)
                block_sum = 2;

        for (int i = idx; i < MAX_INDEX; i += blockDim.x)
                thread_sum += data[i];

        __syncthreads();

        atomicAdd(&block_sum, thread_sum);

        __syncthreads();

        if (threadIdx.x == 0)
                *sum_ptr = block_sum;
}

It is launched using this code:

sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);

This works fine: there are no runtime errors and it produces the correct result. However, if I change i += blockDim.x to i += 32, I get an error the next time I call cudaDeviceSynchronize():

Cuda error 'an illegal memory access was encountered' in primes_gpu.cu at line 97
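
For reference, this is the entire change; with a <<<1, 32>>> launch, blockDim.x is 32, so both variants should touch exactly the same elements:

// works as expected:
for (int i = idx; i < MAX_INDEX; i += blockDim.x)
        thread_sum += data[i];

// triggers the illegal memory access:
for (int i = idx; i < MAX_INDEX; i += 32)
        thread_sum += data[i];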

Running the kernel with cuda-memcheck:

========= Invalid __global__ read of size 4
=========     at 0x00000108 in /home/clifford/Work/handicraft/2016/perfmeas/primes_gpu.cu:35:sum_worker(int*, int*)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x703b70d7c is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x472225]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 [0x146ad]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
=========     Host Frame:./perfmeas [0x17c7]
=========     Host Frame:./perfmeas [0x16b7]
=========     Host Frame:./perfmeas [0x16e2]
=========     Host Frame:./perfmeas [0x153f]
=========     Host Frame:./perfmeas [0xdcd]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
=========     Host Frame:./perfmeas [0xf39]
....

Address 0x703b70d7c is indeed out of bounds for data: The array starts at 0x703b40000 and has MAX_INDEX elements. MAX_INDEX is 50000 in this test. (0x703b70d7c - 0x703b40000) / 4 = 50015.

Adding an additional check for i >= MAX_INDEX makes the problem magically go away:

    for (int i = idx; i < MAX_INDEX; i += 32) {
            if (i >= MAX_INDEX)
                    printf("WTF!\n");
            thread_sum += data[i];
    }

Is this a bug in CUDA or am I doing something stupid here?

I'm using CUDA 7.5 on Ubuntu 16.04. Output of nvcc --version:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17

The complete source code for this test case can be found here:
http://svn.clifford.at/handicraft/2016/perfmeas

(Run with options -gx. This version uses i += blockDim.x; change that to i += 32 to reproduce the issue.)


Edit: @njuffa said in the comments that he doesn't want to follow links off Stack Overflow because he is "too scared [his] computer might catch something" and would prefer a test case that he can copy & paste from Stack Overflow directly. So here goes:

#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <math.h>

#define MAX_PRIMES 100000
#define MAX_INDEX (MAX_PRIMES/2)

__global__
void primes_worker(int *data)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= MAX_INDEX)
        return;

    int p = 2*idx+1;
    for (int i = 3; i*i <= p; i += 2) {
        if (p % i == 0) {
            data[idx] = 0;
            return;
        }
    }

    data[idx] = idx ? p : 0;
}

__global__
void sum_worker(int *data, int *sum_ptr)
{
    __shared__ int block_sum;
    int idx = threadIdx.x;
    int thread_sum = 0;

    if (threadIdx.x == 0)
        block_sum = 2;

#ifdef ENABLE_BUG
    for (int i = idx; i < MAX_INDEX; i += 32)
        thread_sum += data[i];
#else
    for (int i = idx; i < MAX_INDEX; i += blockDim.x)
        thread_sum += data[i];
#endif

    __syncthreads();

    atomicAdd(&block_sum, thread_sum);

    __syncthreads();

    if (threadIdx.x == 0)
        *sum_ptr = block_sum;
}

int *primes_or_zeros;
int *sum_buffer;

void primes_gpu_init()
{
    cudaError_t err;

    err = cudaMalloc((void**)&primes_or_zeros, sizeof(int)*MAX_INDEX);

    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);

    err = cudaMallocHost((void**)&sum_buffer, sizeof(int));

    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}

void primes_gpu_done()
{
    cudaError_t err;

    err = cudaFree(primes_or_zeros);

    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);

    err = cudaFreeHost(sum_buffer);

    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}

int primes_gpu()
{
    int num_blocks = (MAX_INDEX + 31) / 32;
    int num_threads = 32;

    primes_worker<<<num_blocks, num_threads>>>(primes_or_zeros);
    sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);
    cudaError_t err = cudaDeviceSynchronize();

    if (err != cudaSuccess)
        printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);

    return *sum_buffer;
}

int main()
{
    primes_gpu_init();

    int result = primes_gpu();
    printf("Result: %d\n", result);

    if (result != 454396537) {
        printf("Incorrect result!\n");
        return 1;
    }

    primes_gpu_done();
    return 0;
}

Usage:

$ nvcc -o demo demo.cu 
$ ./demo 
Result: 454396537

$ nvcc -D ENABLE_BUG -o demo demo.cu 
$ ./demo 
Cuda error 'an illegal memory access was encountered' in demo.cu at line 99
Result: 0
Incorrect result!
asked on Stack Overflow Sep 21, 2016 by CliffordVienna • edited Sep 21, 2016 by CliffordVienna

1 Answer


TL;DR: The observed behavior is very likely caused by a bug in the ptxas component of the CUDA 7.5 toolchain, specifically the loop unroller. It is possible that the bug is already fixed in CUDA 8.0 RC, which is publicly available.

I was able to reproduce the behavior reported in the question on a 64-bit Windows 7 platform with a Quadro K2200 GPU, which is an sm_50 device. The primary difference in the generated machine code (SASS) with ENABLE_BUG defined is that the loop is unrolled by a factor of four. This is a direct consequence of the loop increment being changed from a variable, blockDim.x, to a compile-time constant, 32, which allows the compiler to compute the trip count at compile time.
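
This is consistent with the faulting address from the question: the last in-bounds index for thread (31,0,0) is 49983, and 49983 + 32 = 50015, exactly the element cuda-memcheck flags. Purely as an illustration of how a mis-unrolled loop can read past the end of the array (this is a hand-written sketch, not the code ptxas actually emits), consider a four-way unrolled body whose exit condition does not account for the trailing partial iterations:

// Hypothetical sketch only -- NOT the actual ptxas output.
// Four-way unrolling with a guard that ignores the loop remainder:
for (int i = idx; i < MAX_INDEX; i += 4 * 32) {
    thread_sum += data[i];        // last trip of thread 31: i = 49951, in bounds
    thread_sum += data[i + 32];   // 49983, still in bounds
    thread_sum += data[i + 64];   // 50015, out of bounds -- matches the reported address
    thread_sum += data[i + 96];   // 50047, out of bounds
}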

It is interesting to note that at the intermediate PTX level, the loop is rolled even with increment of 32:

BB7_4:
ld.global.u32 %r12, [%rd10];
add.s32 %r16, %r12, %r16;
add.s64 %rd10, %rd10, 128;
add.s32 %r15, %r15, 32;
setp.lt.s32     %p3, %r15, 50000;
@%p3 bra BB7_4;

Since the loop is unrolled in the machine code but still rolled at the PTX level, it must be the ptxas unroller that applies this transformation.
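
For anyone who wants to check this on their own toolchain, the PTX and the SASS can be inspected with the standard tools, roughly like this (assuming the copy & paste test case above is saved as demo.cu):

# PTX as emitted by the nvvm front end
nvcc -arch=sm_50 -ptx demo.cu -o demo.ptx

# SASS as emitted by ptxas, extracted from the built executable
nvcc -arch=sm_50 -o demo demo.cu
cuobjdump --dump-sass demo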

If I lower the ptxas optimization level to -O1 by specifying -Xptxas -O1 on the nvcc command line, the code works as expected. If I build the code for sm_30 (causing JIT compilation when running on an sm_50 device), the code also works as expected when run with the latest Windows driver, 369.26. This strongly suggests a bug in the loop unroller of CUDA 7.5's ptxas that has since been fixed: the ptxas component inside the CUDA driver is much more recent than the one in the CUDA 7.5 toolchain.
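
The two workarounds described above correspond to nvcc invocations along these lines (again assuming the test case is saved as demo.cu):

# Workaround 1: lower the ptxas optimization level
nvcc -D ENABLE_BUG -Xptxas -O1 -o demo demo.cu

# Workaround 2: build for sm_30, so the (much newer) ptxas inside the
# driver JIT-compiles the embedded PTX when the binary runs on an sm_50 device
nvcc -D ENABLE_BUG -arch=sm_30 -o demo demo.cu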

Placing a #pragma unroll 4 directly in front of the loop also fixes the problem, since in this case the unrolling is performed by the nvvm component of the compiler, meaning the unrolled loop is already present at the PTX level:

#if ENABLE_BUG
#pragma unroll 4
    for (int i = idx; i < MAX_INDEX; i += 32)
        thread_sum += data[i];
#else

Resulting PTX (note the .pragma "nounroll" directive, which tells ptxas not to unroll this loop a second time):

BB7_5:
.pragma "nounroll";
ld.global.u32 %r34, [%rd14];
add.s32 %r35, %r34, %r45;
ld.global.u32 %r36, [%rd14+128];
add.s32 %r37, %r36, %r35;
ld.global.u32 %r38, [%rd14+256];
add.s32 %r39, %r38, %r37;
ld.global.u32 %r40, [%rd14+384];
add.s32 %r45, %r40, %r39;
add.s64 %rd14, %rd14, 512;
add.s32 %r44, %r44, 128;
setp.lt.s32     %p5, %r44, %r3;
@%p5 bra BB7_5;
answered on Stack Overflow Sep 21, 2016 by njuffa • edited Sep 21, 2016 by njuffa

User contributions licensed under CC BY-SA 3.0