Numba CUDA: Changing xr.DataArray shape causes `Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR`

I'm having trouble with a Numba-based GPU kernel. It works on arrays of certain sizes (800 x 600) but fails on others (1200 x 600).

from math import atan, ceil

import numpy as np
import numba as nb
from numba import cuda
from xarray import DataArray

def has_cuda():
    """Check for supported CUDA device. If none found, return False"""
    local_cuda = False
    try:
        cuda.cudadrv.devices.gpus.current
        local_cuda = True
    except cuda.cudadrv.error.CudaSupportError:
        local_cuda = False

    return local_cuda


def cuda_args(shape):
    """
    Compute the blocks-per-grid and threads-per-block parameters for use when
    invoking cuda kernels
    Parameters
    ----------
    shape: int or tuple of ints
        The shape of the input array that the kernel will parallelize over
    Returns
    -------
    tuple
        Tuple of (blocks_per_grid, threads_per_block)
    """
    if isinstance(shape, int):
        shape = (shape,)

    max_threads = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    # Note: We divide max_threads by 2.0 to leave room for the registers
    threads_per_block = int(ceil(max_threads / 2.0) ** (1.0 / len(shape)))
    tpb = (threads_per_block,) * len(shape)
    bpg = tuple(int(ceil(d / threads_per_block)) for d in shape)
    return bpg, tpb


@cuda.jit(device=True)
def _gpu_slope(arr, cellsize_x, cellsize_y):
    a = arr[2, 0]
    b = arr[2, 1]
    c = arr[2, 2]
    d = arr[1, 0]
    f = arr[1, 2]
    g = arr[0, 0]
    h = arr[0, 1]
    i = arr[0, 2]

    dz_dx = ((c + 2 * f + i) - (a + 2 * d + g)) / (8 * cellsize_x[0])
    dz_dy = ((g + 2 * h + i) - (a + 2 * b + c)) / (nb.int8(8) * cellsize_y[0])
    p = (dz_dx * dz_dx + dz_dy * dz_dy) ** nb.float32(.5)
    return atan(p) * nb.float32(57.29578)


@cuda.jit
def _horn_slope_cuda(arr, cellsize_x_arr, cellsize_y_arr, out):
    i, j = cuda.grid(2)
    di = 1
    dj = 1
    if (i-di >= 0 and i+di < out.shape[0] and 
        j-dj >= 0 and j+dj < out.shape[1]):
        out[i, j] = _gpu_slope(arr[i-di:i+di+1, j-dj:j+dj+1],
                               cellsize_x_arr,
                               cellsize_y_arr)
    else:
        out[i, j] = np.nan


def slope(agg, name='slope', use_cuda=True, pad=True):
    """Returns slope of input aggregate in degrees.
    Parameters
    ----------
    agg : DataArray
    name : str - name property of output xr.DataArray
    use_cuda : bool - use CUDA device if available

    Returns
    -------
    data: DataArray

    Notes:
    ------
    Algorithm References:
     - http://desktop.arcgis.com/en/arcmap/10.3/tools/spatial-analyst-toolbox/how-slope-works.htm
     - Burrough, P. A., and McDonell, R. A., 1998.
      Principles of Geographical Information Systems
      (Oxford University Press, New York), pp 406
    """

    if not isinstance(agg, DataArray):
        raise TypeError("agg must be instance of DataArray")

    if not agg.attrs.get('res'):
        raise ValueError('input xarray must have `res` attr.')

    # get cellsize out from 'res' attribute
    cellsize = agg.attrs.get('res')
    if isinstance(cellsize, tuple) and len(cellsize) == 2 \
            and isinstance(cellsize[0], (int, float)) \
            and isinstance(cellsize[1], (int, float)):
        cellsize_x, cellsize_y = cellsize
    elif isinstance(cellsize, (int, float)):
        cellsize_x = cellsize
        cellsize_y = cellsize
    else:
        raise ValueError('`res` attr of input xarray must be a numeric'
                         ' or a tuple of numeric values.')
    
    if has_cuda() and use_cuda:
        cellsize_x_arr = np.array([float(cellsize_x)], dtype='f8')
        cellsize_y_arr = np.array([float(cellsize_y)], dtype='f8')

        if pad:
            pad_rows = 3 // 2
            pad_cols = 3 // 2
            pad_width = ((pad_rows, pad_rows),
                         (pad_cols, pad_cols))
        else:
            # If padding is not desired, set pads to 0
            pad_rows = 0
            pad_cols = 0
            pad_width = 0

        slope_data = np.pad(agg.data, pad_width=pad_width, mode="reflect")

        griddim, blockdim = cuda_args(slope_data.shape)
        print(griddim, blockdim)
        slope_agg = np.zeros(slope_data.shape, dtype='f4')
        _horn_slope_cuda[griddim, blockdim](slope_data,
                                            cellsize_x_arr,
                                            cellsize_y_arr,
                                            slope_agg)
        if pad:
            slope_agg = slope_agg[pad_rows:-pad_rows, pad_cols:-pad_cols]
    else:
        raise Exception('Cuda device not available')

    return DataArray(slope_agg,
                     name=name,
                     coords=agg.coords,
                     dims=agg.dims,
                     attrs=agg.attrs)

if __name__ == '__main__':

    W = 800  # fails when width set to 1200, works with 800
    H = 600 

    data = np.random.randint(0, 1000, (H, W))

    arr = DataArray(data,
                    name='example',
                    dims=['y', 'x'],
                    attrs=dict(res=1.0))

    risky_cuda = slope(arr, use_cuda=True)

When I run cuda-memcheck, I see the following error message:

========= Invalid __global__ write of size 4
=========     at 0x000002e0 in cudapy::__main__::_horn_slope_cuda$241(Array<int, int=2, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>)
=========     by thread (18,21,0) in block (27,1,0)
=========     Address 0x7011df5cc is out of bounds
=========     Device Frame:cudapy::__main__::_horn_slope_cuda$241(Array<int, int=2, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>) (cudapy::__main__::_horn_slope_cuda$241(Array<int, int=2, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>) : 0x2e0)
=========     Saved host backtrace up to driver entry point at kernel launch time

RUNTIME STACK TRACE:

(xarray-spatial) C:\Users\bcoll\xarray-spatial\xrspatial>python slope_example.py
(28, 55) (22, 22)
Traceback (most recent call last):
  File "slope_example.py", line 169, in <module>
    risky_cuda = slope(arr, use_cuda=True)
  File "slope_example.py", line 143, in slope
    slope_agg)
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\compiler.py", line 770, in __call__
    self.stream, self.sharedmem)
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\compiler.py", line 862, in call
    kernel.launch(args, griddim, blockdim, stream, sharedmem)
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\compiler.py", line 690, in launch
    wb()
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\args.py", line 65, in <lambda>
    retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\devices.py", line 224, in _require_cuda_context
    return fn(*args, **kws)
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\devicearray.py", line 264, in copy_to_host
    _driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\driver.py", line 2345, in device_to_host
    fn(host_pointer(dst), device_pointer(src), size, *varargs)
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\driver.py", line 302, in safe_cuda_api_call
    self._check_error(fname, retcode)
  File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\driver.py", line 337, in _check_error
    raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [700] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR
Tags: python, numpy, cuda, gpu, numba
asked on Stack Overflow Nov 4, 2020 by bcollins • edited Nov 4, 2020 by bcollins

1 Answer

Your kernel grid-sizing strategy, e.g.:

bpg = tuple(int(ceil(d / threads_per_block)) for d in shape)

can create threads that fall outside the data shape. There is nothing wrong with this "round-up" grid-sizing methodology; it is typical.
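
For example, with the failing 1200 x 600 input, the padded array is 602 x 1202, and the trace below prints griddim=(28, 55), blockdim=(22, 22). A quick sketch of that launch arithmetic (assuming MAX_THREADS_PER_BLOCK == 1024, which makes threads_per_block == 22):

# Launch arithmetic for the failing case; the shape and tpb values are
# assumed from the question's code and the printed "(28, 55) (22, 22)"
from math import ceil

shape = (602, 1202)  # the (600, 1200) input padded by 1 on every side
tpb = 22             # int(ceil(1024 / 2.0) ** 0.5)
bpg = tuple(int(ceil(d / tpb)) for d in shape)

print(bpg)                                 # (28, 55), matching the trace
print(tuple(b * tpb for b in bpg))         # (616, 1210) threads per axis
print(tuple(b * tpb - d for b, d in zip(bpg, shape)))  # (14, 8) excess

Every thread with i >= 602 or j >= 1202 falls outside the output array and must not touch memory.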

However, this round-up sizing strategy generally means that your kernel code must include an if-test that prevents out-of-range threads from performing out-of-range accesses.
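
In generic form (a minimal sketch, not the question's kernel), such a guard looks like this:

from numba import cuda

@cuda.jit
def guarded_kernel(out):
    i, j = cuda.grid(2)
    # Threads that land beyond the array bounds simply do nothing
    if i < out.shape[0] and j < out.shape[1]:
        out[i, j] = 0.0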

You have an if-test in your kernel that could be used for this:

if (i-di >= 0 and i+di < out.shape[0] and 
    j-dj >= 0 and j+dj < out.shape[1]):

but because the else clause still writes to the output array even when the thread is out of range:

else:
    out[i, j] = np.nan

you are observing out-of-bounds accesses via cuda-memcheck. Note that the "working" 800-wide case launches excess threads too (ceil(802/22) * 22 = 814 > 802); those writes are just as illegal, but they evidently do not happen to trigger a fault at that size, which is why the failure looks size-dependent.

A trivial solution in this case is to eliminate the else clause.
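
Removing the else leaves the one-cell border at whatever value slope_agg was initialized with (zeros here). If the NaN border is still wanted, one option (a sketch, not part of the original answer; it reuses the question's _gpu_slope and imports) is to nest the tests so that every write is bounds-checked first:

# Outer test keeps all writes in bounds; the inner test (the original
# one) chooses between the slope value and the NaN border
@cuda.jit
def _horn_slope_cuda(arr, cellsize_x_arr, cellsize_y_arr, out):
    i, j = cuda.grid(2)
    di = 1
    dj = 1
    if i < out.shape[0] and j < out.shape[1]:
        if (i-di >= 0 and i+di < out.shape[0] and
            j-dj >= 0 and j+dj < out.shape[1]):
            out[i, j] = _gpu_slope(arr[i-di:i+di+1, j-dj:j+dj+1],
                                   cellsize_x_arr,
                                   cellsize_y_arr)
        else:
            out[i, j] = np.nan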

answered on Stack Overflow Nov 7, 2020 by Robert Crovella

User contributions licensed under CC BY-SA 3.0