I'm having trouble with a Numba-based GPU kernel. Its works on certain size arrays (800 x 600), but fails on other sizes (1200 x 600).
from math import atan
import numpy as np
import numba as nb
from numba import cuda
from xarray import DataArray
from math import ceil
def has_cuda():
"""Check for supported CUDA device. If none found, return False"""
local_cuda = False
try:
cuda.cudadrv.devices.gpus.current
local_cuda = True
except cuda.cudadrv.error.CudaSupportError:
local_cuda = False
return local_cuda
def cuda_args(shape):
"""
Compute the blocks-per-grid and threads-per-block parameters for use when
invoking cuda kernels
Parameters
----------
shape: int or tuple of ints
The shape of the input array that the kernel will parallelize over
Returns
-------
tuple
Tuple of (blocks_per_grid, threads_per_block)
"""
if isinstance(shape, int):
shape = (shape,)
max_threads = cuda.get_current_device().MAX_THREADS_PER_BLOCK
# Note: We divide max_threads by 2.0 to leave room for the registers
threads_per_block = int(ceil(max_threads / 2.0) ** (1.0 / len(shape)))
tpb = (threads_per_block,) * len(shape)
bpg = tuple(int(ceil(d / threads_per_block)) for d in shape)
return bpg, tpb
@cuda.jit(device=True)
def _gpu_slope(arr, cellsize_x, cellsize_y):
a = arr[2, 0]
b = arr[2, 1]
c = arr[2, 2]
d = arr[1, 0]
f = arr[1, 2]
g = arr[0, 0]
h = arr[0, 1]
i = arr[0, 2]
dz_dx = ((c + 2 * f + i) - (a + 2 * d + g)) / (8 * cellsize_x[0])
dz_dy = ((g + 2 * h + i) - (a + 2 * b + c)) / (nb.int8(8) * cellsize_y[0])
p = (dz_dx * dz_dx + dz_dy * dz_dy) ** nb.float32(.5)
return atan(p) * nb.float32(57.29578)
@cuda.jit
def _horn_slope_cuda(arr, cellsize_x_arr, cellsize_y_arr, out):
i, j = cuda.grid(2)
di = 1
dj = 1
if (i-di >= 0 and i+di < out.shape[0] and
j-dj >= 0 and j+dj < out.shape[1]):
out[i, j] = _gpu_slope(arr[i-di:i+di+1, j-dj:j+dj+1],
cellsize_x_arr,
cellsize_y_arr)
else:
out[i, j] = np.nan
def slope(agg, name='slope', use_cuda=True, pad=True):
"""Returns slope of input aggregate in degrees.
Parameters
----------
agg : DataArray
name : str - name property of output xr.DataArray
use_cuda : bool - use CUDA device if available
Returns
-------
data: DataArray
Notes:
------
Algorithm References:
- http://desktop.arcgis.com/en/arcmap/10.3/tools/spatial-analyst-toolbox/how-slope-works.htm
- Burrough, P. A., and McDonell, R. A., 1998.
Principles of Geographical Information Systems
(Oxford University Press, New York), pp 406
"""
if not isinstance(agg, DataArray):
raise TypeError("agg must be instance of DataArray")
if not agg.attrs.get('res'):
raise ValueError('input xarray must have `res` attr.')
# get cellsize out from 'res' attribute
cellsize = agg.attrs.get('res')
if isinstance(cellsize, tuple) and len(cellsize) == 2 \
and isinstance(cellsize[0], (int, float)) \
and isinstance(cellsize[1], (int, float)):
cellsize_x, cellsize_y = cellsize
elif isinstance(cellsize, (int, float)):
cellsize_x = cellsize
cellsize_y = cellsize
else:
raise ValueError('`res` attr of input xarray must be a numeric'
' or a tuple of numeric values.')
if has_cuda() and use_cuda:
cellsize_x_arr = np.array([float(cellsize_x)], dtype='f8')
cellsize_y_arr = np.array([float(cellsize_y)], dtype='f8')
if pad:
pad_rows = 3 // 2
pad_cols = 3 // 2
pad_width = ((pad_rows, pad_rows),
(pad_cols, pad_cols))
else:
# If padding is not desired, set pads to 0
pad_rows = 0
pad_cols = 0
pad_width = 0
slope_data = np.pad(agg.data, pad_width=pad_width, mode="reflect")
griddim, blockdim = cuda_args(slope_data.shape)
print(griddim, blockdim)
slope_agg = np.zeros(slope_data.shape, dtype='f4')
_horn_slope_cuda[griddim, blockdim](slope_data,
cellsize_x_arr,
cellsize_y_arr,
slope_agg)
if pad:
slope_agg = slope_agg[pad_rows:-pad_rows, pad_cols:-pad_cols]
else:
raise Exception('Cuda device not available')
return DataArray(slope_agg,
name=name,
coords=agg.coords,
dims=agg.dims,
attrs=agg.attrs)
if __name__ == '__main__':
import numpy as np
import pandas as pd
W = 800 # fails when width set to 1200, works with 800
H = 600
data = np.random.randint(0., 1000.,(H,W))
arr = DataArray(data,
name='example',
dims=['y', 'x'],
attrs=dict(res=1.0))
risky_cuda = slope(arr, use_cuda=True)
When I run cuda-memcheck
, I see the following error message:
========= Invalid __global__ write of size 4
========= at 0x000002e0 in cudapy::__main__::_horn_slope_cuda$241(Array<int, int=2, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>)
========= by thread (18,21,0) in block (27,1,0)
========= Address 0x7011df5cc is out of bounds
========= Device Frame:cudapy::__main__::_horn_slope_cuda$241(Array<int, int=2, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>) (cudapy::__main__::_horn_slope_cuda$241(Array<int, int=2, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<double, int=1, C, mutable, aligned>, Array<float, int=2, C, mutable, aligned>) : 0x2e0)
========= Saved host backtrace up to driver entry point at kernel launch time
RUNTIME STACK TRACE:
(xarray-spatial) C:\Users\bcoll\xarray-spatial\xrspatial>python slope_example.py
(28, 55) (22, 22)
Traceback (most recent call last):
File "slope_example.py", line 169, in <module>
risky_cuda = slope(arr, use_cuda=True)
File "slope_example.py", line 143, in slope
slope_agg)
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\compiler.py", line 770, in __call__
self.stream, self.sharedmem)
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\compiler.py", line 862, in call
kernel.launch(args, griddim, blockdim, stream, sharedmem)
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\compiler.py", line 690, in launch
wb()
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\args.py", line 65, in <lambda>
retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\devices.py", line 224, in _require_cuda_context
return fn(*args, **kws)
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\devicearray.py", line 264, in copy_to_host
_driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\driver.py", line 2345, in device_to_host
fn(host_pointer(dst), device_pointer(src), size, *varargs)
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\driver.py", line 302, in safe_cuda_api_call
self._check_error(fname, retcode)
File "C:\Users\bcoll\Anaconda3\envs\xarray-spatial\lib\site-packages\numba\cuda\cudadrv\driver.py", line 337, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [700] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR
Your kernel grid sizing strategy, eg.:
bpg = tuple(int(ceil(d / threads_per_block)) for d in shape)
creates the possibility for the creation of threads that are outside of the data shape. There is nothing wrong with this "rounding-up" grid-sizing methodology; it is typical.
However this rounding-up sizing strategy means generally that your kernel code must have an if-test that prevents out-of-range threads from performing out-of-range accesses.
You have an if-test in your kernel that could be used for this:
if (i-di >= 0 and i+di < out.shape[0] and
j-dj >= 0 and j+dj < out.shape[1]):
but because you are still writing to the data array even if an out-of-bounds event occurs:
else:
out[i, j] = np.nan
you are observing out-of-bounds accesses via cuda-memcheck
.
A trivial solution in this case is to eliminate the else
clause.
User contributions licensed under CC BY-SA 3.0