Max reduce in CUDA Fortran

1

I am trying to perform reduction in CUDA Fortran; what I did so far is something like that, performing the reduction in two steps (see the CUDA kernels below).

In the first kernel I am doing some simple computation and I declare a shared array for a block of threads to store the value of abs(a - anew); once the threads are synchronized, I compute the max value of this shared array, that I store in an intermediate array of dimension gridDim%x * gridDim%y.

In the second kernel, I am reading this array (in a single block of threads) and try to compute the max value of it.

Here is the whole code:

module commons
   integer, parameter :: dp=kind(1.d0)
   integer, parameter :: nx=1024, ny=1024
   integer, parameter :: block_dimx=16, block_dimy=32
end module commons

module kernels
  use commons
contains
  attributes(global) subroutine kernel_gpu_reduce(a, anew, error, nxi, nyi)
    implicit none

    integer, value, intent(in) :: nxi, nyi
    real(dp), dimension(nxi,nyi), intent(in) :: a
    real(dp), dimension(nxi,nyi), intent(inout) :: anew
    real(dp), dimension(nxi/block_dimx+1,nyi/block_dimy+1), intent(inout) :: error
    real(dp), shared, dimension(block_dimx,block_dimy) :: err_sh
    integer :: i, j, k, tx, ty

    i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
    j = (blockIdx%y - 1)*blockDim%y + threadIdx%y
    tx = threadIdx%x
    ty = threadIdx%y

    if (i > 1 .and. i < nxi .and. j > 1 .and. j < nyi) then
       anew(i,j) = 0.25d0*(a(i-1,j) + a(i+1,j) &
                       & + a(i,j-1) + a(i,j+1))
       err_sh(tx,ty) = abs(anew(i,j) - a(i,j))
    endif
    call syncthreads()

    error(blockIdx%x,blockIdx%y) = maxval(err_sh)

  end subroutine kernel_gpu_reduce

  attributes(global) subroutine max_reduce(local_error, error, nxi, nyi)
    implicit none

    integer, value, intent(in) :: nxi, nyi
    real(dp), dimension(nxi,nyi), intent(in) :: local_error
    real(dp), intent(out) :: error
    real(dp), shared, dimension(nxi) :: shared_error
    integer :: tx, i

    tx = threadIdx%x

    shared_error(tx) = 0.d0
    if (tx >=1 .and. tx <= nxi) shared_error(tx) = maxval(local_error(tx,:))
    call syncthreads()

    error = maxval(shared_error)

  end subroutine max_reduce
end module kernels

program laplace
  use cudafor
  use kernels
  use commons
  implicit none

  real(dp), allocatable, dimension(:,:) :: a, anew
  real(dp) :: error=1.d0
  real(dp), device, allocatable, dimension(:,:) :: adev, adevnew
  real(dp), device, allocatable, dimension(:,:) :: edev
  real(dp), allocatable, dimension(:,:) :: ehost
  real(dp), device :: error_dev
  integer    :: i
  integer    :: num_device, h_status, ierrSync, ierrAsync
  type(dim3) :: dimGrid, dimBlock

  num_device = 0
  h_status   = cudaSetDevice(num_device)

  dimGrid  = dim3(nx/block_dimx+1, ny/block_dimy+1, 1)
  dimBlock = dim3(block_dimx, block_dimy, 1)

  allocate(a(nx,ny), anew(nx,ny))
  allocate(adev(nx,ny), adevnew(nx,ny))
  allocate(edev(dimGrid%x,dimGrid%y), ehost(dimGrid%x,dimGrid%y))

  do i = 1, nx
     a(i,:) = 1.d0
     anew(i,:) = 1.d0
  enddo

  adev    = a
  adevnew = anew

  call kernel_gpu_reduce<<<dimGrid, dimBlock>>>(adev, adevnew, edev, nx, ny)

  ierrSync = cudaGetLastError()
  ierrAsync = cudaDeviceSynchronize()
  if (ierrSync /= cudaSuccess) write(*,*) &
     & 'Sync kernel error - 1st kernel:', cudaGetErrorString(ierrSync)
  if (ierrAsync /= cudaSuccess) write(*,*) &
     & 'Async kernel error - 1st kernel:', cudaGetErrorString(ierrAsync)

  call max_reduce<<<1, dimGrid%x>>>(edev, error_dev, dimGrid%x, dimGrid%y)

  ierrSync = cudaGetLastError()
  ierrAsync = cudaDeviceSynchronize()
  if (ierrSync /= cudaSuccess) write(*,*) &
     & 'Sync kernel error - 2nd kernel:', cudaGetErrorString(ierrSync)
  if (ierrAsync /= cudaSuccess) write(*,*) &
     & 'Async kernel error - 2nd kernel:', cudaGetErrorString(ierrAsync)

  error = error_dev
  print*, 'error from kernel: ', error
  ehost = edev
  error = maxval(ehost)
  print*, 'error from host: ', error

  deallocate(a, anew, adev, adevnew, edev, ehost)

end program laplace

I first had a problem because of the kernel configuration of the second kernel (which was <<<1, dimGrid>>>); I modified the code following Robert's answer. Now I have a memory access error:

 Async kernel error - 2nd kernel:
 an illegal memory access was encountered                                                                                        
0: copyout Memcpy (host=0x666bf0, dev=0x4203e20000, size=8) FAILED: 77(an illegal memory access was encountered)

And, if I run it with cuda-memcheck:

========= Invalid __shared__ write of size 8
=========     at 0x00000060 in kernels_max_reduce_
=========     by thread (1,0,0) in block (0,0,0)
=========     Address 0x00000008 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/libcuda.so (cuLaunchKernel + 0x2c5) [0x14ad95]

for every thread.

The code is compiled with PGI Fortran 14.9 and CUDA 6.5 on a Tesla K20 card (with CUDA capability 3.5). I compile it with:

pgfortran -Mcuda -ta:nvidia,cc35 laplace.f90 -o laplace
cuda
fortran
reduction
asked on Stack Overflow Feb 16, 2015 by MBR • edited Feb 17, 2015 by MBR

1 Answer

3

You can do proper cuda error checking in CUDA Fortran. You should do so in your code.

One problem is that you're trying to launch too many threads (per block) in your second kernel:

call max_reduce<<<1, dimGrid>>>(edev, error_dev, dimGrid%x, dimGrid%y)
                     ^^^^^^^

The dimGrid parameter has previously been computed to be:

dimGrid  = dim3(nx/block_dimx+1, ny/block_dimy+1, 1);

Substituting actual values, we have:

dimGrid = dim3(1024/16 + 1, 1024/32 +1);

i.e.

dimGrid = dim3(65,33);

But you are not allowed to request 65*33 = 2145 threads per block. The maximum is either 512 or 1024 depending on what device architecture target you are compiling for.

Because of this error, your second kernel is not running at all.

answered on Stack Overflow Feb 16, 2015 by Robert Crovella

User contributions licensed under CC BY-SA 3.0