My kernel for warp reduce looks like below
#include <stdio.h>
__global__ void reduce(int* var ) {
*var += __shfl_down_sync(0xffffffff, *var, 16,32);
*var += __shfl_down_sync(0xffffffff, *var, 8,32);
*var += __shfl_down_sync(0xffffffff, *var, 4,32);
*var += __shfl_down_sync(0xffffffff, *var, 2,32);
*var += __shfl_down_sync(0xffffffff, *var, 1,32);
}
int main() {
int c=22;
int *dev_c;
cudaMalloc( (void**)&dev_c, sizeof(int) ) ;
cudaMemcpy(dev_c,&c,sizeof(int), cudaMemcpyHostToDevice);
reduce<<<1,512>>>(dev_c);
cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost );
printf( "Value is %d\n", c );
return 0;
}
I am trying to understand how it works. I have launched it with varying blocks and threads and input as 22
Input: 22
Sno Blocks nthreads Warpsize Result
1 1 32 32 704
2 1 64 32 704
3 1 512 32 704
4 1 1024 32 1232
How does it depend on Blocks and threads
why am i getting 1232 when launching for 1,1024.
Does it depend on the warpSize in __shfl_down_sync(mask,value,delta,warpSize)
Can someone please explain how shfl_down_sync operation works for a single value rather than array.
User contributions licensed under CC BY-SA 3.0