I have an example implemented with shared memory: just the simple dot product of two vectors, a basic reduction problem. In the CUDA programming guide I saw an instruction called shuffle, which can exchange data within a warp.
ref: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
So I wrote two kernels to compare shared memory against shuffle, but shared memory turned out to be much faster. My code is below, running on a Tesla T4.
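To make the shuffle semantics concrete before the kernels, here is a minimal standalone sketch of what __shfl_xor_sync does (this demo kernel is not part of the benchmark; its name and values are illustration only):

#include <cstdio>

// Each lane pairs with the lane whose index differs in bit 4 (lane k <-> k^16)
// and reads that lane's value: a register-to-register exchange within one
// warp, with no shared memory involved.
__global__ void shuffleDemo()
{
    int lane = threadIdx.x % 32;
    float v = (float)lane;                                   // lane's own value
    float fromPartner = __shfl_xor_sync(0xffffffff, v, 16);  // value from lane^16
    printf("lane %d received %f from lane %d\n", lane, fromPartner, lane ^ 16);
}

Launched as shuffleDemo<<<1, 32>>>(), each lane prints the value it pulled from its partner lane.

Shared-memory version: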
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__device__ void warpReduce(volatile float* tile, int tid)
{
    // Warp-synchronous reduction of the last 64 shared-memory elements;
    // volatile keeps the compiler from caching tile[] in registers.
    tile[tid] += tile[tid + 32];
    tile[tid] += tile[tid + 16];
    tile[tid] += tile[tid + 8];
    tile[tid] += tile[tid + 4];
    tile[tid] += tile[tid + 2];
    tile[tid] += tile[tid + 1];
}
template <int BLOCKSIZE>
__global__ void dot_cuda_1_float_kernel5(int n, const float* x, const float* y, float* blockresult, float* result) // shared-memory reduction: 6207.838867 ms
{
    extern __shared__ float tile[];
    cg::thread_block block = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    int tidx = block.thread_index().x;
    int blocksize = block.group_dim().x;
    int blockidx = block.group_index().x;
    int tid = 4 * tidx + 4 * blocksize * blockidx;  // 4 elements per thread
    // One thread zeroes the global accumulator; the grid.sync() below
    // guarantees this is visible before any block's atomicAdd.
    if (grid.thread_rank() == 0) {
        *result = 0;
    }
    // 4-way unrolled grid-stride loop: each thread accumulates a private
    // partial dot product in a register.
    float tmp = 0;
    while (tid < n) {
        tmp += x[tid] * y[tid] + x[tid+1] * y[tid+1] + x[tid+2] * y[tid+2] + x[tid+3] * y[tid+3];
        tid += 4 * blockDim.x * gridDim.x;
    }
    tile[tidx] = tmp;
    block.sync();
    // Tree reduction in shared memory down to 64 elements, then the
    // warp-synchronous finish in warpReduce().
    if (BLOCKSIZE >= 512) { if (tidx < 256) { tile[tidx] += tile[tidx + 256]; } block.sync(); }
    if (BLOCKSIZE >= 256) { if (tidx < 128) { tile[tidx] += tile[tidx + 128]; } block.sync(); }
    if (BLOCKSIZE >= 128) { if (tidx < 64)  { tile[tidx] += tile[tidx + 64];  } block.sync(); }
    if (tidx < 32) warpReduce(tile, tidx);
    grid.sync();  // ensure *result was zeroed before accumulating into it
    if (tidx == 0) {
        atomicAdd(result, tile[0]);
    }
}
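Both kernels use cg::this_grid() / grid.sync(), so they have to be started with a cooperative launch. For reference, a minimal host-side sketch of how kernel5 can be launched (d_x, d_y, d_blockresult, d_result are device pointers assumed allocated and filled elsewhere; the sizes match the benchmark numbers below):

int n = 16777216;
const int blocksize = 512, gridsize = 40;
void* args[] = { &n, &d_x, &d_y, &d_blockresult, &d_result };
// Cooperative launch is required for grid.sync(); the dynamic shared memory
// holds one float per thread for the block-level tree reduction.
cudaLaunchCooperativeKernel((void*)dot_cuda_1_float_kernel5<512>,
                            dim3(gridsize), dim3(blocksize), args,
                            blocksize * sizeof(float), 0);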
Shuffle version: 6495 ms vs. 6207.838867 ms for the shared-memory version (16777216 floats, looped 10000 times, block size 512, grid size 40):
__global__ void dot_cuda_1_float_kernel6(int n, const float* x, const float* y, float* blockresult, float* result) // shuffle reduction: 6495 ms
{
    __shared__ float tile[32];  // one partial sum per warp (max 32 warps/block)
    cg::thread_block block = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    int tidx = block.thread_index().x;
    int blocksize = block.group_dim().x;
    int blockidx = block.group_index().x;
    int tid = 4 * tidx + 4 * blocksize * blockidx;
    int laneId = tidx % 32;
    int warpId = tidx / 32;
    if (grid.thread_rank() == 0) {
        *result = 0;
    }
    // Same 4-way unrolled grid-stride loop as in kernel5.
    float tmp = 0;
    while (tid < n) {
        tmp += x[tid] * y[tid] + x[tid+1] * y[tid+1] + x[tid+2] * y[tid+2] + x[tid+3] * y[tid+3];
        tid += 4 * blockDim.x * gridDim.x;
    }
    // First stage: XOR-butterfly reduction within each warp; after these five
    // steps every lane holds its warp's full partial sum.
    // tmp = warpReduceShuffle(tmp);  // equivalent helper, sketched after the kernel
    tmp += __shfl_xor_sync(0xffffffff, tmp, 16);
    tmp += __shfl_xor_sync(0xffffffff, tmp, 8);
    tmp += __shfl_xor_sync(0xffffffff, tmp, 4);
    tmp += __shfl_xor_sync(0xffffffff, tmp, 2);
    tmp += __shfl_xor_sync(0xffffffff, tmp, 1);
    if (laneId == 0) tile[warpId] = tmp;  // lane 0 publishes the warp sum
    block.sync();
    // Second stage: warp 0 reduces the per-warp sums. Only blocksize/32 warps
    // wrote into tile[], so lanes beyond that count must contribute 0.
    tmp = (tidx < blocksize / 32) ? tile[laneId] : 0;
    if (warpId == 0)
    {
        tmp += __shfl_xor_sync(0xffffffff, tmp, 16);
        tmp += __shfl_xor_sync(0xffffffff, tmp, 8);
        tmp += __shfl_xor_sync(0xffffffff, tmp, 4);
        tmp += __shfl_xor_sync(0xffffffff, tmp, 2);
        tmp += __shfl_xor_sync(0xffffffff, tmp, 1);
    }
    grid.sync();  // ensure *result was zeroed before accumulating into it
    if (tidx == 0) {
        atomicAdd(result, tmp);
    }
}
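The commented-out warpReduceShuffle(tmp) call above refers to a small helper; a minimal sketch, equivalent to the five inline shuffle lines:

__device__ float warpReduceShuffle(float val)
{
    // XOR-butterfly sum across the 32 lanes of a warp; every lane
    // ends up holding the full warp sum.
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_xor_sync(0xffffffff, val, offset);
    return val;
}

So: why is the shuffle version slower than the shared-memory version here?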