Why is __shfl slower than shared memory in CUDA?

-2

I have an example that is implemented with shared memory: just a simple dot product of two vectors, which is a simple reduction problem. While reading the CUDA programming guide, I saw that there is an instruction called shuffle, which can exchange data within a warp.

ref: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
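
For reference, the warp-sum pattern from that section looks roughly like this (just a minimal sketch for context; warpSumSketch is my own placeholder name and is not part of the benchmark kernels below):

__inline__ __device__ float warpSumSketch(float val)
{
    // butterfly exchange: after the last step every lane of the warp holds the full sum
    for (int offset = 16; offset >= 1; offset /= 2)
        val += __shfl_xor_sync(0xffffffff, val, offset);
    return val;
}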

So I wrote two kernels to compare shared memory against shuffle, but shared memory is much faster than shuffle. Here is my code (running on a Tesla T4):

  1. Shared memory kernel (cost: 6207.838867 ms for 16777216 floats, looped 10000 times, block size 512, grid size 40):
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// final warp: relies on volatile loads/stores and implicit warp-synchronous execution
__device__ void warpReduce(volatile float* tile, int tid){
    tile[tid] += tile[tid+32];
    tile[tid] += tile[tid+16];
    tile[tid] += tile[tid+8];
    tile[tid] += tile[tid+4];
    tile[tid] += tile[tid+2];
    tile[tid] += tile[tid+1];
}


template <int BLOCKSIZE>
__global__ void dot_cuda_1_float_kernel5(int n,const float* x,const float* y,float* blockresult,float* result)//reduction,6207.838867
{
    extern __shared__ float tile[];
    cg::thread_block block = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    int tidx = block.thread_index().x;
    int blocksize = block.group_dim().x;
    int blockidx = block.group_index().x;
    int tid = 4*tidx + 4*blocksize*blockidx;

    if(grid.thread_rank() == 0)
    {
        *result = 0;
    }
    float tmp = 0;
    while(tid < n ) {
        tmp+= x[tid]*y[tid] + x[tid+1]*y[tid+1] + x[tid+2]*y[tid+2] + x[tid+3]*y[tid+3];
        tid = tid+4*blockDim.x*gridDim.x;
    }
    tile[tidx] = tmp;
    block.sync();

    if(BLOCKSIZE >= 512){ if(tidx < 256){ tile[tidx] += tile[tidx+256]; } block.sync(); }
    if(BLOCKSIZE >= 256){ if(tidx < 128){ tile[tidx] += tile[tidx+128]; } block.sync(); }
    if(BLOCKSIZE >= 128){ if(tidx <  64){ tile[tidx] += tile[tidx+ 64]; } block.sync(); }

    if(tidx < 32) warpReduce(tile,tidx);

    grid.sync();   // grid-wide sync (requires a cooperative launch) so the *result = 0 write is ordered before the atomicAdds

    if(tidx == 0) {
        atomicAdd(result,tile[0]);
    }
}
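
Because this kernel calls grid.sync(), I launch it cooperatively. A rough host-side sketch (the device pointers here are placeholders, not my exact benchmark driver):

    int n = 16777216;
    float *d_x, *d_y, *d_blockresult, *d_result;   // assumed already allocated with cudaMalloc
    dim3 grid(40), block(512);
    size_t smem = block.x * sizeof(float);          // dynamic shared memory for extern __shared__ tile[]
    void* args[] = { &n, &d_x, &d_y, &d_blockresult, &d_result };
    cudaLaunchCooperativeKernel((void*)dot_cuda_1_float_kernel5<512>, grid, block, args, smem, 0);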

  2. Shuffle kernel (cost: 6495 ms for the same workload: 16777216 floats, looped 10000 times, block size 512, grid size 40):

__global__ void dot_cuda_1_float_kernel6(int n,const float* x,const float* y,float* blockresult,float* result)//6495ms
{
    // extern __shared__ float tile[];
    __shared__ float tile[32];
    cg::thread_block block = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    int tidx = block.thread_index().x;
    int blocksize = block.group_dim().x;
    int blockidx = block.group_index().x;
    int tid = 4*tidx + 4*blocksize*blockidx;
    int laneId  = tidx%32;
    int warpId  = tidx/32;

    if(grid.thread_rank() == 0)
    {
        *result = 0;
    }

    float tmp = 0;
    while(tid < n ) {
        tmp+= x[tid]*y[tid] + x[tid+1]*y[tid+1] + x[tid+2]*y[tid+2] + x[tid+3]*y[tid+3];
        tid = tid+4*blockDim.x*gridDim.x;
    }
    // warp-level butterfly reduction (could be factored into a warpReduceShuffle helper)
    tmp+= __shfl_xor_sync(0xffffffff,tmp,16);
    tmp+= __shfl_xor_sync(0xffffffff,tmp,8);
    tmp+= __shfl_xor_sync(0xffffffff,tmp,4);
    tmp+= __shfl_xor_sync(0xffffffff,tmp,2);
    tmp+= __shfl_xor_sync(0xffffffff,tmp,1);
    if(laneId == 0) tile[warpId] = tmp;

    block.sync();
    // only read the per-warp partial sums that were actually written; the remaining lanes get 0
    tmp = (tidx < blocksize/32) ? tile[laneId] : 0;
    if(warpId == 0)
    {
        tmp+= __shfl_xor_sync(0xffffffff,tmp,16);
        tmp+= __shfl_xor_sync(0xffffffff,tmp,8);
        tmp+= __shfl_xor_sync(0xffffffff,tmp,4);
        tmp+= __shfl_xor_sync(0xffffffff,tmp,2);
        tmp+= __shfl_xor_sync(0xffffffff,tmp,1);
    }
    // block.sync();
    // grid.sync();

    if(tidx == 0) {
        atomicAdd(result,tmp);
    }
}
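
The timings above come from looping each kernel 10000 times and timing with CUDA events, roughly like this (a sketch with placeholder names, not my exact harness):

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    for (int i = 0; i < 10000; ++i) {
        dot_cuda_1_float_kernel6<<<40, 512>>>(n, d_x, d_y, d_blockresult, d_result);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);   // total milliseconds for 10000 launches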

Tags: c++, cuda
asked on Stack Overflow Apr 21, 2021 by Haoran Wang

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0