question about __shfl_sync in CUDA programming

Question

question about __shfl_sync in CUDA programming

I am freshman in CUDA programming, and now reading the following code for kawpow algorithm. The purpose of this line "hash_seed[0] = __shfl_sync(0xFFFFFFFF, state2[0], h, PROGPOW_LANES);" is to braodcast the value of state[0] from h-th thread to all other threads. I am wondering there are many threads (blockIdx.x = 0~255, threadIdx.x = 0~511, so total 131072 threads) need to broadcast state[0] to other theads, what is the final result for these braodcasting?

void **progpow_search**(uint64_t start_nonce,   const hash32_t header,   const uint64_t target,    const dag_t *g_dag,   volatile Search_results* g_output,   bool hack_false)
{
__shared__ uint32_t c_dag[PROGPOW_CACHE_WORDS];
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
uint64_t const nonce = start_nonce + gid;    
//printf("threadIdx.x = %d\n",threadIdx.x);  // blockDim.x = 512, blockIdx.x = 0~255, threadIdx.x = 0~511
const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1);

// Load the first portion of the DAG into the shared cache
for (uint32_t word = threadIdx.x*PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS; word += blockDim.x*PROGPOW_DAG_LOADS)
{
    dag_t load = g_dag[word/PROGPOW_DAG_LOADS];
    for(int i=0; i<PROGPOW_DAG_LOADS; i++)
        c_dag[word + i] =  load.s[i];
}

// Force threads to sync and ensure shared mem is in sync
__syncthreads();

//uint32_t state[25];    // Keccak's state
uint32_t hash_seed[2];  // KISS99 initiator
hash32_t digest;        // Carry-over from mix output

uint32_t state2[8];

{
    // Absorb phase for initial round of keccak

    uint32_t state[25] = {0x0};  // Keccak's state

    // 1st fill with header data (8 words)
    for (int i = 0; i < 8; i++)
        state[i] = header.uint32s[i];

    // 2nd fill with nonce (2 words)
    state[8] = nonce;
    state[9] = nonce >> 32;

    // 3rd apply ravencoin input constraints
    for (int i = 10; i < 25; i++)
        state[i] = ravencoin_rndc[i-10];

    // Run intial keccak round
    
    keccak_f800(state);
    
    for (int i = 0; i < 8; i++)
        state2[i] = state[i];
}

uint32_t ttt0 = state2[0];
uint32_t ttt1 = state2[1];

#pragma unroll 1
for (uint32_t h = 0; h < PROGPOW_LANES; h++)
{
    uint32_t mix[PROGPOW_REGS];

    hash_seed[0] = __shfl_sync(0xFFFFFFFF, state2[0], h, PROGPOW_LANES);  // PROGPOW_LANES = 16
    hash_seed[1] = __shfl_sync(0xFFFFFFFF, state2[1], h, PROGPOW_LANES);  // PROGPOW_LANES = 16
    
    // initialize mix for all lanes
    fill_mix(hash_seed, lane_id, mix);

    #pragma unroll 1
    for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
        progPowLoop(l, mix, g_dag, c_dag, hack_false);

    // Reduce mix data to a per-lane 32-bit digest
    uint32_t digest_lane = FNV_OFFSET_BASIS;
    #pragma unroll
    for (int i = 0; i < PROGPOW_REGS; i++)
        fnv1a(digest_lane, mix[i]);

    // Reduce all lanes to a single 256-bit digest
    hash32_t digest_temp;
    #pragma unroll
    for (int i = 0; i < 8; i++)
        digest_temp.uint32s[i] = FNV_OFFSET_BASIS;

    for (int i = 0; i < PROGPOW_LANES; i += 8)
        #pragma unroll
        for (int j = 0; j < 8; j++)
            fnv1a(digest_temp.uint32s[j], SHFL(digest_lane, i + j, PROGPOW_LANES));

    if (h == lane_id)
        digest = digest_temp;
}

// Absorb phase for last round of keccak (256 bits)
uint64_t result;

{
    uint32_t state[25] = {0x0};  // Keccak's state

    // 1st initial 8 words of state are kept as carry-over from initial keccak
    for (int i = 0; i < 8; i++)
        state[i] = state2[i];

    // 2nd subsequent 8 words are carried from digest/mix
    for (int i = 8; i < 16; i++)
        state[i] = digest.uint32s[i - 8];

    // 3rd apply ravencoin input constraints
    for (int i = 16; i < 25; i++)
        state[i] = ravencoin_rndc[i - 16];

    // Run keccak loop
    keccak_f800(state);

    // Extract result, swap endianness, and compare with target
    result = (uint64_t) cuda_swab32(state[0]) << 32 | cuda_swab32(state[1]);
}

// Check result vs target
if (result >= target)
    return;

uint32_t index = atomicInc((uint32_t *)&g_output->count, 0xffffffff);
if (index >= MAX_SEARCH_RESULTS)
    return;

g_output->result[index].gid = gid;
#pragma unroll
for (int i = 0; i < 8; i++)
    g_output->result[index].mix[i] = digest.uint32s[i];
}

c++

cuda

asked on Stack Overflow Oct 19, 2020 by

user12980998 • edited Oct 19, 2020 by

talonmies

0 Answers

Nobody has answered this question yet.

User contributions licensed under CC BY-SA 3.0