I am freshman in CUDA programming, and now reading the following code for kawpow algorithm. The purpose of this line "hash_seed[0] = __shfl_sync(0xFFFFFFFF, state2[0], h, PROGPOW_LANES);" is to braodcast the value of state[0] from h-th thread to all other threads. I am wondering there are many threads (blockIdx.x = 0~255, threadIdx.x = 0~511, so total 131072 threads) need to broadcast state[0] to other theads, what is the final result for these braodcasting?
void **progpow_search**(uint64_t start_nonce, const hash32_t header, const uint64_t target, const dag_t *g_dag, volatile Search_results* g_output, bool hack_false)
{
__shared__ uint32_t c_dag[PROGPOW_CACHE_WORDS];
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
uint64_t const nonce = start_nonce + gid;
//printf("threadIdx.x = %d\n",threadIdx.x); // blockDim.x = 512, blockIdx.x = 0~255, threadIdx.x = 0~511
const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1);
// Load the first portion of the DAG into the shared cache
for (uint32_t word = threadIdx.x*PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS; word += blockDim.x*PROGPOW_DAG_LOADS)
{
dag_t load = g_dag[word/PROGPOW_DAG_LOADS];
for(int i=0; i<PROGPOW_DAG_LOADS; i++)
c_dag[word + i] = load.s[i];
}
// Force threads to sync and ensure shared mem is in sync
__syncthreads();
//uint32_t state[25]; // Keccak's state
uint32_t hash_seed[2]; // KISS99 initiator
hash32_t digest; // Carry-over from mix output
uint32_t state2[8];
{
// Absorb phase for initial round of keccak
uint32_t state[25] = {0x0}; // Keccak's state
// 1st fill with header data (8 words)
for (int i = 0; i < 8; i++)
state[i] = header.uint32s[i];
// 2nd fill with nonce (2 words)
state[8] = nonce;
state[9] = nonce >> 32;
// 3rd apply ravencoin input constraints
for (int i = 10; i < 25; i++)
state[i] = ravencoin_rndc[i-10];
// Run intial keccak round
keccak_f800(state);
for (int i = 0; i < 8; i++)
state2[i] = state[i];
}
uint32_t ttt0 = state2[0];
uint32_t ttt1 = state2[1];
#pragma unroll 1
for (uint32_t h = 0; h < PROGPOW_LANES; h++)
{
uint32_t mix[PROGPOW_REGS];
hash_seed[0] = __shfl_sync(0xFFFFFFFF, state2[0], h, PROGPOW_LANES); // PROGPOW_LANES = 16
hash_seed[1] = __shfl_sync(0xFFFFFFFF, state2[1], h, PROGPOW_LANES); // PROGPOW_LANES = 16
// initialize mix for all lanes
fill_mix(hash_seed, lane_id, mix);
#pragma unroll 1
for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
progPowLoop(l, mix, g_dag, c_dag, hack_false);
// Reduce mix data to a per-lane 32-bit digest
uint32_t digest_lane = FNV_OFFSET_BASIS;
#pragma unroll
for (int i = 0; i < PROGPOW_REGS; i++)
fnv1a(digest_lane, mix[i]);
// Reduce all lanes to a single 256-bit digest
hash32_t digest_temp;
#pragma unroll
for (int i = 0; i < 8; i++)
digest_temp.uint32s[i] = FNV_OFFSET_BASIS;
for (int i = 0; i < PROGPOW_LANES; i += 8)
#pragma unroll
for (int j = 0; j < 8; j++)
fnv1a(digest_temp.uint32s[j], SHFL(digest_lane, i + j, PROGPOW_LANES));
if (h == lane_id)
digest = digest_temp;
}
// Absorb phase for last round of keccak (256 bits)
uint64_t result;
{
uint32_t state[25] = {0x0}; // Keccak's state
// 1st initial 8 words of state are kept as carry-over from initial keccak
for (int i = 0; i < 8; i++)
state[i] = state2[i];
// 2nd subsequent 8 words are carried from digest/mix
for (int i = 8; i < 16; i++)
state[i] = digest.uint32s[i - 8];
// 3rd apply ravencoin input constraints
for (int i = 16; i < 25; i++)
state[i] = ravencoin_rndc[i - 16];
// Run keccak loop
keccak_f800(state);
// Extract result, swap endianness, and compare with target
result = (uint64_t) cuda_swab32(state[0]) << 32 | cuda_swab32(state[1]);
}
// Check result vs target
if (result >= target)
return;
uint32_t index = atomicInc((uint32_t *)&g_output->count, 0xffffffff);
if (index >= MAX_SEARCH_RESULTS)
return;
g_output->result[index].gid = gid;
#pragma unroll
for (int i = 0; i < 8; i++)
g_output->result[index].mix[i] = digest.uint32s[i];
}
User contributions licensed under CC BY-SA 3.0