I am trying to swap two variables in a thread-local context for CUDA device code.
As explored here, the CUDA compiler seems to be able to infer which registers to pass along.
However, will I experience warp divergence if I run the following code?
/**
 * Exchange the values of two thread-local variables.
 *
 * Both arguments are taken by reference so that the exchange is visible to the
 * caller. With by-value parameters only the function's private copies would be
 * exchanged and the call would have no observable effect.
 *
 * Declared __host__ __device__ so the helper can be called from kernels and
 * from host code alike.
 *
 * @tparam T  a copy-constructible, copy-assignable type.
 * @param a   first variable; holds the former value of b on return.
 * @param b   second variable; holds the former value of a on return.
 */
template <typename T> __host__ __device__ inline void swap_test_device(T& a, T& b)
{
    T tmp(a);  // keep the old value of a while it is overwritten
    a = b;
    b = tmp;
}
/**
 * Squares one element of each input array, calls swap_test_device on the two
 * squares when the global thread index is even, and stores the pair.
 *
 * Indexing uses only the .x components of threadIdx/blockIdx/blockDim.
 * There is no bounds guard: every launched thread reads input1/input2 and
 * writes output1/output2 at its global index, so all four arrays must hold at
 * least gridDim.x * blockDim.x elements.
 *
 * @param input1   read-only source for the first value.
 * @param input2   read-only source for the second value.
 * @param output1  receives x for each thread.
 * @param output2  receives y for each thread.
 */
__global__ void swap_test_global(const int* __restrict__ input1, const int* __restrict__ input2, int* output1, int* output2) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    const int first  = input1[gid];
    const int second = input2[gid];

    int x = first * first;
    int y = second * second;

    // The lowest bit is clear exactly when gid % 2 == 0.
    const bool even_thread = (gid & 1) == 0;
    if (even_thread) {
        swap_test_device(x, y);
    }

    output1[gid] = x;
    output2[gid] = y;
}
Further still, what will be the semantics of the following piece of code?
/**
 * Variant of the kernel above that places a warp shuffle between two
 * conditional calls to swap_test_device.
 *
 * Sequence per thread:
 *   1. square one element from each input array into x and y;
 *   2. on even global indices, call swap_test_device(x, y);
 *   3. overwrite x with the y held by the lane whose id differs in bit 1
 *      (lane XOR 2);
 *   4. on even global indices, call swap_test_device(x, y) again.
 *
 * The shuffle names all 32 lanes in its mask (0xFFFFFFFF) and is issued
 * outside the conditionals, so every lane of the warp has to reach it.
 * There is no bounds guard on the input reads.
 */
__global__ void swap_test_global(const int* __restrict__ input1, const int* __restrict__ input2, int* output1, int* output2) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    const bool even_thread = (gid & 1) == 0;  // same predicate as gid % 2 == 0

    const int first  = input1[gid];
    const int second = input2[gid];
    int x = first * first;
    int y = second * second;

    if (even_thread) {
        swap_test_device(x, y);
    }

    // The previous value of x is discarded here.
    x = __shfl_xor_sync(0xFFFFFFFF, y, 2);

    if (even_thread) {
        swap_test_device(x, y);
    }
    // Some operations involving x and y
}
Would the CUDA compiler be able to fuse the warp-shuffle instruction with the swap instruction?
Or would I be better off using shared memory to swap the data in this case?
User contributions licensed under CC BY-SA 3.0