I am trying to process a matrix in parallel in CUDA. I need to compute each column of the matrix against a given vector, and if the result is greater than a certain value I will keep this column; otherwise the column is removed from further computation. To avoid copying and restructuring the matrix, I used column indices to indicate whether a column should be used for further computation.

This process needs to be done multiple times, and each time only a subset of all columns needs to be checked. So I created another matrix to store the column indices to process each time. For example, if I have a matrix of 10 columns and I need to repeat this process 4 times, the `column_indices` matrix may look like this:

```
thrust::device_vector<int> column_indices( std::vector<int>( {
0, 1, -1, -1, -1, // 2 columns contain useful information
5, 6, 7, -1, -1, // 3 columns contain useful information
9, 8, 7, 6, -1, // 4 columns contain useful information
4, 3, 2, 1, 0 // 5 columns contain useful information
} ) );
```

This is just a simplified example. In real code I have to process a matrix with roughly 500–1000 columns. Because not all columns need to be processed each time and the number of columns is large, it may not be a good idea to assign every column to a thread, as that could leave perhaps half of the threads idle.

So I decided to use dynamic parallelism: a parent kernel checks how many threads are needed, then launches a child kernel with exactly that number of threads and allocates exactly the amount of shared memory needed.

Here is my code:

```
#include <iostream>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
// Device helper invoked by child(): scores one matrix column (v1) against the
// vector v2 over `length` rows. In this repro it is mocked, so v1, v2 and
// length are intentionally unused — it simply returns 0.0f for even-numbered
// threads and 0.5f for odd-numbered ones, so roughly half of the columns get
// marked for removal.
__device__
float calculate( const float* v1, const float* v2, const int length )
{
// mock calculation resulting in 0.0 for even threads and 0.5 for odd threads
return threadIdx.x % 2 == 0 ? 0.0f : 0.5f;
}
// Child kernel, launched via dynamic parallelism from parent() as
// <<< 1, block_size, sizeof(float) * block_size >>>, where block_size is the
// number of valid (non -1) column indices for this row — one thread per
// active column. Each thread scores its column into dynamic shared memory;
// then thread 0 alone sorts the indices by score (descending) and decrements
// *counts by the number of zero-scored columns, so the surviving indices end
// up packed at the front of `columns`.
__global__
void child( float const* input_a, const int nrow, float const* input_b, int* columns, int* counts )
{
// Dynamically-sized shared scratch: one score slot per thread/column.
extern __shared__ float results[];
// input_a is a matrix stored in column-major order, and input_b is a vector
int thread_column = columns[ threadIdx.x ];
// Column-major layout: column c begins at input_a + c * nrow.
float const* thread_input = input_a+ thread_column * nrow;
results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );
//--------------Discussion-----------
//Race condition is gone if I replace the line above with this:
//atomicExch( results + threadIdx.x, calculate( thread_input, input_b, nrow ) );
//However it looks to me unnecessary as each thread is accessing a different address
//-----------------------------------
__syncthreads(); // barrier: all score writes complete before thread 0 reads them
if ( threadIdx.x == 0 ) {
// sort the column indices in descending results order so all indices to be removed are at the end of the indices
thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );
// count the number of indices to be removed (entries whose score is exactly 0.0f)
int remove_count = thrust::count( thrust::seq, results, results + blockDim.x, 0.0f );
*counts -= remove_count;
}
}
// Parent kernel: one thread per row of the column_indices matrix. Thread `tid`
// owns the `column_size`-wide slice starting at column_indices + tid * column_size
// and the counter column_counts[tid]; if the row has at least one active column,
// it launches a child grid (dynamic parallelism) sized exactly to that count,
// with exactly enough dynamic shared memory for the per-column scores.
__global__
void parent( float const* inputs, const int nrow, float const* output, int* column_indices, int* column_counts, const int column_size )
{
int row_per_group = blockDim.x;
int group_num = blockIdx.x, row_num = threadIdx.x;
// Flat global thread id — doubles as the row number this thread processes.
int tid = group_num * row_per_group + row_num;
int* indices_for_this_block = column_indices + tid * column_size;
int* count_for_this_block = column_counts + tid;
// launch child kernels to process the row
int block_size = *count_for_this_block;
if ( block_size > 0 ) {
// NOTE(review): the child launch result is never checked; a device-side
// cudaGetLastError() here would surface launch-configuration failures.
child<<< 1, block_size, sizeof( float ) * block_size >>>( inputs, nrow, output, indices_for_this_block, count_for_this_block );
cudaDeviceSynchronize(); // wait for this thread's child grid to complete
}
}
int main()
{
// Each row of 5 entries is one round of processing; -1 marks an unused slot.
thrust::device_vector<int> column_indices( std::vector<int>( {
0, 1, -1, -1, -1, // 2 columns contain useful information
5, 6, 7, -1, -1, // 3 columns contain useful information
9, 8, 7, 6, -1, // 4 columns contain useful information
4, 3, 2, 1, 0 // 5 columns contain useful information
} ) );
// Number of valid indices in each row of column_indices above.
thrust::device_vector<int> column_count( std::vector<int>( { 2, 3, 4, 5 } ) );
// Processing column_indices in two groups, and each group processes two rows.
// Because we are mocking the correlation results, we don't need real data, so we pass nullptr as the data pointer.
parent<<< 2, 2 >>>(
nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
);
//--------------Discussion-----------
// Race condition is also gone if I launch the parent kernel like this:
//parent<<< 2, 2, sizeof( float ) * 5 >>>(
// nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
//);
// But when the total number of columns is big, this approach will fail as it exceeds the maximum capacity of shared memory
// (although only a fraction of the allocation is actually used).
//-----------------------------------
// NOTE(review): no cudaGetLastError() after the launch and the sync result is
// ignored — launch or execution errors would go unnoticed here.
cudaDeviceSynchronize();
// Print the surviving column indices of each row. Reading column_count[ i ]
// through the device_vector triggers an implicit device-to-host copy.
std::cout << "Row #0: ";
std::copy( column_indices.begin(), column_indices.begin() + column_count[ 0 ], std::ostream_iterator<int>( std::cout, ", " ) );
std::cout << std::endl;
std::cout << "Row #1: ";
std::copy( column_indices.begin() + 5, column_indices.begin() + 5 + column_count[ 1 ], std::ostream_iterator<int>( std::cout, ", " ) );
std::cout << std::endl;
std::cout << "Row #2: ";
std::copy( column_indices.begin() + 10, column_indices.begin() + 10 + column_count[ 2 ], std::ostream_iterator<int>( std::cout, ", " ) );
std::cout << std::endl;
std::cout << "Row #3: ";
std::copy( column_indices.begin() + 15, column_indices.begin() + 15 + column_count[ 3 ], std::ostream_iterator<int>( std::cout, ", " ) );
std::cout << std::endl;
}
```

Running the above code, I got the correct results:

```
Row #0: 1,
Row #1: 6,
Row #2: 8, 6,
Row #3: 3, 1,
```

However, `cuda-memcheck` seems to complain about potential race conditions like this:

```
========= WARN:(Warp Level Programming) Potential RAW hazard detected at __shared__ 0x13 in block (0, 0, 0) :
========= Write Thread (4, 0, 0) at 0x00000070 in /path_to_file/main.cu:23:child(float const *, int, float const *, int*, int*)
========= Read Thread (0, 0, 0) at 0x00000648 in /usr/local/cuda/include/thrust/system/detail/sequential/insertion_sort.h:109:child(float const *, int, float const *, int*, int*)
========= Current Value : 0
```

Line #23 in main.cu is this line:

```
results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );
```

and the reading thread seems to be related to:

```
thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );
```

But why would this happen between two lines separated by `__syncthreads()`?

I don't understand why this is happening.

- With this example, each child block will only have up to 5 threads.
- I called `__syncthreads()` before letting thread 0 process the calculated results.
- My understanding is that shared memory is private to each block (maybe this is where the problem came from), so the multiple launches of the child kernel should not interfere with each other.
- If I modify my code slightly (as outlined in the Discussion sections in the code), I can remove the race condition. But why do those variants work while the original doesn't?

Could anyone please let me know what I did wrong? Thank you very much!

At this time (through CUDA 8.0), the `cuda-memcheck` racecheck tool does not support dynamic parallelism.

answered on Stack Overflow Nov 19, 2016 by Robert Crovella

User contributions licensed under CC BY-SA 3.0