I am trying to process a matrix in parallel in CUDA. I need to compute each column of the matrix against a given vector, and if the result is greater than a certain value I keep that column; otherwise the column is removed from further computation. To avoid copying and restructuring the matrix, I use column indices to indicate whether a column should be used in subsequent computations.
This process needs to be repeated multiple times, and each time only a subset of all columns needs to be checked. So I created another matrix to store the column indices to process in each pass. For example, if I have a matrix of 10 columns and I need to repeat this process 4 times, the column_indices matrix may look like this:
thrust::device_vector<int> column_indices( std::vector<int>( {
    0, 1, -1, -1, -1, // 2 columns contain useful information
    5, 6, 7, -1, -1,  // 3 columns contain useful information
    9, 8, 7, 6, -1,   // 4 columns contain useful information
    4, 3, 2, 1, 0     // 5 columns contain useful information
} ) );
This is just a simplified example. In the real code I have to process a matrix with roughly 500-1000 columns. Because not all columns need to be processed each time and the number of columns is large, it may not be a good idea to assign every column to a thread, since that could leave perhaps half of the threads idle.
So I decided to use dynamic parallelism: a parent kernel checks how many threads are needed, then launches a child kernel with exactly that number of threads and allocates exactly the shared memory needed.
Here is my code:
#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
__device__
float calculate( const float* v1, const float* v2, const int length )
{
    // mock calculation returning 0.0 for even threads and 0.5 for odd threads
    return threadIdx.x % 2 == 0 ? 0.0f : 0.5f;
}
__global__
void child( float const* input_a, const int nrow, float const* input_b, int* columns, int* counts )
{
    extern __shared__ float results[];
    // input_a is a matrix stored in column-major order, and input_b is a vector
    int thread_column = columns[ threadIdx.x ];
    float const* thread_input = input_a + thread_column * nrow;
    results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );
    //--------------Discussion-----------
    // The race condition warning is gone if I replace the line above with this:
    //atomicExch( results + threadIdx.x, calculate( thread_input, input_b, nrow ) );
    // However, this looks unnecessary to me, as each thread is accessing a different address.
    //-----------------------------------
    __syncthreads();
    if ( threadIdx.x == 0 ) {
        // sort the column indices by descending result, so all indices to be removed end up at the back
        thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );
        // count the number of indices to be removed
        int remove_count = thrust::count( thrust::seq, results, results + blockDim.x, 0.0f );
        *counts -= remove_count;
    }
}
__global__
void parent( float const* inputs, const int nrow, float const* output, int* column_indices, int* column_counts, const int column_size )
{
    int row_per_group = blockDim.x;
    int group_num = blockIdx.x, row_num = threadIdx.x;
    int tid = group_num * row_per_group + row_num;
    int* indices_for_this_block = column_indices + tid * column_size;
    int* count_for_this_block = column_counts + tid;
    // launch a child kernel to process this row
    int block_size = *count_for_this_block;
    if ( block_size > 0 ) {
        child<<< 1, block_size, sizeof( float ) * block_size >>>( inputs, nrow, output, indices_for_this_block, count_for_this_block );
        cudaDeviceSynchronize();
    }
}
int main()
{
    thrust::device_vector<int> column_indices( std::vector<int>( {
        0, 1, -1, -1, -1, // 2 columns contain useful information
        5, 6, 7, -1, -1,  // 3 columns contain useful information
        9, 8, 7, 6, -1,   // 4 columns contain useful information
        4, 3, 2, 1, 0     // 5 columns contain useful information
    } ) );
    thrust::device_vector<int> column_count( std::vector<int>( { 2, 3, 4, 5 } ) );
    // Process column_indices in two groups, each group processing two rows.
    // Because we are mocking the correlation results, we don't need real data, so we pass nullptr as the data pointer.
    parent<<< 2, 2 >>>(
        nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
    );
    //--------------Discussion-----------
    // The race condition warning is also gone if I launch the parent kernel like this:
    //parent<<< 2, 2, sizeof( float ) * 5 >>>(
    //    nullptr, 0, nullptr, column_indices.data().get(), column_count.data().get(), 5
    //);
    // But when the total number of columns is large, this approach will fail as it exceeds the maximum capacity of shared memory
    // (although only a fraction of the allocation is actually used).
    //-----------------------------------
    cudaDeviceSynchronize();
    std::cout << "Row #0: ";
    std::copy( column_indices.begin(), column_indices.begin() + column_count[ 0 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
    std::cout << "Row #1: ";
    std::copy( column_indices.begin() + 5, column_indices.begin() + 5 + column_count[ 1 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
    std::cout << "Row #2: ";
    std::copy( column_indices.begin() + 10, column_indices.begin() + 10 + column_count[ 2 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
    std::cout << "Row #3: ";
    std::copy( column_indices.begin() + 15, column_indices.begin() + 15 + column_count[ 3 ], std::ostream_iterator<int>( std::cout, ", " ) );
    std::cout << std::endl;
}
Running the above code, I get the correct results:
Row #0: 1,
Row #1: 6,
Row #2: 8, 6,
Row #3: 3, 1,
However, cuda-memcheck seems to complain about potential race conditions like this:
========= WARN:(Warp Level Programming) Potential RAW hazard detected at __shared__ 0x13 in block (0, 0, 0) :
========= Write Thread (4, 0, 0) at 0x00000070 in /path_to_file/main.cu:23:child(float const *, int, float const *, int*, int*)
========= Read Thread (0, 0, 0) at 0x00000648 in /usr/local/cuda/include/thrust/system/detail/sequential/insertion_sort.h:109:child(float const *, int, float const *, int*, int*)
========= Current Value : 0
Line #23 in main.cu is this line:
results[ threadIdx.x ] = calculate( thread_input, input_b, nrow );
and the reading thread seems to be related to:
thrust::sort_by_key( thrust::seq, results, results + blockDim.x, columns, thrust::greater<float>() );
But why would this happen between two lines that are separated by __syncthreads()? I don't understand why this is happening, since I call __syncthreads() before letting thread 0 process the calculated results. Could anyone please let me know what I did wrong? Thank you very much!
At this time (through CUDA 8.0), the cuda-memcheck racecheck tool does not support dynamic parallelism.
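For context, the "Warp Level Programming" hazard shown above comes from the racecheck tool, and because that tool does not support dynamic parallelism, hazard reports involving a kernel launched from the device (like child here) are not reliable. A minimal sketch of the build-and-check commands, assuming the file is named main.cu and an sm_35-class GPU (file and binary names are placeholders; dynamic parallelism requires relocatable device code and linking against the device runtime):

nvcc -arch=sm_35 -rdc=true main.cu -o app -lcudadevrt
cuda-memcheck --tool racecheck ./app

Within a single block, the __syncthreads() between each thread's write to results[ threadIdx.x ] and thread 0's reads inside thrust::sort_by_key already provides the required ordering, so the reported RAW hazard can be treated as a false positive in this code.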