I'm trying to develop a CUDA kernel to quantize a 1-D array, given an array of sorted quantization levels. Using a binary search tree I'm able to iterate through each element of the array and find the closest quantized level. I was able to functionally verify this using C++, and it works as intended, however, when trying to implement it using a CUDA kernel the executable crashes.
I'm unfamiliar with the debugging environment, but the disassembly shows an exception at:
00007FF6D434EA18 movss xmm0,dword ptr [rcx+rax] (in-page error reading location 0x0000000707200000, status code 0xC0000022).
My current code is below:
#include <iostream>
#include <math.h>
#include <limits>
// CUDA kernel function
__global__
// Quantize each element of `tensor` in place to the nearest entry of the
// sorted array `quant_levels`, via a binary search that tracks the closest
// level visited. Launched as <<<1,1>>>, so a single thread loops serially
// over all `num_elements` entries.
//
// BUG(review): the caller passes a *host* stack array as `quant_levels`;
// dereferencing that host pointer on the device is the in-page-error crash
// reported above. The levels must live in device-accessible memory
// (cudaMalloc + cudaMemcpy, or cudaMallocManaged).
void quantize_(float quant_levels[], int num_elements, float *tensor) {
for (int i = 0; i < num_elements; i++) {
int l = 0;
// BUG(review): `quant_levels` has decayed to `float*`, so this computes
// sizeof(float*)/sizeof(float) (2 on x64), NOT the number of levels.
// The length must be passed as an explicit parameter.
int h = sizeof(quant_levels) / sizeof(quant_levels[0]);
int middle_point;
float current_value = quant_levels[0]; // best (closest) level found so far
float current_difference;
float difference = 999999.f; // sentinel larger than any real distance
// NOTE(review): with h set to a count rather than the last valid index,
// middle_point can index one past the end of the array.
while (l <= h) {
middle_point = l + (h - l) / 2;
// NOTE(review): `abs` may resolve to the integer overload here;
// fabsf() is the intended single-precision absolute value.
current_difference = abs(tensor[i] - quant_levels[middle_point]);
if (current_difference < difference) {
difference = current_difference;
current_value = quant_levels[middle_point];
}
if (current_value < tensor[i]) {
l = middle_point + 1;
} else {
h = middle_point - 1;
}
}
tensor[i] = current_value; // write the nearest level back in place
}
return;
}
int main(void) {
    const int num_elements = 100;
    const int num_quant_levels = 5;

    // FIX: the original passed a host stack array (`float quant_levels[5]`)
    // straight to the kernel; dereferencing that host pointer on the device
    // is what caused the reported in-page-error crash. Both buffers are now
    // managed memory, which is accessible from host and device.
    float *tensor = nullptr;
    float *quant_levels = nullptr;
    cudaMallocManaged(&tensor, num_elements * sizeof(float));
    cudaMallocManaged(&quant_levels, num_quant_levels * sizeof(float));

    const float levels[num_quant_levels] = { 0.0f, 0.25f, 0.5f, 0.75f, 1.0f };
    for (int i = 0; i < num_quant_levels; i++) {
        quant_levels[i] = levels[i];
    }
    for (int i = 0; i < num_elements; i++) {
        tensor[i] = (float)rand() / (float)RAND_MAX;
    }

    std::cout << tensor[0] << "\r\n";
    // NOTE(review): the kernel still sizes its search with
    // sizeof(pointer)/sizeof(float); the level count should be passed as an
    // explicit kernel parameter (as the updated version below does).
    quantize_<<<1, 1>>>(quant_levels, num_elements, tensor);
    cudaError_t err = cudaGetLastError();   // catches launch-configuration errors
    if (err != cudaSuccess) {
        std::cerr << "launch failed: " << cudaGetErrorString(err) << "\r\n";
    }
    err = cudaDeviceSynchronize();          // catches asynchronous execution errors
    if (err != cudaSuccess) {
        std::cerr << "kernel failed: " << cudaGetErrorString(err) << "\r\n";
    }
    std::cout << tensor[0] << "\r\n";

    cudaFree(quant_levels);
    cudaFree(tensor);
    return 0;
}
Your help would be greatly appreciated!
Update: I managed to set up a development environment and use cudaMemcpy to copy between host and device, which gets it working as intended. Unfortunately, I'm still unable to use cudaMallocManaged in place of the explicit copies.
Please see below:
#include <iostream>
#include <math.h>
#include <limits>
#include "cuda_runtime.h"
// CUDA kernel: quantize each element of `tensor` in place to the nearest
// entry of the sorted array `quant_levels` (length num_quant_levels), using
// a binary search that tracks the closest level visited along the path.
//
// Grid-stride loop: produces identical results for any launch configuration,
// including the single-thread <<<1,1>>> launch used by the caller.
__global__ void quantize(int num_quant_levels, float* quant_levels, int num_elements, float* tensor) {
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements; i += stride) {
        int l = 0;                       // lower bound (first candidate index)
        // FIX: the upper bound must be the LAST VALID INDEX. With
        // h = num_quant_levels, middle_point could reach num_quant_levels
        // and read one element past the end of quant_levels.
        int h = num_quant_levels - 1;
        // FIX: optimal_point was uninitialized and the initial difference
        // (1.0f) could exclude every candidate; seed both from index 0 so
        // the result is always a real level.
        int optimal_point = 0;
        float difference = fabsf(tensor[i] - quant_levels[0]);
        while (l <= h) {
            int middle_point = l + (h - l) / 2;   // overflow-safe midpoint
            // fabsf: single-precision absolute value (plain abs may resolve
            // to the integer overload and truncate).
            float d = fabsf(tensor[i] - quant_levels[middle_point]);
            if (d < difference) {
                // This midpoint is closer than anything seen so far.
                difference = d;
                optimal_point = middle_point;
            }
            if (quant_levels[middle_point] < tensor[i]) {
                l = middle_point + 1;
            } else {
                h = middle_point - 1;
            }
        }
        tensor[i] = quant_levels[optimal_point];
    }
}
int main(void) {
    const int num_quant_levels = 5;
    const int num_elements = 10;
    float quant_levels[num_quant_levels] = { 0.0f, 0.25f, 0.5f, 0.75f, 1.0f };

    float* tensor = (float*)malloc(sizeof(float) * num_elements);
    float* tensor_gpu = nullptr;
    float* quant_levels_gpu = nullptr;
    cudaMalloc(&tensor_gpu, sizeof(float) * num_elements);
    // FIX: size with num_quant_levels instead of a hard-coded 5.
    cudaMalloc(&quant_levels_gpu, sizeof(float) * num_quant_levels);

    for (int i = 0; i < num_elements; i++) {
        tensor[i] = (float)rand() / (float)RAND_MAX;
        std::cout << tensor[i] << std::endl;
    }
    std::cout << std::endl;

    cudaMemcpy(tensor_gpu, tensor, sizeof(float) * num_elements, cudaMemcpyHostToDevice);
    // FIX: pass the array (decays to float*) rather than &quant_levels,
    // which has type float(*)[5] — same address, but the types should match.
    cudaMemcpy(quant_levels_gpu, quant_levels, sizeof(float) * num_quant_levels, cudaMemcpyHostToDevice);

    quantize<<<1, 1>>>(num_quant_levels, quant_levels_gpu, num_elements, tensor_gpu);
    cudaError_t err = cudaGetLastError();   // catches launch-configuration errors
    if (err != cudaSuccess) {
        std::cerr << "launch failed: " << cudaGetErrorString(err) << std::endl;
    }

    // The blocking cudaMemcpy on the default stream synchronizes with the
    // kernel, so no explicit cudaDeviceSynchronize is needed here.
    cudaMemcpy(tensor, tensor_gpu, sizeof(float) * num_elements, cudaMemcpyDeviceToHost);

    // FIX: quant_levels_gpu was leaked in the original.
    cudaFree(quant_levels_gpu);
    cudaFree(tensor_gpu);

    for (int i = 0; i < num_elements; i++) {
        std::cout << tensor[i] << std::endl;
    }
    free(tensor);
    return 0;
}
User contributions licensed under CC BY-SA 3.0