I'm trying to develop a CUDA kernel to quantize a 1-D array, given an array of sorted quantization levels. Using a binary search tree I'm able to iterate through each element of the array and find the closest quantized level. I was able to functionally verify this using C++, and it works as intended, however, when trying to implement it using a CUDA kernel the executable crashes.
I'm unfamiliar with the debugging environment, but the disassembly shows an exception at:
00007FF6D434EA18 movss xmm0,dword ptr [rcx+rax] (in-page error reading location 0x0000000707200000, status code 0xC0000022).
My current code is below:
#include <iostream>
#include <math.h>
#include <limits>
// CUDA kernel function
__global__
// Quantize each element of `tensor` in place to the nearest entry of the
// sorted array `quant_levels`, via a binary search that tracks the closest
// level visited. Launched as <<<1,1>>>, so a single thread loops serially
// over all `num_elements` entries.
//
// BUG(review): the caller passes a *host* stack array as `quant_levels`;
// dereferencing that host pointer on the device is the in-page-error crash
// reported above. The levels must live in device-accessible memory
// (cudaMalloc + cudaMemcpy, or cudaMallocManaged).
void quantize_(float quant_levels[], int num_elements, float *tensor) {
for (int i = 0; i < num_elements; i++) {
int l = 0;
// BUG(review): `quant_levels` has decayed to `float*`, so this computes
// sizeof(float*)/sizeof(float) (2 on x64), NOT the number of levels.
// The length must be passed as an explicit parameter.
int h = sizeof(quant_levels) / sizeof(quant_levels[0]);
int middle_point;
float current_value = quant_levels[0]; // best (closest) level found so far
float current_difference;
float difference = 999999.f; // sentinel larger than any real distance
// NOTE(review): with h set to a count rather than the last valid index,
// middle_point can index one past the end of the array.
while (l <= h) {
middle_point = l + (h - l) / 2;
// NOTE(review): `abs` may resolve to the integer overload here;
// fabsf() is the intended single-precision absolute value.
current_difference = abs(tensor[i] - quant_levels[middle_point]);
if (current_difference < difference) {
difference = current_difference;
current_value = quant_levels[middle_point];
}
if (current_value < tensor[i]) {
l = middle_point + 1;
} else {
h = middle_point - 1;
}
}
tensor[i] = current_value; // write the nearest level back in place
}
return;
}
int main(void) {
    const int num_elements = 100;
    const int num_quant_levels = 5;

    // FIX: the original passed a host stack array (`float quant_levels[5]`)
    // straight to the kernel; dereferencing that host pointer on the device
    // is what caused the reported in-page-error crash. Both buffers are now
    // managed memory, which is accessible from host and device.
    float *tensor = nullptr;
    float *quant_levels = nullptr;
    cudaMallocManaged(&tensor, num_elements * sizeof(float));
    cudaMallocManaged(&quant_levels, num_quant_levels * sizeof(float));

    const float levels[num_quant_levels] = { 0.0f, 0.25f, 0.5f, 0.75f, 1.0f };
    for (int i = 0; i < num_quant_levels; i++) {
        quant_levels[i] = levels[i];
    }
    for (int i = 0; i < num_elements; i++) {
        tensor[i] = (float)rand() / (float)RAND_MAX;
    }

    std::cout << tensor[0] << "\r\n";
    // NOTE(review): the kernel still sizes its search with
    // sizeof(pointer)/sizeof(float); the level count should be passed as an
    // explicit kernel parameter (as the updated version below does).
    quantize_<<<1, 1>>>(quant_levels, num_elements, tensor);
    cudaError_t err = cudaGetLastError();   // catches launch-configuration errors
    if (err != cudaSuccess) {
        std::cerr << "launch failed: " << cudaGetErrorString(err) << "\r\n";
    }
    err = cudaDeviceSynchronize();          // catches asynchronous execution errors
    if (err != cudaSuccess) {
        std::cerr << "kernel failed: " << cudaGetErrorString(err) << "\r\n";
    }
    std::cout << tensor[0] << "\r\n";

    cudaFree(quant_levels);
    cudaFree(tensor);
    return 0;
}
Your help would be greatly appreciated!
Update: I managed to set up a development environment and use cudaMemcpy to copy between host and device, which gets it working as intended. Unfortunately, I'm still unable to use cudaMallocManaged in place of the explicit copies.
Please see below:
#include <iostream>
#include <math.h>
#include <limits>
#include "cuda_runtime.h"
// CUDA kernel: quantize each element of `tensor` in place to the nearest
// entry of the sorted array `quant_levels` (length num_quant_levels), using
// a binary search that tracks the closest level visited along the path.
//
// Grid-stride loop: produces identical results for any launch configuration,
// including the single-thread <<<1,1>>> launch used by the caller.
__global__ void quantize(int num_quant_levels, float* quant_levels, int num_elements, float* tensor) {
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elements; i += stride) {
        int l = 0;                       // lower bound (first candidate index)
        // FIX: the upper bound must be the LAST VALID INDEX. With
        // h = num_quant_levels, middle_point could reach num_quant_levels
        // and read one element past the end of quant_levels.
        int h = num_quant_levels - 1;
        // FIX: optimal_point was uninitialized and the initial difference
        // (1.0f) could exclude every candidate; seed both from index 0 so
        // the result is always a real level.
        int optimal_point = 0;
        float difference = fabsf(tensor[i] - quant_levels[0]);
        while (l <= h) {
            int middle_point = l + (h - l) / 2;   // overflow-safe midpoint
            // fabsf: single-precision absolute value (plain abs may resolve
            // to the integer overload and truncate).
            float d = fabsf(tensor[i] - quant_levels[middle_point]);
            if (d < difference) {
                // This midpoint is closer than anything seen so far.
                difference = d;
                optimal_point = middle_point;
            }
            if (quant_levels[middle_point] < tensor[i]) {
                l = middle_point + 1;
            } else {
                h = middle_point - 1;
            }
        }
        tensor[i] = quant_levels[optimal_point];
    }
}
int main(void) {
    const int num_quant_levels = 5;
    const int num_elements = 10;
    float quant_levels[num_quant_levels] = { 0.0f, 0.25f, 0.5f, 0.75f, 1.0f };

    float* tensor = (float*)malloc(sizeof(float) * num_elements);
    float* tensor_gpu = nullptr;
    float* quant_levels_gpu = nullptr;
    cudaMalloc(&tensor_gpu, sizeof(float) * num_elements);
    // FIX: size with num_quant_levels instead of a hard-coded 5.
    cudaMalloc(&quant_levels_gpu, sizeof(float) * num_quant_levels);

    for (int i = 0; i < num_elements; i++) {
        tensor[i] = (float)rand() / (float)RAND_MAX;
        std::cout << tensor[i] << std::endl;
    }
    std::cout << std::endl;

    cudaMemcpy(tensor_gpu, tensor, sizeof(float) * num_elements, cudaMemcpyHostToDevice);
    // FIX: pass the array (decays to float*) rather than &quant_levels,
    // which has type float(*)[5] — same address, but the types should match.
    cudaMemcpy(quant_levels_gpu, quant_levels, sizeof(float) * num_quant_levels, cudaMemcpyHostToDevice);

    quantize<<<1, 1>>>(num_quant_levels, quant_levels_gpu, num_elements, tensor_gpu);
    cudaError_t err = cudaGetLastError();   // catches launch-configuration errors
    if (err != cudaSuccess) {
        std::cerr << "launch failed: " << cudaGetErrorString(err) << std::endl;
    }

    // The blocking cudaMemcpy on the default stream synchronizes with the
    // kernel, so no explicit cudaDeviceSynchronize is needed here.
    cudaMemcpy(tensor, tensor_gpu, sizeof(float) * num_elements, cudaMemcpyDeviceToHost);

    // FIX: quant_levels_gpu was leaked in the original.
    cudaFree(quant_levels_gpu);
    cudaFree(tensor_gpu);

    for (int i = 0; i < num_elements; i++) {
        std::cout << tensor[i] << std::endl;
    }
    free(tensor);
    return 0;
}
User contributions licensed under CC BY-SA 3.0