My goal is to write a simple ray tracer with the phong shading model with CUDA in C++. It is supposed to calculate the appropriate colors and write them into a frame buffer on the GPU and afterwards I write the values in the frame buffer into a .ppm file on the CPU. The image size I have is 512x512 so for the thread layout in the kernel call I used the following arguments: dim3 thread_blocks(16, 16)
and dim3 threads_per_block(32, 32)
.
This should in theory give me access to (16*16) * (32*32) threads
which is equal to the amount of pixels in the image (512 * 512
). But this gives me a CUDA error with the error code 700 for cudaMemcpy on the line where I copy the data back from the device to the host. Using a smaller amount of threads_per_block
like dim3 threads_per_block(16, 16)
works without an error but will of course only render 1/4th of the image.
I have tried other thread layouts as well and even the ones that were specifically explained for a 2D image yielded the same error, so that's where I need help.
The kernel call:
void run_kernel(const int size, Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
// empty_kernel<<<dim3(16, 16, 1), dim3(32, 32, 1)>>>();
// cudaDeviceSynchronize();
Vec3f* fb_device = nullptr;
Sphere* spheres_dv = nullptr;
Light* light_dv = nullptr;
Vec3f* origin_dv = nullptr;
checkErrorsCuda(cudaMalloc((void**) &fb_device, sizeof(Vec3f) * size));
checkErrorsCuda(cudaMemcpy((void*) fb_device, fb, sizeof(Vec3f) * size, cudaMemcpyHostToDevice));
checkErrorsCuda(cudaMalloc((void**) &spheres_dv, sizeof(Sphere) * 3));
checkErrorsCuda(cudaMemcpy((void*) spheres_dv, spheres, sizeof(Sphere) * 3, cudaMemcpyHostToDevice));
checkErrorsCuda(cudaMalloc((void**) &light_dv, sizeof(Light) * 1));
checkErrorsCuda(cudaMemcpy((void*) light_dv, light, sizeof(Light) * 1, cudaMemcpyHostToDevice));
checkErrorsCuda(cudaMalloc((void**) &origin_dv, sizeof(Vec3f) * 1));
checkErrorsCuda(cudaMemcpy((void*) origin_dv, origin, sizeof(Vec3f) * 1, cudaMemcpyHostToDevice));
cudaEvent_t start, stop;
float time = 0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("%f ms\n", time);
checkErrorsCuda(cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost));
cudaEventDestroy(start);
cudaEventDestroy(stop);
checkErrorsCuda(cudaFree(fb_device));
checkErrorsCuda(cudaFree(spheres_dv));
checkErrorsCuda(cudaFree(light_dv));
checkErrorsCuda(cudaFree(origin_dv));
}
The cast_ray
function:
__global__ void cast_ray(Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
int i = (blockIdx.x * blockDim.x) + threadIdx.x;
int j = (blockIdx.y * blockDim.y) + threadIdx.y;
int tid = (j*WIDTH) + i;
if(i >= WIDTH || j >= HEIGHT) return;
Vec3f ij(2 * (float((i) + 0.5) / (WIDTH - 1)) - 1, 1 - 2 * (float((j) + 0.5) / (HEIGHT - 1)), -1);
Vec3f *dir = new Vec3f(ij - *origin);
Ray r(*origin, *dir);
float intersections[3];
int hp = -1;
for(int ii = 0; ii < 3; ii++) {
intersections[ii] = r.has_intersection(spheres[ii]);
}
int asize = sizeof(intersections) / sizeof(*intersections);
if(asize == 1) {
hp = intersections[0] < 0 ? -1 : 0;
} else {
if(asize != 0) {
float min_val = 100.0;
for (int ii = 0; ii < asize; ii++) {
if (intersections[ii] < 0.0) continue;
else if (intersections[ii] < min_val) {
min_val = intersections[ii];
hp = ii;
}
}
}
}
if(hp == -1) {
fb[tid] = Color(94, 156, 255);
} else {
auto color = get_color_at(r, intersections[hp], light, spheres[hp], spheres);
fb[tid] = color;
}
}
The error message: CUDA error at ./main.cu::195 with error code 700 for cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost)().
(The corresponding line is the cudaMemcpy
after the printf
in the kernel call function)
With cuda-memcheck
I get the following information:
========= Error: process didn't terminate successfully
========= Out-of-range Shared or Local Address
========= at 0x00000100 in __cuda_syscall_mc_dyn_globallock_check
========= by thread (0,7,0) in block (2,5,0)
(This was tried on a RTX 2060 SUPER)
Changing Vec3f *dir = new Vec3f(ij - *origin);
to Vec3f dir(ij - *origin);
solved the issue! dir
being a pointer was a remnant of previous iterations of the code that weren't needed anymore, but even then don't forget to delete
all your new
's.
User contributions licensed under CC BY-SA 3.0