Why is my CUDA ray tracer giving me error code 700 with this thread layout?

Question

Why is my CUDA ray tracer giving me error code 700 with this thread layout?

My goal is to write a simple ray tracer with the phong shading model with CUDA in C++. It is supposed to calculate the appropriate colors and write them into a frame buffer on the GPU and afterwards I write the values in the frame buffer into a .ppm file on the CPU. The image size I have is 512x512 so for the thread layout in the kernel call I used the following arguments: dim3 thread_blocks(16, 16) and dim3 threads_per_block(32, 32).

This should in theory give me access to (16*16) * (32*32) threads which is equal to the amount of pixels in the image (512 * 512). But this gives me a CUDA error with the error code 700 for cudaMemcpy on the line where I copy the data back from the device to the host. Using a smaller amount of threads_per_block like dim3 threads_per_block(16, 16) works without an error but will of course only render 1/4th of the image.

I have tried other thread layouts as well and even the ones that were specifically explained for a 2D image yielded the same error, so that's where I need help.

The kernel call:

void run_kernel(const int size, Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    // empty_kernel<<<dim3(16, 16, 1), dim3(32, 32, 1)>>>();
    // cudaDeviceSynchronize();

    Vec3f* fb_device = nullptr;
    Sphere* spheres_dv = nullptr;
    Light* light_dv = nullptr;
    Vec3f* origin_dv = nullptr;

    checkErrorsCuda(cudaMalloc((void**) &fb_device, sizeof(Vec3f) * size));
    checkErrorsCuda(cudaMemcpy((void*) fb_device, fb, sizeof(Vec3f) * size, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &spheres_dv, sizeof(Sphere) * 3));
    checkErrorsCuda(cudaMemcpy((void*) spheres_dv, spheres, sizeof(Sphere) * 3, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &light_dv, sizeof(Light) * 1));
    checkErrorsCuda(cudaMemcpy((void*) light_dv, light, sizeof(Light) * 1, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &origin_dv, sizeof(Vec3f) * 1));
    checkErrorsCuda(cudaMemcpy((void*) origin_dv, origin, sizeof(Vec3f) * 1, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);
    
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("%f ms\n", time);

    checkErrorsCuda(cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    checkErrorsCuda(cudaFree(fb_device));
    checkErrorsCuda(cudaFree(spheres_dv));
    checkErrorsCuda(cudaFree(light_dv));
    checkErrorsCuda(cudaFree(origin_dv));
}

The cast_ray function:

__global__ void cast_ray(Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    int j = (blockIdx.y * blockDim.y) + threadIdx.y;

    int tid = (j*WIDTH) + i;
    if(i >= WIDTH || j >= HEIGHT) return;

    Vec3f ij(2 * (float((i) + 0.5) / (WIDTH - 1)) - 1, 1 - 2 * (float((j) + 0.5) / (HEIGHT - 1)), -1);
    Vec3f *dir = new Vec3f(ij - *origin);
    Ray r(*origin, *dir);

    float intersections[3];
    int hp = -1;
    for(int ii = 0; ii < 3; ii++) {
        intersections[ii] = r.has_intersection(spheres[ii]);
    }

    int asize = sizeof(intersections) / sizeof(*intersections);
    if(asize == 1) {
        hp = intersections[0] < 0 ? -1 : 0;
    } else {
        if(asize != 0) {
            float min_val = 100.0;
            for (int ii = 0; ii < asize; ii++) {
                if (intersections[ii] < 0.0) continue;
                else if (intersections[ii] < min_val) {
                    min_val = intersections[ii];
                    hp = ii;
                }
            }
        }
    }

    if(hp == -1) {
        fb[tid] = Color(94, 156, 255);
    } else {
        auto color = get_color_at(r, intersections[hp], light, spheres[hp], spheres);
        fb[tid] = color;
    }
}

The error message: CUDA error at ./main.cu::195 with error code 700 for cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost)(). (The corresponding line is the cudaMemcpy after the printf in the kernel call function)

With cuda-memcheck I get the following information:

========= Error: process didn't terminate successfully
========= Out-of-range Shared or Local Address
=========     at 0x00000100 in __cuda_syscall_mc_dyn_globallock_check
=========     by thread (0,7,0) in block (2,5,0)

(This was tried on a RTX 2060 SUPER)

c++

multithreading

memory

cuda

gpu

asked on Stack Overflow Feb 1, 2021 by

xkevio

1 Answer

Changing Vec3f *dir = new Vec3f(ij - *origin); to Vec3f dir(ij - *origin); solved the issue! dir being a pointer was a remnant of previous iterations of the code that weren't needed anymore, but even then don't forget to delete all your new's.

answered on Stack Overflow Feb 1, 2021 by

xkevio

User contributions licensed under CC BY-SA 3.0