cublasSgemm invalid __global__ read

0

When attempting to perform a tensor-matrix product using the cublasSgemm routine, address out of bounds errors occur, an example of which is provided below:-

========= Invalid __global__ read of size 4
=========     at 0x000019f8 in sgemm_sm35_ldg_nn_64x16x64x16x16
=========     by thread (6,3,0) in block (6,3,0)
=========     Address 0x7ffc059064a8 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
=========     Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x21fb31]
=========     Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x23a343]
=========     Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d4e92]
=========     Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d17b4]
=========     Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d2c5e]
=========     Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d37b2]
=========     Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0xecd31]
=========     Host Frame:./test [0x2c0e]
=========     Host Frame:./test [0x2a99]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21af5]
=========     Host Frame:./test [0x2749]

After checking dimensions multiple times in my application and determining that this is not the issue, I wrote a minimal working example. Below is a simple example that multiplies two square matrices:-

#include "stdlib.h"
#include "time.h"
#include "stdio.h"
#include "cuda.h"
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <math.h>
#include "cuda_error.h"

void matrixMult(cublasOperation_t transA, cublasOperation_t transB, int M, int N,
            int K, float alpha, float *A, float *B, float beta, float *C,
                cublasHandle_t *cb_handle);

int main(){
    int i, j, idx;
    int D = 500;

    int len = D*D;
    float *A_h, *B_h, *C_h;
    float *A_d, *B_d, *C_d;

    A_h = (float*)malloc(len*sizeof(float));
    B_h = (float*)malloc(len*sizeof(float));
    C_h = (float*)malloc(len*sizeof(float));

    srand48(time(NULL));
    for(i=0; i<D; i++){
        for(j=0; j<D; j++){
            A_h[i*D + j] = drand48();
            B_h[i*D + j] = drand48();
        }
    }

    cudaCheck(cudaMalloc((void**)&A_d, len*sizeof(float)));
    cudaCheck(cudaMalloc((void**)&B_d, len*sizeof(float)));
    cudaCheck(cudaMalloc((void**)&C_d, len*sizeof(float)));
    cudaCheck(cudaMemcpy(A_d, A_h, len*sizeof(float),  cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(B_d, B_h, len*sizeof(float), cudaMemcpyHostToDevice));

    cublasHandle_t cb_handle;
    cublasCheck(cublasCreate(&cb_handle));
    cublasSetPointerMode(cb_handle, CUBLAS_POINTER_MODE_DEVICE);
    matrixMult(CUBLAS_OP_N, CUBLAS_OP_N, D, D, D, 1.0, B_d, A_d, 0.0, C_d, &cb_handle);
    cublasDestroy(cb_handle);

    cudaCheck(cudaMemcpy(C_h, C_d, len*sizeof(float), cudaMemcpyDeviceToHost));
    cudaCheck(cudaFree(A_d));
    cudaCheck(cudaFree(B_d));
    cudaCheck(cudaFree(C_d));

    free(A_h);
    free(B_h);
    free(C_h);
}

void matrixMult(cublasOperation_t transA, cublasOperation_t transB, int M, int N,
            int K, float alpha, float *A, float *B, float beta, float *C,
            cublasHandle_t *cb_handle){
    int lda = (transA == CUBLAS_OP_N) ? K : M;
    int ldb = (transB == CUBLAS_OP_N) ? N : K;
    int ldc = N;
    cublasCheck(cublasSgemm(*cb_handle, transB, transA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
}

With the following trivial error capture header:-

#ifndef CUDA_ERROR_CHECK
#define CUDA_ERROR_CHECK

#include <cuda_runtime.h>
#include "cublas_v2.h"

#define cudaCheck(ans){cuda_assert((ans), __FILE__, __LINE__);}
#define cublasCheck(ans){cublas_assert((ans), __FILE__, __LINE__);}

inline void cuda_assert(cudaError_t code, const char *file, int line){
   if(code != cudaSuccess){
      fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
      exit(code);
   }
}

inline void cublas_assert(cublasStatus_t code, const char *file, int line){
    if(code != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "CUBLAS Error! %s line: %d error code: %d\n", file, line, code);
        exit(code);
    }
}

#endif

Note that the above error output was yielded by the above square matrix example. Similar output is yielded for my tensor product application.

I am using CUDA 7.5 with a Titan Black card. Am I doing something fundamentally wrong, or is it likely to be an issue with my cuBLAS installation?

cuda
cublas
asked on Stack Overflow Oct 21, 2015 by Jack H

1 Answer

1

If you eliminate this:

cublasSetPointerMode(cb_handle, CUBLAS_POINTER_MODE_DEVICE);

your code will run without error. It's not clear why you are setting the pointer mode to CUBLAS_POINTER_MODE_DEVICE. The documentation indicates:

There are two categories of the functions that use scalar parameters :

  • functions that take alpha and/or beta parameters by reference on the host or the device as scaling factors, such as gemm

  • functions that return a scalar result on the host or the device such as amax(), amin, asum(), rotg(), rotmg(), dot() and nrm2().

For the functions of the first category, when the pointer mode is set to CUBLAS_POINTER_MODE_HOST, the scalar parameters alpha and/or beta can be on the stack or allocated on the heap.

The CUBLAS_POINTER_MODE_HOST is the default setting, and it is the correct setting in your case, where &alpha and &beta are pointers to host memory:

cublasCheck(cublasSgemm(*cb_handle, transB, transA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
answered on Stack Overflow Oct 21, 2015 by Robert Crovella • edited Jun 20, 2020 by Community

User contributions licensed under CC BY-SA 3.0