How to do QR decomposition via modified Gram-Schmidt method in C and CUDA

Question

How to do QR decomposition via modified Gram-Schmidt method in C and CUDA

anyone know how to do the QR decomposition via modified Gram-Schmidt method in C and CUDA. Some example/source/paper or other else? Thanks so much.

Edit: I can't answer to my question because someone have closed it, so i decided to update my question.

/*
 * QR decomposition via modified Gram-Schmidt algorithm
 * 
 * @Package = QR-decomposition
 * @Program = QR_gpu
 * @Version = 13.0928
 */

// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <cuda.h>

// Constant
#define PROG "QR_cpu"
#define VERSION "13.1003"
#define PACKAGE "QR-Decomposition"

// Threads per block
#define THREAD_P_BLOCK 512
// Blocks per grid
#define BLOCKS_P_GRID  512

// macro
/* wrap each API call with the gpuErrchk macro, which will process the return 
status of the API call it wraps: http://bit.ly/1dTD0ZE */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prototypes
__global__ void xTA (float *, float *, int, int, int, int, int);
__global__ void scale (float *, int, int, float *);
__global__ void r1_update(float *, float *, int, int, int, int);
__host__ void gpuAssert(cudaError_t, char *, int);
__host__ void print_matrix(float *, int, int, int);
__host__ void print_help (int);
__host__ int option_parser(int, char **, int *, int *);

// Host code
int main (int argc, char **argv) {

    int m, n, lda, ldr, i;
    float *A_d, *R_d;
    //cudaEvent_t t_start, t_stop;
    // Get "m" and "n" from command line
    if (0 != option_parser(argc, argv, &m, &n)) {
        fprintf(stderr, "Can\'t continue, exiting now!\n"); 
        exit(EXIT_FAILURE);
    }
    size_t A_size = m * n * sizeof(float);
    size_t R_size = n * n * sizeof(float);

    lda = n; ldr = n;

    // Allocate input matrices A_h and R_h in host memory
    float *A_h = (float *) malloc(A_size);
    float *R_h = (float *) malloc(R_size);
    memset(R_h, 0, R_size);

    // Initialize input matrix
    for (i = 0; i < n; i++)
        A_h[i*lda + i] = i + 1;

    // Allocate matrices in device memory
    gpuErrchk (cudaMalloc(&A_d, A_size));
    gpuErrchk (cudaMalloc(&R_d, R_size));
    // Copy the A matrix from host memory to device memory
    gpuErrchk (cudaMemcpy(A_d, A_h, A_size, cudaMemcpyHostToDevice));
    // Set R matrix to 0 
    gpuErrchk (cudaMemset(R_d, 0, R_size));

    /**** Invoke kernel ****/
    dim3 dimBlock (THREAD_P_BLOCK, 1, 1);
    // dimGrid 'monodimensional' (just x value)
    dim3 dimGrid_M ((m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK, 1, 1);
    // dimGrid 'bidimensional' (x and y values)
    dim3 dimGrid_B (BLOCKS_P_GRID, (m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK,1);
    // Gram-Schmidt algorithm step by step
    for (i = 0; i < n; i++) {
        // Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
        xTA <<< dimBlock, dimGrid_B >>> (R_d, A_d, m, n, lda, ldr, i);
        // Step #3 (Is the scale of a column vector)
        scale <<< dimBlock, dimGrid_M >>> (A_d + i, m, lda, R_d + i*ldr + i);
        // Step #4 (Is the scale of a row)
        scale <<< dimBlock, dimGrid_M >>> (R_d + ldr*i, m, 1, R_d + i*ldr + i);
        // Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
        r1_update <<< dimBlock, dimGrid_B >>> (A_d, R_d + i*lda, m, n, lda, i);
    }
    // Copy the results from device memory to host memory
    gpuErrchk (cudaMemcpy(A_h, A_d, A_size, cudaMemcpyDeviceToHost));
    // Free device memory
    cudaFree(A_d); cudaFree(R_d);
    // Free host memory
    free(A_h); free(R_h);

    return 0;
}

/**
 * ## Kernel 1
 * 
 * Rank 1 update of columns of A
 */
__global__ void r1_update (float *A, float *R, int m, int n, int lda, int k) {
    // get x,y cordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.x;

    if (x < m && y < n-k-1)
        A[x*lda + y + k + 1] -= A[x*lda + k] * R[y + k + 1];
}

/**
 * ## Kernel 2
 * 
 * matrix vector product
 * Performs R[i] =  x'A where x' is a row of A
 * 
 * How leading dimension is used for matrices: http://ibm.co/19PLtIX
 */
__global__ void xTA (float *R, float *A, int m, int n, int lda, int ldr, int k){
    // block column * block dim + column (computed by each thread)
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j;
    // upper triangular matrix 
    if (i < n - k) {
        for (j = 0; j < m; j++)
            R[k*ldr + k + i] += A[k*lda + j] * A[j*lda + k + i];
    }
}

/**
 * ## Kernel 3
 * 
 * mult. for constant s
 * d vector
 * ld leading dimension (distance from elements)
 */
__global__ void scale (float *d, int m, int ld, float *R_x) {
    // block colum * block dim + column (computed by each thread)
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    // s = sqrt(R(i,i))
    // Static initialization of shared variables is illegal in CUDA. 
    // The problem is that the semantics of how every thread should treat 
    // static initialization of shared memory is undefined in the 
    // programming model. Which thread should do the write? What happens if 
    // the value is not uniform between threads? How should the compiler 
    // emit code for such a case and how should the hardware run it?
    __shared__ float s; s = sqrt(*R_x);
    // and scale
    if (i < m) d[i*ld] /= s;
}

/*
 * GPU Error Handler (CUDA_SAFE_CALL deprecated from CUDA 5.0)
 */
__host__ void gpuAssert(cudaError_t code, char *file, int line) {

    if (code != cudaSuccess) {
        fprintf(stderr,"GPUassert: %s %s %d\n", 
            cudaGetErrorString(code), file, line);
        exit(code);
    }
}

/* 
 * Print matrix
 * 
 * Print a matrix passed as argument
 */
__host__ void print_matrix (float * matrix, int m, int n, int ld) {

    int i, j;

    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++)
            printf("%0.5f ", matrix[i*ld + j]);
        printf("\n");
    }
}

/*
 * The option parser
 *
 * The function parses the parameters passed from the command line and run 
 * their own procedures.
 *
 * Return value:
 *   0 on success
 *  -1 on failure
 *
 * Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
 * for further informations. (thanks to Frodo Looijaard)
 */
__host__ int option_parser (int argc, char **argv, int * m, int * n) {

    int opt;

    if (argc < 2) {
        fprintf(stderr, "The program needs arguments...\n\n");
        print_help(1);
    }

    opterr = 0;

    while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
        switch (opt) {
            case 'h': 
                print_help(0);
            case 'r': 
                printf("optarg: %s\n", optarg); 
                if ((*m = atoi(optarg)) < 2) return -1; 
                break;
            case 'c': 
                printf("optarg: %s\n", optarg);
                if ((*n = atoi(optarg)) < 2 || *n > *m) return -1; 
                break;
            case '?':
                if (optopt == 'r' || optopt == 'c')
                    fprintf(stderr,"Option -%c requires an argument.\n",optopt);
                else if (isprint (optopt))
                    fprintf(stderr,"Unknown option `-%c'.\n", optopt);
                else
                    fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
                return -1;
            default:
                fprintf(stderr, "default switch-case statement reached\n");
                return -1;
        }
        //for (ii = optind; ii < argc; ii++)
        //  printf ("non-option argument %s\n", argv[ii]);
    }
    return 0;
}

/*
 * The helper
 *
 * Show the info to run the program in the correct way
 */
__host__ void print_help (int exit_code) {

    printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);

    printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
    printf("  -h    print this help and exit\n");
    printf("  -r    provide the number of rows\n");
    printf("  -c    provide the number of colums\n\n");

    printf("  Example: ./qr_gpu -r 800 -c 600\n\n");

    exit_code == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}

When i run the program with cuda-memcheck I obtain this result:

[mcrociara@tesla project_CUDA]$ cuda-memcheck ./qr_gpu -r 4 -c 4 ========= CUDA-MEMCHECK optarg: 4 optarg: 4 GPUassert: unspecified launch failure src/qr_gpu.cu 99 ========= Invalid global read of size 4 ========= at 0x000000c8 in xTA ========= by thread (0,0,0) in block (0,0,0)

========= Address 0x3b5273104 is out of bounds

========= Invalid global read of size 4 ========= at 0x000000c8 in xTA ========= by thread (1,0,0) in block (0,0,0)

========= Address 0x3b5273108 is out of bounds

========= Invalid global read of size 4 ========= at 0x000000c8 in xTA ========= by thread (2,0,0) in block (0,0,0)

========= Address 0x3b527310c is out of bounds

========= ERROR SUMMARY: 3 errors

Someone may help to understand why? I implemented the serial version on this algorithm that seem to work properly:

/*
 * QR decomposition via modified Gram-Schmidt algorithm
 */

// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>

// Constant
#define PROG "QR_cpu"
#define VERSION "28.0913"
#define PACKAGE "QR-Decomposition"

// Prototypes
void r1_update(double *, double *, int);
void xTA (double *, double *, int);
void scale (double *, int, double);
void gram (double *, double *);
void print_matrix(double *);
int option_parser(int, char **);
void print_help (int);

// Number of rows
int M = -1; 
// Number of columns
int N = -1;
// Leading Dimension of A and R
int LDA = -1;
int LDR = -1;
// The clock
clock_t start, stop;

/*
 *
 */
int main (int argc, char **argv) {

    int i;

    // Get M, N from command line
    if (0 != option_parser(argc, argv)) {
        fprintf(stderr, "Bad option man!!!\n");
        exit(EXIT_FAILURE);
    }
    // Check the size of M and N 
    if (M > 5000 || N > 5000) {
        fprintf(stderr, "Too big man!!!\n");
        exit(EXIT_FAILURE);
    }

    // Set the leading dimension of A and R
    LDA = N; LDR = N;

    // Allocate memory for A and R 
    double *A = calloc(M * N, sizeof(*A));
    double *R = calloc(N * N, sizeof(*R));

    // Set the diagonal of A as A(i,i) = i + 1 with i from 0 to N−1
    for (i = 0; i < N; i++) 
        A[i*LDA + i] = i + 1;

    start = clock();
    gram(A, R);
    stop = clock();

    printf("\nTime: %0.4f\n\n",(stop - start) / (double)(CLOCKS_PER_SEC));
    // print_matrix(A);
    // print_matrix(R);

    free(A); free(R);

    return 0;
}

/**
 * Rank 1 update of columns of A
 */
void r1_update (double *A, double *R, int k) {

    int i, j;

    for(i = 0; i < M; i++)
        for(j = k + 1; j < N; j++)
            A[i*LDA + j] -= A[i*LDA + k] * R[j];
}

/**
 * Matrix vector product
 * Performs R[i] =  x'A where x' is a row of A
 * A : m x k, leading dimebsion, lda
 * 
 * How leading dimension is used for matrices: http://ibm.co/19PLtIX
 */
void xTA (double *R, double *A, int k) {

    int i, j;
    // upper triangular matrix 
    for (i = 0; i < N-k; i++)
        for (j = 0; j < M; j++)
            R[k*LDR + k + i] += A[k*LDA + j] * A[j*LDA + k + i];
}

/**
 * Mult. for constant s
 * d vector
 * ld leading dimension (distance from elements)
 */
void scale (double *d, int ld, double s) {

    int i;

    for (i = 0; i < M; i++) d[i*ld] *= s;
}

/**
 * Performs Modified Gram Schmidt
 * ortogonalization of columns of A
 * A m x n
 * R n x n
 */ 
void gram (double *A, double *R) {

    int i;
    double s;
    // Modified Gram Schmidt algorithm step by step
    for (i = 0; i < N; i++) {
        // Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
        xTA(R, A, i);
        // Step #2 (Normalizing) --> s = sqrt(R(i,i))
        s = 1 / sqrt(R[i*LDR + i]);
        // Step #3 (Is the scale of a column vector)
        scale(A + i, LDA, s);
        // Step #4 (Is the scale of a row)
        scale(R + LDR*i, 1, s);
        // Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
        r1_update(A, R + i*LDA, i);
    }
}

/* 
 * Print Matrix
 * 
 * Print a matrix passed as argument
 */
void print_matrix (double * matrix) {

    int i, j;

    for (i = 0; i < M; i++) {
        for (j = 0; j < N; j++)
            printf("%0.2f ", matrix[i*LDA + j]);
        printf("\n");
    }
}

/*
 * The option parser
 *
 * The function parses the parameters passed from the command line and run 
 * their own procedures.
 *
 * Return value:
 *   0 on success
 *  -1 on failure
 *
 * Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
 * for further informations. (thanks to Frodo Looijaard)
 */
int option_parser (int argc, char **argv) {

    int opt;

    if (argc < 2) {
        fprintf(stderr, "This program needs arguments...\n\n");
        print_help(1);
    }

    opterr = 0;

    while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
        switch (opt) {
            case 'h': 
                print_help(0);
            case 'r': 
                printf("optarg: %s\n", optarg); 
                if ((M = atoi(optarg)) < 2) return -1; 
                break;
            case 'c': 
                printf("optarg: %s\n", optarg);
                if ((N = atoi(optarg)) < 2 || N > M) return -1; 
                break;
            case '?':
                if (optopt == 'r' || optopt == 'c')
                    fprintf(stderr,"Option -%c requires an argument.\n",optopt);
                else if (isprint (optopt))
                    fprintf(stderr,"Unknown option `-%c'.\n", optopt);
                else
                    fprintf(stderr,"Unknown option chr `\\x%x'.\n", optopt);
                return -1;
            default:
                fprintf(stderr, "default switch-case statement reached\n");
                return -1;
        }
        //for (ii = optind; ii < argc; ii++)
        //  printf ("Non-option argument %s\n", argv[ii]);
    }
    return 0;
}

/*
 * The helper
 *
 * Shows the info to run the program in the correct way
 */
void print_help (int exit_val) {

    printf("\nPKG : %s\nPROGRAM : %s\nVERSION : %s\n\n",PACKAGE,PROG,VERSION);

    printf("%s [-h] [-r num of rows] [-c num of columns]\n\n", PROG);
    printf("  -h    print this help and exit\n");
    printf("  -r    provide the number of rows\n");
    printf("  -c    provide the number of colums\n\n");

    printf("  Example: ./qr_cpu -r 800 -c 600\n\n");

    exit_val == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}

Thanks in advance!

c

cuda

qr-decomposition

asked on Stack Overflow Sep 13, 2013 by

realnot • edited Oct 4, 2013 by

realnot

1 Answer

There are different ways to calculate the QR decomposition of a matrix. The main methods are:

Gram-Schmidt process;
Householder reflections;
Givens rotations;

Gram-Schmidt is a sequence of projections and vector subtractions, which may be implemented as a sequence of kernels performing reductions (for projections) and element-wise array operations (vector subtractions). You can have a look at the papers

a) QR Decomposition on GPUs

b) Parallel Implementation of Classical Gram-Schmidt Orthogonalization on CUDA Graphics Cards

c) CPU vs. GPU - Performance comparison for the Gram-Schmidt algorithm

QR decomposition via Householder reflections is dominated by matrix-vector operations and you can find some information in paper a), paper

d) Benchmarking GPUs to Tune Dense Linear Algebra

and an implementation by V. Volkov and J.W. Demmel is available at

LU, QR and Cholesky factorizations using GPU

Givens rotations do not appear to me to be very popular as a parallel approach to QR decomposition. Basically, each Givens rotation modifies two rows, so that some parallelization possible also by aid of the Sameh-Kuck pattern allowing up to n concurrent rotations. You can find some information at

Benchmarking the NVIDIA 8800GTX with the CUDA Development Platform

Actually, a clear performance comparison between the different approaches as implemented in CUDA architectures isn't available. Be aware that some of the posted material regards optimizations on "old" architectures. So, perhaps further improvements could be achieved by further optimizations for the "newer" GPU generations.

answered on Stack Overflow Sep 14, 2013 by

Vitality • edited Sep 14, 2013 by

Vitality

User contributions licensed under CC BY-SA 3.0