OpenCL: Launch same kernel function multiple times on same input Buffer

Question

OpenCL: Launch same kernel function multiple times on same input Buffer

i want to make use of my GPU to compute hashes of data. I have a wordlist file. In this file, every word, lets say a password, has to be hashed. Currently i'm using SHA256 and my GTX 1060 is slower than a single core from my i5-2500k using openssl, which is hilarious. The hash should be written in another file. But the file handling isn't the problem here.

I want to slice the data in pieces of a couple MB and write the content in a buffer, lets call him dataA[]. dataA has the following structure: word1word2word3word4....wordN. Every word has the same length and this length is known. I also know how many words i saved in the buffer dataA[]. How can i tell every launched kernel, at which position he should take "wordI", e.g. 5 bytes and compute the hash of just this 5 bytes? I have a dirty solution with an input-index, which tells the kernel where he should start. But this solution is quite inefficient and does only work if workgroup size = number of work items, whichs destroys all my speedup. For testing purposes, dataA is filled with "abcde". It always breaks computation after 128 iterations. The computed hashes are correct. So my plan is: 1) read data from file into buffer (e.g. 1GB) 2) compute hashes for this 1GB of words in parallel using >128 work groups 3) write hashes to a file, get new 1GB of data and write in buffer 4) while doing 3), start to compute the next gigabyte of hashes and so on, till no input data is left

I'm stuck at points 2) and 4), i hope somebody can help. I'd appreciate it. Here is a part of my code:

#include <stdio.h>
#include <Core/Assert.hpp>
#include <Core/Time.hpp>
#include <OpenCL/cl-patched.hpp>
#include <OpenCL/Program.hpp>
#include <OpenCL/Event.hpp>
#include <OpenCL/Device.hpp>

#include <fstream>
#include <sstream>
#include <iostream>
#include <iomanip>
#include <cmath>
#include <math.h>
using namespace std;

int main(int argc, char** argv) {
// Create a context
    cl::Context context(CL_DEVICE_TYPE_GPU);

    // Get the first device of the context
    std::cout << "Context has " << context.getInfo<CL_CONTEXT_DEVICES>().size()
            << " devices" << std::endl;
    cl::Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
    std::vector<cl::Device> devices;
    devices.push_back(device);
    OpenCL::printDeviceInfo(std::cout, device);

    // Create a command queue
    cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE);

    // Load the source code
    cl::Program program = OpenCL::loadProgramSource(context, "src/OpenCLExercise1_Basics.cl");
    // Compile the source code. This is similar to program.build(devices) but will print more detailed error messages
    OpenCL::buildProgram(program, devices);

    // Create a kernel object
    cl::Kernel hash(program, "hash");
//generate the password string

static const unsigned numberOfPasswords = 1000;
static const unsigned wgSizeDef = 128; //work-group size
static const unsigned countDef = wgSizeDef * numberOfPasswords; //number of work-items
static const unsigned charsPerString = 5; //password length
static const unsigned inputBufferSize = numberOfPasswords*charsPerString*sizeof(char);
static const unsigned outputBufferSize = 8*numberOfPasswords*sizeof(uint);
static const unsigned numberOfIterations = ceil(numberOfPasswords/wgSizeDef);
printf("NumberOfIterations=%d\n",numberOfIterations);
char *dataA = new char[inputBufferSize]; //First Buffer
for (unsigned i = 0; i < inputBufferSize-4; i+=5) {
      dataA[i] = 'a';
      dataA[i+1] = 'b';
      dataA[i+2] = 'c';
      dataA[i+3] = 'd';
      dataA[i+4] = 'e';
}
char *dataB = new char[inputBufferSize]; //Second Buffer
    for (unsigned i = 0; i < numberOfPasswords; ++i) {
      dataB[i*charsPerString] = 'b';
}
size_t wgSize = wgSizeDef;
size_t count = countDef;
size_t inputSize = inputBufferSize;
size_t size2 = charsPerString*sizeof (char);
size_t outputSize = outputBufferSize;

char temp[charsPerString]="";
uint result[outputSize];
// setup the inputindex
int inputindex[wgSize+1];
cout<<"Inputindex : ";


inputindex[0] = {(int)charsPerString};
for(size_t i=0;i<wgSize;i++)
    {
        inputindex[i+1]=i*charsPerString;
    }
int count2 =0;
for(size_t i=0;i<wgSize+1;i++)
    {
        count2++;
        cout<<inputindex[i]<<" ";
    }
cout<<endl;
printf("count2: %d",count2);
// Allocate space for input and output data on the device
cl::Buffer d_inputindex (context, CL_MEM_READ_WRITE, sizeof(int)*(numberOfPasswords+1));
cl::Buffer d_input (context, CL_MEM_READ_WRITE, inputSize);
cl::Buffer empty (context, CL_MEM_READ_WRITE, size2);
cl::Buffer output (context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, outputSize);

// Copy input data to GPU
cl::Event copy1;
queue.enqueueWriteBuffer(d_inputindex, true, 0, sizeof(int)*(numberOfPasswords+1), inputindex, NULL, &copy1);
//queue.enqueueWriteBuffer(d_input, true, 0, size, password, NULL, &copy1);
queue.enqueueWriteBuffer(d_input, true, 0, inputSize, dataA, NULL, &copy1);
queue.enqueueWriteBuffer(empty, true, 0, size2, temp, NULL, &copy1);

// Launch kernel on the device
cl::Event execution;
hash.setArg<cl::Buffer>(0, d_inputindex);
hash.setArg<cl::Buffer>(1, d_input);
hash.setArg<cl::Buffer>(2, empty);
hash.setArg<cl::Buffer>(3, output);
// Copy output data back to host
queue.enqueueNDRangeKernel(hash, cl::NullRange, cl::NDRange(wgSize), cl::NDRange(wgSize) , NULL, &execution);
cout<<"finish all kernels"<<endl;
cl::Event copy2;
queue.enqueueReadBuffer(output, true, 0, outputSize, result, NULL, &copy2);
for (size_t i = 0; i < numberOfIterations; i++) {
}


//output the result from GPU
for (unsigned i=0;i<numberOfPasswords;i++) //;i<(pwdlength*numberofpwd)
{
    cout<<"Origin pwd: ";
    for (unsigned j=0;j<charsPerString;j++)
    {
        cout<<dataA[inputindex[i+1]+j];
    }
    cout<<"  After hashed: ";
    cout<<hex<<setfill('0');
    cout<<setw(8)<<result[i*8]<<" "<<setw(8)<<result[i*8+1];
    cout<<" "<<setw(8)<<result[i*8+2]<<" "<<setw(8)<<result[i*8+3];
    cout<<" "<<setw(8)<<result[i*8+4]<<" "<<setw(8)<<result[i*8+5];
    cout<<" "<<setw(8)<<result[i*8+6]<<" "<<setw(8)<<result[i*8+7]<<endl;
}
cout<<endl<<"success"<<endl;
cout<<"///////////////////////////////////////////////////////"<<endl;

It does work, but only if the number of work items matches the number of work groups. And the kernel code:

#ifndef __OPENCL_VERSION__
#include <OpenCL/OpenCLKernel.hpp> // Hack to make syntax highlighting in Eclipse work
#endif

uint rotr(uint x, uint n) {
    return (x >> n) | (x << (32 - n));
}

uint shr(uint x, uint n) {
    return x >> n;
}

uint Ch(uint x, uint y, uint z) {
    return (x & y) ^ (~x & z);
}
uint Maj(uint x, uint y, uint z) {
    return (x & y) ^ (x & z) ^ (y & z);
}
uint bigSigma0(uint x) {
    return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint bigSigma1(uint x) {
    //printf("%x %x %x ",rotr(x, 6),rotr(x, 11), rotr(x, 25));

    return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint smallSigma0(uint x) {
    return rotr(x, 7) ^ rotr(x, 18) ^ shr(x, 3);
}
uint smallSigma1(uint x) {
    return rotr(x, 17) ^ rotr(x, 19) ^ shr(x, 10);
}

__constant uint K[] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};


__constant uint origH[] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};


__kernel void hash (__global const int* d_inputindex,__global const char * d_input,__global char* empty,__global uint* output) {
    //, global const int* d_lengthperpwd
    size_t index = get_global_id(0);

    int k,i,stop,leftover, a,b,c,d,e,f,g,h, T1, T2;

    uint W[16],temp[8];
    k=d_inputindex[0];
    char* t;

    t=(char*) &empty;

    for (i=0; i<d_inputindex[0]; i++)
    {
        //d_output[d_inputindex[index+1]+ui] = d_input[d_inputindex[index+1]+ui];
        t[i] = d_input[d_inputindex[index+1]+i];
    }


    for (i=0; i<d_inputindex[0]; i++)
    {
        printf("kernel %d has t[%d] %c",index,i,t[i]);
    }

    //initial the message block
    for (i = 0; i < 16; i++){
        W[i] = 0x00000000;
    }

    stop = d_inputindex[0]/4; //4 char per W   //put in the first multiple of 4 numbers char to the message blcok
    for(i=0;i<stop;i++)
    {
        //W[i] = ((uchar)  d_output[index*d_inputindex[0]+ i * 4]) << 24;
        //W[i] |= ((uchar) d_output[index*d_inputindex[0]+ i * 4 + 1]) << 16;
        //W[i] |= ((uchar) d_output[index*d_inputindex[0]+ i * 4 + 2]) << 8;
        //W[i] |= (uchar) d_output[index*d_inputindex[0]+ i * 4 + 3];
        W[i] = ((uchar)  t[i * 4]) << 24;
        W[i] |= ((uchar)  t[i * 4+1]) << 16;
        W[i] |= ((uchar)  t[i * 4+2]) << 8;
        W[i] |= ((uchar)  t[i * 4+3]);
        //printf("stop finished");
    }

    // take care the rest of char and padding the 1
    leftover = d_inputindex[0]%4;
    if(leftover ==3)
    {
        //W[i] = ((uchar)  d_output[index*d_inputindex[0]+  i * 4]) << 24;
        //W[i] |= ((uchar) d_output[index*d_inputindex[0]+  i * 4 + 1]) << 16;
        //W[i] |= ((uchar) d_output[index*d_inputindex[0]+  i * 4 + 2]) << 8;
        //W[i] |= 0x80;

        W[i] = ((uchar)  t[i * 4]) << 24;
        W[i] |= ((uchar) t[i * 4 + 1]) << 16;
        W[i] |= ((uchar) t[i * 4 + 2]) << 8;
        W[i] |= 0x80;
        //printf("ok3");
    }
    else if(leftover ==2)
    {
        //W[i] = ((uchar) d_output[index*d_inputindex[0]+  i * 4]) << 24;
        //W[i] |= ((uchar) d_output[index*d_inputindex[0]+  i * 4 + 1]) << 16;
        //W[i] |= 0x8000;
        W[i] = ((uchar) t[i * 4]) << 24;
        W[i] |= ((uchar) t[i * 4 + 1]) << 16;
        W[i] |= 0x8000;
        //printf("ok2");
    }
    else if(leftover ==1)
    {
        //W[i] = ((uchar) d_output[index*d_inputindex[0]+  i * 4]) << 24;
        //W[i] |= 0x800000;
        W[i] = ((uchar) t[i * 4]) << 24;
        W[i] |= 0x800000;
        //printf("ok1");
    }
    else //if(leftover ==0)
    {
        W[i] = 0x80000000;
        //printf("ok0");
    }

    //pad the message length in bits in the last 64 bit
    W[15] = d_inputindex[0]*8;


    //W 16--64
    for(i=16;i<64;i++)
    {
        W[i] =smallSigma1(W[i-2]) + W[i-7] + smallSigma0(W[i-15]) + W[i-16];
    }




    for (i=0;i<8;i++)
    {
        temp[i] = origH[i];
    }


    a = temp[0];
    b = temp[1];
    c = temp[2];
    d = temp[3];
    e = temp[4];
    f = temp[5];
    g = temp[6];
    h = temp[7];

    for(i=0;i<64;i++)
    {
        T1 = h + bigSigma1(e) + Ch(e,f,g) + K[i] + W[i];
        T2 = bigSigma0(a) + Maj(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d+T1;
        d = c;
        c = b;
        b = a;
        a = T1+T2;
    }

    temp[0] += a;
    temp[1] += b;
    temp[2] += c;
    temp[3] += d;
    temp[4] += e;
    temp[5] += f;
    temp[6] += g;
    temp[7] += h;


    for (i = 0; i < 8; i++){
        output[index*8+i]=temp[i];
        //printf("kernel %d, temp[%d]:%x, output[%d]: %x",index,i,temp[i],index*8+i,output[index*8+i]);
    }

    printf("ok kernel %d",index);
    //printf("%d",sizeof(output));
}

arrays

char

kernel

opencl

sha256

asked on Stack Overflow Jul 31, 2018 by

pat

0 Answers

Nobody has answered this question yet.

User contributions licensed under CC BY-SA 3.0