Compute Shader not compiling for specific (valid) code

0

I'm experiencing weird behavior with conditional loops, branches and updating the UAV with my compute shader (DX11), CS 5.0. I'm t esting a naive raymarching through Sparse Voxel Octtree bricks. The data is linearised by BFS. The structure (LWSVO = Light Weight SVO) looks like:

struct LWSVO
{
    int firstChild;
    int bitfield; 
    int level;
    int m_colour;
    int m_normal;
    float3 m_position;
};

firstChild is index of the first child of any node. if it is a leaf, firstchild is -1. bitfield is position of the valid (1) and invalid (0) child nodes. I'm also uncompressing the data from a packed 24byte smaller structure to this. I've extensively checked every part of the pack-unpack (and actually every individual functions used in the shader). All works perfectly and i've checked against the actual data.

Althoguh the following info is not very relevant but here's how the algo works (it works perfectly in the CPU). Now the raymarching (sort of) algorithm starts from a face of the octtree cube and traverses along z axis (each thread has fixed x,y in the ray generation). Upon hitting, it calculates the indexes of a node by a given level( mapCartesianToIndex() ). So when we have an index, against a point, we get an offset given a parent node and a child position. If the child position is invalid, it returns -1. We have 5 different states of the nodes: Empty(0), Full (1), leaf(2), maxlevel(3) and invalid (-1). the marching stops on leaf of maxlevel.

Here's the code:

/*
Although we dont have ENUMs, lets get this stright:

nodeState::EMPTY = 0;
nodeState::FULL = 1;
nodeState::LEAF = 2;
nodeState::MAXLEVEL = 3;
nodeState::INVALID = -1;

*/


//#define RTX_EPS 0.000001

/*----------------------------------------
*                                        |
*   main two LWSVO structures            |
*---------------------------------------*/

/*
** The main structure for linear SVO.
*/
struct LWSVO
{
    int firstChild;
    int bitfield; 
    int level;
    int m_colour;
    int m_normal;
    float3 m_position;
};


/*
** The compressed structure for LWSVO.
*/
struct LWSVO_
{
    //bitfield is Big Endian : 7 6 5 4 3 2 1 0
    int firstChild;
    int bitf_level_colr_colg;
    int colb_norx_nory_norz;
    float x;
    float y;
    float z;
};


StructuredBuffer<LWSVO_> lwSVO_ : register(t0);

RWStructuredBuffer<uint> Result : register (u0); 



/*----------------------------------------
*                                        |
*   Code related to LWSVO access         |
*---------------------------------------*/
LWSVO uncompressSVO(LWSVO_ d)
{
    LWSVO l;
    int r, g, b;
    int norx, nory, norz;
    l.firstChild = d.firstChild;

    g = (int)(d.bitf_level_colr_colg & 0x000000ff);
    r = (int)((d.bitf_level_colr_colg & 0x0000ff00) >> 8);
    l.level = (int)((d.bitf_level_colr_colg & 0x00ff0000) >> 16);
    l.bitfield = (int)((d.bitf_level_colr_colg & 0xff000000) >> 24);


    norz = (int)(d.colb_norx_nory_norz & 0x000000ff);
    nory = (int)((d.colb_norx_nory_norz & 0x0000ff00) >> 8);
    norx = (int)((d.colb_norx_nory_norz & 0x00ff0000) >> 16);
    b = (int)((d.colb_norx_nory_norz & 0xff000000) >> 24);

    l.m_colour = (r + (g << 8) + (b << 16));
    l.m_normal = (norx + (nory << 8) + (norz << 16));

    l.m_position = float3(d.x,d.y,d.z);

    return l;
}

void copyNode(LWSVO source, out LWSVO dest)
{
    dest.firstChild = source.firstChild;
    dest.bitfield = source.bitfield;
    dest.level = source.level;
    dest.m_colour = source.m_colour;
    dest.m_normal = source.m_normal;
    dest.m_position = source.m_position;

}

/*Given a node , outputs the state of it. 1 is FULL 2 is LEAF, 0 is EMPTY, but theres no empty node in the ds.*/
int getState(int child)
{
    //return NodeState::FULL;
    if (child > 0)
        return 1;
    else if (child == -1)
        return 2;
    else
        return -1; //wont get here
}


/*Given a parent node and a childindex, outputs the index of the child at given index*/
int getChild(LWSVO pNode, int childIndex)
{

    //check bitmask
    uint primarybf = 0x00000001;
    /*int a[8];
    a[0] = 1;
    a[1] = 2;
    a[2] = 4;
    a[3] = 8;
    a[4] = 16;
    a[5] = 32;
    a[6] = 64;
    a[7] = 128;
    */
    primarybf = (primarybf << childIndex);

    if ((pNode.bitfield & primarybf) != 0)
    {
        int count = 0;
        for (int i = 0; i < childIndex; i++)
        {
            uint a = 0x00000001;
            a = a << i;
            if ((pNode.bitfield & a) != 0) //if bit is set at position i.
            {
                count = count + 1;
            }
        }       
        return (pNode.firstChild + count);
    }   
    else //no child
    {
        return -1; //means empty
    }

    //return 0;
}


/*----------------------------------------
*                                        |
*   C++ library function ports           |
*   TODO: optimise later.                |
*---------------------------------------*/

//have to implement thge copysign  function!
float copysign(in float value, in float source)
{
    /*int sourcesign = (int)(source & 0x80000000) >> 31;
    int output = 0;
    output = (int)sourcesign << 31;
    value = (value & 0x7fffffff);
    return (output + value);    
    */

    //here's the thing.. will look for a bitpos solution later.
    //lets see if the damn engine works.
    if (source >= 0.0f)
    {
        if (value >= 0.0f)
            return value;
        return -value;
    }
    else
    {
        if (value >= 0.0f)
            return -value;
        return value;

    }
}

int signbit(float source)
{
    //int sourcesign = ((int)source >> 31) & 0x00000001;
    //return sourcesign;
    if (source >= 0)
        return 0;
    return 1;
}


/*----------------------------------------
*                                        |
*   HDR to 8 bit LDR colour & vice versa |
*---------------------------------------*/

int pack8BitColour(float3 colour) //Checked.
{
    int r, g, b;
    r = int(colour.x*255.0f);
    g = int(colour.y*255.0f) << 8;
    b = int(colour.z*255.0f) << 16;
    return(r + g + b);// +(int)0xff000000);
}

float3 unpack8BitColour(int val) //Checked.
{
    float3 output;
    output.x = (float)(val & 0x000000ff) / 255.0f;
    output.y = (float)((val & 0x0000ff00) >> 8) / 255.0f;
    output.z = (float)((val & 0x00ff0000) >> 16) / 255.0f;
    return output;
}



/*---------------------------------------
*                                       |
*   The brnachless index finding logic. |
*   finds the index in the SVO voxel    |
*   given a level and a position.       |
*---------------------------------------*/

void mapCartesianToIndex(float3 position, int maxlevel, out int index[12])
{

//This piece of logic works exactly as it should and produces perfect result in CPU.

    bool x, y, z;

    //transformation of value to 
    // -1 ---- 0 ---- +1
    //coordinate system

    float factor = (pow(2, maxlevel - 2));
    float stride = 1 / factor;
    float step = stride / 2;

    while (maxlevel > 1)
    {
        int pivot;
        float mid;

        pivot = floor(abs(position.x) / stride);
        mid = float((stride*pivot)) + step;

        mid = copysign(mid, position.x);
        x = !bool(signbit(position.x - mid));

        pivot = floor(abs(position.y) / stride);
        mid = float((stride*pivot)) + step;

        mid = copysign(mid, position.y);
        y = !bool(signbit(position.y - mid));

        pivot = floor(abs(position.z) / stride);
        mid = float((stride*pivot)) + step;

        mid = copysign(mid, position.z);
        z = !bool(signbit(position.z - mid));


        //index[level - 1].w = 0;
        //w = 0;

        index[maxlevel - 1] = (int)x * 4 + (int)y * 2 + (int)z;

        //factor /= 2;
        stride *= 2;
        step *= 2;
        maxlevel = maxlevel - 1;
    }


    x = !bool(signbit(position.x));
    y = !bool(signbit(position.y));
    z = !bool(signbit(position.z));


    index[0] = (int)x * 4 + (int)y * 2 + (int)z;

}




/*---------------------------------------
*                                       |
*   Raymarching logic.                  |
*                                       |
*   To be replaced by RTX soon.         |
*---------------------------------------*/


float3 makeRay(uint3 id)
{
    int i = id.x;
    int j = id.y;

    float zf = 2.0f / 256.0f;
    float xx, yy, zz;
    xx = max((zf*i - 1.0f) + 0.000001, -1.0f);
    yy = max((1.0f - zf * j) - 0.000001, -1.0f);
    zz = -1.0f + zf * 126;// +0.000001;
    return float3(xx*1.0f, yy*1.0f, zz*1.0f);
}



uint rayMarchAlgo(float3 rayOrigin)
{
    volatile uint m_col = 0;
    uint maxLev = 8;

    float zMinAdvanceFactor = 2.0f / 256.0f;
    LWSVO m_node;

    //in case while runs more that 256
    uint whilemax = 0;

    while (rayOrigin.z < 1.0f && whilemax <256) 
    {
        //LWSVO_ struct is compressed 24 bit where data is packed.
        //uncompress just converts it to a friendlier format.

        m_node = uncompressSVO(lwSVO_[0]); 
        int childindex[12];

        //calculate the indexes of the point rayOrigin for level by level and
        //copies the result to the childindex array.

        mapCartesianToIndex(rayOrigin, maxLev, childindex);
        int nodeState = -1;
        for (uint levCount = 0; levCount < maxLev; levCount++)
        {
            //check the indexes of the hitpoint for level by level.
            //if the node is full at this level, go to the next level
            //if empty, move z forward.
            //if leaf or maxlevel, return the colour.

            int offset = 0; //init
            offset = getChild(m_node, childindex[levCount]);  //gets the 
            m_node = uncompressSVO(lwSVO_[offset]);

            nodeState = getState(m_node.firstChild);
            if (offset == -1) //means empty node
            {
                nodeState = 0; //empty.
            }
            else
            {
                m_node = uncompressSVO(lwSVO_[offset]);
                nodeState = getState(m_node);
                if (m_node.level == maxLev)
                {
                    //m_col = 255 << 8;
                    m_col = m_node.m_colour;
                        //neither of above works.
                    nodeState = 3; //maxlevel
                    break;
                }

            }
            //dealing with three states.
            if (nodeState == 2)
            {
                m_col =  m_node.m_colour; //doesn't work
                //m_col = 255; //this doesn't work either.
                break;
            }
            else if (nodeState == 0)
            {
                break;
            }
            else if (nodeState == 3)
            {
                break;
            }
            else if (nodeState == -1)
            {               
                break;
            }

        }
        if (nodeState == 2 || nodeState == 3)
        {
            break;
        }
        else if (nodeState == 0)
        {
            rayOrigin.z = rayOrigin.z + (1.0f / 128.0f);
        }
        whilemax = whilemax + 1;
    }

    return m_col;
}


[numthreads(16, 16, 1)]
void CSMain(uint3 threadid : SV_DispatchThreadID)
{

    float3 pt = makeRay(threadid);

    int index = threadid.x + threadid.y*256;

    col = rayMarchAlgo(pt);
    Result[index] = col; 
}

Line 335, in the for loop, if I run it for 1 time (e.g. put levCount < 1 instead of levCount < maxLev), then the code compiles. Otherwise the code doesn't compile.

It compiles again if i put levCount < maxLev but do not update the Result[index] with the output colour (packed in integer) at line 410;

Logic Sanity Checking:

As I've mentioned, I've checked every individual functions and all seems to work exactly as intended against actual data.

Loading the CS: here's the code i'm using to load the shader.

bool RTX_Renderer::loadComputeShader(LPCWSTR filename, ID3D11ComputeShader** computeShader)
{
    DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS;
#if defined( _DEBUG )
    dwShaderFlags |= D3DCOMPILE_DEBUG;
#endif

    LPCSTR pProfile = (g_d3dDevice->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0) ? "cs_5_0" : "cs_4_0";

    ID3DBlob* pErrorBlob = NULL;
    ID3DBlob* pBlob = NULL;
    HRESULT hr = D3DCompileFromFile(filename, NULL, NULL, "CSMain", pProfile, dwShaderFlags, NULL, &pBlob, &pErrorBlob);
    if (FAILED(hr))
    {
        if (pErrorBlob)
            OutputDebugStringA((char*)pErrorBlob->GetBufferPointer());
        if (pErrorBlob)
            pErrorBlob->Release();
        if (pBlob)
            pBlob->Release();

        return false;
    }
    else
    {
        hr = g_d3dDevice->CreateComputeShader(pBlob->GetBufferPointer(), pBlob->GetBufferSize(), NULL, computeShader);
        if (pErrorBlob)
            pErrorBlob->Release();
        if (pBlob)
            pBlob->Release();

        return hr == S_OK;
    }
}

Please let me know if you need any more information. Any leads will be highly appreciated.

directx
shader
hlsl
asked on Stack Overflow Jan 24, 2019 by ecntrk

1 Answer

0

For any future reference: Solved this by fixing the initialisation of the passed array inside the called function. The function in question is mapCartesianToIndex(). Initialising the int index[12] solved the issue and the code compiles fine.

answered on Stack Overflow Jan 24, 2019 by ecntrk

User contributions licensed under CC BY-SA 3.0