using shared memory in cuda gives memory write error

0

I had a kernel which works fine as

__global__ static void  CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    const Integer TID = CudaGetTargetID();
    const Integer ID  = TID;
    if(ID >= ComputeParticleNumber)
    {
        return ;
    }
    CDistance NearestDistance;
    Integer NearestID = -1;
    NearestDistance.Magnitude = 1e8;
    NearestDistance.Direction = make_Scalar3(0,0,0);
    if(c_daOutputParticleID[ID] < -1)
    {
        c_daSTLDistance[ID] = NearestDistance;
        c_daSTLID[ID] = NearestID;
        return;
    }
    Scalar3 TargetPosition = c_daParticlePosition[ID];

    Integer TriangleID;     
    Integer CIDX, CIDY, CIDZ;
    Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&TargetPosition,CIDX, CIDY, CIDZ);
    Integer Range = 1;
    if(CID >=0 && CID < c_CellNum)
    {
        for(Integer k = -Range; k <= Range; ++k)
        {
            for(Integer j = -Range; j <= Range; ++j)
            {
                for(Integer i = -Range; i <= Range; ++i)
                {
                    Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX +i, CIDY + j,CIDZ + k);
                    if(MCID < 0 || MCID >= c_CellNum)
                    {
                        continue;
                    }
                    unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
                    for(unsigned int l = 0; l < TriangleNum; ++l)
                    {
                        TriangleID = c_daCell[MCID].m_TriangleID[l];
                        if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
                        {
                            CDistance Distance ;
                            Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &TargetPosition, &Distance.Direction);
                            if(Distance.Magnitude < NearestDistance.Magnitude)
                            {
                                NearestDistance = Distance;
                                NearestID = TriangleID;
                            }
                        }
                    }   
                }
            }
        }
    }
    c_daSTLDistance[ID] = NearestDistance;
    c_daSTLID[ID] = NearestID;
}

here c_daParticlePosition is constant memory float3 data type . so here I want to use shared memory so I tried to create float3 type shared memory and tried to copy constant date to shared memory however it shows unknown error and with cuda-memcheck it says

here thread number is 255 with 2 block size

shared_memory code

__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    //const Integer TID = CudaGetTargetID();
    const Integer ID  =CudaGetTargetID(); 
    extern __shared__ float3 s[];
    /*if(ID >= ComputeParticleNumber)
    {
        return ;
    }*/
    s[ID] = c_daParticlePosition[ID];
    __syncthreads();

    CDistance NearestDistance;
    Integer NearestID = -1;
    NearestDistance.Magnitude = 1e8;
    NearestDistance.Direction.x = 0;
    NearestDistance.Direction.y = 0;
    NearestDistance.Direction.z = 0;//make_Scalar3(0,0,0);
    //if(c_daOutputParticleID[ID] < -1)
    //{
    //  c_daSTLDistance[ID] = NearestDistance;
    //  c_daSTLID[ID] = NearestID;
    //  return;
    //}

    //Scalar3 TargetPosition = c_daParticlePosition[ID];

    Integer TriangleID;     
    Integer CIDX, CIDY, CIDZ;
    Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&s[ID],CIDX, CIDY, CIDZ);
    if(CID >=0 && CID < c_CellNum)
    {
        //Integer Range = 1;
        for(Integer k = -1; k <= 1; ++k)
        {
            for(Integer j = -1; j <= 1; ++j)
            {
                for(Integer i = -1; i <= 1; ++i)
                {
                    Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX +i, CIDY + j,CIDZ + k);
                    if(MCID < 0 || MCID >= c_CellNum)
                    {
                        continue;
                    }
                    unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
                    for(unsigned int l = 0; l < TriangleNum; ++l)
                    {
                        TriangleID = c_daCell[MCID].m_TriangleID[l];
                        /*if(c_daTrianglesParameters[c_daTriangles[TriangleID].ModelIDNumber].isDrag)
                        {
                            continue;
                        }*/

                        if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
                        {
                        CDistance Distance ;
                            Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &s[ID], &Distance.Direction);
                            if(Distance.Magnitude < NearestDistance.Magnitude)
                            {
                                NearestDistance = Distance;
                                NearestID = TriangleID;
                            }
                        }
                    }   
                }
            }
        }
    }
    c_daSTLDistance[ID] = NearestDistance;
    c_daSTLID[ID] = NearestID;
}

error

  Invalid __shared__ write of size 4
    =========     at 0x00000128 in CalcSTLDistance_Kernel(int)
    =========     by thread (159,0,0) in block (0,0,0)
    =========     Address 0x0000077c is out of bounds
cuda
shared-memory
asked on Stack Overflow Aug 26, 2014 by Malacu • edited Aug 26, 2014 by Malacu

1 Answer

3

You may find useful info on how to work with shared memory in this article. Focus especially on static shared memory and dynamic shared memory sections.

Based on above article you should find out that you are simply writing out of bounds of your array s, exactly as the error message says. To fix the issue you can:

  • either specify the size of shared memory array s at compile time, if you know it in advance, such as __shared__ float3 s[123456];
  • or use dynamically sized s array, thats basically what you are doing at the moment, but ALSO specify the third kernel launch parameter as CalcSTLDistance_Kernel<<<gridSize, blockSize, sharedMemorySizeInBytes>>>. In case you will be using an array of 123456 float3s then use int sharedMemorySizeInBytes = 123456 * sizeof(float3)
answered on Stack Overflow Aug 26, 2014 by Michal Hosala

User contributions licensed under CC BY-SA 3.0