How can one make this dynamic bit range code GCC compliant for 64 bit compilers?

1

I am trying to update for linux, GCC, and 64 bit use and preserve in a github Ken Silverman's Paint N Draw 3D C software. I got his permission but he's too busy to help. I don't want to do a bad job and I am not a bit-twiddling expert so I'd like to fix the main parts before I upload it.

In his code pnd3d.c he used a struct called bitmal_t * that contains a malloc (I think his element mal means the size of a malloc) and a size to indicate a voxel-distance as an unsigned int (in 2009 it was a 32 bit ) bit chain amongst the bits of a concatenated set of 32 bit ints. So basically, distance is a function of how many bits on (1) along the extended bit chain. For collisions, he looks up and down for zeros and ones.

Here is his bitmal_t:

    //buf: cast to: octv_t* or surf_t*
    //bit: 1 bit per sizeof(buf[0]); 0=free, 1=occupied
typedef struct bit { void *buf; unsigned int mal, *bit, ind, num, siz; } bitmal_t;

Here is his range finding code that goes up and down the bit-range looking for a one or a zero. I posted his originals, not my crappy nonworking version.

Here is all the code snippets you would need to reproduce it.

static __forceinline int dntil0 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (lptr[z>>5]&(1<<KMOD32(z)))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]|((1<<KMOD32(z))-1)); z &= ~31;
    while (i == 0xffffffff)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(~i)+z);
}

static __forceinline int uptil0 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (lptr[(z-1)>>5]&(1<<KMOD32(z-1)))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]|(-1<<KMOD32(z))); z &= ~31;
    while (i == 0xffffffff)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(~i)+z+1);
}

static __forceinline int dntil1 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (!(lptr[z>>5]&(1<<KMOD32(z))))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]&(-1<<KMOD32(z))); z &= ~31;
    while (!i)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(i)+z);
}

static __forceinline int uptil1 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (!(lptr[(z-1)>>5]&(1<<KMOD32(z-1))))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]&((1<<KMOD32(z))-1)); z &= ~31;
    while (!i)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(i)+z+1);
}

Here are his set range to ones and zeroes functions:

//Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 0's
#ifndef _WIN64

static __forceinline void setzrange0 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] &= ((~(-1<<z0))|(-1<<z1)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] &=~(-1<<z0); for(z++;z<ze;z++) iptr[z] = 0;
    iptr[z] &= (-1<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] |= ((~(-1<<z1))&(-1<<z0)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] |= (-1<<z0); for(z++;z<ze;z++) iptr[z] = -1;
    iptr[z] |=~(-1<<z1);
}

#else

static __forceinline void setzrange0 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] &= ((~(LL(-1)<<z0))|(LL(-1)<<z1)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] &=~(LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(0);
    iptr[z] &= (LL(-1)<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] |= ((~(LL(-1)<<z1))&(LL(-1)<<z0)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] |= (LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(-1);
    iptr[z] |=~(LL(-1)<<z1);
}

#endif
c
gcc
bit-manipulation
64-bit
bitwise-operators
asked on Stack Overflow May 4, 2019 by daemondave • edited May 4, 2019 by daemondave

1 Answer

2

Write some unit tests that pass on the original!

First of all, SSE2 is baseline for x86-64, so you should definitely be using that instead of just 64-bit integers.

GCC (unlike MSVC) assumes no strict-aliasing violations, so the set bit range functions (that cast an incoming pointer to signed int* (!!) or uint64_t* depending on WIN64 or not) might need to be compiled with -fno-strict-aliasing to make pointer-casting well-defined.

You could replace the loop part of the set/clear bit-range functions with memset (which gcc may inline), or a hand-written SSE intrinsics loop if you expect the size to usually be small (like under 200 bytes or so, not worth the overhead of calling libc memset)


I think those dntil0 functions in the first block are just bit-search loops for the first 0 or first 1 bit, forward or backward.

Rewrite them from scratch with SSE2 intrinsics: _mm_cmpeq_epi8 / _mm_movemask_epi8 to find the first byte that isn't all-0 or all-1 bits, then use bsf or bsr on that.

See the glibc source code for SSE2 memchr, or any simpler SSE2-optimized implementation, to find out how to do the byte-search part. Or glibc memmem for an example of comparing for equal, but that's easy: instead of looking for a non-zero _mm_movemask_epi8() (indicating there was a match), look for a result that's != 0xffff (all ones) to indicate that there was a mismatch. Use bsf or bsr on that bitmask to find the byte index into the SIMD vector.

So in total you'll use BSR or BSF twice in each function: one to find the byte index within the SIMD vector, and again to find the bit-index within the target byte.

For the bit-scan function, use GCC __builtin_clz or __builtin_ctz to find the first 1 bit. Bit twiddling: which bit is set?

To search for the first zero instead of the first one, bitwise invert, like __builtin_ctz( ~p[idx] ) where p is an unsigned char* into your search buffer (that you were using _mm_loadu_si128() on), and idx is an offset within that 16 byte window. (That you calculated with __builtin_ctz() on the movemask result that broke out of the vector loop.)


How the original worked:

z -= 32 is looping by 32 bits (the size of an int, because this was written assuming it would be compiled for x86 Windows or x86-64 Windows).

lptr[z>>5]; is converting the bit index to an int index. So it's simply looping over the buffer 1 int at a time.

When it finds a 4-byte element that's != 0xFFFFFFFF, it has found an int containing a bit that's not 1; i.e. it contains the bit we're looking for. So it uses bsf or bsr to bit-scan and find the position of that bit within this int.
It adds that to z (the bit-position of the start of this int).

This is exactly the same algorithm I described above, but implemented one integer at a time instead of 16 bytes at a time.

It should really be using uint32_t or unsigned int for bit-manipulations, not signed int, but it obviously works correctly on MSVC.

if (z >= zsiz) return(zsiz); This is the size check to break out of the loop if no bit is found.

answered on Stack Overflow May 21, 2019 by Peter Cordes • edited May 21, 2019 by Peter Cordes

User contributions licensed under CC BY-SA 3.0