How can one make this dynamic bit range code GCC compliant for 64 bit compilers?

Question

How can one make this dynamic bit range code GCC compliant for 64 bit compilers?

I am trying to update for linux, GCC, and 64 bit use and preserve in a github Ken Silverman's Paint N Draw 3D C software. I got his permission but he's too busy to help. I don't want to do a bad job and I am not a bit-twiddling expert so I'd like to fix the main parts before I upload it.

In his code pnd3d.c he used a struct called bitmal_t * that contains a malloc (I think his element mal means the size of a malloc) and a size to indicate a voxel-distance as an unsigned int (in 2009 it was a 32 bit ) bit chain amongst the bits of a concatenated set of 32 bit ints. So basically, distance is a function of how many bits on (1) along the extended bit chain. For collisions, he looks up and down for zeros and ones.

Here is his bitmal_t:

    //buf: cast to: octv_t* or surf_t*
    //bit: 1 bit per sizeof(buf[0]); 0=free, 1=occupied
typedef struct bit { void *buf; unsigned int mal, *bit, ind, num, siz; } bitmal_t;

Here is his range finding code that goes up and down the bit-range looking for a one or a zero. I posted his originals, not my crappy nonworking version.

Here is all the code snippets you would need to reproduce it.

static __forceinline int dntil0 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (lptr[z>>5]&(1<<KMOD32(z)))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]|((1<<KMOD32(z))-1)); z &= ~31;
    while (i == 0xffffffff)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(~i)+z);
}

static __forceinline int uptil0 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (lptr[(z-1)>>5]&(1<<KMOD32(z-1)))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]|(-1<<KMOD32(z))); z &= ~31;
    while (i == 0xffffffff)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(~i)+z+1);
}

static __forceinline int dntil1 (unsigned int *lptr, int z, int zsiz)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z < zsiz) && (!(lptr[z>>5]&(1<<KMOD32(z))))) z++; return(z);
    int i;
        //WARNING: zsiz must be multiple of 32!
    i = (lptr[z>>5]&(-1<<KMOD32(z))); z &= ~31;
    while (!i)
    {
        z += 32; if (z >= zsiz) return(zsiz);
        i = lptr[z>>5];
    }
    return(bsf(i)+z);
}

static __forceinline int uptil1 (unsigned int *lptr, int z)
{
    //   //This line does the same thing (but slow & brute force)
    //while ((z > 0) && (!(lptr[(z-1)>>5]&(1<<KMOD32(z-1))))) z--; return(z);
    int i;
    if (!z) return(0); //Prevent possible crash
    i = (lptr[(z-1)>>5]&((1<<KMOD32(z))-1)); z &= ~31;
    while (!i)
    {
        z -= 32; if (z < 0) return(0);
        i = lptr[z>>5];
    }
    return(bsr(i)+z+1);
}

Here are his set range to ones and zeroes functions:

//Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 0's
#ifndef _WIN64

static __forceinline void setzrange0 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] &= ((~(-1<<z0))|(-1<<z1)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] &=~(-1<<z0); for(z++;z<ze;z++) iptr[z] = 0;
    iptr[z] &= (-1<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, int z0, int z1)
{
    int z, ze, *iptr = (int *)vptr;
    if (!((z0^z1)&~31)) { iptr[z0>>5] |= ((~(-1<<z1))&(-1<<z0)); return; }
    z = (z0>>5); ze = (z1>>5);
    iptr[z] |= (-1<<z0); for(z++;z<ze;z++) iptr[z] = -1;
    iptr[z] |=~(-1<<z1);
}

#else

static __forceinline void setzrange0 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] &= ((~(LL(-1)<<z0))|(LL(-1)<<z1)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] &=~(LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(0);
    iptr[z] &= (LL(-1)<<z1);
}

    //Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, __int64 z0, __int64 z1)
{
    unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
    if (!((z0^z1)&~63)) { iptr[z0>>6] |= ((~(LL(-1)<<z1))&(LL(-1)<<z0)); return; }
    z = (z0>>6); ze = (z1>>6);
    iptr[z] |= (LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(-1);
    iptr[z] |=~(LL(-1)<<z1);
}

#endif

c

gcc

bit-manipulation

64-bit

bitwise-operators

asked on Stack Overflow May 4, 2019 by

daemondave • edited May 4, 2019 by

daemondave

1 Answer

Write some unit tests that pass on the original!

First of all, SSE2 is baseline for x86-64, so you should definitely be using that instead of just 64-bit integers.

GCC (unlike MSVC) assumes no strict-aliasing violations, so the set bit range functions (that cast an incoming pointer to signed int* (!!) or uint64_t* depending on WIN64 or not) might need to be compiled with -fno-strict-aliasing to make pointer-casting well-defined.

You could replace the loop part of the set/clear bit-range functions with memset (which gcc may inline), or a hand-written SSE intrinsics loop if you expect the size to usually be small (like under 200 bytes or so, not worth the overhead of calling libc memset)

I think those dntil0 functions in the first block are just bit-search loops for the first 0 or first 1 bit, forward or backward.

Rewrite them from scratch with SSE2 intrinsics: _mm_cmpeq_epi8 / _mm_movemask_epi8 to find the first byte that isn't all-0 or all-1 bits, then use bsf or bsr on that.

See the glibc source code for SSE2 memchr, or any simpler SSE2-optimized implementation, to find out how to do the byte-search part. Or glibc memmem for an example of comparing for equal, but that's easy: instead of looking for a non-zero _mm_movemask_epi8() (indicating there was a match), look for a result that's != 0xffff (all ones) to indicate that there was a mismatch. Use bsf or bsr on that bitmask to find the byte index into the SIMD vector.

So in total you'll use BSR or BSF twice in each function: one to find the byte index within the SIMD vector, and again to find the bit-index within the target byte.

For the bit-scan function, use GCC __builtin_clz or __builtin_ctz to find the first 1 bit. Bit twiddling: which bit is set?

To search for the first zero instead of the first one, bitwise invert, like __builtin_ctz( ~p[idx] ) where p is an unsigned char* into your search buffer (that you were using _mm_loadu_si128() on), and idx is an offset within that 16 byte window. (That you calculated with __builtin_ctz() on the movemask result that broke out of the vector loop.)

How the original worked:

z -= 32 is looping by 32 bits (the size of an int, because this was written assuming it would be compiled for x86 Windows or x86-64 Windows).

lptr[z>>5]; is converting the bit index to an int index. So it's simply looping over the buffer 1 int at a time.

When it finds a 4-byte element that's != 0xFFFFFFFF, it has found an int containing a bit that's not 1; i.e. it contains the bit we're looking for. So it uses bsf or bsr to bit-scan and find the position of that bit within this int.
It adds that to z (the bit-position of the start of this int).

This is exactly the same algorithm I described above, but implemented one integer at a time instead of 16 bytes at a time.

It should really be using uint32_t or unsigned int for bit-manipulations, not signed int, but it obviously works correctly on MSVC.

if (z >= zsiz) return(zsiz); This is the size check to break out of the loop if no bit is found.

answered on Stack Overflow May 21, 2019 by

Peter Cordes • edited May 21, 2019 by

Peter Cordes

User contributions licensed under CC BY-SA 3.0