I am trying to update for linux, GCC, and 64 bit use and preserve in a github Ken Silverman's Paint N Draw 3D C software. I got his permission but he's too busy to help. I don't want to do a bad job and I am not a bit-twiddling expert so I'd like to fix the main parts before I upload it.
In his code pnd3d.c he used a struct called bitmal_t *
that contains a malloc (I think his element mal means the size of a malloc) and a size to indicate a voxel-distance as an unsigned int (in 2009 it was a 32 bit ) bit chain amongst the bits of a concatenated set of 32 bit ints. So basically, distance is a function of how many bits on (1) along the extended bit chain. For collisions, he looks up and down for zeros and ones.
Here is his bitmal_t:
//buf: cast to: octv_t* or surf_t*
//bit: 1 bit per sizeof(buf[0]); 0=free, 1=occupied
typedef struct bit { void *buf; unsigned int mal, *bit, ind, num, siz; } bitmal_t;
Here is his range finding code that goes up and down the bit-range looking for a one or a zero. I posted his originals, not my crappy nonworking version.
Here is all the code snippets you would need to reproduce it.
static __forceinline int dntil0 (unsigned int *lptr, int z, int zsiz)
{
// //This line does the same thing (but slow & brute force)
//while ((z < zsiz) && (lptr[z>>5]&(1<<KMOD32(z)))) z++; return(z);
int i;
//WARNING: zsiz must be multiple of 32!
i = (lptr[z>>5]|((1<<KMOD32(z))-1)); z &= ~31;
while (i == 0xffffffff)
{
z += 32; if (z >= zsiz) return(zsiz);
i = lptr[z>>5];
}
return(bsf(~i)+z);
}
static __forceinline int uptil0 (unsigned int *lptr, int z)
{
// //This line does the same thing (but slow & brute force)
//while ((z > 0) && (lptr[(z-1)>>5]&(1<<KMOD32(z-1)))) z--; return(z);
int i;
if (!z) return(0); //Prevent possible crash
i = (lptr[(z-1)>>5]|(-1<<KMOD32(z))); z &= ~31;
while (i == 0xffffffff)
{
z -= 32; if (z < 0) return(0);
i = lptr[z>>5];
}
return(bsr(~i)+z+1);
}
static __forceinline int dntil1 (unsigned int *lptr, int z, int zsiz)
{
// //This line does the same thing (but slow & brute force)
//while ((z < zsiz) && (!(lptr[z>>5]&(1<<KMOD32(z))))) z++; return(z);
int i;
//WARNING: zsiz must be multiple of 32!
i = (lptr[z>>5]&(-1<<KMOD32(z))); z &= ~31;
while (!i)
{
z += 32; if (z >= zsiz) return(zsiz);
i = lptr[z>>5];
}
return(bsf(i)+z);
}
static __forceinline int uptil1 (unsigned int *lptr, int z)
{
// //This line does the same thing (but slow & brute force)
//while ((z > 0) && (!(lptr[(z-1)>>5]&(1<<KMOD32(z-1))))) z--; return(z);
int i;
if (!z) return(0); //Prevent possible crash
i = (lptr[(z-1)>>5]&((1<<KMOD32(z))-1)); z &= ~31;
while (!i)
{
z -= 32; if (z < 0) return(0);
i = lptr[z>>5];
}
return(bsr(i)+z+1);
}
Here are his set range to ones and zeroes functions:
//Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 0's
#ifndef _WIN64
static __forceinline void setzrange0 (void *vptr, int z0, int z1)
{
int z, ze, *iptr = (int *)vptr;
if (!((z0^z1)&~31)) { iptr[z0>>5] &= ((~(-1<<z0))|(-1<<z1)); return; }
z = (z0>>5); ze = (z1>>5);
iptr[z] &=~(-1<<z0); for(z++;z<ze;z++) iptr[z] = 0;
iptr[z] &= (-1<<z1);
}
//Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, int z0, int z1)
{
int z, ze, *iptr = (int *)vptr;
if (!((z0^z1)&~31)) { iptr[z0>>5] |= ((~(-1<<z1))&(-1<<z0)); return; }
z = (z0>>5); ze = (z1>>5);
iptr[z] |= (-1<<z0); for(z++;z<ze;z++) iptr[z] = -1;
iptr[z] |=~(-1<<z1);
}
#else
static __forceinline void setzrange0 (void *vptr, __int64 z0, __int64 z1)
{
unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
if (!((z0^z1)&~63)) { iptr[z0>>6] &= ((~(LL(-1)<<z0))|(LL(-1)<<z1)); return; }
z = (z0>>6); ze = (z1>>6);
iptr[z] &=~(LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(0);
iptr[z] &= (LL(-1)<<z1);
}
//Set all bits in vbit from (x,y,z0) to (x,y,z1-1) to 1's
static __forceinline void setzrange1 (void *vptr, __int64 z0, __int64 z1)
{
unsigned __int64 z, ze, *iptr = (unsigned __int64 *)vptr;
if (!((z0^z1)&~63)) { iptr[z0>>6] |= ((~(LL(-1)<<z1))&(LL(-1)<<z0)); return; }
z = (z0>>6); ze = (z1>>6);
iptr[z] |= (LL(-1)<<z0); for(z++;z<ze;z++) iptr[z] = LL(-1);
iptr[z] |=~(LL(-1)<<z1);
}
#endif
Write some unit tests that pass on the original!
First of all, SSE2 is baseline for x86-64, so you should definitely be using that instead of just 64-bit integers.
GCC (unlike MSVC) assumes no strict-aliasing violations, so the set bit range functions (that cast an incoming pointer to signed int*
(!!) or uint64_t*
depending on WIN64 or not) might need to be compiled with -fno-strict-aliasing to make pointer-casting well-defined.
You could replace the loop part of the set/clear bit-range functions with memset (which gcc may inline), or a hand-written SSE intrinsics loop if you expect the size to usually be small (like under 200 bytes or so, not worth the overhead of calling libc memset)
I think those dntil0
functions in the first block are just bit-search loops for the first 0 or first 1 bit, forward or backward.
Rewrite them from scratch with SSE2 intrinsics: _mm_cmpeq_epi8
/ _mm_movemask_epi8
to find the first byte that isn't all-0 or all-1 bits, then use bsf
or bsr
on that.
See the glibc source code for SSE2 memchr, or any simpler SSE2-optimized implementation, to find out how to do the byte-search part. Or glibc memmem
for an example of comparing for equal, but that's easy: instead of looking for a non-zero _mm_movemask_epi8()
(indicating there was a match), look for a result that's != 0xffff
(all ones) to indicate that there was a mismatch. Use bsf
or bsr
on that bitmask to find the byte index into the SIMD vector.
So in total you'll use BSR or BSF twice in each function: one to find the byte index within the SIMD vector, and again to find the bit-index within the target byte.
For the bit-scan function, use GCC __builtin_clz
or __builtin_ctz
to find the first 1
bit. Bit twiddling: which bit is set?
To search for the first zero instead of the first one, bitwise invert, like __builtin_ctz( ~p[idx] )
where p
is an unsigned char*
into your search buffer (that you were using _mm_loadu_si128()
on), and idx
is an offset within that 16 byte window. (That you calculated with __builtin_ctz()
on the movemask
result that broke out of the vector loop.)
How the original worked:
z -= 32
is looping by 32 bits (the size of an int
, because this was written assuming it would be compiled for x86 Windows or x86-64 Windows).
lptr[z>>5];
is converting the bit index to an int
index. So it's simply looping over the buffer 1 int
at a time.
When it finds a 4-byte element that's != 0xFFFFFFFF
, it has found an int
containing a bit that's not 1; i.e. it contains the bit we're looking for. So it uses bsf
or bsr
to bit-scan and find the position of that bit within this int
.
It adds that to z
(the bit-position of the start of this int
).
This is exactly the same algorithm I described above, but implemented one integer at a time instead of 16 bytes at a time.
It should really be using uint32_t
or unsigned int
for bit-manipulations, not signed int
, but it obviously works correctly on MSVC.
if (z >= zsiz) return(zsiz);
This is the size check to break out of the loop if no bit is found.
User contributions licensed under CC BY-SA 3.0