I have a 32-bit unsigned integer made of 4 bytes such as : 0x12ff3456
.
I'm trying to find a bitwise operation to replace the 0xff
byte by 0x01
, and anything else by 0x00
such as :
0x12ff3456
=> 0x00010000
0x543245ff
=> 0x00000001
...etc.
Only one of the bytes making the 32-bit unsigned integer can be equal to 0xff
at a time.
Does any one have an idea on how to perform this with the least possible amount of operations ? Folding (bitshifts + ands) would be an option but requires too many ops.
The Bit Twiddling Hacks page explains how to get a mask where each zero byte is marked with the high bit set. You can apply this here:
((~x - 0x01010101) & x & 0x80808080) >> 7
Multiple possibilities exist, and which one results in the minimum number of instructions, or the fastest execution time, will depend on the machine architecture and compiler being used. Some architectures offer an ANDN
instructions, others support three-input logical instructions, still others merge shifts with logical operations. Below I am showing three variants that pass an exhaustive test.
The two approaches are to either base the output on a byte-wise equality check with 0xFF
or a byte-wise "greater than" test against 0xFE
. These are selected via FUNC_VARIANT
. The "greater than" test is further bases on a "less than" test, for which two implementation variants are provided, selected by LTU_VARIANT
.
The sources of the algorithms for clever byte-wise processing are noted in the comments. Generally speaking, a certain amount of masking is required in the intermediate steps to prevent the processing of a particular byte to affects neighboring bytes.
Note that the code can easily be adapted for processing eight bytes at a time, rather than four as the question specifies.
A quick check with Compiler Explorer shows that with gcc, FUNC_VARIANT=1
,LTU_VARIANT=0
compiles to the shortest instruction sequences for both x86-64 and ARM64. This may not necessarily translate into the highest possible performance, though.
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#define FUNC_VARIANT 0
#define LTU_VARIANT 0
#define UINT32_H4 0x80808080U // byte-wise sign bits (MSBs)
#define UINT32_L4 0x01010101U // byte-wise LSBs
#define UINT32_M4 0xffffffffU // byte-wise maximum
uint32_t sign_to_bool4 (uint32_t a)
{
return (a >> 7) & UINT32_L4;
}
uint32_t vhaddu4 (uint32_t a, uint32_t b)
{
/* Peter L. Montgomery's observation (newsgroup comp.arch, 2000/02/11,
https://groups.google.com/d/msg/comp.arch/gXFuGZtZKag/_5yrz2zDbe4J):
(A+B)/2 = (A AND B) + (A XOR B)/2.
*/
return (a & b) + (((a ^ b) >> 1) & ~UINT32_H4);
}
uint32_t ltu4_core (uint32_t a, uint32_t b)
{
/* Sebastiano Vigna, "Broadword implementation of rank/select queries."
In: International Workshop on Experimental and Efficient Algorithms,
pp. 154-168, Springer Berlin Heidelberg, 2008.
*/
return (((a | UINT32_H4) - (b & ~UINT32_H4)) | (a ^ b)) ^ (a | ~b);
}
uint32_t vsetltu4 (uint32_t a, uint32_t b)
{
#if LTU_VARIANT==1
return sign_to_bool4 (ltu4_core (a, b));
#else // LTU_VARIANT
return sign_to_bool4 (vhaddu4 (~a, b));
#endif // LTU_VARIANT
}
uint32_t vsetgtu4 (uint32_t a, uint32_t b)
{
return vsetltu4 (b, a);
}
uint32_t vseteq4 (uint32_t a, uint32_t b)
{
uint32_t r, t;
/* Alan Mycroft's null-byte detection algorithm (newsgroup comp.lang.c, 1987/04/08,
https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ):
null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
*/
r = a ^ b; // 0x00 if a == b
t = r | UINT32_H4; // set msbs, to catch carry out
r = r ^ t; // extract msbs, msb = 1 if r < 0x80
t = t - UINT32_L4; // sign bit = 0, if r was 0x00 or 0x80
t = r & ~t; // sign_bit = 1, if r was 0x00
r = t >> 7; // convert to bool
return r;
}
uint32_t func (uint32_t a)
{
#if FUNC_VARIANT == 1
return vsetgtu4 (a, ~UINT32_L4); // byte-wise a >ᶸ 0xFE
#else // FUNC_VARIANT
return vseteq4 (a, UINT32_M4); // byte-wise a == 0xFF
#endif // FUNC_VARIANT
}
uint32_t ref_func (uint32_t a)
{
uint8_t a0 = (uint8_t)((a >> 0) & 0xff);
uint8_t a1 = (uint8_t)((a >> 8) & 0xff);
uint8_t a2 = (uint8_t)((a >> 16) & 0xff);
uint8_t a3 = (uint8_t)((a >> 24) & 0xff);
int p0 = (a0 == 0xff);
int p1 = (a1 == 0xff);
int p2 = (a2 == 0xff);
int p3 = (a3 == 0xff);
return (((uint32_t)p3 << 24) | ((uint32_t)p2 << 16) |
((uint32_t)p1 << 8) | ((uint32_t)p0 << 0));
}
int main (void)
{
uint32_t res, ref, x = 0;
do {
res = func (x);
ref = ref_func (x);
if (res != ref) {
printf ("error @ %08x: res=%08x ref=%08x\n", x, res, ref);
return EXIT_FAILURE;
}
x++;
} while (x);
printf ("test passed\n");
return EXIT_SUCCESS;
}
User contributions licensed under CC BY-SA 3.0