SSE alpha blending for pre-multiplied ARGB


I'm trying to write an SSE-enabled alpha compositor; this is what I've come up with. First, the code to blend two vectors of 4 pixels each:

// alpha blend two 128-bit (16 byte) SSE vectors containing 4 pre-multiplied ARGB values each
//
__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
    // shuffle masks for alpha and 255 vector for 255-alpha
    //
    // NOTE: storing these as static __m128i built with the _mm_set* intrinsics was _very_ slow; the
    // compiler doesn't seem to know it can store them as constants, so it emitted guard variables and
    // did real static initialization. Stick with arrays.
    //
    static const uint64_t allo[2] __attribute__((aligned(16))) = { 0x03ff03ff03ff03ff, 0x07ff07ff07ff07ff };
    static const uint64_t alhi[2] __attribute__((aligned(16))) = { 0x0bff0bff0bff0bff, 0x0fff0fff0fff0fff };
    static const uint64_t m255[2] __attribute__((aligned(16))) = { 0xff00ff00ff00ff00, 0xff00ff00ff00ff00 };

    // replicate top two pixels from under
    __m128i underhi = (__m128i)_mm_movehl_ps(
        (__m128)under,
        (__m128)under
    );

    __m128i u16_0 = _mm_cvtepu8_epi16(under);                   // convert 8-bit fields to 16-bit with zero extension
    __m128i u16_1 = _mm_cvtepu8_epi16(underhi);  
    __m128i al8_0 = _mm_shuffle_epi8 (over, *(__m128i*)&allo);  // replicate (alpha << 8) to each field
    __m128i al8_1 = _mm_shuffle_epi8 (over, *(__m128i*)&alhi);
    __m128i mal_0 = _mm_sub_epi8     (*(__m128i*)&m255, al8_0); // compute 255-alpha
    __m128i mal_1 = _mm_sub_epi8     (*(__m128i*)&m255, al8_1);
    __m128i mul_0 = _mm_mulhi_epu16  (u16_0, mal_0);            // under*(255-over.alpha)
    __m128i mul_1 = _mm_mulhi_epu16  (u16_1, mal_1);
    __m128i pixel = _mm_packus_epi16 (mul_0, mul_1);

    // add to the attenuated background pixel with unsigned saturation
    // (_mm_adds_epi8 would saturate at 0x7f and clip bright channels)
    return _mm_adds_epu8(over, pixel);
}
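
(For anyone following along: _mm_mulhi_epu16(under, (255-alpha) << 8) returns the high 16 bits of the 32-bit product under * ((255-alpha) << 8), which is (under * (255-alpha)) >> 8, i.e. the usual shift-by-8 approximation of dividing by 255.)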

And, secondly, a wrapper that unrolls multiple pixel operations and aggregates the loads/stores. Going up to about 32 pixels per iteration seems to be the sweet spot:

// perform N 4-pixel blending operations at once, load/blend/store paradigm.  We take a template parameter
// for the size so the compiler is more likely to unroll the loops for us.
// 
template <ssize_t N>
__attribute__((always_inline, optimize("unroll-loops")))
static inline void blendN(__m128i *dst, const __m128i *punder, const __m128i *pover, bool single=false) {
    __m128i under[N];
    __m128i  over[N];
    __m128i  cc = _mm_loadu_si128(pover);

    // load
    for (ssize_t ii=0; ii < N; ii++) {
        under[ii] =              _mm_loadu_si128(punder+ii);
        over[ii] = single ? cc : _mm_loadu_si128( pover+ii);
    }

    // blend
    for (ssize_t ii=0; ii < N; ii++) {
        under[ii] = blend4(under[ii], over[ii]);
    }

    // store
    for (ssize_t ii=0; ii < N; ii++) {
        _mm_storeu_si128(dst+ii, under[ii]);
    }
}

Called thusly:

    // blend 32/16/8/4 pixels at a time; ii counts blocks at the current granularity,
    // so each `ii *= 2` rescales it when the block size halves
    ssize_t ii=0;
    for (ii *= 2; ii < len/32; ii++) { blendN<8>(vdst+8*ii, vunder+8*ii, vover+8*ii); }
    for (ii *= 2; ii < len/16; ii++) { blendN<4>(vdst+4*ii, vunder+4*ii, vover+4*ii); }
    for (ii *= 2; ii < len/8;  ii++) { blendN<2>(vdst+2*ii, vunder+2*ii, vover+2*ii); }
    for (ii *= 2; ii < len/4;  ii++) { blendN<1>(vdst+1*ii, vunder+1*ii, vover+1*ii); }

    // handle the remainder one pixel at a time (ii *= 4 converts from 4-pixel vectors to pixels)
    ii *= 4;
    for (; ii < len; ii++) {
        *(pdst+ii) = blend(*(punder+ii), *(pover+ii));
    }
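
For reference, the scalar blend() used in the remainder loop isn't shown above; a minimal sketch of what it could look like for premultiplied ARGB (alpha in the high byte, same shift-by-8 scaling and unsigned saturation as the vector path, treating pdst/punder/pover as uint32_t pointers) would be:

#include <stdint.h>

// hypothetical scalar counterpart of blend4: out = over + under*(255 - over.alpha)/256 per channel
static inline uint32_t blend(uint32_t under, uint32_t over) {
    uint32_t mal = 255 - (over >> 24);                  // 255 - over.alpha
    uint32_t out = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        uint32_t u = (under >> shift) & 0xff;
        uint32_t o = (over  >> shift) & 0xff;
        uint32_t c = o + ((u * mal) >> 8);              // same >>8 scaling as _mm_mulhi_epu16
        out |= (c > 255 ? 255 : c) << shift;            // saturate like _mm_adds_epu8
    }
    return out;
}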

Using this I can get about 2.5 instructions/cycle of throughput on an i7-2600K. Curious if anyone can suggest improvements to my SIMD.

Edit: Here's some updated code after conversing with Peter Cordes.

__attribute__((always_inline))
static inline __m128i blend4(__m128i under, __m128i over) {
    // shuffle masks for alpha and 255 vector for 255-alpha
    //
    // NOTE: storing these as static __m128i is _very_ slow; the compiler doesn't seem to know it can
    // store them as constants, so it emitted guard variables and did real static initialization.
    // Stick with plain const.
    //
    const __m128i allo = (__m128i)_mm_setr_epi32(0x03ff03ff, 0x03ff03ff, 0x07ff07ff, 0x07ff07ff);
    const __m128i alhi = (__m128i)_mm_setr_epi32(0x0bff0bff, 0x0bff0bff, 0x0fff0fff, 0x0fff0fff);
    const __m128i zero = (__m128i)_mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
    const __m128  m255 = (__m128 )_mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);

    __m128i u16_0 =   _mm_cvtepu8_epi16(under);               // convert 8-bit fields to 16-bit with zero extension
    __m128i u16_1 =   _mm_unpackhi_epi8(under, zero);
    __m128i al8_0 =   _mm_shuffle_epi8 (over,  allo);         // replicate (alpha << 8) to each field
    __m128i al8_1 =   _mm_shuffle_epi8 (over,  alhi);
    __m128i mal_0 = (__m128i)_mm_xor_ps(m255, (__m128)al8_0); // compute 255-alpha
    __m128i mal_1 = (__m128i)_mm_xor_ps(m255, (__m128)al8_1);
    __m128i mul_0 =   _mm_mulhi_epu16  (u16_0, mal_0);        // under*(255-over.alpha)
    __m128i mul_1 =   _mm_mulhi_epu16  (u16_1, mal_1);
    __m128i pixel =   _mm_packus_epi16 (mul_0, mul_1);

    // add to the attenuated background pixel with unsigned saturation
    // (_mm_adds_epi8 would saturate at 0x7f and clip bright channels)
    return _mm_adds_epu8(over, pixel);
}

The big change is using unpackhi instead of cvtepu8 to expand the top 8 bytes of the under pixel to 16 bits, and using xor instead of subtraction to compute 255-alpha; xor can run on more execution ports than the subtract. This blends ~2.2 billion pixels/second on my i7-2600K, which seems adequate.
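
The xor substitution is exact here because the shuffle leaves the low byte of each 16-bit lane zero, so no borrow can cross the byte boundary and xor with 0xff00 equals subtraction from 0xff00. A standalone sanity check (not part of the blend code):

#include <assert.h>

// with the low byte already zero, (a << 8) ^ 0xff00 == (255 - a) << 8 for all byte values
int main(void) {
    for (unsigned a = 0; a <= 255; a++)
        assert(((a << 8) ^ 0xff00u) == ((255u - a) << 8));
    return 0;
}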

Tags: optimization, graphics, x86, sse, simd
asked on Stack Overflow Jul 16, 2018 by gct • edited Jul 17, 2018 by gct

1 Answer


Not a direct answer to your question, but this is too long for a comment, and maybe it will be useful to someone.

The trick of permuting the alpha into the upper half of each 16-bit lane so you can use _mm_mulhi_epu16 to get the products into the lower bits with one instruction is really neat. My problem is slightly different, as I don't have premultiplied alpha, and I need to be able to specify opacity for the whole texture. I extended the code to the following:

__m128i blend4(__m128i under, __m128i over, float opacity) {
    const __m128i alpha16 = _mm_set1_epi16((short)(opacity * 255));
    const __m128i allo = (__m128i) _mm_setr_epi32(0xff03ff03, 0xff03ff03, 0xff07ff07, 0xff07ff07);
    const __m128i alhi = (__m128i) _mm_setr_epi32(0xff0bff0b, 0xff0bff0b, 0xff0fff0f, 0xff0fff0f);
    const __m128i zero = (__m128i) _mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000);
    const __m128i i255 = (__m128i) _mm_setr_epi32(0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00);

    __m128i under0 = _mm_cvtepu8_epi16(under);
    __m128i under1 = _mm_unpackhi_epi8(under, zero);
    __m128i over0 = _mm_cvtepu8_epi16(over);
    __m128i over1 = _mm_unpackhi_epi8(over, zero);
    __m128i alpha0 = _mm_mullo_epi16(_mm_shuffle_epi8(over, allo), alpha16);
    __m128i alpha1 = _mm_mullo_epi16(_mm_shuffle_epi8(over, alhi), alpha16);
    __m128i invAlpha0 = _mm_xor_si128(i255, alpha0);
    __m128i invAlpha1 = _mm_xor_si128(i255, alpha1);
    __m128i underMul0 = _mm_mulhi_epu16(under0, invAlpha0);
    __m128i underMul1 = _mm_mulhi_epu16(under1, invAlpha1);
    __m128i overMul0 = _mm_mulhi_epu16(over0, alpha0);
    __m128i overMul1 = _mm_mulhi_epu16(over1, alpha1);
    __m128i underFinal = _mm_packus_epi16(underMul0, underMul1);
    __m128i overFinal = _mm_packus_epi16(overMul0, overMul1);
    return _mm_adds_epu8(overFinal, underFinal);
}

I shuffle the alpha into the bottom half of each lane initially, so that the high bits of the result end up in the top half of each lane after multiplying with alpha16, and from there the _mm_mulhi_epu16 trick works as usual. The rest is just straightforward alpha multiplication.
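
For context, here is a minimal (hypothetical) driver showing how this blend4 might be called over a buffer; the names composite/dst/under/over are placeholders and not from the original code, and it assumes ARGB pixels stored as uint32_t with possibly unaligned buffers:

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

// composite `count` ARGB pixels of `over` onto `under` into `dst`
// at the given whole-texture opacity, four pixels per blend4 call
static void composite(uint32_t *dst, const uint32_t *under, const uint32_t *over,
                      size_t count, float opacity) {
    for (size_t i = 0; i + 4 <= count; i += 4) {
        __m128i u = _mm_loadu_si128((const __m128i *)(under + i));
        __m128i o = _mm_loadu_si128((const __m128i *)(over  + i));
        _mm_storeu_si128((__m128i *)(dst + i), blend4(u, o, opacity));
    }
    // a scalar tail would still be needed when count isn't a multiple of 4
}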

answered on Stack Overflow Oct 27, 2019 by Stuntddude

User contributions licensed under CC BY-SA 3.0