Faster assembly-optimized way to convert between RGB8 and RGB32 images


I'm trying to find an assembly-optimized method for RGB8 to RGB32 image conversion.

The source is an 8-bit grey image; the destination is a 32-bit grey image (BGRA) whose 4th channel (alpha) can be ignored. The source address is not guaranteed to be 16-byte aligned, Count is a multiple of 16, and the destination address is 16-byte aligned.

  • INPUT: 8-bit single-channel grey image
  • OUTPUT: 32-bit BGRA (alpha channel ignored; see the layout sketch below)
  • COUNT: Image size is a multiple of 16
  • CPU: x86-32 (SSE2/SSSE3 allowed)
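
For reference, the pixel layout assumed throughout is little-endian BGRA with the blue byte at offset 0. This is only a sketch; the struct name is mine, not from the original post:

typedef struct BGRA {
    unsigned char Blue;   // byte 0
    unsigned char Green;  // byte 1
    unsigned char Red;    // byte 2
    unsigned char Alpha;  // byte 3, ignored (the pshufb masks below zero it)
} BGRA;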

Here is my optimized assembly code. Is there an even faster way to do this conversion?

void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    static unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };

    __asm {
        mov esi, Source
        mov edi, Destination
        mov edx, Count
        xor ecx, ecx
        movdqa xmm4, xmmword ptr [Masks + 0]
        movdqa xmm5, xmmword ptr [Masks + 16]
        movdqa xmm6, xmmword ptr [Masks + 32]
        movdqa xmm7, xmmword ptr [Masks + 48]
l1:
        movdqu xmm0, xmmword ptr [esi + ecx]
        movdqa xmm1, xmm0
        movdqa xmm2, xmm0
        movdqa xmm3, xmm0
        pshufb xmm0, xmm4
        pshufb xmm1, xmm5
        pshufb xmm2, xmm6
        pshufb xmm3, xmm7
        movntdq [edi + 0], xmm0
        movntdq [edi + 16], xmm1
        movntdq [edi + 32], xmm2
        movntdq [edi + 48], xmm3
        add edi, 64
        add ecx, 16
        cmp ecx, edx
        jb l1
    }
}
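
To see what the mask table does: pshufb treats each mask byte as a source-byte index, and any mask byte with the high bit set (0x80) zeroes the corresponding destination byte. The dword 0x80010101, stored little-endian as the bytes 01 01 01 80, therefore produces {Source[1], Source[1], Source[1], 0} for one output pixel. A scalar reference for one 16-pixel iteration (illustration only; the helper name is hypothetical):

static void ExpandGrey16(const unsigned char* src, unsigned char* dst) {
    for (int p = 0; p < 16; p++) {
        unsigned char g = src[p];
        dst[p * 4 + 0] = g;  // Blue
        dst[p * 4 + 1] = g;  // Green
        dst[p * 4 + 2] = g;  // Red
        dst[p * 4 + 3] = 0;  // Alpha, cleared like the 0x80 mask bytes do
    }
}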

There is another approach using several PUNPCKLBW and PUNPCKHBW instructions, but that seems to be slightly slower.

Update: This is the basic non-optimized algorithm:

BGRA* Destination = ...;
unsigned char* Source = ...;
for (unsigned int i = 0; i < Size; i++) {
    Destination[i].Blue  = Source[i];
    Destination[i].Green = Source[i];
    Destination[i].Red   = Source[i];
}
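
A possible scalar tweak (my sketch, not from the original post) is to build the whole BGRA dword with one multiply and a single 32-bit store; alpha ends up zero, which is fine since it is ignored:

unsigned int* Dest32 = (unsigned int*) Destination;
for (unsigned int i = 0; i < Size; i++) {
    Dest32[i] = Source[i] * 0x00010101u;  // replicate the grey byte into B, G and R
}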

PS: I also tried C code with the MS VS2008 SSE compiler intrinsics. It turned out that the compiler generated a lot of unnecessary memory moves, which made the code 10-20% slower than pure assembly.

Update 2: This is the same code using intrinsics only.

void ConvertGreyToRgb32Assembler(__m128i* Source, __m128i* Destination, unsigned int Count) {
    static const unsigned int __declspec(align(64)) Masks[] = {
        0x80000000, 0x80010101, 0x80020202, 0x80030303,
        0x80040404, 0x80050505, 0x80060606, 0x80070707,
        0x80080808, 0x80090909, 0x800a0a0a, 0x800b0b0b,
        0x800c0c0c, 0x800d0d0d, 0x800e0e0e, 0x800f0f0f
    };

    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 4));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 8));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 12));

    for (unsigned int i = 0; i < Count / 16; i++) {
        __m128i r0 = _mm_loadu_si128(Source + i);  // unaligned load: Source is not guaranteed to be 16-byte aligned

        _mm_stream_si128(Destination + (i * 4) + 0, _mm_shuffle_epi8(r0, m0));
        _mm_stream_si128(Destination + (i * 4) + 1, _mm_shuffle_epi8(r0, m1));
        _mm_stream_si128(Destination + (i * 4) + 2, _mm_shuffle_epi8(r0, m2));
        _mm_stream_si128(Destination + (i * 4) + 3, _mm_shuffle_epi8(r0, m3));
    }
}
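
A minimal calling sketch (names and the Width/Height parameters are hypothetical): the destination is allocated 16-byte aligned with _aligned_malloc, and _mm_sfence() is issued after the conversion so the non-temporal stores are ordered before the buffer is used elsewhere:

#include <emmintrin.h>  // SSE2 intrinsics, also declares _mm_sfence via xmmintrin.h
#include <malloc.h>     // _aligned_malloc / _aligned_free (MSVC)

void ConvertImage(unsigned char* Grey, unsigned int Width, unsigned int Height) {
    unsigned int Count = Width * Height;           // multiple of 16 per the constraints
    void* Bgra = _aligned_malloc(Count * 4, 16);   // 16-byte aligned destination

    ConvertGreyToRgb32Assembler((__m128i*) Grey, (__m128i*) Bgra, Count);
    _mm_sfence();                                  // order the _mm_stream_si128 stores

    // ... use the BGRA buffer ...
    _aligned_free(Bgra);
}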

Update 3: This is the compiler-generated code (beautified; Visual Studio 2012, all optimizations on):

    push ebp 
    mov ebp, esp 
    mov edx, dword ptr [ebp+8]
    movdqa xmm1, xmmword ptr ds:[Masks + 0]
    movdqa xmm2, xmmword ptr ds:[Masks + 16]
    movdqa xmm3, xmmword ptr ds:[Masks + 32]
    movdqa xmm4, xmmword ptr ds:[Masks + 48]
    push esi 
    test ecx, ecx 
    je l2
    lea esi, [ecx-1] 
    shr esi, 4 
    inc esi
l1:
    mov ecx, edx 
    movdqu xmm0, xmmword ptr [ecx] 
    mov ecx, eax 
    movdqa xmm5, xmm0 
    pshufb xmm5, xmm1 
    movdqa xmmword ptr [ecx], xmm5 
    movdqa xmm5, xmm0 
    pshufb xmm5, xmm2 
    movdqa xmmword ptr [eax+10h], xmm5 
    movdqa xmm5, xmm0
    pshufb xmm5, xmm3
    movdqa xmmword ptr [eax+20h], xmm5 
    lea ecx, [eax+30h]
    add edx, 10h 
    add eax, 40h 
    dec esi 
    pshufb xmm0, xmm4 
    movdqa xmmword ptr [ecx], xmm0 
    jne l1
l2:
    pop esi
    pop ebp
    ret

It seems that interleaving movdqa with pshufb is somewhat faster.

Update 4: This seems to be the optimal hand-optimized code:

   __asm {
        mov esi, Source
        mov edi, Destination
        mov ecx, Count
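        // note: ecx is used as an iteration counter here (one 16-byte source
        // block per pass), unlike the byte counter in the first version above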
        movdqu xmm0, xmmword ptr [esi]
        movdqa xmm4, xmmword ptr [Masks + 0]
        movdqa xmm5, xmmword ptr [Masks + 16]
        movdqa xmm6, xmmword ptr [Masks + 32]
        movdqa xmm7, xmmword ptr [Masks + 48]
l1:
        dec ecx
        lea edi, [ edi + 64 ]
        lea esi, [ esi + 16 ]
        movdqa xmm1, xmm0
        movdqa xmm2, xmm0
        movdqa xmm3, xmm0
        pshufb xmm0, xmm4
        movdqa [edi - 64], xmm0
        pshufb xmm1, xmm5
        movdqa [edi - 48], xmm1
        pshufb xmm2, xmm6
        movdqa [edi - 32], xmm2
        pshufb xmm3, xmm7
        movdqa [edi - 16], xmm3
        movdqu xmm0, xmmword ptr [esi]
        ja l1
    }

Update 5: This conversion algorithm uses the punpck instructions. However, this routine is a bit slower than the mask/pshufb approach.

for (unsigned int i = 0; i < Count; i += 16) {
    register __m128i r0 = _mm_loadu_si128(Source++);  // unaligned load: Source is not guaranteed to be 16-byte aligned
    register __m128i r1 = _mm_unpackhi_epi8(r0, r0);
    register __m128i r2 = _mm_unpacklo_epi8(r0, r0);
    register __m128i r3 = _mm_unpackhi_epi8(r1, r1);
    register __m128i r4 = _mm_unpacklo_epi8(r1, r1);
    register __m128i r5 = _mm_unpackhi_epi8(r2, r2);
    register __m128i r6 = _mm_unpacklo_epi8(r2, r2);

    _mm_store_si128(Destination++, r6);
    _mm_store_si128(Destination++, r5);
    _mm_store_si128(Destination++, r4);
    _mm_store_si128(Destination++, r3);
}
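
A lane-by-lane trace of the nested unpacks (my annotation, not part of the original post):

// r0 = { b0, b1, b2, ..., b15 }
// r2 = unpacklo(r0, r0) = { b0, b0, b1, b1, ..., b7, b7 }
// r6 = unpacklo(r2, r2) = { b0, b0, b0, b0, b1, b1, b1, b1, ..., b3, b3, b3, b3 }
// => each output pixel becomes { g, g, g, g }: B = G = R = grey and Alpha = grey
//    (the alpha copy is harmless because alpha is ignored)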

Update 6: For the sake of completeness, this is the inverse method to convert the 32-bit image back to an 8-bit grey image. It simply extracts the blue byte of each BGRA pixel, which equals the original grey value since B = G = R.

static void ConvertRgb32ToGrey(const __m128i* Source, __m128i* Destination, unsigned int Count) {
    static const unsigned char __declspec(align(64)) Masks[] = {
        0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c, 0x80, 0x80, 0x80, 0x80,
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x04, 0x08, 0x0c,
    };

    register __m128i m0 = _mm_load_si128((__m128i*) (Masks + 0));
    register __m128i m1 = _mm_load_si128((__m128i*) (Masks + 16));
    register __m128i m2 = _mm_load_si128((__m128i*) (Masks + 32));
    register __m128i m3 = _mm_load_si128((__m128i*) (Masks + 48));

    for (unsigned int i = 0; i < Count / 64; i++) {
        __m128i a = _mm_load_si128(Source + (i * 4) + 0);
        __m128i b = _mm_load_si128(Source + (i * 4) + 1);
        __m128i c = _mm_load_si128(Source + (i * 4) + 2);
        __m128i d = _mm_load_si128(Source + (i * 4) + 3);

        a = _mm_shuffle_epi8(a, m0);
        b = _mm_shuffle_epi8(b, m1);
        c = _mm_shuffle_epi8(c, m2);
        d = _mm_shuffle_epi8(d, m3);

        __m128i e = _mm_or_si128(a, b);
        __m128i f = _mm_or_si128(c, d);
        __m128i g = _mm_or_si128(e, f);

        _mm_stream_si128(Destination + i, g);
    }
}
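
For what it's worth, the same 32-to-8-bit extraction can also be done with SSE2 only (no pshufb). This is my sketch, not part of the original post; it relies on B = G = R so that taking the low (blue) byte of each pixel is enough, and the saturating packs are lossless because the values stay in 0..255:

// requires <emmintrin.h> (SSE2)
static void ConvertRgb32ToGreySse2(const __m128i* Source, __m128i* Destination, unsigned int Count) {
    const __m128i Low = _mm_set1_epi32(0x000000FF);

    for (unsigned int i = 0; i < Count / 64; i++) {
        __m128i a = _mm_and_si128(_mm_load_si128(Source + (i * 4) + 0), Low);
        __m128i b = _mm_and_si128(_mm_load_si128(Source + (i * 4) + 1), Low);
        __m128i c = _mm_and_si128(_mm_load_si128(Source + (i * 4) + 2), Low);
        __m128i d = _mm_and_si128(_mm_load_si128(Source + (i * 4) + 3), Low);

        __m128i lo = _mm_packs_epi32(a, b);   // 8 x 16-bit, pixels 0..7
        __m128i hi = _mm_packs_epi32(c, d);   // 8 x 16-bit, pixels 8..15
        _mm_stream_si128(Destination + i, _mm_packus_epi16(lo, hi));  // 16 x 8-bit
    }
}
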
Tags: visual-c++, optimization, assembly, x86, sse
asked on Stack Overflow Aug 17, 2012 by bkausbk • edited Oct 23, 2014 by bkausbk

1 Answer


Would try:

    __asm {
        mov esi, Source
        mov edi, Destination
        mov ecx, Count
        movdqu xmm0, xmmword ptr [esi]
        movdqa xmm4, xmmword ptr [Masks + 0]
        movdqa xmm5, xmmword ptr [Masks + 16]
        movdqa xmm6, xmmword ptr [Masks + 32]
        movdqa xmm7, xmmword ptr [Masks + 48]
l1:
        dec ecx
        lea edi, [ edi + 64 ]
        lea esi, [ esi + 16 ]
        movdqa xmm1, xmm0
        movdqa xmm2, xmm0
        movdqa xmm3, xmm0
        pshufb xmm0, xmm4
        pshufb xmm1, xmm5
        pshufb xmm2, xmm6
        pshufb xmm3, xmm7
        movntdq [edi - 64], xmm0
        movntdq [edi - 48], xmm1
        movntdq [edi - 32], xmm2
        movntdq [edi - 16], xmm3
        movdqu xmm0, xmmword ptr [esi]
        ja l1
    }

Haven't benchmarked it though; assumptions behind these changes:

  1. the movdqu xmm0, ... load latency can be hidden a little better within the loop (your code loads xmm0 and immediately follows it with an instruction that uses that register)
  2. the add ops on two registers as well as the cmp aren't really necessary; address generation (lea) and the implicit zero test by dec/ja can be used instead. That way there are no EFLAGS dependencies caused by operations on ecx/esi/edi, since the only flag-setting ALU op in the loop is the decrement of the loop counter.

In the end, this is likely load/store bound in any case, so the arithmetic is "free game"; I therefore expect little difference, even with the arguments as given.

If the input is large, it would make sense to strip off the "unaligned head/tail", i.e. to handle the first/last [0..15] bytes with a Duff's device and run the main loop with movdqa.

Edit:

Running your intrinsics sources through gcc -msse4.2 -O8 -c (GCC 4.7.1) gives the following assembly:

Disassembly of section .text:

0000000000000000 <ConvertGreyToRgb32Assembler>:
   0:   85 d2                   test   edx,edx
   2:   74 76                   je     7a <ConvertGreyToRgb32Assembler+0x7a>
   4:   66 0f 6f 2d 00 00 00 00         movdqa xmm5,XMMWORD PTR [rip+0x0]
        # c <ConvertGreyToRgb32Assembler+0xc>
   c:   48 89 f8                mov    rax,rdi
   f:   66 0f 6f 25 00 00 00 00         movdqa xmm4,XMMWORD PTR [rip+0x0]
        # 17 <ConvertGreyToRgb32Assembler+0x17>
  17:   66 0f 6f 1d 00 00 00 00         movdqa xmm3,XMMWORD PTR [rip+0x0]
        # 1f <ConvertGreyToRgb32Assembler+0x1f>
  1f:   66 0f 6f 15 00 00 00 00         movdqa xmm2,XMMWORD PTR [rip+0x0]
        # 27 <ConvertGreyToRgb32Assembler+0x27>
  27:   66 0f 1f 84 00 00 00 00 00      nop    WORD PTR [rax+rax*1+0x0]
  30:   f3 0f 6f 00             movdqu xmm0,XMMWORD PTR [rax]
  34:   48 89 f1                mov    rcx,rsi
  37:   48 83 c0 10             add    rax,0x10
  3b:   66 0f 6f c8             movdqa xmm1,xmm0
  3f:   66 0f 38 00 cd          pshufb xmm1,xmm5
  44:   66 0f e7 0e             movntdq XMMWORD PTR [rsi],xmm1
  48:   66 0f 6f c8             movdqa xmm1,xmm0
  4c:   66 0f 38 00 cc          pshufb xmm1,xmm4
  51:   66 0f e7 4e 10          movntdq XMMWORD PTR [rsi+0x10],xmm1
  56:   66 0f 6f c8             movdqa xmm1,xmm0
  5a:   66 0f 38 00 c2          pshufb xmm0,xmm2
  5f:   66 0f 38 00 cb          pshufb xmm1,xmm3
  64:   66 0f e7 4e 20          movntdq XMMWORD PTR [rsi+0x20],xmm1
  69:   66 0f e7 41 30          movntdq XMMWORD PTR [rcx+0x30],xmm0
  6e:   89 c1                   mov    ecx,eax
  70:   29 f9                   sub    ecx,edi
  72:   48 83 c6 40             add    rsi,0x40
  76:   39 ca                   cmp    edx,ecx
  78:   77 b6                   ja     30 <ConvertGreyToRgb32Assembler+0x30>
  7a:   f3 c3                   repz ret

This reminds me extremely strongly of your initial assembly code. If MSVC creates something significantly worse than that, I'd say it's a bug/limitation in the compiler (version) you used.

answered on Stack Overflow Aug 17, 2012 by FrankH. • edited Aug 17, 2012 by FrankH.

User contributions licensed under CC BY-SA 3.0