Fastest way to partially load an array of uint8_t or uint16_t into an __m256i register and fill the remaining bits with 1s, without AVX512


Basically what I am trying to do is load an array of either uint8_t or uint16_t that is smaller than an __m256i register into an __m256i register, and fill all the bits in the destination __m256i that are not filled by the array with 1s.

An example of what I want with AVX512 would be:

#define ARR_SIZE_EPI8 (some_constant_value) /* < 32 */

// partial load for uint8_t
__m256i partial_load_epi8(uint8_t * arr) {
    __m256i ones = _mm256_set1_epi64x(-1);
    return _mm256_mask_loadu_epi8(ones, (1 << ARR_SIZE_EPI8) - 1, arr);
}


#define ARR_SIZE_EPI16 (some_constant_value) /* < 16 */

// partial load for uint16_t
__m256i partial_load_epi16(uint16_t * arr) {
    __m256i ones = _mm256_set1_epi64x(-1);
    return _mm256_mask_loadu_epi16(ones, (1 << ARR_SIZE_EPI16) - 1, arr);
}

Using only AVX2, if ARR_SIZE * sizeof(T) % sizeof(int) == 0 I can use:

__m256i partial_load_epi16_avx2(uint16_t * arr) {
    __m256i mask_vec = _mm256_set_epi32( /* proper values for ARR_SIZE_EPI16 elements */ );
    __m256i fill_vec = _mm256_set_epi16( /* 1s in lanes >= ARR_SIZE_EPI16, 0s below */ );
    __m256i load_vec = _mm256_maskload_epi32((int32_t *)arr, mask_vec);
    return _mm256_or_si256(load_vec, fill_vec);
}
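
For concreteness, with ARR_SIZE_EPI16 == 8 (16 bytes, i.e. exactly four dwords) the placeholders work out to something like this:

__m256i partial_load_epi16_avx2_8(uint16_t * arr) {
    // maskload loads a dword only where the mask's high bit is set:
    // dwords 0-3 come from memory, dwords 4-7 stay zero (and are not touched, so they cannot fault)
    __m256i mask_vec = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    // all-ones in the 8 upper epi16 lanes, zero in the 8 lanes covered by the load
    __m256i fill_vec = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1,
                                        0, 0, 0, 0, 0, 0, 0, 0);
    __m256i load_vec = _mm256_maskload_epi32((int32_t *)arr, mask_vec);
    return _mm256_or_si256(load_vec, fill_vec);
}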

This uses a fair amount of .rodata but doesn't seem prohibitively expensive. On the other hand, when ARR_SIZE * sizeof(T) % sizeof(int) != 0, i.e. with uint16_t and an odd ARR_SIZE_EPI16, the best I've been able to come up with is:

__m256i partial_load_epi16_avx2_not_aligned(uint16_t * arr) {
    __m256i mask_vec = _mm256_set_epi32( /* proper values for ARR_SIZE_EPI16 elements */ );
    uint32_t tmp = 0xffff0000 | arr[ARR_SIZE_EPI16 - 1];
    __m256i fill_vec = _mm256_set_epi32( /* 1s for the dwords above ARR_SIZE_EPI16 / 2 */ tmp /* , then 0s */ );
    __m256i load_vec = _mm256_maskload_epi32((int32_t *)arr, mask_vec);
    return _mm256_or_si256(load_vec, fill_vec);
}
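
For example, with ARR_SIZE_EPI16 == 5 (10 bytes: two full dwords plus one straddling uint16_t) that works out to roughly:

__m256i partial_load_epi16_avx2_not_aligned_5(uint16_t * arr) {
    // only the two fully-covered dwords are loaded
    __m256i mask_vec = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
    // last valid element goes in the low half of dword 2, ones in its high half
    uint32_t tmp = 0xffff0000u | arr[4];
    // dwords 7..3 all-ones, dword 2 = tmp, dwords 1..0 zero (they come from the load)
    __m256i fill_vec = _mm256_set_epi32(-1, -1, -1, -1, -1, (int32_t)tmp, 0, 0);
    __m256i load_vec = _mm256_maskload_epi32((int32_t *)arr, mask_vec);
    return _mm256_or_si256(load_vec, fill_vec);
}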

// or

__m256i partial_load_epi16_avx_not_aligned(uint16_t * arr) {
    __m256i fill_v = _mm256_set1_epi64x(-1);
    __m256i pload = _mm256_maskload_epi32((int32_t *)arr, _mm256_set_epi32( /* Assume proper mask */ ));
    fill_v = _mm256_insert_epi16(fill_v, arr[ARR_SIZE_EPI16 - 1], ARR_SIZE_EPI16 - 1);
    return _mm256_blend_epi32(fill_v, pload, (1 << (ARR_SIZE_EPI16 / 2)) - 1);
}
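
Again with ARR_SIZE_EPI16 == 5 filled in, this variant looks roughly like:

__m256i partial_load_epi16_avx_not_aligned_5(uint16_t * arr) {
    __m256i fill_v = _mm256_set1_epi64x(-1);
    // load only the two fully-covered dwords; the rest of pload is zero
    __m256i pload  = _mm256_maskload_epi32((int32_t *)arr,
                                           _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1));
    // drop the last valid element into lane 4 of the all-ones vector
    fill_v = _mm256_insert_epi16(fill_v, arr[4], 4);
    // dwords 0-1 from pload, everything else (including lane 4's dword) from fill_v
    return _mm256_blend_epi32(fill_v, pload, 0x3);
}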

The second variant adds a vextracti128, vpinsrw and vinserti128. I'm wondering if there is a better approach that doesn't have so much overhead.

Thank you!

Edit: The memory will be provided by the user and I cannot make any assumptions about whether memory before the start of arr or after arr + ARR_SIZE is accessible.

Use case: implementing a sorting network. The instructions to implement a sorting network for a power-of-2 size are often significantly more efficient than for a non-power-of-2 size (especially for 1-byte / 2-byte values), so what I am trying to do is load the user's array and then pad it with the max value (just doing the unsigned case for now) so that I can round the sorting network size up to the next power of 2.
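
To illustrate why padding with the max value is harmless: each compare-exchange of the network can be done with unsigned min/max, and a 0xffff padding lane always loses the min and wins the max, so the padding simply sinks to the high end and never disturbs the real elements. A minimal sketch of one such step (not the actual network):

// one compare-exchange step on unsigned 16-bit lanes
static inline void compare_exchange_epu16(__m256i * lo, __m256i * hi) {
    __m256i mn = _mm256_min_epu16(*lo, *hi);
    __m256i mx = _mm256_max_epu16(*lo, *hi);
    *lo = mn;   // padding lanes (0xffff) never end up here
    *hi = mx;   // ... they always collect here
}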

Edit: VPBLENDD and VPBLENDVB ARE NOT REPLACEMENTS FOR VMOVDQU

Edit: Interestingly enough, the best solution I have found is to inline vpblendvb with the array as operand 3 (a full-width memory operand, so it can read past the end of the array). DO NOT DO THIS

Edit2:

Test program to see if vpblendd and vpblendvb cause extra page faults.

#include <immintrin.h>
#include <stdint.h>
#include <sys/mman.h>
#include <utility>

#define N 5


template<uint32_t... e>
constexpr __m256i inline __attribute__((always_inline))
load_N_kernel2(std::integer_sequence<uint32_t, e...> _e) {
    return _mm256_set_epi8(e...);
}


template<uint32_t... e>
constexpr __m256i inline __attribute__((always_inline))
load_N_kernel(std::integer_sequence<uint32_t, e...> _e) {
    return load_N_kernel2(
        std::integer_sequence<uint32_t, ((((31 - e) / 4) < N) << 7)...>{});
}

constexpr __m256i inline __attribute__((always_inline)) load_N() {
    return load_N_kernel(std::make_integer_sequence<uint32_t, 32>{});
}



__m256i __attribute__((noinline)) mask_load(uint32_t * arr) {
    // src value doesn't matter here; only the low N dwords are stored afterwards
    __m256i tmp = _mm256_undefined_si256();
    return _mm256_mask_loadu_epi32(tmp, (1 << N) - 1, arr);
}

__m256i __attribute__((noinline)) blend_load(uint32_t * arr) {
    // vpblendd with a memory source: low N dwords come from (arr), the rest from tmp
    __m256i tmp = _mm256_set1_epi32(-1);
    asm volatile("vpblendd %[m], (%[arr]), %[tmp], %[tmp]\n\t"
                 : [ tmp ] "+x"(tmp)
                 : [ arr ] "r"(arr), [ m ] "i"(((1 << N) - 1))
                 :);
    return tmp;
}


__m256i __attribute__((noinline)) blend_load_epi8(uint32_t * arr) {
    __m256i tmp = _mm256_set1_epi8(uint8_t(0xff));
    __m256i mask = load_N();
    asm volatile("vpblendvb %[mask], (%[arr]), %[tmp], %[tmp]\n\t"
                 : [ tmp ] "+x"(tmp)
                 : [ arr ] "r"(arr), [ mask ] "x"(mask)
                 :);
    return tmp;
}


void __attribute__((noinline)) mask_store(uint32_t * arr, __m256i v) {
    return _mm256_mask_storeu_epi32(arr, (1 << N) - 1, v);
}



#define NPAGES (1000)
#define END_OF_PAGE (1024 - N)

#ifndef LOAD_METHOD
#define LOAD_METHOD blend_load
#endif


    
int
main() {
    uint32_t * addr = (uint32_t *)
        mmap(NULL, NPAGES * 4096, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

    for(uint32_t i = 0; i < NPAGES; i += 2) {
        mask_store(addr + 1024 * i + END_OF_PAGE, LOAD_METHOD(addr + END_OF_PAGE));
    }
}

Ran: $> perf stat -e page-faults,page-faults ./partial_load

The result is the same with LOAD_METHOD set to blend_load, mask_load, or blend_load_epi8:

 Performance counter stats for './partial_load':

               548      page-faults                                                 
               548      page-faults                                                 

       0.002155974 seconds time elapsed

       0.000000000 seconds user
       0.002276000 seconds sys

Edit3: Note this was compiled with clang, which does not use vpblendd to implement _mm256_mask_loadu_epi32.

Here is the assembly of the function:

0000000000401130 <_Z9mask_loadPj>:
  401130:   b0 1f                   mov    $0x1f,%al
  401132:   c5 fb 92 c8             kmovd  %eax,%k1
  401136:   62 f1 7e a9 6f 07       vmovdqu32 (%rdi),%ymm0{%k1}{z}
  40113c:   c3                      retq   
  40113d:   0f 1f 00                nopl   (%rax)
Tags: c, x86, avx, micro-optimization, avx2
asked on Stack Overflow Oct 27, 2020 by Noah • edited Oct 30, 2020 by Noah

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0