A better 8x8 bytes matrix transpose with SSE?
Posting this as an answer. I'm also going to change the title of the question from "... with SSE" to "... with SIMD" due to some answers and comments received so far.
I succeeded in transposing the matrix with AVX2 in only 8 instructions, or 10 including the loads/stores (excluding the loads of the shuffle masks). EDIT: I found a shorter version. See below. This is the case where the matrices are all contiguous in memory, so direct loads/stores can be used.
Here's the C code:
/*
 * Transpose one 8x8 byte matrix with AVX2.
 *
 * src: 64 contiguous bytes, row-major (8 rows of 8 bytes); read with
 *      unaligned loads.
 * dst: 64 contiguous bytes receiving the transposed matrix; written with
 *      unaligned stores.
 */
void tran8x8b_AVX2(char *src, char *dst) {
    /* Dword order for vpermd: the left half (columns 0-3) of each of the four
     * loaded rows goes to the low 128-bit lane, the right half (columns 4-7)
     * to the high lane. */
    const __m256i split_halves = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
    /* Per-lane 4x4 byte transpose: output dword k holds byte k of each of the
     * lane's four input dwords. */
    const __m256i transpose_4x4 = _mm256_setr_epi8(
        0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15,
        0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15);

    const __m256i rows_0_3 = _mm256_loadu_si256((const __m256i *)(src + 0));
    const __m256i rows_4_7 = _mm256_loadu_si256((const __m256i *)(src + 32));

    const __m256i top = _mm256_shuffle_epi8(
        _mm256_permutevar8x32_epi32(rows_0_3, split_halves), transpose_4x4);
    const __m256i bottom = _mm256_shuffle_epi8(
        _mm256_permutevar8x32_epi32(rows_4_7, split_halves), transpose_4x4);

    /* Pair the rows 0-3 part of each column with its rows 4-7 part.  Per lane
     * this yields output rows {0,1 | 4,5} and {2,3 | 6,7}. */
    const __m256 rows_01_45 = _mm256_castsi256_ps(_mm256_unpacklo_epi32(top, bottom));
    const __m256 rows_23_67 = _mm256_castsi256_ps(_mm256_unpackhi_epi32(top, bottom));

    /* 0x20 joins the two low lanes (output rows 0-3), 0x31 the two high lanes
     * (output rows 4-7). */
    const __m256i out_0_3 = _mm256_castps_si256(
        _mm256_permute2f128_ps(rows_01_45, rows_23_67, 0x20));
    const __m256i out_4_7 = _mm256_castps_si256(
        _mm256_permute2f128_ps(rows_01_45, rows_23_67, 0x31));

    _mm256_storeu_si256((__m256i *)(dst + 0), out_0_3);
    _mm256_storeu_si256((__m256i *)(dst + 32), out_4_7);
}
GCC was smart enough to perform a permutation during AVX load, saving two instructions. Here's the compiler output:
# GCC output (Intel syntax) for the contiguous tran8x8b_AVX2 above.
# rdi and rsi are presumably src and dst (System V x86-64 argument order).
tran8x8b_AVX2(char*, char*):
# .LC0 is used as the vpermd index vector, .LC1 as the vpshufb control.
vmovdqa ymm1, YMMWORD PTR .LC0[rip]
vmovdqa ymm2, YMMWORD PTR .LC1[rip]
# Both 32-byte loads are folded into the vpermd memory operands, so no
# separate load instructions appear. The second vpermd overwrites ymm1,
# which held the index vector.
vpermd ymm0, ymm1, YMMWORD PTR [rdi]
vpermd ymm1, ymm1, YMMWORD PTR [rdi+32]
# In-lane byte shuffle of both halves.
vpshufb ymm0, ymm0, ymm2
vpshufb ymm1, ymm1, ymm2
# Interleave the dwords of the two halves within each 128-bit lane.
vpunpckldq ymm2, ymm0, ymm1
vpunpckhdq ymm0, ymm0, ymm1
# ymm1 = {low lane of ymm2, low lane of ymm0}; GCC chose vinsertf128 for the
# permute2f128 with immediate 32.
vinsertf128 ymm1, ymm2, xmm0, 1
# ymm0 = {high lane of ymm2, high lane of ymm0}.
vperm2f128 ymm0, ymm2, ymm0, 49
# Unaligned stores of the two 32-byte results.
vmovdqu YMMWORD PTR [rsi], ymm1
vmovdqu YMMWORD PTR [rsi+32], ymm0
vzeroupper
ret
It emitted the vzeroupper instruction with -O3, but going down to -O1 removes this.
In case of my original problem (a large matrix and I'm zooming in to an 8x8 part of it), handling strides destroys the output in a pretty bad way:
/* Read the 8 bytes of one matrix row.  _mm_loadl_epi64 has no alignment
 * requirement and avoids dereferencing a char buffer through a uint64_t
 * pointer. */
static inline long long tran8x8b_load_row(const char *row) {
    return _mm_cvtsi128_si64(_mm_loadl_epi64((const __m128i *)row));
}

/* Write the two 8-byte rows held in `pair` to `out` (low half) and
 * `out + stride` (high half) with unaligned 64-bit stores. */
static inline void tran8x8b_store_row_pair(char *out, int stride, __m128i pair) {
    _mm_storel_epi64((__m128i *)out, pair);
    _mm_storel_epi64((__m128i *)(out + stride), _mm_unpackhi_epi64(pair, pair));
}

/*
 * Transpose an 8x8 byte tile that is part of a larger matrix, using AVX2.
 *
 * src:       first byte of the source tile; row i starts at src + i * srcStride
 * dst:       first byte of the destination tile; row i starts at
 *            dst + i * dstStride
 * srcStride: distance in bytes between consecutive source rows
 * dstStride: distance in bytes between consecutive destination rows
 *
 * Exactly 8 bytes are read from each of the 8 source rows and exactly 8 bytes
 * are written to each of the 8 destination rows.  All 64 source bytes are
 * loaded before the first store.
 */
void tran8x8b_AVX2(char *src, char *dst, int srcStride, int dstStride) {
    /* Same constants as the contiguous version: vpermd moves the left half
     * (columns 0-3) of each row to the low lane and the right half to the
     * high lane; vpshufb then transposes the 4x4 byte block in each lane. */
    const __m256i perm = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
    const __m256i tm = _mm256_setr_epi8(
        0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15,
        0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15);

    /* Offsets are computed in 64 bits so i * stride cannot overflow int. */
    const long long sstep = srcStride;
    const long long dstep = dstStride;

    long long row[8];
    for (int i = 0; i < 8; i++) {
        row[i] = tran8x8b_load_row(src + i * sstep);
    }

    const __m256i load0 = _mm256_set_epi64x(row[3], row[2], row[1], row[0]);
    const __m256i load1 = _mm256_set_epi64x(row[7], row[6], row[5], row[4]);

    const __m256i transpose0 =
        _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(load0, perm), tm);
    const __m256i transpose1 =
        _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(load1, perm), tm);

    /* unpack0 holds output rows {0,1 | 4,5}, unpack1 holds {2,3 | 6,7}.  The
     * cross-lane permute of the contiguous version is skipped because every
     * row is stored individually anyway. */
    const __m256i unpack0 = _mm256_unpacklo_epi32(transpose0, transpose1);
    const __m256i unpack1 = _mm256_unpackhi_epi32(transpose0, transpose1);

    tran8x8b_store_row_pair(dst + 0 * dstep, dstStride, _mm256_castsi256_si128(unpack0));
    tran8x8b_store_row_pair(dst + 2 * dstep, dstStride, _mm256_castsi256_si128(unpack1));
    tran8x8b_store_row_pair(dst + 4 * dstep, dstStride, _mm256_extracti128_si256(unpack0, 1));
    tran8x8b_store_row_pair(dst + 6 * dstep, dstStride, _mm256_extracti128_si256(unpack1, 1));
}
Here's the compiler output:
# GCC output (Intel syntax) for the strided tran8x8b_AVX2 above.
# Presumably rdi = src, rsi = dst, edx = srcStride, ecx = dstStride
# (System V x86-64 argument order).
tran8x8b_AVX2(char*, char*, int, int):
# Sign-extend the int strides to 64 bits (rdx here, rcx a few lines below).
movsx rdx, edx
# Row gathering: each vmovq/vpinsrq pair packs two 8-byte rows into one xmm
# register; the lea chain steps the row address by rdx each time.
vmovq xmm5, QWORD PTR [rdi]
lea r9, [rdi+rdx]
vmovdqa ymm3, YMMWORD PTR .LC0[rip]
movsx rcx, ecx
lea r11, [r9+rdx]
vpinsrq xmm0, xmm5, QWORD PTR [r9], 1
lea r10, [r11+rdx]
vmovq xmm4, QWORD PTR [r11]
vpinsrq xmm1, xmm4, QWORD PTR [r10], 1
lea r8, [r10+rdx]
lea rax, [r8+rdx]
vmovq xmm7, QWORD PTR [r8]
vmovq xmm6, QWORD PTR [rax+rdx]
vpinsrq xmm2, xmm7, QWORD PTR [rax], 1
# ymm1 = rows 0-3 (two xmm pairs merged).
vinserti128 ymm1, ymm0, xmm1, 0x1
vpinsrq xmm0, xmm6, QWORD PTR [rax+rdx*2], 1
# rax = address of destination row 1.
lea rax, [rsi+rcx]
vpermd ymm1, ymm3, ymm1
# ymm0 = rows 4-7.
vinserti128 ymm0, ymm2, xmm0, 0x1
# Same vpermd / vpshufb / vpunpck sequence as the contiguous version, with
# .LC0 as the vpermd index vector and .LC1 as the vpshufb control.
vmovdqa ymm2, YMMWORD PTR .LC1[rip]
vpshufb ymm1, ymm1, ymm2
vpermd ymm0, ymm3, ymm0
vpshufb ymm0, ymm0, ymm2
vpunpckldq ymm2, ymm1, ymm0
vpunpckhdq ymm0, ymm1, ymm0
# Row scattering: every destination row is written as one qword, using
# vmovq for the low qword of an xmm register, vpextrq for the high qword and
# vextracti128 to reach the upper 128-bit lanes. rax advances by rcx per row.
vmovdqa xmm1, xmm2
vmovq QWORD PTR [rsi], xmm1
vpextrq QWORD PTR [rax], xmm1, 1
vmovdqa xmm1, xmm0
add rax, rcx
vextracti128 xmm0, ymm0, 0x1
vmovq QWORD PTR [rax], xmm1
add rax, rcx
vpextrq QWORD PTR [rax], xmm1, 1
add rax, rcx
vextracti128 xmm1, ymm2, 0x1
vmovq QWORD PTR [rax], xmm1
add rax, rcx
vpextrq QWORD PTR [rax], xmm1, 1
vmovq QWORD PTR [rax+rcx], xmm0
vpextrq QWORD PTR [rax+rcx*2], xmm0, 1
vzeroupper
ret
However, this does not seem like a big deal when compared against the output of my original code.
EDIT: I found a shorter version. 4 instructions in total, 8 counting both load/stores. This is possible because I read the matrix in a different way, hiding some "shuffles" in the "gather" instruction during load. Also, note that the final permutation is needed to perform the store because AVX2 doesn't have a "scatter" instruction. Having a scatter instruction would bring everything down to only 2 instructions. Also, note that I can handle the src stride without hassle by changing the contents of the vindex vector.
Unfortunately this tran8x8b_AVX2_v2 seems to be slower than the previous one. Here's the code:
/*
 * Transpose one contiguous 8x8 byte matrix using AVX2 gathers.
 *
 * src1: 64 bytes, row-major (8 rows of 8 bytes).
 * dst1: 64 bytes receiving the transposed matrix (unaligned stores).
 *
 * Each gather fetches one dword per row at byte offsets 0, 8, ..., 56, so the
 * first gather collects columns 0-3 of all eight rows and the second, based
 * at src1 + 4, collects columns 4-7.
 */
void tran8x8b_AVX2_v2(char *src1, char *dst1) {
    const __m256i row_offsets = _mm256_setr_epi32(0, 8, 16, 24, 32, 40, 48, 56);
    /* Per-lane 4x4 byte transpose. */
    const __m256i transpose_4x4 = _mm256_setr_epi8(
        0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15,
        0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15);
    /* Interleave the dwords of the low and high lanes so that each output row
     * is made of its rows 0-3 part followed by its rows 4-7 part. */
    const __m256i interleave_lanes = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);

    __m256i left = _mm256_i32gather_epi32((int *)src1, row_offsets, 1);
    __m256i right = _mm256_i32gather_epi32((int *)(src1 + 4), row_offsets, 1);

    left = _mm256_shuffle_epi8(left, transpose_4x4);
    right = _mm256_shuffle_epi8(right, transpose_4x4);

    left = _mm256_permutevar8x32_epi32(left, interleave_lanes);
    right = _mm256_permutevar8x32_epi32(right, interleave_lanes);

    _mm256_storeu_si256((__m256i *)(dst1 + 0), left);
    _mm256_storeu_si256((__m256i *)(dst1 + 32), right);
}
And here's the output of the compiler:
# GCC output (Intel syntax) for tran8x8b_AVX2_v2.
# NOTE(review): the gathers below scale the index vector by 8, whereas the C
# source above passes scale 1 with byte offsets 0, 8, ..., 56. The listing
# presumably comes from a revision that used indices 0..7; confirm.
tran8x8b_AVX2_v2(char*, char*):
# All-ones gather mask. It is duplicated into ymm4 because vpgatherdd
# clears its mask register, so each gather needs its own copy.
vpcmpeqd ymm3, ymm3, ymm3
vmovdqa ymm2, YMMWORD PTR .LC0[rip]
vmovdqa ymm4, ymm3
# Two gathers of one dword per index: base rdi+4 into ymm0, base rdi into ymm1.
vpgatherdd ymm0, DWORD PTR [rdi+4+ymm2*8], ymm3
vpgatherdd ymm1, DWORD PTR [rdi+ymm2*8], ymm4
# .LC1 is the vpshufb control.
vmovdqa ymm2, YMMWORD PTR .LC1[rip]
vpshufb ymm1, ymm1, ymm2
vpshufb ymm0, ymm0, ymm2
# .LC2 is the vpermd index vector for the final cross-lane permutation.
vmovdqa ymm2, YMMWORD PTR .LC2[rip]
vpermd ymm1, ymm2, ymm1
vpermd ymm0, ymm2, ymm0
vmovdqu YMMWORD PTR [rsi], ymm1
vmovdqu YMMWORD PTR [rsi+32], ymm0
vzeroupper
ret
Apart from the loads, stores and pinsrq instructions needed to read from and write to memory (possibly with a stride not equal to 8 bytes), you can do the transpose with only 12 instructions (this code can easily be used in combination with Z boson's test code):
/*
 * Transpose one contiguous 8x8 byte matrix with 12 shuffle-type instructions
 * (4 shufps, 4 pshufb, 4 punpck*dq); requires SSSE3.
 *
 * A: 64 bytes, row-major source (unaligned loads).
 * B: 64 bytes receiving the transposed matrix (unaligned stores).
 */
void tran8x8b_SSE_v2(char *A, char *B) {
    /* 4x4 byte transpose: output dword k holds byte k of each input dword. */
    const __m128i transpose_4x4 =
        _mm_setr_epi8(0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15);

    /* Each register holds two rows; viewed as floats so shufps can be used. */
    const __m128 rows01 = _mm_castsi128_ps(_mm_loadu_si128((const __m128i *)(A + 0)));
    const __m128 rows23 = _mm_castsi128_ps(_mm_loadu_si128((const __m128i *)(A + 16)));
    const __m128 rows45 = _mm_castsi128_ps(_mm_loadu_si128((const __m128i *)(A + 32)));
    const __m128 rows67 = _mm_castsi128_ps(_mm_loadu_si128((const __m128i *)(A + 48)));

    /* Dwords 0 and 2 of each operand are the left halves (columns 0-3) of its
     * two rows; dwords 1 and 3 are the right halves (columns 4-7). */
    const __m128i left_0_3 =
        _mm_castps_si128(_mm_shuffle_ps(rows01, rows23, _MM_SHUFFLE(2, 0, 2, 0)));
    const __m128i left_4_7 =
        _mm_castps_si128(_mm_shuffle_ps(rows45, rows67, _MM_SHUFFLE(2, 0, 2, 0)));
    const __m128i right_0_3 =
        _mm_castps_si128(_mm_shuffle_ps(rows01, rows23, _MM_SHUFFLE(3, 1, 3, 1)));
    const __m128i right_4_7 =
        _mm_castps_si128(_mm_shuffle_ps(rows45, rows67, _MM_SHUFFLE(3, 1, 3, 1)));

    const __m128i c03_top = _mm_shuffle_epi8(left_0_3, transpose_4x4);
    const __m128i c03_bot = _mm_shuffle_epi8(left_4_7, transpose_4x4);
    const __m128i c47_top = _mm_shuffle_epi8(right_0_3, transpose_4x4);
    const __m128i c47_bot = _mm_shuffle_epi8(right_4_7, transpose_4x4);

    /* Glue the rows 0-3 part of each column to its rows 4-7 part. */
    _mm_storeu_si128((__m128i *)(B + 0), _mm_unpacklo_epi32(c03_top, c03_bot));
    _mm_storeu_si128((__m128i *)(B + 16), _mm_unpackhi_epi32(c03_top, c03_bot));
    _mm_storeu_si128((__m128i *)(B + 32), _mm_unpacklo_epi32(c47_top, c47_bot));
    _mm_storeu_si128((__m128i *)(B + 48), _mm_unpackhi_epi32(c47_top, c47_bot));
}
Here we use the 32 bit floating point shuffle which is more flexible than the epi32
shuffle.
The casts do not generate extra instructions (code generated with gcc 5.4):
# gcc 5.4 output (AT&T syntax, destination operand last) for tran8x8b_SSE_v2.
# No instruction corresponds to the _mm_cast* intrinsics.
tran8x8b_SSE_v2:
.LFB4885:
.cfi_startproc
# Four unaligned 16-byte loads from the source.
vmovdqu 48(%rdi), %xmm5
vmovdqu 32(%rdi), %xmm2
vmovdqu 16(%rdi), %xmm0
vmovdqu (%rdi), %xmm1
# Four vshufps: immediates 136 (0b10001000) and 221 (0b11011101).
vshufps $136, %xmm5, %xmm2, %xmm4
vshufps $221, %xmm5, %xmm2, %xmm2
# .LC6 is the vpshufb control; it reuses xmm5 once that input is dead.
vmovdqa .LC6(%rip), %xmm5
vshufps $136, %xmm0, %xmm1, %xmm3
vshufps $221, %xmm0, %xmm1, %xmm1
# Four byte shuffles with the same control.
vpshufb %xmm5, %xmm3, %xmm3
vpshufb %xmm5, %xmm1, %xmm0
vpshufb %xmm5, %xmm4, %xmm4
vpshufb %xmm5, %xmm2, %xmm1
# Four dword interleaves produce the output rows.
vpunpckldq %xmm4, %xmm3, %xmm5
vpunpckldq %xmm1, %xmm0, %xmm2
vpunpckhdq %xmm4, %xmm3, %xmm3
vpunpckhdq %xmm1, %xmm0, %xmm0
# Four unaligned 16-byte stores to the destination.
vmovups %xmm5, (%rsi)
vmovups %xmm3, 16(%rsi)
vmovups %xmm2, 32(%rsi)
vmovups %xmm0, 48(%rsi)
ret
.cfi_endproc
On some, but not all, older cpus there might be a small bypass delay (between 0 and 2 cycles) for moving data between the
integer and the floating point units. This increases the latency of the function, but it does not necessarily affect the
throughput of the code.
A simple latency test with 1e9 transpositions:
/* Latency test: 500000000 iterations of two transposes = 1e9 transposes.
 * Each call reads the buffer written by the previous call (A -> C -> A), so
 * the calls form one dependency chain. A is printed afterwards, presumably so
 * the result is observably used. */
for (int i=0;i<500000000;i++){
tran8x8b_SSE(A,C);
tran8x8b_SSE(C,A);
}
print8x8b(A);
This takes about 5.5 seconds (19.7e9 cycles) with tran8x8b_SSE and 4.5 seconds (16.0e9 cycles) with tran8x8b_SSE_v2 (Intel core i5-6500). Note that the load and stores were not eliminated by the compiler, although the functions were inlined in the for loop.
Update: AVX2-128 / SSE 4.1 solution with blends.
The 'shuffles' (unpack, shuffle) are handled by port 5, with 1 instruction per cpu cycle on modern cpus. Sometimes it pays off to replace one 'shuffle' with two blends. On Skylake the 32 bit blend instructions can run on either port 0, 1 or 5.
Unfortunately, _mm_blend_epi32
is only AVX2-128. An efficient SSE 4.1 alternative is _mm_blend_ps
in combination
with a few casts (which are usually free). The 12 'shuffles' are replaced by
8 shuffles in combination with 8 blends.
The simple latency test now runs in about 3.6 seconds (13e9 cpu cycles), which is 18 % faster than the results with tran8x8b_SSE_v2
.
Code:
/*
 * Transpose one contiguous 8x8 byte matrix (64 bytes at A) into the 64 bytes
 * at B, using 8 shuffle-type instructions (4 dword shuffles, 4 pshufb) plus
 * 8 dword blends.
 *
 * Layout of this listing: the code on the left is the AVX2-128 version
 * (_mm_blend_epi32); the trailing comment on each line is the equivalent line
 * of the SSE 4.1 version (_mm_blend_ps on __m128 values).
 */
/* AVX2-128 version, sse 4.1 version see ----------------> SSE 4.1 version of tran8x8b_AVX2_128() */
void tran8x8b_AVX2_128(char *A, char *B) { /* void tran8x8b_SSE4_1(char *A, char *B) { */
/* One pshufb control per intermediate register: after the blends below, each
 * of T0..T3 holds its row pieces in a different order, so each control does
 * the 4x4 byte transpose and places the resulting column dwords where the
 * second round of blends expects them. */
__m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3,
13, 5, 9, 1, 14, 6,10, 2, 12, 4, 8, 0); /* __m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3, 13, 5, 9, 1, 14, 6,10, 2, 12, 4, 8, 0); */
__m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1,
15, 7,11, 3, 12, 4, 8, 0, 14, 6,10, 2); /* __m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1, 15, 7,11, 3, 12, 4, 8, 0, 14, 6,10, 2); */
__m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7,
9, 1,13, 5, 10, 2,14, 6, 8, 0,12, 4); /* __m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7, 9, 1,13, 5, 10, 2,14, 6, 8, 0,12, 4); */
__m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5,
11, 3,15, 7, 8, 0,12, 4, 10, 2,14, 6); /* __m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5, 11, 3,15, 7, 8, 0,12, 4, 10, 2,14, 6); */
__m128i B0, B1, B2, B3, T0, T1, T2, T3; /* __m128 B0, B1, B2, B3, T0, T1, T2, T3; */
/* */
/* Unaligned loads: each register receives two 8-byte rows. */
B0 = _mm_loadu_si128((__m128i*)&A[ 0]); /* B0 = _mm_loadu_ps((float*)&A[ 0]); */
B1 = _mm_loadu_si128((__m128i*)&A[16]); /* B1 = _mm_loadu_ps((float*)&A[16]); */
B2 = _mm_loadu_si128((__m128i*)&A[32]); /* B2 = _mm_loadu_ps((float*)&A[32]); */
B3 = _mm_loadu_si128((__m128i*)&A[48]); /* B3 = _mm_loadu_ps((float*)&A[48]); */
/* */
/* 0b10110001 swaps the two dwords inside each 64-bit half of B1 and B3.
 * A blend with mask 0b1010 then takes dwords 1 and 3 from its second operand,
 * and mask 0b0101 takes dwords 0 and 2, so T0/T1 collect the left row halves
 * (columns 0-3) and T2/T3 the right row halves (columns 4-7). */
B1 = _mm_shuffle_epi32(B1,0b10110001); /* B1 = _mm_shuffle_ps(B1,B1,0b10110001); */
B3 = _mm_shuffle_epi32(B3,0b10110001); /* B3 = _mm_shuffle_ps(B3,B3,0b10110001); */
T0 = _mm_blend_epi32(B0,B1,0b1010); /* T0 = _mm_blend_ps(B0,B1,0b1010); */
T1 = _mm_blend_epi32(B2,B3,0b1010); /* T1 = _mm_blend_ps(B2,B3,0b1010); */
T2 = _mm_blend_epi32(B0,B1,0b0101); /* T2 = _mm_blend_ps(B0,B1,0b0101); */
T3 = _mm_blend_epi32(B2,B3,0b0101); /* T3 = _mm_blend_ps(B2,B3,0b0101); */
/* */
/* Byte shuffles: one control per register (see the constants above). */
B0 = _mm_shuffle_epi8(T0,pshufbcnst_0); /* B0 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T0),pshufbcnst_0)); */
B1 = _mm_shuffle_epi8(T1,pshufbcnst_1); /* B1 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T1),pshufbcnst_1)); */
B2 = _mm_shuffle_epi8(T2,pshufbcnst_2); /* B2 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T2),pshufbcnst_2)); */
B3 = _mm_shuffle_epi8(T3,pshufbcnst_3); /* B3 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T3),pshufbcnst_3)); */
/* */
/* Second round of blends joins the rows 0-3 part of each column with its
 * rows 4-7 part. T1 and T3 come out with the dwords of each 64-bit half
 * swapped, which the final dword shuffles undo. */
T0 = _mm_blend_epi32(B0,B1,0b1010); /* T0 = _mm_blend_ps(B0,B1,0b1010); */
T1 = _mm_blend_epi32(B0,B1,0b0101); /* T1 = _mm_blend_ps(B0,B1,0b0101); */
T2 = _mm_blend_epi32(B2,B3,0b1010); /* T2 = _mm_blend_ps(B2,B3,0b1010); */
T3 = _mm_blend_epi32(B2,B3,0b0101); /* T3 = _mm_blend_ps(B2,B3,0b0101); */
T1 = _mm_shuffle_epi32(T1,0b10110001); /* T1 = _mm_shuffle_ps(T1,T1,0b10110001); */
T3 = _mm_shuffle_epi32(T3,0b10110001); /* T3 = _mm_shuffle_ps(T3,T3,0b10110001); */
/* */
/* Unaligned stores: each register holds two transposed rows. */
_mm_storeu_si128((__m128i*)&B[ 0], T0); /* _mm_storeu_ps((float*)&B[ 0], T0); */
_mm_storeu_si128((__m128i*)&B[16], T1); /* _mm_storeu_ps((float*)&B[16], T1); */
_mm_storeu_si128((__m128i*)&B[32], T2); /* _mm_storeu_ps((float*)&B[32], T2); */
_mm_storeu_si128((__m128i*)&B[48], T3); /* _mm_storeu_ps((float*)&B[48], T3); */
} /* } */
Normally when load and store instructions are not counted it's because the code is working with a matrix in register e.g. doing multiple operations in addition to the transpose in a loop. The loads and stores in this case are not counted because they are not part of the main loop.
But in your code the loads and stores (or rather sets and extracts) are doing part of the transpose.
GCC implements _mm_set_epi64x
for SSE4.1 in your code with _mm_insert_epi64
and _mm_loadl_epi64
. The insert instruction is doing part of the transpose i.e. the transpose starts at load0,1,2,3
not at shuffle0,1,2,3
. And then your final store0,1,2,3
values don't contain the transpose either. You have to use eight _mm_extract_epi64
instructions to finish the transpose in memory. So it does not really make sense to not count the set and extract intrinsics.
In any case, it turns out you can do the transpose from register with only 16 instructions using only SSSE3 like this:
// In-register 8x8 byte transpose (16 instructions, SSSE3 for _mm_shuffle_epi8).
// Inputs, declared by the surrounding code:
//__m128i B0, __m128i B1, __m128i B2, __m128i B3
// Each input holds two consecutive 8-byte rows (B0 = rows 0,1 ... B3 = rows 6,7,
// as loaded by tran8x8b_SSE in the test program); the same four registers hold
// the transposed rows on exit.
// mask interleaves bytes 0-3 with bytes 4-7 inside each 64-bit half.
__m128i mask = _mm_setr_epi8(0x0,0x04,0x01,0x05, 0x02,0x06,0x03,0x07, 0x08,0x0c,0x09,0x0d, 0x0a,0x0e,0x0b,0x0f);
__m128i T0, T1, T2, T3;
// Stage 1: interleave bytes -> (row0,row2), (row1,row3), (row4,row6), (row5,row7).
T0 = _mm_unpacklo_epi8(B0,B1);
T1 = _mm_unpackhi_epi8(B0,B1);
T2 = _mm_unpacklo_epi8(B2,B3);
T3 = _mm_unpackhi_epi8(B2,B3);
// Stage 2: interleave 16-bit pairs -> each dword is one column of the even
// rows (B0, B1) or of the odd rows (B2, B3).
B0 = _mm_unpacklo_epi16(T0,T2);
B1 = _mm_unpackhi_epi16(T0,T2);
B2 = _mm_unpacklo_epi16(T1,T3);
B3 = _mm_unpackhi_epi16(T1,T3);
// Stage 3: interleave dwords -> each 64-bit half holds the even-row bytes of
// one column followed by its odd-row bytes.
T0 = _mm_unpacklo_epi32(B0,B2);
T1 = _mm_unpackhi_epi32(B0,B2);
T2 = _mm_unpacklo_epi32(B1,B3);
T3 = _mm_unpackhi_epi32(B1,B3);
// Stage 4: merge even and odd bytes so every 64-bit half is one transposed row.
B0 = _mm_shuffle_epi8(T0,mask);
B1 = _mm_shuffle_epi8(T1,mask);
B2 = _mm_shuffle_epi8(T2,mask);
B3 = _mm_shuffle_epi8(T3,mask);
I'm not sure it makes sense to exclude the loads and stores here either, because I'm not sure how convenient it is to work with an 8x8 byte matrix in four 128-bit registers.
Here is code testing this:
#include <stdio.h>
#include <x86intrin.h>
/*
 * Print the 64 bytes at A as an 8x8 grid of decimal values, one matrix row
 * per line, followed by an empty line.
 */
void print8x8b(char *A) {
    for (int k = 0; k < 64; k++) {
        printf("%2d ", A[k]);
        if (k % 8 == 7) {
            /* end of a matrix row */
            puts("");
        }
    }
    puts("");
}
/*
 * Scalar reference transpose of an 8x8 byte matrix.
 *
 * A: 64 bytes, row-major source.
 * B: 64 bytes; on return B[j*8+i] == A[i*8+j] for all 0 <= i, j < 8
 *    (assuming A and B do not overlap).
 */
void tran8x8b(char *A, char *B) {
    /* Walk the source linearly; k / 8 is the source row, k % 8 the column. */
    for (int k = 0; k < 64; k++) {
        const int row = k / 8;
        const int col = k % 8;
        B[col * 8 + row] = A[k];
    }
}
/*
 * Transpose one contiguous 8x8 byte matrix with 12 unpack instructions and
 * 4 byte shuffles (SSSE3).
 *
 * A: 64 bytes, row-major source (unaligned loads).
 * B: 64 bytes receiving the transposed matrix (unaligned stores).
 */
void tran8x8b_SSE(char *A, char *B) {
    /* Interleaves bytes 0-3 with bytes 4-7 inside each 64-bit half. */
    const __m128i merge_even_odd = _mm_setr_epi8(
        0, 4, 1, 5,  2, 6, 3, 7,  8, 12, 9, 13,  10, 14, 11, 15);

    /* Two rows per register. */
    const __m128i rows01 = _mm_loadu_si128((const __m128i *)(A + 0));
    const __m128i rows23 = _mm_loadu_si128((const __m128i *)(A + 16));
    const __m128i rows45 = _mm_loadu_si128((const __m128i *)(A + 32));
    const __m128i rows67 = _mm_loadu_si128((const __m128i *)(A + 48));

    /* Byte interleave: pairs of rows two apart. */
    const __m128i r0r2 = _mm_unpacklo_epi8(rows01, rows23);
    const __m128i r1r3 = _mm_unpackhi_epi8(rows01, rows23);
    const __m128i r4r6 = _mm_unpacklo_epi8(rows45, rows67);
    const __m128i r5r7 = _mm_unpackhi_epi8(rows45, rows67);

    /* Word interleave: each dword becomes one column of the even rows
     * (0,2,4,6) or of the odd rows (1,3,5,7). */
    const __m128i even_c03 = _mm_unpacklo_epi16(r0r2, r4r6);
    const __m128i even_c47 = _mm_unpackhi_epi16(r0r2, r4r6);
    const __m128i odd_c03 = _mm_unpacklo_epi16(r1r3, r5r7);
    const __m128i odd_c47 = _mm_unpackhi_epi16(r1r3, r5r7);

    /* Dword interleave: even-row and odd-row parts of a column side by side. */
    const __m128i cols01 = _mm_unpacklo_epi32(even_c03, odd_c03);
    const __m128i cols23 = _mm_unpackhi_epi32(even_c03, odd_c03);
    const __m128i cols45 = _mm_unpacklo_epi32(even_c47, odd_c47);
    const __m128i cols67 = _mm_unpackhi_epi32(even_c47, odd_c47);

    /* Final byte shuffle puts each column's eight bytes in row order. */
    _mm_storeu_si128((__m128i *)(B + 0), _mm_shuffle_epi8(cols01, merge_even_odd));
    _mm_storeu_si128((__m128i *)(B + 16), _mm_shuffle_epi8(cols23, merge_even_odd));
    _mm_storeu_si128((__m128i *)(B + 32), _mm_shuffle_epi8(cols45, merge_even_odd));
    _mm_storeu_si128((__m128i *)(B + 48), _mm_shuffle_epi8(cols67, merge_even_odd));
}
/*
 * Test driver: fills A with 0..63, prints it, then prints the scalar
 * transpose (B) and the SSE transpose (C).
 *
 * Besides printing, the two results are compared byte by byte so a wrong
 * SIMD result is reported instead of having to be spotted by eye.
 *
 * Returns 0 when both transposes agree, 1 otherwise (the first differing
 * byte is reported on stderr; stdout is unaffected).
 */
int main(void) {
    char A[64], B[64], C[64];

    for (int i = 0; i < 64; i++) {
        A[i] = (char)i;
    }
    print8x8b(A);

    tran8x8b(A, B);
    print8x8b(B);

    tran8x8b_SSE(A, C);
    print8x8b(C);

    for (int i = 0; i < 64; i++) {
        if (B[i] != C[i]) {
            fprintf(stderr, "transpose mismatch at byte %d: scalar=%d sse=%d\n",
                    i, B[i], C[i]);
            return 1;
        }
    }
    return 0;
}
A simplified one
/*
 * Transpose one contiguous 8x8 byte matrix with 4 byte shuffles and
 * 8 unpack instructions (SSSE3).
 *
 * A: 64 bytes, row-major source (unaligned loads).
 * B: 64 bytes receiving the transposed matrix (unaligned stores).
 */
void tp128_8x8(char *A, char *B) {
    /* Interleave the bytes of the two rows held in one register, so each
     * 16-bit word holds one column of that row pair. */
    const __m128i zip_rows = _mm_setr_epi8(
        0, 8, 1, 9,  2, 10, 3, 11,  4, 12, 5, 13,  6, 14, 7, 15);

    const __m128i pair01 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(A + 0)), zip_rows);
    const __m128i pair23 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(A + 16)), zip_rows);
    const __m128i pair45 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(A + 32)), zip_rows);
    const __m128i pair67 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(A + 48)), zip_rows);

    /* Word interleave: each dword becomes one column of rows 0-3 (top) or of
     * rows 4-7 (bottom). */
    const __m128i top_c03 = _mm_unpacklo_epi16(pair01, pair23);
    const __m128i top_c47 = _mm_unpackhi_epi16(pair01, pair23);
    const __m128i bot_c03 = _mm_unpacklo_epi16(pair45, pair67);
    const __m128i bot_c47 = _mm_unpackhi_epi16(pair45, pair67);

    /* Dword interleave joins the top and bottom halves of every column. */
    _mm_storeu_si128((__m128i *)(B + 0), _mm_unpacklo_epi32(top_c03, bot_c03));
    _mm_storeu_si128((__m128i *)(B + 16), _mm_unpackhi_epi32(top_c03, bot_c03));
    _mm_storeu_si128((__m128i *)(B + 32), _mm_unpacklo_epi32(top_c47, bot_c47));
    _mm_storeu_si128((__m128i *)(B + 48), _mm_unpackhi_epi32(top_c47, bot_c47));
}
Benchmark:i5-5300U 2.3GHz (cycles per byte)
tran8x8b : 2.140
tran8x8b_SSE : 1.602
tran8x8b_SSE_v2 : 1.551
tp128_8x8 : 1.535
tran8x8b_AVX2 : 1.563
tran8x8b_AVX2_v2 : 1.731