//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// _transpose_uint8_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is zero-extended from 8 bits, used as an index into transposeMap
// with a 4-byte stride, and the low byte of that slot is stored to dest.
// Source stride is 1 byte, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint8_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB0_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB0_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; advance both pointers.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB0_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB0_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB0_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index, shared by src and dest (both 1 byte per element).
LBB0_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB0_3

LBB0_4:
	RET

// _transpose_int8_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 8 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low byte of that slot is
// stored to dest. Source stride is 1 byte, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int8_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB1_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB1_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688                 // mov    byte [rsi], dl
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; advance both pointers.
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x04c68348             // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB1_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB1_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB1_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index, shared by src and dest (both 1 byte per element).
LBB1_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB1_3

LBB1_4:
	RET

// _transpose_uint16_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is zero-extended from 16 bits, used as an index into transposeMap
// with a 4-byte stride, and the low byte of that slot is stored to dest.
// Source stride is 2 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint16_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB2_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB2_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*2 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB2_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB2_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB2_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 2 for the source, by 1 for the destination.
LBB2_3:
	LONG $0x04b70f42; BYTE $0x47 // movzx    eax, word [rdi + 2*r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB2_3

LBB2_4:
	RET

// _transpose_int16_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 16 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low byte of that slot is
// stored to dest. Source stride is 2 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int16_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB3_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB3_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688                 // mov    byte [rsi], dl
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*2 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x04c68348             // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB3_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB3_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB3_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 2 for the source, by 1 for the destination.
LBB3_3:
	LONG $0x04bf0f4a; BYTE $0x47 // movsx    rax, word [rdi + 2*r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB3_3

LBB3_4:
	RET

// _transpose_uint32_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is loaded as 32 bits (the 32-bit load zero-extends into RDX/RAX),
// used as an index into transposeMap with a 4-byte stride, and the low byte
// of that slot is stored to dest.
// Source stride is 4 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint32_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB4_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB4_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*4 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB4_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB4_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB4_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 4 for the source, by 1 for the destination.
LBB4_3:
	LONG $0x87048b42         // mov    eax, dword [rdi + 4*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB4_3

LBB4_4:
	RET

// _transpose_int32_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 32 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low byte of that slot is
// stored to dest. Source stride is 4 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int32_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB5_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB5_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*4 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB5_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB5_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB5_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 4 for the source, by 1 for the destination.
LBB5_3:
	LONG $0x8704634a         // movsxd    rax, dword [rdi + 4*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB5_3

LBB5_4:
	RET

// _transpose_uint64_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is loaded as a full 64-bit value (no extension involved), used as
// an index into transposeMap with a 4-byte stride, and the low byte of that
// slot is stored to dest.
// Source stride is 8 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint64_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB6_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB6_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*8 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB6_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB6_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB6_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 8 for the source, by 1 for the destination.
LBB6_3:
	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB6_3

LBB6_4:
	RET

// _transpose_int64_uint8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is loaded as a full 64-bit value (no extension involved), used as
// an index into transposeMap with a 4-byte stride, and the low byte of that
// slot is stored to dest.
// Source stride is 8 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int64_uint8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB7_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB7_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*8 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB7_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB7_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB7_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 8 for the source, by 1 for the destination.
LBB7_3:
	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB7_3

LBB7_4:
	RET

// _transpose_uint8_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is zero-extended from 8 bits, used as an index into transposeMap
// with a 4-byte stride, and the low byte of that slot is stored to dest.
// Source stride is 1 byte, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint8_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB8_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB8_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; advance both pointers.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB8_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB8_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB8_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index, shared by src and dest (both 1 byte per element).
LBB8_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB8_3

LBB8_4:
	RET

// _transpose_int8_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 8 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low byte of that slot is
// stored to dest. Source stride is 1 byte, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int8_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB9_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB9_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688                 // mov    byte [rsi], dl
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; advance both pointers.
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x04c68348             // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB9_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB9_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB9_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index, shared by src and dest (both 1 byte per element).
LBB9_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB9_3

LBB9_4:
	RET

// _transpose_uint16_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is zero-extended from 16 bits, used as an index into transposeMap
// with a 4-byte stride, and the low byte of that slot is stored to dest.
// Source stride is 2 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint16_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB10_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB10_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*2 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB10_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB10_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB10_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 2 for the source, by 1 for the destination.
LBB10_3:
	LONG $0x04b70f42; BYTE $0x47 // movzx    eax, word [rdi + 2*r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB10_3

LBB10_4:
	RET

// _transpose_int16_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 16 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low byte of that slot is
// stored to dest. Source stride is 2 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int16_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB11_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB11_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688                 // mov    byte [rsi], dl
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01     // mov    byte [rsi + 1], dl
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02     // mov    byte [rsi + 2], dl
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	LONG $0x9114b60f             // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03     // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*2 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x04c68348             // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB11_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB11_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB11_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 2 for the source, by 1 for the destination.
LBB11_3:
	LONG $0x04bf0f4a; BYTE $0x47 // movsx    rax, word [rdi + 2*r8]
	LONG $0x8104b60f             // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842             // mov    byte [rsi + r8], al
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB11_3

LBB11_4:
	RET

// _transpose_uint32_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is loaded as 32 bits (the 32-bit load zero-extends into RDX/RAX),
// used as an index into transposeMap with a 4-byte stride, and the low byte
// of that slot is stored to dest.
// Source stride is 4 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint32_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB12_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB12_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*4 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB12_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB12_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB12_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 4 for the source, by 1 for the destination.
LBB12_3:
	LONG $0x87048b42         // mov    eax, dword [rdi + 4*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB12_3

LBB12_4:
	RET

// _transpose_int32_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 32 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low byte of that slot is
// stored to dest. Source stride is 4 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int32_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB13_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB13_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*4 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB13_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB13_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB13_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 4 for the source, by 1 for the destination.
LBB13_3:
	LONG $0x8704634a         // movsxd    rax, dword [rdi + 4*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB13_3

LBB13_4:
	RET

// _transpose_uint64_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is loaded as a full 64-bit value (no extension involved), used as
// an index into transposeMap with a 4-byte stride, and the low byte of that
// slot is stored to dest.
// Source stride is 8 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint64_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB14_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB14_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*8 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB14_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB14_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB14_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 8 for the source, by 1 for the destination.
LBB14_3:
	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB14_3

LBB14_4:
	RET

// _transpose_int64_int8_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is loaded as a full 64-bit value (no extension involved), used as
// an index into transposeMap with a 4-byte stride, and the low byte of that
// slot is stored to dest.
// Source stride is 8 bytes, destination stride is 1 byte.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int64_int8_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB15_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB15_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x1688             // mov    byte [rsi], dl
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x01 // mov    byte [rsi + 1], dl
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x02 // mov    byte [rsi + 2], dl
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b60f         // movzx    edx, byte [rcx + 4*rdx]
	WORD $0x5688; BYTE $0x03 // mov    byte [rsi + 3], dl
	// EDX = count remaining after these 4 elements; src advances 4*8 bytes,
	// dest advances 4*1 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x04c68348         // add    rsi, 4
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB15_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB15_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB15_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 8 for the source, by 1 for the destination.
LBB15_3:
	LONG $0xc7048b4a         // mov    rax, qword [rdi + 8*r8]
	LONG $0x8104b60f         // movzx    eax, byte [rcx + 4*rax]
	LONG $0x06048842         // mov    byte [rsi + r8], al
	LONG $0x01c08349         // add    r8, 1
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB15_3

LBB15_4:
	RET

// _transpose_uint8_uint16_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is zero-extended from 8 bits, used as an index into transposeMap
// with a 4-byte stride, and the low 16 bits of that slot are stored to dest.
// Source stride is 1 byte, destination stride is 2 bytes.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint8_uint16_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB16_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB16_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	// EDX = count remaining after these 4 elements; src advances 4*1 bytes,
	// dest advances 4*2 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x08c68348         // add    rsi, 8
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB16_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB16_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB16_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 1 for the source, by 2 for the destination.
LBB16_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB16_3

LBB16_4:
	RET

// _transpose_int8_uint16_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 8 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low 16 bits of that slot are
// stored to dest. Source stride is 1 byte, destination stride is 2 bytes.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int8_uint16_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB17_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB17_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966             // mov    word [rsi + 2], dx
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966             // mov    word [rsi + 4], dx
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966             // mov    word [rsi + 6], dx
	// EDX = count remaining after these 4 elements; src advances 4*1 bytes,
	// dest advances 4*2 bytes.
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x08c68348             // add    rsi, 8
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB17_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB17_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB17_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// R8 = element index: scaled by 1 for the source, by 2 for the destination.
LBB17_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB17_3

LBB17_4:
	RET

// _transpose_uint16_uint16_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is zero-extended from 16 bits, used as an index into transposeMap
// with a 4-byte stride, and the low 16 bits of that slot are stored to dest.
// Source stride is 2 bytes, destination stride is 2 bytes.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the table index is never bounds-checked — presumably the
// caller guarantees every source value is a valid index; confirm.
TEXT ·_transpose_uint16_uint16_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB18_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because EDX is reused as the scratch index/value register.
LBB18_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	// EDX = count remaining after these 4 elements; src and dest both
	// advance 4*2 bytes.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x08c68348         // add    rsi, 8
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB18_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB18_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB18_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// Here R8 is a byte offset (not an element index): src and dest share the
	// same 2-byte element size, so one unscaled offset stepping by 2 serves both.
LBB18_3:
	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB18_3

LBB18_4:
	RET

// _transpose_int16_uint16_sse4(src, dest, length, transposeMap)
//
// Table-lookup remap: for each of the first `length` elements, the source
// element is sign-extended from 16 bits to 64 bits, used as an index into
// transposeMap with a 4-byte stride, and the low 16 bits of that slot are
// stored to dest. Source stride is 2 bytes, destination stride is 2 bytes.
//
// Only the low 32 bits of length are examined (EDX), as a signed value;
// a count <= 0 returns without touching memory.
// Despite the sse4 suffix, the body uses only scalar integer instructions.
// NOTE(review): the index is never bounds-checked, and a negative source
// value addresses memory below transposeMap — presumably callers only pass
// non-negative in-range values; confirm.
TEXT ·_transpose_int16_uint16_sse4(SB), $0-32

	// Arguments: DI = src, SI = dest, DX = length, CX = transposeMap.
	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB19_1

	// Main loop, unrolled 4x. EAX holds the count on entry to the iteration
	// because RDX is reused as the scratch index/value register.
LBB19_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966             // mov    word [rsi + 2], dx
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966             // mov    word [rsi + 4], dx
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966             // mov    word [rsi + 6], dx
	// EDX = count remaining after these 4 elements; src and dest both
	// advance 4*2 bytes.
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x08c68348             // add    rsi, 8
	// Pre-decrement count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB19_5

	// Tail: at most 3 elements remain here. EDX is biased by +1 so the loop
	// can test "EDX > 1" after decrementing; it runs exactly <remaining> times.
LBB19_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB19_4
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

	// Here R8 is a byte offset (not an element index): src and dest share the
	// same 2-byte element size, so one unscaled offset stepping by 2 serves both.
LBB19_3:
	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB19_3

LBB19_4:
	RET

// _transpose_uint32_uint16_sse4 replaces each 32-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 32-bit dwords (the 32-bit mov
//                      zero-extends into the 64-bit index register)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint32_uint16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB20_1

LBB20_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because EDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB20_5

LBB20_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB20_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB20_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 2*R8, i.e. 4 bytes per element.
	LONG $0x47048b42             // mov    eax, dword [rdi + 2*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB20_3

LBB20_4:
	RET

// _transpose_int32_uint16_sse4 replaces each 32-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 32-bit dwords, sign-extended to 64
//                      bits (movsxd)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int32_uint16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB21_1

LBB21_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB21_5

LBB21_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB21_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB21_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 2*R8, i.e. 4 bytes per element.
	LONG $0x4704634a             // movsxd    rax, dword [rdi + 2*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB21_3

LBB21_4:
	RET

// _transpose_uint64_uint16_sse4 replaces each 64-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 64-bit qwords and used as the index
//                      unchanged
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint64_uint16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB22_1

LBB22_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB22_5

LBB22_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB22_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB22_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 4*R8, i.e. 8 bytes per element.
	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB22_3

LBB22_4:
	RET

// _transpose_int64_uint16_sse4 replaces each 64-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 64-bit qwords and used as the index
//                      unchanged (plain mov, no sign-specific handling)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int64_uint16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB23_1

LBB23_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB23_5

LBB23_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB23_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB23_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 4*R8, i.e. 8 bytes per element.
	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB23_3

LBB23_4:
	RET

// _transpose_uint8_int16_sse4 replaces each 8-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as bytes, zero-extended (movzx)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint8_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB24_1

LBB24_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because EDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB24_5

LBB24_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB24_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB24_3:
	// R8 is the element index (1 per element): src is addressed at R8 and
	// dest at 2*R8.
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB24_3

LBB24_4:
	RET

// _transpose_int8_int16_sse4 replaces each 8-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as bytes, sign-extended to 64 bits
//                      (movsx)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int8_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB25_1

LBB25_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966             // mov    word [rsi + 2], dx
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966             // mov    word [rsi + 4], dx
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966             // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x08c68348             // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB25_5

LBB25_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB25_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB25_3:
	// R8 is the element index (1 per element): src is addressed at R8 and
	// dest at 2*R8.
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x46 // mov    word [rsi + 2*r8], ax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB25_3

LBB25_4:
	RET

// _transpose_uint16_int16_sse4 replaces each 16-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 16-bit words, zero-extended (movzx)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint16_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB26_1

LBB26_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because EDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB26_5

LBB26_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB26_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB26_3:
	// R8 is the byte offset into both src and dest, advancing 2 per element.
	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB26_3

LBB26_4:
	RET

// _transpose_int16_int16_sse4 replaces each 16-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 16-bit words, sign-extended to 64
//                      bits (movsx)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int16_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB27_1

LBB27_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16     // mov    word [rsi], dx
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966             // mov    word [rsi + 2], dx
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966             // mov    word [rsi + 4], dx
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	LONG $0x9114b70f             // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966             // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x08c68348             // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB27_5

LBB27_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB27_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB27_3:
	// R8 is the byte offset into both src and dest, advancing 2 per element.
	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB27_3

LBB27_4:
	RET

// _transpose_uint32_int16_sse4 replaces each 32-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 32-bit dwords (the 32-bit mov
//                      zero-extends into the 64-bit index register)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint32_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB28_1

LBB28_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because EDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB28_5

LBB28_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB28_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB28_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 2*R8, i.e. 4 bytes per element.
	LONG $0x47048b42             // mov    eax, dword [rdi + 2*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB28_3

LBB28_4:
	RET

// _transpose_int32_int16_sse4 replaces each 32-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 32-bit dwords, sign-extended to 64
//                      bits (movsxd)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int32_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB29_1

LBB29_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB29_5

LBB29_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB29_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB29_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 2*R8, i.e. 4 bytes per element.
	LONG $0x4704634a             // movsxd    rax, dword [rdi + 2*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB29_3

LBB29_4:
	RET

// _transpose_uint64_int16_sse4 replaces each 64-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 64-bit qwords and used as the index
//                      unchanged
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint64_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB30_1

LBB30_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB30_5

LBB30_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB30_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB30_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 4*R8, i.e. 8 bytes per element.
	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB30_3

LBB30_4:
	RET

// _transpose_int64_int16_sse4 replaces each 64-bit source element with a
// value looked up in transposeMap and stores it as a 16-bit word in dest.
//
//   src          (DI): elements loaded as 64-bit qwords and used as the index
//                      unchanged (plain mov, no sign-specific handling)
//   dest         (SI): one 16-bit word stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the low 16 bits of the
//                      addressed entry are stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int64_int16_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB31_1

LBB31_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	WORD $0x8966; BYTE $0x16 // mov    word [rsi], dx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x02568966         // mov    word [rsi + 2], dx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x04568966         // mov    word [rsi + 4], dx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x9114b70f         // movzx    edx, word [rcx + 4*rdx]
	LONG $0x06568966         // mov    word [rsi + 6], dx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x08c68348         // add    rsi, 8
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB31_5

LBB31_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB31_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB31_3:
	// R8 is the byte offset into dest (2 per element); src is addressed at
	// 4*R8, i.e. 8 bytes per element.
	LONG $0x87048b4a             // mov    rax, qword [rdi + 4*r8]
	LONG $0x8104b70f             // movzx    eax, word [rcx + 4*rax]
	LONG $0x04894266; BYTE $0x06 // mov    word [rsi + r8], ax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB31_3

LBB31_4:
	RET

// _transpose_uint8_uint32_sse4 replaces each 8-bit source element with a
// value looked up in transposeMap and stores it as a 32-bit dword in dest.
//
//   src          (DI): elements loaded as bytes, zero-extended (movzx)
//   dest         (SI): one 32-bit dword stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the full 32-bit entry is
//                      loaded and stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint8_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB32_1

LBB32_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because EDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x10c68348         // add    rsi, 16
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB32_5

LBB32_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB32_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB32_3:
	// R8 is the element index (1 per element): src is addressed at R8 and
	// dest at 4*R8.
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB32_3

LBB32_4:
	RET

// _transpose_int8_uint32_sse4 replaces each 8-bit source element with a
// value looked up in transposeMap and stores it as a 32-bit dword in dest.
//
//   src          (DI): elements loaded as bytes, sign-extended to 64 bits
//                      (movsx)
//   dest         (SI): one 32-bit dword stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the full 32-bit entry is
//                      loaded and stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int8_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB33_1

LBB33_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689                 // mov    dword [rsi], edx
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x10c68348             // add    rsi, 16
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB33_5

LBB33_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB33_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB33_3:
	// R8 is the element index (1 per element): src is addressed at R8 and
	// dest at 4*R8.
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB33_3

LBB33_4:
	RET

// _transpose_uint16_uint32_sse4 replaces each 16-bit source element with a
// value looked up in transposeMap and stores it as a 32-bit dword in dest.
//
//   src          (DI): elements loaded as 16-bit words, zero-extended (movzx)
//   dest         (SI): one 32-bit dword stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the full 32-bit entry is
//                      loaded and stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint16_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB34_1

LBB34_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because EDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x10c68348         // add    rsi, 16
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB34_5

LBB34_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB34_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB34_3:
	// R8 is the byte offset into src (2 per element); dest is addressed at
	// 2*R8, i.e. 4 bytes per element.
	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB34_3

LBB34_4:
	RET

// _transpose_int16_uint32_sse4 replaces each 16-bit source element with a
// value looked up in transposeMap and stores it as a 32-bit dword in dest.
//
//   src          (DI): elements loaded as 16-bit words, sign-extended to 64
//                      bits (movsx)
//   dest         (SI): one 32-bit dword stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the full 32-bit entry is
//                      loaded and stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int16_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB35_1

LBB35_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689                 // mov    dword [rsi], edx
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x10c68348             // add    rsi, 16
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB35_5

LBB35_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB35_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB35_3:
	// R8 is the byte offset into src (2 per element); dest is addressed at
	// 2*R8, i.e. 4 bytes per element.
	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB35_3

LBB35_4:
	RET

// _transpose_uint32_uint32_sse4 replaces each 32-bit source element with a
// value looked up in transposeMap and stores it as a 32-bit dword in dest.
//
//   src          (DI): elements loaded as 32-bit dwords (the 32-bit mov
//                      zero-extends into the 64-bit index register)
//   dest         (SI): one 32-bit dword stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the full 32-bit entry is
//                      loaded and stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint32_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB36_1

LBB36_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because EDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x10c68348         // add    rsi, 16
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB36_5

LBB36_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB36_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB36_3:
	// R8 is the byte offset into both src and dest, advancing 4 per element.
	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB36_3

LBB36_4:
	RET

// _transpose_int32_uint32_sse4 replaces each 32-bit source element with a
// value looked up in transposeMap and stores it as a 32-bit dword in dest.
//
//   src          (DI): elements loaded as 32-bit dwords, sign-extended to 64
//                      bits (movsxd)
//   dest         (SI): one 32-bit dword stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the full 32-bit entry is
//                      loaded and stored
//
// No bounds check is made on the index.
// NOTE(review): a negative source value produces a negative table offset;
// presumably callers only pass non-negative values — confirm.
TEXT ·_transpose_int32_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB37_1

LBB37_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x10c68348         // add    rsi, 16
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB37_5

LBB37_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB37_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB37_3:
	// R8 is the byte offset into both src and dest, advancing 4 per element.
	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB37_3

LBB37_4:
	RET

// _transpose_uint64_uint32_sse4 replaces each 64-bit source element with a
// value looked up in transposeMap and stores it as a 32-bit dword in dest.
//
//   src          (DI): elements loaded as 64-bit qwords and used as the index
//                      unchanged
//   dest         (SI): one 32-bit dword stored per element
//   length       (DX): element count; only EDX is tested, with signed compares
//   transposeMap (CX): indexed with a 4-byte stride; the full 32-bit entry is
//                      loaded and stored
//
// No bounds check is made on the index.
TEXT ·_transpose_uint64_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Lengths below 4 go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB38_1

LBB38_5:
	// Unrolled body: 4 elements per pass. EAX holds the count because RDX is
	// reused as the lookup scratch register; the lea restores EDX = count - 4.
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x10c68348         // add    rsi, 16
	// Repeat while the pre-decrement count exceeds 7 (at least 4 elements remain).
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB38_5

LBB38_1:
	// Tail: nothing to do when the remaining count is <= 0.
	WORD $0xd285             // test    edx, edx
	JLE  LBB38_4
	// Bias the count by one; the loop below decrements and continues while > 1.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

LBB38_3:
	// R8 is the byte offset into dest (4 per element); src is addressed at
	// 2*R8, i.e. 8 bytes per element.
	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB38_3

LBB38_4:
	RET

// _transpose_int64_uint32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          8-byte elements, used unmodified as the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int64_uint32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB39_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 32 bytes, dest 16 bytes.
LBB39_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x10c68348         // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB39_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB39_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB39_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 4 (the dest byte offset); the 8-byte src
// element is addressed at 2*R8.
LBB39_3:
	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB39_3

LBB39_4:
	RET

// _transpose_uint8_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          1-byte elements, zero-extended (movzx) to form the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint8_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB40_1

// Unrolled by 4. EAX saves the element count because EDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 4 bytes, dest 16 bytes.
LBB40_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x10c68348         // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB40_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB40_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB40_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running element index.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 1 (the src byte offset); the 4-byte dest
// slot is addressed at 4*R8.
LBB40_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB40_3

LBB40_4:
	RET

// _transpose_int8_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          1-byte elements, sign-extended (movsx) to form the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int8_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB41_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 4 bytes, dest 16 bytes.
LBB41_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689                 // mov    dword [rsi], edx
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x10c68348             // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB41_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB41_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB41_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running element index.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 1 (the src byte offset); the 4-byte dest
// slot is addressed at 4*R8.
LBB41_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x86048942             // mov    dword [rsi + 4*r8], eax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB41_3

LBB41_4:
	RET

// _transpose_uint16_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          2-byte elements, zero-extended (movzx) to form the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint16_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB42_1

// Unrolled by 4. EAX saves the element count because EDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 8 bytes, dest 16 bytes.
LBB42_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x10c68348         // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB42_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB42_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB42_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 2 (the src byte offset); the 4-byte dest
// slot is addressed at 2*R8.
LBB42_3:
	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB42_3

LBB42_4:
	RET

// _transpose_int16_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          2-byte elements, sign-extended (movsx) to form the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int16_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB43_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 8 bytes, dest 16 bytes.
LBB43_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689                 // mov    dword [rsi], edx
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04     // mov    dword [rsi + 4], edx
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08     // mov    dword [rsi + 8], edx
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	WORD $0x148b; BYTE $0x91     // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c     // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x10c68348             // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB43_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB43_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB43_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 2 (the src byte offset); the 4-byte dest
// slot is addressed at 2*R8.
LBB43_3:
	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
	WORD $0x048b; BYTE $0x81     // mov    eax, dword [rcx + 4*rax]
	LONG $0x46048942             // mov    dword [rsi + 2*r8], eax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB43_3

LBB43_4:
	RET

// _transpose_uint32_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          4-byte elements, loaded with a 32-bit mov (upper half of
//                the index register is therefore zero)
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint32_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB44_1

// Unrolled by 4. EAX saves the element count because EDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration.
LBB44_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x10c68348         // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB44_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB44_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB44_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running byte offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 4 and offsets both src and dest.
LBB44_3:
	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB44_3

LBB44_4:
	RET

// _transpose_int32_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          4-byte elements, sign-extended (movsxd) to form the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int32_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB45_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration.
LBB45_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x10c68348         // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB45_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB45_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB45_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running byte offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 4 and offsets both src and dest.
LBB45_3:
	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB45_3

LBB45_4:
	RET

// _transpose_uint64_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          8-byte elements, used unmodified as the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint64_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB46_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 32 bytes, dest 16 bytes.
LBB46_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x10c68348         // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB46_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB46_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB46_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 4 (the dest byte offset); the 8-byte src
// element is addressed at 2*R8.
LBB46_3:
	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB46_3

LBB46_4:
	RET

// _transpose_int64_int32_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          8-byte elements, used unmodified as the index
//   transposeMap 4-byte entries (index scaled by 4)
//   dest         4-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int64_int32_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB47_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 32 bytes, dest 16 bytes.
LBB47_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x1689             // mov    dword [rsi], edx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x04 // mov    dword [rsi + 4], edx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x08 // mov    dword [rsi + 8], edx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	WORD $0x148b; BYTE $0x91 // mov    edx, dword [rcx + 4*rdx]
	WORD $0x5689; BYTE $0x0c // mov    dword [rsi + 12], edx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x10c68348         // add    rsi, 16
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB47_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB47_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB47_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 4 (the dest byte offset); the 8-byte src
// element is addressed at 2*R8.
LBB47_3:
	LONG $0x47048b4a         // mov    rax, qword [rdi + 2*r8]
	WORD $0x048b; BYTE $0x81 // mov    eax, dword [rcx + 4*rax]
	LONG $0x06048942         // mov    dword [rsi + r8], eax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB47_3

LBB47_4:
	RET

// _transpose_uint8_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          1-byte elements, zero-extended (movzx) to form the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint8_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB48_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 4 bytes, dest 32 bytes.
LBB48_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x20c68348         // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB48_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB48_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB48_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running element index.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 1 (the src byte offset); the 8-byte dest
// slot is addressed at 8*R8.
LBB48_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB48_3

LBB48_4:
	RET

// _transpose_int8_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          1-byte elements, sign-extended (movsx) to form the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int8_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB49_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 4 bytes, dest 32 bytes.
LBB49_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948             // mov    qword [rsi + 8], rdx
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948             // mov    qword [rsi + 16], rdx
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948             // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x20c68348             // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB49_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB49_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB49_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running element index.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 1 (the src byte offset); the 8-byte dest
// slot is addressed at 8*R8.
LBB49_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB49_3

LBB49_4:
	RET

// _transpose_uint16_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          2-byte elements, zero-extended (movzx) to form the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint16_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB50_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 8 bytes, dest 32 bytes.
LBB50_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x20c68348         // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB50_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB50_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB50_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 2 (the src byte offset); the 8-byte dest
// slot is addressed at 4*R8.
LBB50_3:
	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB50_3

LBB50_4:
	RET

// _transpose_int16_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          2-byte elements, sign-extended (movsx) to form the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int16_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB51_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 8 bytes, dest 32 bytes.
LBB51_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948             // mov    qword [rsi + 8], rdx
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948             // mov    qword [rsi + 16], rdx
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948             // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x20c68348             // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB51_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB51_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB51_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 2 (the src byte offset); the 8-byte dest
// slot is addressed at 4*R8.
LBB51_3:
	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB51_3

LBB51_4:
	RET

// _transpose_uint32_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          4-byte elements, loaded with a 32-bit mov (upper half of
//                the index register is therefore zero)
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint32_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB52_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 16 bytes, dest 32 bytes.
LBB52_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x20c68348         // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB52_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB52_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB52_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 4 (the src byte offset); the 8-byte dest
// slot is addressed at 2*R8.
LBB52_3:
	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB52_3

LBB52_4:
	RET

// _transpose_int32_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          4-byte elements, sign-extended (movsxd) to form the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int32_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB53_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 16 bytes, dest 32 bytes.
LBB53_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x20c68348         // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB53_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB53_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB53_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 4 (the src byte offset); the 8-byte dest
// slot is addressed at 2*R8.
LBB53_3:
	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB53_3

LBB53_4:
	RET

// _transpose_uint64_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          8-byte elements, used unmodified as the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint64_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB54_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src and dest both advance 32 bytes.
LBB54_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x20c68348         // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB54_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB54_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB54_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running byte offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 8 and offsets both src and dest.
LBB54_3:
	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x0604894a         // mov    qword [rsi + r8], rax
	LONG $0x08c08349         // add    r8, 8
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB54_3

LBB54_4:
	RET

// _transpose_int64_uint64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          8-byte elements, used unmodified as the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int64_uint64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB55_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src and dest both advance 32 bytes.
LBB55_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x20c68348         // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB55_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB55_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB55_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running byte offset.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 8 and offsets both src and dest.
LBB55_3:
	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x0604894a         // mov    qword [rsi + r8], rax
	LONG $0x08c08349         // add    r8, 8
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB55_3

LBB55_4:
	RET

// _transpose_uint8_int64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          1-byte elements, zero-extended (movzx) to form the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_uint8_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB56_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 4 bytes, dest 32 bytes.
LBB56_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb60f; BYTE $0x17 // movzx    edx, byte [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x0157b60f         // movzx    edx, byte [rdi + 1]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x0257b60f         // movzx    edx, byte [rdi + 2]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x0357b60f         // movzx    edx, byte [rdi + 3]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x04c78348         // add    rdi, 4
	LONG $0x20c68348         // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB56_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB56_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB56_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running element index.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 1 (the src byte offset); the 8-byte dest
// slot is addressed at 8*R8.
LBB56_3:
	LONG $0x04b60f42; BYTE $0x07 // movzx    eax, byte [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB56_3

LBB56_4:
	RET

// _transpose_int8_int64_sse4 computes dest[i] = transposeMap[src[i]] for
// `length` consecutive elements.
//
// Frame $0-32: no locals, 32 bytes of arguments read from FP
// (src+0, dest+8, length+16, transposeMap+24).
// Widths as encoded below:
//   src          1-byte elements, sign-extended (movsx) to form the index
//   transposeMap 4-byte entries (index scaled by 4), sign-extended to
//                64 bits (movsxd) when loaded
//   dest         8-byte stores
// Only the low 32 bits of length (EDX) are examined, as a signed count.
// Writes AX, CX, DX, SI, DI, R8 and the flags. The body is scalar integer
// code only, despite the _sse4 suffix.
TEXT ·_transpose_int8_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: go straight to the one-at-a-time tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB57_1

// Unrolled by 4. EAX saves the element count because EDX/RDX is reused as the
// scratch register for every index/lookup; EDX is reloaded with EAX-4 at the
// end of the iteration. src advances 4 bytes, dest 32 bytes.
LBB57_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17be0f48             // movsx    rdx, byte [rdi]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
	LONG $0x57be0f48; BYTE $0x01 // movsx    rdx, byte [rdi + 1]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948             // mov    qword [rsi + 8], rdx
	LONG $0x57be0f48; BYTE $0x02 // movsx    rdx, byte [rdi + 2]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948             // mov    qword [rsi + 16], rdx
	LONG $0x57be0f48; BYTE $0x03 // movsx    rdx, byte [rdi + 3]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948             // mov    qword [rsi + 24], rdx
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x04c78348             // add    rdi, 4
	LONG $0x20c68348             // add    rsi, 32
	// Previous count > 7 means at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB57_5

// Tail: EDX = elements still to do (fewer than 4); skip if it is <= 0.
LBB57_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB57_4
	// Bias the counter by one (the loop exits once it is back down to 1) and
	// zero R8, the running element index.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// One element per pass: R8 steps by 1 (the src byte offset); the 8-byte dest
// slot is addressed at 8*R8.
LBB57_3:
	LONG $0x04be0f4a; BYTE $0x07 // movsx    rax, byte [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0xc604894a             // mov    qword [rsi + 8*r8], rax
	LONG $0x01c08349             // add    r8, 1
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB57_3

LBB57_4:
	RET

// _transpose_uint16_int64_sse4 maps every 2-byte element of src through a
// lookup table and stores the result in dest as an 8-byte value:
//     dest[i] = sign_extend_64(transposeMap[zero_extend(src[i])])
// The table is read as 4-byte entries. The index is not bounds-checked here.
// Only scalar integer instructions are used.
//
// Arguments (32 bytes of arguments, no local frame):
//   src+0(FP)           -> DI  source pointer
//   dest+8(FP)          -> SI  destination pointer
//   length+16(FP)       -> DX  element count; only the low 32 bits are
//                              examined, as a signed value (<= 0 is a no-op)
//   transposeMap+24(FP) -> CX  table base pointer
//
// Registers written: AX, CX, DX, SI, DI, R8 and the flags.
// NOTE(review): the element types are inferred from the symbol name and the
// access widths below - confirm against the Go declaration of this symbol.
TEXT ·_transpose_uint16_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB58_1

// Unrolled loop, 4 elements per iteration. EAX keeps the element count seen
// at the top of the iteration; EDX/RDX is reused as index and value scratch
// and is reloaded with EAX-4 before the loop test.
LBB58_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0xb70f; BYTE $0x17 // movzx    edx, word [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x0257b70f         // movzx    edx, word [rdi + 2]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x0457b70f         // movzx    edx, word [rdi + 4]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x0657b70f         // movzx    edx, word [rdi + 6]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	// Remaining count = previous count - 4; advance src by 4 two-byte
	// elements and dest by 4 eight-byte elements.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x08c78348         // add    rdi, 8
	LONG $0x20c68348         // add    rsi, 32
	// Loop again only if the count before this iteration was at least 8,
	// i.e. at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB58_5

// Tail: fewer than 4 elements are left in EDX; return if none (or negative).
LBB58_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB58_4
	// Bias the counter by one so the loop can test "edx > 1" after the
	// decrement; the body still runs once per remaining element.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// R8 is the byte offset into src (2 bytes per element). Scaling it by 4 for
// dest yields 8 bytes per element.
LBB58_3:
	LONG $0x04b70f42; BYTE $0x07 // movzx    eax, word [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB58_3

LBB58_4:
	RET

// _transpose_int16_int64_sse4 maps every 2-byte element of src through a
// lookup table and stores the result in dest as an 8-byte value:
//     dest[i] = sign_extend_64(transposeMap[sign_extend(src[i])])
// The table is read as 4-byte entries. The source word is sign-extended
// before indexing, so a negative value addresses memory below the table base;
// the index is not bounds-checked here. Only scalar integer instructions are
// used.
//
// Arguments (32 bytes of arguments, no local frame):
//   src+0(FP)           -> DI  source pointer
//   dest+8(FP)          -> SI  destination pointer
//   length+16(FP)       -> DX  element count; only the low 32 bits are
//                              examined, as a signed value (<= 0 is a no-op)
//   transposeMap+24(FP) -> CX  table base pointer
//
// Registers written: AX, CX, DX, SI, DI, R8 and the flags.
// NOTE(review): the element types are inferred from the symbol name and the
// access widths below - confirm against the Go declaration of this symbol.
TEXT ·_transpose_int16_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB59_1

// Unrolled loop, 4 elements per iteration. EAX keeps the element count seen
// at the top of the iteration; RDX is reused as index and value scratch and
// is reloaded with EAX-4 before the loop test.
LBB59_5:
	WORD $0xd089                 // mov    eax, edx
	LONG $0x17bf0f48             // movsx    rdx, word [rdi]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16     // mov    qword [rsi], rdx
	LONG $0x57bf0f48; BYTE $0x02 // movsx    rdx, word [rdi + 2]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948             // mov    qword [rsi + 8], rdx
	LONG $0x57bf0f48; BYTE $0x04 // movsx    rdx, word [rdi + 4]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948             // mov    qword [rsi + 16], rdx
	LONG $0x57bf0f48; BYTE $0x06 // movsx    rdx, word [rdi + 6]
	LONG $0x91146348             // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948             // mov    qword [rsi + 24], rdx
	// Remaining count = previous count - 4; advance src by 4 two-byte
	// elements and dest by 4 eight-byte elements.
	WORD $0x508d; BYTE $0xfc     // lea    edx, [rax - 4]
	LONG $0x08c78348             // add    rdi, 8
	LONG $0x20c68348             // add    rsi, 32
	// Loop again only if the count before this iteration was at least 8,
	// i.e. at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07     // cmp    eax, 7
	JG   LBB59_5

// Tail: fewer than 4 elements are left in EDX; return if none (or negative).
LBB59_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB59_4
	// Bias the counter by one so the loop can test "edx > 1" after the
	// decrement; the body still runs once per remaining element.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// R8 is the byte offset into src (2 bytes per element). Scaling it by 4 for
// dest yields 8 bytes per element.
LBB59_3:
	LONG $0x04bf0f4a; BYTE $0x07 // movsx    rax, word [rdi + r8]
	LONG $0x81046348             // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x8604894a             // mov    qword [rsi + 4*r8], rax
	LONG $0x02c08349             // add    r8, 2
	WORD $0xc283; BYTE $0xff     // add    edx, -1
	WORD $0xfa83; BYTE $0x01     // cmp    edx, 1
	JG   LBB59_3

LBB59_4:
	RET

// _transpose_uint32_int64_sse4 maps every 4-byte element of src through a
// lookup table and stores the result in dest as an 8-byte value:
//     dest[i] = sign_extend_64(transposeMap[zero_extend(src[i])])
// The table is read as 4-byte entries. The 32-bit load of the source element
// clears the upper half of the index register, so the index is treated as
// unsigned; it is not bounds-checked here. Only scalar integer instructions
// are used.
//
// Arguments (32 bytes of arguments, no local frame):
//   src+0(FP)           -> DI  source pointer
//   dest+8(FP)          -> SI  destination pointer
//   length+16(FP)       -> DX  element count; only the low 32 bits are
//                              examined, as a signed value (<= 0 is a no-op)
//   transposeMap+24(FP) -> CX  table base pointer
//
// Registers written: AX, CX, DX, SI, DI, R8 and the flags.
// NOTE(review): the element types are inferred from the symbol name and the
// access widths below - confirm against the Go declaration of this symbol.
TEXT ·_transpose_uint32_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB60_1

// Unrolled loop, 4 elements per iteration. EAX keeps the element count seen
// at the top of the iteration; EDX/RDX is reused as index and value scratch
// and is reloaded with EAX-4 before the loop test.
LBB60_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x178b             // mov    edx, dword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	WORD $0x578b; BYTE $0x04 // mov    edx, dword [rdi + 4]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	WORD $0x578b; BYTE $0x08 // mov    edx, dword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	WORD $0x578b; BYTE $0x0c // mov    edx, dword [rdi + 12]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	// Remaining count = previous count - 4; advance src by 4 four-byte
	// elements and dest by 4 eight-byte elements.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x20c68348         // add    rsi, 32
	// Loop again only if the count before this iteration was at least 8,
	// i.e. at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB60_5

// Tail: fewer than 4 elements are left in EDX; return if none (or negative).
LBB60_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB60_4
	// Bias the counter by one so the loop can test "edx > 1" after the
	// decrement; the body still runs once per remaining element.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// R8 is the byte offset into src (4 bytes per element). Scaling it by 2 for
// dest yields 8 bytes per element.
LBB60_3:
	LONG $0x07048b42         // mov    eax, dword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB60_3

LBB60_4:
	RET

// _transpose_int32_int64_sse4 maps every 4-byte element of src through a
// lookup table and stores the result in dest as an 8-byte value:
//     dest[i] = sign_extend_64(transposeMap[sign_extend(src[i])])
// The table is read as 4-byte entries. The source dword is sign-extended
// before indexing, so a negative value addresses memory below the table base;
// the index is not bounds-checked here. Only scalar integer instructions are
// used.
//
// Arguments (32 bytes of arguments, no local frame):
//   src+0(FP)           -> DI  source pointer
//   dest+8(FP)          -> SI  destination pointer
//   length+16(FP)       -> DX  element count; only the low 32 bits are
//                              examined, as a signed value (<= 0 is a no-op)
//   transposeMap+24(FP) -> CX  table base pointer
//
// Registers written: AX, CX, DX, SI, DI, R8 and the flags.
// NOTE(review): the element types are inferred from the symbol name and the
// access widths below - confirm against the Go declaration of this symbol.
TEXT ·_transpose_int32_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB61_1

// Unrolled loop, 4 elements per iteration. EAX keeps the element count seen
// at the top of the iteration; RDX is reused as index and value scratch and
// is reloaded with EAX-4 before the loop test.
LBB61_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x6348; BYTE $0x17 // movsxd    rdx, dword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x04576348         // movsxd    rdx, dword [rdi + 4]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x08576348         // movsxd    rdx, dword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x0c576348         // movsxd    rdx, dword [rdi + 12]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	// Remaining count = previous count - 4; advance src by 4 four-byte
	// elements and dest by 4 eight-byte elements.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x10c78348         // add    rdi, 16
	LONG $0x20c68348         // add    rsi, 32
	// Loop again only if the count before this iteration was at least 8,
	// i.e. at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB61_5

// Tail: fewer than 4 elements are left in EDX; return if none (or negative).
LBB61_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB61_4
	// Bias the counter by one so the loop can test "edx > 1" after the
	// decrement; the body still runs once per remaining element.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// R8 is the byte offset into src (4 bytes per element). Scaling it by 2 for
// dest yields 8 bytes per element.
LBB61_3:
	LONG $0x0704634a         // movsxd    rax, dword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x4604894a         // mov    qword [rsi + 2*r8], rax
	LONG $0x04c08349         // add    r8, 4
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB61_3

LBB61_4:
	RET

// _transpose_uint64_int64_sse4 maps every 8-byte element of src through a
// lookup table and stores the result in dest as an 8-byte value:
//     dest[i] = sign_extend_64(transposeMap[src[i]])
// The table is read as 4-byte entries. The full 64-bit source value is used
// as the index with no extension step; it is not bounds-checked here. Only
// scalar integer instructions are used.
//
// Arguments (32 bytes of arguments, no local frame):
//   src+0(FP)           -> DI  source pointer
//   dest+8(FP)          -> SI  destination pointer
//   length+16(FP)       -> DX  element count; only the low 32 bits are
//                              examined, as a signed value (<= 0 is a no-op)
//   transposeMap+24(FP) -> CX  table base pointer
//
// Registers written: AX, CX, DX, SI, DI, R8 and the flags.
// NOTE(review): the element types are inferred from the symbol name and the
// access widths below - confirm against the Go declaration of this symbol.
TEXT ·_transpose_uint64_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB62_1

// Unrolled loop, 4 elements per iteration. EAX keeps the element count seen
// at the top of the iteration; RDX is reused as index and value scratch and
// is reloaded with EAX-4 before the loop test.
LBB62_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	// Remaining count = previous count - 4; advance src and dest by
	// 4 eight-byte elements each.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x20c68348         // add    rsi, 32
	// Loop again only if the count before this iteration was at least 8,
	// i.e. at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB62_5

// Tail: fewer than 4 elements are left in EDX; return if none (or negative).
LBB62_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB62_4
	// Bias the counter by one so the loop can test "edx > 1" after the
	// decrement; the body still runs once per remaining element.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// R8 is the byte offset shared by src and dest (8 bytes per element in both).
LBB62_3:
	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x0604894a         // mov    qword [rsi + r8], rax
	LONG $0x08c08349         // add    r8, 8
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB62_3

LBB62_4:
	RET

// _transpose_int64_int64_sse4 maps every 8-byte element of src through a
// lookup table and stores the result in dest as an 8-byte value:
//     dest[i] = sign_extend_64(transposeMap[src[i]])
// The table is read as 4-byte entries. The full 64-bit source value is used
// as the index with no extension step; it is not bounds-checked here. Only
// scalar integer instructions are used.
//
// Arguments (32 bytes of arguments, no local frame):
//   src+0(FP)           -> DI  source pointer
//   dest+8(FP)          -> SI  destination pointer
//   length+16(FP)       -> DX  element count; only the low 32 bits are
//                              examined, as a signed value (<= 0 is a no-op)
//   transposeMap+24(FP) -> CX  table base pointer
//
// Registers written: AX, CX, DX, SI, DI, R8 and the flags.
// NOTE(review): the element types are inferred from the symbol name and the
// access widths below - confirm against the Go declaration of this symbol.
TEXT ·_transpose_int64_int64_sse4(SB), $0-32

	MOVQ src+0(FP), DI
	MOVQ dest+8(FP), SI
	MOVQ length+16(FP), DX
	MOVQ transposeMap+24(FP), CX

	// Fewer than 4 elements: skip the unrolled loop and go to the tail.
	WORD $0xfa83; BYTE $0x04 // cmp    edx, 4
	JL   LBB63_1

// Unrolled loop, 4 elements per iteration. EAX keeps the element count seen
// at the top of the iteration; RDX is reused as index and value scratch and
// is reloaded with EAX-4 before the loop test.
LBB63_5:
	WORD $0xd089             // mov    eax, edx
	WORD $0x8b48; BYTE $0x17 // mov    rdx, qword [rdi]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	WORD $0x8948; BYTE $0x16 // mov    qword [rsi], rdx
	LONG $0x08578b48         // mov    rdx, qword [rdi + 8]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x08568948         // mov    qword [rsi + 8], rdx
	LONG $0x10578b48         // mov    rdx, qword [rdi + 16]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x10568948         // mov    qword [rsi + 16], rdx
	LONG $0x18578b48         // mov    rdx, qword [rdi + 24]
	LONG $0x91146348         // movsxd    rdx, dword [rcx + 4*rdx]
	LONG $0x18568948         // mov    qword [rsi + 24], rdx
	// Remaining count = previous count - 4; advance src and dest by
	// 4 eight-byte elements each.
	WORD $0x508d; BYTE $0xfc // lea    edx, [rax - 4]
	LONG $0x20c78348         // add    rdi, 32
	LONG $0x20c68348         // add    rsi, 32
	// Loop again only if the count before this iteration was at least 8,
	// i.e. at least 4 elements are still left.
	WORD $0xf883; BYTE $0x07 // cmp    eax, 7
	JG   LBB63_5

// Tail: fewer than 4 elements are left in EDX; return if none (or negative).
LBB63_1:
	WORD $0xd285             // test    edx, edx
	JLE  LBB63_4
	// Bias the counter by one so the loop can test "edx > 1" after the
	// decrement; the body still runs once per remaining element.
	WORD $0xc283; BYTE $0x01 // add    edx, 1
	WORD $0x3145; BYTE $0xc0 // xor    r8d, r8d

// R8 is the byte offset shared by src and dest (8 bytes per element in both).
LBB63_3:
	LONG $0x07048b4a         // mov    rax, qword [rdi + r8]
	LONG $0x81046348         // movsxd    rax, dword [rcx + 4*rax]
	LONG $0x0604894a         // mov    qword [rsi + r8], rax
	LONG $0x08c08349         // add    r8, 8
	WORD $0xc283; BYTE $0xff // add    edx, -1
	WORD $0xfa83; BYTE $0x01 // cmp    edx, 1
	JG   LBB63_3

LBB63_4:
	RET