// Code generated by command: go run gen.go -out compress_amd64.s. DO NOT EDIT.

#include "textflag.h"
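// Constant tables used below:
//  - iv<>: the first four BLAKE3 IV words (identical to SHA-256 H0..H3), broadcast into state rows v8..v11
//  - seq<>: lane indices 0..15, used for per-lane counter increments and as gather/scatter strides
//  - seq64<>: 64-bit lane indices 0..7, used for the AVX2 counter setup
//  - shuffle_rot8<>: VPSHUFB mask implementing a 32-bit rotate right by 8 within each dword
//  - shuffle_rot16<>: VPSHUFB mask implementing a 32-bit rotate right by 16 within each dword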

DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16

DATA seq<>+0(SB)/4, $0x00000000
DATA seq<>+4(SB)/4, $0x00000001
DATA seq<>+8(SB)/4, $0x00000002
DATA seq<>+12(SB)/4, $0x00000003
DATA seq<>+16(SB)/4, $0x00000004
DATA seq<>+20(SB)/4, $0x00000005
DATA seq<>+24(SB)/4, $0x00000006
DATA seq<>+28(SB)/4, $0x00000007
DATA seq<>+32(SB)/4, $0x00000008
DATA seq<>+36(SB)/4, $0x00000009
DATA seq<>+40(SB)/4, $0x0000000a
DATA seq<>+44(SB)/4, $0x0000000b
DATA seq<>+48(SB)/4, $0x0000000c
DATA seq<>+52(SB)/4, $0x0000000d
DATA seq<>+56(SB)/4, $0x0000000e
DATA seq<>+60(SB)/4, $0x0000000f
GLOBL seq<>(SB), RODATA|NOPTR, $64

DATA seq64<>+0(SB)/8, $0x0000000000000000
DATA seq64<>+8(SB)/8, $0x0000000000000001
DATA seq64<>+16(SB)/8, $0x0000000000000002
DATA seq64<>+24(SB)/8, $0x0000000000000003
DATA seq64<>+32(SB)/8, $0x0000000000000004
DATA seq64<>+40(SB)/8, $0x0000000000000005
DATA seq64<>+48(SB)/8, $0x0000000000000006
DATA seq64<>+56(SB)/8, $0x0000000000000007
GLOBL seq64<>(SB), RODATA|NOPTR, $64

DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32

DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32

// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX512BW, AVX512F
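//
// Compresses the same 64-byte block and CV with 16 consecutive counter values,
// one per ZMM lane, and writes the 16 resulting 64-byte outputs to out. State
// words v0..v15 live in the even registers Z0..Z30; the 16 broadcast message
// words live in the odd registers Z1..Z31.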
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
	MOVQ out+0(FP), AX
	MOVQ block+8(FP), CX
	MOVQ cv+16(FP), DX

	// Initialize block vectors
	VPBROADCASTD (CX), Z1
	VPBROADCASTD 4(CX), Z3
	VPBROADCASTD 8(CX), Z5
	VPBROADCASTD 12(CX), Z7
	VPBROADCASTD 16(CX), Z9
	VPBROADCASTD 20(CX), Z11
	VPBROADCASTD 24(CX), Z13
	VPBROADCASTD 28(CX), Z15
	VPBROADCASTD 32(CX), Z17
	VPBROADCASTD 36(CX), Z19
	VPBROADCASTD 40(CX), Z21
	VPBROADCASTD 44(CX), Z23
	VPBROADCASTD 48(CX), Z25
	VPBROADCASTD 52(CX), Z27
	VPBROADCASTD 56(CX), Z29
	VPBROADCASTD 60(CX), Z31

	// Initialize state vectors
	VPBROADCASTD (DX), Z0
	VPBROADCASTD 4(DX), Z2
	VPBROADCASTD 8(DX), Z4
	VPBROADCASTD 12(DX), Z6
	VPBROADCASTD 16(DX), Z8
	VPBROADCASTD 20(DX), Z10
	VPBROADCASTD 24(DX), Z12
	VPBROADCASTD 28(DX), Z14
	VPBROADCASTD iv<>+0(SB), Z16
	VPBROADCASTD iv<>+4(SB), Z18
	VPBROADCASTD iv<>+8(SB), Z20
	VPBROADCASTD iv<>+12(SB), Z22
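	// Per-lane counters: Z24 = low words of counter+0..15, Z26 = high words.
	// The VPCMPUD mask flags lanes whose low-word addition overflowed so the
	// carry can be added into the high word.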
	VPBROADCASTD counter+24(FP), Z24
	VPADDD       seq<>+0(SB), Z24, Z24
	VPCMPUD      $0x01, seq<>+0(SB), Z24, K1
	VPBROADCASTD counter+28(FP), Z26
	VPADDD.BCST  seq<>+4(SB), Z26, K1, Z26
	VPBROADCASTD blockLen+32(FP), Z28
	VPBROADCASTD flags+36(FP), Z30

	// Round 1
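	// Each round is eight quarter-rounds (g): four down the columns, then four
	// down the diagonals, applied to all lanes at once. For reference, a scalar
	// sketch of g in Go (illustrative names, not this package's identifiers;
	// uses math/bits):
	//
	//	func g(v *[16]uint32, a, b, c, d int, mx, my uint32) {
	//		v[a] += v[b] + mx
	//		v[d] = bits.RotateLeft32(v[d]^v[a], -16)
	//		v[c] += v[d]
	//		v[b] = bits.RotateLeft32(v[b]^v[c], -12)
	//		v[a] += v[b] + my
	//		v[d] = bits.RotateLeft32(v[d]^v[a], -8)
	//		v[c] += v[d]
	//		v[b] = bits.RotateLeft32(v[b]^v[c], -7)
	//	}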
	VPADDD Z0, Z8, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z17, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 2
	VPADDD Z0, Z8, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z15, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 3
	VPADDD Z0, Z8, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z11, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z23, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 4
	VPADDD Z0, Z8, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 5
	VPADDD Z0, Z8, Z0
	VPADDD Z25, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z21, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 6
	VPADDD Z0, Z8, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z13, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 7
	VPADDD Z0, Z8, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z31, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Finalize CVs
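	// v[i] ^= v[i+8] for the first eight words, then v[i+8] ^= cv[i]: each lane
	// yields a full 64-byte output block.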
	VPXORD      Z0, Z16, Z0
	VPXORD      Z2, Z18, Z2
	VPXORD      Z4, Z20, Z4
	VPXORD      Z6, Z22, Z6
	VPXORD      Z8, Z24, Z8
	VPXORD      Z10, Z26, Z10
	VPXORD      Z12, Z28, Z12
	VPXORD      Z14, Z30, Z14
	VPXORD.BCST (DX), Z16, Z16
	VPXORD.BCST 4(DX), Z18, Z18
	VPXORD.BCST 8(DX), Z20, Z20
	VPXORD.BCST 12(DX), Z22, Z22
	VPXORD.BCST 16(DX), Z24, Z24
	VPXORD.BCST 20(DX), Z26, Z26
	VPXORD.BCST 24(DX), Z28, Z28
	VPXORD.BCST 28(DX), Z30, Z30
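	// Z1 = lane*64: scatter word i of lane n to out[n*64+i*4], un-transposing
	// the 16 output blocks.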
	VMOVDQU32   seq<>+0(SB), Z1
	VPSLLD      $0x06, Z1, Z1
	KXNORD      K1, K1, K1
	VPSCATTERDD Z0, K1, (AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z16, K1, 32(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
	RET

// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX512BW, AVX512F
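//
// Compresses one 1024-byte chunk per lane: 16 chunks in parallel, 16 blocks
// each, chaining the CV through every block and writing the 16 final CVs to
// cvs. The stack holds the per-lane counters and the per-block flag words.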
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36
	MOVQ cvs+0(FP), AX
	MOVQ buf+8(FP), CX
	MOVQ key+16(FP), DX

	// Initialize per-lane counters (counter + 0..15), carrying low-word
	// overflow into the high words, and stash them on the stack
	VPBROADCASTD counter+24(FP), Z0
	VPADDD       seq<>+0(SB), Z0, Z0
	VPCMPUD      $0x01, seq<>+0(SB), Z0, K1
	VPBROADCASTD counter+28(FP), Z2
	VPADDD.BCST  seq<>+4(SB), Z2, K1, Z2
	VMOVDQU32    Z0, (SP)
	VMOVDQU32    Z2, 64(SP)

	// Initialize per-block flags: the base flags for every block, plus
	// CHUNK_START (0x01) on block 0 and CHUNK_END (0x02) on block 15
	VPBROADCASTD flags+32(FP), Z0
	VMOVDQU32    Z0, 128(SP)
	ORL          $0x01, 128(SP)
	ORL          $0x02, 188(SP)

	// Load key
	VPBROADCASTD (DX), Z0
	VPBROADCASTD 4(DX), Z2
	VPBROADCASTD 8(DX), Z4
	VPBROADCASTD 12(DX), Z6
	VPBROADCASTD 16(DX), Z8
	VPBROADCASTD 20(DX), Z10
	VPBROADCASTD 24(DX), Z12
	VPBROADCASTD 28(DX), Z14

	// Loop index: DX counts blocks 0..15 within each chunk
	XORQ DX, DX

loop:
	// Load transposed block: gather message word i of the current block from
	// each of the 16 chunks (lane stride = 1024 bytes)
	VMOVDQU32  seq<>+0(SB), Z16
	VPSLLD     $0x0a, Z16, Z16
	KXNORD     K1, K1, K1
	VPGATHERDD (CX)(Z16*1), K1, Z1
	KXNORD     K1, K1, K1
	VPGATHERDD 4(CX)(Z16*1), K1, Z3
	KXNORD     K1, K1, K1
	VPGATHERDD 8(CX)(Z16*1), K1, Z5
	KXNORD     K1, K1, K1
	VPGATHERDD 12(CX)(Z16*1), K1, Z7
	KXNORD     K1, K1, K1
	VPGATHERDD 16(CX)(Z16*1), K1, Z9
	KXNORD     K1, K1, K1
	VPGATHERDD 20(CX)(Z16*1), K1, Z11
	KXNORD     K1, K1, K1
	VPGATHERDD 24(CX)(Z16*1), K1, Z13
	KXNORD     K1, K1, K1
	VPGATHERDD 28(CX)(Z16*1), K1, Z15
	KXNORD     K1, K1, K1
	VPGATHERDD 32(CX)(Z16*1), K1, Z17
	KXNORD     K1, K1, K1
	VPGATHERDD 36(CX)(Z16*1), K1, Z19
	KXNORD     K1, K1, K1
	VPGATHERDD 40(CX)(Z16*1), K1, Z21
	KXNORD     K1, K1, K1
	VPGATHERDD 44(CX)(Z16*1), K1, Z23
	KXNORD     K1, K1, K1
	VPGATHERDD 48(CX)(Z16*1), K1, Z25
	KXNORD     K1, K1, K1
	VPGATHERDD 52(CX)(Z16*1), K1, Z27
	KXNORD     K1, K1, K1
	VPGATHERDD 56(CX)(Z16*1), K1, Z29
	KXNORD     K1, K1, K1
	VPGATHERDD 60(CX)(Z16*1), K1, Z31
	ADDQ       $0x40, CX

	// Reload state vectors (other than CVs)
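	// Z24/Z26 = per-lane counter low/high words (from the stack), Z28 = 64
	// (the block length), Z30 = the flag word for block DX.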
	VPBROADCASTD iv<>+0(SB), Z16
	VPBROADCASTD iv<>+4(SB), Z18
	VPBROADCASTD iv<>+8(SB), Z20
	VPBROADCASTD iv<>+12(SB), Z22
	VMOVDQU32    (SP), Z24
	VMOVDQU32    64(SP), Z26
	VPBROADCASTD seq<>+4(SB), Z28
	VPSLLD       $0x06, Z28, Z28
	VPBROADCASTD 128(SP)(DX*4), Z30

	// Round 1
	VPADDD Z0, Z8, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z17, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 2
	VPADDD Z0, Z8, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z15, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 3
	VPADDD Z0, Z8, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z11, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z23, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 4
	VPADDD Z0, Z8, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 5
	VPADDD Z0, Z8, Z0
	VPADDD Z25, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z21, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 6
	VPADDD Z0, Z8, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z13, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 7
	VPADDD Z0, Z8, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z31, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z0, Z10, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Finalize CVs
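	// Only the chaining value is needed between blocks, so just v[i] ^= v[i+8];
	// the second-half XOR with the input CV is skipped here.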
	VPXORD Z0, Z16, Z0
	VPXORD Z2, Z18, Z2
	VPXORD Z4, Z20, Z4
	VPXORD Z6, Z22, Z6
	VPXORD Z8, Z24, Z8
	VPXORD Z10, Z26, Z10
	VPXORD Z12, Z28, Z12
	VPXORD Z14, Z30, Z14

	// Advance to the next block; 16 blocks per chunk
	INCQ DX
	CMPQ DX, $0x00000010
	JNE  loop

	// Finished; transpose CVs
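	// Z16 = lane*32: scatter CV word i of lane n to cvs[n][i].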
	VMOVDQU32   seq<>+0(SB), Z16
	VPSLLD      $0x05, Z16, Z16
	KXNORD      K1, K1, K1
	VPSCATTERDD Z0, K1, (AX)(Z16*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z2, K1, 4(AX)(Z16*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z4, K1, 8(AX)(Z16*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z6, K1, 12(AX)(Z16*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z8, K1, 16(AX)(Z16*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z10, K1, 20(AX)(Z16*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z12, K1, 24(AX)(Z16*1)
	KXNORD      K1, K1, K1
	VPSCATTERDD Z14, K1, 28(AX)(Z16*1)
	RET

// func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX, AVX2
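//
// AVX2 variant of compressBlocksAVX512 with 8 lanes (counters counter+0..7).
// Only 16 YMM registers are available, so the broadcast message words are kept
// on the stack at (SP)..480(SP), state word v8 is spilled to 512(SP), and Y8
// doubles as scratch for the shift-based rotates.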
TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
	MOVQ out+0(FP), AX
	MOVQ block+8(FP), CX
	MOVQ cv+16(FP), DX

	// Load block
	VPBROADCASTD (CX), Y0
	VMOVDQU      Y0, (SP)
	VPBROADCASTD 4(CX), Y0
	VMOVDQU      Y0, 32(SP)
	VPBROADCASTD 8(CX), Y0
	VMOVDQU      Y0, 64(SP)
	VPBROADCASTD 12(CX), Y0
	VMOVDQU      Y0, 96(SP)
	VPBROADCASTD 16(CX), Y0
	VMOVDQU      Y0, 128(SP)
	VPBROADCASTD 20(CX), Y0
	VMOVDQU      Y0, 160(SP)
	VPBROADCASTD 24(CX), Y0
	VMOVDQU      Y0, 192(SP)
	VPBROADCASTD 28(CX), Y0
	VMOVDQU      Y0, 224(SP)
	VPBROADCASTD 32(CX), Y0
	VMOVDQU      Y0, 256(SP)
	VPBROADCASTD 36(CX), Y0
	VMOVDQU      Y0, 288(SP)
	VPBROADCASTD 40(CX), Y0
	VMOVDQU      Y0, 320(SP)
	VPBROADCASTD 44(CX), Y0
	VMOVDQU      Y0, 352(SP)
	VPBROADCASTD 48(CX), Y0
	VMOVDQU      Y0, 384(SP)
	VPBROADCASTD 52(CX), Y0
	VMOVDQU      Y0, 416(SP)
	VPBROADCASTD 56(CX), Y0
	VMOVDQU      Y0, 448(SP)
	VPBROADCASTD 60(CX), Y0
	VMOVDQU      Y0, 480(SP)

	// Initialize state vectors
	VPBROADCASTD (DX), Y0
	VPBROADCASTD 4(DX), Y1
	VPBROADCASTD 8(DX), Y2
	VPBROADCASTD 12(DX), Y3
	VPBROADCASTD 16(DX), Y4
	VPBROADCASTD 20(DX), Y5
	VPBROADCASTD 24(DX), Y6
	VPBROADCASTD 28(DX), Y7
	VPBROADCASTD iv<>+0(SB), Y8
	VPBROADCASTD iv<>+4(SB), Y9
	VPBROADCASTD iv<>+8(SB), Y10
	VPBROADCASTD iv<>+12(SB), Y11
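	// Build eight 64-bit counters (counter+0..7), then un-interleave them into
	// low words (Y12) and high words (Y13).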
	VPBROADCASTQ counter+24(FP), Y12
	VPBROADCASTQ counter+24(FP), Y13
	VPADDQ       seq64<>+0(SB), Y12, Y12
	VPADDQ       seq64<>+32(SB), Y13, Y13
	VPUNPCKLDQ   Y13, Y12, Y14
	VPUNPCKHDQ   Y13, Y12, Y15
	VPUNPCKLDQ   Y15, Y14, Y12
	VPUNPCKHDQ   Y15, Y14, Y13
	VPERMQ       $0xd8, Y12, Y12
	VPERMQ       $0xd8, Y13, Y13
	VPBROADCASTD blockLen+32(FP), Y14
	VPBROADCASTD flags+36(FP), Y15
	VMOVDQU      Y8, 512(SP)

	// Round 1
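	// AVX2 has no VPRORD: rotates by 16 and 8 use VPSHUFB with the
	// shuffle_rot16/shuffle_rot8 masks; rotates by 12 and 7 are emulated with
	// VPSRLD/VPSLLD/VPOR.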
	VPADDD  Y0, Y4, Y0
	VPADDD  (SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  32(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  128(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  160(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  256(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  288(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  384(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  416(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 2
	VPADDD  Y0, Y4, Y0
	VPADDD  64(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  192(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  224(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  (SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  32(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  352(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  288(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  448(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 3
	VPADDD  Y0, Y4, Y0
	VPADDD  96(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  128(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  416(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  64(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  192(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  160(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  352(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  480(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 4
	VPADDD  Y0, Y4, Y0
	VPADDD  320(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  224(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  448(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  96(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  128(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  (SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  160(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  256(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 5
	VPADDD  Y0, Y4, Y0
	VPADDD  384(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  416(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  480(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  320(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  224(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  64(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  (SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  32(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 6
	VPADDD  Y0, Y4, Y0
	VPADDD  288(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  448(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  256(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  384(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  416(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  96(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  64(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  192(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 7
	VPADDD  Y0, Y4, Y0
	VPADDD  352(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  480(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  32(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  288(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  448(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  320(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  96(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  128(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VMOVDQU 512(SP), Y8

	// Finalize and store the 64-byte output blocks
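	// Words 0-7 of each output block are v[0..7] ^ v[8..15]; words 8-15
	// are v[8..15] ^ cv[0..7] (the extended-output feed-forward). The
	// pre-XOR words v[8..15] are parked on the stack so they survive the
	// register-clobbering transpose below.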
	VMOVDQU      Y8, 256(SP)
	VMOVDQU      Y9, 288(SP)
	VMOVDQU      Y10, 320(SP)
	VMOVDQU      Y11, 352(SP)
	VMOVDQU      Y12, 384(SP)
	VMOVDQU      Y13, 416(SP)
	VMOVDQU      Y14, 448(SP)
	VMOVDQU      Y15, 480(SP)
	VPXOR        Y0, Y8, Y0
	VPXOR        Y1, Y9, Y1
	VPXOR        Y2, Y10, Y2
	VPXOR        Y3, Y11, Y3
	VPXOR        Y4, Y12, Y4
	VPXOR        Y5, Y13, Y5
	VPXOR        Y6, Y14, Y6
	VPXOR        Y7, Y15, Y7
	VPUNPCKLDQ   Y1, Y0, Y8
	VPUNPCKHDQ   Y1, Y0, Y9
	VPUNPCKLDQ   Y3, Y2, Y10
	VPUNPCKHDQ   Y3, Y2, Y11
	VPUNPCKLDQ   Y5, Y4, Y12
	VPUNPCKHDQ   Y5, Y4, Y13
	VPUNPCKLDQ   Y7, Y6, Y14
	VPUNPCKHDQ   Y7, Y6, Y15
	VPUNPCKLQDQ  Y10, Y8, Y0
	VPUNPCKHQDQ  Y10, Y8, Y1
	VPUNPCKLQDQ  Y11, Y9, Y2
	VPUNPCKHQDQ  Y11, Y9, Y3
	VPUNPCKLQDQ  Y14, Y12, Y4
	VPUNPCKHQDQ  Y14, Y12, Y5
	VPUNPCKLQDQ  Y15, Y13, Y6
	VPUNPCKHQDQ  Y15, Y13, Y7
	VPERM2I128   $0x20, Y4, Y0, Y8
	VPERM2I128   $0x31, Y4, Y0, Y12
	VPERM2I128   $0x20, Y5, Y1, Y9
	VPERM2I128   $0x31, Y5, Y1, Y13
	VPERM2I128   $0x20, Y6, Y2, Y10
	VPERM2I128   $0x31, Y6, Y2, Y14
	VPERM2I128   $0x20, Y7, Y3, Y11
	VPERM2I128   $0x31, Y7, Y3, Y15
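	// Each block's 64-byte output starts at offset 64*i in out; these
	// stores write words 0-7 of every block, and the second transpose
	// below fills in words 8-15 at offset 64*i+32.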
	VMOVDQU      Y8, (AX)
	VMOVDQU      Y9, 64(AX)
	VMOVDQU      Y10, 128(AX)
	VMOVDQU      Y11, 192(AX)
	VMOVDQU      Y12, 256(AX)
	VMOVDQU      Y13, 320(AX)
	VMOVDQU      Y14, 384(AX)
	VMOVDQU      Y15, 448(AX)
	VMOVDQU      256(SP), Y8
	VMOVDQU      288(SP), Y9
	VMOVDQU      320(SP), Y10
	VMOVDQU      352(SP), Y11
	VMOVDQU      384(SP), Y12
	VMOVDQU      416(SP), Y13
	VMOVDQU      448(SP), Y14
	VMOVDQU      480(SP), Y15
	VPBROADCASTD (DX), Y0
	VPXOR        Y0, Y8, Y8
	VPBROADCASTD 4(DX), Y0
	VPXOR        Y0, Y9, Y9
	VPBROADCASTD 8(DX), Y0
	VPXOR        Y0, Y10, Y10
	VPBROADCASTD 12(DX), Y0
	VPXOR        Y0, Y11, Y11
	VPBROADCASTD 16(DX), Y0
	VPXOR        Y0, Y12, Y12
	VPBROADCASTD 20(DX), Y0
	VPXOR        Y0, Y13, Y13
	VPBROADCASTD 24(DX), Y0
	VPXOR        Y0, Y14, Y14
	VPBROADCASTD 28(DX), Y0
	VPXOR        Y0, Y15, Y15
	VPUNPCKLDQ   Y9, Y8, Y0
	VPUNPCKHDQ   Y9, Y8, Y1
	VPUNPCKLDQ   Y11, Y10, Y2
	VPUNPCKHDQ   Y11, Y10, Y3
	VPUNPCKLDQ   Y13, Y12, Y4
	VPUNPCKHDQ   Y13, Y12, Y5
	VPUNPCKLDQ   Y15, Y14, Y6
	VPUNPCKHDQ   Y15, Y14, Y7
	VPUNPCKLQDQ  Y2, Y0, Y8
	VPUNPCKHQDQ  Y2, Y0, Y9
	VPUNPCKLQDQ  Y3, Y1, Y10
	VPUNPCKHQDQ  Y3, Y1, Y11
	VPUNPCKLQDQ  Y6, Y4, Y12
	VPUNPCKHQDQ  Y6, Y4, Y13
	VPUNPCKLQDQ  Y7, Y5, Y14
	VPUNPCKHQDQ  Y7, Y5, Y15
	VPERM2I128   $0x20, Y12, Y8, Y0
	VPERM2I128   $0x31, Y12, Y8, Y4
	VPERM2I128   $0x20, Y13, Y9, Y1
	VPERM2I128   $0x31, Y13, Y9, Y5
	VPERM2I128   $0x20, Y14, Y10, Y2
	VPERM2I128   $0x31, Y14, Y10, Y6
	VPERM2I128   $0x20, Y15, Y11, Y3
	VPERM2I128   $0x31, Y15, Y11, Y7
	VMOVDQU      Y0, 32(AX)
	VMOVDQU      Y1, 96(AX)
	VMOVDQU      Y2, 160(AX)
	VMOVDQU      Y3, 224(AX)
	VMOVDQU      Y4, 288(AX)
	VMOVDQU      Y5, 352(AX)
	VMOVDQU      Y6, 416(AX)
	VMOVDQU      Y7, 480(AX)
	VZEROUPPER
	RET

// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
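// Each of the 8 chunks is processed in its own 32-bit lane: Y0-Y7 carry the
// chaining values across all 16 blocks of a chunk, the remaining state words
// are rebuilt before every block, and the final CVs are transposed into the
// cvs array at the end.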
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36
	MOVQ cvs+0(FP), AX
	MOVQ buf+8(FP), CX
	MOVQ key+16(FP), DX

	// Load key
	VPBROADCASTD (DX), Y0
	VPBROADCASTD 4(DX), Y1
	VPBROADCASTD 8(DX), Y2
	VPBROADCASTD 12(DX), Y3
	VPBROADCASTD 16(DX), Y4
	VPBROADCASTD 20(DX), Y5
	VPBROADCASTD 24(DX), Y6
	VPBROADCASTD 28(DX), Y7

	// Initialize counter
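	// The two VPADDQ instructions produce the 64-bit counters t+0..t+7;
	// the unpack/permute sequence then splits them into per-lane low and
	// high 32-bit halves, saved at 512(SP) and 544(SP) for reloading
	// before every block.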
	VPBROADCASTQ counter+24(FP), Y12
	VPBROADCASTQ counter+24(FP), Y13
	VPADDQ       seq64<>+0(SB), Y12, Y12
	VPADDQ       seq64<>+32(SB), Y13, Y13
	VPUNPCKLDQ   Y13, Y12, Y14
	VPUNPCKHDQ   Y13, Y12, Y15
	VPUNPCKLDQ   Y15, Y14, Y12
	VPUNPCKHDQ   Y15, Y14, Y13
	VPERMQ       $0xd8, Y12, Y12
	VPERMQ       $0xd8, Y13, Y13
	VMOVDQU      Y12, 512(SP)
	VMOVDQU      Y13, 544(SP)

	// Initialize flags
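	// 576(SP) holds one 32-bit flag word per block of the chunk: block 0
	// additionally gets CHUNK_START (0x01) and block 15 gets CHUNK_END
	// (0x02).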
	VPBROADCASTD flags+32(FP), Y14
	VMOVDQU      Y14, 576(SP)
	VMOVDQU      Y14, 608(SP)
	ORL          $0x01, 576(SP)
	ORL          $0x02, 636(SP)

	// Loop index
	XORQ DX, DX

loop:
	// Load transposed block
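	// seq<<10 gives each lane the base offset of its own 1 KiB chunk, so
	// every gather pulls the same message word from all 8 chunks at once
	// and the 16 words land on the stack already transposed. CX then
	// advances 64 bytes to the next block of every chunk.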
	VMOVDQU    seq<>+0(SB), Y9
	VPSLLD     $0x0a, Y9, Y9
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, (CX)(Y9*1), Y10
	VMOVDQU    Y10, (SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 4(CX)(Y9*1), Y10
	VMOVDQU    Y10, 32(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 8(CX)(Y9*1), Y10
	VMOVDQU    Y10, 64(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 12(CX)(Y9*1), Y10
	VMOVDQU    Y10, 96(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 16(CX)(Y9*1), Y10
	VMOVDQU    Y10, 128(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 20(CX)(Y9*1), Y10
	VMOVDQU    Y10, 160(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 24(CX)(Y9*1), Y10
	VMOVDQU    Y10, 192(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 28(CX)(Y9*1), Y10
	VMOVDQU    Y10, 224(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 32(CX)(Y9*1), Y10
	VMOVDQU    Y10, 256(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 36(CX)(Y9*1), Y10
	VMOVDQU    Y10, 288(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 40(CX)(Y9*1), Y10
	VMOVDQU    Y10, 320(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 44(CX)(Y9*1), Y10
	VMOVDQU    Y10, 352(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 48(CX)(Y9*1), Y10
	VMOVDQU    Y10, 384(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 52(CX)(Y9*1), Y10
	VMOVDQU    Y10, 416(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 56(CX)(Y9*1), Y10
	VMOVDQU    Y10, 448(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 60(CX)(Y9*1), Y10
	VMOVDQU    Y10, 480(SP)
	ADDQ       $0x40, CX

	// Reload state vectors (other than CVs)
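	// IV words go to Y8-Y11, the per-lane counter halves to Y12/Y13, the
	// block length (64) to Y14, and this block's flags to Y15. State word
	// 8 is immediately spilled to 640(SP) so Y8 can serve as scratch for
	// the rotates below.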
	VPBROADCASTD iv<>+0(SB), Y8
	VPBROADCASTD iv<>+4(SB), Y9
	VPBROADCASTD iv<>+8(SB), Y10
	VPBROADCASTD iv<>+12(SB), Y11
	VMOVDQU      512(SP), Y12
	VMOVDQU      544(SP), Y13
	VPBROADCASTD seq<>+4(SB), Y14
	VPSLLD       $0x06, Y14, Y14
	VPBROADCASTD 576(SP)(DX*4), Y15
	VMOVDQU      Y8, 640(SP)

	// Round 1
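	// Each column/diagonal step below is one BLAKE3 G mix, roughly:
	//   a += b + m0; d = rotr32(d^a, 16)
	//   c += d;      b = rotr32(b^c, 12)
	//   a += b + m1; d = rotr32(d^a, 8)
	//   c += d;      b = rotr32(b^c, 7)
	// The 16- and 8-bit rotates use VPSHUFB byte shuffles; the 12- and
	// 7-bit rotates use a shift/shift/OR pair, since AVX2 has no 32-bit
	// rotate instruction.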
	VPADDD  Y0, Y4, Y0
	VPADDD  (SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  32(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  128(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  160(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  256(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  288(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  384(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  416(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 2
	VPADDD  Y0, Y4, Y0
	VPADDD  64(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  192(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  224(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  (SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  32(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  352(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  288(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  448(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 3
	VPADDD  Y0, Y4, Y0
	VPADDD  96(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  128(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  416(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  64(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  192(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  160(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  352(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  480(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 4
	VPADDD  Y0, Y4, Y0
	VPADDD  320(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  224(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  448(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  96(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  128(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  (SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  160(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  256(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 5
	VPADDD  Y0, Y4, Y0
	VPADDD  384(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  416(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  480(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  320(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  224(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  64(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  (SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  32(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 6
	VPADDD  Y0, Y4, Y0
	VPADDD  288(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  448(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  256(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  384(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  416(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  96(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  64(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  192(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 7
	VPADDD  Y0, Y4, Y0
	VPADDD  352(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  480(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  32(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  288(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  448(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  320(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  96(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  128(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VMOVDQU 640(SP), Y8

	// Finalize CVs
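	// cv[i] = v[i] ^ v[i+8]; the updated CVs stay in Y0-Y7 and chain
	// directly into the next block of the loop.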
	VPXOR Y0, Y8, Y0
	VPXOR Y1, Y9, Y1
	VPXOR Y2, Y10, Y2
	VPXOR Y3, Y11, Y3
	VPXOR Y4, Y12, Y4
	VPXOR Y5, Y13, Y5
	VPXOR Y6, Y14, Y6
	VPXOR Y7, Y15, Y7

	// Loop
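	// 16 blocks per 1 KiB chunk.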
	INCQ DX
	CMPQ DX, $0x00000010
	JNE  loop

	// Finished; transpose CVs
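	// Convert from the one-word-per-lane layout back to eight contiguous
	// 8-word CVs and store them to cvs.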
	VPUNPCKLDQ  Y1, Y0, Y8
	VPUNPCKHDQ  Y1, Y0, Y9
	VPUNPCKLDQ  Y3, Y2, Y10
	VPUNPCKHDQ  Y3, Y2, Y11
	VPUNPCKLDQ  Y5, Y4, Y12
	VPUNPCKHDQ  Y5, Y4, Y13
	VPUNPCKLDQ  Y7, Y6, Y14
	VPUNPCKHDQ  Y7, Y6, Y15
	VPUNPCKLQDQ Y10, Y8, Y0
	VPUNPCKHQDQ Y10, Y8, Y1
	VPUNPCKLQDQ Y11, Y9, Y2
	VPUNPCKHQDQ Y11, Y9, Y3
	VPUNPCKLQDQ Y14, Y12, Y4
	VPUNPCKHQDQ Y14, Y12, Y5
	VPUNPCKLQDQ Y15, Y13, Y6
	VPUNPCKHQDQ Y15, Y13, Y7
	VPERM2I128  $0x20, Y4, Y0, Y8
	VPERM2I128  $0x31, Y4, Y0, Y12
	VPERM2I128  $0x20, Y5, Y1, Y9
	VPERM2I128  $0x31, Y5, Y1, Y13
	VPERM2I128  $0x20, Y6, Y2, Y10
	VPERM2I128  $0x31, Y6, Y2, Y14
	VPERM2I128  $0x20, Y7, Y3, Y11
	VPERM2I128  $0x31, Y7, Y3, Y15
	VMOVDQU     Y8, (AX)
	VMOVDQU     Y9, 32(AX)
	VMOVDQU     Y10, 64(AX)
	VMOVDQU     Y11, 96(AX)
	VMOVDQU     Y12, 128(AX)
	VMOVDQU     Y13, 160(AX)
	VMOVDQU     Y14, 192(AX)
	VMOVDQU     Y15, 224(AX)
	VZEROUPPER
	RET

// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
// Requires: AVX, AVX2
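// Each of the 8 parents is compressed in its own 32-bit lane. A parent's
// 64-byte block is the pair of child CVs cvs[2i] and cvs[2i+1], the chaining
// value is the key, the counter is zero, and the PARENT flag (0x04) is set.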
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28
	MOVQ parents+0(FP), AX
	MOVQ cvs+8(FP), CX
	MOVQ key+16(FP), DX

	// Load transposed block
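	// seq<<6 gives a 64-byte stride, so lane i reads the CV pair
	// cvs[2i], cvs[2i+1] that forms parent i's message block.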
	VMOVDQU    seq<>+0(SB), Y9
	VPSLLD     $0x06, Y9, Y9
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, (CX)(Y9*1), Y10
	VMOVDQU    Y10, (SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 4(CX)(Y9*1), Y10
	VMOVDQU    Y10, 32(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 8(CX)(Y9*1), Y10
	VMOVDQU    Y10, 64(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 12(CX)(Y9*1), Y10
	VMOVDQU    Y10, 96(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 16(CX)(Y9*1), Y10
	VMOVDQU    Y10, 128(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 20(CX)(Y9*1), Y10
	VMOVDQU    Y10, 160(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 24(CX)(Y9*1), Y10
	VMOVDQU    Y10, 192(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 28(CX)(Y9*1), Y10
	VMOVDQU    Y10, 224(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 32(CX)(Y9*1), Y10
	VMOVDQU    Y10, 256(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 36(CX)(Y9*1), Y10
	VMOVDQU    Y10, 288(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 40(CX)(Y9*1), Y10
	VMOVDQU    Y10, 320(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 44(CX)(Y9*1), Y10
	VMOVDQU    Y10, 352(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 48(CX)(Y9*1), Y10
	VMOVDQU    Y10, 384(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 52(CX)(Y9*1), Y10
	VMOVDQU    Y10, 416(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 56(CX)(Y9*1), Y10
	VMOVDQU    Y10, 448(SP)
	VPCMPEQD   Y8, Y8, Y8
	VPGATHERDD Y8, 60(CX)(Y9*1), Y10
	VMOVDQU    Y10, 480(SP)

	// Initialize state vectors
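	// Parent compressions use the key as the chaining value, a zero
	// counter (Y12/Y13), a block length of 64 (Y14), and the PARENT flag
	// ORed into flags (Y15). State word 8 is spilled to 512(SP), as in
	// the other kernels, to free Y8 for rotate scratch.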
	VPBROADCASTD (DX), Y0
	VPBROADCASTD 4(DX), Y1
	VPBROADCASTD 8(DX), Y2
	VPBROADCASTD 12(DX), Y3
	VPBROADCASTD 16(DX), Y4
	VPBROADCASTD 20(DX), Y5
	VPBROADCASTD 24(DX), Y6
	VPBROADCASTD 28(DX), Y7
	VPBROADCASTD iv<>+0(SB), Y8
	VPBROADCASTD iv<>+4(SB), Y9
	VPBROADCASTD iv<>+8(SB), Y10
	VPBROADCASTD iv<>+12(SB), Y11
	VPXOR        Y12, Y12, Y12
	VPXOR        Y13, Y13, Y13
	VPBROADCASTD seq<>+4(SB), Y14
	VPSLLD       $0x06, Y14, Y14
	ORL          $0x04, flags+24(FP)
	VPBROADCASTD flags+24(FP), Y15
	VMOVDQU      Y8, 512(SP)

	// Round 1
	VPADDD  Y0, Y4, Y0
	VPADDD  (SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  32(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  128(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  160(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  256(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  288(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  384(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  416(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 2
	VPADDD  Y0, Y4, Y0
	VPADDD  64(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  192(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  224(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  (SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  32(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  352(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  288(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  448(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 3
	VPADDD  Y0, Y4, Y0
	VPADDD  96(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  128(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  416(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  64(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  192(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  160(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  352(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  480(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 4
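	// Rounds 4 through 7 repeat the same quarter-round pattern as above;
	// only the stack offsets of the message vectors change, following the
	// BLAKE3 message-word permutation for each round.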
	VPADDD  Y0, Y4, Y0
	VPADDD  320(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  224(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  448(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  96(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  128(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  (SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  160(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  256(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 5
	VPADDD  Y0, Y4, Y0
	VPADDD  384(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  416(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  288(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  480(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  320(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  448(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  224(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  64(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  96(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  (SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  32(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 6
	VPADDD  Y0, Y4, Y0
	VPADDD  288(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  448(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  352(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  256(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  384(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  480(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  32(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  416(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  96(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  320(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  64(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  192(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  128(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4

	// Round 7
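	// Final round: after it the working state is folded into chaining
	// values below.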
	VPADDD  Y0, Y4, Y0
	VPADDD  352(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y0, Y4, Y0
	VPADDD  480(SP), Y0, Y0
	VPXOR   Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y12, Y8
	VPXOR   Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y1, Y5, Y1
	VPADDD  160(SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y5, Y1
	VPADDD  (SP), Y1, Y1
	VPXOR   Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD  Y9, Y13, Y9
	VPXOR   Y5, Y9, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y2, Y6, Y2
	VPADDD  32(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y6, Y2
	VPADDD  288(SP), Y2, Y2
	VPXOR   Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y10, Y14, Y10
	VPXOR   Y6, Y10, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y3, Y7, Y3
	VPADDD  256(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y7, Y3
	VPADDD  192(SP), Y3, Y3
	VPXOR   Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y11, Y15, Y11
	VPXOR   Y7, Y11, Y7
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y0, Y5, Y0
	VPADDD  448(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x0c, Y5, Y8
	VPSLLD  $0x14, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y0, Y5, Y0
	VPADDD  320(SP), Y0, Y0
	VPXOR   Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD  Y10, Y15, Y10
	VPXOR   Y5, Y10, Y5
	VPSRLD  $0x07, Y5, Y8
	VPSLLD  $0x19, Y5, Y5
	VPOR    Y5, Y8, Y5
	VPADDD  Y1, Y6, Y1
	VPADDD  64(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x0c, Y6, Y8
	VPSLLD  $0x14, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y1, Y6, Y1
	VPADDD  384(SP), Y1, Y1
	VPXOR   Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD  Y11, Y12, Y11
	VPXOR   Y6, Y11, Y6
	VPSRLD  $0x07, Y6, Y8
	VPSLLD  $0x19, Y6, Y6
	VPOR    Y6, Y8, Y6
	VPADDD  Y2, Y7, Y2
	VPADDD  96(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x0c, Y7, Y8
	VPSLLD  $0x14, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y2, Y7, Y2
	VPADDD  128(SP), Y2, Y2
	VPXOR   Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD  Y8, Y13, Y8
	VPXOR   Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD  $0x07, Y7, Y8
	VPSLLD  $0x19, Y7, Y7
	VPOR    Y7, Y8, Y7
	VPADDD  Y3, Y4, Y3
	VPADDD  224(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x0c, Y4, Y8
	VPSLLD  $0x14, Y4, Y4
	VPOR    Y4, Y8, Y4
	VPADDD  Y3, Y4, Y3
	VPADDD  416(SP), Y3, Y3
	VPXOR   Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD  Y9, Y14, Y9
	VPXOR   Y4, Y9, Y4
	VPSRLD  $0x07, Y4, Y8
	VPSLLD  $0x19, Y4, Y4
	VPOR    Y4, Y8, Y4
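	// Reload the state row spilled at 512(SP) so Y8..Y15 hold the full
	// second half of the state for finalization.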
	VMOVDQU 512(SP), Y8

	// Finalize CVs
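	// XOR the two halves of the state (Y0..Y7 ^= Y8..Y15) to produce the
	// eight chaining values, one per vector lane, then transpose the
	// resulting 8x8 matrix of 32-bit words: the dword and qword unpacks
	// rearrange words within 128-bit lanes and VPERM2I128 recombines the
	// lane halves, so that each of Y8..Y15 ends up holding one complete
	// 32-byte CV. The CVs are stored contiguously through AX.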
	VPXOR       Y0, Y8, Y0
	VPXOR       Y1, Y9, Y1
	VPXOR       Y2, Y10, Y2
	VPXOR       Y3, Y11, Y3
	VPXOR       Y4, Y12, Y4
	VPXOR       Y5, Y13, Y5
	VPXOR       Y6, Y14, Y6
	VPXOR       Y7, Y15, Y7
	VPUNPCKLDQ  Y1, Y0, Y8
	VPUNPCKHDQ  Y1, Y0, Y9
	VPUNPCKLDQ  Y3, Y2, Y10
	VPUNPCKHDQ  Y3, Y2, Y11
	VPUNPCKLDQ  Y5, Y4, Y12
	VPUNPCKHDQ  Y5, Y4, Y13
	VPUNPCKLDQ  Y7, Y6, Y14
	VPUNPCKHDQ  Y7, Y6, Y15
	VPUNPCKLQDQ Y10, Y8, Y0
	VPUNPCKHQDQ Y10, Y8, Y1
	VPUNPCKLQDQ Y11, Y9, Y2
	VPUNPCKHQDQ Y11, Y9, Y3
	VPUNPCKLQDQ Y14, Y12, Y4
	VPUNPCKHQDQ Y14, Y12, Y5
	VPUNPCKLQDQ Y15, Y13, Y6
	VPUNPCKHQDQ Y15, Y13, Y7
	VPERM2I128  $0x20, Y4, Y0, Y8
	VPERM2I128  $0x31, Y4, Y0, Y12
	VPERM2I128  $0x20, Y5, Y1, Y9
	VPERM2I128  $0x31, Y5, Y1, Y13
	VPERM2I128  $0x20, Y6, Y2, Y10
	VPERM2I128  $0x31, Y6, Y2, Y14
	VPERM2I128  $0x20, Y7, Y3, Y11
	VPERM2I128  $0x31, Y7, Y3, Y15
	VMOVDQU     Y8, (AX)
	VMOVDQU     Y9, 32(AX)
	VMOVDQU     Y10, 64(AX)
	VMOVDQU     Y11, 96(AX)
	VMOVDQU     Y12, 128(AX)
	VMOVDQU     Y13, 160(AX)
	VMOVDQU     Y14, 192(AX)
	VMOVDQU     Y15, 224(AX)
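	// Clear the upper halves of the YMM registers before returning, per
	// the usual AVX/SSE transition convention.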
	VZEROUPPER
	RET