package guts

import (
	
)

//go:generate go run avo/gen.go -out blake3_amd64.s

// compressChunksAVX512 is declared without a body, so its implementation must
// be supplied externally; the go:generate directive in this file emits
// blake3_amd64.s, which is presumably where it lives.
//
// The parameters are unnamed. By type they are a 16x8 array of words, a
// buffer of 16*ChunkSize bytes, an 8-word array, a 64-bit value and a 32-bit
// value — presumably output chaining values, input chunks, key, starting
// chunk counter and flags. TODO confirm against the assembly.
//
//go:noescape
func compressChunksAVX512( *[16][8]uint32,  *[16 * ChunkSize]byte,  *[8]uint32,  uint64,  uint32)

// compressChunksAVX2 is declared without a body, so its implementation must
// be supplied externally; the go:generate directive in this file emits
// blake3_amd64.s, which is presumably where it lives.
//
// The parameters are unnamed. By type they are an 8x8 array of words, a
// buffer of 8*ChunkSize bytes, an 8-word array, a 64-bit value and a 32-bit
// value — presumably output chaining values, input chunks, key, starting
// chunk counter and flags. TODO confirm against the assembly.
//
//go:noescape
func compressChunksAVX2( *[8][8]uint32,  *[8 * ChunkSize]byte,  *[8]uint32,  uint64,  uint32)

// compressBlocksAVX512 is declared without a body, so its implementation must
// be supplied externally; the go:generate directive in this file emits
// blake3_amd64.s, which is presumably where it lives.
//
// The parameters are unnamed. Going by the call in CompressBlocks they are,
// in order: a 1024-byte output buffer, a node's Block, its CV, its Counter,
// its BlockLen and its Flags.
//
//go:noescape
func compressBlocksAVX512( *[1024]byte,  *[16]uint32,  *[8]uint32,  uint64,  uint32,  uint32)

// compressBlocksAVX2 is declared without a body, so its implementation must
// be supplied externally; the go:generate directive in this file emits
// blake3_amd64.s, which is presumably where it lives.
//
// The parameters are unnamed. Going by the calls in CompressBlocks they are,
// in order: a 512-byte output buffer, a node's Block, its CV, a counter
// value, its BlockLen and its Flags.
//
//go:noescape
func compressBlocksAVX2( *[512]byte,  *[16]uint32,  *[8]uint32,  uint64,  uint32,  uint32)

// compressParentsAVX2 is declared without a body, so its implementation must
// be supplied externally; the go:generate directive in this file emits
// blake3_amd64.s, which is presumably where it lives.
//
// The parameters are unnamed. By type they are an 8x8 array of words, a 16x8
// array of words, an 8-word array and a 32-bit value — presumably the output
// parent chaining values, the input chaining values, the key and the flags.
// TODO confirm against the assembly. mergeSubtrees passes the same underlying
// array for the first two arguments.
//
//go:noescape
func compressParentsAVX2( *[8][8]uint32,  *[16][8]uint32,  *[8]uint32,  uint32)

// compressBufferAVX512 compresses a buffer of up to MaxSIMD chunks using the
// AVX-512 chunk routine and returns the Node produced by mergeSubtrees.
//
// NOTE(review): parameter and local names are missing from this copy of the
// file. By type the parameters are presumably the input buffer, the number
// of valid bytes in it, the key, the starting chunk counter and the flags —
// confirm against the original source before relying on this description.
func compressBufferAVX512( *[MaxSIMD * ChunkSize]byte,  int,  *[8]uint32,  uint64,  uint32) Node {
	// Scratch space for one 8-word value per chunk.
	var  [MaxSIMD][8]uint32
	// A single assembly call handles the whole buffer.
	compressChunksAVX512(&, , , , )
	// Integer division by ChunkSize — presumably the count of complete chunks.
	 := uint64( / ChunkSize)
	if %ChunkSize != 0 {
		// use non-asm for remainder
		// The trailing partial chunk is sliced out, run through CompressChunk,
		// reduced with ChainingValue and stored, after which the count is
		// bumped by one.
		 := [-%ChunkSize : ]
		[] = ChainingValue(CompressChunk(, , +, ))
		++
	}
	return mergeSubtrees(&, , , )
}

// compressBufferAVX2 compresses a buffer of up to MaxSIMD chunks using the
// AVX2 chunk routine, which handles 8 chunks per call, and returns the Node
// produced by mergeSubtrees.
//
// NOTE(review): parameter and local names are missing from this copy of the
// file. By type the parameters are presumably the input buffer, the number
// of valid bytes in it, the key, the starting chunk counter and the flags —
// confirm against the original source before relying on this description.
func compressBufferAVX2( *[MaxSIMD * ChunkSize]byte,  int,  *[8]uint32,  uint64,  uint32) Node {
	// Scratch space for one 8-word value per chunk.
	var  [MaxSIMD][8]uint32
	// Reinterpret two arrays as two halves each, one half per assembly call:
	// a [MaxSIMD][8]uint32 as 2 x [8][8]uint32 and a pointer as
	// 2 x [8*ChunkSize]byte. The casts only line up if MaxSIMD is 16 —
	// TODO confirm.
	 := (*[2][8][8]uint32)(unsafe.Pointer(&))
	 := (*[2][8 * ChunkSize]byte)(unsafe.Pointer())
	// First half is always processed.
	compressChunksAVX2(&[0], &[0], , , )
	// Integer division by ChunkSize — presumably the count of complete chunks.
	 := uint64( / ChunkSize)
	if  > 8 {
		// Second half is processed only when the compared value exceeds 8; the
		// counter argument is advanced by 8 to account for the first half.
		compressChunksAVX2(&[1], &[1], , +8, )
	}
	if %ChunkSize != 0 {
		// use non-asm for remainder
		// The trailing partial chunk is sliced out, run through CompressChunk,
		// reduced with ChainingValue and stored, after which the count is
		// bumped by one.
		 := [-%ChunkSize : ]
		[] = ChainingValue(CompressChunk(, , +, ))
		++
	}
	return mergeSubtrees(&, , , )
}

// CompressBuffer compresses up to MaxSIMD chunks in parallel and returns their
// root node.
//
// Inputs no larger than ChunkSize are handed straight to CompressChunk.
// Larger inputs are dispatched on CPU support: the AVX-512 path, then the
// AVX2 path, each only when the compared length is at least two chunks, and
// otherwise the generic implementation.
//
// NOTE(review): the function name and parameter names are missing from this
// copy of the file; the name above is taken from the existing doc comment.
// By type the parameters are presumably the buffer, the number of valid
// bytes, the key, the chunk counter and the flags — confirm.
func ( *[MaxSIMD * ChunkSize]byte,  int,  *[8]uint32,  uint64,  uint32) Node {
	// At most one chunk: no tree to build.
	if  <= ChunkSize {
		return CompressChunk([:], , , )
	}
	switch {
	case haveAVX512 &&  >= ChunkSize*2:
		return compressBufferAVX512(, , , , )
	case haveAVX2 &&  >= ChunkSize*2:
		return compressBufferAVX2(, , , , )
	default:
		// No usable SIMD support, or more than one but fewer than two chunks.
		return compressBufferGeneric(, , , , )
	}
}

// CompressChunk compresses a single chunk, returning its final (uncompressed)
// node.
//
// Every block except the last is folded into the node's CV via ChainingValue.
// The last block, which may be a full BlockSize bytes because the loop
// condition is strict, is left in the node zero-padded, with BlockLen set to
// its real length and FlagChunkEnd set.
//
// NOTE(review): the function name, parameter names and local names are
// missing from this copy of the file; the name above is taken from the
// existing doc comment. By type the parameters are presumably the chunk
// bytes, the key, the chunk counter and the flags — confirm.
func ( []byte,  *[8]uint32,  uint64,  uint32) Node {
	// Start with a full-length block and the chunk-start flag set.
	 := Node{
		CV:       *,
		Counter:  ,
		BlockLen: BlockSize,
		Flags:     | FlagChunkStart,
	}
	// Byte-slice view over the node's Block words, so input can be copied in
	// directly.
	 := (*[64]byte)(unsafe.Pointer(&.Block))[:]
	for len() > BlockSize {
		copy(, )
		 = [BlockSize:]
		.CV = ChainingValue()
		// The chunk-start flag is cleared after the first block.
		.Flags &^= FlagChunkStart
	}
	// pad last block with zeros
	.Block = [16]uint32{}
	copy(, )
	.BlockLen = uint32(len())
	.Flags |= FlagChunkEnd
	return 
}

// CompressBlocks compresses MaxSIMD copies of n with successive counter values,
// storing the results in out.
//
// The AVX-512 routine fills the whole output in one call. The AVX2 routine
// fills 512 bytes per call, so it is invoked twice, the second time with the
// counter advanced by 8. Without either, the generic implementation is used.
//
// NOTE(review): the function name and parameter names are missing from this
// copy of the file; the name above and the names "n" and "out" are taken
// from the existing doc comment.
func ( *[MaxSIMD * BlockSize]byte,  Node) {
	switch {
	case haveAVX512:
		compressBlocksAVX512(, &.Block, &.CV, .Counter, .BlockLen, .Flags)
	case haveAVX2:
		// View the output as two 512-byte halves, one per AVX2 call.
		 := (*[2][512]byte)(unsafe.Pointer())
		compressBlocksAVX2(&[0], &.Block, &.CV, .Counter, .BlockLen, .Flags)
		compressBlocksAVX2(&[1], &.Block, &.CV, .Counter+8, .BlockLen, .Flags)
	default:
		// The generic routine takes the output as MaxSIMD 64-byte blocks.
		 := (*[MaxSIMD][64]byte)(unsafe.Pointer())
		compressBlocksGeneric(, )
	}
}

// mergeSubtrees reduces an array of 8-word values to a single Node. Without
// AVX2 it defers to mergeSubtreesGeneric. Otherwise it calls
// compressParentsAVX2 repeatedly, halving a count each round until at most
// two entries remain, and then joins entries 0 and 1 with ParentNode.
//
// NOTE(review): parameter and local names are missing from this copy of the
// file. By type the parameters are presumably the chaining values, how many
// of them are valid, the key and the flags — confirm against the original
// source.
func mergeSubtrees( *[MaxSIMD][8]uint32,  uint64,  *[8]uint32,  uint32) Node {
	if !haveAVX2 {
		return mergeSubtreesGeneric(, , , )
	}
	for  > 2 {
		if %2 == 0 {
			// Even case: results are written through a pointer reinterpreted as
			// [8][8]uint32.
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer()), , , )
		} else {
			// Odd case: the last entry is saved before the call and written
			// back at index count/2 afterwards — presumably because it has no
			// sibling and is carried up a level unchanged. The count is then
			// incremented so the halving below rounds up.
			 := [-1]
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer()), , , )
			[/2] = 
			++
		}
		 /= 2
	}
	return ParentNode([0], [1], , )
}

// BytesToWords converts an array of 64 bytes to an array of 16 words.
//
// The conversion reinterprets the bytes in memory through unsafe.Pointer
// rather than decoding them, so the resulting words use the platform's native
// byte order.
//
// NOTE(review): the function and parameter names are missing from this copy
// of the file; the name above is taken from the existing doc comment.
func ( [64]byte) [16]uint32 {
	return *(*[16]uint32)(unsafe.Pointer(&))
}

// WordsToBytes converts an array of 16 words to an array of 64 bytes.
//
// The conversion reinterprets the words in memory through unsafe.Pointer
// rather than encoding them, so the resulting bytes reflect the platform's
// native byte order.
//
// NOTE(review): the function and parameter names are missing from this copy
// of the file; the name above is taken from the existing doc comment.
func ( [16]uint32) [64]byte {
	return *(*[64]byte)(unsafe.Pointer(&))
}