// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package snapref

// load32 returns the little-endian uint32 stored at b[i:i+4].
// The three-index reslice narrows the slice so the compiler can prove the
// four element accesses below are in bounds (bounds-check elimination).
func load32(b []byte, i int) uint32 {
	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

// load64 returns the little-endian uint64 stored at b[i:i+8].
// The three-index reslice narrows the slice so the compiler can prove the
// eight element accesses below are in bounds (bounds-check elimination).
func load64(b []byte, i int) uint64 {
	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
//
//	dst is long enough to hold the encoded bytes
//	1 <= len(lit) && len(lit) <= 65536
func emitLiteral(,  []byte) int {
	,  := 0, uint(len()-1)
	switch {
	case  < 60:
		[0] = uint8()<<2 | tagLiteral
		 = 1
	case  < 1<<8:
		[0] = 60<<2 | tagLiteral
		[1] = uint8()
		 = 2
	default:
		[0] = 61<<2 | tagLiteral
		[1] = uint8()
		[2] = uint8( >> 8)
		 = 3
	}
	return  + copy([:], )
}

// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
//
//	dst is long enough to hold the encoded bytes
//	1 <= offset && offset <= 65535
//	4 <= length && length <= 65535
func emitCopy( []byte, ,  int) int {
	 := 0
	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
	// length emitted down below is a little lower (at 60 = 64 - 4), because
	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
	for  >= 68 {
		// Emit a length 64 copy, encoded as 3 bytes.
		[+0] = 63<<2 | tagCopy2
		[+1] = uint8()
		[+2] = uint8( >> 8)
		 += 3
		 -= 64
	}
	if  > 64 {
		// Emit a length 60 copy, encoded as 3 bytes.
		[+0] = 59<<2 | tagCopy2
		[+1] = uint8()
		[+2] = uint8( >> 8)
		 += 3
		 -= 60
	}
	if  >= 12 ||  >= 2048 {
		// Emit the remaining copy, encoded as 3 bytes.
		[+0] = uint8(-1)<<2 | tagCopy2
		[+1] = uint8()
		[+2] = uint8( >> 8)
		return  + 3
	}
	// Emit the remaining copy, encoded as 2 bytes.
	[+0] = uint8(>>8)<<5 | uint8(-4)<<2 | tagCopy1
	[+1] = uint8()
	return  + 2
}

// hash maps a 4-byte sequence (packed as a uint32) to a table index by
// multiplying with a Knuth-style constant and keeping the top bits. A
// larger shift yields a smaller table.
func hash(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}

// EncodeBlockInto exposes encodeBlock but checks dst size.
func (,  []byte) ( int) {
	if MaxEncodedLen(len()) > len() {
		return 0
	}

	// encodeBlock breaks on too big blocks, so split.
	for len() > 0 {
		 := 
		 = nil
		if len() > maxBlockSize {
			,  = [:maxBlockSize], [maxBlockSize:]
		}
		if len() < minNonLiteralBlockSize {
			 += emitLiteral([:], )
		} else {
			 += encodeBlock([:], )
		}
	}
	return 
}

// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(,  []byte) ( int) {
	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
	// The table element type is uint16, as s < sLimit and sLimit < len(src)
	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
	const (
		 = 1 << 14
		// tableMask is redundant, but helps the compiler eliminate bounds
		// checks.
		 =  - 1
	)
	 := uint32(32 - 8)
	for  := 1 << 8;  <  &&  < len();  *= 2 {
		--
	}
	// In Go, all array elements are zero-initialized, so there is no advantage
	// to a smaller tableSize per se. However, it matches the C++ algorithm,
	// and in the asm versions of this code, we can get away with zeroing only
	// the first tableSize elements.
	var  []uint16

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	 := len() - inputMargin

	// nextEmit is where in src the next emitLiteral should start from.
	 := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	 := 1
	 := hash(load32(, ), )

	for {
		// Copied from the C++ snappy implementation:
		//
		// Heuristic match skipping: If 32 bytes are scanned with no matches
		// found, start looking only at every other byte. If 32 more bytes are
		// scanned (or skipped), look at every third byte, etc.. When a match
		// is found, immediately go back to looking at every byte. This is a
		// small loss (~5% performance, ~0.1% density) for compressible data
		// due to more bookkeeping, but for non-compressible data (such as
		// JPEG) it's a huge win since the compressor quickly "realizes" the
		// data is incompressible and doesn't bother looking for matches
		// everywhere.
		//
		// The "skip" variable keeps track of how many bytes there are since
		// the last match; dividing it by 32 (ie. right-shifting by five) gives
		// the number of bytes to move ahead for each iteration.
		 := 32

		 := 
		 := 0
		for {
			 = 
			 :=  >> 5
			 =  + 
			 += 
			if  >  {
				goto 
			}
			 = int([&])
			[&] = uint16()
			 = hash(load32(, ), )
			if load32(, ) == load32(, ) {
				break
			}
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		 += emitLiteral([:], [:])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.
			 := 

			// Extend the 4-byte match as long as possible.
			//
			// This is an inlined version of:
			//	s = extendMatch(src, candidate+4, s+4)
			 += 4
			for  :=  + 4;  < len() && [] == []; ,  = +1, +1 {
			}

			 += emitCopy([:], -, -)
			 = 
			if  >=  {
				goto 
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-1 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			 := load64(, -1)
			 := hash(uint32(>>0), )
			[&] = uint16( - 1)
			 := hash(uint32(>>8), )
			 = int([&])
			[&] = uint16()
			if uint32(>>8) != load32(, ) {
				 = hash(uint32(>>16), )
				++
				break
			}
		}
	}

:
	if  < len() {
		 += emitLiteral([:], [:])
	}
	return 
}