//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc

// This file contains the specialisation of Decoder.Decompress4X
// and Decoder.Decompress1X that use an asm implementation of their main loops.
package huff0

import (
	"errors"
	"fmt"

	// NOTE(review): the import list was empty in the damaged source; this path
	// is restored from the cpuinfo.HasBMI2() call site — confirm against go.mod.
	"github.com/klauspost/compress/internal/cpuinfo"
)

// decompress4x_main_loop_amd64 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
//
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)

// decompress4x_8b_main_loop_amd64 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
//
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)

// fallback8BitSize is the size where using Go version is faster.
const fallback8BitSize = 800

// decompress4xContext carries the arguments and the result of the 4X
// assembler main loops.
type decompress4xContext struct {
	pbr      *[4]bitReaderShifted // the four per-stream bit readers
	peekBits uint8                // (64 - actualTableLog) & 63, see bitReaderShifted.peekBitsFast()
	out      *byte                // first byte of the output buffer
	dstEvery int                  // distance between the starts of consecutive output quarters
	tbl      *dEntrySingle        // first entry of the single-symbol decoding table
	decoded  int                  // written by the asm loop; read back as the number of bytes decoded across all four streams
	limit    *byte                // address in the first quarter at which the asm loop must stop
}

// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
//
// The input starts with a 6 byte "jump table" holding three little-endian
// 16 bit lengths for the first three streams; the fourth stream is whatever
// remains. Each stream decodes into its own quarter of dst. The bulk of the
// work is done by an assembler loop and the last few symbols of every stream
// are decoded in Go.
//
// On success dst, resliced to its full capacity, is returned. A nil slice and
// an error are returned when no table is loaded, the input is too small or
// truncated, or the streams do not decode to exactly cap(dst) bytes.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
	if len(d.dt.single) == 0 {
		return nil, errors.New("no table loaded")
	}
	if len(src) < 6+(4*1) {
		return nil, errors.New("input too small")
	}

	use8BitTables := d.actualTableLog <= 8
	if cap(dst) < fallback8BitSize && use8BitTables {
		return d.decompress4X8bit(dst, src)
	}

	var br [4]bitReaderShifted
	// Decode "jump table"
	start := 6
	for i := 0; i < 3; i++ {
		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
		if start+length >= len(src) {
			return nil, errors.New("truncated input (or invalid offset)")
		}
		err := br[i].init(src[start : start+length])
		if err != nil {
			return nil, err
		}
		start += length
	}
	err := br[3].init(src[start:])
	if err != nil {
		return nil, err
	}

	// destination, offset to match first output
	dstSize := cap(dst)
	dst = dst[:dstSize]
	out := dst
	dstEvery := (dstSize + 3) / 4

	const tlSize = 1 << tableLogMax
	const tlMask = tlSize - 1
	single := d.dt.single[:tlSize]

	var decoded int

	// The asm loop is only entered when the output is larger than 16 bytes
	// and every reader still has at least 4 bytes of input left.
	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
		ctx := decompress4xContext{
			pbr:      &br,
			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
			out:      &out[0],
			dstEvery: dstEvery,
			tbl:      &single[0],
			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
		}
		if use8BitTables {
			decompress4x_8b_main_loop_amd64(&ctx)
		} else {
			decompress4x_main_loop_amd64(&ctx)
		}

		decoded = ctx.decoded
		// Every stream advanced by decoded/4 bytes; move the window so the
		// per-stream offsets below can still be computed as dstEvery*i.
		out = out[decoded/4:]
	}

	// Decode remaining.
	remainBytes := dstEvery - (decoded / 4)
	for i := range br {
		offset := dstEvery * i
		endsAt := offset + remainBytes
		if endsAt > len(out) {
			// The last quarter may be shorter than dstEvery.
			endsAt = len(out)
		}
		r := &br[i]
		bitsLeft := r.remaining()
		for bitsLeft > 0 {
			r.fill()
			if offset >= endsAt {
				return nil, errors.New("corruption detected: stream overrun 4")
			}

			// Read value and increment offset.
			val := r.peekBitsFast(d.actualTableLog)
			v := single[val&tlMask].entry
			// Low byte of the entry is the bit count, high byte the symbol.
			nBits := uint8(v)
			r.advance(nBits)
			bitsLeft -= uint(nBits)
			out[offset] = uint8(v >> 8)
			offset++
		}
		if offset != endsAt {
			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
		}
		decoded += offset - dstEvery*i
		err = r.close()
		if err != nil {
			return nil, err
		}
	}
	if dstSize != decoded {
		return nil, errors.New("corruption detected: short output block")
	}
	return dst, nil
}

// decompress1x_main_loop_amd64 is an x86 assembler implementation
// of Decompress1X when tablelog > 8.
//
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)

// decompress1x_main_loop_bmi2 is an x86 with BMI2 assembler implementation
// of Decompress1X when tablelog > 8.
//
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)

// decompress1xContext carries the arguments and the result of the 1X
// assembler main loops.
type decompress1xContext struct {
	pbr      *bitReaderShifted // the stream's bit reader
	peekBits uint8             // (64 - actualTableLog) & 63, see bitReaderShifted.peekBitsFast()
	out      *byte             // first byte of the output buffer
	outCap   int               // capacity of the output buffer
	tbl      *dEntrySingle     // first entry of the single-symbol decoding table
	decoded  int               // written by the asm loop: bytes decoded, or error_max_decoded_size_exeeded
}

// Error reported by asm implementations
const error_max_decoded_size_exeeded = -1

// Decompress1X will decompress a 1X encoded stream.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
func ( *Decoder) (, []byte) ([]byte, error) { if len(.dt.single) == 0 { return nil, errors.New("no table loaded") } var bitReaderShifted := .init() if != nil { return , } := cap() = [:] const = 1 << tableLogMax const = - 1 if >= 4 { := decompress1xContext{ pbr: &, out: &[0], outCap: , peekBits: uint8((64 - .actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() tbl: &.dt.single[0], } if cpuinfo.HasBMI2() { decompress1x_main_loop_bmi2(&) } else { decompress1x_main_loop_amd64(&) } if .decoded == error_max_decoded_size_exeeded { return nil, ErrMaxDecodedSizeExceeded } = [:.decoded] } // br < 8, so uint8 is fine := uint8(.off)*8 + 64 - .bitsRead for > 0 { .fill() if len() >= { .close() return nil, ErrMaxDecodedSizeExceeded } := .dt.single[.peekBitsFast(.actualTableLog)&] := uint8(.entry) .advance() -= = append(, uint8(.entry>>8)) } return , .close() }