package zstd

import (
    "bytes"
    "encoding/binary"
    "errors"
    "fmt"
    "io"
    "math"
    "sort"

    "github.com/klauspost/compress/huff0"
)

type dict struct {
    id uint32

    litEnc              *huff0.Scratch
    llDec, ofDec, mlDec sequenceDec
    offsets             [3]int
    content             []byte
}

const dictMagic = "\x37\xa4\x30\xec"
const dictMaxLength = 1 << 31

// ID returns the dictionary id or 0 if d is nil.
func (d *dict) ID() uint32 {
    if d == nil {
        return 0
    }
    return d.id
}

// ContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) ContentSize() int {
    if d == nil {
        return 0
    }
    return len(d.content)
}

// Content returns the dictionary content or nil if d is nil.
func (d *dict) Content() []byte {
    if d == nil {
        return nil
    }
    return d.content
}

// Offsets returns the initial repeat offsets, or the zero value if d is nil.
func (d *dict) Offsets() [3]int {
    if d == nil {
        return [3]int{}
    }
    return d.offsets
}

// LitEncoder returns the literal Huffman encoder, or nil if d is nil.
func (d *dict) LitEncoder() *huff0.Scratch {
    if d == nil {
        return nil
    }
    return d.litEnc
}
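
// loadDict parses a dictionary laid out as: a 4-byte magic, a 4-byte
// little-endian dictionary ID, a Huffman literal table, the offset,
// match-length and literal-length FSE tables, three 4-byte repeat offsets,
// and finally the raw dictionary content. This matches the "Dictionary Format"
// section of the Zstandard compression format documentation
// (doc/zstd_compression_format.md in the facebook/zstd repository).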
func loadDict(b []byte) (*dict, error) {
    // The fixed-size portion (magic, ID and the three repeat offsets) must be present.
    if len(b) <= 8+(3*4) {
        return nil, io.ErrUnexpectedEOF
    }
    d := dict{
        llDec: sequenceDec{fse: &fseDecoder{}},
        ofDec: sequenceDec{fse: &fseDecoder{}},
        mlDec: sequenceDec{fse: &fseDecoder{}},
    }
    if string(b[:4]) != dictMagic {
        return nil, ErrMagicMismatch
    }
    d.id = binary.LittleEndian.Uint32(b[4:8])
    if d.id == 0 {
        return nil, errors.New("dictionaries cannot have ID 0")
    }

    // Read the literal (Huffman) table.
    var err error
    d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
    if err != nil {
        return nil, fmt.Errorf("loading literal table: %w", err)
    }
    d.litEnc.Reuse = huff0.ReusePolicyMust

    br := byteReader{
        b:   b,
        off: 0,
    }
    readDec := func(i tableIndex, dec *fseDecoder) error {
        if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil {
            return err
        }
        if br.overread() {
            return io.ErrUnexpectedEOF
        }
        err = dec.transform(symbolTableX[i])
        if err != nil {
            println("Transform table error:", err)
            return err
        }
        if debugDecoder || debugEncoder {
            println("Read table ok", "symbolLen:", dec.symbolLen)
        }
        dec.preDefined = true
        return nil
    }

    if err := readDec(tableOffsets, d.ofDec.fse); err != nil {
        return nil, err
    }
    if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil {
        return nil, err
    }
    if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil {
        return nil, err
    }
    if br.remain() < 12 {
        return nil, io.ErrUnexpectedEOF
    }

    // Read the three initial repeat offsets; the remainder is the dictionary content.
    d.offsets[0] = int(br.Uint32())
    br.advance(4)
    d.offsets[1] = int(br.Uint32())
    br.advance(4)
    d.offsets[2] = int(br.Uint32())
    br.advance(4)
    if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 {
        return nil, errors.New("invalid offset in dictionary")
    }
    d.content = make([]byte, br.remain())
    copy(d.content, br.unread())
    if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) {
        return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
    }

    return &d, nil
}
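
// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
//
// A minimal usage sketch from a caller's point of view (variable names are
// placeholders; error handling shortened):
//
//    info, err := zstd.InspectDictionary(dictBytes)
//    if err != nil {
//        // not a valid dictionary
//    }
//    fmt.Println(info.ID(), info.ContentSize(), len(info.Content()), info.Offsets())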
func InspectDictionary(b []byte) (interface {
    ID() uint32
    ContentSize() int
    Content() []byte
    Offsets() [3]int
    LitEncoder() *huff0.Scratch
}, error) {
    initPredefined()
    d, err := loadDict(b)
    return d, err
}

type BuildDictOptions struct {
    // ID is the dictionary ID written to the header.
    ID uint32

    // Contents are the sample inputs used to build the tables.
    Contents [][]byte

    // History becomes the dictionary content.
    History []byte

    // Offsets are the starting repeat offsets.
    Offsets [3]int

    // CompatV155 keeps the dictionary compatible with Zstd v1.5.5 and earlier.
    CompatV155 bool

    // Level selects the encoder level used while building the tables.
    // If zero, SpeedBestCompression is used.
    Level EncoderLevel

    // DebugOut, if set, receives statistics and other details during the build.
    DebugOut io.Writer
}
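
// BuildDict builds a serialized dictionary from the provided options and
// returns bytes that can be loaded with InspectDictionary or handed to the
// encoder and decoder.
//
// A minimal build sketch (samples and history are caller-supplied placeholders;
// the ID and offsets below are illustrative values, not requirements):
//
//    raw, err := zstd.BuildDict(zstd.BuildDictOptions{
//        ID:       0x11223344,
//        Contents: samples,         // training inputs, one []byte each
//        History:  history,         // dictionary content, at least 8 bytes
//        Offsets:  [3]int{1, 4, 8}, // zstd's default starting repeat offsets
//    })
//    if err != nil {
//        // no usable samples, or history out of range
//    }
//    _ = raw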
func BuildDict(o BuildDictOptions) ([]byte, error) {
    initPredefined()
    hist := o.History
    contents := o.Contents
    debug := o.DebugOut != nil
    println := func(args ...interface{}) {
        if o.DebugOut != nil {
            fmt.Fprintln(o.DebugOut, args...)
        }
    }
    printf := func(s string, args ...interface{}) {
        if o.DebugOut != nil {
            fmt.Fprintf(o.DebugOut, s, args...)
        }
    }
    print := func(args ...interface{}) {
        if o.DebugOut != nil {
            fmt.Fprint(o.DebugOut, args...)
        }
    }

    if int64(len(hist)) > dictMaxLength {
        return nil, fmt.Errorf("dictionary of size %d > %d", len(hist), int64(dictMaxLength))
    }
    if len(hist) < 8 {
        return nil, fmt.Errorf("dictionary of size %d < %d", len(hist), 8)
    }
    if len(contents) == 0 {
        return nil, errors.New("no content provided")
    }
    d := dict{
        id:      o.ID,
        litEnc:  nil,
        llDec:   sequenceDec{},
        ofDec:   sequenceDec{},
        mlDec:   sequenceDec{},
        offsets: o.Offsets,
        content: hist,
    }
    block := blockEnc{lowMem: false}
    block.init()
    enc := encoder(&bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(maxMatchLen), bufferReset: math.MaxInt32 - int32(maxMatchLen*2), lowMem: false}})
    if o.Level != 0 {
        eOpts := encoderOptions{
            level:      o.Level,
            blockSize:  maxMatchLen,
            windowSize: maxMatchLen,
            dict:       &d,
            lowMem:     false,
        }
        enc = eOpts.encoder()
    } else {
        o.Level = SpeedBestCompression
    }
    var (
        remain [256]int
        ll     [256]int
        ml     [256]int
        of     [256]int
    )
    addValues := func(dst *[256]int, src []byte) {
        for _, v := range src {
            dst[v]++
        }
    }
    addHist := func(dst *[256]int, src *[256]uint32) {
        for i, v := range src {
            dst[i] += int(v)
        }
    }
    seqs := 0
    nUsed := 0
    litTotal := 0
    newOffsets := make(map[uint32]int, 1000)
    for _, b := range contents {
        block.reset(nil)
        if len(b) < 8 {
            continue
        }
        nUsed++
        enc.Reset(&d, true)
        enc.Encode(&block, b)
        addValues(&remain, block.literals)
        litTotal += len(block.literals)
        if len(block.sequences) == 0 {
            continue
        }
        seqs += len(block.sequences)
        block.genCodes()
        addHist(&ll, block.coders.llEnc.Histogram())
        addHist(&ml, block.coders.mlEnc.Histogram())
        addHist(&of, block.coders.ofEnc.Histogram())
        for i, seq := range block.sequences {
            if i > 3 {
                break
            }
            offset := seq.offset
            if offset == 0 {
                continue
            }
            if int(offset) >= len(o.History) {
                continue
            }
            if offset > 3 {
                newOffsets[offset-3]++
            } else {
                newOffsets[uint32(o.Offsets[offset-1])]++
            }
        }
    }
    // Collect the observed offsets and keep the three most frequent.
    var sortedOffsets []uint32
    for k := range newOffsets {
        sortedOffsets = append(sortedOffsets, k)
    }
    sort.Slice(sortedOffsets, func(i, j int) bool {
        a, b := sortedOffsets[i], sortedOffsets[j]
        if a == b {
            return a > b
        }
        return newOffsets[a] > newOffsets[b]
    })
    if len(sortedOffsets) > 3 {
        if debug {
            print("Offsets:")
            for i, v := range sortedOffsets {
                if i > 20 {
                    break
                }
                printf("[%d: %d],", v, newOffsets[v])
            }
            println("")
        }
        sortedOffsets = sortedOffsets[:3]
    }
    for i, v := range sortedOffsets {
        o.Offsets[i] = int(v)
    }
    if debug {
        println("New repeat offsets", o.Offsets)
    }

    if nUsed == 0 || seqs == 0 {
        return nil, fmt.Errorf("%d blocks, %d sequences found", nUsed, seqs)
    }
    if debug {
        println("Sequences:", seqs, "Blocks:", nUsed, "Literals:", litTotal)
    }
    if seqs/nUsed < 512 {
        nUsed = seqs / 512
        if nUsed == 0 {
            nUsed = 1
        }
    }
    copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) {
        hist := dst.Histogram()
        var maxSym uint8
        var maxCount int
        var fakeLength int
        for i, v := range src {
            if v > 0 {
                v = v / nUsed
                if v == 0 {
                    v = 1
                }
            }
            if v > maxCount {
                maxCount = v
            }
            if v != 0 {
                maxSym = uint8(i)
            }
            fakeLength += v
            hist[i] = uint32(v)
        }

        // If a single symbol holds all the weight, add a second one so the
        // table does not degenerate into RLE.
        if maxCount == fakeLength {
            for i := range hist {
                if uint8(i) == maxSym {
                    fakeLength++
                    maxSym++
                    hist[i+1] = 1
                    if maxSym > 1 {
                        break
                    }
                }
                if hist[0] == 0 {
                    fakeLength++
                    hist[i] = 1
                    if maxSym > 1 {
                        break
                    }
                }
            }
        }

        dst.HistogramFinished(maxSym, maxCount)
        dst.reUsed = false
        dst.useRLE = false
        err := dst.normalizeCount(fakeLength)
        if err != nil {
            return nil, err
        }
        if debug {
            println("RAW:", dst.count[:maxSym+1], "NORM:", dst.norm[:maxSym+1], "LEN:", fakeLength)
        }
        return dst.writeCount(nil)
    }
    if debug {
        print("Literal lengths: ")
    }
    llTable, err := copyHist(block.coders.llEnc, &ll)
    if err != nil {
        return nil, err
    }
    if debug {
        print("Match lengths: ")
    }
    mlTable, err := copyHist(block.coders.mlEnc, &ml)
    if err != nil {
        return nil, err
    }
    if debug {
        print("Offsets: ")
    }
    ofTable, err := copyHist(block.coders.ofEnc, &of)
    if err != nil {
        return nil, err
    }

    avgSize := litTotal
    if avgSize > huff0.BlockSizeMax/2 {
        avgSize = huff0.BlockSizeMax / 2
    }
    huffBuff := make([]byte, 0, avgSize)
    div := litTotal / avgSize
    if div < 1 {
        div = 1
    }
    if debug {
        println("Huffman weights:")
    }
    for i, n := range remain[:] {
        if n > 0 {
            n = n / div
            if n == 0 {
                n = 1
            }
            huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
            if debug {
                printf("[%d: %d], ", i, n)
            }
        }
    }
    if o.CompatV155 && remain[255]/div == 0 {
        huffBuff = append(huffBuff, 255)
    }
    scratch := &huff0.Scratch{TableLog: 11}
    for tries := 0; tries < 255; tries++ {
        scratch = &huff0.Scratch{TableLog: 11}
        _, _, err = huff0.Compress1X(huffBuff, scratch)
        if err == nil {
            break
        }
        if debug {
            printf("Try %d: Huffman error: %v\n", tries+1, err)
        }
        huffBuff = huffBuff[:0]
        if tries == 250 {
            if debug {
                println("Huffman: Bailing out with predefined table")
            }
            huffBuff = append(huffBuff, bytes.Repeat([]byte{255}, 10000)...)
            for i := 0; i < 128; i++ {
                huffBuff = append(huffBuff, byte(i))
            }
            continue
        }
        if errors.Is(err, huff0.ErrIncompressible) {
            for i, n := range remain[:] {
                if n > 0 {
                    n = n / (div * (i + 1))
                    if n > 0 {
                        huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
                    }
                }
            }
            if o.CompatV155 && len(huffBuff) > 0 && huffBuff[len(huffBuff)-1] != 255 {
                huffBuff = append(huffBuff, 255)
            }
            if len(huffBuff) == 0 {
                huffBuff = append(huffBuff, 0, 255)
            }
        }
        if errors.Is(err, huff0.ErrUseRLE) {
            for i, n := range remain[:] {
                n = n / (div * (i + 1))
                if n == 0 {
                    n = 1
                }
                huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
            }
        }
    }

    var out bytes.Buffer
    out.Write([]byte(dictMagic))
    out.Write(binary.LittleEndian.AppendUint32(nil, o.ID))
    out.Write(scratch.OutTable)
    if debug {
        println("huff table:", len(scratch.OutTable), "bytes")
        println("of table:", len(ofTable), "bytes")
        println("ml table:", len(mlTable), "bytes")
        println("ll table:", len(llTable), "bytes")
    }
    out.Write(ofTable)
    out.Write(mlTable)
    out.Write(llTable)
    out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[0])))
    out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[1])))
    out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[2])))
    out.Write(hist)
    if debug {
        _, err := loadDict(out.Bytes())
        if err != nil {
            panic(err)
        }
        i, err := InspectDictionary(out.Bytes())
        if err != nil {
            panic(err)
        }
        println("ID:", i.ID())
        println("Content size:", i.ContentSize())
        println("Encoder:", i.LitEncoder() != nil)
        println("Offsets:", i.Offsets())
        var totalSize int
        for _, b := range contents {
            totalSize += len(b)
        }

        encWith := func(opts ...EOption) int {
            enc, err := NewWriter(nil, opts...)
            if err != nil {
                panic(err)
            }
            defer enc.Close()
            var dst []byte
            var totalSize int
            for _, b := range contents {
                dst = enc.EncodeAll(b, dst[:0])
                totalSize += len(dst)
            }
            return totalSize
        }
        plain := encWith(WithEncoderLevel(o.Level))
        withDict := encWith(WithEncoderLevel(o.Level), WithEncoderDict(out.Bytes()))
        println("Input size:", totalSize)
        println("Plain Compressed:", plain)
        println("Dict Compressed:", withDict)
        println("Saved:", plain-withDict, (plain-withDict)/len(contents), "bytes per input (rounded down)")
    }

    return out.Bytes(), nil
}
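
// The sketch below is illustrative only and not part of the package API: it
// shows one way the serialized output of BuildDict can be wired into this
// package's encoder and decoder. WithEncoderDict and WithDecoderDicts accept
// the dictionary bytes directly.
func exampleUseBuiltDict(dictBytes, payload []byte) ([]byte, error) {
    // Compress with the dictionary. A nil writer is fine when only EncodeAll is used.
    enc, err := NewWriter(nil, WithEncoderDict(dictBytes))
    if err != nil {
        return nil, err
    }
    compressed := enc.EncodeAll(payload, nil)
    enc.Close()

    // Decompress, registering the same dictionary with the decoder.
    dec, err := NewReader(nil, WithDecoderDicts(dictBytes))
    if err != nil {
        return nil, err
    }
    defer dec.Close()
    return dec.DecodeAll(compressed, nil)
}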