package matchfinderimport ()// M4 is an implementation of the MatchFinder// interface that uses a hash table to find matches,// optional match chains,// and the advanced parsing technique from// https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html.typeM4struct {// MaxDistance is the maximum distance (in bytes) to look back for // a match. The default is 65535. MaxDistance int// MinLength is the length of the shortest match to return. // The default is 4. MinLength int// HashLen is the number of bytes to use to calculate the hashes. // The maximum is 8 and the default is 6. HashLen int// TableBits is the number of bits in the hash table indexes. // The default is 17 (128K entries). TableBits int// ChainLength is how many entries to search on the "match chain" of older // locations with the same hash as the current location. ChainLength int// DistanceBitCost is used when comparing two matches to see // which is better. The comparison is primarily based on the length // of the matches, but it can also take the distance into account, // in terms of the number of bits needed to represent the distance. // One byte of length is given a score of 256, so 32 (256/8) would // be a reasonable first guess for the value of one bit. // (The default is 0, which bases the comparison solely on length.) DistanceBitCost int table []uint32 chain []uint32 history []byte}func ( *M4) () {for := range .table { .table[] = 0 } .history = .history[:0] .chain = .chain[:0]}func ( *M4) ( absoluteMatch) int {return (.End-.Start)*256 + (bits.LeadingZeros32(uint32(.Start-.Match))-32)*.DistanceBitCost}func ( *M4) ( []Match, []byte) []Match {if .MaxDistance == 0 { .MaxDistance = 65535 }if .MinLength == 0 { .MinLength = 4 }if .HashLen == 0 { .HashLen = 6 }if .TableBits == 0 { .TableBits = 17 }iflen(.table) < 1<<.TableBits { .table = make([]uint32, 1<<.TableBits) } := matchEmitter{Dst: }iflen(.history) > .MaxDistance*2 {// Trim down the history buffer. := len(.history) - .MaxDistancecopy(.history, .history[:]) .history = .history[:.MaxDistance]if .ChainLength > 0 { .chain = .chain[:.MaxDistance] }for , := range .table { := int() - if < 0 { = 0 } .table[] = uint32() } }// Append src to the history buffer. .NextEmit = len(.history) .history = append(.history, ...)if .ChainLength > 0 { .chain = append(.chain, make([]uint32, len())...) } = .history// matches stores the matches that have been found but not emitted, // in reverse order. (matches[0] is the most recent one.)var [3]absoluteMatchfor := .NextEmit; < len()-7; ++ {if [0] != (absoluteMatch{}) && >= [0].End {// We have found some matches, and we're far enough along that we probably // won't find overlapping matches, so we might as well emit them.if [1] != (absoluteMatch{}) {if [1].End > [0].Start { [1].End = [0].Start }if [1].End-[1].Start >= .MinLength && .score([1]) > 0 { .emit([1]) } } .emit([0]) = [3]absoluteMatch{} }// Look for a repeat match one byte after the current position.if [0] == (absoluteMatch{}) && len(.Dst) > 0 { := .Dst[len(.Dst)-1].Distanceifbinary.LittleEndian.Uint32([+1:]) == binary.LittleEndian.Uint32([+1-:]) {// We have a 4-byte match. := extendMatch2(, +1, +1-, .NextEmit+1)if .End-.Start >= .MinLength { [0] = } } }// Calculate and store the hash. := ((binary.LittleEndian.Uint64([:]) & (1<<(8*.HashLen) - 1)) * hashMul64) >> (64 - .TableBits) := int(.table[]) .table[] = uint32()if .ChainLength > 0 && != 0 { := - .chain[] = uint32() }if < [0].End && != [0].End+2-.HashLen {continue }if == 0 || - > .MaxDistance {continue }// Look for a match.varabsoluteMatchifbinary.LittleEndian.Uint32([:]) == binary.LittleEndian.Uint32([:]) { := extendMatch2(, , , .NextEmit)if .End-.Start > .MinLength && .score() > 0 { = } }for := 0; < .ChainLength; ++ { := .chain[]if == 0 {break } -= int()if <= 0 || - > .MaxDistance {break }ifbinary.LittleEndian.Uint32([:]) == binary.LittleEndian.Uint32([:]) { := extendMatch2(, , , .NextEmit)if .End-.Start > .MinLength && .score() > .score() { = } } }if .End-.Start < .MinLength {continue } := 0if [0] != (absoluteMatch{}) { = 275if .Start <= [1].End {// This match would completely replace the previous match, // so there is no penalty for overlap. = 0 } }if .score() <= .score([0])+ {continue } = [3]absoluteMatch{ , [0], [1], }if [2] == (absoluteMatch{}) {continue }// We have three matches, so it's time to emit one and/or eliminate one.switch {case [0].Start < [2].End:// The first and third matches overlap; discard the one in between. = [3]absoluteMatch{ [0], [2],absoluteMatch{}, }case [0].Start < [2].End+.MinLength:// The first and third matches don't overlap, but there's no room for // another match between them. Emit the first match and discard the second. .emit([2]) = [3]absoluteMatch{ [0],absoluteMatch{},absoluteMatch{}, }default:// Emit the first match, shortening it if necessary to avoid overlap with the second.if [2].End > [1].Start { [2].End = [1].Startif .ChainLength > 0 && [2].End-[2].Start >= .MinLength {// Since the match length was trimmed, we may be able to find a closer match // to replace it. := [2].Startfor { := int(.chain[])if == 0 {break } -= if <= [2].Match {break }ifbytes.Equal([[2].Start:[2].End], [:+[2].End-[2].Start]) { [2].Match = break } } } }if [2].End-[2].Start >= .MinLength && .score([2]) > 0 { .emit([2]) } [2] = absoluteMatch{} } }// We've found all the matches now; emit the remaining ones.if [1] != (absoluteMatch{}) {if [1].End > [0].Start { [1].End = [0].Start }if [1].End-[1].Start >= .MinLength && .score([1]) > 0 { .emit([1]) } }if [0] != (absoluteMatch{}) { .emit([0]) } = .Dstif .NextEmit < len() { = append(, Match{Unmatched: len() - .NextEmit, }) }return}const hashMul64 = 0x1E35A7BD1E35A7BD// extendMatch returns the largest k such that k <= len(src) and that// src[i:i+k-j] and src[j:k] have the same contents.//// It assumes that://// 0 <= i && i < j && j <= len(src)func extendMatch( []byte, , int) int {switchruntime.GOARCH {case"amd64", "arm64":// As long as we are 8 or more bytes before the end of src, we can load and // compare 8 bytes at a time. If those 8 bytes are equal, repeat.for +8 < len() { := binary.LittleEndian.Uint64([:]) := binary.LittleEndian.Uint64([:])if != {// If those 8 bytes were not equal, XOR the two 8 byte values, and return // the index of the first byte that differs. The BSF instruction finds the // least significant 1 bit, the amd64 architecture is little-endian, and // the shift by 3 converts a bit index to a byte index.return + bits.TrailingZeros64(^)>>3 } , = +8, +8 }case"386":// On a 32-bit CPU, we do it 4 bytes at a time.for +4 < len() { := binary.LittleEndian.Uint32([:]) := binary.LittleEndian.Uint32([:])if != {return + bits.TrailingZeros32(^)>>3 } , = +4, +4 } }for ; < len() && [] == []; , = +1, +1 { }return}// Given a 4-byte match at src[start] and src[candidate], extendMatch2 extends it// upward as far as possible, and downward no farther than to min.func extendMatch2( []byte, , , int) absoluteMatch { := extendMatch(, +4, +4)for > && > 0 && [-1] == [-1] { -- -- }returnabsoluteMatch{Start: ,End: ,Match: , }}
The pages are generated with Goldsv0.8.2. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.