// Package difflib is a partial port of Python difflib module.//// It provides tools to compare sequences of strings and generate textual diffs.//// The following class and functions have been ported://// - SequenceMatcher//// - unified_diff//// - context_diff//// Getting unified diffs was the main goal of the port. Keep in mind this code// is mostly suitable to output text differences in a human friendly way, there// are no guarantees generated diffs are consumable by patch(1).
package difflibimport ()func min(, int) int {if < {return }return}func max(, int) int {if > {return }return}func calculateRatio(, int) float64 {if > 0 {return2.0 * float64() / float64() }return1.0}typeMatchstruct { A int B int Size int}typeOpCodestruct { Tag byte I1 int I2 int J1 int J2 int}// SequenceMatcher compares sequence of strings. The basic// algorithm predates, and is a little fancier than, an algorithm// published in the late 1980's by Ratcliff and Obershelp under the// hyperbolic name "gestalt pattern matching". The basic idea is to find// the longest contiguous matching subsequence that contains no "junk"// elements (R-O doesn't address junk). The same idea is then applied// recursively to the pieces of the sequences to the left and to the right// of the matching subsequence. This does not yield minimal edit// sequences, but does tend to yield matches that "look right" to people.//// SequenceMatcher tries to compute a "human-friendly diff" between two// sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the// longest *contiguous* & junk-free matching subsequence. That's what// catches peoples' eyes. The Windows(tm) windiff has another interesting// notion, pairing up elements that appear uniquely in each sequence.// That, and the method here, appear to yield more intuitive difference// reports than does diff. This method appears to be the least vulnerable// to synching up on blocks of "junk lines", though (like blank lines in// ordinary text files, or maybe "<P>" lines in HTML files). That may be// because this is the only method of the 3 that has a *concept* of// "junk" <wink>.//// Timing: Basic R-O is cubic time worst case and quadratic time expected// case. SequenceMatcher is quadratic time for the worst case and has// expected-case behavior dependent in a complicated way on how many// elements the sequences have in common; best case time is linear.typeSequenceMatcherstruct { a []string b []string b2j map[string][]int IsJunk func(string) bool autoJunk bool bJunk map[string]struct{} matchingBlocks []Match fullBCount map[string]int bPopular map[string]struct{} opCodes []OpCode}func (, []string) *SequenceMatcher { := SequenceMatcher{autoJunk: true} .SetSeqs(, )return &}func (, []string, bool,func(string) bool) *SequenceMatcher { := SequenceMatcher{IsJunk: , autoJunk: } .SetSeqs(, )return &}// Set two sequences to be compared.func ( *SequenceMatcher) (, []string) { .SetSeq1() .SetSeq2()}// Set the first sequence to be compared. The second sequence to be compared is// not changed.//// SequenceMatcher computes and caches detailed information about the second// sequence, so if you want to compare one sequence S against many sequences,// use .SetSeq2(s) once and call .SetSeq1(x) repeatedly for each of the other// sequences.//// See also SetSeqs() and SetSeq2().func ( *SequenceMatcher) ( []string) {if & == &.a {return } .a = .matchingBlocks = nil .opCodes = nil}// Set the second sequence to be compared. The first sequence to be compared is// not changed.func ( *SequenceMatcher) ( []string) {if & == &.b {return } .b = .matchingBlocks = nil .opCodes = nil .fullBCount = nil .chainB()}func ( *SequenceMatcher) () {// Populate line -> index mapping := map[string][]int{}for , := range .b { := [] = append(, ) [] = }// Purge junk elements .bJunk = map[string]struct{}{}if .IsJunk != nil { := .bJunkfor , := range {if .IsJunk() { [] = struct{}{} } }for , := range {delete(, ) } }// Purge remaining popular elements := map[string]struct{}{} := len(.b)if .autoJunk && >= 200 { := /100 + 1for , := range {iflen() > { [] = struct{}{} } }for , := range {delete(, ) } } .bPopular = .b2j = }func ( *SequenceMatcher) ( string) bool { , := .bJunk[]return}// Find longest matching block in a[alo:ahi] and b[blo:bhi].//// If IsJunk is not defined://// Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where// alo <= i <= i+k <= ahi// blo <= j <= j+k <= bhi// and for all (i',j',k') meeting those conditions,// k >= k'// i <= i'// and if i == i', j <= j'//// In other words, of all maximal matching blocks, return one that// starts earliest in a, and of all those maximal matching blocks that// start earliest in a, return the one that starts earliest in b.//// If IsJunk is defined, first the longest matching block is// determined as above, but with the additional restriction that no// junk element appears in the block. Then that block is extended as// far as possible by matching (only) junk elements on both sides. So// the resulting block never matches on junk except as identical junk// happens to be adjacent to an "interesting" match.//// If no blocks match, return (alo, blo, 0).func ( *SequenceMatcher) (, , , int) Match {// CAUTION: stripping common prefix or suffix would be incorrect. // E.g., // ab // acab // Longest matching block is "ab", but if common prefix is // stripped, it's "a" (tied with "b"). UNIX(tm) diff does so // strip, so ends up claiming that ab is changed to acab by // inserting "ca" in the middle. That's minimal but unintuitive: // "it's obvious" that someone inserted "ac" at the front. // Windiff ends up at the same place as diff, but by pairing up // the unique 'b's and then matching the first two 'a's. , , := , , 0// find longest junk-free match // during an iteration of the loop, j2len[j] = length of longest // junk-free match ending with a[i-1] and b[j] := map[int]int{}for := ; != ; ++ {// look at all instances of a[i] in b; note that because // b2j has no junk keys, the loop is skipped if a[i] is junk := map[int]int{}for , := range .b2j[.a[]] {// a[i] matches b[j]if < {continue }if >= {break } := [-1] + 1 [] = if > { , , = -+1, -+1, } } = }// Extend the best by non-junk elements on each end. In particular, // "popular" non-junk elements aren't in b2j, which greatly speeds // the inner loop above, but also means "the best" match so far // doesn't contain any junk *or* popular non-junk elements.for > && > && !.isBJunk(.b[-1]) && .a[-1] == .b[-1] { , , = -1, -1, +1 }for + < && + < && !.isBJunk(.b[+]) && .a[+] == .b[+] { += 1 }// Now that we have a wholly interesting match (albeit possibly // empty!), we may as well suck up the matching junk on each // side of it too. Can't think of a good reason not to, and it // saves post-processing the (possibly considerable) expense of // figuring out what to do with it. In the case of an empty // interesting match, this is clearly the right thing to do, // because no other kind of match is possible in the regions.for > && > && .isBJunk(.b[-1]) && .a[-1] == .b[-1] { , , = -1, -1, +1 }for + < && + < && .isBJunk(.b[+]) && .a[+] == .b[+] { += 1 }returnMatch{A: , B: , Size: }}// Return list of triples describing matching subsequences.//// Each triple is of the form (i, j, n), and means that// a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in// i and in j. It's also guaranteed that if (i, j, n) and (i', j', n') are// adjacent triples in the list, and the second is not the last triple in the// list, then i+n != i' or j+n != j'. IOW, adjacent triples never describe// adjacent equal blocks.//// The last triple is a dummy, (len(a), len(b), 0), and is the only// triple with n==0.func ( *SequenceMatcher) () []Match {if .matchingBlocks != nil {return .matchingBlocks }varfunc(, , , int, []Match) []Match = func(, , , int, []Match) []Match { := .findLongestMatch(, , , ) , , := .A, .B, .Sizeif .Size > 0 {if < && < { = (, , , , ) } = append(, )if + < && + < { = (+, , +, , ) } }return } := (0, len(.a), 0, len(.b), nil)// It's possible that we have adjacent equal blocks in the // matching_blocks list now. := []Match{} , , := 0, 0, 0for , := range {// Is this block adjacent to i1, j1, k1? , , := .A, .B, .Sizeif + == && + == {// Yes, so collapse them -- this just increases the length of // the first block by the length of the second, and the first // block so lengthened remains the block to compare against. += } else {// Not adjacent. Remember the first block (k1==0 means it's // the dummy we started with), and make the second block the // new block to compare against.if > 0 { = append(, Match{, , }) } , , = , , } }if > 0 { = append(, Match{, , }) } = append(, Match{len(.a), len(.b), 0}) .matchingBlocks = return .matchingBlocks}// Return list of 5-tuples describing how to turn a into b.//// Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple// has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the// tuple preceding it, and likewise for j1 == the previous j2.//// The tags are characters, with these meanings://// 'r' (replace): a[i1:i2] should be replaced by b[j1:j2]//// 'd' (delete): a[i1:i2] should be deleted, j1==j2 in this case.//// 'i' (insert): b[j1:j2] should be inserted at a[i1:i1], i1==i2 in this case.//// 'e' (equal): a[i1:i2] == b[j1:j2]func ( *SequenceMatcher) () []OpCode {if .opCodes != nil {return .opCodes } , := 0, 0 := .GetMatchingBlocks() := make([]OpCode, 0, len())for , := range {// invariant: we've pumped out correct diffs to change // a[:i] into b[:j], and the next matching block is // a[ai:ai+size] == b[bj:bj+size]. So we need to pump // out a diff to change a[i:ai] into b[j:bj], pump out // the matching block, and move (i,j) beyond the match , , := .A, .B, .Size := byte(0)if < && < { = 'r' } elseif < { = 'd' } elseif < { = 'i' }if > 0 { = append(, OpCode{, , , , }) } , = +, +// the list of matching blocks is terminated by a // sentinel with size 0if > 0 { = append(, OpCode{'e', , , , }) } } .opCodes = return .opCodes}// Isolate change clusters by eliminating ranges with no changes.//// Return a generator of groups with up to n lines of context.// Each group is in the same format as returned by GetOpCodes().func ( *SequenceMatcher) ( int) [][]OpCode {if < 0 { = 3 } := .GetOpCodes()iflen() == 0 { = []OpCode{OpCode{'e', 0, 1, 0, 1}} }// Fixup leading and trailing groups if they show no changes.if [0].Tag == 'e' { := [0] , , , := .I1, .I2, .J1, .J2 [0] = OpCode{.Tag, max(, -), , max(, -), } }if [len()-1].Tag == 'e' { := [len()-1] , , , := .I1, .I2, .J1, .J2 [len()-1] = OpCode{.Tag, , min(, +), , min(, +)} } := + := [][]OpCode{} := []OpCode{}for , := range { , , , := .I1, .I2, .J1, .J2// End the current group and start a new one whenever // there is a large range with no changes.if .Tag == 'e' && - > { = append(, OpCode{.Tag, , min(, +), , min(, +)}) = append(, ) = []OpCode{} , = max(, -), max(, -) } = append(, OpCode{.Tag, , , , }) }iflen() > 0 && !(len() == 1 && [0].Tag == 'e') { = append(, ) }return}// Return a measure of the sequences' similarity (float in [0,1]).//// Where T is the total number of elements in both sequences, and// M is the number of matches, this is 2.0*M / T.// Note that this is 1 if the sequences are identical, and 0 if// they have nothing in common.//// .Ratio() is expensive to compute if you haven't already computed// .GetMatchingBlocks() or .GetOpCodes(), in which case you may// want to try .QuickRatio() or .RealQuickRation() first to get an// upper bound.func ( *SequenceMatcher) () float64 { := 0for , := range .GetMatchingBlocks() { += .Size }returncalculateRatio(, len(.a)+len(.b))}// Return an upper bound on ratio() relatively quickly.//// This isn't defined beyond that it is an upper bound on .Ratio(), and// is faster to compute.func ( *SequenceMatcher) () float64 {// viewing a and b as multisets, set matches to the cardinality // of their intersection; this counts the number of matches // without regard to order, so is clearly an upper boundif .fullBCount == nil { .fullBCount = map[string]int{}for , := range .b { .fullBCount[] = .fullBCount[] + 1 } }// avail[x] is the number of times x appears in 'b' less the // number of times we've seen it in 'a' so far ... kinda := map[string]int{} := 0for , := range .a { , := []if ! { = .fullBCount[] } [] = - 1if > 0 { += 1 } }returncalculateRatio(, len(.a)+len(.b))}// Return an upper bound on ratio() very quickly.//// This isn't defined beyond that it is an upper bound on .Ratio(), and// is faster to compute than either .Ratio() or .QuickRatio().func ( *SequenceMatcher) () float64 { , := len(.a), len(.b)returncalculateRatio(min(, ), +)}// Convert range to the "ed" formatfunc formatRangeUnified(, int) string {// Per the diff spec at http://www.unix.org/single_unix_specification/ := + 1// lines start numbering with one := - if == 1 {returnfmt.Sprintf("%d", ) }if == 0 { -= 1// empty ranges begin at line just before the range }returnfmt.Sprintf("%d,%d", , )}// Unified diff parameterstypeUnifiedDiffstruct { A []string// First sequence lines FromFile string// First file name FromDate string// First file time B []string// Second sequence lines ToFile string// Second file name ToDate string// Second file time Eol string// Headers end of line, defaults to LF Context int// Number of context lines}// Compare two sequences of lines; generate the delta as a unified diff.//// Unified diffs are a compact way of showing line changes and a few// lines of context. The number of context lines is set by 'n' which// defaults to three.//// By default, the diff control lines (those with ---, +++, or @@) are// created with a trailing newline. This is helpful so that inputs// created from file.readlines() result in diffs that are suitable for// file.writelines() since both the inputs and outputs have trailing// newlines.//// For inputs that do not have trailing newlines, set the lineterm// argument to "" so that the output will be uniformly newline free.//// The unidiff format normally has a header for filenames and modification// times. Any or all of these may be specified using strings for// 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.// The modification times are normally expressed in the ISO 8601 format.func ( io.Writer, UnifiedDiff) error { := bufio.NewWriter()defer .Flush() := func( string, ...interface{}) error { , := .WriteString(fmt.Sprintf(, ...))return } := func( string) error { , := .WriteString()return }iflen(.Eol) == 0 { .Eol = "\n" } := false := NewMatcher(.A, .B)for , := range .GetGroupedOpCodes(.Context) {if ! { = true := ""iflen(.FromDate) > 0 { = "\t" + .FromDate } := ""iflen(.ToDate) > 0 { = "\t" + .ToDate }if .FromFile != "" || .ToFile != "" { := ("--- %s%s%s", .FromFile, , .Eol)if != nil {return } = ("+++ %s%s%s", .ToFile, , .Eol)if != nil {return } } } , := [0], [len()-1] := formatRangeUnified(.I1, .I2) := formatRangeUnified(.J1, .J2)if := ("@@ -%s +%s @@%s", , , .Eol); != nil {return }for , := range { , , , := .I1, .I2, .J1, .J2if .Tag == 'e' {for , := range .A[:] {if := (" " + ); != nil {return } }continue }if .Tag == 'r' || .Tag == 'd' {for , := range .A[:] {if := ("-" + ); != nil {return } } }if .Tag == 'r' || .Tag == 'i' {for , := range .B[:] {if := ("+" + ); != nil {return } } } } }returnnil}// Like WriteUnifiedDiff but returns the diff a string.func ( UnifiedDiff) (string, error) { := &bytes.Buffer{} := WriteUnifiedDiff(, )returnstring(.Bytes()), }// Convert range to the "ed" format.func formatRangeContext(, int) string {// Per the diff spec at http://www.unix.org/single_unix_specification/ := + 1// lines start numbering with one := - if == 0 { -= 1// empty ranges begin at line just before the range }if <= 1 {returnfmt.Sprintf("%d", ) }returnfmt.Sprintf("%d,%d", , +-1)}typeContextDiffUnifiedDiff// Compare two sequences of lines; generate the delta as a context diff.//// Context diffs are a compact way of showing line changes and a few// lines of context. The number of context lines is set by diff.Context// which defaults to three.//// By default, the diff control lines (those with *** or ---) are// created with a trailing newline.//// For inputs that do not have trailing newlines, set the diff.Eol// argument to "" so that the output will be uniformly newline free.//// The context diff format normally has a header for filenames and// modification times. Any or all of these may be specified using// strings for diff.FromFile, diff.ToFile, diff.FromDate, diff.ToDate.// The modification times are normally expressed in the ISO 8601 format.// If not specified, the strings default to blanks.func ( io.Writer, ContextDiff) error { := bufio.NewWriter()defer .Flush()varerror := func( string, ...interface{}) { , := .WriteString(fmt.Sprintf(, ...))if == nil && != nil { = } } := func( string) { , := .WriteString()if == nil && != nil { = } }iflen(.Eol) == 0 { .Eol = "\n" } := map[byte]string{'i': "+ ",'d': "- ",'r': "! ",'e': " ", } := false := NewMatcher(.A, .B)for , := range .GetGroupedOpCodes(.Context) {if ! { = true := ""iflen(.FromDate) > 0 { = "\t" + .FromDate } := ""iflen(.ToDate) > 0 { = "\t" + .ToDate }if .FromFile != "" || .ToFile != "" { ("*** %s%s%s", .FromFile, , .Eol) ("--- %s%s%s", .ToFile, , .Eol) } } , := [0], [len()-1] ("***************" + .Eol) := formatRangeContext(.I1, .I2) ("*** %s ****%s", , .Eol)for , := range {if .Tag == 'r' || .Tag == 'd' {for , := range {if .Tag == 'i' {continue }for , := range .A[.I1:.I2] { ([.Tag] + ) } }break } } := formatRangeContext(.J1, .J2) ("--- %s ----%s", , .Eol)for , := range {if .Tag == 'r' || .Tag == 'i' {for , := range {if .Tag == 'd' {continue }for , := range .B[.J1:.J2] { ([.Tag] + ) } }break } } }return}// Like WriteContextDiff but returns the diff a string.func ( ContextDiff) (string, error) { := &bytes.Buffer{} := WriteContextDiff(, )returnstring(.Bytes()), }// Split a string on "\n" while preserving them. The output can be used// as input for UnifiedDiff and ContextDiff structures.func ( string) []string { := strings.SplitAfter(, "\n") [len()-1] += "\n"return}
The pages are generated with Goldsv0.8.2. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.