package graphemes

import (
	"bufio"

	// NOTE(review): the import list was lost in extraction; this path is
	// assumed from the `iterators.Stringish` reference below — confirm it
	// against the module's go.mod before merging.
	"github.com/clipperhouse/uax29/v2/internal/iterators"
)

// is reports whether lookup has at least one bit in common with properties,
// i.e. whether the looked-up property intersects the given propert(ies).
func (lookup property) is(properties property) bool {
	return (lookup & properties) != 0
}

// _Ignore is the set of properties skipped when remembering the "last" and
// "last before that" properties for look-back rules.
const _Ignore = _Extend

// SplitFunc is a bufio.SplitFunc implementation of Unicode grapheme cluster
// segmentation, for use with bufio.Scanner.
//
// See https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
var SplitFunc bufio.SplitFunc = splitFunc[[]byte]

// splitFunc returns the first grapheme cluster found at the start of data.
//
// It follows the bufio.SplitFunc contract: advance is the number of bytes
// consumed and token is data[:advance]. When the cluster (or a rune inside
// it) may extend past the end of data and atEOF is false, it returns
// (0, zero value, nil) to request more data. err is always nil.
func splitFunc[T iterators.Stringish](data T, atEOF bool) (advance int, token T, err error) {
	var empty T
	if len(data) == 0 {
		return 0, empty, nil
	}

	// These vars are stateful across loop iterations
	var pos int
	var lastExIgnore property = 0     // "last excluding ignored categories"
	var lastLastExIgnore property = 0 // "last one before that"
	var regionalIndicatorCount int

	// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
	// to the right of the ×, from which we look back or forward

	current, w := lookup(data[pos:])
	if w == 0 {
		if !atEOF {
			// Rune extends past current data, request more
			return 0, empty, nil
		}
		pos = len(data)
		return pos, data[:pos], nil
	}

	// https://unicode.org/reports/tr29/#GB1
	// Start of text always advances
	pos += w

	for {
		eot := pos == len(data) // "end of text"

		if eot {
			if !atEOF {
				// Token extends past current data, request more
				return 0, empty, nil
			}

			// https://unicode.org/reports/tr29/#GB2
			break
		}

		/*
			We've switched the evaluation order of GB1↓ and GB2↑. It's ok:
			because we've checked for len(data) at the top of this function,
			sot and eot are mutually exclusive, order doesn't matter.
		*/

		// Rules are usually of the form Cat1 × Cat2; "current" refers to the first property
		// to the right of the ×, from which we look back or forward

		// Remember previous properties to avoid lookups/lookbacks
		last := current
		if !last.is(_Ignore) {
			lastLastExIgnore = lastExIgnore
			lastExIgnore = last
		}

		current, w = lookup(data[pos:])
		if w == 0 {
			if atEOF {
				// Just return the bytes, we can't do anything with them
				pos = len(data)
				break
			}
			// Rune extends past current data, request more
			return 0, empty, nil
		}

		// Optimization: no rule can possibly apply
		if current|last == 0 { // i.e. both are zero
			break
		}

		// https://unicode.org/reports/tr29/#GB3
		if current.is(_LF) && last.is(_CR) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB4
		// https://unicode.org/reports/tr29/#GB5
		if (current | last).is(_Control | _CR | _LF) {
			break
		}

		// https://unicode.org/reports/tr29/#GB6
		if current.is(_L|_V|_LV|_LVT) && last.is(_L) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB7
		if current.is(_V|_T) && last.is(_LV|_V) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB8
		if current.is(_T) && last.is(_LVT|_T) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB9
		if current.is(_Extend | _ZWJ) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB9a
		if current.is(_SpacingMark) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB9b
		if last.is(_Prepend) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB9c
		// TODO(clipperhouse):
		// It appears to be added in Unicode 15.1.0:
		// https://unicode.org/versions/Unicode15.1.0/#Migration
		// This package currently supports Unicode 15.0.0, so
		// out of scope for now

		// https://unicode.org/reports/tr29/#GB11
		// last is the ZWJ itself, so the pictograph to its left is the
		// "last one before that", with any Extend in between ignored.
		if current.is(_ExtendedPictographic) && last.is(_ZWJ) && lastLastExIgnore.is(_ExtendedPictographic) {
			pos += w
			continue
		}

		// https://unicode.org/reports/tr29/#GB12
		// https://unicode.org/reports/tr29/#GB13
		if (current & last).is(_RegionalIndicator) {
			regionalIndicatorCount++

			// Regional indicators join in pairs: only an odd-numbered
			// adjacency continues the current cluster.
			odd := regionalIndicatorCount%2 == 1
			if odd {
				pos += w
				continue
			}
		}

		// If we fall through all the above rules, it's a grapheme cluster break
		break
	}

	// Return token
	return pos, data[:pos], nil
}
The pages are generated with Golds v0.8.2. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.