Source File
grapheme.go
Belonging Package
github.com/rivo/uniseg
package unisegimport// Graphemes implements an iterator over Unicode grapheme clusters, or// user-perceived characters. While iterating, it also provides information// about word boundaries, sentence boundaries, line breaks, and monospace// character widths.//// After constructing the class via [NewGraphemes] for a given string "str",// [Graphemes.Next] is called for every grapheme cluster in a loop until it// returns false. Inside the loop, information about the grapheme cluster as// well as boundary information and character width is available via the various// methods (see examples below).//// This class basically wraps the [StepString] parser and provides a convenient// interface to it. If you are only interested in some parts of this package's// functionality, using the specialized functions starting with "First" is// almost always faster.type Graphemes struct {// The original string.original string// The remaining string to be parsed.remaining string// The current grapheme cluster.cluster string// The byte offset of the current grapheme cluster relative to the original// string.offset int// The current boundary information of the [Step] parser.boundaries int// The current state of the [Step] parser.state int}// NewGraphemes returns a new grapheme cluster iterator.func ( string) *Graphemes {return &Graphemes{original: ,remaining: ,state: -1,}}// Next advances the iterator by one grapheme cluster and returns false if no// clusters are left. This function must be called before the first cluster is// accessed.func ( *Graphemes) () bool {if len(.remaining) == 0 {// We're already past the end..state = -2.cluster = ""return false}.offset += len(.cluster).cluster, .remaining, .boundaries, .state = StepString(.remaining, .state)return true}// Runes returns a slice of runes (code points) which corresponds to the current// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]// has not yet been called, nil is returned.func ( *Graphemes) () []rune {if .state < 0 {return nil}return []rune(.cluster)}// Str returns a substring of the original string which corresponds to the// current grapheme cluster. If the iterator is already past the end or// [Graphemes.Next] has not yet been called, an empty string is returned.func ( *Graphemes) () string {return .cluster}// Bytes returns a byte slice which corresponds to the current grapheme cluster.// If the iterator is already past the end or [Graphemes.Next] has not yet been// called, nil is returned.func ( *Graphemes) () []byte {if .state < 0 {return nil}return []byte(.cluster)}// Positions returns the interval of the current grapheme cluster as byte// positions into the original string. The first returned value "from" indexes// the first byte and the second returned value "to" indexes the first byte that// is not included anymore, i.e. str[from:to] is the current grapheme cluster of// the original string "str". If [Graphemes.Next] has not yet been called, both// values are 0. If the iterator is already past the end, both values are 1.func ( *Graphemes) () (int, int) {if .state == -1 {return 0, 0} else if .state == -2 {return 1, 1}return .offset, .offset + len(.cluster)}// IsWordBoundary returns true if a word ends after the current grapheme// cluster.func ( *Graphemes) () bool {if .state < 0 {return true}return .boundaries&MaskWord != 0}// IsSentenceBoundary returns true if a sentence ends after the current// grapheme cluster.func ( *Graphemes) () bool {if .state < 0 {return true}return .boundaries&MaskSentence != 0}// LineBreak returns whether the line can be broken after the current grapheme// cluster. A value of [LineDontBreak] means the line may not be broken, a value// of [LineMustBreak] means the line must be broken, and a value of// [LineCanBreak] means the line may or may not be broken.func ( *Graphemes) () int {if .state == -1 {return LineDontBreak}if .state == -2 {return LineMustBreak}return .boundaries & MaskLine}// Width returns the monospace width of the current grapheme cluster.func ( *Graphemes) () int {if .state < 0 {return 0}return .boundaries >> ShiftWidth}// Reset puts the iterator into its initial state such that the next call to// [Graphemes.Next] sets it to the first grapheme cluster again.func ( *Graphemes) () {.state = -1.offset = 0.cluster = "".remaining = .original}// GraphemeClusterCount returns the number of user-perceived characters// (grapheme clusters) for the given string.func ( string) ( int) {:= -1for len() > 0 {_, , _, = FirstGraphemeClusterInString(, )++}return}// ReverseString reverses the given string while observing grapheme cluster// boundaries.func ( string) string {:= []byte():= make([]byte, len()):= -1:= len()for len() > 0 {var []byte, , _, = FirstGraphemeCluster(, )-= len()copy([:], )if <= len()/2 {break}}return string()}// The number of bits the grapheme property must be shifted to make place for// grapheme states.const shiftGraphemePropState = 4// FirstGraphemeCluster returns the first grapheme cluster found in the given// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme// Cluster Boundaries]. This function can be called continuously to extract all// grapheme clusters from a byte slice, as illustrated in the example below.//// If you don't know the current state, for example when calling the function// for the first time, you must pass -1. For consecutive calls, pass the state// and rest slice returned by the previous call.//// The "rest" slice is the sub-slice of the original byte slice "b" starting// after the last byte of the identified grapheme cluster. If the length of the// "rest" slice is 0, the entire byte slice "b" has been processed. The// "cluster" byte slice is the sub-slice of the input slice containing the// identified grapheme cluster.//// The returned width is the width of the grapheme cluster for most monospace// fonts where a value of 1 represents one character cell.//// Given an empty byte slice "b", the function returns nil values.//// While slightly less convenient than using the Graphemes class, this function// has much better performance and makes no allocations. It lends itself well to// large byte slices.//// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundariesfunc ( []byte, int) (, []byte, , int) {// An empty byte slice returns nothing.if len() == 0 {return}// Extract the first rune., := utf8.DecodeRune()if len() <= { // If we're already past the end, there is nothing else to parse.var intif < 0 {= propertyGraphemes()} else {= >> shiftGraphemePropState}return , nil, runeWidth(, ), grAny | ( << shiftGraphemePropState)}// If we don't know the state, determine it now.var intif < 0 {, , _ = transitionGraphemeState(, )} else {= >> shiftGraphemePropState}+= runeWidth(, )// Transition until we find a boundary.for {var (intbool), := utf8.DecodeRune([:]), , = transitionGraphemeState(&maskGraphemeState, )if {return [:], [:], , | ( << shiftGraphemePropState)}if == prExtendedPictographic {if == vs15 {= 1} else if == vs16 {= 2}} else if != prRegionalIndicator && != prL {+= runeWidth(, )}+=if len() <= {return , nil, , grAny | ( << shiftGraphemePropState)}}}// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and// outputs are strings.func ( string, int) (, string, , int) {// An empty string returns nothing.if len() == 0 {return}// Extract the first rune., := utf8.DecodeRuneInString()if len() <= { // If we're already past the end, there is nothing else to parse.var intif < 0 {= propertyGraphemes()} else {= >> shiftGraphemePropState}return , "", runeWidth(, ), grAny | ( << shiftGraphemePropState)}// If we don't know the state, determine it now.var intif < 0 {, , _ = transitionGraphemeState(, )} else {= >> shiftGraphemePropState}+= runeWidth(, )// Transition until we find a boundary.for {var (intbool), := utf8.DecodeRuneInString([:]), , = transitionGraphemeState(&maskGraphemeState, )if {return [:], [:], , | ( << shiftGraphemePropState)}if == prExtendedPictographic {if == vs15 {= 1} else if == vs16 {= 2}} else if != prRegionalIndicator && != prL {+= runeWidth(, )}+=if len() <= {return , "", , grAny | ( << shiftGraphemePropState)}}}
![]() |
The pages are generated with Golds v0.8.2. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds. |