package syntax
import (
"bytes"
"fmt"
"strconv"
"unicode"
"unicode/utf8"
)
type Prefix struct {
PrefixStr []rune
PrefixSet CharSet
CaseInsensitive bool
}
func getFirstCharsPrefix(tree *RegexTree ) *Prefix {
s := regexFcd {
fcStack : make ([]regexFc , 32 ),
intStack : make ([]int , 32 ),
}
fc := s .regexFCFromRegexTree (tree )
if fc == nil || fc .nullable || fc .cc .IsEmpty () {
return nil
}
fcSet := fc .getFirstChars ()
return &Prefix {PrefixSet : fcSet , CaseInsensitive : fc .caseInsensitive }
}
type regexFcd struct {
intStack []int
intDepth int
fcStack []regexFc
fcDepth int
skipAllChildren bool
skipchild bool
failed bool
}
func (s *regexFcd ) regexFCFromRegexTree (tree *RegexTree ) *regexFc {
curNode := tree .root
curChild := 0
for {
if len (curNode .children ) == 0 {
s .calculateFC (curNode .t , curNode , 0 )
} else if curChild < len (curNode .children ) && !s .skipAllChildren {
s .calculateFC (curNode .t |beforeChild , curNode , curChild )
if !s .skipchild {
curNode = curNode .children [curChild ]
s .pushInt (curChild )
curChild = 0
} else {
curChild ++
s .skipchild = false
}
continue
}
s .skipAllChildren = false
if s .intIsEmpty () {
break
}
curChild = s .popInt ()
curNode = curNode .next
s .calculateFC (curNode .t |afterChild , curNode , curChild )
if s .failed {
return nil
}
curChild ++
}
if s .fcIsEmpty () {
return nil
}
return s .popFC ()
}
func (s *regexFcd ) pushInt (I int ) {
if s .intDepth >= len (s .intStack ) {
expanded := make ([]int , s .intDepth *2 )
copy (expanded , s .intStack )
s .intStack = expanded
}
s .intStack [s .intDepth ] = I
s .intDepth ++
}
func (s *regexFcd ) intIsEmpty () bool {
return s .intDepth == 0
}
func (s *regexFcd ) popInt () int {
s .intDepth --
return s .intStack [s .intDepth ]
}
func (s *regexFcd ) pushFC (fc regexFc ) {
if s .fcDepth >= len (s .fcStack ) {
expanded := make ([]regexFc , s .fcDepth *2 )
copy (expanded , s .fcStack )
s .fcStack = expanded
}
s .fcStack [s .fcDepth ] = fc
s .fcDepth ++
}
func (s *regexFcd ) fcIsEmpty () bool {
return s .fcDepth == 0
}
func (s *regexFcd ) popFC () *regexFc {
s .fcDepth --
return &s .fcStack [s .fcDepth ]
}
func (s *regexFcd ) topFC () *regexFc {
return &s .fcStack [s .fcDepth -1 ]
}
func (s *regexFcd ) skipChild () {
s .skipchild = true
}
func (s *regexFcd ) calculateFC (nt nodeType , node *regexNode , CurIndex int ) {
ci := false
rtl := false
if nt <= ntRef {
if (node .options & IgnoreCase ) != 0 {
ci = true
}
if (node .options & RightToLeft ) != 0 {
rtl = true
}
}
switch nt {
case ntConcatenate | beforeChild , ntAlternate | beforeChild , ntTestref | beforeChild , ntLoop | beforeChild , ntLazyloop | beforeChild :
break
case ntTestgroup | beforeChild :
if CurIndex == 0 {
s .skipChild ()
}
break
case ntEmpty :
s .pushFC (regexFc {nullable : true })
break
case ntConcatenate | afterChild :
if CurIndex != 0 {
child := s .popFC ()
cumul := s .topFC ()
s .failed = !cumul .addFC (*child , true )
}
fc := s .topFC ()
if !fc .nullable {
s .skipAllChildren = true
}
break
case ntTestgroup | afterChild :
if CurIndex > 1 {
child := s .popFC ()
cumul := s .topFC ()
s .failed = !cumul .addFC (*child , false )
}
break
case ntAlternate | afterChild , ntTestref | afterChild :
if CurIndex != 0 {
child := s .popFC ()
cumul := s .topFC ()
s .failed = !cumul .addFC (*child , false )
}
break
case ntLoop | afterChild , ntLazyloop | afterChild :
if node .m == 0 {
fc := s .topFC ()
fc .nullable = true
}
break
case ntGroup | beforeChild , ntGroup | afterChild , ntCapture | beforeChild , ntCapture | afterChild , ntGreedy | beforeChild , ntGreedy | afterChild :
break
case ntRequire | beforeChild , ntPrevent | beforeChild :
s .skipChild ()
s .pushFC (regexFc {nullable : true })
break
case ntRequire | afterChild , ntPrevent | afterChild :
break
case ntOne , ntNotone :
s .pushFC (newRegexFc (node .ch , nt == ntNotone , false , ci ))
break
case ntOneloop , ntOnelazy :
s .pushFC (newRegexFc (node .ch , false , node .m == 0 , ci ))
break
case ntNotoneloop , ntNotonelazy :
s .pushFC (newRegexFc (node .ch , true , node .m == 0 , ci ))
break
case ntMulti :
if len (node .str ) == 0 {
s .pushFC (regexFc {nullable : true })
} else if !rtl {
s .pushFC (newRegexFc (node .str [0 ], false , false , ci ))
} else {
s .pushFC (newRegexFc (node .str [len (node .str )-1 ], false , false , ci ))
}
break
case ntSet :
s .pushFC (regexFc {cc : node .set .Copy (), nullable : false , caseInsensitive : ci })
break
case ntSetloop , ntSetlazy :
s .pushFC (regexFc {cc : node .set .Copy (), nullable : node .m == 0 , caseInsensitive : ci })
break
case ntRef :
s .pushFC (regexFc {cc : *AnyClass (), nullable : true , caseInsensitive : false })
break
case ntNothing , ntBol , ntEol , ntBoundary , ntNonboundary , ntECMABoundary , ntNonECMABoundary , ntBeginning , ntStart , ntEndZ , ntEnd :
s .pushFC (regexFc {nullable : true })
break
default :
panic (fmt .Sprintf ("unexpected op code: %v" , nt ))
}
}
type regexFc struct {
cc CharSet
nullable bool
caseInsensitive bool
}
func newRegexFc(ch rune , not , nullable , caseInsensitive bool ) regexFc {
r := regexFc {
caseInsensitive : caseInsensitive ,
nullable : nullable ,
}
if not {
if ch > 0 {
r .cc .addRange ('\x00' , ch -1 )
}
if ch < 0xFFFF {
r .cc .addRange (ch +1 , utf8 .MaxRune )
}
} else {
r .cc .addRange (ch , ch )
}
return r
}
func (r *regexFc ) getFirstChars () CharSet {
if r .caseInsensitive {
r .cc .addLowercase ()
}
return r .cc
}
func (r *regexFc ) addFC (fc regexFc , concatenate bool ) bool {
if !r .cc .IsMergeable () || !fc .cc .IsMergeable () {
return false
}
if concatenate {
if !r .nullable {
return true
}
if !fc .nullable {
r .nullable = false
}
} else {
if fc .nullable {
r .nullable = true
}
}
r .caseInsensitive = r .caseInsensitive || fc .caseInsensitive
r .cc .addSet (fc .cc )
return true
}
func getPrefix(tree *RegexTree ) *Prefix {
var concatNode *regexNode
nextChild := 0
curNode := tree .root
for {
switch curNode .t {
case ntConcatenate :
if len (curNode .children ) > 0 {
concatNode = curNode
nextChild = 0
}
case ntGreedy , ntCapture :
curNode = curNode .children [0 ]
concatNode = nil
continue
case ntOneloop , ntOnelazy :
if curNode .m > 0 {
return &Prefix {
PrefixStr : repeat (curNode .ch , curNode .m ),
CaseInsensitive : (curNode .options & IgnoreCase ) != 0 ,
}
}
return nil
case ntOne :
return &Prefix {
PrefixStr : []rune {curNode .ch },
CaseInsensitive : (curNode .options & IgnoreCase ) != 0 ,
}
case ntMulti :
return &Prefix {
PrefixStr : curNode .str ,
CaseInsensitive : (curNode .options & IgnoreCase ) != 0 ,
}
case ntBol , ntEol , ntBoundary , ntECMABoundary , ntBeginning , ntStart ,
ntEndZ , ntEnd , ntEmpty , ntRequire , ntPrevent :
default :
return nil
}
if concatNode == nil || nextChild >= len (concatNode .children ) {
return nil
}
curNode = concatNode .children [nextChild ]
nextChild ++
}
}
func repeat(r rune , c int ) []rune {
if c > MaxPrefixSize {
c = MaxPrefixSize
}
ret := make ([]rune , c )
ret [0 ] = r
bp := 1
for bp < len (ret ) {
copy (ret [bp :], ret [:bp ])
bp *= 2
}
return ret
}
type BmPrefix struct {
positive []int
negativeASCII []int
negativeUnicode [][]int
pattern []rune
lowASCII rune
highASCII rune
rightToLeft bool
caseInsensitive bool
}
func newBmPrefix(pattern []rune , caseInsensitive , rightToLeft bool ) *BmPrefix {
b := &BmPrefix {
rightToLeft : rightToLeft ,
caseInsensitive : caseInsensitive ,
pattern : pattern ,
}
if caseInsensitive {
for i := 0 ; i < len (b .pattern ); i ++ {
b .pattern [i ] = unicode .ToLower (b .pattern [i ])
}
}
var beforefirst , last , bump int
var scan , match int
if !rightToLeft {
beforefirst = -1
last = len (b .pattern ) - 1
bump = 1
} else {
beforefirst = len (b .pattern )
last = 0
bump = -1
}
b .positive = make ([]int , len (b .pattern ))
examine := last
ch := b .pattern [examine ]
b .positive [examine ] = bump
examine -= bump
Outerloop :
for {
for {
if examine == beforefirst {
break Outerloop
}
if b .pattern [examine ] == ch {
break
}
examine -= bump
}
match = last
scan = examine
for {
if scan == beforefirst || b .pattern [match ] != b .pattern [scan ] {
if b .positive [match ] == 0 {
b .positive [match ] = match - scan
}
break
}
scan -= bump
match -= bump
}
examine -= bump
}
match = last - bump
for match != beforefirst {
if b .positive [match ] == 0 {
b .positive [match ] = bump
}
match -= bump
}
b .negativeASCII = make ([]int , 128 )
for i := 0 ; i < len (b .negativeASCII ); i ++ {
b .negativeASCII [i ] = last - beforefirst
}
b .lowASCII = 127
b .highASCII = 0
for examine = last ; examine != beforefirst ; examine -= bump {
ch = b .pattern [examine ]
switch {
case ch < 128 :
if b .lowASCII > ch {
b .lowASCII = ch
}
if b .highASCII < ch {
b .highASCII = ch
}
if b .negativeASCII [ch ] == last -beforefirst {
b .negativeASCII [ch ] = last - examine
}
case ch <= 0xffff :
i , j := ch >>8 , ch &0xFF
if b .negativeUnicode == nil {
b .negativeUnicode = make ([][]int , 256 )
}
if b .negativeUnicode [i ] == nil {
newarray := make ([]int , 256 )
for k := 0 ; k < len (newarray ); k ++ {
newarray [k ] = last - beforefirst
}
if i == 0 {
copy (newarray , b .negativeASCII )
b .negativeASCII = newarray
}
b .negativeUnicode [i ] = newarray
}
if b .negativeUnicode [i ][j ] == last -beforefirst {
b .negativeUnicode [i ][j ] = last - examine
}
default :
return nil
}
}
return b
}
func (b *BmPrefix ) String () string {
return string (b .pattern )
}
func (b *BmPrefix ) Dump (indent string ) string {
buf := &bytes .Buffer {}
fmt .Fprintf (buf , "%sBM Pattern: %s\n%sPositive: " , indent , string (b .pattern ), indent )
for i := 0 ; i < len (b .positive ); i ++ {
buf .WriteString (strconv .Itoa (b .positive [i ]))
buf .WriteRune (' ' )
}
buf .WriteRune ('\n' )
if b .negativeASCII != nil {
buf .WriteString (indent )
buf .WriteString ("Negative table\n" )
for i := 0 ; i < len (b .negativeASCII ); i ++ {
if b .negativeASCII [i ] != len (b .pattern ) {
fmt .Fprintf (buf , "%s %s %s\n" , indent , Escape (string (rune (i ))), strconv .Itoa (b .negativeASCII [i ]))
}
}
}
return buf .String ()
}
func (b *BmPrefix ) Scan (text []rune , index , beglimit , endlimit int ) int {
var (
defadv , test , test2 int
match , startmatch , endmatch int
bump , advance int
chTest rune
unicodeLookup []int
)
if !b .rightToLeft {
defadv = len (b .pattern )
startmatch = len (b .pattern ) - 1
endmatch = 0
test = index + defadv - 1
bump = 1
} else {
defadv = -len (b .pattern )
startmatch = 0
endmatch = -defadv - 1
test = index + defadv
bump = -1
}
chMatch := b .pattern [startmatch ]
for {
if test >= endlimit || test < beglimit {
return -1
}
chTest = text [test ]
if b .caseInsensitive {
chTest = unicode .ToLower (chTest )
}
if chTest != chMatch {
if chTest < 128 {
advance = b .negativeASCII [chTest ]
} else if chTest < 0xffff && len (b .negativeUnicode ) > 0 {
unicodeLookup = b .negativeUnicode [chTest >>8 ]
if len (unicodeLookup ) > 0 {
advance = unicodeLookup [chTest &0xFF ]
} else {
advance = defadv
}
} else {
advance = defadv
}
test += advance
} else {
test2 = test
match = startmatch
for {
if match == endmatch {
if b .rightToLeft {
return test2 + 1
} else {
return test2
}
}
match -= bump
test2 -= bump
chTest = text [test2 ]
if b .caseInsensitive {
chTest = unicode .ToLower (chTest )
}
if chTest != b .pattern [match ] {
advance = b .positive [match ]
if chTest < 128 {
test2 = (match - startmatch ) + b .negativeASCII [chTest ]
} else if chTest < 0xffff && len (b .negativeUnicode ) > 0 {
unicodeLookup = b .negativeUnicode [chTest >>8 ]
if len (unicodeLookup ) > 0 {
test2 = (match - startmatch ) + unicodeLookup [chTest &0xFF ]
} else {
test += advance
break
}
} else {
test += advance
break
}
if b .rightToLeft {
if test2 < advance {
advance = test2
}
} else if test2 > advance {
advance = test2
}
test += advance
break
}
}
}
}
}
func (b *BmPrefix ) IsMatch (text []rune , index , beglimit , endlimit int ) bool {
if !b .rightToLeft {
if index < beglimit || endlimit -index < len (b .pattern ) {
return false
}
return b .matchPattern (text , index )
} else {
if index > endlimit || index -beglimit < len (b .pattern ) {
return false
}
return b .matchPattern (text , index -len (b .pattern ))
}
}
func (b *BmPrefix ) matchPattern (text []rune , index int ) bool {
if len (text )-index < len (b .pattern ) {
return false
}
if b .caseInsensitive {
for i := 0 ; i < len (b .pattern ); i ++ {
if unicode .ToLower (text [index +i ]) != b .pattern [i ] {
return false
}
}
return true
} else {
for i := 0 ; i < len (b .pattern ); i ++ {
if text [index +i ] != b .pattern [i ] {
return false
}
}
return true
}
}
type AnchorLoc int16
const (
AnchorBeginning AnchorLoc = 0x0001
AnchorBol = 0x0002
AnchorStart = 0x0004
AnchorEol = 0x0008
AnchorEndZ = 0x0010
AnchorEnd = 0x0020
AnchorBoundary = 0x0040
AnchorECMABoundary = 0x0080
)
func getAnchors(tree *RegexTree ) AnchorLoc {
var concatNode *regexNode
nextChild , result := 0 , AnchorLoc (0 )
curNode := tree .root
for {
switch curNode .t {
case ntConcatenate :
if len (curNode .children ) > 0 {
concatNode = curNode
nextChild = 0
}
case ntGreedy , ntCapture :
curNode = curNode .children [0 ]
concatNode = nil
continue
case ntBol , ntEol , ntBoundary , ntECMABoundary , ntBeginning ,
ntStart , ntEndZ , ntEnd :
return result | anchorFromType (curNode .t )
case ntEmpty , ntRequire , ntPrevent :
default :
return result
}
if concatNode == nil || nextChild >= len (concatNode .children ) {
return result
}
curNode = concatNode .children [nextChild ]
nextChild ++
}
}
func anchorFromType(t nodeType ) AnchorLoc {
switch t {
case ntBol :
return AnchorBol
case ntEol :
return AnchorEol
case ntBoundary :
return AnchorBoundary
case ntECMABoundary :
return AnchorECMABoundary
case ntBeginning :
return AnchorBeginning
case ntStart :
return AnchorStart
case ntEndZ :
return AnchorEndZ
case ntEnd :
return AnchorEnd
default :
return 0
}
}
func (anchors AnchorLoc ) String () string {
buf := &bytes .Buffer {}
if 0 != (anchors & AnchorBeginning ) {
buf .WriteString (", Beginning" )
}
if 0 != (anchors & AnchorStart ) {
buf .WriteString (", Start" )
}
if 0 != (anchors & AnchorBol ) {
buf .WriteString (", Bol" )
}
if 0 != (anchors & AnchorBoundary ) {
buf .WriteString (", Boundary" )
}
if 0 != (anchors & AnchorECMABoundary ) {
buf .WriteString (", ECMABoundary" )
}
if 0 != (anchors & AnchorEol ) {
buf .WriteString (", Eol" )
}
if 0 != (anchors & AnchorEnd ) {
buf .WriteString (", End" )
}
if 0 != (anchors & AnchorEndZ ) {
buf .WriteString (", EndZ" )
}
if buf .Len () >= 2 {
return buf .String ()[2 :]
}
return "None"
}
The pages are generated with Golds v0.8.2 . (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu .
PR and bug reports are welcome and can be submitted to the issue list .
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds .