// Package jlexer contains a JSON lexer implementation.
//
// It is expected that it is mostly used with generated parser code, so the interface is tuned
// for a parser that knows what kind of data is expected.
package jlexer

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"strconv"
	"unicode"
	"unicode/utf16"
	"unicode/utf8"

	"github.com/josharian/intern"
)

// tokenKind determines the type of a token.
type tokenKind byte

const (
	tokenUndef  tokenKind = iota // No token.
	tokenDelim                   // Delimiter: one of '{', '}', '[' or ']'.
	tokenString                  // A string literal, e.g. "abc\u1234".
	tokenNumber                  // Number literal, e.g. 1.5e5.
	tokenBool                    // Boolean literal: true or false.
	tokenNull                    // null keyword.
)

// token describes a single token: type, position in the input and value.
type token struct {
	kind tokenKind // Type of a token.

	boolValue       bool   // Value if a boolean literal token.
	byteValueCloned bool   // true if byteValue was allocated and does not refer to the original JSON body.
	byteValue       []byte // Raw value of a token.
	delimValue      byte   // Value if a delimiter token.
}

// Lexer is a JSON lexer: it iterates over JSON tokens in a byte slice.
type Lexer struct {
	Data []byte // Input data given to the lexer.

	start int   // Start of the current token.
	pos   int   // Current unscanned position in the input stream.
	token token // Last scanned token, if token.kind != tokenUndef.

	firstElement bool // Whether the current element is the first in an array or an object.
	wantSep      byte // A comma or colon character that needs to occur before the next token.

	UseMultipleErrors bool          // Whether to collect multiple errors instead of stopping at the first one.
	fatalError        error         // Fatal error occurred during lexing. It is usually a syntax error.
	multipleErrors    []*LexerError // Semantic errors that occurred during lexing. Parsing continues after these errors are recorded.
}

// FetchToken scans the input for the next token.
func (r *Lexer) FetchToken() {
	r.token.kind = tokenUndef
	r.start = r.pos

	// Check that r.pos does not point past the end of r.Data;
	// if it does, the input data is corrupted.
	if len(r.Data) < r.pos {
		r.errParse("Unexpected end of data")
		return
	}

	// Determine the type of a token by skipping whitespace and reading the
	// first character.
	for _, c := range r.Data[r.pos:] {
		switch c {
		case ':', ',':
			if r.wantSep == c {
				r.pos++
				r.start++
				r.wantSep = 0
			} else {
				r.errSyntax()
			}

		case ' ', '\t', '\r', '\n':
			r.pos++
			r.start++

		case '"':
			if r.wantSep != 0 {
				r.errSyntax()
			}

			r.token.kind = tokenString
			r.fetchString()
			return

		case '{', '[':
			if r.wantSep != 0 {
				r.errSyntax()
			}
			r.firstElement = true
			r.token.kind = tokenDelim
			r.token.delimValue = r.Data[r.pos]
			r.pos++
			return

		case '}', ']':
			if !r.firstElement && (r.wantSep != ',') {
				r.errSyntax()
			}
			r.wantSep = 0
			r.token.kind = tokenDelim
			r.token.delimValue = r.Data[r.pos]
			r.pos++
			return

		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-':
			if r.wantSep != 0 {
				r.errSyntax()
			}
			r.token.kind = tokenNumber
			r.fetchNumber()
			return

		case 'n':
			if r.wantSep != 0 {
				r.errSyntax()
			}

			r.token.kind = tokenNull
			r.fetchNull()
			return

		case 't':
			if r.wantSep != 0 {
				r.errSyntax()
			}

			r.token.kind = tokenBool
			r.token.boolValue = true
			r.fetchTrue()
			return

		case 'f':
			if r.wantSep != 0 {
				r.errSyntax()
			}

			r.token.kind = tokenBool
			r.token.boolValue = false
			r.fetchFalse()
			return

		default:
			r.errSyntax()
			return
		}
	}
	r.fatalError = io.EOF
}
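
// A minimal sketch of driving the lexer by hand; generated easyjson code
// follows the same pattern, and the input below is illustrative only:
//
//	l := &Lexer{Data: []byte(`[1, 2]`)}
//	l.Delim('[')  // fetch and consume '['
//	a := l.Int()  // fetch and consume 1
//	l.WantComma() // require a ',' before the next token
//	b := l.Int()
//	l.WantComma()
//	l.Delim(']')
//	if err := l.Error(); err != nil {
//		// handle the syntax error
//	}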

// isTokenEnd returns true if the char can follow a non-delimiter token.
func isTokenEnd(c byte) bool {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n' ||
		c == '[' || c == ']' || c == '{' || c == '}' || c == ',' || c == ':'
}

// fetchNull fetches and checks the remaining bytes of the null keyword.
func (r *Lexer) fetchNull() {
	r.pos += 4
	if r.pos > len(r.Data) ||
		r.Data[r.pos-3] != 'u' ||
		r.Data[r.pos-2] != 'l' ||
		r.Data[r.pos-1] != 'l' ||
		(r.pos != len(r.Data) && !isTokenEnd(r.Data[r.pos])) {

		r.pos -= 4
		r.errSyntax()
	}
}

// fetchTrue fetches and checks the remaining bytes of the true keyword.
func (r *Lexer) fetchTrue() {
	r.pos += 4
	if r.pos > len(r.Data) ||
		r.Data[r.pos-3] != 'r' ||
		r.Data[r.pos-2] != 'u' ||
		r.Data[r.pos-1] != 'e' ||
		(r.pos != len(r.Data) && !isTokenEnd(r.Data[r.pos])) {

		r.pos -= 4
		r.errSyntax()
	}
}

// fetchFalse fetches and checks the remaining bytes of the false keyword.
func (r *Lexer) fetchFalse() {
	r.pos += 5
	if r.pos > len(r.Data) ||
		r.Data[r.pos-4] != 'a' ||
		r.Data[r.pos-3] != 'l' ||
		r.Data[r.pos-2] != 's' ||
		r.Data[r.pos-1] != 'e' ||
		(r.pos != len(r.Data) && !isTokenEnd(r.Data[r.pos])) {

		r.pos -= 5
		r.errSyntax()
	}
}

// fetchNumber scans a number literal token.
func (r *Lexer) fetchNumber() {
	hasE := false
	afterE := false
	hasDot := false

	r.pos++
	for i, c := range r.Data[r.pos:] {
		switch {
		case c >= '0' && c <= '9':
			afterE = false
		case c == '.' && !hasDot:
			hasDot = true
		case (c == 'e' || c == 'E') && !hasE:
			hasE = true
			hasDot = true
			afterE = true
		case (c == '+' || c == '-') && afterE:
			afterE = false
		default:
			r.pos += i
			if !isTokenEnd(c) {
				r.errSyntax()
			} else {
				r.token.byteValue = r.Data[r.start:r.pos]
			}
			return
		}
	}

	r.pos = len(r.Data)
	r.token.byteValue = r.Data[r.start:]
}

// findStringLen scans the string literal for the ending quote char to determine the required size.
// The size is exact if no escapes are present, and may overestimate the decoded length otherwise.
func findStringLen(data []byte) (isValid bool, length int) {
	for {
		idx := bytes.IndexByte(data, '"')
		if idx == -1 {
			return false, len(data)
		}
		if idx == 0 || (idx > 0 && data[idx-1] != '\\') {
			return true, length + idx
		}

		// Count the run of preceding backslashes: an even number of slashes
		// means the quote is not actually escaped.
		cnt := 1
		for idx-cnt-1 >= 0 && data[idx-cnt-1] == '\\' {
			cnt++
		}
		if cnt%2 == 0 {
			return true, length + idx
		}

		length += idx + 1
		data = data[idx+1:]
	}
}

// unescapeStringToken performs unescaping of a string token.
// If no escaping is needed, the original string is kept; otherwise a new one is allocated.
func (r *Lexer) unescapeStringToken() (err error) {
	data := r.token.byteValue
	var unescapedData []byte

	for {
		i := bytes.IndexByte(data, '\\')
		if i == -1 {
			break
		}

		escapedRune, escapedBytes, err := decodeEscape(data[i:])
		if err != nil {
			r.errParse(err.Error())
			return err
		}

		if unescapedData == nil {
			unescapedData = make([]byte, 0, len(r.token.byteValue))
		}

		var d [4]byte
		s := utf8.EncodeRune(d[:], escapedRune)
		unescapedData = append(unescapedData, data[:i]...)
		unescapedData = append(unescapedData, d[:s]...)

		data = data[i+escapedBytes:]
	}

	if unescapedData != nil {
		r.token.byteValue = append(unescapedData, data...)
		r.token.byteValueCloned = true
	}

	return
}

// getu4 decodes \uXXXX from the beginning of s, returning the hex value,
// or -1 on failure.
func getu4(s []byte) rune {
	if len(s) < 6 || s[0] != '\\' || s[1] != 'u' {
		return -1
	}
	var val rune
	for i := 2; i < len(s) && i < 6; i++ {
		var v byte
		c := s[i]
		switch c {
		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			v = c - '0'
		case 'a', 'b', 'c', 'd', 'e', 'f':
			v = c - 'a' + 10
		case 'A', 'B', 'C', 'D', 'E', 'F':
			v = c - 'A' + 10
		default:
			return -1
		}

		val <<= 4
		val |= rune(v)
	}
	return val
}

// decodeEscape processes a single escape sequence and returns the number of bytes processed.
func decodeEscape(data []byte) (decoded rune, bytesProcessed int, err error) {
	if len(data) < 2 {
		return 0, 0, errors.New("incorrect escape symbol \\ at the end of token")
	}

	c := data[1]
	switch c {
	case '"', '/', '\\':
		return rune(c), 2, nil
	case 'b':
		return '\b', 2, nil
	case 'f':
		return '\f', 2, nil
	case 'n':
		return '\n', 2, nil
	case 'r':
		return '\r', 2, nil
	case 't':
		return '\t', 2, nil
	case 'u':
		rr := getu4(data)
		if rr < 0 {
			return 0, 0, errors.New("incorrectly escaped \\uXXXX sequence")
		}

		read := 6
		if utf16.IsSurrogate(rr) {
			rr1 := getu4(data[read:])
			if dec := utf16.DecodeRune(rr, rr1); dec != unicode.ReplacementChar {
				read += 6
				rr = dec
			} else {
				rr = unicode.ReplacementChar
			}
		}
		return rr, read, nil
	}

	return 0, 0, errors.New("incorrectly escaped bytes")
}
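
// For reference, decodeEscape consumes exactly one escape sequence and reports
// how many input bytes it used; the examples below follow directly from the
// cases above:
//
//	decodeEscape([]byte(`\n`))           // '\n', 2, nil
//	decodeEscape([]byte(`\u0041`))       // 'A', 6, nil
//	decodeEscape([]byte(`\uD83D\uDE00`)) // U+1F600 (surrogate pair), 12, nil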

// fetchString scans a string literal token.
func (r *Lexer) fetchString() {
	r.pos++
	data := r.Data[r.pos:]

	isValid, length := findStringLen(data)
	if !isValid {
		r.pos += length
		r.errParse("unterminated string literal")
		return
	}
	r.token.byteValue = data[:length]
	r.pos += length + 1 // Skip the closing '"' as well.
}

// scanToken scans the next token if no token is currently available in the lexer.
func (r *Lexer) scanToken() {
	if r.token.kind != tokenUndef || r.fatalError != nil {
		return
	}

	r.FetchToken()
}

// consume resets the current token to allow scanning the next one.
func (r *Lexer) consume() {
	r.token.kind = tokenUndef
	r.token.byteValueCloned = false
	r.token.delimValue = 0
}

// Ok returns true if no error (including io.EOF) was encountered during scanning.
func (r *Lexer) Ok() bool {
	return r.fatalError == nil
}

const maxErrorContextLen = 13

func (r *Lexer) errParse(what string) {
	if r.fatalError == nil {
		var str string
		if len(r.Data)-r.pos <= maxErrorContextLen {
			str = string(r.Data)
		} else {
			str = string(r.Data[r.pos:r.pos+maxErrorContextLen-3]) + "..."
		}
		r.fatalError = &LexerError{
			Reason: what,
			Offset: r.pos,
			Data:   str,
		}
	}
}

func (r *Lexer) errSyntax() {
	r.errParse("syntax error")
}

func (r *Lexer) errInvalidToken(expected string) {
	if r.fatalError != nil {
		return
	}
	if r.UseMultipleErrors {
		r.pos = r.start
		r.consume()
		r.SkipRecursive()
		switch expected {
		case "[":
			r.token.delimValue = ']'
			r.token.kind = tokenDelim
		case "{":
			r.token.delimValue = '}'
			r.token.kind = tokenDelim
		}
		r.addNonfatalError(&LexerError{
			Reason: fmt.Sprintf("expected %s", expected),
			Offset: r.start,
			Data:   string(r.Data[r.start:r.pos]),
		})
		return
	}

	var str string
	if len(r.token.byteValue) <= maxErrorContextLen {
		str = string(r.token.byteValue)
	} else {
		str = string(r.token.byteValue[:maxErrorContextLen-3]) + "..."
	}
	r.fatalError = &LexerError{
		Reason: fmt.Sprintf("expected %s", expected),
		Offset: r.pos,
		Data:   str,
	}
}

// GetPos returns the current position in the input.
func (r *Lexer) GetPos() int {
	return r.pos
}

// Delim consumes a token and verifies that it is the given delimiter.
func (r *Lexer) Delim(c byte) {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}

	if !r.Ok() || r.token.delimValue != c {
		r.consume() // errInvalidToken can change the token if UseMultipleErrors is enabled.
		r.errInvalidToken(string([]byte{c}))
	} else {
		r.consume()
	}
}

// IsDelim returns true if there was no scanning error and the next token is the given delimiter.
func (r *Lexer) IsDelim(c byte) bool {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}
	return !r.Ok() || r.token.delimValue == c
}

// Null verifies that the next token is null and consumes it.
func (r *Lexer) Null() {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}
	if !r.Ok() || r.token.kind != tokenNull {
		r.errInvalidToken("null")
	}
	r.consume()
}

// IsNull returns true if the next token is the null keyword.
func (r *Lexer) IsNull() bool {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}
	return r.Ok() && r.token.kind == tokenNull
}

// Skip skips a single token.
func (r *Lexer) Skip() {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}
	r.consume()
}
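
// A sketch of the multiple-errors mode (input chosen for illustration): with
// UseMultipleErrors set, a type mismatch is recorded via errInvalidToken as a
// nonfatal error and the offending value is skipped, so decoding continues:
//
//	l := &Lexer{Data: []byte(`{"n": "oops"}`), UseMultipleErrors: true}
//	l.Delim('{')
//	_ = l.UnsafeFieldName(false)
//	l.WantColon()
//	_ = l.Int() // records "expected number" and skips "oops"
//	l.WantComma()
//	l.Delim('}')
//	errs := l.GetNonFatalErrors()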
continue case == '"' && : = case == '"': = true } = false } .pos = len(.Data) .fatalError = &LexerError{ Reason: "EOF reached while skipping array/object or token", Offset: .pos, Data: string(.Data[.pos:]), } } // Raw fetches the next item recursively as a data slice func ( *Lexer) () []byte { .SkipRecursive() if !.Ok() { return nil } return .Data[.start:.pos] } // IsStart returns whether the lexer is positioned at the start // of an input string. func ( *Lexer) () bool { return .pos == 0 } // Consumed reads all remaining bytes from the input, publishing an error if // there is anything but whitespace remaining. func ( *Lexer) () { if .pos > len(.Data) || !.Ok() { return } for , := range .Data[.pos:] { if != ' ' && != '\t' && != '\r' && != '\n' { .AddError(&LexerError{ Reason: "invalid character '" + string() + "' after top-level value", Offset: .pos, Data: string(.Data[.pos:]), }) return } .pos++ .start++ } } func ( *Lexer) ( bool) (string, []byte) { if .token.kind == tokenUndef && .Ok() { .FetchToken() } if !.Ok() || .token.kind != tokenString { .errInvalidToken("string") return "", nil } if ! { if := .unescapeStringToken(); != nil { .errInvalidToken("string") return "", nil } } := .token.byteValue := bytesToStr(.token.byteValue) .consume() return , } // UnsafeString returns the string value if the token is a string literal. // // Warning: returned string may point to the input buffer, so the string should not outlive // the input buffer. Intended pattern of usage is as an argument to a switch statement. func ( *Lexer) () string { , := .unsafeString(false) return } // UnsafeBytes returns the byte slice if the token is a string literal. func ( *Lexer) () []byte { , := .unsafeString(false) return } // UnsafeFieldName returns current member name string token func ( *Lexer) ( bool) string { , := .unsafeString() return } // String reads a string literal. func ( *Lexer) () string { if .token.kind == tokenUndef && .Ok() { .FetchToken() } if !.Ok() || .token.kind != tokenString { .errInvalidToken("string") return "" } if := .unescapeStringToken(); != nil { .errInvalidToken("string") return "" } var string if .token.byteValueCloned { = bytesToStr(.token.byteValue) } else { = string(.token.byteValue) } .consume() return } // StringIntern reads a string literal, and performs string interning on it. func ( *Lexer) () string { if .token.kind == tokenUndef && .Ok() { .FetchToken() } if !.Ok() || .token.kind != tokenString { .errInvalidToken("string") return "" } if := .unescapeStringToken(); != nil { .errInvalidToken("string") return "" } := intern.Bytes(.token.byteValue) .consume() return } // Bytes reads a string literal and base64 decodes it into a byte slice. func ( *Lexer) () []byte { if .token.kind == tokenUndef && .Ok() { .FetchToken() } if !.Ok() || .token.kind != tokenString { .errInvalidToken("string") return nil } if := .unescapeStringToken(); != nil { .errInvalidToken("string") return nil } := make([]byte, base64.StdEncoding.DecodedLen(len(.token.byteValue))) , := base64.StdEncoding.Decode(, .token.byteValue) if != nil { .fatalError = &LexerError{ Reason: .Error(), } return nil } .consume() return [:] } // Bool reads a true or false boolean keyword. 

// Bool reads a true or false boolean keyword.
func (r *Lexer) Bool() bool {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}
	if !r.Ok() || r.token.kind != tokenBool {
		r.errInvalidToken("bool")
		return false
	}
	ret := r.token.boolValue
	r.consume()
	return ret
}

// number reads a number literal token and returns it as a string.
func (r *Lexer) number() string {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}
	if !r.Ok() || r.token.kind != tokenNumber {
		r.errInvalidToken("number")
		return ""
	}
	ret := bytesToStr(r.token.byteValue)
	r.consume()
	return ret
}

// Uint8 reads a number literal and parses it as an uint8.
func (r *Lexer) Uint8() uint8 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 8)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return uint8(n)
}

// Uint16 reads a number literal and parses it as an uint16.
func (r *Lexer) Uint16() uint16 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 16)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return uint16(n)
}

// Uint32 reads a number literal and parses it as an uint32.
func (r *Lexer) Uint32() uint32 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 32)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return uint32(n)
}

// Uint64 reads a number literal and parses it as an uint64.
func (r *Lexer) Uint64() uint64 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 64)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return n
}

// Uint reads a number literal and parses it as an uint.
func (r *Lexer) Uint() uint {
	return uint(r.Uint64())
}

// Int8 reads a number literal and parses it as an int8.
func (r *Lexer) Int8() int8 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 8)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return int8(n)
}

// Int16 reads a number literal and parses it as an int16.
func (r *Lexer) Int16() int16 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 16)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return int16(n)
}

// Int32 reads a number literal and parses it as an int32.
func (r *Lexer) Int32() int32 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 32)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return int32(n)
}

// Int64 reads a number literal and parses it as an int64.
func (r *Lexer) Int64() int64 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 64)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return n
}

// Int reads a number literal and parses it as an int.
func (r *Lexer) Int() int {
	return int(r.Int64())
}

// Uint8Str reads a string literal containing a number and parses it as an uint8.
func (r *Lexer) Uint8Str() uint8 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 8)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return uint8(n)
}

// Uint16Str reads a string literal containing a number and parses it as an uint16.
func (r *Lexer) Uint16Str() uint16 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 16)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return uint16(n)
}

// Uint32Str reads a string literal containing a number and parses it as an uint32.
func (r *Lexer) Uint32Str() uint32 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 32)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return uint32(n)
}

// Uint64Str reads a string literal containing a number and parses it as an uint64.
func (r *Lexer) Uint64Str() uint64 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseUint(s, 10, 64)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return n
}

// UintStr reads a string literal containing a number and parses it as an uint.
func (r *Lexer) UintStr() uint {
	return uint(r.Uint64Str())
}

// UintptrStr reads a string literal containing a number and parses it as an uintptr.
func (r *Lexer) UintptrStr() uintptr {
	return uintptr(r.Uint64Str())
}

// Int8Str reads a string literal containing a number and parses it as an int8.
func (r *Lexer) Int8Str() int8 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 8)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return int8(n)
}

// Int16Str reads a string literal containing a number and parses it as an int16.
func (r *Lexer) Int16Str() int16 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 16)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return int16(n)
}

// Int32Str reads a string literal containing a number and parses it as an int32.
func (r *Lexer) Int32Str() int32 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 32)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return int32(n)
}

// Int64Str reads a string literal containing a number and parses it as an int64.
func (r *Lexer) Int64Str() int64 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseInt(s, 10, 64)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return n
}

// IntStr reads a string literal containing a number and parses it as an int.
func (r *Lexer) IntStr() int {
	return int(r.Int64Str())
}

// Float32 reads a number literal and parses it as a float32.
func (r *Lexer) Float32() float32 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseFloat(s, 32)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return float32(n)
}

// Float32Str reads a string literal containing a number and parses it as a float32.
func (r *Lexer) Float32Str() float32 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseFloat(s, 32)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return float32(n)
}

// Float64 reads a number literal and parses it as a float64.
func (r *Lexer) Float64() float64 {
	s := r.number()
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseFloat(s, 64)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   s,
		})
	}
	return n
}

// Float64Str reads a string literal containing a number and parses it as a float64.
func (r *Lexer) Float64Str() float64 {
	s, b := r.unsafeString(false)
	if !r.Ok() {
		return 0
	}

	n, err := strconv.ParseFloat(s, 64)
	if err != nil {
		r.addNonfatalError(&LexerError{
			Offset: r.start,
			Reason: err.Error(),
			Data:   string(b),
		})
	}
	return n
}

// Error returns the fatal error, if any, that occurred during lexing.
func (r *Lexer) Error() error {
	return r.fatalError
}

// AddError sets e as the fatal error unless one has already been recorded.
func (r *Lexer) AddError(e error) {
	if r.fatalError == nil {
		r.fatalError = e
	}
}

// AddNonFatalError records e as a nonfatal error with the current token's context.
func (r *Lexer) AddNonFatalError(e error) {
	r.addNonfatalError(&LexerError{
		Offset: r.start,
		Data:   string(r.Data[r.start:r.pos]),
		Reason: e.Error(),
	})
}

func (r *Lexer) addNonfatalError(err *LexerError) {
	if r.UseMultipleErrors {
		// We don't want to add errors with the same offset.
		if len(r.multipleErrors) != 0 && r.multipleErrors[len(r.multipleErrors)-1].Offset == err.Offset {
			return
		}
		r.multipleErrors = append(r.multipleErrors, err)
		return
	}
	r.fatalError = err
}

// GetNonFatalErrors returns the errors collected so far in multiple-errors mode.
func (r *Lexer) GetNonFatalErrors() []*LexerError {
	return r.multipleErrors
}

// JsonNumber fetches the next token and returns it as a json.Number (from the
// 'encoding/json' package). Number, string and null tokens are all accepted.
func (r *Lexer) JsonNumber() json.Number {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}
	if !r.Ok() {
		r.errInvalidToken("json.Number")
		return json.Number("")
	}

	switch r.token.kind {
	case tokenString:
		return json.Number(r.String())
	case tokenNumber:
		return json.Number(r.Raw())
	case tokenNull:
		r.Null()
		return json.Number("")
	default:
		r.errSyntax()
		return json.Number("")
	}
}

// Interface fetches an interface{} analogous to the 'encoding/json' package.
func (r *Lexer) Interface() interface{} {
	if r.token.kind == tokenUndef && r.Ok() {
		r.FetchToken()
	}

	if !r.Ok() {
		return nil
	}
	switch r.token.kind {
	case tokenString:
		return r.String()
	case tokenNumber:
		return r.Float64()
	case tokenBool:
		return r.Bool()
	case tokenNull:
		r.Null()
		return nil
	}

	if r.token.delimValue == '{' {
		r.consume()

		ret := map[string]interface{}{}
		for !r.IsDelim('}') {
			key := r.String()
			r.WantColon()
			ret[key] = r.Interface()
			r.WantComma()
		}
		r.Delim('}')

		if r.Ok() {
			return ret
		}
		return nil
	} else if r.token.delimValue == '[' {
		r.consume()

		ret := []interface{}{}
		for !r.IsDelim(']') {
			ret = append(ret, r.Interface())
			r.WantComma()
		}
		r.Delim(']')

		if r.Ok() {
			return ret
		}
		return nil
	}
	r.errSyntax()
	return nil
}

// WantComma requires a comma to be present before fetching the next token.
func (r *Lexer) WantComma() {
	r.wantSep = ','
	r.firstElement = false
}

// WantColon requires a colon to be present before fetching the next token.
func (r *Lexer) WantColon() {
	r.wantSep = ':'
	r.firstElement = false
}
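
// End-to-end usage sketch. This mirrors the shape of the code easyjson
// generates for a struct with fields "x" and "y"; decodePoint is a
// hypothetical consumer, not part of this package's API:
//
//	func decodePoint(l *jlexer.Lexer) (x, y int) {
//		l.Delim('{')
//		for !l.IsDelim('}') {
//			key := l.UnsafeFieldName(false)
//			l.WantColon()
//			switch key {
//			case "x":
//				x = l.Int()
//			case "y":
//				y = l.Int()
//			default:
//				l.SkipRecursive()
//			}
//			l.WantComma()
//		}
//		l.Delim('}')
//		l.Consumed()
//		return x, y
//	}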