13 itemError itemType = iota
14 itemNIL // used in the parser to indicate no type
20 itemRawMultilineString
25 itemArray // the start of an array
53 inlineTableStart = '{'
57 type stateFn func(lx *lexer) stateFn
67 // Allow for backing up up to three runes.
68 // This is necessary because TOML contains 3-rune tokens (""" and ''').
70 nprev int // how many of prevWidths are in use
71 // If we emit an eof, we can still back up, but it is not OK to call
75 // A stack of state functions used to maintain context.
76 // The idea is to reuse parts of the state machine in various places.
77 // For example, values can appear at the top level or within arbitrarily
78 // nested arrays. The last state on the stack is used after a value has
79 // been lexed. Similarly for comments.
89 func (lx *lexer) nextItem() item {
92 case item := <-lx.items:
95 lx.state = lx.state(lx)
100 func lex(input string) *lexer {
105 items: make(chan item, 10),
106 stack: make([]stateFn, 0, 10),
111 func (lx *lexer) push(state stateFn) {
112 lx.stack = append(lx.stack, state)
115 func (lx *lexer) pop() stateFn {
116 if len(lx.stack) == 0 {
117 return lx.errorf("BUG in lexer: no states to pop")
119 last := lx.stack[len(lx.stack)-1]
120 lx.stack = lx.stack[0 : len(lx.stack)-1]
124 func (lx *lexer) current() string {
125 return lx.input[lx.start:lx.pos]
128 func (lx *lexer) emit(typ itemType) {
129 lx.items <- item{typ, lx.current(), lx.line}
133 func (lx *lexer) emitTrim(typ itemType) {
134 lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.line}
138 func (lx *lexer) next() (r rune) {
140 panic("next called after EOF")
142 if lx.pos >= len(lx.input) {
147 if lx.input[lx.pos] == '\n' {
150 lx.prevWidths[2] = lx.prevWidths[1]
151 lx.prevWidths[1] = lx.prevWidths[0]
155 r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
161 // ignore skips over the pending input before this point.
162 func (lx *lexer) ignore() {
166 // backup steps back one rune. Can be called up to three times between calls to next.
167 func (lx *lexer) backup() {
173 panic("backed up too far")
175 w := lx.prevWidths[0]
176 lx.prevWidths[0] = lx.prevWidths[1]
177 lx.prevWidths[1] = lx.prevWidths[2]
180 if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
185 // accept consumes the next rune if it's equal to `valid`.
186 func (lx *lexer) accept(valid rune) bool {
187 if lx.next() == valid {
194 // peek returns but does not consume the next rune in the input.
195 func (lx *lexer) peek() rune {
201 // skip ignores all input that matches the given predicate.
202 func (lx *lexer) skip(pred func(rune) bool) {
214 // errorf stops all lexing by emitting an error and returning `nil`.
215 // Note that any value that is a character is escaped if it's a special
216 // character (newlines, tabs, etc.).
217 func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
220 fmt.Sprintf(format, values...),
226 // lexTop consumes elements at the top level of TOML data.
227 func lexTop(lx *lexer) stateFn {
229 if isWhitespace(r) || isNL(r) {
230 return lexSkip(lx, lexTop)
235 return lexCommentStart
239 if lx.pos > lx.start {
240 return lx.errorf("unexpected EOF")
246 // At this point, the only valid item can be a key, so we back up
247 // and let the key lexer do the rest.
253 // lexTopEnd is entered whenever a top-level item has been consumed. (A value
254 // or a table.) It must see only whitespace, and will turn back to lexTop
255 // upon a newline. If it sees EOF, it will quit the lexer successfully.
256 func lexTopEnd(lx *lexer) stateFn {
259 case r == commentStart:
260 // a comment will read to a newline for us.
262 return lexCommentStart
263 case isWhitespace(r):
272 return lx.errorf("expected a top-level item to end with a newline, "+
273 "comment, or EOF, but got %q instead", r)
276 // lexTableStart lexes the beginning of a table. Namely, it makes sure that
277 // it starts with a character other than '.' and ']'.
278 // It assumes that '[' has already been consumed.
279 // It also handles the case that this is an item in an array of tables.
281 func lexTableStart(lx *lexer) stateFn {
282 if lx.peek() == arrayTableStart {
284 lx.emit(itemArrayTableStart)
285 lx.push(lexArrayTableEnd)
287 lx.emit(itemTableStart)
290 return lexTableNameStart
293 func lexTableEnd(lx *lexer) stateFn {
294 lx.emit(itemTableEnd)
298 func lexArrayTableEnd(lx *lexer) stateFn {
299 if r := lx.next(); r != arrayTableEnd {
300 return lx.errorf("expected end of table array name delimiter %q, "+
301 "but got %q instead", arrayTableEnd, r)
303 lx.emit(itemArrayTableEnd)
307 func lexTableNameStart(lx *lexer) stateFn {
308 lx.skip(isWhitespace)
309 switch r := lx.peek(); {
310 case r == tableEnd || r == eof:
311 return lx.errorf("unexpected end of table name " +
312 "(table names cannot be empty)")
314 return lx.errorf("unexpected table separator " +
315 "(table names cannot be empty)")
316 case r == stringStart || r == rawStringStart:
318 lx.push(lexTableNameEnd)
319 return lexValue // reuse string lexing
321 return lexBareTableName
325 // lexBareTableName lexes the name of a table. It assumes that at least one
326 // valid character for the table has already been read.
327 func lexBareTableName(lx *lexer) stateFn {
329 if isBareKeyChar(r) {
330 return lexBareTableName
334 return lexTableNameEnd
337 // lexTableNameEnd reads the end of a piece of a table name, optionally
338 // consuming whitespace.
339 func lexTableNameEnd(lx *lexer) stateFn {
340 lx.skip(isWhitespace)
341 switch r := lx.next(); {
342 case isWhitespace(r):
343 return lexTableNameEnd
346 return lexTableNameStart
350 return lx.errorf("expected '.' or ']' to end table name, "+
351 "but got %q instead", r)
355 // lexKeyStart consumes a key name up until the first non-whitespace character.
356 // lexKeyStart will ignore whitespace.
357 func lexKeyStart(lx *lexer) stateFn {
361 return lx.errorf("unexpected key separator %q", keySep)
362 case isWhitespace(r) || isNL(r):
364 return lexSkip(lx, lexKeyStart)
365 case r == stringStart || r == rawStringStart:
367 lx.emit(itemKeyStart)
369 return lexValue // reuse string lexing
372 lx.emit(itemKeyStart)
377 // lexBareKey consumes the text of a bare key. Assumes that the first character
378 // (which is not whitespace) has not yet been consumed.
379 func lexBareKey(lx *lexer) stateFn {
380 switch r := lx.next(); {
381 case isBareKeyChar(r):
383 case isWhitespace(r):
392 return lx.errorf("bare keys cannot contain %q", r)
396 // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
398 func lexKeyEnd(lx *lexer) stateFn {
399 switch r := lx.next(); {
401 return lexSkip(lx, lexValue)
402 case isWhitespace(r):
403 return lexSkip(lx, lexKeyEnd)
405 return lx.errorf("expected key separator %q, but got %q instead",
410 // lexValue starts the consumption of a value anywhere a value is expected.
411 // lexValue will ignore whitespace.
412 // After a value is lexed, the last state on the stack is popped and returned.
413 func lexValue(lx *lexer) stateFn {
414 // We allow whitespace to precede a value, but NOT newlines.
415 // In array syntax, the array states are responsible for ignoring newlines.
418 case isWhitespace(r):
419 return lexSkip(lx, lexValue)
421 lx.backup() // avoid an extra state and use the same as above
422 return lexNumberOrDateStart
429 case inlineTableStart:
431 lx.emit(itemInlineTableStart)
432 return lexInlineTableValue
434 if lx.accept(stringStart) {
435 if lx.accept(stringStart) {
436 lx.ignore() // Ignore """
437 return lexMultilineString
441 lx.ignore() // ignore the '"'
444 if lx.accept(rawStringStart) {
445 if lx.accept(rawStringStart) {
446 lx.ignore() // Ignore '''
447 return lexMultilineRawString
451 lx.ignore() // ignore the "'"
454 return lexNumberStart
455 case '.': // special error case, be kind to users
456 return lx.errorf("floats must start with a digit, not '.'")
458 if unicode.IsLetter(r) {
459 // Be permissive here; lexBool will give a nice error if the
460 // user wrote something like
462 // (i.e. not 'true' or 'false' but is something else word-like.)
466 return lx.errorf("expected value but found %q instead", r)
469 // lexArrayValue consumes one value in an array. It assumes that '[' or ','
470 // have already been consumed. All whitespace and newlines are ignored.
471 func lexArrayValue(lx *lexer) stateFn {
474 case isWhitespace(r) || isNL(r):
475 return lexSkip(lx, lexArrayValue)
476 case r == commentStart:
477 lx.push(lexArrayValue)
478 return lexCommentStart
480 return lx.errorf("unexpected comma")
482 // NOTE(caleb): The spec isn't clear about whether you can have
483 // a trailing comma or not, so we'll allow it.
488 lx.push(lexArrayValueEnd)
492 // lexArrayValueEnd consumes everything between the end of an array value and
493 // the next value (or the end of the array): it ignores whitespace and newlines
494 // and expects either a ',' or a ']'.
495 func lexArrayValueEnd(lx *lexer) stateFn {
498 case isWhitespace(r) || isNL(r):
499 return lexSkip(lx, lexArrayValueEnd)
500 case r == commentStart:
501 lx.push(lexArrayValueEnd)
502 return lexCommentStart
505 return lexArrayValue // move on to the next value
510 "expected a comma or array terminator %q, but got %q instead",
515 // lexArrayEnd finishes the lexing of an array.
516 // It assumes that a ']' has just been consumed.
517 func lexArrayEnd(lx *lexer) stateFn {
519 lx.emit(itemArrayEnd)
523 // lexInlineTableValue consumes one key/value pair in an inline table.
524 // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
525 func lexInlineTableValue(lx *lexer) stateFn {
528 case isWhitespace(r):
529 return lexSkip(lx, lexInlineTableValue)
531 return lx.errorf("newlines not allowed within inline tables")
532 case r == commentStart:
533 lx.push(lexInlineTableValue)
534 return lexCommentStart
536 return lx.errorf("unexpected comma")
537 case r == inlineTableEnd:
538 return lexInlineTableEnd
541 lx.push(lexInlineTableValueEnd)
545 // lexInlineTableValueEnd consumes everything between the end of an inline table
546 // key/value pair and the next pair (or the end of the table):
547 // it ignores whitespace and expects either a ',' or a '}'.
548 func lexInlineTableValueEnd(lx *lexer) stateFn {
551 case isWhitespace(r):
552 return lexSkip(lx, lexInlineTableValueEnd)
554 return lx.errorf("newlines not allowed within inline tables")
555 case r == commentStart:
556 lx.push(lexInlineTableValueEnd)
557 return lexCommentStart
560 return lexInlineTableValue
561 case r == inlineTableEnd:
562 return lexInlineTableEnd
564 return lx.errorf("expected a comma or an inline table terminator %q, "+
565 "but got %q instead", inlineTableEnd, r)
568 // lexInlineTableEnd finishes the lexing of an inline table.
569 // It assumes that a '}' has just been consumed.
570 func lexInlineTableEnd(lx *lexer) stateFn {
572 lx.emit(itemInlineTableEnd)
576 // lexString consumes the inner contents of a string. It assumes that the
577 // beginning '"' has already been consumed and ignored.
578 func lexString(lx *lexer) stateFn {
582 return lx.errorf("unexpected EOF")
584 return lx.errorf("strings cannot contain newlines")
587 return lexStringEscape
598 // lexMultilineString consumes the inner contents of a string. It assumes that
599 // the beginning '"""' has already been consumed and ignored.
600 func lexMultilineString(lx *lexer) stateFn {
603 return lx.errorf("unexpected EOF")
605 return lexMultilineStringEscape
607 if lx.accept(stringEnd) {
608 if lx.accept(stringEnd) {
612 lx.emit(itemMultilineString)
622 return lexMultilineString
625 // lexRawString consumes a raw string. Nothing can be escaped in such a string.
626 // It assumes that the beginning "'" has already been consumed and ignored.
627 func lexRawString(lx *lexer) stateFn {
631 return lx.errorf("unexpected EOF")
633 return lx.errorf("strings cannot contain newlines")
634 case r == rawStringEnd:
636 lx.emit(itemRawString)
644 // lexMultilineRawString consumes a raw string. Nothing can be escaped in such
645 // a string. It assumes that the beginning "'''" has already been consumed and
647 func lexMultilineRawString(lx *lexer) stateFn {
650 return lx.errorf("unexpected EOF")
652 if lx.accept(rawStringEnd) {
653 if lx.accept(rawStringEnd) {
657 lx.emit(itemRawMultilineString)
667 return lexMultilineRawString
670 // lexMultilineStringEscape consumes an escaped character. It assumes that the
671 // preceding '\\' has already been consumed.
672 func lexMultilineStringEscape(lx *lexer) stateFn {
673 // Handle the special case first:
675 return lexMultilineString
678 lx.push(lexMultilineString)
679 return lexStringEscape(lx)
682 func lexStringEscape(lx *lexer) stateFn {
700 return lexShortUnicodeEscape
702 return lexLongUnicodeEscape
704 return lx.errorf("invalid escape character %q; only the following "+
705 "escape characters are allowed: "+
706 `\b, \t, \n, \f, \r, \", \\, \uXXXX, and \UXXXXXXXX`, r)
709 func lexShortUnicodeEscape(lx *lexer) stateFn {
711 for i := 0; i < 4; i++ {
713 if !isHexadecimal(r) {
714 return lx.errorf(`expected four hexadecimal digits after '\u', `+
715 "but got %q instead", lx.current())
721 func lexLongUnicodeEscape(lx *lexer) stateFn {
723 for i := 0; i < 8; i++ {
725 if !isHexadecimal(r) {
726 return lx.errorf(`expected eight hexadecimal digits after '\U', `+
727 "but got %q instead", lx.current())
733 // lexNumberOrDateStart consumes either an integer, a float, or datetime.
734 func lexNumberOrDateStart(lx *lexer) stateFn {
737 return lexNumberOrDate
745 return lx.errorf("floats must start with a digit, not '.'")
747 return lx.errorf("expected a digit but got %q", r)
750 // lexNumberOrDate consumes either an integer, float or datetime.
751 func lexNumberOrDate(lx *lexer) stateFn {
754 return lexNumberOrDate
770 // lexDatetime consumes a Datetime, to a first approximation.
771 // The parser validates that it matches one of the accepted formats.
772 func lexDatetime(lx *lexer) stateFn {
778 case '-', 'T', ':', '.', 'Z', '+':
783 lx.emit(itemDatetime)
787 // lexNumberStart consumes either an integer or a float. It assumes that a sign
788 // has already been read, but that *no* digits have been consumed.
789 // lexNumberStart will move to the appropriate integer or float states.
790 func lexNumberStart(lx *lexer) stateFn {
791 // We MUST see a digit. Even floats have to start with a digit.
795 return lx.errorf("floats must start with a digit, not '.'")
797 return lx.errorf("expected a digit but got %q", r)
802 // lexNumber consumes an integer or a float after seeing the first digit.
803 func lexNumber(lx *lexer) stateFn {
820 // lexFloat consumes the elements of a float. It allows any sequence of
821 // float-like characters, so floats emitted by the lexer are only a first
822 // approximation and must be validated by the parser.
823 func lexFloat(lx *lexer) stateFn {
829 case '_', '.', '-', '+', 'e', 'E':
838 // lexBool consumes a bool string: 'true' or 'false'.
839 func lexBool(lx *lexer) stateFn {
843 if !unicode.IsLetter(r) {
851 case "true", "false":
855 return lx.errorf("expected value but found %q instead", s)
858 // lexCommentStart begins the lexing of a comment. It will emit
859 // itemCommentStart and consume no characters, passing control to lexComment.
860 func lexCommentStart(lx *lexer) stateFn {
862 lx.emit(itemCommentStart)
866 // lexComment lexes an entire comment. It assumes that '#' has been consumed.
867 // It will consume *up to* the first newline character, and pass control
868 // back to the last state on the stack.
869 func lexComment(lx *lexer) stateFn {
871 if isNL(r) || r == eof {
879 // lexSkip ignores all slurped input and moves on to the next state.
880 func lexSkip(lx *lexer, nextState stateFn) stateFn {
881 return func(lx *lexer) stateFn {
887 // isWhitespace returns true if `r` is a whitespace character according
889 func isWhitespace(r rune) bool {
890 return r == '\t' || r == ' '
893 func isNL(r rune) bool {
894 return r == '\n' || r == '\r'
897 func isDigit(r rune) bool {
898 return r >= '0' && r <= '9'
901 func isHexadecimal(r rune) bool {
902 return (r >= '0' && r <= '9') ||
903 (r >= 'a' && r <= 'f') ||
904 (r >= 'A' && r <= 'F')
907 func isBareKeyChar(r rune) bool {
908 return (r >= 'A' && r <= 'Z') ||
909 (r >= 'a' && r <= 'z') ||
910 (r >= '0' && r <= '9') ||
915 func (itype itemType) String() string {
925 case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
945 case itemCommentStart:
946 return "CommentStart"
948 panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
951 func (item item) String() string {
952 return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)