Fix whitespace lexing/parsing.
Our consumeWhitespace method had a bug where it wouldn't emit the whitespace
token, meaning we weren't actually skipping the whitespace. Oops.
While in there, I removed an outdated TODO (we already supplied a default
state).
I updated the lexing of unquoted strings to take into account the rules about
spaces and unquoted strings; basically, spaces before or after are ignored as
whitespace, spaces in the middle count, and empty strings are not allowed.
I removed an extra case when deciding how to lex an unquoted string: we have
already consumed all the whitespace, so the next character shouldn't be
whitespace and there is no need to test for it.
We need to consume whitespace before we start lexing the separator character.
I updated the token debugging to be a bit more useful, by defining a String()
method on the token type itself, so it'll expose both the type and the value.
This makes unexpected errors easier to deal with, and is used in all the errors
raised by the parser now.
I added a bunch of whitespace tests for lexing and parsing.
20 tokenError tokenType = iota
30 func (t tokenType) String() string {
49 return "unknown token"
53 type stateFunc func(*lexer) stateFunc
71 func (t token) String() string {
72 return fmt.Sprintf("%s: %s", t.typ.String(), t.val)
75 func lex(input string) *lexer {
78 tokens: make(chan token),
84 func (l *lexer) nextToken() token {
88 func (l *lexer) run() {
89 for l.state = lexStart; l.state != nil; {
94 func (l *lexer) emit(t tokenType) {
96 if len(l.omitted) < 1 {
97 val = l.input[l.start:l.pos]
100 for _, pos := range l.omitted {
101 val += l.input[start:pos]
105 val += l.input[start:l.pos]
108 l.tokens <- token{typ: t, val: val}
110 l.omitted = l.omitted[0:0]
113 func (l *lexer) next() rune {
114 if l.pos >= len(l.input) {
119 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
124 func (l *lexer) omit() {
125 l.omitted = append(l.omitted, l.pos-1)
128 func (l *lexer) ignore() {
132 func (l *lexer) backup() {
136 func (l *lexer) peek() rune {
142 func (l *lexer) accept(valid string) bool {
143 if strings.IndexRune(valid, l.next()) >= 0 {
150 func (l *lexer) acceptRun(valid string) {
151 for strings.IndexRune(valid, l.next()) >= 0 {
156 func (l *lexer) errorf(format string, args ...interface{}) stateFunc {
157 l.tokens <- token{tokenError, fmt.Sprintf(format, args...)}
161 func (l *lexer) consumeWhitespace() {
162 for unicode.IsSpace(l.peek()) {
166 l.emit(tokenWhitespace)
170 func lexStart(l *lexer) stateFunc {
171 l.consumeWhitespace()
175 func lexArrayStart(l *lexer) stateFunc {
176 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
179 return l.errorf("expected array to start before %s", l.input[l.pos:])
182 func lexLeftDelim(l *lexer) stateFunc {
183 l.pos += len(leftDelim)
184 l.emit(tokenArrayStart)
189 func lexRightDelim(l *lexer) stateFunc {
190 l.pos += len(rightDelim)
191 l.emit(tokenArrayEnd)
196 func lexItem(l *lexer) stateFunc {
197 l.consumeWhitespace()
198 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
201 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
204 switch r := l.peek(); {
206 return l.errorf("unclosed array")
208 return l.errorf("empty item in array")
209 case unicode.IsSpace(r):
212 return lexQuotedString
218 func lexQuotedString(l *lexer) stateFunc {
220 l.ignore() // ignore the open quote
222 switch r := l.next(); {
224 return l.errorf("unclosed quoted string")
234 // always skip over the character following a \
237 return l.errorf("unclosed quoted string")
243 func lexString(l *lexer) stateFunc {
245 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
246 return l.errorf(leftDelim + " in unquoted string")
248 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
249 if l.pos <= l.start {
250 return l.errorf(rightDelim + " in unquoted string")
253 s := l.input[l.start:l.pos]
254 for pos, r := range s {
255 if !unicode.IsSpace(r) {
256 lastNonSpace = l.start + pos + 1
259 if lastNonSpace < 0 {
260 return l.errorf("unquoted empty string")
262 for lastNonSpace < l.pos {
265 if string(l.input[l.start:l.pos]) == "NULL" {
270 l.consumeWhitespace()
273 switch r := l.next(); {
275 return l.errorf("eof while parsing string")
277 return l.errorf("\" in unquoted string")
279 return l.errorf("\\ in unquoted string")
282 if l.pos <= l.start {
283 return l.errorf("unquoted empty string")
285 if string(l.input[l.start:l.pos]) == "NULL" {
295 func lexSeparator(l *lexer) stateFunc {
296 l.consumeWhitespace()
297 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
302 l.emit(tokenSeparator)
305 if l.arrayDepth > 0 {
306 return l.errorf("unclosed array")
312 return l.errorf("expected %s, none found before %s\n", string(separator), l.input[l.pos:])