pqarrays
pqarrays/lexer.go
Fix whitespace lexing/parsing. Our consumeWhitespace method had a bug where it wouldn't emit the whitespace token, meaning we weren't actually skipping the whitespace. Oops. While in there, I removed an outdated TODO (we already supplied a default state). I updated the lexing of unquoted strings to take into account the rules about spaces and unquoted strings; basically, spaces before or after are ignored as whitespace, spaces in the middle count, and empty strings are not allowed. I removed an extra case when detecting what to do when lexing an unquoted string; we already consumed all the whitespace, so the next character shouldn't be whitespace, so no need to test for it. We need to consume whitespace before we start lexing the separator character. I updated the token debugging to be a bit more useful, by defining a String() method on the token type itself, so it'll expose both the type and the value. This makes unexpected errors easier to deal with, and is used in all the errors raised by the parser now. I added a bunch of whitespace tests for lexing and parsing.
| paddy@0 | 1 package pqarrays |
| paddy@0 | 2 |
| paddy@0 | 3 import ( |
| paddy@0 | 4 "fmt" |
| paddy@0 | 5 "strings" |
| paddy@0 | 6 "unicode" |
| paddy@0 | 7 "unicode/utf8" |
| paddy@0 | 8 ) |
| paddy@0 | 9 |
// Lexing constants: the sentinel rune returned at end of input, the
// delimiters that open and close an array literal, and the rune that
// separates array elements.
const (
	eof        = -1
	leftDelim  = "{"
	rightDelim = "}"
	separator  = ','
)
| paddy@0 | 16 |
// tokenType identifies the kind of a lexed token.
type tokenType int

// The token types emitted by the lexer, in roughly the order they
// appear in a well-formed array literal.
const (
	tokenError      tokenType = iota // lexing failed; the token value holds the message
	tokenWhitespace                  // a run of insignificant whitespace
	tokenArrayStart                  // the { opening an array
	tokenString                      // a quoted or unquoted string value
	tokenNull                        // the unquoted literal NULL
	tokenSeparator                   // the , between elements
	tokenArrayEnd                    // the } closing an array
	tokenEOF                         // end of input
)
| paddy@0 | 29 |
| paddy@0 | 30 func (t tokenType) String() string { |
| paddy@0 | 31 switch t { |
| paddy@0 | 32 case tokenError: |
| paddy@0 | 33 return "error" |
| paddy@0 | 34 case tokenWhitespace: |
| paddy@0 | 35 return "whitespace" |
| paddy@0 | 36 case tokenArrayStart: |
| paddy@0 | 37 return "array start" |
| paddy@0 | 38 case tokenString: |
| paddy@0 | 39 return "string" |
| paddy@0 | 40 case tokenNull: |
| paddy@0 | 41 return "null" |
| paddy@0 | 42 case tokenSeparator: |
| paddy@0 | 43 return "separator" |
| paddy@0 | 44 case tokenArrayEnd: |
| paddy@0 | 45 return "array end" |
| paddy@0 | 46 case tokenEOF: |
| paddy@0 | 47 return "eof" |
| paddy@0 | 48 default: |
| paddy@0 | 49 return "unknown token" |
| paddy@0 | 50 } |
| paddy@0 | 51 } |
| paddy@0 | 52 |
// stateFunc is a lexer state: it consumes some input, emits tokens, and
// returns the next state (nil terminates the state machine).
type stateFunc func(*lexer) stateFunc

// lexer holds the state for lexing a Postgres array literal.
type lexer struct {
	tokens     chan token // lexed tokens, read by the parser via nextToken
	input      string     // the text being lexed
	start      int        // start position (in bytes) of the current token
	pos        int        // current read position (in bytes) in input
	omitted    []int      // byte positions to drop from the current token (escape backslashes)
	width      int        // byte width of the last rune read, so backup can undo it
	state      stateFunc  // current state function
	arrayDepth int        // nesting depth of { } entered but not yet closed
}

// token is a single lexed unit, passed from the lexer to the parser.
type token struct {
	typ tokenType // what kind of token this is
	val string    // the literal text of the token, minus omitted bytes
}
| paddy@0 | 70 |
| paddy@2 | 71 func (t token) String() string { |
| paddy@2 | 72 return fmt.Sprintf("%s: %s", t.typ.String(), t.val) |
| paddy@2 | 73 } |
| paddy@2 | 74 |
| paddy@0 | 75 func lex(input string) *lexer { |
| paddy@0 | 76 l := &lexer{ |
| paddy@0 | 77 input: input, |
| paddy@0 | 78 tokens: make(chan token), |
| paddy@0 | 79 } |
| paddy@0 | 80 go l.run() |
| paddy@0 | 81 return l |
| paddy@0 | 82 } |
| paddy@0 | 83 |
// nextToken returns the next token from the lexer, blocking until the
// lexing goroutine produces one.
func (l *lexer) nextToken() token {
	return <-l.tokens
}

// run drives the state machine: beginning at lexStart, each state
// returns the next state, until a state returns nil.
func (l *lexer) run() {
	for l.state = lexStart; l.state != nil; {
		l.state = l.state(l)
	}
}
| paddy@0 | 93 |
| paddy@0 | 94 func (l *lexer) emit(t tokenType) { |
| paddy@0 | 95 var val string |
| paddy@0 | 96 if len(l.omitted) < 1 { |
| paddy@0 | 97 val = l.input[l.start:l.pos] |
| paddy@0 | 98 } else { |
| paddy@0 | 99 start := l.start |
| paddy@0 | 100 for _, pos := range l.omitted { |
| paddy@0 | 101 val += l.input[start:pos] |
| paddy@0 | 102 start = pos + 1 |
| paddy@0 | 103 } |
| paddy@0 | 104 if l.pos > start { |
| paddy@0 | 105 val += l.input[start:l.pos] |
| paddy@0 | 106 } |
| paddy@0 | 107 } |
| paddy@0 | 108 l.tokens <- token{typ: t, val: val} |
| paddy@0 | 109 l.start = l.pos |
| paddy@0 | 110 l.omitted = l.omitted[0:0] |
| paddy@0 | 111 } |
| paddy@0 | 112 |
// next returns the next rune in the input and advances pos past it,
// recording the rune's byte width so backup can step back over it.
// It returns eof (with width zero) at the end of the input.
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	var r rune
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}

// omit marks the byte just consumed (assumed single-byte, e.g. a
// backslash) for removal from the token value when it is emitted.
func (l *lexer) omit() {
	l.omitted = append(l.omitted, l.pos-1)
}

// ignore discards the input consumed so far without emitting a token.
func (l *lexer) ignore() {
	l.start = l.pos
}

// backup steps back over the last rune read. It is only valid once per
// call of next, since only the last rune's width is recorded.
func (l *lexer) backup() {
	l.pos -= l.width
}

// peek returns the next rune without consuming it.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}
| paddy@0 | 141 |
| paddy@0 | 142 func (l *lexer) accept(valid string) bool { |
| paddy@0 | 143 if strings.IndexRune(valid, l.next()) >= 0 { |
| paddy@0 | 144 return true |
| paddy@0 | 145 } |
| paddy@0 | 146 l.backup() |
| paddy@0 | 147 return false |
| paddy@0 | 148 } |
| paddy@0 | 149 |
| paddy@0 | 150 func (l *lexer) acceptRun(valid string) { |
| paddy@0 | 151 for strings.IndexRune(valid, l.next()) >= 0 { |
| paddy@0 | 152 } |
| paddy@0 | 153 l.backup() |
| paddy@0 | 154 } |
| paddy@0 | 155 |
| paddy@0 | 156 func (l *lexer) errorf(format string, args ...interface{}) stateFunc { |
| paddy@0 | 157 l.tokens <- token{tokenError, fmt.Sprintf(format, args...)} |
| paddy@0 | 158 return nil |
| paddy@0 | 159 } |
| paddy@0 | 160 |
// consumeWhitespace advances past any whitespace at the current
// position, emitting it as a single whitespace token — but only if
// there actually was some, so no empty tokens are produced.
func (l *lexer) consumeWhitespace() {
	for unicode.IsSpace(l.peek()) {
		l.next()
	}
	if l.pos > l.start {
		l.emit(tokenWhitespace)
	}
}

// lexStart is the initial state: skip any leading whitespace, then
// expect the opening delimiter of the outermost array.
func lexStart(l *lexer) stateFunc {
	l.consumeWhitespace()
	return lexArrayStart
}
| paddy@0 | 174 |
| paddy@0 | 175 func lexArrayStart(l *lexer) stateFunc { |
| paddy@0 | 176 if strings.HasPrefix(l.input[l.pos:], leftDelim) { |
| paddy@0 | 177 return lexLeftDelim |
| paddy@0 | 178 } |
| paddy@0 | 179 return l.errorf("expected array to start before %s", l.input[l.pos:]) |
| paddy@0 | 180 } |
| paddy@0 | 181 |
// lexLeftDelim emits the { known to be at the current position and
// descends one array level, moving on to the array's first item.
func lexLeftDelim(l *lexer) stateFunc {
	l.pos += len(leftDelim)
	l.emit(tokenArrayStart)
	l.arrayDepth++
	return lexItem
}

// lexRightDelim emits the } known to be at the current position and
// ascends one array level; what follows is a separator, another }, or
// end of input.
func lexRightDelim(l *lexer) stateFunc {
	l.pos += len(rightDelim)
	l.emit(tokenArrayEnd)
	l.arrayDepth--
	return lexSeparator
}
| paddy@0 | 195 |
| paddy@0 | 196 func lexItem(l *lexer) stateFunc { |
| paddy@0 | 197 l.consumeWhitespace() |
| paddy@0 | 198 if strings.HasPrefix(l.input[l.pos:], rightDelim) { |
| paddy@0 | 199 return lexRightDelim |
| paddy@0 | 200 } |
| paddy@0 | 201 if strings.HasPrefix(l.input[l.pos:], leftDelim) { |
| paddy@0 | 202 return lexLeftDelim |
| paddy@0 | 203 } |
| paddy@0 | 204 switch r := l.peek(); { |
| paddy@0 | 205 case r == eof: |
| paddy@0 | 206 return l.errorf("unclosed array") |
| paddy@0 | 207 case r == separator: |
| paddy@0 | 208 return l.errorf("empty item in array") |
| paddy@0 | 209 case unicode.IsSpace(r): |
| paddy@0 | 210 return lexItem |
| paddy@0 | 211 case r == '"': |
| paddy@0 | 212 return lexQuotedString |
| paddy@0 | 213 default: |
| paddy@0 | 214 return lexString |
| paddy@0 | 215 } |
| paddy@0 | 216 } |
| paddy@0 | 217 |
| paddy@0 | 218 func lexQuotedString(l *lexer) stateFunc { |
| paddy@0 | 219 l.next() |
| paddy@0 | 220 l.ignore() // ignore the open quote |
| paddy@0 | 221 for { |
| paddy@0 | 222 switch r := l.next(); { |
| paddy@0 | 223 case r == eof: |
| paddy@0 | 224 return l.errorf("unclosed quoted string") |
| paddy@0 | 225 case r == '"': |
| paddy@0 | 226 l.backup() |
| paddy@0 | 227 l.emit(tokenString) |
| paddy@0 | 228 l.next() |
| paddy@0 | 229 l.ignore() |
| paddy@0 | 230 return lexSeparator |
| paddy@0 | 231 case r == '\\': |
| paddy@0 | 232 // omit the \ itself |
| paddy@0 | 233 l.omit() |
| paddy@0 | 234 // always skip over the character following a \ |
| paddy@0 | 235 l.next() |
| paddy@0 | 236 if r == eof { |
| paddy@0 | 237 return l.errorf("unclosed quoted string") |
| paddy@0 | 238 } |
| paddy@0 | 239 } |
| paddy@0 | 240 } |
| paddy@0 | 241 } |
| paddy@0 | 242 |
| paddy@0 | 243 func lexString(l *lexer) stateFunc { |
| paddy@0 | 244 for { |
| paddy@0 | 245 if strings.HasPrefix(l.input[l.pos:], leftDelim) { |
| paddy@0 | 246 return l.errorf(leftDelim + " in unquoted string") |
| paddy@0 | 247 } |
| paddy@0 | 248 if strings.HasPrefix(l.input[l.pos:], rightDelim) { |
| paddy@0 | 249 if l.pos <= l.start { |
| paddy@0 | 250 return l.errorf(rightDelim + " in unquoted string") |
| paddy@0 | 251 } |
| paddy@2 | 252 lastNonSpace := -1 |
| paddy@2 | 253 s := l.input[l.start:l.pos] |
| paddy@2 | 254 for pos, r := range s { |
| paddy@2 | 255 if !unicode.IsSpace(r) { |
| paddy@2 | 256 lastNonSpace = l.start + pos + 1 |
| paddy@2 | 257 } |
| paddy@2 | 258 } |
| paddy@2 | 259 if lastNonSpace < 0 { |
| paddy@2 | 260 return l.errorf("unquoted empty string") |
| paddy@2 | 261 } |
| paddy@2 | 262 for lastNonSpace < l.pos { |
| paddy@2 | 263 l.backup() |
| paddy@2 | 264 } |
| paddy@0 | 265 if string(l.input[l.start:l.pos]) == "NULL" { |
| paddy@0 | 266 l.emit(tokenNull) |
| paddy@0 | 267 } else { |
| paddy@0 | 268 l.emit(tokenString) |
| paddy@0 | 269 } |
| paddy@2 | 270 l.consumeWhitespace() |
| paddy@0 | 271 return lexRightDelim |
| paddy@0 | 272 } |
| paddy@0 | 273 switch r := l.next(); { |
| paddy@0 | 274 case r == eof: |
| paddy@0 | 275 return l.errorf("eof while parsing string") |
| paddy@0 | 276 case r == '"': |
| paddy@0 | 277 return l.errorf("\" in unquoted string") |
| paddy@0 | 278 case r == '\\': |
| paddy@0 | 279 return l.errorf("\\ in unquoted string") |
| paddy@0 | 280 case r == separator: |
| paddy@0 | 281 l.backup() |
| paddy@0 | 282 if l.pos <= l.start { |
| paddy@0 | 283 return l.errorf("unquoted empty string") |
| paddy@0 | 284 } |
| paddy@0 | 285 if string(l.input[l.start:l.pos]) == "NULL" { |
| paddy@0 | 286 l.emit(tokenNull) |
| paddy@0 | 287 } else { |
| paddy@0 | 288 l.emit(tokenString) |
| paddy@0 | 289 } |
| paddy@0 | 290 return lexSeparator |
| paddy@0 | 291 } |
| paddy@0 | 292 } |
| paddy@0 | 293 } |
| paddy@0 | 294 |
| paddy@0 | 295 func lexSeparator(l *lexer) stateFunc { |
| paddy@2 | 296 l.consumeWhitespace() |
| paddy@0 | 297 if strings.HasPrefix(l.input[l.pos:], rightDelim) { |
| paddy@0 | 298 return lexRightDelim |
| paddy@0 | 299 } |
| paddy@0 | 300 r := l.next() |
| paddy@0 | 301 if r == separator { |
| paddy@0 | 302 l.emit(tokenSeparator) |
| paddy@0 | 303 return lexItem |
| paddy@0 | 304 } else if r == eof { |
| paddy@0 | 305 if l.arrayDepth > 0 { |
| paddy@0 | 306 return l.errorf("unclosed array") |
| paddy@0 | 307 } |
| paddy@0 | 308 l.emit(tokenEOF) |
| paddy@0 | 309 return nil |
| paddy@0 | 310 } else { |
| paddy@0 | 311 l.backup() |
| paddy@1 | 312 return l.errorf("expected %s, none found before %s\n", string(separator), l.input[l.pos:]) |
| paddy@0 | 313 } |
| paddy@0 | 314 } |