pqarrays

Paddy 2016-02-26 Parent:ce9c92fc81ab

2:9a415db0346a Go to Latest

pqarrays/lexer.go

Fix whitespace lexing/parsing. Our consumeWhitespace method had a bug where it wouldn't emit the whitespace token, meaning we weren't actually skipping the whitespace. Oops. While in there, I removed an outdated TODO (we already supplied a default state). I updated the lexing of unquoted strings to take into account the rules about spaces and unquoted strings; basically, spaces before or after are ignored as whitespace, spaces in the middle count, and empty strings are not allowed. I removed an extra case when detecting what to do when lexing an unquoted string; we already consumed all the whitespace, so the next character shouldn't be whitespace, so no need to test for it. We need to consume whitespace before we start lexing the separator character. I updated the token debugging to be a bit more useful, by defining a String() method on the token type itself, so it'll expose both the type and the value. This makes unexpected errors easier to deal with, and is used in all the errors raised by the parser now. I added a bunch of whitespace tests for lexing and parsing.

History
1 package pqarrays
3 import (
4 "fmt"
5 "strings"
6 "unicode"
7 "unicode/utf8"
8 )
const (
	// eof is the sentinel rune returned by next when input is exhausted.
	eof = -1
	// leftDelim and rightDelim open and close a (possibly nested) array.
	leftDelim  = "{"
	rightDelim = "}"
	// separator divides items within an array.
	separator = ','
)
// tokenType identifies the kind of a lexed token.
type tokenType int

const (
	tokenError      tokenType = iota // lexing failed; the token value holds the message
	tokenWhitespace                  // a run of skipped whitespace
	tokenArrayStart                  // "{"
	tokenString                      // a quoted or unquoted string item
	tokenNull                        // the unquoted literal NULL
	tokenSeparator                   // ","
	tokenArrayEnd                    // "}"
	tokenEOF                         // end of input
)
30 func (t tokenType) String() string {
31 switch t {
32 case tokenError:
33 return "error"
34 case tokenWhitespace:
35 return "whitespace"
36 case tokenArrayStart:
37 return "array start"
38 case tokenString:
39 return "string"
40 case tokenNull:
41 return "null"
42 case tokenSeparator:
43 return "separator"
44 case tokenArrayEnd:
45 return "array end"
46 case tokenEOF:
47 return "eof"
48 default:
49 return "unknown token"
50 }
51 }
// stateFunc is one state of the lexer: it lexes some input and returns
// the next state, or nil when lexing is finished.
type stateFunc func(*lexer) stateFunc
// lexer scans an array literal and delivers tokens on a channel.
type lexer struct {
	tokens     chan token // lexed tokens, consumed via nextToken
	input      string     // the string being scanned
	start      int        // byte offset where the current token begins
	pos        int        // byte offset of the next rune to read
	omitted    []int      // byte positions excluded from the emitted value
	width      int        // byte width of the last rune read by next
	state      stateFunc  // current state function; nil when done
	arrayDepth int        // current array nesting depth
}
// token is a single lexed unit: its type and the input text it covers
// (with any omitted bytes, such as escaping backslashes, removed).
type token struct {
	typ tokenType
	val string
}
71 func (t token) String() string {
72 return fmt.Sprintf("%s: %s", t.typ.String(), t.val)
73 }
75 func lex(input string) *lexer {
76 l := &lexer{
77 input: input,
78 tokens: make(chan token),
79 }
80 go l.run()
81 return l
82 }
84 func (l *lexer) nextToken() token {
85 return <-l.tokens
86 }
88 func (l *lexer) run() {
89 for l.state = lexStart; l.state != nil; {
90 l.state = l.state(l)
91 }
92 }
// emit sends a token of type t covering l.input[l.start:l.pos], then
// resets start and the omitted list for the next token. Byte positions
// recorded in l.omitted (see omit) are excluded: the value is stitched
// together from the spans between them.
// NOTE(review): each omitted position skips exactly one byte (pos + 1),
// so only single-byte runes (currently just '\\') can be omitted safely.
func (l *lexer) emit(t tokenType) {
	var val string
	if len(l.omitted) < 1 {
		// Fast path: nothing omitted, the value is the raw span.
		val = l.input[l.start:l.pos]
	} else {
		start := l.start
		for _, pos := range l.omitted {
			val += l.input[start:pos]
			start = pos + 1 // skip the omitted byte
		}
		if l.pos > start {
			val += l.input[start:l.pos]
		}
	}
	l.tokens <- token{typ: t, val: val}
	l.start = l.pos
	// Truncate in place so the backing array is reused next time.
	l.omitted = l.omitted[0:0]
}
113 func (l *lexer) next() rune {
114 if l.pos >= len(l.input) {
115 l.width = 0
116 return eof
117 }
118 var r rune
119 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
120 l.pos += l.width
121 return r
122 }
124 func (l *lexer) omit() {
125 l.omitted = append(l.omitted, l.pos-1)
126 }
// ignore discards the input read since the last token (start..pos)
// without emitting anything.
func (l *lexer) ignore() {
	l.start = l.pos
}
// backup steps back one rune. It is only valid once per call to next,
// since only the most recent rune's width is recorded.
func (l *lexer) backup() {
	l.pos -= l.width
}
136 func (l *lexer) peek() rune {
137 r := l.next()
138 l.backup()
139 return r
140 }
142 func (l *lexer) accept(valid string) bool {
143 if strings.IndexRune(valid, l.next()) >= 0 {
144 return true
145 }
146 l.backup()
147 return false
148 }
150 func (l *lexer) acceptRun(valid string) {
151 for strings.IndexRune(valid, l.next()) >= 0 {
152 }
153 l.backup()
154 }
156 func (l *lexer) errorf(format string, args ...interface{}) stateFunc {
157 l.tokens <- token{tokenError, fmt.Sprintf(format, args...)}
158 return nil
159 }
161 func (l *lexer) consumeWhitespace() {
162 for unicode.IsSpace(l.peek()) {
163 l.next()
164 }
165 if l.pos > l.start {
166 l.emit(tokenWhitespace)
167 }
168 }
// lexStart is the initial state: skip any leading whitespace, then
// expect the opening array delimiter.
func lexStart(l *lexer) stateFunc {
	l.consumeWhitespace()
	return lexArrayStart
}
175 func lexArrayStart(l *lexer) stateFunc {
176 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
177 return lexLeftDelim
178 }
179 return l.errorf("expected array to start before %s", l.input[l.pos:])
180 }
182 func lexLeftDelim(l *lexer) stateFunc {
183 l.pos += len(leftDelim)
184 l.emit(tokenArrayStart)
185 l.arrayDepth++
186 return lexItem
187 }
189 func lexRightDelim(l *lexer) stateFunc {
190 l.pos += len(rightDelim)
191 l.emit(tokenArrayEnd)
192 l.arrayDepth--
193 return lexSeparator
194 }
196 func lexItem(l *lexer) stateFunc {
197 l.consumeWhitespace()
198 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
199 return lexRightDelim
200 }
201 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
202 return lexLeftDelim
203 }
204 switch r := l.peek(); {
205 case r == eof:
206 return l.errorf("unclosed array")
207 case r == separator:
208 return l.errorf("empty item in array")
209 case unicode.IsSpace(r):
210 return lexItem
211 case r == '"':
212 return lexQuotedString
213 default:
214 return lexString
215 }
216 }
218 func lexQuotedString(l *lexer) stateFunc {
219 l.next()
220 l.ignore() // ignore the open quote
221 for {
222 switch r := l.next(); {
223 case r == eof:
224 return l.errorf("unclosed quoted string")
225 case r == '"':
226 l.backup()
227 l.emit(tokenString)
228 l.next()
229 l.ignore()
230 return lexSeparator
231 case r == '\\':
232 // omit the \ itself
233 l.omit()
234 // always skip over the character following a \
235 l.next()
236 if r == eof {
237 return l.errorf("unclosed quoted string")
238 }
239 }
240 }
241 }
243 func lexString(l *lexer) stateFunc {
244 for {
245 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
246 return l.errorf(leftDelim + " in unquoted string")
247 }
248 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
249 if l.pos <= l.start {
250 return l.errorf(rightDelim + " in unquoted string")
251 }
252 lastNonSpace := -1
253 s := l.input[l.start:l.pos]
254 for pos, r := range s {
255 if !unicode.IsSpace(r) {
256 lastNonSpace = l.start + pos + 1
257 }
258 }
259 if lastNonSpace < 0 {
260 return l.errorf("unquoted empty string")
261 }
262 for lastNonSpace < l.pos {
263 l.backup()
264 }
265 if string(l.input[l.start:l.pos]) == "NULL" {
266 l.emit(tokenNull)
267 } else {
268 l.emit(tokenString)
269 }
270 l.consumeWhitespace()
271 return lexRightDelim
272 }
273 switch r := l.next(); {
274 case r == eof:
275 return l.errorf("eof while parsing string")
276 case r == '"':
277 return l.errorf("\" in unquoted string")
278 case r == '\\':
279 return l.errorf("\\ in unquoted string")
280 case r == separator:
281 l.backup()
282 if l.pos <= l.start {
283 return l.errorf("unquoted empty string")
284 }
285 if string(l.input[l.start:l.pos]) == "NULL" {
286 l.emit(tokenNull)
287 } else {
288 l.emit(tokenString)
289 }
290 return lexSeparator
291 }
292 }
293 }
295 func lexSeparator(l *lexer) stateFunc {
296 l.consumeWhitespace()
297 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
298 return lexRightDelim
299 }
300 r := l.next()
301 if r == separator {
302 l.emit(tokenSeparator)
303 return lexItem
304 } else if r == eof {
305 if l.arrayDepth > 0 {
306 return l.errorf("unclosed array")
307 }
308 l.emit(tokenEOF)
309 return nil
310 } else {
311 l.backup()
312 return l.errorf("expected %s, none found before %s\n", string(separator), l.input[l.pos:])
313 }
314 }