pqarrays

Paddy 2016-02-25 Parent:bfe2a4af6bdf Child:9a415db0346a

1:ce9c92fc81ab Go to Latest

pqarrays/lexer.go

Fix bug parsing empty arrays, make golint and go vet happy. Add comments to make golint happy. Also, because comments are a good thing to have. Turn += 1 and -= 1 into ++ and --, respectively, so golint will be happy. Fix an improperly formatted errorf, where a rune was being treated as a string. Thanks, go vet! Fix whitespace parsing, returning the parse functions again instead of just skipping the one character. Now if we have more than one whitespace character in a row, they'll all be skipped. Add a parseStringOrNullOrEnd parse function that will be called after the tokenArrayStart character, to fix a bug where empty arrays were expecting a string or null and getting the array end character. This is only valid after tokenArrayStart, however; in other places where parseSeparatorOrDelim is used, it wouldn't be appropriate. Add a parser test for an empty array.

History
1 package pqarrays
3 import (
4 "fmt"
5 "strings"
6 "unicode"
7 "unicode/utf8"
8 )
// Lexical constants shared by the lexer's state functions.
const (
	// eof is the sentinel rune returned by next once the input is exhausted.
	eof = -1

	// leftDelim is the text that opens an array.
	leftDelim = "{"

	// rightDelim is the text that closes an array.
	rightDelim = "}"

	// separator is the rune that sits between two items of an array.
	separator = ','
)
// tokenType identifies the kind of a token produced by the lexer.
type tokenType int

// The token kinds, numbered from zero in declaration order. tokenError is
// the zero value of tokenType.
const (
	tokenError      tokenType = iota // lexing failed; the token value carries the message
	tokenWhitespace                  // a run of whitespace
	tokenArrayStart                  // the left delimiter
	tokenString                      // a quoted or unquoted item
	tokenNull                        // the unquoted NULL marker
	tokenSeparator                   // the item separator
	tokenArrayEnd                    // the right delimiter
	tokenEOF                         // end of input
)

// tokenTypeNames holds the human-readable name of every known tokenType,
// indexed by the tokenType's numeric value.
var tokenTypeNames = [...]string{
	tokenError:      "error",
	tokenWhitespace: "whitespace",
	tokenArrayStart: "array start",
	tokenString:     "string",
	tokenNull:       "null",
	tokenSeparator:  "separator",
	tokenArrayEnd:   "array end",
	tokenEOF:        "eof",
}

// String returns the human-readable name of t, or "unknown token" when t is
// not one of the declared token kinds.
func (t tokenType) String() string {
	if t < 0 || int(t) >= len(tokenTypeNames) {
		return "unknown token"
	}
	return tokenTypeNames[t]
}
53 type stateFunc func(*lexer) stateFunc
55 type lexer struct {
56 tokens chan token
57 input string
58 start int
59 pos int
60 omitted []int
61 width int
62 state stateFunc
63 arrayDepth int
64 }
66 type token struct {
67 typ tokenType
68 val string
69 }
71 func lex(input string) *lexer {
72 l := &lexer{
73 input: input,
74 tokens: make(chan token),
75 }
76 go l.run()
77 return l
78 }
80 func (l *lexer) nextToken() token {
81 return <-l.tokens
82 }
84 func (l *lexer) run() {
85 for l.state = lexStart; l.state != nil; { // TODO(paddy): default state
86 l.state = l.state(l)
87 }
88 }
90 func (l *lexer) emit(t tokenType) {
91 var val string
92 if len(l.omitted) < 1 {
93 val = l.input[l.start:l.pos]
94 } else {
95 start := l.start
96 for _, pos := range l.omitted {
97 val += l.input[start:pos]
98 start = pos + 1
99 }
100 if l.pos > start {
101 val += l.input[start:l.pos]
102 }
103 }
104 l.tokens <- token{typ: t, val: val}
105 l.start = l.pos
106 l.omitted = l.omitted[0:0]
107 }
109 func (l *lexer) next() rune {
110 if l.pos >= len(l.input) {
111 l.width = 0
112 return eof
113 }
114 var r rune
115 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
116 l.pos += l.width
117 return r
118 }
120 func (l *lexer) omit() {
121 l.omitted = append(l.omitted, l.pos-1)
122 }
124 func (l *lexer) ignore() {
125 l.start = l.pos
126 }
128 func (l *lexer) backup() {
129 l.pos -= l.width
130 }
132 func (l *lexer) peek() rune {
133 r := l.next()
134 l.backup()
135 return r
136 }
138 func (l *lexer) accept(valid string) bool {
139 if strings.IndexRune(valid, l.next()) >= 0 {
140 return true
141 }
142 l.backup()
143 return false
144 }
146 func (l *lexer) acceptRun(valid string) {
147 for strings.IndexRune(valid, l.next()) >= 0 {
148 }
149 l.backup()
150 }
152 func (l *lexer) errorf(format string, args ...interface{}) stateFunc {
153 l.tokens <- token{tokenError, fmt.Sprintf(format, args...)}
154 return nil
155 }
157 func (l *lexer) consumeWhitespace() {
158 for unicode.IsSpace(l.peek()) {
159 l.next()
160 }
161 if l.start > l.pos {
162 l.emit(tokenWhitespace)
163 }
164 }
166 func lexStart(l *lexer) stateFunc {
167 l.consumeWhitespace()
168 return lexArrayStart
169 }
171 func lexArrayStart(l *lexer) stateFunc {
172 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
173 return lexLeftDelim
174 }
175 return l.errorf("expected array to start before %s", l.input[l.pos:])
176 }
178 func lexLeftDelim(l *lexer) stateFunc {
179 l.pos += len(leftDelim)
180 l.emit(tokenArrayStart)
181 l.arrayDepth++
182 return lexItem
183 }
185 func lexRightDelim(l *lexer) stateFunc {
186 l.pos += len(rightDelim)
187 l.emit(tokenArrayEnd)
188 l.arrayDepth--
189 return lexSeparator
190 }
192 func lexItem(l *lexer) stateFunc {
193 l.consumeWhitespace()
194 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
195 return lexRightDelim
196 }
197 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
198 return lexLeftDelim
199 }
200 switch r := l.peek(); {
201 case r == eof:
202 return l.errorf("unclosed array")
203 case r == separator:
204 return l.errorf("empty item in array")
205 case unicode.IsSpace(r):
206 l.consumeWhitespace()
207 return lexItem
208 case r == '"':
209 return lexQuotedString
210 default:
211 return lexString
212 }
213 }
215 func lexQuotedString(l *lexer) stateFunc {
216 l.next()
217 l.ignore() // ignore the open quote
218 for {
219 switch r := l.next(); {
220 case r == eof:
221 return l.errorf("unclosed quoted string")
222 case r == '"':
223 l.backup()
224 l.emit(tokenString)
225 l.next()
226 l.ignore()
227 return lexSeparator
228 case r == '\\':
229 // omit the \ itself
230 l.omit()
231 // always skip over the character following a \
232 l.next()
233 if r == eof {
234 return l.errorf("unclosed quoted string")
235 }
236 }
237 }
238 }
240 func lexString(l *lexer) stateFunc {
241 for {
242 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
243 return l.errorf(leftDelim + " in unquoted string")
244 }
245 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
246 if l.pos <= l.start {
247 return l.errorf(rightDelim + " in unquoted string")
248 }
249 if string(l.input[l.start:l.pos]) == "NULL" {
250 l.emit(tokenNull)
251 } else {
252 l.emit(tokenString)
253 }
254 return lexRightDelim
255 }
256 switch r := l.next(); {
257 case r == eof:
258 return l.errorf("eof while parsing string")
259 case r == '"':
260 return l.errorf("\" in unquoted string")
261 case unicode.IsSpace(r):
262 return l.errorf("unquoted empty string")
263 case r == '\\':
264 return l.errorf("\\ in unquoted string")
265 case r == separator:
266 l.backup()
267 if l.pos <= l.start {
268 return l.errorf("unquoted empty string")
269 }
270 if string(l.input[l.start:l.pos]) == "NULL" {
271 l.emit(tokenNull)
272 } else {
273 l.emit(tokenString)
274 }
275 return lexSeparator
276 }
277 }
278 }
280 func lexSeparator(l *lexer) stateFunc {
281 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
282 return lexRightDelim
283 }
284 r := l.next()
285 if r == separator {
286 l.emit(tokenSeparator)
287 return lexItem
288 } else if r == eof {
289 if l.arrayDepth > 0 {
290 return l.errorf("unclosed array")
291 }
292 l.emit(tokenEOF)
293 return nil
294 } else {
295 l.backup()
296 return l.errorf("expected %s, none found before %s\n", string(separator), l.input[l.pos:])
297 }
298 }