pqarrays
2015-04-19
Child:ce9c92fc81ab
pqarrays/lexer.go
First pass implementation. Use a lexer to generate tokens out of the Array type responses that PostgreSQL will send. Write a parser for string[] array types. Create a StringArray type that fulfills the driver.Valuer and sql.Scanner interfaces using the parser and lexer.
// Lexer for PostgreSQL array literal responses such as `{a,"b c",NULL}`.
// Tokens are produced on a channel by a goroutine-driven state machine
// (the text/template-style stateFunc pattern).

const (
	eof        = -1
	leftDelim  = "{"
	rightDelim = "}"
	separator  = ','
)

// tokenType identifies the kind of lexeme produced by the lexer.
type tokenType int

const (
	tokenError tokenType = iota
	tokenWhitespace
	tokenArrayStart
	tokenString
	tokenNull
	tokenSeparator
	tokenArrayEnd
	tokenEOF
)

// String returns a human-readable name for the token type.
func (t tokenType) String() string {
	switch t {
	case tokenError:
		return "error"
	case tokenWhitespace:
		return "whitespace"
	case tokenArrayStart:
		return "array start"
	case tokenString:
		return "string"
	case tokenNull:
		return "null"
	case tokenSeparator:
		return "separator"
	case tokenArrayEnd:
		return "array end"
	case tokenEOF:
		return "eof"
	default:
		return "unknown token"
	}
}

// stateFunc is one state of the lexer; it returns the next state, or nil
// when lexing is finished.
type stateFunc func(*lexer) stateFunc

// lexer holds the scanning state for one input string. Tokens are sent on
// the tokens channel by the goroutine started in lex.
type lexer struct {
	tokens     chan token // emitted tokens; read via nextToken
	input      string     // the string being scanned
	start      int        // start byte offset of the token in progress
	pos        int        // current scan position in input
	omitted    []int      // byte offsets (escape backslashes) to exclude from the next token
	width      int        // byte width of the last rune read, for backup
	state      stateFunc  // current state of the machine
	arrayDepth int        // nesting depth of { } seen so far
}

// token is a single lexeme: its type and its (unescaped) text.
type token struct {
	typ tokenType
	val string
}

// lex starts lexing input in a new goroutine and returns the lexer; tokens
// are retrieved with nextToken.
func lex(input string) *lexer {
	l := &lexer{
		input:  input,
		tokens: make(chan token),
	}
	go l.run()
	return l
}

// nextToken blocks until the next token is available.
func (l *lexer) nextToken() token {
	return <-l.tokens
}

// run drives the state machine until a state returns nil.
func (l *lexer) run() {
	for l.state = lexStart; l.state != nil; { // TODO(paddy): default state
		l.state = l.state(l)
	}
}

// emit sends input[start:pos] as a token of type t, splicing out any byte
// offsets recorded via omit (the backslashes of escape sequences).
func (l *lexer) emit(t tokenType) {
	var val string
	if len(l.omitted) < 1 {
		val = l.input[l.start:l.pos]
	} else {
		start := l.start
		for _, pos := range l.omitted {
			val += l.input[start:pos]
			start = pos + 1
		}
		if l.pos > start {
			val += l.input[start:l.pos]
		}
	}
	l.tokens <- token{typ: t, val: val}
	l.start = l.pos
	l.omitted = l.omitted[:0]
}

// next returns the next rune in the input, advancing pos, or eof.
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return eof
	}
	var r rune
	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += l.width
	return r
}

// omit records the byte just consumed (assumed single-byte, i.e. '\\') so
// emit will splice it out of the token value.
func (l *lexer) omit() {
	l.omitted = append(l.omitted, l.pos-1)
}

// ignore discards the input scanned so far without emitting it.
func (l *lexer) ignore() {
	l.start = l.pos
}

// backup steps back one rune; valid only once per call to next.
func (l *lexer) backup() {
	l.pos -= l.width
}

// peek returns but does not consume the next rune.
func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

// accept consumes the next rune if it is in valid, reporting whether it did.
func (l *lexer) accept(valid string) bool {
	if strings.IndexRune(valid, l.next()) >= 0 {
		return true
	}
	l.backup()
	return false
}

// acceptRun consumes a run of runes from valid.
func (l *lexer) acceptRun(valid string) {
	for strings.IndexRune(valid, l.next()) >= 0 {
	}
	l.backup()
}

// errorf emits an error token with a formatted message and terminates the
// state machine by returning nil.
func (l *lexer) errorf(format string, args ...interface{}) stateFunc {
	l.tokens <- token{tokenError, fmt.Sprintf(format, args...)}
	return nil
}

// consumeWhitespace advances past any run of whitespace and, if any was
// found, emits it as a tokenWhitespace.
func (l *lexer) consumeWhitespace() {
	for unicode.IsSpace(l.peek()) {
		l.next()
	}
	// BUG FIX: the comparison was inverted (l.start > l.pos), which can
	// never hold after scanning forward, so tokenWhitespace was consumed
	// but never emitted.
	if l.pos > l.start {
		l.emit(tokenWhitespace)
	}
}

// lexStart is the initial state: skip leading whitespace, then expect an
// array opener.
func lexStart(l *lexer) stateFunc {
	l.consumeWhitespace()
	return lexArrayStart
}

// lexArrayStart requires the input to continue with the left delimiter.
func lexArrayStart(l *lexer) stateFunc {
	if strings.HasPrefix(l.input[l.pos:], leftDelim) {
		return lexLeftDelim
	}
	return l.errorf("expected array to start before %s", l.input[l.pos:])
}

// lexLeftDelim emits the opening brace and descends one array level.
func lexLeftDelim(l *lexer) stateFunc {
	l.pos += len(leftDelim)
	l.emit(tokenArrayStart)
	l.arrayDepth++
	return lexItem
}

// lexRightDelim emits the closing brace and ascends one array level.
func lexRightDelim(l *lexer) stateFunc {
	l.pos += len(rightDelim)
	l.emit(tokenArrayEnd)
	l.arrayDepth--
	return lexSeparator
}

// lexItem dispatches on the next element: a nested array, the end of the
// current array, a quoted string, or a bare (unquoted) string.
func lexItem(l *lexer) stateFunc {
	l.consumeWhitespace()
	if strings.HasPrefix(l.input[l.pos:], rightDelim) {
		return lexRightDelim
	}
	if strings.HasPrefix(l.input[l.pos:], leftDelim) {
		return lexLeftDelim
	}
	switch r := l.peek(); {
	case r == eof:
		return l.errorf("unclosed array")
	case r == separator:
		return l.errorf("empty item in array")
	case unicode.IsSpace(r):
		l.consumeWhitespace()
		return lexItem
	case r == '"':
		return lexQuotedString
	default:
		return lexString
	}
}

// lexQuotedString scans a double-quoted string, dropping the surrounding
// quotes and splicing backslash escapes out of the emitted value.
func lexQuotedString(l *lexer) stateFunc {
	l.next()
	l.ignore() // ignore the open quote
	for {
		switch r := l.next(); {
		case r == eof:
			return l.errorf("unclosed quoted string")
		case r == '"':
			l.backup()
			l.emit(tokenString)
			l.next()
			l.ignore()
			return lexSeparator
		case r == '\\':
			// omit the \ itself from the emitted value
			l.omit()
			// always skip over the character following a \
			// BUG FIX: the original compared the stale r (always '\\')
			// against eof here, so a trailing backslash was only caught
			// on the next loop iteration; check the newly read rune.
			if l.next() == eof {
				return l.errorf("unclosed quoted string")
			}
		}
	}
}

// lexString scans an unquoted element, which may not contain quotes,
// braces, backslashes, or whitespace; the literal NULL becomes tokenNull.
func lexString(l *lexer) stateFunc {
	for {
		if strings.HasPrefix(l.input[l.pos:], leftDelim) {
			return l.errorf(leftDelim + " in unquoted string")
		}
		if strings.HasPrefix(l.input[l.pos:], rightDelim) {
			if l.pos <= l.start {
				return l.errorf(rightDelim + " in unquoted string")
			}
			if string(l.input[l.start:l.pos]) == "NULL" {
				l.emit(tokenNull)
			} else {
				l.emit(tokenString)
			}
			return lexRightDelim
		}
		switch r := l.next(); {
		case r == eof:
			return l.errorf("eof while parsing string")
		case r == '"':
			return l.errorf("\" in unquoted string")
		case unicode.IsSpace(r):
			return l.errorf("unquoted empty string")
		case r == '\\':
			return l.errorf("\\ in unquoted string")
		case r == separator:
			l.backup()
			if l.pos <= l.start {
				return l.errorf("unquoted empty string")
			}
			if string(l.input[l.start:l.pos]) == "NULL" {
				l.emit(tokenNull)
			} else {
				l.emit(tokenString)
			}
			return lexSeparator
		}
	}
}

// lexSeparator expects a comma between elements, the end of an array, or —
// at depth zero — end of input.
func lexSeparator(l *lexer) stateFunc {
	if strings.HasPrefix(l.input[l.pos:], rightDelim) {
		return lexRightDelim
	}
	r := l.next()
	if r == separator {
		l.emit(tokenSeparator)
		return lexItem
	} else if r == eof {
		if l.arrayDepth > 0 {
			return l.errorf("unclosed array")
		}
		l.emit(tokenEOF)
		return nil
	} else {
		l.backup()
		// BUG FIX: separator is a rune, so the original %s verb printed
		// "%!s(int32=44)"; %q renders it as ','. The stray trailing \n
		// in the error string is also dropped.
		return l.errorf("expected %q, none found before %q", separator, l.input[l.pos:])
	}
}