pqarrays

Paddy 2016-02-26 Parent:ce9c92fc81ab

2:9a415db0346a Go to Latest

pqarrays/lexer.go

Fix whitespace lexing/parsing. Our consumeWhitespace method had a bug where it wouldn't emit the whitespace token, meaning we weren't actually skipping the whitespace. Oops. While in there, I removed an outdated TODO (we already supplied a default state). I updated the lexing of unquoted strings to take into account the rules about spaces and unquoted strings; basically, spaces before or after are ignored as whitespace, spaces in the middle count, and empty strings are not allowed. I removed an extra case when detecting what to do when lexing an unquoted string; we already consumed all the whitespace, so the next character shouldn't be whitespace, so no need to test for it. We need to consume whitespace before we start lexing the separator character. I updated the token debugging to be a bit more useful, by defining a String() method on the token type itself, so it'll expose both the type and the value. This makes unexpected errors easier to deal with, and is used in all the errors raised by the parser now. I added a bunch of whitespace tests for lexing and parsing.

History
1 package pqarrays
3 import (
4 "fmt"
5 "strings"
6 "unicode"
7 "unicode/utf8"
8 )
const (
	// eof is the sentinel rune returned by next when input is exhausted.
	eof = -1
	// leftDelim and rightDelim open and close a (possibly nested) array.
	leftDelim  = "{"
	rightDelim = "}"
	// separator divides items within an array.
	separator = ','
)
// tokenType identifies the kind of a lexed token.
type tokenType int

const (
	tokenError      tokenType = iota // lexing failed; the token value holds the message
	tokenWhitespace                  // a run of skipped whitespace
	tokenArrayStart                  // "{"
	tokenString                      // a quoted or unquoted string item
	tokenNull                        // the unquoted literal NULL
	tokenSeparator                   // ","
	tokenArrayEnd                    // "}"
	tokenEOF                         // end of input
)
30 func (t tokenType) String() string {
31 switch t {
32 case tokenError:
33 return "error"
34 case tokenWhitespace:
35 return "whitespace"
36 case tokenArrayStart:
37 return "array start"
38 case tokenString:
39 return "string"
40 case tokenNull:
41 return "null"
42 case tokenSeparator:
43 return "separator"
44 case tokenArrayEnd:
45 return "array end"
46 case tokenEOF:
47 return "eof"
48 default:
49 return "unknown token"
50 }
51 }
// stateFunc is one state of the lexer: it lexes some input and returns
// the next state, or nil when lexing is finished.
type stateFunc func(*lexer) stateFunc
// lexer scans an array literal and delivers tokens on a channel.
type lexer struct {
	tokens     chan token // lexed tokens, consumed via nextToken
	input      string     // the string being scanned
	start      int        // byte offset where the current token begins
	pos        int        // byte offset of the next rune to read
	omitted    []int      // byte positions excluded from the emitted value
	width      int        // byte width of the last rune read by next
	state      stateFunc  // current state function; nil when done
	arrayDepth int        // current array nesting depth
}
// token is a single lexed unit: its type and the input text it covers
// (with any omitted bytes, such as escaping backslashes, removed).
type token struct {
	typ tokenType
	val string
}
71 func (t token) String() string {
72 return fmt.Sprintf("%s: %s", t.typ.String(), t.val)
73 }
75 func lex(input string) *lexer {
76 l := &lexer{
77 input: input,
78 tokens: make(chan token),
79 }
80 go l.run()
81 return l
82 }
84 func (l *lexer) nextToken() token {
85 return <-l.tokens
86 }
88 func (l *lexer) run() {
89 for l.state = lexStart; l.state != nil; {
90 l.state = l.state(l)
91 }
92 }
// emit sends a token of type t covering l.input[l.start:l.pos], then
// resets start and the omitted list for the next token. Byte positions
// recorded in l.omitted (see omit) are excluded: the value is stitched
// together from the spans between them.
// NOTE(review): each omitted position skips exactly one byte (pos + 1),
// so only single-byte runes (currently just '\\') can be omitted safely.
func (l *lexer) emit(t tokenType) {
	var val string
	if len(l.omitted) < 1 {
		// Fast path: nothing omitted, the value is the raw span.
		val = l.input[l.start:l.pos]
	} else {
		start := l.start
		for _, pos := range l.omitted {
			val += l.input[start:pos]
			start = pos + 1 // skip the omitted byte
		}
		if l.pos > start {
			val += l.input[start:l.pos]
		}
	}
	l.tokens <- token{typ: t, val: val}
	l.start = l.pos
	// Truncate in place so the backing array is reused next time.
	l.omitted = l.omitted[0:0]
}
113 func (l *lexer) next() rune {
114 if l.pos >= len(l.input) {
115 l.width = 0
116 return eof
117 }
118 var r rune
119 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
120 l.pos += l.width
121 return r
122 }
124 func (l *lexer) omit() {
125 l.omitted = append(l.omitted, l.pos-1)
126 }
// ignore discards the input read since the last token (start..pos)
// without emitting anything.
func (l *lexer) ignore() {
	l.start = l.pos
}
// backup steps back one rune. It is only valid once per call to next,
// since only the most recent rune's width is recorded.
func (l *lexer) backup() {
	l.pos -= l.width
}
136 func (l *lexer) peek() rune {
137 r := l.next()
138 l.backup()
139 return r
140 }
142 func (l *lexer) accept(valid string) bool {
143 if strings.IndexRune(valid, l.next()) >= 0 {
144 return true
145 }
146 l.backup()
147 return false
148 }
150 func (l *lexer) acceptRun(valid string) {
151 for strings.IndexRune(valid, l.next()) >= 0 {
152 }
153 l.backup()
154 }
156 func (l *lexer) errorf(format string, args ...interface{}) stateFunc {
157 l.tokens <- token{tokenError, fmt.Sprintf(format, args...)}
158 return nil
159 }
161 func (l *lexer) consumeWhitespace() {
162 for unicode.IsSpace(l.peek()) {
163 l.next()
164 }
165 if l.pos > l.start {
166 l.emit(tokenWhitespace)
167 }
168 }
// lexStart is the initial state: skip any leading whitespace, then
// expect the opening array delimiter.
func lexStart(l *lexer) stateFunc {
	l.consumeWhitespace()
	return lexArrayStart
}
175 func lexArrayStart(l *lexer) stateFunc {
176 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
177 return lexLeftDelim
178 }
179 return l.errorf("expected array to start before %s", l.input[l.pos:])
180 }
182 func lexLeftDelim(l *lexer) stateFunc {
183 l.pos += len(leftDelim)
184 l.emit(tokenArrayStart)
185 l.arrayDepth++
186 return lexItem
187 }
189 func lexRightDelim(l *lexer) stateFunc {
190 l.pos += len(rightDelim)
191 l.emit(tokenArrayEnd)
192 l.arrayDepth--
193 return lexSeparator
194 }
196 func lexItem(l *lexer) stateFunc {
197 l.consumeWhitespace()
198 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
199 return lexRightDelim
200 }
201 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
202 return lexLeftDelim
203 }
204 switch r := l.peek(); {
205 case r == eof:
206 return l.errorf("unclosed array")
207 case r == separator:
208 return l.errorf("empty item in array")
209 case unicode.IsSpace(r):
210 return lexItem
211 case r == '"':
212 return lexQuotedString
213 default:
214 return lexString
215 }
216 }
218 func lexQuotedString(l *lexer) stateFunc {
219 l.next()
220 l.ignore() // ignore the open quote
221 for {
222 switch r := l.next(); {
223 case r == eof:
224 return l.errorf("unclosed quoted string")
225 case r == '"':
226 l.backup()
227 l.emit(tokenString)
228 l.next()
229 l.ignore()
230 return lexSeparator
231 case r == '\\':
232 // omit the \ itself
233 l.omit()
234 // always skip over the character following a \
235 l.next()
236 if r == eof {
237 return l.errorf("unclosed quoted string")
238 }
239 }
240 }
241 }
243 func lexString(l *lexer) stateFunc {
244 for {
245 if strings.HasPrefix(l.input[l.pos:], leftDelim) {
246 return l.errorf(leftDelim + " in unquoted string")
247 }
248 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
249 if l.pos <= l.start {
250 return l.errorf(rightDelim + " in unquoted string")
251 }
252 lastNonSpace := -1
253 s := l.input[l.start:l.pos]
254 for pos, r := range s {
255 if !unicode.IsSpace(r) {
256 lastNonSpace = l.start + pos + 1
257 }
258 }
259 if lastNonSpace < 0 {
260 return l.errorf("unquoted empty string")
261 }
262 for lastNonSpace < l.pos {
263 l.backup()
264 }
265 if string(l.input[l.start:l.pos]) == "NULL" {
266 l.emit(tokenNull)
267 } else {
268 l.emit(tokenString)
269 }
270 l.consumeWhitespace()
271 return lexRightDelim
272 }
273 switch r := l.next(); {
274 case r == eof:
275 return l.errorf("eof while parsing string")
276 case r == '"':
277 return l.errorf("\" in unquoted string")
278 case r == '\\':
279 return l.errorf("\\ in unquoted string")
280 case r == separator:
281 l.backup()
282 if l.pos <= l.start {
283 return l.errorf("unquoted empty string")
284 }
285 if string(l.input[l.start:l.pos]) == "NULL" {
286 l.emit(tokenNull)
287 } else {
288 l.emit(tokenString)
289 }
290 return lexSeparator
291 }
292 }
293 }
295 func lexSeparator(l *lexer) stateFunc {
296 l.consumeWhitespace()
297 if strings.HasPrefix(l.input[l.pos:], rightDelim) {
298 return lexRightDelim
299 }
300 r := l.next()
301 if r == separator {
302 l.emit(tokenSeparator)
303 return lexItem
304 } else if r == eof {
305 if l.arrayDepth > 0 {
306 return l.errorf("unclosed array")
307 }
308 l.emit(tokenEOF)
309 return nil
310 } else {
311 l.backup()
312 return l.errorf("expected %s, none found before %s\n", string(separator), l.input[l.pos:])
313 }
314 }