main
1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package scanner implements a scanner for gcfg configuration text.
6// It takes a []byte as source which can then be tokenized
7// through repeated calls to the Scan method.
8//
9// Note that the API for the scanner package may change to accommodate new
10// features or implementation changes in gcfg.
11package scanner
12
13import (
14 "fmt"
15 "path/filepath"
16 "unicode"
17 "unicode/utf8"
18
19 "github.com/go-git/gcfg/token"
20)
21
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position is that of the offending
// token's first character or, for errors about a single character (NUL,
// invalid UTF-8, bad escape), the position of that character.
//
// A nil ErrorHandler is allowed; errors are then only counted in
// Scanner.ErrorCount.
type ErrorHandler func(pos token.Position, msg string)
27
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle; receives line information
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; -1 at end of input
	offset     int  // byte offset of the current character in src
	rdOffset   int  // reading offset (byte position after current character)
	lineOffset int  // byte offset at which the current line starts
	nextVal    bool // next token is expected to be a value (set after '=')

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
49
50// Read the next Unicode char into s.ch.
51// s.ch < 0 means end-of-file.
52func (s *Scanner) next() {
53 if s.rdOffset < len(s.src) {
54 s.offset = s.rdOffset
55 if s.ch == '\n' {
56 s.lineOffset = s.offset
57 s.file.AddLine(s.offset)
58 }
59 r, w := rune(s.src[s.rdOffset]), 1
60 switch {
61 case r == 0:
62 s.error(s.offset, "illegal character NUL")
63 case r >= 0x80:
64 // not ASCII
65 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
66 if r == utf8.RuneError && w == 1 {
67 s.error(s.offset, "illegal UTF-8 encoding")
68 }
69 }
70 s.rdOffset += w
71 s.ch = r
72 } else {
73 s.offset = len(s.src)
74 if s.ch == '\n' {
75 s.lineOffset = s.offset
76 s.file.AddLine(s.offset)
77 }
78 s.ch = -1 // eof
79 }
80}
81
// A Mode value is a set of flags (or 0) passed to Scanner.Init.
// The flags control scanner behavior.
type Mode uint

const (
	// ScanComments makes Scan return comments as COMMENT tokens.
	// When the flag is not set, Scan skips comments.
	ScanComments Mode = 1 << iota // return comments as COMMENT tokens
)
89
90// Init prepares the scanner s to tokenize the text src by setting the
91// scanner at the beginning of src. The scanner uses the file set file
92// for position information and it adds line information for each line.
93// It is ok to re-use the same file when re-scanning the same file as
94// line information which is already present is ignored. Init causes a
95// panic if the file size does not match the src size.
96//
97// Calls to Scan will invoke the error handler err if they encounter a
98// syntax error and err is not nil. Also, for each error encountered,
99// the Scanner field ErrorCount is incremented by one. The mode parameter
100// determines how comments are handled.
101//
102// Note that Init may call err if there is an error in the first character
103// of the file.
104func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
105 // Explicitly initialize all fields since a scanner may be reused.
106 if file.Size() != len(src) {
107 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
108 }
109 s.file = file
110 s.dir, _ = filepath.Split(file.Name())
111 s.src = src
112 s.err = err
113 s.mode = mode
114
115 s.ch = ' '
116 s.offset = 0
117 s.rdOffset = 0
118 s.lineOffset = 0
119 s.ErrorCount = 0
120 s.nextVal = false
121
122 s.next()
123}
124
125func (s *Scanner) error(offs int, msg string) {
126 if s.err != nil {
127 s.err(s.file.Position(s.file.Pos(offs)), msg)
128 }
129 s.ErrorCount++
130}
131
132func (s *Scanner) scanComment() string {
133 // initial [;#] already consumed
134 offs := s.offset - 1 // position of initial [;#]
135
136 for s.ch != '\n' && s.ch >= 0 {
137 s.next()
138 }
139 return string(s.src[offs:s.offset])
140}
141
// isLetter reports whether ch is an ASCII letter or a non-ASCII Unicode
// letter. Unlike in Go identifiers, '_' does not count as a letter.
func isLetter(ch rune) bool {
	switch {
	case ch >= 'a' && ch <= 'z', ch >= 'A' && ch <= 'Z':
		return true
	case ch < 0x80:
		// any other ASCII character (or a negative EOF marker)
		return false
	}
	return unicode.IsLetter(ch)
}
145
// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII
// Unicode decimal digit.
func isDigit(ch rune) bool {
	if ch < 0x80 {
		// ASCII range (also covers the negative EOF marker)
		return ch >= '0' && ch <= '9'
	}
	return unicode.IsDigit(ch)
}
149
150func (s *Scanner) scanIdentifier() string {
151 offs := s.offset
152 for isLetter(s.ch) || isDigit(s.ch) || s.ch == '-' {
153 s.next()
154 }
155 return string(s.src[offs:s.offset])
156}
157
158// val indicate if we are scanning a value (vs a header)
159func (s *Scanner) scanEscape(val bool) {
160 offs := s.offset
161 ch := s.ch
162 s.next() // always make progress
163 switch ch {
164 case '\\', '"', '\n':
165 // ok
166 case 'n', 't', 'b':
167 if val {
168 break // ok
169 }
170 fallthrough
171 default:
172 s.error(offs, "unknown escape sequence")
173 }
174}
175
176func (s *Scanner) scanString() string {
177 // '"' opening already consumed
178 offs := s.offset - 1
179
180 for s.ch != '"' {
181 ch := s.ch
182 s.next()
183 if ch == '\n' || ch < 0 {
184 s.error(offs, "string not terminated")
185 break
186 }
187 if ch == '\\' {
188 s.scanEscape(false)
189 }
190 }
191
192 s.next()
193
194 return string(s.src[offs:s.offset])
195}
196
// stripCR returns a copy of b with all carriage return ('\r') bytes
// removed. The input slice is not modified.
func stripCR(b []byte) []byte {
	out := make([]byte, 0, len(b))
	for i := range b {
		if b[i] == '\r' {
			continue
		}
		out = append(out, b[i])
	}
	return out
}
208
// scanValString scans a value, starting at the current character, and
// returns its source text.
//
// Outside of double quotes the value ends at a newline, at a comment
// character (';' or '#') or at the end of input; inside quotes those
// characters belong to the value. Trailing unquoted whitespace is not
// part of the result. Quotes and escape sequences are returned as they
// appear in the source; they are validated but not decoded here. If a
// carriage return was noticed while scanning, all '\r' bytes are removed
// from the result.
//
// A newline or the end of input inside a quoted section is reported as a
// "string not terminated" error at the start of the value.
func (s *Scanner) scanValString() string {
	offs := s.offset

	hasCR := false
	end := offs // end of the value seen so far, excluding trailing whitespace
	inQuote := false
loop:
	for inQuote || s.ch >= 0 && s.ch != '\n' && s.ch != ';' && s.ch != '#' {
		ch := s.ch
		s.next()
		switch {
		case inQuote && ch == '\\':
			s.scanEscape(true)
		case !inQuote && ch == '\\':
			// A backslash directly before a line break (LF or CR LF)
			// continues the value on the next line; anything else must
			// be a regular escape sequence.
			if s.ch == '\r' {
				hasCR = true
				s.next()
			}
			if s.ch != '\n' {
				s.scanEscape(true)
			} else {
				s.next()
			}
		case ch == '"':
			inQuote = !inQuote
		case ch == '\r':
			hasCR = true
		case ch < 0 || inQuote && ch == '\n':
			// Only reachable while inQuote: the loop condition stops at
			// EOF and newline otherwise.
			s.error(offs, "string not terminated")
			break loop
		}
		// Extend the value up to the current position unless the character
		// just consumed was unquoted whitespace, which may turn out to be
		// trailing.
		if inQuote || !isWhiteSpace(ch) {
			end = s.offset
		}
	}

	lit := s.src[offs:end]
	if hasCR {
		lit = stripCR(lit)
	}

	return string(lit)
}
252
// isWhiteSpace reports whether ch is a space, a tab or a carriage return.
// Newlines are not white space; they are significant to the scanner.
func isWhiteSpace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\r':
		return true
	}
	return false
}
256
257func (s *Scanner) skipWhitespace() {
258 for isWhiteSpace(s.ch) {
259 s.next()
260 }
261}
262
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is a literal (token.IDENT, token.STRING) or
// token.COMMENT, the literal string has the corresponding value.
//
// The token following a token.ASSIGN ('=') is always scanned as a value
// and returned as token.STRING; its literal may be empty.
//
// Comments start with ';' or '#'. They are returned as token.COMMENT only
// if the scanner was initialized with the ScanComments mode; otherwise
// they are skipped.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file that was passed to Init.
// Token positions are relative to that file.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	switch ch := s.ch; {
	case s.nextVal:
		// the previous token was '=': scan the value that follows it
		lit = s.scanValString()
		tok = token.STRING
		s.nextVal = false
	case isLetter(ch):
		lit = s.scanIdentifier()
		tok = token.IDENT
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			tok = token.EOF
		case '\n':
			tok = token.EOL
		case '"':
			tok = token.STRING
			lit = s.scanString()
		case '[':
			tok = token.LBRACK
		case ']':
			tok = token.RBRACK
		case ';', '#':
			// comment
			lit = s.scanComment()
			if s.mode&ScanComments == 0 {
				// skip comment
				goto scanAgain
			}
			tok = token.COMMENT
		case '=':
			tok = token.ASSIGN
			s.nextVal = true // the next call to Scan returns the value
		default:
			s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}

	return
}