main
Raw Download raw file
  1// Copyright 2009 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5// Package scanner implements a scanner for gcfg configuration text.
  6// It takes a []byte as source which can then be tokenized
  7// through repeated calls to the Scan method.
  8//
  9// Note that the API for the scanner package may change to accommodate new
 10// features or implementation changes in gcfg.
 11package scanner
 12
 13import (
 14	"fmt"
 15	"path/filepath"
 16	"unicode"
 17	"unicode/utf8"
 18
 19	"github.com/go-git/gcfg/token"
 20)
 21
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position is resolved from the byte
// offset the scanner reports for the error, i.e. the start of the offending
// token or the offending character itself.
type ErrorHandler func(pos token.Position, msg string)
 27
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state (assigned by Init, not modified while scanning)
	file *token.File  // source file handle; receives line information
	dir  string       // directory portion of file.Name()
	src  []byte       // source text being scanned
	err  ErrorHandler // error reporting callback; or nil
	mode Mode         // scanning mode flags

	// scanning state
	ch         rune // current character; -1 at end of input
	offset     int  // byte offset of the current character in src
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // offset at which the current line starts
	nextVal    bool // next token is expected to be a value (set after '=')

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
 49
 50// Read the next Unicode char into s.ch.
 51// s.ch < 0 means end-of-file.
 52func (s *Scanner) next() {
 53	if s.rdOffset < len(s.src) {
 54		s.offset = s.rdOffset
 55		if s.ch == '\n' {
 56			s.lineOffset = s.offset
 57			s.file.AddLine(s.offset)
 58		}
 59		r, w := rune(s.src[s.rdOffset]), 1
 60		switch {
 61		case r == 0:
 62			s.error(s.offset, "illegal character NUL")
 63		case r >= 0x80:
 64			// not ASCII
 65			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
 66			if r == utf8.RuneError && w == 1 {
 67				s.error(s.offset, "illegal UTF-8 encoding")
 68			}
 69		}
 70		s.rdOffset += w
 71		s.ch = r
 72	} else {
 73		s.offset = len(s.src)
 74		if s.ch == '\n' {
 75			s.lineOffset = s.offset
 76			s.file.AddLine(s.offset)
 77		}
 78		s.ch = -1 // eof
 79	}
 80}
 81
// A Mode value is a set of flags (or 0).
// The flags control scanner behavior; see the Mode constants.
type Mode uint
 85
// Mode flags accepted by Scanner.Init.
const (
	ScanComments Mode = 1 << iota // return comments as COMMENT tokens
)
 89
 90// Init prepares the scanner s to tokenize the text src by setting the
 91// scanner at the beginning of src. The scanner uses the file set file
 92// for position information and it adds line information for each line.
 93// It is ok to re-use the same file when re-scanning the same file as
 94// line information which is already present is ignored. Init causes a
 95// panic if the file size does not match the src size.
 96//
 97// Calls to Scan will invoke the error handler err if they encounter a
 98// syntax error and err is not nil. Also, for each error encountered,
 99// the Scanner field ErrorCount is incremented by one. The mode parameter
100// determines how comments are handled.
101//
102// Note that Init may call err if there is an error in the first character
103// of the file.
104func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
105	// Explicitly initialize all fields since a scanner may be reused.
106	if file.Size() != len(src) {
107		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
108	}
109	s.file = file
110	s.dir, _ = filepath.Split(file.Name())
111	s.src = src
112	s.err = err
113	s.mode = mode
114
115	s.ch = ' '
116	s.offset = 0
117	s.rdOffset = 0
118	s.lineOffset = 0
119	s.ErrorCount = 0
120	s.nextVal = false
121
122	s.next()
123}
124
125func (s *Scanner) error(offs int, msg string) {
126	if s.err != nil {
127		s.err(s.file.Position(s.file.Pos(offs)), msg)
128	}
129	s.ErrorCount++
130}
131
132func (s *Scanner) scanComment() string {
133	// initial [;#] already consumed
134	offs := s.offset - 1 // position of initial [;#]
135
136	for s.ch != '\n' && s.ch >= 0 {
137		s.next()
138	}
139	return string(s.src[offs:s.offset])
140}
141
// isLetter reports whether ch is an ASCII letter or, outside the ASCII
// range, a Unicode letter.
func isLetter(ch rune) bool {
	if ch < 0x80 {
		// ASCII (and negative sentinel values): only a-z and A-Z qualify.
		lower := ch >= 'a' && ch <= 'z'
		upper := ch >= 'A' && ch <= 'Z'
		return lower || upper
	}
	return unicode.IsLetter(ch)
}
145
// isDigit reports whether ch is an ASCII decimal digit or, outside the
// ASCII range, a Unicode decimal digit.
func isDigit(ch rune) bool {
	if ch < 0x80 {
		// ASCII (and negative sentinel values): only 0-9 qualify.
		return ch >= '0' && ch <= '9'
	}
	return unicode.IsDigit(ch)
}
149
150func (s *Scanner) scanIdentifier() string {
151	offs := s.offset
152	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '-' {
153		s.next()
154	}
155	return string(s.src[offs:s.offset])
156}
157
158// val indicate if we are scanning a value (vs a header)
159func (s *Scanner) scanEscape(val bool) {
160	offs := s.offset
161	ch := s.ch
162	s.next() // always make progress
163	switch ch {
164	case '\\', '"', '\n':
165		// ok
166	case 'n', 't', 'b':
167		if val {
168			break // ok
169		}
170		fallthrough
171	default:
172		s.error(offs, "unknown escape sequence")
173	}
174}
175
176func (s *Scanner) scanString() string {
177	// '"' opening already consumed
178	offs := s.offset - 1
179
180	for s.ch != '"' {
181		ch := s.ch
182		s.next()
183		if ch == '\n' || ch < 0 {
184			s.error(offs, "string not terminated")
185			break
186		}
187		if ch == '\\' {
188			s.scanEscape(false)
189		}
190	}
191
192	s.next()
193
194	return string(s.src[offs:s.offset])
195}
196
// stripCR returns a copy of b with every carriage return ('\r') removed.
// The input slice is left untouched.
func stripCR(b []byte) []byte {
	out := make([]byte, 0, len(b))
	for _, c := range b {
		if c == '\r' {
			continue
		}
		out = append(out, c)
	}
	return out
}
208
209func (s *Scanner) scanValString() string {
210	offs := s.offset
211
212	hasCR := false
213	end := offs
214	inQuote := false
215loop:
216	for inQuote || s.ch >= 0 && s.ch != '\n' && s.ch != ';' && s.ch != '#' {
217		ch := s.ch
218		s.next()
219		switch {
220		case inQuote && ch == '\\':
221			s.scanEscape(true)
222		case !inQuote && ch == '\\':
223			if s.ch == '\r' {
224				hasCR = true
225				s.next()
226			}
227			if s.ch != '\n' {
228				s.scanEscape(true)
229			} else {
230				s.next()
231			}
232		case ch == '"':
233			inQuote = !inQuote
234		case ch == '\r':
235			hasCR = true
236		case ch < 0 || inQuote && ch == '\n':
237			s.error(offs, "string not terminated")
238			break loop
239		}
240		if inQuote || !isWhiteSpace(ch) {
241			end = s.offset
242		}
243	}
244
245	lit := s.src[offs:end]
246	if hasCR {
247		lit = stripCR(lit)
248	}
249
250	return string(lit)
251}
252
// isWhiteSpace reports whether ch is a space, a tab or a carriage return.
// Newlines are not whitespace here.
func isWhiteSpace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\r':
		return true
	}
	return false
}
256
257func (s *Scanner) skipWhitespace() {
258	for isWhiteSpace(s.ch) {
259		s.next()
260	}
261}
262
263// Scan scans the next token and returns the token position, the token,
264// and its literal string if applicable. The source end is indicated by
265// token.EOF.
266//
267// If the returned token is a literal (token.IDENT, token.STRING) or
268// token.COMMENT, the literal string has the corresponding value.
269//
270// If the returned token is token.ILLEGAL, the literal string is the
271// offending character.
272//
273// In all other cases, Scan returns an empty literal string.
274//
275// For more tolerant parsing, Scan will return a valid token if
276// possible even if a syntax error was encountered. Thus, even
277// if the resulting token sequence contains no illegal tokens,
278// a client may not assume that no error occurred. Instead it
279// must check the scanner's ErrorCount or the number of calls
280// of the error handler, if there was one installed.
281//
282// Scan adds line information to the file added to the file
283// set with Init. Token positions are relative to that file
284// and thus relative to the file set.
285func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
286scanAgain:
287	s.skipWhitespace()
288
289	// current token start
290	pos = s.file.Pos(s.offset)
291
292	// determine token value
293	switch ch := s.ch; {
294	case s.nextVal:
295		lit = s.scanValString()
296		tok = token.STRING
297		s.nextVal = false
298	case isLetter(ch):
299		lit = s.scanIdentifier()
300		tok = token.IDENT
301	default:
302		s.next() // always make progress
303		switch ch {
304		case -1:
305			tok = token.EOF
306		case '\n':
307			tok = token.EOL
308		case '"':
309			tok = token.STRING
310			lit = s.scanString()
311		case '[':
312			tok = token.LBRACK
313		case ']':
314			tok = token.RBRACK
315		case ';', '#':
316			// comment
317			lit = s.scanComment()
318			if s.mode&ScanComments == 0 {
319				// skip comment
320				goto scanAgain
321			}
322			tok = token.COMMENT
323		case '=':
324			tok = token.ASSIGN
325			s.nextVal = true
326		default:
327			s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
328			tok = token.ILLEGAL
329			lit = string(ch)
330		}
331	}
332
333	return
334}