// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// To use the lexer, call Tokenize with the source string to obtain a
// TokenStream. The lexer runs concurrently in its own goroutine, so the
// TokenStream can be consumed before the lexer has finished with the source.
//
// The lexer is implemented as a state machine. The states are represented
// by functions (the stateFn type) which accept a lexer and return the
// new state.
//
// Most states also have an isFooStart function which helps determine
// whether a transition to Foo is appropriate. Those functions accept a
// single rune as a parameter and return true if the state machine should
// transition to state Foo. Some states have no such function because
// their transition condition is trivial.
//
// The lexer implementation was inspired by
// http://cuddle.googlecode.com/hg/talk/lex.html
package lexer

import (
    "unicode/utf8"
)

// Tokenize accepts a source string and parses it into a stream of tokens
// which can be read from the returned TokenStream.
func Tokenize(source string) TokenStream {
    tokens := make(chan Token)
    l := lexer{source: source, tokens: tokens}
    go l.run()
    return &TokenChan{tokenChan: tokens}
}
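
// Example usage (a sketch: TokenStream is defined elsewhere, and the Next
// method below is hypothetical; substitute whatever accessor TokenStream
// actually provides):
//
//     ts := Tokenize("module foo;")
//     for tok, ok := ts.Next(); ok; tok, ok = ts.Next() {
//         fmt.Printf("%v %q\n", tok.Kind, tok.Text)
//     }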

type lexer struct {
    // source is the source code to be lexed.
    source string
    // offset is the number of bytes that have been consumed.
    offset int
    // tokens is a channel to which the found tokens are emitted.
    tokens chan Token
    // sourcePos is the number of runes that have been consumed.
    sourcePos int
    // lineNo is the current line number.
    lineNo int
    // linePos is how many runes have been consumed since the beginning of
    // the line.
    linePos int
    // curTokenOffset is the number of bytes consumed prior to the beginning
    // of the current token.
    curTokenOffset int
    // curTokenSourcePos is the number of runes consumed prior to the
    // beginning of the current token.
    curTokenSourcePos int
    // curTokenLineNo is the line number on which the current token begins.
    curTokenLineNo int
    // curTokenLinePos is the number of runes since the beginning of the line
    // where the current token begins.
    curTokenLinePos int
}
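
// Note that offset counts bytes while sourcePos counts runes, so the two
// diverge on multi-byte UTF-8 input. For example, after consuming both
// runes of "é!" ('é' is two bytes in UTF-8), offset is 3 but sourcePos
// is 2.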

// CurText returns the consumed part of the current token.
func (l *lexer) CurText() string {
    return l.source[l.curTokenOffset:l.offset]
}

// emitToken emits the current token and begins a new token.
func (l *lexer) emitToken(tokenType TokenKind) {
    l.tokens <- Token{
        Kind:    tokenType,
        Text:    l.source[l.curTokenOffset:l.offset],
        CharPos: l.curTokenSourcePos,
        LineNo:  l.curTokenLineNo,
        LinePos: l.curTokenLinePos}
    l.beginToken()
}

// beginToken marks the current position as the beginning of a new token.
func (l *lexer) beginToken() {
    l.curTokenOffset = l.offset
    l.curTokenSourcePos = l.sourcePos
    l.curTokenLineNo = l.lineNo
    l.curTokenLinePos = l.linePos
}

// Consume consumes the next rune in the source.
func (l *lexer) Consume() {
    if l.IsEos() {
        return
    }
    c, width := utf8.DecodeRuneInString(l.source[l.offset:])
    if c == '\n' {
        l.lineNo++
        l.linePos = 0
    } else {
        l.linePos++
    }
    l.offset += width
    l.sourcePos++
}

// Peek returns the next rune in the source.
func (l *lexer) Peek() rune {
    // At the end of the string, there is no sane answer to Peek.
    if l.IsEos() {
        return utf8.RuneError
    }
    // If RuneError is returned, it will be handled as any other rune, likely
    // resulting in an ErrorIllegalChar token being emitted.
    char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
    return char
}

// IsEos returns true if the whole source has been consumed and false
// otherwise.
func (l *lexer) IsEos() bool {
    return l.offset >= len(l.source)
}

// run is the lexer's main loop.
func (l *lexer) run() {
    // We are implementing a state machine.
    // lexRoot is the beginning state.
    // nil is the end state.
    // States are functions which are called on the lexer. They return the
    // next state.
    for state := lexRoot; state != nil; {
        state = state(l)
    }
    close(l.tokens)
}

// A stateFn represents a state in the lexer state machine.
type stateFn func(*lexer) stateFn
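
// For example, lexing "x=1;" visits the states
// lexRoot -> lexName -> lexRoot -> lexEqualsOrResponse -> lexRoot ->
// lexNumber -> lexDec -> lexRoot -> lexSingleCharTokens -> lexRoot -> nil,
// emitting Name, Equals, IntConstDec and Semi along the way.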

// lexRoot is the beginning state and also the state to which the lexer
// returns after most tokens are emitted.
func lexRoot(l *lexer) stateFn {
    if l.IsEos() {
        return nil
    }
    switch c := l.Peek(); {
    case isSingleCharTokens(c):
        return lexSingleCharTokens
    case isEqualsOrResponseStart(c):
        return lexEqualsOrResponse
    case isNameStart(c):
        return lexName
    case isOrdinalStart(c):
        return lexOrdinal
    case isNumberStart(c):
        return lexNumber
    case isStringStart(c):
        return lexString
    case isSkippable(c):
        return lexSkip
    case isMaybeComment(c):
        return lexComment
    }
    l.Consume()
    l.emitToken(ErrorIllegalChar)
    return nil
}

// isSkippable returns true if the rune is skippable whitespace.
func isSkippable(c rune) bool {
    return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

// lexSkip consumes skippable runes without emitting a token.
func lexSkip(l *lexer) stateFn {
    for isSkippable(l.Peek()) {
        l.Consume()
    }
    l.beginToken()
    return lexRoot
}

// singleCharTokens is a map of single-rune tokens.
var singleCharTokens = map[rune]TokenKind{
    '(': LParen,
    ')': RParen,
    '[': LBracket,
    ']': RBracket,
    '{': LBrace,
    '}': RBrace,
    '<': LAngle,
    '>': RAngle,
    ';': Semi,
    ',': Comma,
    '.': Dot,
    '-': Minus,
    '+': Plus,
    '&': Amp,
    '?': Qstn,
}

// isSingleCharTokens returns true if the rune is a single-character token.
func isSingleCharTokens(c rune) bool {
    _, ok := singleCharTokens[c]
    return ok
}

// lexSingleCharTokens lexes single-character tokens.
func lexSingleCharTokens(l *lexer) stateFn {
    c := l.Peek()
    l.Consume()
    t := singleCharTokens[c]
    l.emitToken(t)
    return lexRoot
}

// isEqualsOrResponseStart returns true if the rune corresponds to the
// beginning of either the '=' or '=>' tokens.
func isEqualsOrResponseStart(c rune) bool {
    return c == '='
}

// lexEqualsOrResponse lexes the '=' or the '=>' token.
func lexEqualsOrResponse(l *lexer) stateFn {
    l.Consume()
    if l.Peek() == '>' {
        l.Consume()
        l.emitToken(Response)
    } else {
        l.emitToken(Equals)
    }
    return lexRoot
}

// isAlpha returns true if the rune is an ASCII letter.
func isAlpha(c rune) bool {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}

// isDigit returns true if the rune is a decimal digit.
func isDigit(c rune) bool {
    return '0' <= c && c <= '9'
}

// isHexDigit returns true if the rune is a hexadecimal digit.
func isHexDigit(c rune) bool {
    return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
}

// isNameStart returns true if the rune is the beginning of a Name token.
func isNameStart(c rune) bool {
    return isAlpha(c) || c == '_'
}

// keywordTokens maps keywords to their associated tokens.
var keywordTokens = map[string]TokenKind{
    "import":    Import,
    "module":    Module,
    "struct":    Struct,
    "union":     Union,
    "interface": Interface,
    "enum":      Enum,
    "const":     Const,
    "true":      True,
    "false":     False,
    "default":   Default,
}

// lexName lexes valid C identifiers. (K&R2: A.2.3)
func lexName(l *lexer) stateFn {
    l.Consume()
    // isNameRune returns true if the rune is valid in a Name token.
    isNameRune := func(c rune) bool {
        return isAlpha(c) || isDigit(c) || c == '_'
    }
    for isNameRune(l.Peek()) {
        l.Consume()
    }
    // Emit the appropriate keyword token if the current text is a keyword,
    // or a Name token otherwise.
    if token, found := keywordTokens[l.CurText()]; found {
        l.emitToken(token)
    } else {
        l.emitToken(Name)
    }
    return lexRoot
}
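
// For example, "interface" lexes as the Interface keyword token, while
// "interfaces" lexes as a Name token.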

// isOrdinalStart returns true if the rune is the beginning of an Ordinal
// token.
func isOrdinalStart(c rune) bool {
    return c == '@'
}

// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one or
// more digits. Note that the lexer itself does not require any digits: a
// bare '@' is also emitted as an Ordinal token.
func lexOrdinal(l *lexer) stateFn {
    // Consume the '@'.
    l.Consume()
    for isDigit(l.Peek()) {
        l.Consume()
    }
    l.emitToken(Ordinal)
    return lexRoot
}

// isNumberStart returns true if the rune is the beginning of a number.
func isNumberStart(c rune) bool {
    // Even hexadecimals must begin with a digit (namely 0).
    return isDigit(c)
}

// lexNumber lexes a number token.
func lexNumber(l *lexer) stateFn {
    // If the number begins with 0 it cannot be a multi-digit decimal
    // integer, so it is handled separately.
    if l.Peek() == '0' {
        return lexNumberStartWithZero
    }
    return lexDec
}
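
// For example, "123" lexes as IntConstDec, "0" as IntConstDec, "0x1F" as
// IntConstHex, and "3.14", "0.5" and "1E-3" as FloatConst.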

// lexDec lexes a base-10 number.
func lexDec(l *lexer) stateFn {
    for isDigit(l.Peek()) {
        l.Consume()
    }
    // If a decimal part is found, transition to the decimal state.
    if isDecimalPartStart(l.Peek()) {
        return lexDecimalPart
    }
    l.emitToken(IntConstDec)
    return lexRoot
}

// lexNumberStartWithZero lexes hexadecimal integers, floats that begin with
// 0, or a bare 0.
func lexNumberStartWithZero(l *lexer) stateFn {
    // Consume the leading 0.
    l.Consume()
    // Check whether this is the hexadecimal or the floating-point case.
    switch c := l.Peek(); {
    case c == 'x' || c == 'X':
        return lexHexNumber
    case isDecimalPartStart(c):
        return lexDecimalPart
    }
    // Found a bare 0.
    l.emitToken(IntConstDec)
    return lexRoot
}

// lexHexNumber lexes hexadecimal integers.
func lexHexNumber(l *lexer) stateFn {
    // Consume the 'x' or 'X'.
    l.Consume()
    for isHexDigit(l.Peek()) {
        l.Consume()
    }
    l.emitToken(IntConstHex)
    return lexRoot
}

// isDecimalPartStart returns true if the rune represents the beginning of
// the decimal part of a floating point number.
func isDecimalPartStart(c rune) bool {
    return c == '.' || c == 'e' || c == 'E'
}

// lexDecimalPart lexes the decimal part of a floating point number.
func lexDecimalPart(l *lexer) stateFn {
    // Consume the '.', 'e' or 'E'.
    l.Consume()
    if c := l.Peek(); c == '+' || c == '-' {
        l.Consume()
    }
    for isDigit(l.Peek()) {
        l.Consume()
    }
    l.emitToken(FloatConst)
    return lexRoot
}

// isStringStart returns true if the rune represents the beginning of a
// string.
func isStringStart(c rune) bool {
    return c == '"'
}

// lexString lexes a quoted string.
func lexString(l *lexer) stateFn {
    // Consume the opening quote.
    l.Consume()
    for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
        if l.Peek() == '\\' {
            // If we see an escape character, consume whatever follows
            // blindly.
            // TODO(azani): Consider parsing escape sequences.
            l.Consume()
        }
        l.Consume()
    }
    if l.IsEos() || l.Peek() == '\n' {
        l.emitToken(ErrorUnterminatedStringLiteral)
        return nil
    }
    // Consume the closing quote.
    l.Consume()
    l.emitToken(StringLiteral)
    return lexRoot
}

// isMaybeComment returns true if the rune may be the beginning of a
// comment.
func isMaybeComment(c rune) bool {
    return c == '/'
}

// lexComment consumes a single-line or multi-line comment.
func lexComment(l *lexer) stateFn {
    // Consume the '/'.
    l.Consume()
    switch l.Peek() {
    case '/':
        return lexSingleLineComment
    case '*':
        return lexMultiLineComment
    }
    l.emitToken(ErrorIllegalChar)
    return nil
}

// lexSingleLineComment consumes a single-line comment. The comment text is
// discarded rather than emitted as a token.
func lexSingleLineComment(l *lexer) stateFn {
    // Consume the second '/'.
    l.Consume()
    for !l.IsEos() && l.Peek() != '\n' {
        l.Consume()
    }
    l.beginToken()
    return lexRoot
}

// lexMultiLineComment consumes a multi-line comment.
func lexMultiLineComment(l *lexer) stateFn {
    // Consume the '*' that opened the comment or, when re-entered from
    // lexPossibleEndOfComment, the rune that followed a non-terminating '*'.
    l.Consume()
    for !l.IsEos() {
        if l.Peek() == '*' {
            return lexPossibleEndOfComment
        }
        l.Consume()
    }
    l.emitToken(ErrorUnterminatedComment)
    return nil
}

// lexPossibleEndOfComment consumes the possible end of a multi-line
// comment and determines whether the comment in fact ended or not.
func lexPossibleEndOfComment(l *lexer) stateFn {
    // Consume the '*'.
    l.Consume()
    if l.IsEos() {
        l.emitToken(ErrorUnterminatedComment)
        return nil
    }
    switch l.Peek() {
    case '/':
        l.Consume()
        l.beginToken()
        return lexRoot
    case '*':
        // In a run of '*'s, each subsequent '*' may still begin the comment
        // terminator (e.g. "**/"), so stay in this state. Returning to
        // lexMultiLineComment here would consume the '*' blindly and miss
        // the end of the comment.
        return lexPossibleEndOfComment
    }
    return lexMultiLineComment
}
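
// Note that comments produce no tokens: both comment paths end with
// beginToken, which discards the consumed text before returning to lexRoot.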