New lexer for mojom, written in Go.
BUG=fixes #464
R=mattr@google.com, rudominer@chromium.org
Review URL: https://codereview.chromium.org/1387893002 .
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000..b210b2b
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,519 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To use the lexer, call Tokenize with the source string to obtain
+// a TokenStream. The lexer runs concurrently in its own goroutine, so the
+// TokenStream can be consumed before the lexer has finished with the source.
+// (See the example at the end of this comment.)
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which determines whether a
+// transition to state Foo is appropriate. Those functions accept a single
+// rune and return true if the state machine should transition to state Foo.
+// Some states omit such a function because the transition condition is
+// trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
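+//
+// For example, a minimal usage sketch (the mojom snippet is illustrative,
+// and the caller is assumed to import fmt):
+//
+//   ts := Tokenize("module my.module;")
+//   for !ts.PeekNext().EOF() {
+//     fmt.Println(ts.PeekNext())
+//     ts.ConsumeNext()
+//   }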
+
+package lexer
+
+import (
+ "unicode/utf8"
+)
+
+// Tokenize accepts a source string and parses it into a stream of tokens which
+// can be read from the returned TokenStream.
+func Tokenize(source string) TokenStream {
+ tokens := make(chan Token)
+ l := lexer{source: source, tokens: tokens}
+ go l.run()
+ return &TokenChan{tokenChan: tokens}
+}
+
+type lexer struct {
+ // source is the source code to be lexed.
+ source string
+
+ // offset is the number of bytes that have been consumed.
+ offset int
+
+ // tokens is a channel to which the found tokens are emitted.
+ tokens chan Token
+
+ // sourcePos is the number of runes that have been consumed.
+ sourcePos int
+
+ // lineNo is the current line number.
+ lineNo int
+
+ // linePos is how many runes have been consumed since the beginning of the
+ // line.
+ linePos int
+
+ // curTokenOffset is the number of bytes consumed prior to the beginning of
+ // the current token.
+ curTokenOffset int
+
+ // curTokenSourcePos is the number of runes consumed prior to the beginning of
+ // the current token.
+ curTokenSourcePos int
+
+ // curTokenLineNo is the line number on which the current token begins.
+ curTokenLineNo int
+
+ // curTokenLinePos is the number of runes since the beginning of the line
+ // where the current token begins.
+ curTokenLinePos int
+}
+
+// CurText returns the consumed part of the current token.
+func (l *lexer) CurText() string {
+ return l.source[l.curTokenOffset:l.offset]
+}
+
+// emitToken emits the current token and begins a new token.
+func (l *lexer) emitToken(tokenType TokenKind) {
+ l.tokens <- Token{
+ Kind: tokenType,
+ Text: l.source[l.curTokenOffset:l.offset],
+ CharPos: l.curTokenSourcePos,
+ LineNo: l.curTokenLineNo,
+ LinePos: l.curTokenLinePos}
+ l.beginToken()
+}
+
+// beginToken starts a new token at the current position.
+func (l *lexer) beginToken() {
+ l.curTokenOffset = l.offset
+ l.curTokenSourcePos = l.sourcePos
+ l.curTokenLineNo = l.lineNo
+ l.curTokenLinePos = l.linePos
+}
+
+// Consume consumes the next rune in the source.
+func (l *lexer) Consume() {
+ if l.IsEos() {
+ return
+ }
+
+ c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+
+ if c == '\n' {
+ l.lineNo += 1
+ l.linePos = 0
+ } else {
+ l.linePos += 1
+ }
+ l.offset += width
+ l.sourcePos += 1
+}
+
+// Peek returns the next rune in the source.
+func (l *lexer) Peek() rune {
+ // At the end of the string, there is no sane answer to Peek.
+ if l.IsEos() {
+ return utf8.RuneError
+ }
+
+ // If RuneError is returned, it will be handled as any other rune, likely
+ // resulting in an ErrorIllegalChar token being emitted.
+ char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
+ return char
+}
+
+// IsEos returns true if the whole source has been consumed and false
+// otherwise.
+func (l *lexer) IsEos() bool {
+ return l.offset >= len(l.source)
+}
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+ // We are implementing a state machine.
+ // lexRoot is the beginning state.
+ // nil is the end state.
+ // States are functions which are called on the lexer. They return the
+ // next state.
+ for state := lexRoot; state != nil; {
+ state = state(l)
+ }
+ close(l.tokens)
+}
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
+
+// lexRoot is the beginning state and also the state to which the lexer
+// returns after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+ if l.IsEos() {
+ return nil
+ }
+
+ switch c := l.Peek(); {
+ case isSingleCharTokens(c):
+ return lexSingleCharTokens
+ case isEqualsOrResponseStart(c):
+ return lexEqualsOrResponse
+ case isNameStart(c):
+ return lexName
+ case isOrdinalStart(c):
+ return lexOrdinal
+ case isNumberStart(c):
+ return lexNumber
+ case isStringStart(c):
+ return lexString
+ case isSkippable(c):
+ return lexSkip
+ case isMaybeComment(c):
+ return lexComment
+ }
+
+ l.Consume()
+ l.emitToken(ErrorIllegalChar)
+ return nil
+}
+
+// isSkippable determines if a rune is skippable.
+func isSkippable(c rune) bool {
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+ for isSkippable(l.Peek()) {
+ l.Consume()
+ }
+ l.beginToken()
+ return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+ '(': LParen,
+ ')': RParen,
+ '[': LBracket,
+ ']': RBracket,
+ '{': LBrace,
+ '}': RBrace,
+ '<': LAngle,
+ '>': RAngle,
+ ';': Semi,
+ ',': Comma,
+ '.': Dot,
+ '-': Minus,
+ '+': Plus,
+ '&': Amp,
+ '?': Qstn,
+}
+
+// isSingleCharTokens returns true if the rune is a single character token.
+func isSingleCharTokens(c rune) bool {
+ _, ok := singleCharTokens[c]
+ return ok
+}
+
+// lexSingleCharTokens lexes single character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+ c := l.Peek()
+ l.Consume()
+ t := singleCharTokens[c]
+ l.emitToken(t)
+
+ return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+ return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+ l.Consume()
+
+ if l.Peek() == '>' {
+ l.Consume()
+ l.emitToken(Response)
+ } else {
+ l.emitToken(Equals)
+ }
+
+ return lexRoot
+}
+
+// isAlpha returns true if the rune is an ASCII letter.
+func isAlpha(c rune) bool {
+ return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+ return ('0' <= c && c <= '9')
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+ return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a Name token.
+func isNameStart(c rune) bool {
+ return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+ "import": Import,
+ "module": Module,
+ "struct": Struct,
+ "union": Union,
+ "interface": Interface,
+ "enum": Enum,
+ "const": Const,
+ "true": True,
+ "false": False,
+ "default": Default,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+ l.Consume()
+
+ // isNameRune returns true if the rune is valid in a Name token.
+ isNameRune := func(c rune) bool {
+ return isAlpha(c) || isDigit(c) || c == '_'
+ }
+
+ for isNameRune(l.Peek()) {
+ l.Consume()
+ }
+
+ // Emit the corresponding keyword token if the current text is a keyword,
+ // or a Name token otherwise.
+ if token, found := keywordTokens[l.CurText()]; found {
+ l.emitToken(token)
+ } else {
+ l.emitToken(Name)
+ }
+
+ return lexRoot
+}
+
+// isOrdinalStart returns true if the rune is the beginning of an Ordinal
+// token.
+func isOrdinalStart(c rune) bool {
+ return '@' == c
+}
+
+// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+ // Consume the '@'.
+ l.Consume()
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(Ordinal)
+
+ return lexRoot
+}
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+ // Even hexadecimals must begin with a digit (namely 0).
+ return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+ // A number that begins with 0 is a hexadecimal literal, a float or a
+ // naked 0, so it is handled separately.
+ if l.Peek() == '0' {
+ return lexNumberStartWithZero
+ }
+ return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ // If a decimal part is found, transition to the decimal state.
+ if isDecimalPartStart(l.Peek()) {
+ return lexDecimalPart
+ }
+
+ l.emitToken(IntConstDec)
+
+ return lexRoot
+}
+
+// lexNumberStartWithZero lexes a hexadecimal integer, a float whose integer
+// part is 0, or a naked 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+ // Consume the leading 0
+ l.Consume()
+
+ // Here we check to see whether we are in the hexadecimal or floating
+ // point case.
+ switch c := l.Peek(); {
+ case c == 'x' || c == 'X':
+ return lexHexNumber
+ case isDecimalPartStart(c):
+ return lexDecimalPart
+ }
+
+ // Found a naked 0.
+ l.emitToken(IntConstDec)
+
+ return lexRoot
+}
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+ // Consume the x or X
+ l.Consume()
+
+ for isHexDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(IntConstHex)
+
+ return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the beginning of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+ return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part of a floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+ // Consume '.' or 'e' or 'E'
+ l.Consume()
+
+ if c := l.Peek(); c == '+' || c == '-' {
+ l.Consume()
+ }
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(FloatConst)
+
+ return lexRoot
+}
+
+// isStringStart returns true if the rune represents the beginning of a string.
+func isStringStart(c rune) bool {
+ return '"' == c
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+ // Consume the opening quote.
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+ if l.Peek() == '\\' {
+ // If we see an escape character consume whatever follows blindly.
+ // TODO(azani): Consider parsing escape sequences.
+ l.Consume()
+ }
+ l.Consume()
+ }
+
+ if l.IsEos() || l.Peek() == '\n' {
+ l.emitToken(ErrorUnterminatedStringLiteral)
+ return nil
+ }
+
+ // Consume the closing quote.
+ l.Consume()
+
+ l.emitToken(StringLiteral)
+
+ return lexRoot
+}
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+ return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+ // Consume the '/'.
+ l.Consume()
+
+ switch l.Peek() {
+ case '/':
+ return lexSingleLineComment
+ case '*':
+ return lexMultiLineComment
+ }
+
+ l.emitToken(ErrorIllegalChar)
+ return nil
+}
+
+// lexSingleLineComment consumes a single line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+ // Consume the '/'
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '\n' {
+ l.Consume()
+ }
+
+ l.beginToken()
+ return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+ // Consume the '*' (or, when re-entered from lexPossibleEndOfComment, the
+ // comment rune that followed a '*').
+ l.Consume()
+
+ for !l.IsEos() {
+ if l.Peek() == '*' {
+ return lexPossibleEndOfComment
+ }
+ l.Consume()
+ }
+
+ l.emitToken(ErrorUnterminatedComment)
+ return nil
+}
+
+// lexPossibleEndOfComment consumes the possible end of a multiline
+// comment and determines whether the comment in fact ended or not.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+ // Consume the '*'
+ l.Consume()
+
+ if l.IsEos() {
+ l.emitToken(ErrorUnterminatedComment)
+ return nil
+ }
+
+ if l.Peek() == '/' {
+ l.Consume()
+ l.beginToken()
+ return lexRoot
+ }
+
+ // Another '*' may still immediately precede the closing '/', so examine
+ // the next rune from this state again.
+ if l.Peek() == '*' {
+ return lexPossibleEndOfComment
+ }
+
+ return lexMultiLineComment
+}
diff --git a/mojom/mojom_parser/lexer/lexer_test.go b/mojom/mojom_parser/lexer/lexer_test.go
index 1b378a7..19f9aae 100644
--- a/mojom/mojom_parser/lexer/lexer_test.go
+++ b/mojom/mojom_parser/lexer/lexer_test.go
@@ -6,10 +6,207 @@
import "testing"
-// TODO(rudominer) This dummy test is here in order to be able to test the
-// go unit test infrastructure. It will eventually be replaced by a real test.
-func TestDummyLexerTest(t *testing.T) {
- if 5.1 > 2.1*3.1 {
- t.Fatalf("Something is wrong.")
+func checkEq(t *testing.T, expected, actual interface{}) {
+ if expected != actual {
+ t.Fatalf("Failed check: Expected (%v), Actual (%v)", expected, actual)
}
}
+
+// pumpTokens pumps all the tokens from a channel into a slice.
+func pumpTokens(tokensChan chan Token) []Token {
+ tokens := []Token{}
+ for token := range tokensChan {
+ tokens = append(tokens, token)
+ }
+ return tokens
+}
+
+// TestAllSingleTokens tests for each token that a valid string is accepted as
+// the correct token.
+func TestAllSingleTokens(t *testing.T) {
+ testData := []struct {
+ source string
+ token TokenKind
+ }{
+ {"(", LParen},
+ {")", RParen},
+ {"[", LBracket},
+ {"]", RBracket},
+ {"{", LBrace},
+ {"}", RBrace},
+ {"<", LAngle},
+ {">", RAngle},
+ {";", Semi},
+ {",", Comma},
+ {".", Dot},
+ {"-", Minus},
+ {"+", Plus},
+ {"&", Amp},
+ {"?", Qstn},
+ {"=", Equals},
+ {"=>", Response},
+ {"somet_hi3ng", Name},
+ {"import", Import},
+ {"module", Module},
+ {"struct", Struct},
+ {"union", Union},
+ {"interface", Interface},
+ {"enum", Enum},
+ {"const", Const},
+ {"true", True},
+ {"false", False},
+ {"default", Default},
+ {"@10", Ordinal},
+ {"10", IntConstDec},
+ {"0", IntConstDec},
+ {"0xA10", IntConstHex},
+ {"0xa10", IntConstHex},
+ {"0XA10", IntConstHex},
+ {"0Xa10", IntConstHex},
+ {"10.5", FloatConst},
+ {"10e5", FloatConst},
+ {"0.5", FloatConst},
+ {"0e5", FloatConst},
+ {"10e+5", FloatConst},
+ {"10e-5", FloatConst},
+ {"\"hello world\"", StringLiteral},
+ {"\"hello \\\"real\\\" world\"", StringLiteral},
+ }
+
+ for i := range testData {
+ l := lexer{source: testData[i].source, tokens: make(chan Token)}
+ go l.run()
+ tokens := pumpTokens(l.tokens)
+
+ if len(tokens) != 1 {
+ t.Fatalf("Source('%v'): Expected 1 token but got %v instead: %v",
+ testData[i].source, len(tokens), tokens)
+ }
+
+ checkEq(t, testData[i].source, tokens[0].Text)
+ checkEq(t, testData[i].token, tokens[0].Kind)
+ }
+}
+
+// TestTokenPosition tests that the position in the source string, the line
+// number and the position in the line of the lexed token are correctly found.
+func TestTokenPosition(t *testing.T) {
+ source := "  \n  ."
+ l := lexer{source: source, tokens: make(chan Token)}
+ go l.run()
+ tokens := pumpTokens(l.tokens)
+ token := tokens[0]
+
+ checkEq(t, 5, token.CharPos)
+ checkEq(t, 1, token.LineNo)
+ checkEq(t, 2, token.LinePos)
+}
+
+// TestTokenPositionChineseString tests that CharPos is expressed as a number
+// of runes and not a number of bytes.
+func TestTokenPositionChineseString(t *testing.T) {
+ source := "\"您好\" is"
+ ts := Tokenize(source)
+ checkEq(t, StringLiteral, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, 5, ts.PeekNext().CharPos)
+}
+
+// TestSkipSkippable tests that all skippable characters are skipped.
+func TestSkipSkippable(t *testing.T) {
+ source := " \t \r \n ."
+ l := lexer{source: source, tokens: make(chan Token)}
+ go l.run()
+ tokens := pumpTokens(l.tokens)
+
+ checkEq(t, Dot, tokens[0].Kind)
+}
+
+// TestTokenize tests that a single token embedded in a larger string is
+// correctly lexed.
+func TestTokenize(t *testing.T) {
+ ts := Tokenize(" \t . ")
+ token := ts.PeekNext()
+ checkEq(t, Dot, token.Kind)
+
+ ts.ConsumeNext()
+ token = ts.PeekNext()
+ checkEq(t, EOF, token.Kind)
+}
+
+// TestTokenizeBadUTF8String tests that an invalid UTF8 string is handled.
+func TestTokenizeBadUTF8String(t *testing.T) {
+ ts := Tokenize("\xF0")
+ checkEq(t, ErrorIllegalChar, ts.PeekNext().Kind)
+}
+
+// TestTokenizeEmptyString tests that empty strings are handled correctly.
+func TestTokenizeEmptyString(t *testing.T) {
+ ts := Tokenize("")
+ checkEq(t, EOF, ts.PeekNext().Kind)
+}
+
+// TestTokenizeMoreThanOne tests that more than one token is correctly lexed.
+func TestTokenizeMoreThanOne(t *testing.T) {
+ ts := Tokenize("()")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, RParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, EOF, ts.PeekNext().Kind)
+}
+
+// TestIllegalChar tests that an illegal character is correctly spotted.
+func TestIllegalChar(t *testing.T) {
+ ts := Tokenize(" \t $ ")
+ checkEq(t, ErrorIllegalChar, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedStringLiteralEos tests that the correct error is emitted if
+// a quoted string is never closed.
+func TestUnterminatedStringLiteralEos(t *testing.T) {
+ ts := Tokenize("\"hello world")
+ checkEq(t, ErrorUnterminatedStringLiteral, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedStringLiteralEol tests that the correct error is emitted if
+// a quoted string is closed on a subsequent line.
+func TestUnterminatedStringLiteralEol(t *testing.T) {
+ ts := Tokenize("\"hello\n world\"")
+ checkEq(t, ErrorUnterminatedStringLiteral, ts.PeekNext().Kind)
+}
+
+// TestSingleLineComment tests that single line comments are correctly skipped.
+func TestSingleLineComment(t *testing.T) {
+ ts := Tokenize("( // some stuff\n)")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, RParen, ts.PeekNext().Kind)
+}
+
+// TestMultiLineComment tests that multi line comments are correctly skipped.
+func TestMultiLineComment(t *testing.T) {
+ ts := Tokenize("( /* hello world/ * *\n */)")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, RParen, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedMultiLineComment tests that unterminated multiline comments
+// emit the correct error.
+func TestUnterminatedMultiLineComment(t *testing.T) {
+ ts := Tokenize("( /* hello world/ * *\n )")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, ErrorUnterminatedComment, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedMultiLineCommentAtStar tests that if the string ends at a *
+// (which could be the beginning of the close of a multiline comment) the right
+// error is emitted.
+func TestUnterminatedMultiLineCommentAtStar(t *testing.T) {
+ ts := Tokenize("( /* hello world/ *")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, ErrorUnterminatedComment, ts.PeekNext().Kind)
+}
diff --git a/mojom/mojom_parser/lexer/token_stream.go b/mojom/mojom_parser/lexer/token_stream.go
new file mode 100644
index 0000000..beb399a
--- /dev/null
+++ b/mojom/mojom_parser/lexer/token_stream.go
@@ -0,0 +1,51 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// TokenStream is the interface between the lexer and the parser. The lexer
+// creates a TokenStream which the parser consumes.
+
+package lexer
+
+type TokenStream interface {
+ // Returns the next Token in the stream without advancing the cursor,
+ // or returns the EOF token if the cursor is already past the end.
+ PeekNext() Token
+
+ // Advances the cursor in the stream or does nothing if the cursor is
+ // already past the end of the stream.
+ ConsumeNext()
+}
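+
+// For example, a minimal sketch of the peek/consume contract, where ts is a
+// TokenStream obtained from Tokenize and handleToken is a hypothetical
+// parser callback:
+//
+//   for !ts.PeekNext().EOF() {
+//     handleToken(ts.PeekNext()) // Peeking again returns the same token.
+//     ts.ConsumeNext()           // Advance past the current token.
+//   }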
+
+// The EOF token is returned by TokenStream to signal the end of the stream.
+var eofToken = Token{Kind: EOF}
+
+// *TokenChan implements TokenStream.
+// This implementation uses an unbuffered channel to pass the tokens from the
+// lexer to the parser. One end of the channel is held by the lexer and the
+// other is in the TokenChan object that is passed to the parser.
+type TokenChan struct {
+ tokenChan chan Token
+ nextToken Token
+ // read is true if a token has been read out of the channel into nextToken.
+ read bool
+}
+
+// See TokenStream.
+func (s *TokenChan) PeekNext() (token Token) {
+ if !s.read {
+ s.read = true
+ s.ConsumeNext()
+ }
+
+ return s.nextToken
+}
+
+// See TokenStream.
+func (s *TokenChan) ConsumeNext() {
+ if t, open := <-s.tokenChan; open {
+ s.nextToken = t
+ } else {
+ s.nextToken = eofToken
+ }
+}
diff --git a/mojom/mojom_parser/lexer/tokens.go b/mojom/mojom_parser/lexer/tokens.go
new file mode 100644
index 0000000..2ed9b5c
--- /dev/null
+++ b/mojom/mojom_parser/lexer/tokens.go
@@ -0,0 +1,218 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// TokenKind is a type which describes the kinds of tokens which can be
+// encountered in a mojom file.
+
+package lexer
+
+import (
+ "fmt"
+)
+
+type TokenKind int
+
+// TokenKinds
+const (
+ // An error of an unknown nature has occurred.
+ ErrorUnknown TokenKind = iota
+ // A character was found which is not part of a valid token.
+ ErrorIllegalChar
+ // A quoted string was opened but not closed.
+ ErrorUnterminatedStringLiteral
+ // A multiline comment was opened but not closed.
+ ErrorUnterminatedComment
+ // Indicates the end of a stream of tokens.
+ EOF
+
+ // Punctuators and Separators
+ LParen
+ RParen
+ LBracket
+ RBracket
+ LBrace
+ RBrace
+ LAngle
+ RAngle
+ Semi
+ Comma
+ Dot
+ Minus
+ Plus
+ Amp
+ Qstn
+ Equals
+ Response
+
+ // Names
+ Name
+
+ // Keywords
+ Import
+ Module
+ Struct
+ Union
+ Interface
+ Enum
+ Const
+ True
+ False
+ Default
+
+ // Constants
+ IntConstDec
+ IntConstHex
+ FloatConst
+ Ordinal
+ StringLiteral
+)
+
+// String is used to generate user-facing strings in compilation error
+// messages. For example for LBrace we produce the string "'{'". Notice the
+// single-quotes. This will be used for example in an error message that looks
+// like the following:
+// Unexpected token at line 5, column 6: '###'. Expecting '{'.
+func (tokenKind TokenKind) String() string {
+ switch tokenKind {
+ // Errors
+ case ErrorUnknown:
+ return "unknown token"
+ case ErrorIllegalChar:
+ return "illegal token"
+ case ErrorUnterminatedStringLiteral:
+ return "unterminated string literal"
+ case ErrorUnterminatedComment:
+ return "unterminated comment"
+
+ // End of file
+ case EOF:
+ return "eof"
+
+ // Punctuators and Separators
+ case LParen:
+ return "'('"
+ case RParen:
+ return "')'"
+ case LBracket:
+ return "'['"
+ case RBracket:
+ return "']'"
+ case LBrace:
+ return "'{'"
+ case RBrace:
+ return "'}'"
+ case LAngle:
+ return "'<'"
+ case RAngle:
+ return "'>'"
+ case Semi:
+ return "';'"
+ case Comma:
+ return "','"
+ case Dot:
+ return "'.'"
+ case Minus:
+ return "'-'"
+ case Plus:
+ return "'+'"
+ case Amp:
+ return "'&'"
+ case Qstn:
+ return "'?'"
+ case Equals:
+ return "'='"
+ case Response:
+ return "'=>'"
+
+ // Names
+ case Name:
+ return "a name"
+
+ // Keywords
+ case Import:
+ return "'import'"
+ case Module:
+ return "'module'"
+ case Struct:
+ return "'struct'"
+ case Union:
+ return "'union'"
+ case Interface:
+ return "'interface'"
+ case Enum:
+ return "'enum'"
+ case Const:
+ return "'const'"
+ case True:
+ return "'true'"
+ case False:
+ return "'false'"
+ case Default:
+ return "'default'"
+
+ // Constants
+ case IntConstDec:
+ return "decimal integer literal"
+ case IntConstHex:
+ return "hex integer literal"
+ case FloatConst:
+ return "float literal"
+ case Ordinal:
+ return "an ordinal"
+ case StringLiteral:
+ return "a string literal"
+
+ default:
+ // Note(rudominer) It is important to use %d below so as to avoid
+ // re-invoking this method and causing an infinite recursion.
+ return fmt.Sprintf("%d", tokenKind)
+ }
+}
+
+type Token struct {
+ Kind TokenKind
+ Text string
+ // CharPos is the number of runes preceding the token.
+ CharPos int
+ // LineNo is the line on which the token is found. (First line is 0.)
+ LineNo int
+ // LinePos is the number of runes preceding the token on its line.
+ LinePos int
+}
+
+// ShortLocationString is used to generate user-facing strings in compilation
+// error messages. This will be used for example in an error message that looks
+// like the following:
+// Unexpected token at line 5, column 6: '###'. Expecting '{'.
+func (t Token) ShortLocationString() string {
+ return fmt.Sprintf("%d,%d", t.LineNo+1, t.LinePos+1)
+}
+
+// LongLocationString generates a longer user-facing description of the
+// token's location, e.g. "line 5, column 6".
+func (t Token) LongLocationString() string {
+ return fmt.Sprintf("line %d, column %d", t.LineNo+1, t.LinePos+1)
+}
+
+// EOF returns true if the token on which it is called represents the end of
+// the token stream.
+func (t Token) EOF() bool {
+ return t.Kind == EOF
+}
+
+// String is used to generate user-facing strings in compilation error
+// messages. For many token kinds the TokenKind.String() method will produce
+// good results for representing the token. But for other TokenKinds we will
+// want to include some information besides a representation of the kind.
+// For example, for an ErrorUnknown kind we want to show the text.
+// This will be used for example in an error message that looks
+// like the following:
+// Unexpected token at line 5, column 6: '###'. Expecting '{'.
+func (token Token) String() string {
+ switch token.Kind {
+ case ErrorUnknown, Name, StringLiteral, IntConstDec, IntConstHex, FloatConst, Ordinal:
+ return fmt.Sprintf("'%s'", token.Text)
+
+ default:
+ return token.Kind.String()
+ }
+}