New lexer for mojom, written in Go.
BUG=fixes #464
R=mattr@google.com, rudominer@chromium.org
Review URL: https://codereview.chromium.org/1387893002 .
diff --git a/mojom/mojom_parser/lexer/lexer.go b/mojom/mojom_parser/lexer/lexer.go
new file mode 100644
index 0000000..b210b2b
--- /dev/null
+++ b/mojom/mojom_parser/lexer/lexer.go
@@ -0,0 +1,519 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To use the lexer, call Tokenize with the source string to obtain
+// a TokenStream. The lexer runs concurrently in its own goroutine, so the
+// TokenStream can be consumed before the lexer has finished with the source.
+// (See the example at the end of this comment.)
+//
+// The lexer is implemented as a state machine. The states are represented
+// by functions (the stateFn type) which accept a lexer and return the
+// new state.
+//
+// Most states also have an isFooStart function which determines whether a
+// transition to state Foo is appropriate. Those functions accept a single
+// rune and return true if the state machine should transition to state Foo.
+// Some states omit such a function because the transition condition is
+// trivial.
+//
+// The lexer implementation was inspired by
+// http://cuddle.googlecode.com/hg/talk/lex.html
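+//
+// For example, a minimal usage sketch (the mojom snippet is illustrative,
+// and the caller is assumed to import fmt):
+//
+//   ts := Tokenize("module my.module;")
+//   for !ts.PeekNext().EOF() {
+//     fmt.Println(ts.PeekNext())
+//     ts.ConsumeNext()
+//   }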
+
+package lexer
+
+import (
+ "unicode/utf8"
+)
+
+// Tokenize accepts a source string and parses it into a stream of tokens which
+// can be read from the returned TokenStream.
+func Tokenize(source string) TokenStream {
+ tokens := make(chan Token)
+ l := lexer{source: source, tokens: tokens}
+ go l.run()
+ return &TokenChan{tokenChan: tokens}
+}
+
+type lexer struct {
+ // source is the source code to be lexed.
+ source string
+
+ // offset is the number of bytes that have been consumed.
+ offset int
+
+ // tokens is a channel to which the found tokens are emitted.
+ tokens chan Token
+
+ // sourcePos is the number of runes that have been consumed.
+ sourcePos int
+
+ // lineNo is the current line number.
+ lineNo int
+
+ // linePos is how many runes have been consumed since the beginning of the
+ // line.
+ linePos int
+
+ // curTokenOffset is the number of bytes consumed prior to the beginning of
+ // the current token.
+ curTokenOffset int
+
+ // curTokenSourcePos is the number of runes consumed prior to the beginning of
+ // the current token.
+ curTokenSourcePos int
+
+ // curTokenLineNo is the line number on which the current token begins.
+ curTokenLineNo int
+
+ // curTokenLinePos is the number of runes since the beginning of the line
+ // where the current token begins.
+ curTokenLinePos int
+}
+
+// CurText returns the consumed part of the current token.
+func (l *lexer) CurText() string {
+ return l.source[l.curTokenOffset:l.offset]
+}
+
+// emitToken emits the current token and begins a new token.
+func (l *lexer) emitToken(tokenType TokenKind) {
+ l.tokens <- Token{
+ Kind: tokenType,
+ Text: l.source[l.curTokenOffset:l.offset],
+ CharPos: l.curTokenSourcePos,
+ LineNo: l.curTokenLineNo,
+ LinePos: l.curTokenLinePos}
+ l.beginToken()
+}
+
+// beginToken starts a new token at the current position.
+func (l *lexer) beginToken() {
+ l.curTokenOffset = l.offset
+ l.curTokenSourcePos = l.sourcePos
+ l.curTokenLineNo = l.lineNo
+ l.curTokenLinePos = l.linePos
+}
+
+// Consume consumes the next rune in the source.
+func (l *lexer) Consume() {
+ if l.IsEos() {
+ return
+ }
+
+ c, width := utf8.DecodeRuneInString(l.source[l.offset:])
+
+ if c == '\n' {
+ l.lineNo += 1
+ l.linePos = 0
+ } else {
+ l.linePos += 1
+ }
+ l.offset += width
+ l.sourcePos += 1
+}
+
+// Peek returns the next rune in the source.
+func (l *lexer) Peek() rune {
+ // At the end of the string, there is no sane answer to Peek.
+ if l.IsEos() {
+ return utf8.RuneError
+ }
+
+ // If RuneError is returned, it will be handled as any other rune, likely
+ // resulting in an ErrorIllegalChar token being emitted.
+ char, _ := utf8.DecodeRuneInString(l.source[l.offset:])
+ return char
+}
+
+// IsEos returns true if the whole source has been consumed and false
+// otherwise.
+func (l *lexer) IsEos() bool {
+ return l.offset >= len(l.source)
+}
+
+// run is the lexer's main loop.
+func (l *lexer) run() {
+ // We are implementing a state machine.
+ // lexRoot is the beginning state.
+ // nil is the end state.
+ // States are functions which are called on the lexer. They return the
+ // next state.
+ for state := lexRoot; state != nil; {
+ state = state(l)
+ }
+ close(l.tokens)
+}
+
+// A stateFn represents a state in the lexer state machine.
+type stateFn func(*lexer) stateFn
+
+// lexRoot is the beginning state and also the state to which the lexer
+// returns after most tokens are emitted.
+func lexRoot(l *lexer) stateFn {
+ if l.IsEos() {
+ return nil
+ }
+
+ switch c := l.Peek(); {
+ case isSingleCharTokens(c):
+ return lexSingleCharTokens
+ case isEqualsOrResponseStart(c):
+ return lexEqualsOrResponse
+ case isNameStart(c):
+ return lexName
+ case isOrdinalStart(c):
+ return lexOrdinal
+ case isNumberStart(c):
+ return lexNumber
+ case isStringStart(c):
+ return lexString
+ case isSkippable(c):
+ return lexSkip
+ case isMaybeComment(c):
+ return lexComment
+ }
+
+ l.Consume()
+ l.emitToken(ErrorIllegalChar)
+ return nil
+}
+
+// isSkippable determines if a rune is skippable.
+func isSkippable(c rune) bool {
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n'
+}
+
+// lexSkip consumes skippable runes.
+func lexSkip(l *lexer) stateFn {
+ for isSkippable(l.Peek()) {
+ l.Consume()
+ }
+ l.beginToken()
+ return lexRoot
+}
+
+// singleCharTokens is a map of single-rune tokens.
+var singleCharTokens = map[rune]TokenKind{
+ '(': LParen,
+ ')': RParen,
+ '[': LBracket,
+ ']': RBracket,
+ '{': LBrace,
+ '}': RBrace,
+ '<': LAngle,
+ '>': RAngle,
+ ';': Semi,
+ ',': Comma,
+ '.': Dot,
+ '-': Minus,
+ '+': Plus,
+ '&': Amp,
+ '?': Qstn,
+}
+
+// isSingleCharTokens returns true if the rune is a single character token.
+func isSingleCharTokens(c rune) bool {
+ _, ok := singleCharTokens[c]
+ return ok
+}
+
+// lexSingleCharTokens lexes single character tokens.
+func lexSingleCharTokens(l *lexer) stateFn {
+ c := l.Peek()
+ l.Consume()
+ t := singleCharTokens[c]
+ l.emitToken(t)
+
+ return lexRoot
+}
+
+// isEqualsOrResponseStart returns true if the rune corresponds to the
+// beginning of either the '=' or '=>' tokens.
+func isEqualsOrResponseStart(c rune) bool {
+ return c == '='
+}
+
+// lexEqualsOrResponse lexes the '=' or the '=>' token.
+func lexEqualsOrResponse(l *lexer) stateFn {
+ l.Consume()
+
+ if l.Peek() == '>' {
+ l.Consume()
+ l.emitToken(Response)
+ } else {
+ l.emitToken(Equals)
+ }
+
+ return lexRoot
+}
+
+// isAlpha returns true if the rune is an ASCII letter.
+func isAlpha(c rune) bool {
+ return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+}
+
+// isDigit returns true if the rune is a digit.
+func isDigit(c rune) bool {
+ return ('0' <= c && c <= '9')
+}
+
+// isHexDigit returns true if the rune is a hexadecimal digit.
+func isHexDigit(c rune) bool {
+ return isDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+}
+
+// isNameStart returns true if the rune is the beginning of a Name token.
+func isNameStart(c rune) bool {
+ return isAlpha(c) || c == '_'
+}
+
+// keywordTokens maps keywords to their associated tokens.
+var keywordTokens = map[string]TokenKind{
+ "import": Import,
+ "module": Module,
+ "struct": Struct,
+ "union": Union,
+ "interface": Interface,
+ "enum": Enum,
+ "const": Const,
+ "true": True,
+ "false": False,
+ "default": Default,
+}
+
+// lexName lexes valid C identifiers. (K&R2: A.2.3)
+func lexName(l *lexer) stateFn {
+ l.Consume()
+
+ // isNameRune returns true if the rune is valid in a Name token.
+ isNameRune := func(c rune) bool {
+ return isAlpha(c) || isDigit(c) || c == '_'
+ }
+
+ for isNameRune(l.Peek()) {
+ l.Consume()
+ }
+
+ // Emit the corresponding keyword token if the current text is a keyword,
+ // or a Name token otherwise.
+ if token, found := keywordTokens[l.CurText()]; found {
+ l.emitToken(token)
+ } else {
+ l.emitToken(Name)
+ }
+
+ return lexRoot
+}
+
+// isOrdinalStart returns true if the rune is the beginning of an Ordinal
+// token.
+func isOrdinalStart(c rune) bool {
+ return '@' == c
+}
+
+// lexOrdinal lexes an Ordinal token. Ordinals are an '@' followed by one
+// or more digits.
+func lexOrdinal(l *lexer) stateFn {
+ // Consume the '@'.
+ l.Consume()
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(Ordinal)
+
+ return lexRoot
+}
+
+// isNumberStart returns true if the rune is the beginning of a number.
+func isNumberStart(c rune) bool {
+ // Even hexadecimals must begin with a digit (namely 0).
+ return isDigit(c)
+}
+
+// lexNumber lexes a number token.
+func lexNumber(l *lexer) stateFn {
+ // A number that begins with 0 is a hexadecimal literal, a float or a
+ // naked 0, so it is handled separately.
+ if l.Peek() == '0' {
+ return lexNumberStartWithZero
+ }
+ return lexDec
+}
+
+// lexDec lexes a base-10 number.
+func lexDec(l *lexer) stateFn {
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ // If a decimal part is found, transition to the decimal state.
+ if isDecimalPartStart(l.Peek()) {
+ return lexDecimalPart
+ }
+
+ l.emitToken(IntConstDec)
+
+ return lexRoot
+}
+
+// lexNumberStartWithZero lexes a hexadecimal integer, a float whose integer
+// part is 0, or a naked 0.
+func lexNumberStartWithZero(l *lexer) stateFn {
+ // Consume the leading 0
+ l.Consume()
+
+ // Here we check to see whether we are in the hexadecimal or floating
+ // point case.
+ switch c := l.Peek(); {
+ case c == 'x' || c == 'X':
+ return lexHexNumber
+ case isDecimalPartStart(c):
+ return lexDecimalPart
+ }
+
+ // Found a naked 0.
+ l.emitToken(IntConstDec)
+
+ return lexRoot
+}
+
+// lexHexNumber lexes hexadecimal integers.
+func lexHexNumber(l *lexer) stateFn {
+ // Consume the x or X
+ l.Consume()
+
+ for isHexDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(IntConstHex)
+
+ return lexRoot
+}
+
+// isDecimalPartStart returns true if the rune represents the beginning of
+// the decimal part of a floating point number.
+func isDecimalPartStart(c rune) bool {
+ return c == '.' || c == 'e' || c == 'E'
+}
+
+// lexDecimalPart lexes the decimal part of a floating point number.
+func lexDecimalPart(l *lexer) stateFn {
+ // Consume '.' or 'e' or 'E'
+ l.Consume()
+
+ if c := l.Peek(); c == '+' || c == '-' {
+ l.Consume()
+ }
+
+ for isDigit(l.Peek()) {
+ l.Consume()
+ }
+
+ l.emitToken(FloatConst)
+
+ return lexRoot
+}
+
+// isStringStart returns true if the rune represents the beginning of a string.
+func isStringStart(c rune) bool {
+ return '"' == c
+}
+
+// lexString lexes a quoted string.
+func lexString(l *lexer) stateFn {
+ // Consume the opening quote.
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '"' && l.Peek() != '\n' {
+ if l.Peek() == '\\' {
+ // If we see an escape character consume whatever follows blindly.
+ // TODO(azani): Consider parsing escape sequences.
+ l.Consume()
+ }
+ l.Consume()
+ }
+
+ if l.IsEos() || l.Peek() == '\n' {
+ l.emitToken(ErrorUnterminatedStringLiteral)
+ return nil
+ }
+
+ // Consume the closing quote.
+ l.Consume()
+
+ l.emitToken(StringLiteral)
+
+ return lexRoot
+}
+
+// isMaybeComment returns true if the rune may be the beginning of a
+// comment.
+func isMaybeComment(c rune) bool {
+ return c == '/'
+}
+
+// lexComment consumes a single-line or multi-line comment.
+func lexComment(l *lexer) stateFn {
+ // Consume the '/'.
+ l.Consume()
+
+ switch l.Peek() {
+ case '/':
+ return lexSingleLineComment
+ case '*':
+ return lexMultiLineComment
+ }
+
+ l.emitToken(ErrorIllegalChar)
+ return nil
+}
+
+// lexSingleLineComment consumes a single line comment.
+func lexSingleLineComment(l *lexer) stateFn {
+ // Consume the '/'
+ l.Consume()
+
+ for !l.IsEos() && l.Peek() != '\n' {
+ l.Consume()
+ }
+
+ l.beginToken()
+ return lexRoot
+}
+
+// lexMultiLineComment consumes a multi-line comment.
+func lexMultiLineComment(l *lexer) stateFn {
+ // Consume the '*' (or, when re-entered from lexPossibleEndOfComment, the
+ // comment rune that followed a '*').
+ l.Consume()
+
+ for !l.IsEos() {
+ if l.Peek() == '*' {
+ return lexPossibleEndOfComment
+ }
+ l.Consume()
+ }
+
+ l.emitToken(ErrorUnterminatedComment)
+ return nil
+}
+
+// lexPossibleEndOfComment consumes the possible end of a multiline
+// comment and determines whether the comment in fact ended or not.
+func lexPossibleEndOfComment(l *lexer) stateFn {
+ // Consume the '*'
+ l.Consume()
+
+ if l.IsEos() {
+ l.emitToken(ErrorUnterminatedComment)
+ return nil
+ }
+
+ if l.Peek() == '/' {
+ l.Consume()
+ l.beginToken()
+ return lexRoot
+ }
+
+ // Another '*' may still immediately precede the closing '/', so examine
+ // the next rune from this state again.
+ if l.Peek() == '*' {
+ return lexPossibleEndOfComment
+ }
+
+ return lexMultiLineComment
+}
diff --git a/mojom/mojom_parser/lexer/lexer_test.go b/mojom/mojom_parser/lexer/lexer_test.go
index 1b378a7..19f9aae 100644
--- a/mojom/mojom_parser/lexer/lexer_test.go
+++ b/mojom/mojom_parser/lexer/lexer_test.go
@@ -6,10 +6,207 @@
import "testing"
-// TODO(rudominer) This dummy test is here in order to be able to test the
-// go unit test infrastructure. It will eventually be replaced by a real test.
-func TestDummyLexerTest(t *testing.T) {
- if 5.1 > 2.1*3.1 {
- t.Fatalf("Something is wrong.")
+func checkEq(t *testing.T, expected, actual interface{}) {
+ if expected != actual {
+ t.Fatalf("Failed check: Expected (%v), Actual (%v)", expected, actual)
}
}
+
+// pumpTokens pumps all the tokens from a channel into a slice.
+func pumpTokens(tokensChan chan Token) []Token {
+ tokens := []Token{}
+ for token := range tokensChan {
+ tokens = append(tokens, token)
+ }
+ return tokens
+}
+
+// TestAllSingleTokens tests for each token that a valid string is accepted as
+// the correct token.
+func TestAllSingleTokens(t *testing.T) {
+ testData := []struct {
+ source string
+ token TokenKind
+ }{
+ {"(", LParen},
+ {")", RParen},
+ {"[", LBracket},
+ {"]", RBracket},
+ {"{", LBrace},
+ {"}", RBrace},
+ {"<", LAngle},
+ {">", RAngle},
+ {";", Semi},
+ {",", Comma},
+ {".", Dot},
+ {"-", Minus},
+ {"+", Plus},
+ {"&", Amp},
+ {"?", Qstn},
+ {"=", Equals},
+ {"=>", Response},
+ {"somet_hi3ng", Name},
+ {"import", Import},
+ {"module", Module},
+ {"struct", Struct},
+ {"union", Union},
+ {"interface", Interface},
+ {"enum", Enum},
+ {"const", Const},
+ {"true", True},
+ {"false", False},
+ {"default", Default},
+ {"@10", Ordinal},
+ {"10", IntConstDec},
+ {"0", IntConstDec},
+ {"0xA10", IntConstHex},
+ {"0xa10", IntConstHex},
+ {"0XA10", IntConstHex},
+ {"0Xa10", IntConstHex},
+ {"10.5", FloatConst},
+ {"10e5", FloatConst},
+ {"0.5", FloatConst},
+ {"0e5", FloatConst},
+ {"10e+5", FloatConst},
+ {"10e-5", FloatConst},
+ {"\"hello world\"", StringLiteral},
+ {"\"hello \\\"real\\\" world\"", StringLiteral},
+ }
+
+ for i := range testData {
+ l := lexer{source: testData[i].source, tokens: make(chan Token)}
+ go l.run()
+ tokens := pumpTokens(l.tokens)
+
+ if len(tokens) != 1 {
+ t.Fatalf("Source('%v'): Expected 1 token but got %v instead: %v",
+ testData[i].source, len(tokens), tokens)
+ }
+
+ checkEq(t, testData[i].source, tokens[0].Text)
+ checkEq(t, testData[i].token, tokens[0].Kind)
+ }
+}
+
+// TestTokenPosition tests that the position in the source string, the line
+// number and the position in the line of the lexed token are correctly found.
+func TestTokenPosition(t *testing.T) {
+ source := "  \n  ."
+ l := lexer{source: source, tokens: make(chan Token)}
+ go l.run()
+ tokens := pumpTokens(l.tokens)
+ token := tokens[0]
+
+ checkEq(t, 5, token.CharPos)
+ checkEq(t, 1, token.LineNo)
+ checkEq(t, 2, token.LinePos)
+}
+
+// TestTokenPositionChineseString tests that CharPos is expressed as a number
+// of runes and not a number of bytes.
+func TestTokenPositionChineseString(t *testing.T) {
+ source := "\"您好\" is"
+ ts := Tokenize(source)
+ checkEq(t, StringLiteral, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, 5, ts.PeekNext().CharPos)
+}
+
+// TestSkipSkippable tests that all skippable characters are skipped.
+func TestSkipSkippable(t *testing.T) {
+ source := " \t \r \n ."
+ l := lexer{source: source, tokens: make(chan Token)}
+ go l.run()
+ tokens := pumpTokens(l.tokens)
+
+ checkEq(t, Dot, tokens[0].Kind)
+}
+
+// TestTokenize tests that a single token embedded in a larger string is
+// correctly lexed.
+func TestTokenize(t *testing.T) {
+ ts := Tokenize(" \t . ")
+ token := ts.PeekNext()
+ checkEq(t, Dot, token.Kind)
+
+ ts.ConsumeNext()
+ token = ts.PeekNext()
+ checkEq(t, EOF, token.Kind)
+}
+
+// TestTokenizeBadUTF8String tests that an invalid UTF8 string is handled.
+func TestTokenizeBadUTF8String(t *testing.T) {
+ ts := Tokenize("\xF0")
+ checkEq(t, ErrorIllegalChar, ts.PeekNext().Kind)
+}
+
+// TestTokenizeEmptyString tests that empty strings are handled correctly.
+func TestTokenizeEmptyString(t *testing.T) {
+ ts := Tokenize("")
+ checkEq(t, EOF, ts.PeekNext().Kind)
+}
+
+// TestTokenizeMoreThanOne tests that more than one token is correctly lexed.
+func TestTokenizeMoreThanOne(t *testing.T) {
+ ts := Tokenize("()")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, RParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, EOF, ts.PeekNext().Kind)
+}
+
+// TestIllegalChar tests that an illegal character is correctly spotted.
+func TestIllegalChar(t *testing.T) {
+ ts := Tokenize(" \t $ ")
+ checkEq(t, ErrorIllegalChar, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedStringLiteralEos tests that the correct error is emitted if
+// a quoted string is never closed.
+func TestUnterminatedStringLiteralEos(t *testing.T) {
+ ts := Tokenize("\"hello world")
+ checkEq(t, ErrorUnterminatedStringLiteral, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedStringLiteralEol tests that the correct error is emitted if
+// a quoted string is closed on a subsequent line.
+func TestUnterminatedStringLiteralEol(t *testing.T) {
+ ts := Tokenize("\"hello\n world\"")
+ checkEq(t, ErrorUnterminatedStringLiteral, ts.PeekNext().Kind)
+}
+
+// TestSingleLineComment tests that single line comments are correctly skipped.
+func TestSingleLineComment(t *testing.T) {
+ ts := Tokenize("( // some stuff\n)")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, RParen, ts.PeekNext().Kind)
+}
+
+// TestMultiLineComment tests that multi line comments are correctly skipped.
+func TestMultiLineComment(t *testing.T) {
+ ts := Tokenize("( /* hello world/ * *\n */)")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, RParen, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedMultiLineComment tests that unterminated multiline comments
+// emit the correct error.
+func TestUnterminatedMultiLineComment(t *testing.T) {
+ ts := Tokenize("( /* hello world/ * *\n )")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, ErrorUnterminatedComment, ts.PeekNext().Kind)
+}
+
+// TestUnterminatedMultiLineCommentAtStar tests that if the string ends at a *
+// (which could be the beginning of the close of a multiline comment) the right
+// error is emitted.
+func TestUnterminatedMultiLineCommentAtStar(t *testing.T) {
+ ts := Tokenize("( /* hello world/ *")
+ checkEq(t, LParen, ts.PeekNext().Kind)
+ ts.ConsumeNext()
+ checkEq(t, ErrorUnterminatedComment, ts.PeekNext().Kind)
+}
diff --git a/mojom/mojom_parser/lexer/token_stream.go b/mojom/mojom_parser/lexer/token_stream.go
new file mode 100644
index 0000000..beb399a
--- /dev/null
+++ b/mojom/mojom_parser/lexer/token_stream.go
@@ -0,0 +1,51 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// TokenStream is the interface between the lexer and the parser. The lexer
+// creates a TokenStream which the parser consumes.
+
+package lexer
+
+type TokenStream interface {
+ // Returns the next Token in the stream without advancing the cursor,
+ // or returns the EOF token if the cursor is already past the end.
+ PeekNext() Token
+
+ // Advances the cursor in the stream or does nothing if the cursor is
+ // already past the end of the stream.
+ ConsumeNext()
+}
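+
+// For example, a minimal sketch of the peek/consume contract, where ts is a
+// TokenStream obtained from Tokenize and handleToken is a hypothetical
+// parser callback:
+//
+//   for !ts.PeekNext().EOF() {
+//     handleToken(ts.PeekNext()) // Peeking again returns the same token.
+//     ts.ConsumeNext()           // Advance past the current token.
+//   }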
+
+// The EOF token is returned by TokenStream to signal the end of the stream.
+var eofToken = Token{Kind: EOF}
+
+// *TokenChan implements TokenStream.
+// This implementation uses an unbuffered channel to pass the tokens from the
+// lexer to the parser. One end of the channel is held by the lexer and the
+// other is in the TokenChan object that is passed to the parser.
+type TokenChan struct {
+ tokenChan chan Token
+ nextToken Token
+ // read is true if a token has been read out of the channel into nextToken.
+ read bool
+}
+
+// See TokenStream.
+func (s *TokenChan) PeekNext() (token Token) {
+ if !s.read {
+ s.read = true
+ s.ConsumeNext()
+ }
+
+ return s.nextToken
+}
+
+// See TokenStream.
+func (s *TokenChan) ConsumeNext() {
+ if t, open := <-s.tokenChan; open {
+ s.nextToken = t
+ } else {
+ s.nextToken = eofToken
+ }
+}
diff --git a/mojom/mojom_parser/lexer/tokens.go b/mojom/mojom_parser/lexer/tokens.go
new file mode 100644
index 0000000..2ed9b5c
--- /dev/null
+++ b/mojom/mojom_parser/lexer/tokens.go
@@ -0,0 +1,218 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// TokenKind is a type which describes the kinds of tokens which can be
+// encountered in a mojom file.
+
+package lexer
+
+import (
+ "fmt"
+)
+
+type TokenKind int
+
+// TokenKinds
+const (
+ // An error of an unknown nature has occurred.
+ ErrorUnknown TokenKind = iota
+ // A character was found which is not part of a valid token.
+ ErrorIllegalChar
+ // A quoted string was opened but not closed.
+ ErrorUnterminatedStringLiteral
+ // A multiline comment was opened but not closed.
+ ErrorUnterminatedComment
+ // Indicates the end of a stream of tokens.
+ EOF
+
+ // Punctuators and Separators
+ LParen
+ RParen
+ LBracket
+ RBracket
+ LBrace
+ RBrace
+ LAngle
+ RAngle
+ Semi
+ Comma
+ Dot
+ Minus
+ Plus
+ Amp
+ Qstn
+ Equals
+ Response
+
+ // Names
+ Name
+
+ // Keywords
+ Import
+ Module
+ Struct
+ Union
+ Interface
+ Enum
+ Const
+ True
+ False
+ Default
+
+ // Constants
+ IntConstDec
+ IntConstHex
+ FloatConst
+ Ordinal
+ StringLiteral
+)
+
+// String is used to generate user-facing strings in compilation error
+// messages. For example for LBrace we produce the string "'{'". Notice the
+// single-quotes. This will be used for example in an error message that looks
+// like the following:
+// Unexpected token at line 5, column 6: '###'. Expecting '{'.
+func (tokenKind TokenKind) String() string {
+ switch tokenKind {
+ // Errors
+ case ErrorUnknown:
+ return "unknown token"
+ case ErrorIllegalChar:
+ return "illegal token"
+ case ErrorUnterminatedStringLiteral:
+ return "unterminated string literal"
+ case ErrorUnterminatedComment:
+ return "unterminated comment"
+
+ // End of file
+ case EOF:
+ return "eof"
+
+ // Punctuators and Separators
+ case LParen:
+ return "'('"
+ case RParen:
+ return "')'"
+ case LBracket:
+ return "'['"
+ case RBracket:
+ return "']'"
+ case LBrace:
+ return "'{'"
+ case RBrace:
+ return "'}'"
+ case LAngle:
+ return "'<'"
+ case RAngle:
+ return "'>'"
+ case Semi:
+ return "';'"
+ case Comma:
+ return "','"
+ case Dot:
+ return "'.'"
+ case Minus:
+ return "'-'"
+ case Plus:
+ return "'+'"
+ case Amp:
+ return "'&'"
+ case Qstn:
+ return "'?'"
+ case Equals:
+ return "'='"
+ case Response:
+ return "'=>'"
+
+ // Names
+ case Name:
+ return "a name"
+
+ // Keywords
+ case Import:
+ return "'import'"
+ case Module:
+ return "'module'"
+ case Struct:
+ return "'struct'"
+ case Union:
+ return "'union'"
+ case Interface:
+ return "'interface'"
+ case Enum:
+ return "'enum'"
+ case Const:
+ return "'const'"
+ case True:
+ return "'true'"
+ case False:
+ return "'false'"
+ case Default:
+ return "'default'"
+
+ // Constants
+ case IntConstDec:
+ return "decimal integer literal"
+ case IntConstHex:
+ return "hex integer literal"
+ case FloatConst:
+ return "float literal"
+ case Ordinal:
+ return "an ordinal"
+ case StringLiteral:
+ return "a string literal"
+
+ default:
+ // Note(rudominer) It is important to use %d below so as to avoid
+ // re-invoking this method and causing an infinite recursion.
+ return fmt.Sprintf("%d", tokenKind)
+ }
+}
+
+type Token struct {
+ Kind TokenKind
+ Text string
+ // CharPos is the number of runes preceding the token.
+ CharPos int
+ // LineNo is the line on which the token is found. (First line is 0.)
+ LineNo int
+ // LinePos is the number of runes preceding the token on its line.
+ LinePos int
+}
+
+// ShortLocationString is used to generate user-facing strings in compilation
+// error messages. This will be used for example in an error message that looks
+// like the following:
+// Unexpected token at line 5, column 6: '###'. Expecting '{'.
+func (t Token) ShortLocationString() string {
+ return fmt.Sprintf("%d,%d", t.LineNo+1, t.LinePos+1)
+}
+
+// LongLocationString generates a longer user-facing description of the
+// token's location, e.g. "line 5, column 6".
+func (t Token) LongLocationString() string {
+ return fmt.Sprintf("line %d, column %d", t.LineNo+1, t.LinePos+1)
+}
+
+// EOF returns true if the token on which it is called represents the end of
+// the token stream.
+func (t Token) EOF() bool {
+ return t.Kind == EOF
+}
+
+// String is used to generate user-facing strings in compilation error
+// messages. For many token kinds the TokenKind.String() method will produce
+// good results for representing the token. But for other TokenKinds we will
+// want to include some information besides a representation of the kind.
+// For example, for an ErrorUnknown kind we want to show the text.
+// This will be used for example in an error message that looks
+// like the following:
+// Unexpected token at line 5, column 6: '###'. Expecting '{'.
+func (token Token) String() string {
+ switch token.Kind {
+ case ErrorUnknown, Name, StringLiteral, IntConstDec, IntConstHex, FloatConst, Ordinal:
+ return fmt.Sprintf("'%s'", token.Text)
+
+ default:
+ return token.Kind.String()
+ }
+}