// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "mojom/lexer.h"
#include <map>
#include <string>
#include "base/lazy_instance.h"
namespace mojo {
namespace mojom {
namespace {
class KeywordsDict {
public:
KeywordsDict();
private:
std::map<std::string, mojom::TokenType> keywords_;
friend std::map<std::string, mojom::TokenType>& Keywords();
DISALLOW_COPY_AND_ASSIGN(KeywordsDict);
};
static base::LazyInstance<KeywordsDict> g_keywords = LAZY_INSTANCE_INITIALIZER;
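// Returns the keyword table, built lazily on first use; base::LazyInstance
// guarantees the KeywordsDict constructor runs at most once.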
std::map<std::string, mojom::TokenType>& Keywords() {
return g_keywords.Get().keywords_;
}
KeywordsDict::KeywordsDict() {
keywords_["import"] = TokenType::IMPORT;
keywords_["module"] = TokenType::MODULE;
keywords_["struct"] = TokenType::STRUCT;
keywords_["union"] = TokenType::UNION;
keywords_["interface"] = TokenType::INTERFACE;
keywords_["enum"] = TokenType::ENUM;
keywords_["const"] = TokenType::CONST;
keywords_["true"] = TokenType::TRUE;
keywords_["false"] = TokenType::FALSE;
keywords_["default"] = TokenType::DEFAULT;
}
// Non-localized version of isalpha.
bool IsAlpha(char c) {
return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'));
}
// Non-localized version of isdigit.
bool IsDigit(char c) {
return ('0' <= c && c <= '9');
}
bool IsHexDigit(char c) {
return (IsDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'));
}
// Non-localized version of isalnum.
bool IsAlnum(char c) {
return IsAlpha(c) || IsDigit(c);
}
// MojomLexer tokenizes a mojom source file. It is NOT thread-safe.
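// Typical use (a minimal sketch):
//   MojomLexer lexer("struct Foo {};");
//   std::vector<Token> tokens = lexer.Tokenize();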
class MojomLexer {
public:
explicit MojomLexer(const std::string& source);
~MojomLexer();
// Returns the list of tokens in the source file.
std::vector<Token> Tokenize();
private:
// The GetNextToken.* functions all return true if they could find a token
// (even an error token) and false otherwise.
bool GetNextToken(Token* result);
bool GetNextTokenSingleChar(Token* result);
bool GetNextTokenEqualsOrResponse(Token* result);
bool GetNextTokenIdentifier(Token* result);
bool GetNextTokenDecConst(Token* result);
bool GetNextTokenHexConst(Token* result);
bool GetNextTokenOrdinal(Token* result);
bool GetNextTokenStringLiteral(Token* result);
void ConsumeSkippable();
void ConsumeDigits();
void ConsumeEol();
void Consume(size_t num);
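  // Returns true if offset_ + offset_plus is at or past the end of source_.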
bool eos(size_t offset_plus) {
return offset_ + offset_plus >= source_.size();
}
  // The source text being tokenized.
  const std::string source_;
  // Absolute offset into source_ of the next character to examine.
  size_t offset_;
  // Zero-based line number of the current position.
  size_t line_no_;
  // Offset of the current position within the current line.
  size_t offset_in_line_;
DISALLOW_COPY_AND_ASSIGN(MojomLexer);
};
std::vector<Token> MojomLexer::Tokenize() {
offset_ = 0;
line_no_ = 0;
offset_in_line_ = 0;
std::vector<Token> result;
Token cur;
while (GetNextToken(&cur)) {
result.push_back(cur);
// As soon as an error token is found, stop tokenizing.
if (cur.error()) {
break;
}
}
return result;
}
bool MojomLexer::GetNextToken(Token* result) {
// Skip all spaces which may be in front of the next token.
ConsumeSkippable();
  // If we have reached the end of the source, signal that no token was found.
if (eos(0))
return false;
// Save the current position in the source code.
result->char_pos = offset_;
result->line_no = line_no_;
result->line_pos = offset_in_line_;
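  // Hex constants are tried before decimal constants so that the leading '0'
  // of a "0x..." literal is not lexed as a decimal zero.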
  if (GetNextTokenSingleChar(result) || GetNextTokenEqualsOrResponse(result) ||
      GetNextTokenIdentifier(result) || GetNextTokenHexConst(result) ||
      GetNextTokenDecConst(result) || GetNextTokenOrdinal(result) ||
      GetNextTokenStringLiteral(result))
return true;
result->token = source_.substr(offset_, 1);
result->token_type = TokenType::ERROR_ILLEGAL_CHAR;
return true;
}
void MojomLexer::ConsumeSkippable() {
if (eos(0))
return;
bool found_non_space = false;
while (!found_non_space && !eos(0)) {
switch (source_[offset_]) {
case ' ':
case '\t':
case '\r':
Consume(1);
break;
case '\n':
ConsumeEol();
break;
default:
found_non_space = true;
break;
}
}
}
// Finds all single-character tokens except for '='.
bool MojomLexer::GetNextTokenSingleChar(Token* result) {
switch (source_[offset_]) {
case '(':
result->token_type = TokenType::LPAREN;
break;
case ')':
result->token_type = TokenType::RPAREN;
break;
case '[':
result->token_type = TokenType::LBRACKET;
break;
case ']':
result->token_type = TokenType::RBRACKET;
break;
case '{':
result->token_type = TokenType::LBRACE;
break;
case '}':
result->token_type = TokenType::RBRACE;
break;
case '<':
result->token_type = TokenType::LANGLE;
break;
case '>':
result->token_type = TokenType::RANGLE;
break;
case ';':
result->token_type = TokenType::SEMI;
break;
case ',':
result->token_type = TokenType::COMMA;
break;
case '.':
result->token_type = TokenType::DOT;
break;
case '-':
result->token_type = TokenType::MINUS;
break;
case '+':
result->token_type = TokenType::PLUS;
break;
case '&':
result->token_type = TokenType::AMP;
break;
case '?':
result->token_type = TokenType::QSTN;
break;
    default:
      return false;
  }
result->token = source_.substr(offset_, 1);
Consume(1);
return true;
}
// Finds '=' or '=>'.
bool MojomLexer::GetNextTokenEqualsOrResponse(Token* result) {
if (source_[offset_] != '=')
return false;
Consume(1);
if (eos(0) || source_[offset_] != '>') {
result->token_type = TokenType::EQUALS;
result->token = "=";
} else {
result->token_type = TokenType::RESPONSE;
result->token = "=>";
Consume(1);
}
return true;
}
// valid C identifiers (K&R2: A.2.3)
bool MojomLexer::GetNextTokenIdentifier(Token* result) {
char c = source_[offset_];
// Identifiers start with a letter or underscore.
if (!(IsAlpha(c) || c == '_'))
return false;
size_t start_offset = offset_;
  // Identifiers contain letters, digits and underscores.
  while (!eos(0) && (IsAlnum(source_[offset_]) || source_[offset_] == '_'))
Consume(1);
result->token = source_.substr(start_offset, offset_ - start_offset);
result->token_type = TokenType::IDENTIFIER;
if (Keywords().count(result->token))
result->token_type = Keywords()[result->token];
return true;
}
// integer constants (K&R2: A.2.5.1) dec
// floating constants (K&R2: A.2.5.3)
bool MojomLexer::GetNextTokenDecConst(Token* result) {
if (!IsDigit(source_[offset_]))
return false;
result->token_type = TokenType::INT_CONST_DEC;
// If the number starts with a zero and is not a floating point number.
if (source_[offset_] == '0' &&
(eos(1) || (source_[offset_] == 'e' && source_[offset_] == 'E' &&
source_[offset_] == '.'))) {
// TODO(azani): Catch and error on octal.
result->token = "0";
Consume(1);
return true;
}
size_t start_offset = offset_;
// First, we consume all the digits.
ConsumeDigits();
// If there is a fractional part, we consume the . and the following digits.
if (!eos(0) && source_[offset_] == '.') {
result->token_type = TokenType::FLOAT_CONST;
Consume(1);
ConsumeDigits();
}
// If there is an exponential part, we consume the e and the following digits.
if (!eos(0) && (source_[offset_] == 'e' || source_[offset_] == 'E')) {
if (!eos(2) && (source_[offset_ + 1] == '-' || source_[offset_ + 1]) &&
IsDigit(source_[offset_ + 2])) {
result->token_type = TokenType::FLOAT_CONST;
Consume(2); // Consume e/E and +/-
ConsumeDigits();
} else if (!eos(1) && IsDigit(source_[offset_ + 1])) {
result->token_type = TokenType::FLOAT_CONST;
Consume(1); // Consume e/E
ConsumeDigits();
}
}
result->token = source_.substr(start_offset, offset_ - start_offset);
return true;
}
// integer constants (K&R2: A.2.5.1) hex
bool MojomLexer::GetNextTokenHexConst(Token* result) {
  // Hex numbers start with '0', then 'x' or 'X', then at least one hex digit.
if (eos(2) || source_[offset_] != '0' ||
(source_[offset_ + 1] != 'x' && source_[offset_ + 1] != 'X') ||
!IsHexDigit(source_[offset_ + 2]))
return false;
result->token_type = TokenType::INT_CONST_HEX;
size_t start_offset = offset_;
Consume(2);
  while (!eos(0) && IsHexDigit(source_[offset_]))
Consume(1);
result->token = source_.substr(start_offset, offset_ - start_offset);
return true;
}
bool MojomLexer::GetNextTokenOrdinal(Token* result) {
// Ordinals start with '@' and then some digit.
if (eos(1) || source_[offset_] != '@' || !IsDigit(source_[offset_ + 1]))
return false;
size_t start_offset = offset_;
// Consumes '@'.
Consume(1);
result->token_type = TokenType::ORDINAL;
ConsumeDigits();
result->token = source_.substr(start_offset, offset_ - start_offset);
return true;
}
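// String literals are double-quoted. A backslash escapes the following
// character (so \" does not terminate the literal), and escape sequences are
// kept verbatim in the token text.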
bool MojomLexer::GetNextTokenStringLiteral(Token* result) {
  // String literals start and end with a double quote.
if (source_[offset_] != '"')
return false;
size_t start_offset = offset_;
// Consumes '"'.
Consume(1);
while (source_[offset_] != '"') {
    if (eos(0) || source_[offset_] == '\n') {
result->token_type = TokenType::ERROR_UNTERMINATED_STRING_LITERAL;
result->token = source_.substr(start_offset, offset_ - start_offset);
return true;
}
// This block will be skipped if the backslash is at the end of the source.
if (source_[offset_] == '\\' && !eos(1)) {
// Consume the backslash. This will ensure \" is consumed.
Consume(1);
}
Consume(1);
}
  // Consume the closing double quote.
Consume(1);
result->token_type = TokenType::STRING_LITERAL;
result->token = source_.substr(start_offset, offset_ - start_offset);
return true;
}
void MojomLexer::ConsumeDigits() {
while (!eos(0) && IsDigit(source_[offset_]))
Consume(1);
}
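// Consumes a single end-of-line character, advancing the (zero-based) line
// counter and resetting the offset within the line.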
void MojomLexer::ConsumeEol() {
++offset_;
++line_no_;
offset_in_line_ = 0;
}
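// Advances both the absolute offset and the offset within the current line by
// |num|; newlines should go through ConsumeEol() instead so that line
// accounting stays correct.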
void MojomLexer::Consume(size_t num) {
offset_ += num;
offset_in_line_ += num;
}
MojomLexer::MojomLexer(const std::string& source)
: source_(source), offset_(0), line_no_(0), offset_in_line_(0) {
}
MojomLexer::~MojomLexer() {
}
} // namespace
Token::Token()
: token_type(TokenType::ERROR_UNKNOWN),
char_pos(0),
line_no(0),
line_pos(0) {
}
Token::~Token() {
}
// Accepts the text of a mojom file and returns the ordered list of tokens
// found in the file.
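// For example, Tokenize("struct Foo;") produces tokens of type STRUCT,
// IDENTIFIER and SEMI, in that order.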
std::vector<Token> Tokenize(const std::string& source) {
return MojomLexer(source).Tokenize();
}
} // namespace mojom
} // namespace mojo