| /* |
| * Copyright (C) 2008 Apple Inc. All Rights Reserved. |
| * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ |
| * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "sky/engine/config.h" |
| #include "sky/engine/core/html/parser/HTMLTokenizer.h" |
| |
| #include "gen/sky/core/HTMLNames.h" |
| #include "sky/engine/core/html/parser/AtomicHTMLToken.h" |
| #include "sky/engine/core/html/parser/HTMLEntityParser.h" |
| #include "sky/engine/core/html/parser/HTMLParserIdioms.h" |
| #include "sky/engine/core/html/parser/HTMLTreeBuilder.h" |
| #include "sky/engine/core/html/parser/MarkupTokenizerInlines.h" |
| #include "sky/engine/platform/NotImplemented.h" |
| #include "sky/engine/wtf/ASCIICType.h" |
| #include "sky/engine/wtf/text/AtomicString.h" |
| #include "sky/engine/wtf/unicode/Unicode.h" |
| |
| // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used |
| // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe. |
| #undef DEFINE_STATIC_LOCAL |
| |
| namespace blink { |
| |
| // This has to go in a .cpp file, as the linker doesn't like it being included more than once. |
| // We don't have an HTMLToken.cpp though, so this is the next best place. |
| QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const |
| { |
| return QualifiedName(AtomicString(attribute.name)); |
| } |
| |
| bool AtomicHTMLToken::usesName() const |
| { |
| return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; |
| } |
| |
| bool AtomicHTMLToken::usesAttributes() const |
| { |
| return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; |
| } |
| |
| static inline bool isEndTagBufferingState(HTMLTokenizer::State state) |
| { |
| return state == HTMLTokenizer::RawDataEndTagOpenState || state == HTMLTokenizer::RawDataEndTagNameState; |
| } |
| |
| #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName) |
| #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName) |
| #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName) |
| #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName) |
| |
| HTMLTokenizer::HTMLTokenizer() |
| : m_inputStreamPreprocessor(this) |
| { |
| reset(); |
| } |
| |
| HTMLTokenizer::~HTMLTokenizer() |
| { |
| } |
| |
| void HTMLTokenizer::reset() |
| { |
| m_state = HTMLTokenizer::DataState; |
| m_token = 0; |
| } |
| |
| bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) |
| { |
| ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); |
| source.advanceAndUpdateLineNumber(); |
| if (m_token->type() == HTMLToken::Character) |
| return true; |
| m_token->beginEndTag(m_temporaryBuffer); |
| m_appropriateEndTagName.clear(); |
| m_temporaryBuffer.clear(); |
| return false; |
| } |
| |
| #define FLUSH_AND_ADVANCE_TO(stateName) \ |
| do { \ |
| m_state = HTMLTokenizer::stateName; \ |
| if (flushBufferedEndTag(source)) \ |
| return true; \ |
| if (source.isEmpty() \ |
| || !m_inputStreamPreprocessor.peek(source)) \ |
| return haveBufferedCharacterToken(); \ |
| cc = m_inputStreamPreprocessor.nextInputCharacter(); \ |
| goto stateName; \ |
| } while (false) |
| |
| bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) |
| { |
| m_state = state; |
| flushBufferedEndTag(source); |
| return true; |
| } |
| |
| bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) |
| { |
| // If we have a token in progress, then we're supposed to be called back |
| // with the same token so we can finish it. |
| ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); |
| m_token = &token; |
| |
| if (!m_temporaryBuffer.isEmpty() && !isEndTagBufferingState(m_state)) { |
| // FIXME: This should call flushBufferedEndTag(). |
| // We started an end tag during our last iteration. |
| m_token->beginEndTag(m_temporaryBuffer); |
| m_appropriateEndTagName.clear(); |
| m_temporaryBuffer.clear(); |
| if (m_state == HTMLTokenizer::DataState) { |
| // We're back in the data state, so we must be done with the tag. |
| return true; |
| } |
| } |
| |
| if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) |
| return haveBufferedCharacterToken(); |
| UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| |
| // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 |
| switch (m_state) { |
| HTML_BEGIN_STATE(DataState) { |
| if (cc == '&') { |
| m_returnState = DataState; |
| m_entityParser.reset(); |
| HTML_ADVANCE_TO(CharacterReferenceInDataState); |
| } else if (cc == '<') { |
| if (m_token->type() == HTMLToken::Character) { |
| // We have a bunch of character tokens queued up that we |
| // are emitting lazily here. |
| return true; |
| } |
| HTML_ADVANCE_TO(TagOpenState); |
| } else if (cc == kEndOfFileMarker) { |
| return emitEndOfFile(source); |
| } else { |
| bufferCharacter(cc); |
| HTML_ADVANCE_TO(DataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CharacterReferenceInDataState) { |
| if (!m_entityParser.parse(source)) |
| return haveBufferedCharacterToken(); |
| for (const UChar& entityCharacter : m_entityParser.result()) |
| bufferCharacter(entityCharacter); |
| cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| ASSERT(m_returnState == m_returnState); |
| HTML_SWITCH_TO(DataState); |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { |
| if (!m_entityParser.parse(source)) |
| return haveBufferedCharacterToken(); |
| for (const UChar& entityCharacter : m_entityParser.result()) |
| m_token->appendToAttributeValue(entityCharacter); |
| cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| |
| if (m_returnState == AttributeValueDoubleQuotedState) |
| HTML_SWITCH_TO(AttributeValueDoubleQuotedState); |
| else if (m_returnState == AttributeValueSingleQuotedState) |
| HTML_SWITCH_TO(AttributeValueSingleQuotedState); |
| else if (m_returnState == AttributeValueUnquotedState) |
| HTML_SWITCH_TO(AttributeValueUnquotedState); |
| else |
| ASSERT_NOT_REACHED(); |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(RawDataState) { |
| if (cc == '<') { |
| HTML_ADVANCE_TO(RawDataLessThanSignState); |
| } else { |
| bufferCharacter(cc); |
| HTML_ADVANCE_TO(RawDataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(RawDataLessThanSignState) { |
| if (cc == '/') { |
| m_temporaryBuffer.clear(); |
| HTML_ADVANCE_TO(RawDataEndTagOpenState); |
| } else { |
| bufferCharacter('<'); |
| HTML_RECONSUME_IN(RawDataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(RawDataEndTagOpenState) { |
| if (isASCIILower(cc)) { |
| m_temporaryBuffer.append(static_cast<LChar>(cc)); |
| HTML_ADVANCE_TO(RawDataEndTagNameState); |
| } else { |
| bufferCharacter('<'); |
| bufferCharacter('/'); |
| HTML_RECONSUME_IN(RawDataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(RawDataEndTagNameState) { |
| if (isASCIILower(cc)) { |
| m_temporaryBuffer.append(static_cast<LChar>(cc)); |
| HTML_ADVANCE_TO(RawDataEndTagNameState); |
| } else { |
| if (isTokenizerWhitespace(cc)) { |
| if (isAppropriateEndTag()) |
| FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); |
| } else if (cc == '/') { |
| if (isAppropriateEndTag()) |
| FLUSH_AND_ADVANCE_TO(VoidTagState); |
| } else if (cc == '>') { |
| if (isAppropriateEndTag()) |
| return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); |
| } |
| bufferCharacter('<'); |
| bufferCharacter('/'); |
| m_token->appendToCharacter(m_temporaryBuffer); |
| m_temporaryBuffer.clear(); |
| HTML_RECONSUME_IN(RawDataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(TagOpenState) { |
| if (cc == '!') { |
| HTML_ADVANCE_TO(CommentStart1State); |
| } else if (cc == '/') { |
| HTML_ADVANCE_TO(CloseTagState); |
| } else if (isTokenizerTagName(cc)) { |
| m_token->beginStartTag(static_cast<LChar>(cc)); |
| HTML_ADVANCE_TO(TagNameState); |
| } else { |
| bufferCharacter('<'); |
| HTML_RECONSUME_IN(DataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CloseTagState) { |
| if (isTokenizerTagName(cc)) { |
| m_token->beginEndTag(static_cast<LChar>(cc)); |
| HTML_ADVANCE_TO(TagNameState); |
| } else if (cc == '>') { |
| bufferCharacter('<'); |
| bufferCharacter('/'); |
| bufferCharacter('>'); |
| HTML_ADVANCE_TO(DataState); |
| } else { |
| bufferCharacter('<'); |
| bufferCharacter('/'); |
| HTML_RECONSUME_IN(DataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(TagNameState) { |
| if (isTokenizerWhitespace(cc)) { |
| HTML_ADVANCE_TO(BeforeAttributeNameState); |
| } else if (cc == '/') { |
| HTML_ADVANCE_TO(VoidTagState); |
| } else if (cc == '>') { |
| return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| } else { |
| m_token->appendToName(cc); |
| HTML_ADVANCE_TO(TagNameState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(BeforeAttributeNameState) { |
| if (isTokenizerWhitespace(cc)) { |
| HTML_ADVANCE_TO(BeforeAttributeNameState); |
| } else if (cc == '/') { |
| HTML_ADVANCE_TO(VoidTagState); |
| } else if (cc == '>') { |
| return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| } else { |
| m_token->addNewAttribute(); |
| m_token->beginAttributeName(source.numberOfCharactersConsumed()); |
| m_token->appendToAttributeName(cc); |
| HTML_ADVANCE_TO(AttributeNameState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(AttributeNameState) { |
| if (isTokenizerWhitespace(cc)) { |
| m_token->endAttributeName(source.numberOfCharactersConsumed()); |
| HTML_ADVANCE_TO(AfterAttributeNameState); |
| } else if (cc == '/') { |
| m_token->endAttributeName(source.numberOfCharactersConsumed()); |
| HTML_ADVANCE_TO(VoidTagState); |
| } else if (cc == '=') { |
| m_token->endAttributeName(source.numberOfCharactersConsumed()); |
| HTML_ADVANCE_TO(BeforeAttributeValueState); |
| } else if (cc == '>') { |
| m_token->endAttributeName(source.numberOfCharactersConsumed()); |
| return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| } else { |
| m_token->appendToAttributeName(cc); |
| HTML_ADVANCE_TO(AttributeNameState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(AfterAttributeNameState) { |
| if (isTokenizerWhitespace(cc)) { |
| HTML_ADVANCE_TO(AfterAttributeNameState); |
| } else if (cc == '/') { |
| HTML_ADVANCE_TO(VoidTagState); |
| } else if (cc == '=') { |
| HTML_ADVANCE_TO(BeforeAttributeValueState); |
| } else if (cc == '>') { |
| return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| } else { |
| m_token->addNewAttribute(); |
| m_token->beginAttributeName(source.numberOfCharactersConsumed()); |
| m_token->appendToAttributeName(cc); |
| HTML_ADVANCE_TO(AttributeNameState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(BeforeAttributeValueState) { |
| if (isTokenizerWhitespace(cc)) |
| HTML_ADVANCE_TO(BeforeAttributeValueState); |
| else if (cc == '"') { |
| m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); |
| HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); |
| } else if (cc == '&') { |
| m_token->beginAttributeValue(source.numberOfCharactersConsumed()); |
| HTML_RECONSUME_IN(AttributeValueUnquotedState); |
| } else if (cc == '\'') { |
| m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); |
| HTML_ADVANCE_TO(AttributeValueSingleQuotedState); |
| } else if (cc == '>') { |
| return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| } else { |
| m_token->beginAttributeValue(source.numberOfCharactersConsumed()); |
| m_token->appendToAttributeValue(cc); |
| HTML_ADVANCE_TO(AttributeValueUnquotedState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { |
| if (cc == '"') { |
| m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| HTML_ADVANCE_TO(BeforeAttributeNameState); |
| } else if (cc == '&') { |
| m_returnState = AttributeValueDoubleQuotedState; |
| m_entityParser.reset(); |
| HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
| } else { |
| m_token->appendToAttributeValue(cc); |
| HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { |
| if (cc == '\'') { |
| m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| HTML_ADVANCE_TO(BeforeAttributeNameState); |
| } else if (cc == '&') { |
| m_returnState = AttributeValueSingleQuotedState; |
| m_entityParser.reset(); |
| HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
| } else { |
| m_token->appendToAttributeValue(cc); |
| HTML_ADVANCE_TO(AttributeValueSingleQuotedState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(AttributeValueUnquotedState) { |
| if (isTokenizerWhitespace(cc)) { |
| m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| HTML_ADVANCE_TO(BeforeAttributeNameState); |
| } else if (cc == '&') { |
| m_returnState = AttributeValueUnquotedState; |
| m_entityParser.reset(); |
| HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
| } else if (cc == '>') { |
| m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| } else { |
| m_token->appendToAttributeValue(cc); |
| HTML_ADVANCE_TO(AttributeValueUnquotedState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(VoidTagState) { |
| if (cc == '>') { |
| m_token->setSelfClosing(); |
| return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| } else { |
| HTML_RECONSUME_IN(BeforeAttributeNameState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CommentStart1State) { |
| if (cc == '-') { |
| HTML_ADVANCE_TO(CommentStart2State); |
| } else { |
| bufferCharacter('<'); |
| bufferCharacter('!'); |
| HTML_RECONSUME_IN(DataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CommentStart2State) { |
| if (cc == '-') { |
| HTML_ADVANCE_TO(CommentState); |
| } else { |
| bufferCharacter('<'); |
| bufferCharacter('!'); |
| bufferCharacter('-'); |
| HTML_RECONSUME_IN(DataState); |
| } |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CommentState) { |
| if (cc == '-') |
| HTML_ADVANCE_TO(CommentEnd1State); |
| else |
| HTML_ADVANCE_TO(CommentState); |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CommentEnd1State) { |
| if (cc == '-') |
| HTML_ADVANCE_TO(CommentEnd2State); |
| else |
| HTML_ADVANCE_TO(CommentState); |
| } |
| END_STATE() |
| |
| HTML_BEGIN_STATE(CommentEnd2State) { |
| if (cc == '-') |
| HTML_ADVANCE_TO(CommentEnd2State); |
| else if (cc == '>') |
| HTML_ADVANCE_TO(DataState); |
| else |
| HTML_ADVANCE_TO(CommentState); |
| } |
| END_STATE() |
| } |
| |
| ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| inline bool HTMLTokenizer::isAppropriateEndTag() |
| { |
| if (m_temporaryBuffer.size() != m_appropriateEndTagName.size()) |
| return false; |
| |
| size_t numCharacters = m_temporaryBuffer.size(); |
| |
| for (size_t i = 0; i < numCharacters; i++) { |
| if (m_temporaryBuffer[i] != m_appropriateEndTagName[i]) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| inline void HTMLTokenizer::parseError() |
| { |
| notImplemented(); |
| } |
| |
| } |