| /* |
| * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
| * Copyright (C) 2011 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "sky/engine/config.h" |
| #include "sky/engine/core/html/parser/HTMLConstructionSite.h" |
| |
| #include <limits> |
| #include "gen/sky/core/HTMLElementFactory.h" |
| #include "sky/engine/core/dom/DocumentFragment.h" |
| #include "sky/engine/core/dom/Element.h" |
| #include "sky/engine/core/dom/Text.h" |
| #include "sky/engine/core/frame/LocalFrame.h" |
| #include "sky/engine/core/html/HTMLScriptElement.h" |
| #include "sky/engine/core/html/HTMLTemplateElement.h" |
| #include "sky/engine/core/html/parser/AtomicHTMLToken.h" |
| #include "sky/engine/core/html/parser/HTMLParserIdioms.h" |
| #include "sky/engine/core/html/parser/HTMLToken.h" |
| #include "sky/engine/core/loader/FrameLoaderClient.h" |
| #include "sky/engine/platform/NotImplemented.h" |
| #include "sky/engine/platform/text/TextBreakIterator.h" |
| |
| namespace blink { |
| |
// Threshold for the open-element stack depth. When the stack is deeper than
// this, attachLater() attaches a new node to its intended parent's parent
// (i.e. as a sibling of the parent) instead of nesting it further.
| static const unsigned maximumHTMLParserDOMTreeDepth = 512; |
| |
| static inline void setAttributes(Element* element, AtomicHTMLToken* token) |
| { |
| element->parserSetAttributes(token->attributes()); |
| } |
| |
| static bool shouldUseLengthLimit(const ContainerNode& node) |
| { |
| return !isHTMLScriptElement(node) |
| && !isHTMLStyleElement(node); |
| } |
| |
| static unsigned textLengthLimitForContainer(const ContainerNode& node) |
| { |
| return shouldUseLengthLimit(node) ? Text::defaultLengthLimit : std::numeric_limits<unsigned>::max(); |
| } |
| |
| static inline bool isAllWhitespace(const String& string) |
| { |
| return string.isAllSpecialCharacters<isHTMLSpace<UChar> >(); |
| } |
| |
| static inline void insert(HTMLConstructionSiteTask& task) |
| { |
| if (isHTMLTemplateElement(*task.parent)) |
| task.parent = toHTMLTemplateElement(task.parent.get())->content(); |
| task.parent->parserAppendChild(task.child.get()); |
| } |
| |
| static inline void executeInsertTask(HTMLConstructionSiteTask& task) |
| { |
| ASSERT(task.operation == HTMLConstructionSiteTask::Insert); |
| insert(task); |
| } |
| |
| static inline void executeInsertTextTask(HTMLConstructionSiteTask& task) |
| { |
| ASSERT(task.operation == HTMLConstructionSiteTask::InsertText); |
| ASSERT(task.child->isTextNode()); |
| |
| // Merge text nodes into previous ones if possible: |
| // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#insert-a-character |
| Text* newText = toText(task.child.get()); |
| Node* previousChild = task.parent->lastChild(); |
| if (previousChild && previousChild->isTextNode()) { |
| Text* previousText = toText(previousChild); |
| unsigned lengthLimit = textLengthLimitForContainer(*task.parent); |
| if (previousText->length() + newText->length() < lengthLimit) { |
| previousText->parserAppendData(newText->data()); |
| return; |
| } |
| } |
| |
| insert(task); |
| } |
| |
| static inline void executeTask(HTMLConstructionSiteTask& task) |
| { |
| if (task.operation == HTMLConstructionSiteTask::Insert) |
| return executeInsertTask(task); |
| |
| ASSERT(task.operation == HTMLConstructionSiteTask::InsertText); |
| return executeInsertTextTask(task); |
| } |
| |
| // This is only needed for TextDocuments where we might have text nodes |
| // approaching the default length limit (~64k) and we don't want to |
| // break a text node in the middle of a combining character. |
| static unsigned findBreakIndexBetween(const StringBuilder& string, unsigned currentPosition, unsigned proposedBreakIndex) |
| { |
| ASSERT(currentPosition < proposedBreakIndex); |
| ASSERT(proposedBreakIndex <= string.length()); |
| // The end of the string is always a valid break. |
| if (proposedBreakIndex == string.length()) |
| return proposedBreakIndex; |
| |
| // Latin-1 does not have breakable boundaries. If we ever moved to a differnet 8-bit encoding this could be wrong. |
| if (string.is8Bit()) |
| return proposedBreakIndex; |
| |
| const UChar* breakSearchCharacters = string.characters16() + currentPosition; |
| // We need at least two characters look-ahead to account for UTF-16 surrogates, but can't search off the end of the buffer! |
| unsigned breakSearchLength = std::min(proposedBreakIndex - currentPosition + 2, string.length() - currentPosition); |
| NonSharedCharacterBreakIterator it(breakSearchCharacters, breakSearchLength); |
| |
| if (it.isBreak(proposedBreakIndex - currentPosition)) |
| return proposedBreakIndex; |
| |
| int adjustedBreakIndexInSubstring = it.preceding(proposedBreakIndex - currentPosition); |
| if (adjustedBreakIndexInSubstring > 0) |
| return currentPosition + adjustedBreakIndexInSubstring; |
| // We failed to find a breakable point, let the caller figure out what to do. |
| return 0; |
| } |
| |
| void HTMLConstructionSite::flushPendingText() |
| { |
| if (m_pendingText.isEmpty()) |
| return; |
| |
| PendingText pendingText; |
| // Hold onto the current pending text on the stack so that queueTask doesn't recurse infinitely. |
| m_pendingText.swap(pendingText); |
| ASSERT(m_pendingText.isEmpty()); |
| |
| // Splitting text nodes into smaller chunks contradicts HTML5 spec, but is necessary |
| // for performance, see: https://bugs.webkit.org/show_bug.cgi?id=55898 |
| unsigned lengthLimit = textLengthLimitForContainer(*pendingText.parent); |
| |
| unsigned currentPosition = 0; |
| const StringBuilder& string = pendingText.stringBuilder; |
| while (currentPosition < string.length()) { |
| unsigned proposedBreakIndex = std::min(currentPosition + lengthLimit, string.length()); |
| unsigned breakIndex = findBreakIndexBetween(string, currentPosition, proposedBreakIndex); |
| ASSERT(breakIndex <= string.length()); |
| String substring = string.substring(currentPosition, breakIndex - currentPosition); |
| |
| ASSERT(breakIndex > currentPosition); |
| ASSERT(breakIndex - currentPosition == substring.length()); |
| currentPosition = breakIndex; |
| |
| if (isAllWhitespace(substring)) { |
| // Ignore whitespace nodes not inside inside a <t>. If we're splitting |
| // a text node this isn't really a whitespace node and we can't ignore |
| // it either. |
| if (!m_openElements.preserveWhiteSpace() && string.length() == substring.length()) |
| continue; |
| |
| // Strings composed entirely of whitespace are likely to be repeated. |
| // Turn them into AtomicString so we share a single string for each. |
| substring = AtomicString(substring).string(); |
| } |
| |
| HTMLConstructionSiteTask task(HTMLConstructionSiteTask::InsertText); |
| task.parent = pendingText.parent; |
| task.child = Text::create(task.parent->document(), substring); |
| queueTask(task); |
| ASSERT(toText(task.child.get())->length() == substring.length()); |
| } |
| } |
| |
| void HTMLConstructionSite::queueTask(const HTMLConstructionSiteTask& task) |
| { |
| flushPendingText(); |
| ASSERT(m_pendingText.isEmpty()); |
| m_taskQueue.append(task); |
| } |
| |
| void HTMLConstructionSite::attachLater(ContainerNode* parent, PassRefPtr<Node> prpChild, bool selfClosing) |
| { |
| HTMLConstructionSiteTask task(HTMLConstructionSiteTask::Insert); |
| task.parent = parent; |
| task.child = prpChild; |
| task.selfClosing = selfClosing; |
| |
| // Add as a sibling of the parent if we have reached the maximum depth allowed. |
| if (m_openElements.stackDepth() > maximumHTMLParserDOMTreeDepth && task.parent->parentNode()) |
| task.parent = task.parent->parentNode(); |
| |
| ASSERT(task.parent); |
| queueTask(task); |
| } |
| |
| void HTMLConstructionSite::executeQueuedTasks() |
| { |
| // This has no affect on pendingText, and we may have pendingText |
| // remaining after executing all other queued tasks. |
| const size_t size = m_taskQueue.size(); |
| if (!size) |
| return; |
| |
| // Copy the task queue into a local variable in case executeTask |
| // re-enters the parser. |
| TaskQueue queue; |
| queue.swap(m_taskQueue); |
| |
| for (size_t i = 0; i < size; ++i) |
| executeTask(queue[i]); |
| |
| // We might be detached now. |
| } |
| |
| HTMLConstructionSite::HTMLConstructionSite(Document* document) |
| : m_document(document) |
| , m_attachmentRoot(document) |
| { |
| } |
| |
| HTMLConstructionSite::HTMLConstructionSite(DocumentFragment* fragment) |
| : m_document(&fragment->document()) |
| , m_attachmentRoot(fragment) |
| { |
| } |
| |
| HTMLConstructionSite::~HTMLConstructionSite() |
| { |
| // Depending on why we're being destroyed it might be OK |
| // to forget queued tasks, but currently we don't expect to. |
| ASSERT(m_taskQueue.isEmpty()); |
| // Currently we assume that text will never be the last token in the |
| // document and that we'll always queue some additional task to cause it to flush. |
| ASSERT(m_pendingText.isEmpty()); |
| } |
| |
| void HTMLConstructionSite::detach() |
| { |
| // FIXME: We'd like to ASSERT here that we're canceling and not just discarding |
| // text that really should have made it into the DOM earlier, but there |
| // doesn't seem to be a nice way to do that. |
| m_pendingText.discard(); |
| m_document = nullptr; |
| m_attachmentRoot = nullptr; |
| } |
| |
| void HTMLConstructionSite::processEndOfFile() |
| { |
| flush(); |
| openElements()->popAll(); |
| } |
| |
| void HTMLConstructionSite::finishedParsing() |
| { |
| // We shouldn't have any queued tasks but we might have pending text which we need to promote to tasks and execute. |
| ASSERT(m_taskQueue.isEmpty()); |
| flush(); |
| m_document->finishedParsing(); |
| } |
| |
| void HTMLConstructionSite::insertHTMLElement(AtomicHTMLToken* token) |
| { |
| RefPtr<Element> element = createElement(token); |
| attachLater(currentNode(), element); |
| m_openElements.push(element.release()); |
| } |
| |
| void HTMLConstructionSite::insertSelfClosingHTMLElement(AtomicHTMLToken* token) |
| { |
| ASSERT(token->type() == HTMLToken::StartTag); |
| // Normally HTMLElementStack is responsible for calling finishParsingChildren, |
| // but self-closing elements are never in the element stack so the stack |
| // doesn't get a chance to tell them that we're done parsing their children. |
| attachLater(currentNode(), createElement(token), true); |
| // FIXME: Do we want to acknowledge the token's self-closing flag? |
| // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#acknowledge-self-closing-flag |
| } |
| |
| void HTMLConstructionSite::insertScriptElement(AtomicHTMLToken* token) |
| { |
| RefPtr<HTMLScriptElement> element = HTMLScriptElement::create(ownerDocumentForCurrentNode()); |
| setAttributes(element.get(), token); |
| attachLater(currentNode(), element); |
| m_openElements.push(element.release()); |
| } |
| |
| void HTMLConstructionSite::insertTextNode(const String& string) |
| { |
| HTMLConstructionSiteTask dummyTask(HTMLConstructionSiteTask::Insert); |
| dummyTask.parent = currentNode(); |
| |
| // FIXME: This probably doesn't need to be done both here and in insert(Task). |
| if (isHTMLTemplateElement(*dummyTask.parent)) |
| dummyTask.parent = toHTMLTemplateElement(dummyTask.parent.get())->content(); |
| |
| // Unclear when parent != case occurs. Somehow we insert text into two separate |
| // nodes while processing the same Token. When it happens we have to flush the |
| // pending text into the task queue before making more. |
| if (!m_pendingText.isEmpty() && (m_pendingText.parent != dummyTask.parent)) |
| flushPendingText(); |
| m_pendingText.append(dummyTask.parent, string); |
| } |
| |
| inline Document& HTMLConstructionSite::ownerDocumentForCurrentNode() |
| { |
| if (isHTMLTemplateElement(*currentNode())) |
| return toHTMLTemplateElement(currentElement())->content()->document(); |
| return currentNode()->document(); |
| } |
| |
| PassRefPtr<Element> HTMLConstructionSite::createElement(AtomicHTMLToken* token) |
| { |
| Document& document = ownerDocumentForCurrentNode(); |
| RefPtr<Element> element = HTMLElementFactory::createElement(token->name(), document, true); |
| setAttributes(element.get(), token); |
| return element.release(); |
| } |
| |
| } |