From 64e271dce2e5aca07abdca8e335b3f7cb9ba1114 Mon Sep 17 00:00:00 2001 From: "Felix (xq) Queißner" Date: Sun, 7 Mar 2021 11:58:37 +0100 Subject: Starts to implement new HTML renderer based on gumbo. Everything is a bit borked and only trivial documents work correctly. --- lib/gumbo-parser/parser.c | 4192 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4192 insertions(+) create mode 100644 lib/gumbo-parser/parser.c (limited to 'lib/gumbo-parser/parser.c') diff --git a/lib/gumbo-parser/parser.c b/lib/gumbo-parser/parser.c new file mode 100644 index 0000000..d935002 --- /dev/null +++ b/lib/gumbo-parser/parser.c @@ -0,0 +1,4192 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) + +#include +#include +#include +#include +#include +#include + +#include "attribute.h" +#include "error.h" +#include "gumbo.h" +#include "insertion_mode.h" +#include "parser.h" +#include "tokenizer.h" +#include "tokenizer_states.h" +#include "gumbo-utf8.h" +#include "util.h" +#include "vector.h" + +#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i) + +#define GUMBO_STRING(literal) \ + { literal, sizeof(literal) - 1 } +#define TERMINATOR \ + { "", 0 } + +typedef char gumbo_tagset[GUMBO_TAG_LAST]; +#define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML) +#define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG) +#define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML) + +#define TAGSET_INCLUDES(tagset, namespace, tag) \ + (tag < GUMBO_TAG_LAST && tagset[(int) tag] == (1 << (int) namespace)) + +// selected forward declarations as it is getting hard to find +// an appropriate order +static bool node_html_tag_is(const GumboNode*, GumboTag); +static GumboInsertionMode get_current_template_insertion_mode( + const GumboParser*); +static bool handle_in_template(GumboParser*, GumboToken*); +static void destroy_node(GumboParser*, GumboNode*); + +static void* malloc_wrapper(void* unused, size_t size) { return malloc(size); } + +static void free_wrapper(void* unused, void* ptr) { free(ptr); } + +const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL, + 8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML}; + +static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html"); +static const GumboStringPiece kPublicIdHtml4_0 = + GUMBO_STRING("-//W3C//DTD HTML 4.0//EN"); +static const GumboStringPiece kPublicIdHtml4_01 = + GUMBO_STRING("-//W3C//DTD HTML 4.01//EN"); +static const GumboStringPiece kPublicIdXhtml1_0 = + GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN"); +static const GumboStringPiece kPublicIdXhtml1_1 = + GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN"); +static const GumboStringPiece kSystemIdRecHtml4_0 = + GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd"); +static const GumboStringPiece kSystemIdHtml4 = + GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd"); +static const GumboStringPiece kSystemIdXhtmlStrict1_1 = + GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"); +static const GumboStringPiece kSystemIdXhtml1_1 = + GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"); +static const GumboStringPiece kSystemIdLegacyCompat = + GUMBO_STRING("about:legacy-compat"); + +// The doctype arrays have an explicit terminator because we want to pass them +// to a helper function, and passing them as a pointer discards sizeof +// information. The SVG arrays are used only by one-off functions, and so loops +// over them use sizeof directly instead of a terminator. + +static const GumboStringPiece kQuirksModePublicIdPrefixes[] = { + GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"), + GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), + GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"), + GUMBO_STRING("-//IETF//DTD HTML 2.0//"), + GUMBO_STRING("-//IETF//DTD HTML 2.1E//"), + GUMBO_STRING("-//IETF//DTD HTML 3.0//"), + GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"), + GUMBO_STRING("-//IETF//DTD HTML 3.2//"), + GUMBO_STRING("-//IETF//DTD HTML 3//"), + GUMBO_STRING("-//IETF//DTD HTML Level 0//"), + GUMBO_STRING("-//IETF//DTD HTML Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML Level 3//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"), + GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"), + GUMBO_STRING("-//IETF//DTD HTML Strict//"), + GUMBO_STRING("-//IETF//DTD HTML//"), + GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), + GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"), + GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"), + GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"), + GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), + GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), + GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), + GUMBO_STRING( + "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" + "extensions to HTML 4.0//"), + GUMBO_STRING( + "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" + "extensions to HTML 4.0//"), + GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"), + GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), + GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), + GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), + GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2//"), + GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"), + GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"), + GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"), + GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"), + GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"), + GUMBO_STRING("-//W3C//DTD W3 HTML//"), + GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"), + GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"), + GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR}; + +static const GumboStringPiece kQuirksModePublicIdExactMatches[] = { + GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"), + GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"), + TERMINATOR}; + +static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = { + GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"), + TERMINATOR}; + +static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = { + GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"), + GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR}; + +static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = + {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"), + GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR}; + +// Indexed by GumboNamespaceEnum; keep in sync with that. +static const char* kLegalXmlns[] = {"http://www.w3.org/1999/xhtml", + "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML"}; + +typedef struct _ReplacementEntry { + const GumboStringPiece from; + const GumboStringPiece to; +} ReplacementEntry; + +#define REPLACEMENT_ENTRY(from, to) \ + { GUMBO_STRING(from), GUMBO_STRING(to) } + +// Static data for SVG attribute replacements. +// https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes +static const ReplacementEntry kSvgAttributeReplacements[] = { + REPLACEMENT_ENTRY("attributename", "attributeName"), + REPLACEMENT_ENTRY("attributetype", "attributeType"), + REPLACEMENT_ENTRY("basefrequency", "baseFrequency"), + REPLACEMENT_ENTRY("baseprofile", "baseProfile"), + REPLACEMENT_ENTRY("calcmode", "calcMode"), + REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"), + // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"), + // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"), + REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"), + REPLACEMENT_ENTRY("edgemode", "edgeMode"), + // REPLACEMENT_ENTRY("externalresourcesrequired", + // "externalResourcesRequired"), + // REPLACEMENT_ENTRY("filterres", "filterRes"), + REPLACEMENT_ENTRY("filterunits", "filterUnits"), + REPLACEMENT_ENTRY("glyphref", "glyphRef"), + REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"), + REPLACEMENT_ENTRY("gradientunits", "gradientUnits"), + REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"), + REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"), + REPLACEMENT_ENTRY("keypoints", "keyPoints"), + REPLACEMENT_ENTRY("keysplines", "keySplines"), + REPLACEMENT_ENTRY("keytimes", "keyTimes"), + REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"), + REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"), + REPLACEMENT_ENTRY("markerheight", "markerHeight"), + REPLACEMENT_ENTRY("markerunits", "markerUnits"), + REPLACEMENT_ENTRY("markerwidth", "markerWidth"), + REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"), + REPLACEMENT_ENTRY("maskunits", "maskUnits"), + REPLACEMENT_ENTRY("numoctaves", "numOctaves"), + REPLACEMENT_ENTRY("pathlength", "pathLength"), + REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"), + REPLACEMENT_ENTRY("patterntransform", "patternTransform"), + REPLACEMENT_ENTRY("patternunits", "patternUnits"), + REPLACEMENT_ENTRY("pointsatx", "pointsAtX"), + REPLACEMENT_ENTRY("pointsaty", "pointsAtY"), + REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"), + REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"), + REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"), + REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"), + REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"), + REPLACEMENT_ENTRY("repeatcount", "repeatCount"), + REPLACEMENT_ENTRY("repeatdur", "repeatDur"), + REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"), + REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"), + REPLACEMENT_ENTRY("specularconstant", "specularConstant"), + REPLACEMENT_ENTRY("specularexponent", "specularExponent"), + REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"), + REPLACEMENT_ENTRY("startoffset", "startOffset"), + REPLACEMENT_ENTRY("stddeviation", "stdDeviation"), + REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"), + REPLACEMENT_ENTRY("surfacescale", "surfaceScale"), + REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"), + REPLACEMENT_ENTRY("tablevalues", "tableValues"), + REPLACEMENT_ENTRY("targetx", "targetX"), + REPLACEMENT_ENTRY("targety", "targetY"), + REPLACEMENT_ENTRY("textlength", "textLength"), + REPLACEMENT_ENTRY("viewbox", "viewBox"), + REPLACEMENT_ENTRY("viewtarget", "viewTarget"), + REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"), + REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"), + REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"), +}; + +static const ReplacementEntry kSvgTagReplacements[] = { + REPLACEMENT_ENTRY("altglyph", "altGlyph"), + REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"), + REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"), + REPLACEMENT_ENTRY("animatecolor", "animateColor"), + REPLACEMENT_ENTRY("animatemotion", "animateMotion"), + REPLACEMENT_ENTRY("animatetransform", "animateTransform"), + REPLACEMENT_ENTRY("clippath", "clipPath"), + REPLACEMENT_ENTRY("feblend", "feBlend"), + REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"), + REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"), + REPLACEMENT_ENTRY("fecomposite", "feComposite"), + REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"), + REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"), + REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"), + REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"), + REPLACEMENT_ENTRY("feflood", "feFlood"), + REPLACEMENT_ENTRY("fefunca", "feFuncA"), + REPLACEMENT_ENTRY("fefuncb", "feFuncB"), + REPLACEMENT_ENTRY("fefuncg", "feFuncG"), + REPLACEMENT_ENTRY("fefuncr", "feFuncR"), + REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"), + REPLACEMENT_ENTRY("feimage", "feImage"), + REPLACEMENT_ENTRY("femerge", "feMerge"), + REPLACEMENT_ENTRY("femergenode", "feMergeNode"), + REPLACEMENT_ENTRY("femorphology", "feMorphology"), + REPLACEMENT_ENTRY("feoffset", "feOffset"), + REPLACEMENT_ENTRY("fepointlight", "fePointLight"), + REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"), + REPLACEMENT_ENTRY("fespotlight", "feSpotLight"), + REPLACEMENT_ENTRY("fetile", "feTile"), + REPLACEMENT_ENTRY("feturbulence", "feTurbulence"), + REPLACEMENT_ENTRY("foreignobject", "foreignObject"), + REPLACEMENT_ENTRY("glyphref", "glyphRef"), + REPLACEMENT_ENTRY("lineargradient", "linearGradient"), + REPLACEMENT_ENTRY("radialgradient", "radialGradient"), + REPLACEMENT_ENTRY("textpath", "textPath"), +}; + +typedef struct _NamespacedAttributeReplacement { + const char* from; + const char* local_name; + const GumboAttributeNamespaceEnum attr_namespace; +} NamespacedAttributeReplacement; + +static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = { + {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}, + {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML}, + {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML}, + {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS}, + {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS}, +}; + +// The "scope marker" for the list of active formatting elements. We use a +// pointer to this as a generic marker element, since the particular element +// scope doesn't matter. +static const GumboNode kActiveFormattingScopeMarker; + +// The tag_is and tag_in function use true & false to denote start & end tags, +// but for readability, we define constants for them here. +static const bool kStartTag = true; +static const bool kEndTag = false; + +// Because GumboStringPieces are immutable, we can't insert a character directly +// into a text node. Instead, we accumulate all pending characters here and +// flush them out to a text node whenever a new element is inserted. +// +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character +typedef struct _TextNodeBufferState { + // The accumulated text to be inserted into the current text node. + GumboStringBuffer _buffer; + + // A pointer to the original text represented by this text node. Note that + // because of foster parenting and other strange DOM manipulations, this may + // include other non-text HTML tags in it; it is defined as the span of + // original text from the first character in this text node to the last + // character in this text node. + const char* _start_original_text; + + // The source position of the start of this text node. + GumboSourcePosition _start_position; + + // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). + GumboNodeType _type; +} TextNodeBufferState; + +typedef struct GumboInternalParserState { + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode + GumboInsertionMode _insertion_mode; + + // Used for run_generic_parsing_algorithm, which needs to switch back to the + // original insertion mode at its conclusion. + GumboInsertionMode _original_insertion_mode; + + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements + GumboVector /*GumboNode*/ _open_elements; + + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements + GumboVector /*GumboNode*/ _active_formatting_elements; + + // The stack of template insertion modes. + // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode + GumboVector /*InsertionMode*/ _template_insertion_modes; + + // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers + GumboNode* _head_element; + GumboNode* _form_element; + + // The element used as fragment context when parsing in fragment mode + GumboNode* _fragment_ctx; + + // The flag for when the spec says "Reprocess the current token in..." + bool _reprocess_current_token; + + // The flag for "acknowledge the token's self-closing flag". + bool _self_closing_flag_acknowledged; + + // The "frameset-ok" flag from the spec. + bool _frameset_ok; + + // The flag for "If the next token is a LINE FEED, ignore that token...". + bool _ignore_next_linefeed; + + // The flag for "whenever a node would be inserted into the current node, it + // must instead be foster parented". This is used for misnested table + // content, which needs to be handled according to "in body" rules yet foster + // parented outside of the table. + // It would perhaps be more explicit to have this as a parameter to + // handle_in_body and insert_element, but given how special-purpose this is + // and the number of call-sites that would need to take the extra parameter, + // it's easier just to have a state flag. + bool _foster_parent_insertions; + + // The accumulated text node buffer state. + TextNodeBufferState _text_node; + + // The current token. + GumboToken* _current_token; + + // The way that the spec is written, the and tags are *always* + // implicit, because encountering one of those tokens merely switches the + // insertion mode out of "in body". So we have individual state flags for + // those end tags that are then inspected by pop_current_node when the + // and nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG + // flag appropriately. + bool _closed_body_tag; + bool _closed_html_tag; +} GumboParserState; + +static bool token_has_attribute(const GumboToken* token, const char* name) { + assert(token->type == GUMBO_TOKEN_START_TAG); + return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL; +} + +// Checks if the value of the specified attribute is a case-insensitive match +// for the specified string. +static bool attribute_matches( + const GumboVector* attributes, const char* name, const char* value) { + const GumboAttribute* attr = gumbo_get_attribute(attributes, name); + return attr ? strcasecmp(value, attr->value) == 0 : false; +} + +// Checks if the value of the specified attribute is a case-sensitive match +// for the specified string. +static bool attribute_matches_case_sensitive( + const GumboVector* attributes, const char* name, const char* value) { + const GumboAttribute* attr = gumbo_get_attribute(attributes, name); + return attr ? strcmp(value, attr->value) == 0 : false; +} + +// Checks if the specified attribute vectors are identical. +static bool all_attributes_match( + const GumboVector* attr1, const GumboVector* attr2) { + unsigned int num_unmatched_attr2_elements = attr2->length; + for (unsigned int i = 0; i < attr1->length; ++i) { + const GumboAttribute* attr = attr1->data[i]; + if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) { + --num_unmatched_attr2_elements; + } else { + return false; + } + } + return num_unmatched_attr2_elements == 0; +} + +static void set_frameset_not_ok(GumboParser* parser) { + gumbo_debug("Setting frameset_ok to false.\n"); + parser->_parser_state->_frameset_ok = false; +} + +static GumboNode* create_node(GumboParser* parser, GumboNodeType type) { + GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode)); + node->parent = NULL; + node->index_within_parent = -1; + node->type = type; + node->parse_flags = GUMBO_INSERTION_NORMAL; + return node; +} + +static GumboNode* new_document_node(GumboParser* parser) { + GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT); + document_node->parse_flags = GUMBO_INSERTION_BY_PARSER; + gumbo_vector_init(parser, 1, &document_node->v.document.children); + + // Must be initialized explicitly, as there's no guarantee that we'll see a + // doc type token. + GumboDocument* document = &document_node->v.document; + document->has_doctype = false; + document->name = NULL; + document->public_identifier = NULL; + document->system_identifier = NULL; + return document_node; +} + +static void output_init(GumboParser* parser) { + GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput)); + output->root = NULL; + output->document = new_document_node(parser); + parser->_output = output; + gumbo_init_errors(parser); +} + +static void parser_state_init(GumboParser* parser) { + GumboParserState* parser_state = + gumbo_parser_allocate(parser, sizeof(GumboParserState)); + parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL; + parser_state->_reprocess_current_token = false; + parser_state->_frameset_ok = true; + parser_state->_ignore_next_linefeed = false; + parser_state->_foster_parent_insertions = false; + parser_state->_text_node._type = GUMBO_NODE_WHITESPACE; + gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer); + gumbo_vector_init(parser, 10, &parser_state->_open_elements); + gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements); + gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes); + parser_state->_head_element = NULL; + parser_state->_form_element = NULL; + parser_state->_fragment_ctx = NULL; + parser_state->_current_token = NULL; + parser_state->_closed_body_tag = false; + parser_state->_closed_html_tag = false; + parser->_parser_state = parser_state; +} + +static void parser_state_destroy(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + if (state->_fragment_ctx) { + destroy_node(parser, state->_fragment_ctx); + } + gumbo_vector_destroy(parser, &state->_active_formatting_elements); + gumbo_vector_destroy(parser, &state->_open_elements); + gumbo_vector_destroy(parser, &state->_template_insertion_modes); + gumbo_string_buffer_destroy(parser, &state->_text_node._buffer); + gumbo_parser_deallocate(parser, state); +} + +static GumboNode* get_document_node(GumboParser* parser) { + return parser->_output->document; +} + +static bool is_fragment_parser(const GumboParser* parser) { + return !!parser->_parser_state->_fragment_ctx; +} + +// Returns the node at the bottom of the stack of open elements, or NULL if no +// elements have been added yet. +static GumboNode* get_current_node(GumboParser* parser) { + GumboVector* open_elements = &parser->_parser_state->_open_elements; + if (open_elements->length == 0) { + assert(!parser->_output->root); + return NULL; + } + assert(open_elements->length > 0); + assert(open_elements->data != NULL); + return open_elements->data[open_elements->length - 1]; +} + +static GumboNode* get_adjusted_current_node(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + if (state->_open_elements.length == 1 && state->_fragment_ctx) { + return state->_fragment_ctx; + } + return get_current_node(parser); +} + +// Returns true if the given needle is in the given array of literal +// GumboStringPieces. If exact_match is true, this requires that they match +// exactly; otherwise, this performs a prefix match to check if any of the +// elements in haystack start with needle. This always performs a +// case-insensitive match. +static bool is_in_static_list( + const char* needle, const GumboStringPiece* haystack, bool exact_match) { + for (unsigned int i = 0; haystack[i].length > 0; ++i) { + if ((exact_match && !strcmp(needle, haystack[i].data)) || + (!exact_match && !strcasecmp(needle, haystack[i].data))) { + return true; + } + } + return false; +} + +static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { + parser->_parser_state->_insertion_mode = mode; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately +// This is a helper function that returns the appropriate insertion mode instead +// of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to +// indicate that there is no appropriate insertion mode, and the loop should +// continue. +static GumboInsertionMode get_appropriate_insertion_mode( + const GumboParser* parser, int index) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; + const GumboNode* node = open_elements->data[index]; + const bool is_last = index == 0; + + if (is_last && is_fragment_parser(parser)) { + node = parser->_parser_state->_fragment_ctx; + } + + assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) + return is_last ? + GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; + + switch (node->v.element.tag) { + case GUMBO_TAG_SELECT: { + if (is_last) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + for (int i = index; i > 0; --i) { + const GumboNode* ancestor = open_elements->data[i]; + if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) { + return GUMBO_INSERTION_MODE_IN_SELECT; + } + if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) { + return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE; + } + } + return GUMBO_INSERTION_MODE_IN_SELECT; + } + case GUMBO_TAG_TD: + case GUMBO_TAG_TH: + if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL; + break; + case GUMBO_TAG_TR: + return GUMBO_INSERTION_MODE_IN_ROW; + case GUMBO_TAG_TBODY: + case GUMBO_TAG_THEAD: + case GUMBO_TAG_TFOOT: + return GUMBO_INSERTION_MODE_IN_TABLE_BODY; + case GUMBO_TAG_CAPTION: + return GUMBO_INSERTION_MODE_IN_CAPTION; + case GUMBO_TAG_COLGROUP: + return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; + case GUMBO_TAG_TABLE: + return GUMBO_INSERTION_MODE_IN_TABLE; + case GUMBO_TAG_TEMPLATE: + return get_current_template_insertion_mode(parser); + case GUMBO_TAG_HEAD: + if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD; + break; + case GUMBO_TAG_BODY: + return GUMBO_INSERTION_MODE_IN_BODY; + case GUMBO_TAG_FRAMESET: + return GUMBO_INSERTION_MODE_IN_FRAMESET; + case GUMBO_TAG_HTML: + return parser->_parser_state->_head_element + ? GUMBO_INSERTION_MODE_AFTER_HEAD + : GUMBO_INSERTION_MODE_BEFORE_HEAD; + default: + break; + } + return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; +} + +// This performs the actual "reset the insertion mode" loop. +static void reset_insertion_mode_appropriately(GumboParser* parser) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (int i = open_elements->length; --i >= 0;) { + GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i); + if (mode != GUMBO_INSERTION_MODE_INITIAL) { + set_insertion_mode(parser, mode); + return; + } + } + // Should never get here, because is_last will be set on the last iteration + // and will force GUMBO_INSERTION_MODE_IN_BODY. + assert(0); +} + +static GumboError* parser_add_parse_error( + GumboParser* parser, const GumboToken* token) { + gumbo_debug("Adding parse error.\n"); + GumboError* error = gumbo_add_error(parser); + if (!error) { + return NULL; + } + error->type = GUMBO_ERR_PARSER; + error->position = token->position; + error->original_text = token->original_text.data; + GumboParserError* extra_data = &error->v.parser; + extra_data->input_type = token->type; + extra_data->input_tag = GUMBO_TAG_UNKNOWN; + if (token->type == GUMBO_TOKEN_START_TAG) { + extra_data->input_tag = token->v.start_tag.tag; + } else if (token->type == GUMBO_TOKEN_END_TAG) { + extra_data->input_tag = token->v.end_tag; + } + GumboParserState* state = parser->_parser_state; + extra_data->parser_state = state->_insertion_mode; + gumbo_vector_init( + parser, state->_open_elements.length, &extra_data->tag_stack); + for (unsigned int i = 0; i < state->_open_elements.length; ++i) { + const GumboNode* node = state->_open_elements.data[i]; + assert( + node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); + gumbo_vector_add( + parser, (void*) node->v.element.tag, &extra_data->tag_stack); + } + return error; +} + +// Returns true if the specified token is either a start or end tag (specified +// by is_start) with one of the tag types in the varargs list. Terminate the +// list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of +// the spec references tags that are not in the spec. +static bool tag_in( + const GumboToken* token, bool is_start, const gumbo_tagset tags) { + GumboTag token_tag; + if (is_start && token->type == GUMBO_TOKEN_START_TAG) { + token_tag = token->v.start_tag.tag; + } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { + token_tag = token->v.end_tag; + } else { + return false; + } + return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0); +} + +// Like tag_in, but for the single-tag case. +static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { + if (is_start && token->type == GUMBO_TOKEN_START_TAG) { + return token->v.start_tag.tag == tag; + } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { + return token->v.end_tag == tag; + } else { + return false; + } +} + +// Like tag_in, but checks for the tag of a node, rather than a token. +static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { + assert(node != NULL); + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { + return false; + } + return TAGSET_INCLUDES( + tags, node->v.element.tag_namespace, node->v.element.tag); +} + +// Like node_tag_in, but for the single-tag case. +static bool node_qualified_tag_is( + const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) { + assert(node); + return (node->type == GUMBO_NODE_ELEMENT || + node->type == GUMBO_NODE_TEMPLATE) && + node->v.element.tag == tag && node->v.element.tag_namespace == ns; +} + +// Like node_tag_in, but for the single-tag case in the HTML namespace +static bool node_html_tag_is(const GumboNode* node, GumboTag tag) { + return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); +} + +static void push_template_insertion_mode( + GumboParser* parser, GumboInsertionMode mode) { + gumbo_vector_add( + parser, (void*) mode, &parser->_parser_state->_template_insertion_modes); +} + +static void pop_template_insertion_mode(GumboParser* parser) { + gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes); +} + +// Returns the current template insertion mode. If the stack of template +// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. +static GumboInsertionMode get_current_template_insertion_mode( + const GumboParser* parser) { + GumboVector* template_insertion_modes = + &parser->_parser_state->_template_insertion_modes; + if (template_insertion_modes->length == 0) { + return GUMBO_INSERTION_MODE_INITIAL; + } + return (GumboInsertionMode) + template_insertion_modes->data[(template_insertion_modes->length - 1)]; +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point +static bool is_mathml_integration_point(const GumboNode* node) { + return node_tag_in_set( + node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), + TAG_MATHML(MS), TAG_MATHML(MTEXT)}); +} + +// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point +static bool is_html_integration_point(const GumboNode* node) { + return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT), + TAG_SVG(DESC), TAG_SVG(TITLE)}) || + (node_qualified_tag_is( + node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && + (attribute_matches( + &node->v.element.attributes, "encoding", "text/html") || + attribute_matches(&node->v.element.attributes, "encoding", + "application/xhtml+xml"))); +} + +// This represents a place to insert a node, consisting of a target parent and a +// child index within that parent. If the node should be inserted at the end of +// the parent's child, index will be -1. +typedef struct { + GumboNode* target; + int index; +} InsertionLocation; + +InsertionLocation get_appropriate_insertion_location( + GumboParser* parser, GumboNode* override_target) { + InsertionLocation retval = {override_target, -1}; + if (retval.target == NULL) { + // No override target; default to the current node, but special-case the + // root node since get_current_node() assumes the stack of open elements is + // non-empty. + retval.target = parser->_output->root != NULL ? get_current_node(parser) + : get_document_node(parser); + } + if (!parser->_parser_state->_foster_parent_insertions || + !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY), + TAG(TFOOT), TAG(THEAD), TAG(TR)})) { + return retval; + } + + // Foster-parenting case. + int last_template_index = -1; + int last_table_index = -1; + GumboVector* open_elements = &parser->_parser_state->_open_elements; + for (unsigned int i = 0; i < open_elements->length; ++i) { + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { + last_template_index = i; + } + if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) { + last_table_index = i; + } + } + if (last_template_index != -1 && + (last_table_index == -1 || last_template_index > last_table_index)) { + retval.target = open_elements->data[last_template_index]; + return retval; + } + if (last_table_index == -1) { + retval.target = open_elements->data[0]; + return retval; + } + GumboNode* last_table = open_elements->data[last_table_index]; + if (last_table->parent != NULL) { + retval.target = last_table->parent; + retval.index = last_table->index_within_parent; + return retval; + } + + retval.target = open_elements->data[last_table_index - 1]; + return retval; +} + +// Appends a node to the end of its parent, setting the "parent" and +// "index_within_parent" fields appropriately. +static void append_node( + GumboParser* parser, GumboNode* parent, GumboNode* node) { + assert(node->parent == NULL); + assert(node->index_within_parent == -1); + GumboVector* children; + if (parent->type == GUMBO_NODE_ELEMENT || + parent->type == GUMBO_NODE_TEMPLATE) { + children = &parent->v.element.children; + } else { + assert(parent->type == GUMBO_NODE_DOCUMENT); + children = &parent->v.document.children; + } + node->parent = parent; + node->index_within_parent = children->length; + gumbo_vector_add(parser, (void*) node, children); + assert(node->index_within_parent < children->length); +} + +// Inserts a node at the specified InsertionLocation, updating the +// "parent" and "index_within_parent" fields of it and all its siblings. +// If the index of the location is -1, this calls append_node. +static void insert_node( + GumboParser* parser, GumboNode* node, InsertionLocation location) { + assert(node->parent == NULL); + assert(node->index_within_parent == -1); + GumboNode* parent = location.target; + int index = location.index; + if (index != -1) { + GumboVector* children = NULL; + if (parent->type == GUMBO_NODE_ELEMENT || + parent->type == GUMBO_NODE_TEMPLATE) { + children = &parent->v.element.children; + } else if (parent->type == GUMBO_NODE_DOCUMENT) { + children = &parent->v.document.children; + assert(children->length == 0); + } else { + assert(0); + } + + assert(index >= 0); + assert((unsigned int) index < children->length); + node->parent = parent; + node->index_within_parent = index; + gumbo_vector_insert_at(parser, (void*) node, index, children); + assert(node->index_within_parent < children->length); + for (unsigned int i = index + 1; i < children->length; ++i) { + GumboNode* sibling = children->data[i]; + sibling->index_within_parent = i; + assert(sibling->index_within_parent < children->length); + } + } else { + append_node(parser, parent, node); + } +} + +static void maybe_flush_text_node_buffer(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + TextNodeBufferState* buffer_state = &state->_text_node; + if (buffer_state->_buffer.length == 0) { + return; + } + + assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || + buffer_state->_type == GUMBO_NODE_TEXT || + buffer_state->_type == GUMBO_NODE_CDATA); + GumboNode* text_node = create_node(parser, buffer_state->_type); + GumboText* text_node_data = &text_node->v.text; + text_node_data->text = + gumbo_string_buffer_to_string(parser, &buffer_state->_buffer); + text_node_data->original_text.data = buffer_state->_start_original_text; + text_node_data->original_text.length = + state->_current_token->original_text.data - + buffer_state->_start_original_text; + text_node_data->start_pos = buffer_state->_start_position; + + gumbo_debug("Flushing text node buffer of %.*s.\n", + (int) buffer_state->_buffer.length, buffer_state->_buffer.data); + + InsertionLocation location = get_appropriate_insertion_location(parser, NULL); + if (location.target->type == GUMBO_NODE_DOCUMENT) { + // The DOM does not allow Document nodes to have Text children, so per the + // spec, they are dropped on the floor. + destroy_node(parser, text_node); + } else { + insert_node(parser, text_node, location); + } + + gumbo_string_buffer_clear(parser, &buffer_state->_buffer); + buffer_state->_type = GUMBO_NODE_WHITESPACE; + assert(buffer_state->_buffer.length == 0); +} + +static void record_end_of_element( + GumboToken* current_token, GumboElement* element) { + element->end_pos = current_token->position; + element->original_end_tag = current_token->type == GUMBO_TOKEN_END_TAG + ? current_token->original_text + : kGumboEmptyString; +} + +static GumboNode* pop_current_node(GumboParser* parser) { + GumboParserState* state = parser->_parser_state; + maybe_flush_text_node_buffer(parser); + if (state->_open_elements.length > 0) { + assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); + gumbo_debug("Popping %s node.\n", + gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); + } + GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements); + if (!current_node) { + assert(state->_open_elements.length == 0); + return NULL; + } + assert(current_node->type == GUMBO_NODE_ELEMENT || + current_node->type == GUMBO_NODE_TEMPLATE); + bool is_closed_body_or_html_tag = + (node_html_tag_is(current_node, GUMBO_TAG_BODY) && + state->_closed_body_tag) || + (node_html_tag_is(current_node, GUMBO_TAG_HTML) && + state->_closed_html_tag); + if ((state->_current_token->type != GUMBO_TOKEN_END_TAG || + !node_html_tag_is(current_node, state->_current_token->v.end_tag)) && + !is_closed_body_or_html_tag) { + current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; + } + if (!is_closed_body_or_html_tag) { + record_end_of_element(state->_current_token, ¤t_node->v.element); + } + return current_node; +} + +static void append_comment_node( + GumboParser* parser, GumboNode* node, const GumboToken* token) { + maybe_flush_text_node_buffer(parser); + GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT); + comment->type = GUMBO_NODE_COMMENT; + comment->parse_flags = GUMBO_INSERTION_NORMAL; + comment->v.text.text = token->v.text; + comment->v.text.original_text = token->original_text; + comment->v.text.start_pos = token->position; + append_node(parser, node, comment); +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context +static void clear_stack_to_table_row_context(GumboParser* parser) { + while (!node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { + pop_current_node(parser); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context +static void clear_stack_to_table_context(GumboParser* parser) { + while (!node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) { + pop_current_node(parser); + } +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context +void clear_stack_to_table_body_context(GumboParser* parser) { + while (!node_tag_in_set(get_current_node(parser), + (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), + TAG(TEMPLATE)})) { + pop_current_node(parser); + } +} + +// Creates a parser-inserted element in the HTML namespace and returns it. +static GumboNode* create_element(GumboParser* parser, GumboTag tag) { + GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT); + GumboElement* element = &node->v.element; + gumbo_vector_init(parser, 1, &element->children); + gumbo_vector_init(parser, 0, &element->attributes); + element->tag = tag; + element->tag_namespace = GUMBO_NAMESPACE_HTML; + element->original_tag = kGumboEmptyString; + element->original_end_tag = kGumboEmptyString; + element->start_pos = (parser->_parser_state->_current_token) + ? parser->_parser_state->_current_token->position + : kGumboEmptySourcePosition; + element->end_pos = kGumboEmptySourcePosition; + return node; +} + +// Constructs an element from the given start tag token. +static GumboNode* create_element_from_token( + GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { + assert(token->type == GUMBO_TOKEN_START_TAG); + GumboTokenStartTag* start_tag = &token->v.start_tag; + + GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML && + start_tag->tag == GUMBO_TAG_TEMPLATE) + ? GUMBO_NODE_TEMPLATE + : GUMBO_NODE_ELEMENT; + + GumboNode* node = create_node(parser, type); + GumboElement* element = &node->v.element; + gumbo_vector_init(parser, 1, &element->children); + element->attributes = start_tag->attributes; + element->tag = start_tag->tag; + element->tag_namespace = tag_namespace; + + assert(token->original_text.length >= 2); + assert(token->original_text.data[0] == '<'); + assert(token->original_text.data[token->original_text.length - 1] == '>'); + element->original_tag = token->original_text; + element->start_pos = token->position; + element->original_end_tag = kGumboEmptyString; + element->end_pos = kGumboEmptySourcePosition; + + // The element takes ownership of the attributes from the token, so any + // allocated-memory fields should be nulled out. + start_tag->attributes = kGumboEmptyVector; + return node; +} + +// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element +static void insert_element(GumboParser* parser, GumboNode* node, + bool is_reconstructing_formatting_elements) { + GumboParserState* state = parser->_parser_state; + // NOTE(jdtang): The text node buffer must always be flushed before inserting + // a node, otherwise we're handling nodes in a different order than the spec + // mandated. However, one clause of the spec (character tokens in the body) + // requires that we reconstruct the active formatting elements *before* adding + // the character, and reconstructing the active formatting elements may itself + // result in the insertion of new elements (which should be pushed onto the + // stack of open elements before the buffer is flushed). We solve this (for + // the time being, the spec has been rewritten for