diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index d73561843bcb2..8fdb5db0a9a7a 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -114,15 +114,11 @@ public function insert_marker(): void { */ public function push( WP_HTML_Token $token ) { /* - * > If there are already three elements in the list of active formatting elements after the last marker, - * > if any, or anywhere in the list if there are no markers, that have the same tag name, namespace, and - * > attributes as element, then remove the earliest such element from the list of active formatting - * > elements. For these purposes, the attributes must be compared as they were when the elements were - * > created by the parser; two elements have the same attributes if all their parsed attributes can be - * > paired such that the two attributes in each pair have identical names, namespaces, and values - * > (the order of the attributes does not matter). + * The "Noah's Ark clause", which limits the list to three elements sharing + * a tag name, namespace, and attributes, requires reading the attributes + * of the source tags and is enforced by the HTML Processor before pushing. * - * @todo Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack. + * @see WP_HTML_Processor::push_onto_active_formatting_elements */ // > Add element to the list of active formatting elements. $this->stack[] = $token; @@ -150,6 +146,82 @@ public function remove_node( WP_HTML_Token $token ) { return false; } + /** + * Returns the position of a node in the list of active formatting elements. + * + * Positions are counted from the start of the list: the earliest entry + * is at position zero. 
+ * + * @since 7.1.0 + * + * @param WP_HTML_Token $token Find this node in the list of active formatting elements. + * @return int|null Position of the node, or `null` if it isn't in the list. + */ + public function position_of( WP_HTML_Token $token ): ?int { + foreach ( $this->stack as $position => $item ) { + if ( $token === $item ) { + return $position; + } + } + + return null; + } + + /** + * Removes the node at the given position in the list of active formatting elements. + * + * @since 7.1.0 + * + * @param int $position Remove the node at this position, counting from the start of the list. + * @return bool Whether a node was removed, false when the position was out of range. + */ + public function remove_at( int $position ): bool { + if ( $position < 0 || $position >= count( $this->stack ) ) { + return false; + } + + array_splice( $this->stack, $position, 1 ); + return true; + } + + /** + * Inserts a node at the given position in the list of active formatting elements. + * + * A node inserted at position zero becomes the earliest entry in the list, + * while one inserted at the position returned by {@see self::count} becomes + * the last (most recently added) entry. + * + * @since 7.1.0 + * + * @param int $position Insert the node at this position, counting from the start of the list. + * @param WP_HTML_Token $token Insert this node. + */ + public function insert_at( int $position, WP_HTML_Token $token ): void { + array_splice( $this->stack, $position, 0, array( $token ) ); + } + + /** + * Replaces a node in the list of active formatting elements with another node. + * + * This is distinct from removing the existing node and pushing the new one: + * the replacement occupies the exact position of the node it replaces. + * + * @since 7.1.0 + * + * @param WP_HTML_Token $old_node Node to find and replace. + * @param WP_HTML_Token $new_node Node to substitute in its place. + * @return bool Whether the node was found and replaced. 
+ */ + public function replace_node( WP_HTML_Token $old_node, WP_HTML_Token $new_node ): bool { + $position = $this->position_of( $old_node ); + if ( null === $position ) { + return false; + } + + $this->stack[ $position ] = $new_node; + return true; + } + /** * Steps through the stack of active formatting elements, starting with the * top element (added first) and walking downwards to the one added last. diff --git a/src/wp-includes/html-api/class-wp-html-open-elements.php b/src/wp-includes/html-api/class-wp-html-open-elements.php index 5c99db6d5eb4e..e165b867bb1ff 100644 --- a/src/wp-includes/html-api/class-wp-html-open-elements.php +++ b/src/wp-includes/html-api/class-wp-html-open-elements.php @@ -29,6 +29,45 @@ * @see WP_HTML_Processor */ class WP_HTML_Open_Elements { + /** + * Elements which terminate the search when determining whether an + * element is "in scope". + * + * > The stack of open elements is said to have a particular element in + * > scope when it has that element in the specific scope consisting of + * > the following element types: … + * + * @since 7.1.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-scope + * @see WP_HTML_Open_Elements::has_element_in_scope + * @see WP_HTML_Open_Elements::has_node_in_scope + * + * @var string[] + */ + const ELEMENT_IN_SCOPE_TERMINATION_LIST = array( + 'APPLET', + 'CAPTION', + 'HTML', + 'TABLE', + 'TD', + 'TH', + 'MARQUEE', + 'OBJECT', + 'TEMPLATE', + + 'math MI', + 'math MO', + 'math MN', + 'math MS', + 'math MTEXT', + 'math ANNOTATION-XML', + + 'svg FOREIGNOBJECT', + 'svg DESC', + 'svg TITLE', + ); + /** * Holds the stack of open element references. * @@ -301,31 +340,42 @@ public function has_element_in_specific_scope( string $tag_name, $termination_li * @return bool Whether given element is in scope. 
*/ public function has_element_in_scope( string $tag_name ): bool { - return $this->has_element_in_specific_scope( - $tag_name, - array( - 'APPLET', - 'CAPTION', - 'HTML', - 'TABLE', - 'TD', - 'TH', - 'MARQUEE', - 'OBJECT', - 'TEMPLATE', + return $this->has_element_in_specific_scope( $tag_name, self::ELEMENT_IN_SCOPE_TERMINATION_LIST ); + } - 'math MI', - 'math MO', - 'math MN', - 'math MS', - 'math MTEXT', - 'math ANNOTATION-XML', + /** + * Returns whether a specific node is in scope. + * + * Whereas {@see self::has_element_in_scope} reports whether *any* element + * of a given tag name is in scope, this reports whether the given node + * itself is. The two may disagree when multiple elements sharing the tag + * name are in the stack of open elements: the adoption agency algorithm, + * for example, must determine whether a specific formatting element is in + * scope, regardless of other elements with the same tag name. + * + * @since 7.1.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-scope + * + * @param WP_HTML_Token $token Check whether this node is in scope. + * @return bool Whether the given node is in scope. + */ + public function has_node_in_scope( WP_HTML_Token $token ): bool { + foreach ( $this->walk_up() as $node ) { + if ( $token === $node ) { + return true; + } - 'svg FOREIGNOBJECT', - 'svg DESC', - 'svg TITLE', - ) - ); + $namespaced_name = 'html' === $node->namespace + ? 
$node->node_name + : "{$node->namespace} {$node->node_name}"; + + if ( in_array( $namespaced_name, self::ELEMENT_IN_SCOPE_TERMINATION_LIST, true ) ) { + return false; + } + } + + return false; } /** diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 6513db35c1243..0113b0efda1a2 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -99,17 +99,14 @@ * * The HTML Processor supports all elements other than a specific set: * - * - Any element inside a TABLE. - * - Any element inside foreign content, including SVG and MATH. - * - Any element outside the IN BODY insertion mode, e.g. doctype declarations, meta, links. + * - PLAINTEXT elements. + * - FRAMESET documents. + * - Non-table content found inside a TABLE element, which requires foster parenting. + * - Content found after closing the BODY or HTML elements which reopens them. + * - META tags which change the document encoding, when parsing a full document. * * ### Supported markup * - * Some kinds of non-normative HTML involve reconstruction of formatting elements and - * re-parenting of mis-nested elements. For example, a DIV tag found inside a TABLE - * may in fact belong _before_ the table in the DOM. If the HTML Processor encounters - * such a case it will stop processing. - * * The following list illustrates some common examples of unexpected HTML inputs that * the HTML Processor properly parses and represents: * @@ -120,6 +117,11 @@ * - Elements containing text that looks like other tags but isn't, e.g. `<title>The <img> is plaintext</title>`. * - SCRIPT and STYLE tags containing text that looks like HTML but isn't, e.g. `<script>document.write('<p>');</script>`. * - SCRIPT content which has been escaped, e.g. `<script><!-- document.write('<script>console.log("hi")</script>') --></script>`. + * - Misnested formatting elements, e.g. `<b>bold <i>both</b> italic</i>`, including + * reconstruction of implicitly-closed formatting elements and the adoption agency + * algorithm. 
Formatting elements reopened by the parser appear as "virtual" nodes: + * they report the attributes of the tag which opened the original element, but + * cannot be modified. * * ### Unsupported Features * @@ -131,9 +133,13 @@ * parser does not add those additional attributes. * * In certain situations, elements are moved to a different part of the document in - * a process called "adoption" and "fostering." Because the nodes move to a location - * in the document that the parser had already processed, this parser does not support - * these situations and will bail. + * processes called "adoption" and "fostering." Because a single-pass parser visits + * each node once, nodes which have already been visited cannot move: when adoption + * relocates such nodes, they are reported where they were originally found, while + * every node visited afterwards is reported with the path a browser would report + * for it. Fostering, which moves content found inside a TABLE to a location before + * the table, is not supported, and this parser will bail when non-table content + * is found inside a TABLE element. * * @since 6.4.0 * @@ -258,6 +264,50 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $context_node = null; + /** + * Indicates whether the token currently being processed has already + * produced a stack event of "real" provenance. + * + * Each token in the input HTML must be presented to a visitor of the + * document at most once. Algorithms such as the adoption agency + * algorithm, however, may pop multiple elements whose tag name matches + * a single closing tag in the input HTML: this flag records that a tag + * closer has already been matched with a popped element so that the + * others are reported as closing "virtual" nodes. + * + * @since 7.1.0 + * + * @var bool + */ + private $current_token_produced_real_event = false; + + /** + * Reads attributes from the source tag referenced by a virtual node. 
+ * + * Virtual nodes created for the tokens of existing tags, such as the + * formatting elements reconstructed from the list of active formatting + * elements, share the attributes of the tag which created their original + * token. A Tag Processor reads those attributes on demand, and it is cached + * here while the same virtual node remains matched. + * + * @see WP_HTML_Processor::get_virtual_node_attribute_reader() + * + * @since 7.1.0 + * + * @var WP_HTML_Tag_Processor|null + */ + private $virtual_node_attribute_reader = null; + + /** + * Bookmark name of the token for which the virtual-node attribute reader + * was created, indicating when the reader must be re-created. + * + * @since 7.1.0 + * + * @var string|null + */ + private $virtual_node_attribute_reader_bookmark = null; + /* * Public Interface Functions */ @@ -401,10 +451,25 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul $this->state->stack_of_open_elements->set_push_handler( function ( WP_HTML_Token $token ): void { - $is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer(); - $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; - $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; - $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance ); + /* + * A push event is "real" when it pushes the token currently being + * processed and that token is a tag opener found in the input HTML. + * All other pushes open "virtual" nodes: elements implied by context, + * formatting elements reconstructed from the list of active formatting + * elements, or clones created by the adoption agency algorithm. + * + * Token identity is compared instead of the tag name because multiple + * nodes sharing the current token's tag name may be pushed while + * processing a single token. 
For example, when a "B" formatting element + * is reconstructed while processing the opening tag of another "B" + * element, only the push for the latter is real. + */ + $is_real = ( + isset( $this->state->current_token ) && + $token === $this->state->current_token && + ! $this->is_tag_closer() + ); + $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $is_real ? 'real' : 'virtual' ); $this->change_parsing_namespace( $token->integration_node_type ? 'html' : $token->namespace ); } @@ -412,10 +477,27 @@ function ( WP_HTML_Token $token ): void { $this->state->stack_of_open_elements->set_pop_handler( function ( WP_HTML_Token $token ): void { - $is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer(); - $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; - $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; - $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance ); + /* + * A pop event is "real" when the token currently being processed is + * a tag closer found in the input HTML whose tag name matches the + * popped node, and when no real event has been produced for the + * current token yet. All other pops close "virtual" nodes. + * + * At most one popped node may be matched with a given tag closer. + * The adoption agency algorithm, for example, may pop multiple + * elements sharing the tag name of the closing tag it processes: + * only the first of them corresponds to the tag in the input HTML. + */ + $is_real = ( + ! $this->current_token_produced_real_event && + isset( $this->state->current_token ) && + $this->is_tag_closer() && + $token->node_name === $this->state->current_token->node_name + ); + if ( $is_real ) { + $this->current_token_produced_real_event = true; + } + $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $is_real ? 
'real' : 'virtual' ); $adjusted_current_node = $this->get_adjusted_current_node(); @@ -808,12 +890,11 @@ private function next_visitable_token(): bool { /* * Prime the events if there are none. * - * @todo In some cases, probably related to the adoption agency - * algorithm, this call to step() doesn't create any new - * events. Calling it again creates them. Figure out why - * this is and if it's inherent or if it's a bug. Looping - * until there are events or until there are no more - * tokens works in the meantime and isn't obviously wrong. + * Some tokens never create stack events: tokens which the HTML + * specification directs the parser to ignore, such as a stray + * closing tag or a DOCTYPE found inside BODY. Stepping past such + * a token succeeds but enqueues nothing, so this method recurses + * until an event appears or the document is exhausted. */ if ( empty( $this->element_queue ) ) { if ( $this->step() ) { @@ -1072,6 +1153,8 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool { $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); + + $this->current_token_produced_real_event = false; } $parse_in_current_insertion_mode = ( @@ -2713,13 +2796,10 @@ private function step_in_body(): bool { /* * > If node is null or if the stack of open elements does not have node * > in scope, then this is a parse error; return and ignore the token. - * - * @todo It's necessary to check if the form token itself is in scope, not - * simply whether any FORM is in scope. */ if ( null === $node || - ! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' ) + ! $this->state->stack_of_open_elements->has_node_in_scope( $node ) ) { /* * Parse error: ignore the token. @@ -2738,10 +2818,9 @@ private function step_in_body(): bool { $this->generate_implied_end_tags(); if ( $node !== $this->state->stack_of_open_elements->current_node() ) { // @todo Indicate a parse error once it's possible. 
This error does not impact the logic here. - $this->bail( 'Cannot close a FORM when other elements remain open as this would throw off the breadcrumbs for the following tokens.' ); } - $this->state->stack_of_open_elements->remove_node( $node ); + $this->remove_node_from_stack_of_open_elements( $node ); return true; } else { /* @@ -2859,16 +2938,26 @@ private function step_in_body(): bool { break 2; case 'A': + /* + * > …run the adoption agency algorithm for the token, then remove that + * > element from the list of active formatting elements and the stack + * > of open elements if the adoption agency algorithm didn't already + * > remove it (it might not have if the element is not in table scope). + * + * The adoption agency algorithm cannot require "any other end tag" + * treatment here: it searches the same span of the list of active + * formatting elements in which this A element was just found. + */ $this->run_adoption_agency_algorithm(); $this->state->active_formatting_elements->remove_node( $item ); - $this->state->stack_of_open_elements->remove_node( $item ); + $this->remove_node_from_stack_of_open_elements( $item ); break 2; } } $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + $this->push_onto_active_formatting_elements( $this->state->current_token ); return true; /* @@ -2889,7 +2978,7 @@ private function step_in_body(): bool { case '+U': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + $this->push_onto_active_formatting_elements( $this->state->current_token ); return true; /* @@ -2905,7 +2994,7 @@ private function step_in_body(): bool { } $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + 
$this->push_onto_active_formatting_elements( $this->state->current_token ); return true; /* @@ -2926,7 +3015,13 @@ private function step_in_body(): bool { case '-STRONG': case '-TT': case '-U': - $this->run_adoption_agency_algorithm(); + if ( ! $this->run_adoption_agency_algorithm() ) { + /* + * > If there is no such element, then return and instead act as + * > described in the "any other end tag" entry above. + */ + return $this->in_body_any_other_end_tag(); + } return true; /* @@ -5425,7 +5520,56 @@ public function get_token_type(): ?string { * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { - return $this->is_virtual() ? null : parent::get_attribute( $name ); + if ( ! $this->is_virtual() ) { + return parent::get_attribute( $name ); + } + + $reader = $this->get_virtual_node_attribute_reader(); + return null !== $reader ? $reader->get_attribute( $name ) : null; + } + + /** + * Returns a reader stopped at the source tag referenced by the currently- + * matched virtual node, if the node refers to a tag in the input HTML. + * + * Virtual nodes created for elements implied by context, such as missing + * HTML, HEAD, or BODY tags, do not refer to any tag in the input HTML and + * have no attributes: for these, no reader exists. + * + * Nodes created by active format reconstruction or by the adoption agency + * algorithm, however, are created for the tokens of tags in the input HTML + * and share their attributes: reading such a node's attributes reads the + * attributes of its source tag. + * + * @since 7.1.0 + * + * @return WP_HTML_Tag_Processor|null Reader stopped at the source tag, if one exists. + */ + private function get_virtual_node_attribute_reader(): ?WP_HTML_Tag_Processor { + $token = $this->current_element->token; + + if ( ! 
isset( $token->bookmark_name, $this->bookmarks[ $token->bookmark_name ] ) ) { + return null; + } + + $span = $this->bookmarks[ $token->bookmark_name ]; + if ( 0 === $span->length ) { + return null; + } + + if ( $token->bookmark_name !== $this->virtual_node_attribute_reader_bookmark ) { + $reader = new WP_HTML_Tag_Processor( substr( $this->html, $span->start, $span->length ) ); + $reader->compat_mode = $this->compat_mode; + $reader->change_parsing_namespace( $token->namespace ); + if ( ! $reader->next_token() ) { + return null; + } + + $this->virtual_node_attribute_reader = $reader; + $this->virtual_node_attribute_reader_bookmark = $token->bookmark_name; + } + + return $this->virtual_node_attribute_reader; } /** @@ -5503,7 +5647,32 @@ public function remove_attribute( $name ): bool { * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ): ?array { - return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix ); + if ( ! $this->is_virtual() ) { + return parent::get_attribute_names_with_prefix( $prefix ); + } + + $reader = $this->get_virtual_node_attribute_reader(); + return null !== $reader ? $reader->get_attribute_names_with_prefix( $prefix ) : null; + } + + /** + * Returns the adjusted attribute name for a given attribute, taking into + * account the current parsing context, whether HTML, SVG, or MathML. + * + * @since 7.1.0 Subclassed for the HTML Processor. + * + * @see WP_HTML_Tag_Processor::get_qualified_attribute_name + * + * @param string $attribute_name Which attribute to adjust. + * @return string|null Adjusted attribute name, or `null` when no tag opener is matched. + */ + public function get_qualified_attribute_name( $attribute_name ): ?string { + if ( ! $this->is_virtual() ) { + return parent::get_qualified_attribute_name( $attribute_name ); + } + + $reader = $this->get_virtual_node_attribute_reader(); + return null !== $reader ? 
$reader->get_qualified_attribute_name( $attribute_name ) : null; } /** @@ -5534,16 +5703,19 @@ public function remove_class( $class_name ): bool { * Returns if a matched tag contains the given ASCII case-insensitive class name. * * @since 6.6.0 Subclassed for the HTML Processor. - * - * @todo When reconstructing active formatting elements with attributes, find a way - * to indicate if the virtually-reconstructed formatting elements contain the - * wanted class name. + * @since 7.1.0 Reports class names for reconstructed formatting elements, + * which contain the class names of their source tag. * * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. * @return bool|null Whether the matched tag contains the given class name, or null if not matched. */ public function has_class( $wanted_class ): ?bool { - return $this->is_virtual() ? null : parent::has_class( $wanted_class ); + if ( ! $this->is_virtual() ) { + return parent::has_class( $wanted_class ); + } + + $reader = $this->get_virtual_node_attribute_reader(); + return null !== $reader ? $reader->has_class( $wanted_class ) : null; } /** @@ -5563,7 +5735,12 @@ public function has_class( $wanted_class ): ?bool { * @since 6.6.0 Subclassed for the HTML Processor. */ public function class_list() { - return $this->is_virtual() ? null : parent::class_list(); + if ( ! $this->is_virtual() ) { + return parent::class_list(); + } + + $reader = $this->get_virtual_node_attribute_reader(); + return null !== $reader ? $reader->class_list() : null; } /** @@ -6009,10 +6186,16 @@ private function get_adjusted_current_node(): ?WP_HTML_Token { * > in the current body, cell, or caption (whichever is youngest) that haven't * > been explicitly closed. * + * Reconstructed elements are reported as "virtual" nodes: they open where the + * reconstruction occurs, and reading their attributes reports the attributes + * of the tag which created the formatting element being reconstructed. 
+ * * @since 6.4.0 + * @since 7.1.0 Full implementation: reconstructs formatting elements instead + * of bailing when reconstruction is required. * @ignore * - * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * @throws Exception When unable to allocate requisite bookmarks. * * @see https://html.spec.whatwg.org/#reconstruct-the-active-formatting-elements * @@ -6046,7 +6229,47 @@ private function reconstruct_active_formatting_elements(): bool { return false; } - $this->bail( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' ); + /* + * > Let entry be the last (most recently added) element in the list of active formatting elements. + * > Rewind: If there are no entries before entry in the list of active formatting elements, + * > then jump to the step labeled create. + * > Let entry be the entry one earlier than entry in the list of active formatting elements. + * > If entry is neither a marker nor an element that is also in the stack of open elements, + * > go to the step labeled rewind. + * > Advance: Let entry be the element one later than entry in the list of active formatting elements. + * + * The rewind and advance steps find the run of entries at the end of the list which are + * neither markers nor elements in the stack of open elements. These represent formatting + * elements which were implicitly closed and must be reopened, in the order in which they + * were originally opened. + */ + $entries_to_reconstruct = array(); + foreach ( $this->state->active_formatting_elements->walk_up() as $entry ) { + if ( + 'marker' === $entry->node_name || + $this->state->stack_of_open_elements->contains_node( $entry ) + ) { + break; + } + + $entries_to_reconstruct[] = $entry; + } + + /* + * > Create: Insert an HTML element for the token for which the element entry was created, + * > to obtain new element. + * > Replace the entry for entry in the list with an entry for new element. 
+ * > If the entry for new element in the list of active formatting elements is not the last + * > entry in the list, return to the step labeled advance. + */ + for ( $i = count( $entries_to_reconstruct ) - 1; $i >= 0; $i-- ) { + $entry = $entries_to_reconstruct[ $i ]; + $new_element = $this->clone_token( $entry ); + $this->insert_html_element( $new_element ); + $this->state->active_formatting_elements->replace_node( $entry, $new_element ); + } + + return true; } /** @@ -6245,34 +6468,61 @@ private function reset_insertion_mode_appropriately(): void { /** * Runs the adoption agency algorithm. * + * This algorithm handles misnested formatting elements, deciding how + * formatting elements are closed and reopened ("adopted") so that + * formatting applies as a browser would apply it. + * + * Because the HTML Processor visits a document in a single pass, nodes + * which have already been visited cannot be moved: where a browser would + * re-parent nodes found before the misnesting was discovered, this + * processor reports them where they were originally visited. The stack of + * open elements and the list of active formatting elements are maintained + * as the specification demands, however, so every token visited after + * this algorithm runs is reported with the ancestor chain a browser would + * report for it at the same place in the document. + * * @since 6.4.0 + * @since 7.1.0 Full implementation: handles the furthest block case and + * "any other end tag" fallback instead of bailing. * @ignore * - * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * @throws Exception When unable to allocate requisite bookmarks. * * @see https://html.spec.whatwg.org/#adoption-agency-algorithm + * + * @return bool False when the current token must instead be treated as in + * the "any other end tag" entry of the "in body" insertion + * mode, true otherwise. 
*/ - private function run_adoption_agency_algorithm(): void { - $budget = 1000; - $subject = $this->get_tag(); - $current_node = $this->state->stack_of_open_elements->current_node(); + private function run_adoption_agency_algorithm(): bool { + $stack = $this->state->stack_of_open_elements; + $afe = $this->state->active_formatting_elements; + + // > Let subject be token's tag name. + $subject = $this->get_tag(); + /* + * > If the current node is an HTML element whose tag name is subject, and the current + * > node is not in the list of active formatting elements, then pop the current node + * > off the stack of open elements and return. + */ + $current_node = $stack->current_node(); if ( - // > If the current node is an HTML element whose tag name is subject - $current_node && $subject === $current_node->node_name && - // > the current node is not in the list of active formatting elements - ! $this->state->active_formatting_elements->contains_node( $current_node ) + null !== $current_node && + 'html' === $current_node->namespace && + $subject === $current_node->node_name && + ! $afe->contains_node( $current_node ) ) { - $this->state->stack_of_open_elements->pop(); - return; + $stack->pop(); + return true; } - $outer_loop_counter = 0; - while ( $budget-- > 0 ) { - if ( $outer_loop_counter++ >= 8 ) { - return; - } - + /* + * > Let outer loop counter be 0. + * > While true: If outer loop counter is greater than or equal to 8, then return. + * > Increment outer loop counter by 1. + */ + for ( $outer_loop_counter = 0; $outer_loop_counter < 8; $outer_loop_counter++ ) { /* * > Let formatting element be the last element in the list of active formatting elements that: * > - is between the end of the list and the last marker in the list, @@ -6280,7 +6530,7 @@ private function run_adoption_agency_algorithm(): void { * > - and has the tag name subject. 
*/ $formatting_element = null; - foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { + foreach ( $afe->walk_up() as $item ) { if ( 'marker' === $item->node_name ) { break; } @@ -6291,64 +6541,217 @@ private function run_adoption_agency_algorithm(): void { } } - // > If there is no such element, then return and instead act as described in the "any other end tag" entry above. + /* + * > If there is no such element, then return and instead act as described in the + * > "any other end tag" entry above. + */ if ( null === $formatting_element ) { - $this->bail( 'Cannot run adoption agency when "any other end tag" is required.' ); + return false; } - // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. - if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) { - $this->state->active_formatting_elements->remove_node( $formatting_element ); - return; + /* + * > If formatting element is not in the stack of open elements, then this is a + * > parse error; remove the element from the list, and return. + */ + if ( ! $stack->contains_node( $formatting_element ) ) { + $afe->remove_node( $formatting_element ); + return true; } - // > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return. - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) { - return; + /* + * > If formatting element is in the stack of open elements, but the element is + * > not in scope, then this is a parse error; return. + */ + if ( ! $stack->has_node_in_scope( $formatting_element ) ) { + return true; } /* - * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack - * > than formatting element, and is an element in the special category. There might not be one. 
+ * > If formatting element is not the current node, this is a parse error. (But do not return.) */ - $is_above_formatting_element = true; - $furthest_block = null; - foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) { - if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) { - continue; - } - if ( $is_above_formatting_element ) { - $is_above_formatting_element = false; - continue; - } - - if ( self::is_special( $item ) ) { - $furthest_block = $item; + /* + * > Let furthest block be the topmost node in the stack of open elements that is lower in the + * > stack than formatting element, and is an element in the special category. There might not + * > be one. + * + * The stack is copied into a working array: while there is a furthest block, this algorithm + * removes, replaces, and inserts nodes in a random-access fashion which the stack of open + * elements cannot directly express. The working array is reconciled with the stack at the + * end of the loop. + */ + $working_stack = $stack->stack; + $formatting_element_index = array_search( $formatting_element, $working_stack, true ); + $furthest_block = null; + $furthest_block_index = null; + for ( $i = $formatting_element_index + 1, $stack_size = count( $working_stack ); $i < $stack_size; $i++ ) { + if ( self::is_special( $working_stack[ $i ] ) ) { + $furthest_block = $working_stack[ $i ]; + $furthest_block_index = $i; break; } } /* - * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the - * > stack of open elements, from the current node up to and including formatting element, then - * > remove formatting element from the list of active formatting elements, and finally return. 
+ * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of + * > the stack of open elements, from the current node up to and including formatting element, + * > then remove formatting element from the list of active formatting elements, and finally + * > return. */ if ( null === $furthest_block ) { - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - $this->state->stack_of_open_elements->pop(); + foreach ( $stack->walk_up() as $item ) { + $stack->pop(); - if ( $formatting_element->bookmark_name === $item->bookmark_name ) { - $this->state->active_formatting_elements->remove_node( $formatting_element ); - return; + if ( $formatting_element === $item ) { + break; } } + + $afe->remove_node( $formatting_element ); + return true; } - $this->bail( 'Cannot extract common ancestor in adoption agency algorithm.' ); + /* + * > Let common ancestor be the element immediately above formatting element in the stack + * > of open elements. + * + * The common ancestor is only used as a target when re-parenting nodes which have already + * been visited; since this processor cannot re-parent visited nodes, it goes unused here. + */ + + // > Let a bookmark note the position of formatting element in the list of active formatting elements. + $bookmark = $afe->position_of( $formatting_element ); + + // > Let node and last node be furthest block. + $node_index = $furthest_block_index; + $last_node = $furthest_block; + $inner_loop_counter = 0; + + while ( true ) { + // > Increment inner loop counter by 1. + ++$inner_loop_counter; + + /* + * > Let node be the element immediately above node in the stack of open elements, + * > or if node is no longer in the stack of open elements (e.g. because it got + * > removed by this algorithm), the element that was immediately above node in + * > the stack of open elements at the time when node was removed. 
+ * + * Removed nodes are spliced out of the working array, so the element which was + * above a removed node is found at the removed node's old index. + */ + $node = $working_stack[ --$node_index ]; + + // > If node is formatting element, then break out of the inner loop. + if ( $node === $formatting_element ) { + break; + } + + /* + * > If inner loop counter is greater than 3 and node is in the list of active + * > formatting elements, then remove node from the list of active formatting elements. + */ + $node_afe_position = $afe->position_of( $node ); + if ( $inner_loop_counter > 3 && null !== $node_afe_position ) { + $afe->remove_at( $node_afe_position ); + if ( $node_afe_position < $bookmark ) { + --$bookmark; + } + $node_afe_position = null; + } + + /* + * > If node is not in the list of active formatting elements, then remove node + * > from the stack of open elements and continue. + */ + if ( null === $node_afe_position ) { + array_splice( $working_stack, $node_index, 1 ); + continue; + } + + /* + * > Create an element for the token for which the element node was created, in the + * > HTML namespace, with common ancestor as the intended parent; replace the entry + * > for node in the list of active formatting elements with an entry for the new + * > element, replace the entry for node in the stack of open elements with an entry + * > for the new element, and let node be the new element. + */ + $node_clone = $this->clone_token( $node ); + $afe->replace_node( $node, $node_clone ); + $working_stack[ $node_index ] = $node_clone; + $node = $node_clone; + + /* + * > If last node is furthest block, then move the aforementioned bookmark to be + * > immediately after the new node in the list of active formatting elements. + */ + if ( $last_node === $furthest_block ) { + $bookmark = $node_afe_position + 1; + } + + /* + * > Insert last node into node, first removing it from its previous parent node if any. 
+ * + * This re-parents a node which has already been visited: it has no effect on the + * stack of open elements or on the tokens which have yet to be visited. + */ + + // > Let last node be node. + $last_node = $node; + } + + /* + * > Insert whatever last node ended up being in the previous step at the appropriate place + * > for inserting a node, but using common ancestor as the override target. + * + * As above, this re-parents a node which has already been visited and has no effect on + * the parse of the remaining document. + */ + + /* + * > Create an element for the token for which formatting element was created, in the HTML + * > namespace, with furthest block as the intended parent. + * > Take all of the child nodes of furthest block and append them to the element created + * > in the last step. + * > Append that new element to furthest block. + * + * The children of the furthest block have already been visited and moving them has no + * effect on the remaining parse. The new element itself, however, becomes an open element + * below the furthest block, where content which follows will be found. + */ + $formatting_clone = $this->clone_token( $formatting_element ); + + /* + * > Remove formatting element from the list of active formatting elements, and insert the + * > new element into the list of active formatting elements at the position of the + * > aforementioned bookmark. + */ + $formatting_element_afe_position = $afe->position_of( $formatting_element ); + $afe->remove_at( $formatting_element_afe_position ); + if ( $formatting_element_afe_position < $bookmark ) { + --$bookmark; + } + $afe->insert_at( $bookmark, $formatting_clone ); + + /* + * > Remove formatting element from the stack of open elements, and insert the new element + * > into the stack of open elements immediately below the position of furthest block in + * > that stack. 
+ */ + array_splice( $working_stack, array_search( $formatting_element, $working_stack, true ), 1 ); + array_splice( $working_stack, array_search( $furthest_block, $working_stack, true ) + 1, 0, array( $formatting_clone ) ); + + /* + * The working stack now describes the stack of open elements after this iteration of the + * algorithm: reconcile the stack of open elements so that the rearrangement is expressed + * as properly-nested closing and opening events. + */ + $this->reconcile_stack_of_open_elements( $working_stack ); + + // > Jump back to the step labeled outer loop. } - $this->bail( 'Cannot run adoption agency when looping required.' ); + return true; } /** @@ -6457,6 +6860,220 @@ private function insert_virtual_node( $token_name, $bookmark_name = null ): WP_H return $token; } + /** + * Creates a token that is a clone of a given element token, as when the + * HTML parsing algorithms create a new element for an existing token. + * + * The clone receives its own bookmark spanning the same input HTML as the + * original token so that its attributes may be read: reading an attribute + * of a reconstructed element reports the attribute of the tag which + * created its original token. The clone remains a distinct node, however, + * and modifying it is not supported. + * + * @since 7.1.0 + * @ignore + * + * @throws Exception When unable to allocate requisite bookmark. + * + * @param WP_HTML_Token $token Create a clone of this token. + * @return WP_HTML_Token Clone of the given token. + */ + private function clone_token( WP_HTML_Token $token ): WP_HTML_Token { + $name = $this->bookmark_token(); + $here = isset( $token->bookmark_name ) ? ( $this->bookmarks[ $token->bookmark_name ] ?? null ) : null; + + $this->bookmarks[ $name ] = null !== $here + ? 
new WP_HTML_Span( $here->start, $here->length ) + : new WP_HTML_Span( $this->bookmarks[ $this->state->current_token->bookmark_name ]->start, 0 ); + + $clone = new WP_HTML_Token( $name, $token->node_name, $token->has_self_closing_flag, $this->release_internal_bookmark_on_destruct ); + $clone->namespace = $token->namespace; + $clone->integration_node_type = $token->integration_node_type; + + return $clone; + } + + /** + * Updates the stack of open elements to contain a given arrangement of + * nodes, expressing the transformation as a properly-nested sequence of + * closing and opening events. + * + * The HTML Processor reports a document as a stream of tokens whose + * nesting structure is implied by the order of the opening and closing + * events for each element. Algorithms such as the adoption agency + * algorithm rearrange the stack of open elements in a random-access + * fashion, which has no direct representation in a stream of properly- + * nested events. This method expresses such rearrangements by closing + * elements down to the deepest ancestor shared with the desired + * arrangement and then opening the desired elements below it. + * + * Nodes which have already been visited cannot be re-parented: they were + * reported where they were originally found. Every token visited after + * this update, however, is reported with breadcrumbs matching the + * ancestor chain a browser would report at the same place in the document. + * + * @since 7.1.0 + * @ignore + * + * @param WP_HTML_Token[] $desired_stack Nodes the stack of open elements should contain, in order. 
+ */ + private function reconcile_stack_of_open_elements( array $desired_stack ): void { + $stack = $this->state->stack_of_open_elements; + + $shared_depth = 0; + $max_shared = min( $stack->count(), count( $desired_stack ) ); + while ( $shared_depth < $max_shared && $stack->stack[ $shared_depth ] === $desired_stack[ $shared_depth ] ) { + ++$shared_depth; + } + + for ( $i = $stack->count(); $i > $shared_depth; $i-- ) { + $stack->pop(); + } + + for ( $i = $shared_depth, $desired_depth = count( $desired_stack ); $i < $desired_depth; $i++ ) { + $stack->push( $desired_stack[ $i ] ); + } + } + + /** + * Removes a node from the stack of open elements, expressing the removal + * as a properly-nested sequence of closing and opening events when the + * node is not the current node. + * + * @since 7.1.0 + * @ignore + * + * @see WP_HTML_Processor::reconcile_stack_of_open_elements + * + * @param WP_HTML_Token $token Node to remove from the stack of open elements. + * @return bool Whether the node was found and removed. + */ + private function remove_node_from_stack_of_open_elements( WP_HTML_Token $token ): bool { + $desired_stack = $this->state->stack_of_open_elements->stack; + $position = array_search( $token, $desired_stack, true ); + if ( false === $position ) { + return false; + } + + array_splice( $desired_stack, $position, 1 ); + $this->reconcile_stack_of_open_elements( $desired_stack ); + return true; + } + + /** + * Pushes an element onto the list of active formatting elements, limiting + * the number of equivalent elements as required by the "Noah's Ark clause". + * + * > If there are already three elements in the list of active formatting + * > elements after the last marker, if any, or anywhere in the list if + * > there are no markers, that have the same tag name, namespace, and + * > attributes as element, then remove the earliest such element from the + * > list of active formatting elements. 
For these purposes, the attributes + * > must be compared as they were when the elements were created by the + * > parser; two elements have the same attributes if all their parsed + * > attributes can be paired such that the two attributes in each pair + * > have identical names, namespaces, and values (the order of the + * > attributes does not matter). + * + * @since 7.1.0 + * @ignore + * + * @see https://html.spec.whatwg.org/#push-onto-the-list-of-active-formatting-elements + * + * @param WP_HTML_Token $token Push this node onto the list of active formatting elements. + */ + private function push_onto_active_formatting_elements( WP_HTML_Token $token ): void { + /* + * Find entries which might be equivalent to the pushed element. + * Attributes are only compared once three or more entries share the + * tag name and namespace, because comparing attributes requires + * parsing each candidate's source tag. + */ + $candidates = array(); + foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { + if ( 'marker' === $item->node_name ) { + break; + } + + if ( $token->node_name === $item->node_name && $token->namespace === $item->namespace ) { + $candidates[] = $item; + } + } + + if ( count( $candidates ) >= 3 ) { + $signature = $this->get_attribute_comparison_signature( $token ); + $earliest_match = null; + $match_count = 0; + + // Candidates were collected from the end of the list: the last match found is the earliest. + foreach ( $candidates as $candidate ) { + if ( $signature === $this->get_attribute_comparison_signature( $candidate ) ) { + ++$match_count; + $earliest_match = $candidate; + } + } + + if ( $match_count >= 3 ) { + $this->state->active_formatting_elements->remove_node( $earliest_match ); + } + } + + // > Add element to the list of active formatting elements. 
+ $this->state->active_formatting_elements->push( $token ); + } + + /** + * Builds a canonical representation of the attribute set of a token's + * source tag, for determining whether two elements have the same + * attributes as required by the "Noah's Ark clause". + * + * Attribute names are unique within a tag, so sorting the name/value + * pairs by name produces a stable representation: two tags receive the + * same signature if and only if their parsed attribute sets are the same. + * Attribute namespaces need not be represented: elements subject to this + * comparison are HTML formatting elements, whose attributes are never + * placed in a foreign namespace. + * + * @since 7.1.0 + * @ignore + * + * @param WP_HTML_Token $token Token whose source tag's attributes are represented. + * @return string Canonical representation of the tag's attribute set. + */ + private function get_attribute_comparison_signature( WP_HTML_Token $token ): string { + if ( $token === $this->state->current_token ) { + // The parser is stopped at this tag: read its attributes directly. + $reader = $this; + $names = parent::get_attribute_names_with_prefix( '' ); + } else { + $span = isset( $token->bookmark_name ) ? ( $this->bookmarks[ $token->bookmark_name ] ?? null ) : null; + if ( null === $span || 0 === $span->length ) { + return ''; + } + + $reader = new WP_HTML_Tag_Processor( substr( $this->html, $span->start, $span->length ) ); + if ( ! $reader->next_token() ) { + return ''; + } + $names = $reader->get_attribute_names_with_prefix( '' ); + } + + if ( null === $names || array() === $names ) { + return ''; + } + + sort( $names, SORT_STRING ); + + $attributes = array(); + foreach ( $names as $name ) { + $attributes[ $name ] = $reader === $this + ? 
parent::get_attribute( $name ) + : $reader->get_attribute( $name ); + } + + return serialize( $attributes ); + } + /* * HTML Specification Helpers */ diff --git a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php index 7b244a5e8a8dd..8ec37fd1db689 100644 --- a/src/wp-includes/html-api/class-wp-html-unsupported-exception.php +++ b/src/wp-includes/html-api/class-wp-html-unsupported-exception.php @@ -34,7 +34,7 @@ class WP_HTML_Unsupported_Exception extends Exception { * * This does not imply that the token itself was unsupported, but it * may have been the case that the token triggered part of the HTML - * parsing that isn't supported, such as the adoption agency algorithm. + * parsing that isn't supported, such as foster parenting. * * @since 6.7.0 * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index e332ec12a0a91..3d169348224ff 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -534,7 +534,7 @@ static function ( int $errno, string $errstr ) use ( &$errors ) { */ public static function data_provider_fuzzer_native_error_cases() { return array( - 'Unsupported active formatting' => array( '', null ), + 'Reconstructed active formatting' => array( '', '' ), ); } diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 8cece32438bd3..c9d28634e7994 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -185,18 +185,42 @@ public function test_clear_to_navigate_after_seeking() { } /** - * Ensures that support is added for reconstructing active formatting elements - * before the HTML Processor handles situations with unclosed formats requiring it. 
+ * Ensures that unclosed formatting elements are reconstructed into each + * subsequent paragraph, accumulating as a browser would accumulate them. * * @ticket 58517 * * @covers WP_HTML_Processor::reconstruct_active_formatting_elements */ - public function test_fails_to_reconstruct_formatting_elements() { + public function test_reconstructs_formatting_elements() { $processor = WP_HTML_Processor::create_fragment( '

One

Two

Three

Four' ); - $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find first EM.' ); - $this->assertFalse( $processor->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' ); + /* + * Each opened EM element remains in the list of active formatting elements when its + * containing P closes. Every following paragraph reconstructs all of the unclosed + * EM elements and then adds its own, nesting one deeper each time: + * + *

One

+ *

Two

+ *

Three

+ *

Four

+ */ + $em_count = 0; + $deepest = array(); + while ( $processor->next_tag( 'EM' ) ) { + ++$em_count; + if ( count( $processor->get_breadcrumbs() ) > count( $deepest ) ) { + $deepest = $processor->get_breadcrumbs(); + } + } + + $this->assertNull( $processor->get_last_error(), 'Should have parsed the entire document without error.' ); + $this->assertSame( 10, $em_count, 'Should have visited every EM element, including those reconstructed.' ); + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'EM', 'EM', 'EM', 'EM' ), + $deepest, + 'Should have reconstructed three unclosed EM elements inside the last paragraph.' + ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorActiveFormattingElements.php b/tests/phpunit/tests/html-api/wpHtmlProcessorActiveFormattingElements.php new file mode 100644 index 0000000000000..7c44d6cabe11a --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorActiveFormattingElements.php @@ -0,0 +1,392 @@ +One

Two' ); + + // The SOURCE element doesn't trigger reconstruction, and this test asserts that. + $this->assertTrue( + $processor->next_tag( 'SOURCE' ), + 'Should have found the first SOURCE element.' + ); + + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'SOURCE' ), + $processor->get_breadcrumbs(), + 'Should have closed formatting element at first P element.' + ); + + $this->assertTrue( + $processor->next_tag( 'SOURCE' ), + 'Should have found the second SOURCE element.' + ); + + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'B', 'SOURCE' ), + $processor->get_breadcrumbs(), + 'Should have reconstructed the implicitly-closed B element for the text node.' + ); + } + + /** + * Ensures that reconstructed formatting elements report the attributes + * of the tag which created the element being reconstructed. + * + * @ticket 58517 + * + * @covers ::get_attribute + * @covers ::get_attribute_names_with_prefix + * @covers ::has_class + * @covers ::class_list + */ + public function test_reconstructed_formatting_element_reports_original_attributes() { + $processor = WP_HTML_Processor::create_fragment( '

inside

outside' ); + + $this->assertTrue( $processor->next_tag( 'B' ), 'Should have found the original B element.' ); + $this->assertTrue( $processor->next_tag( 'B' ), 'Should have found the reconstructed B element.' ); + + $this->assertSame( + array( 'HTML', 'BODY', 'B' ), + $processor->get_breadcrumbs(), + 'Should have reconstructed the B element outside of the closed P element.' + ); + + $this->assertSame( + 'bold', + $processor->get_attribute( 'class' ), + 'Should have read the "class" attribute from the source tag of the reconstructed element.' + ); + + $this->assertSame( + '1&2', + $processor->get_attribute( 'data-test' ), + 'Should have decoded the attribute value from the source tag of the reconstructed element.' + ); + + $this->assertSame( + array( 'class', 'data-test' ), + $processor->get_attribute_names_with_prefix( '' ), + 'Should have listed the attribute names from the source tag of the reconstructed element.' + ); + + $this->assertTrue( + $processor->has_class( 'bold' ), + 'Should have found the class name on the reconstructed element.' + ); + + $this->assertSame( + array( 'bold' ), + iterator_to_array( $processor->class_list() ), + 'Should have listed the class names of the reconstructed element.' + ); + } + + /** + * Ensures that reconstructed formatting elements cannot be modified. + * + * Reconstructed elements don't exist in the input HTML: there is no tag + * to modify. Writing to one could otherwise corrupt the source tag of + * the original element, which is a distinct node. + * + * @ticket 58517 + * + * @covers ::set_attribute + */ + public function test_reconstructed_formatting_element_cannot_be_modified() { + $processor = WP_HTML_Processor::create_fragment( '

inside

outside' ); + + $this->assertTrue( $processor->next_tag( 'B' ), 'Should have found the original B element.' ); + $this->assertTrue( $processor->next_tag( 'B' ), 'Should have found the reconstructed B element.' ); + + $this->assertFalse( + $processor->set_attribute( 'id', 'not-writable' ), + 'Should have refused to set an attribute on a reconstructed element.' + ); + + $this->assertFalse( + $processor->remove_attribute( 'class' ), + 'Should have refused to remove an attribute from a reconstructed element.' + ); + } + + /** + * Ensures that the "Noah's Ark clause" limits reconstruction to three + * equivalent formatting elements. + * + * @ticket 58517 + * + * @covers ::push_onto_active_formatting_elements + */ + public function test_noahs_ark_clause_limits_equivalent_formatting_elements() { + $processor = WP_HTML_Processor::create_fragment( '

first

second' ); + + while ( $processor->next_token() && 'second' !== $processor->get_modifiable_text() ) { + continue; + } + + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'B', 'B', 'B', '#text' ), + $processor->get_breadcrumbs(), + 'Should have reconstructed only three of the four equivalent B elements.' + ); + } + + /** + * Ensures that the "Noah's Ark clause" compares attributes and does not + * remove formatting elements whose attributes differ. + * + * @ticket 58517 + * + * @covers ::push_onto_active_formatting_elements + */ + public function test_noahs_ark_clause_compares_attributes() { + $processor = WP_HTML_Processor::create_fragment( '

first

second' ); + + while ( $processor->next_token() && 'second' !== $processor->get_modifiable_text() ) { + continue; + } + + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'B', 'B', 'B', 'B', '#text' ), + $processor->get_breadcrumbs(), + 'Should have reconstructed all four B elements since their attributes differ.' + ); + } + + /** + * Ensures that the adoption agency algorithm closes and reopens formatting + * elements when a formatting element is closed while non-formatting elements + * remain open, and that content which follows is reported with the ancestor + * chain a browser would report. + * + * @ticket 58517 + * + * @covers ::run_adoption_agency_algorithm + */ + public function test_adoption_agency_no_furthest_block() { + $processor = WP_HTML_Processor::create_fragment( '

123' ); + + while ( $processor->next_token() && '3' !== $processor->get_modifiable_text() ) { + continue; + } + + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'I', '#text' ), + $processor->get_breadcrumbs(), + 'Should have closed the B element and reconstructed the I element around the following text.' + ); + } + + /** + * Ensures that the adoption agency algorithm handles the "furthest block" + * case: content following the misnested closing tag must be found in the + * same ancestor chain a browser would report for it. + * + * @ticket 58517 + * + * @covers ::run_adoption_agency_algorithm + */ + public function test_adoption_agency_with_furthest_block() { + $processor = WP_HTML_Processor::create_fragment( '1

23' ); + + while ( $processor->next_token() && '3' !== $processor->get_modifiable_text() ) { + continue; + } + + $this->assertSame( + array( 'HTML', 'BODY', 'P', '#text' ), + $processor->get_breadcrumbs(), + 'Should have adopted the P element so that following text is inside it, outside the closed B.' + ); + } + + /** + * Ensures that content following a deeply-misnested formatting element is + * reported with the ancestor chain a browser would report for it. + * + * In this document, closing the A element adopts the inner DIV: browsers + * re-parent it under clones of the formatting elements U, I, and CODE. + * Content following the misnesting must be found at the same path. + * + * @ticket 58517 + * + * @covers ::run_adoption_agency_algorithm + */ + public function test_adoption_agency_deep_misnesting() { + $processor = WP_HTML_Processor::create_fragment( '

x' ); + + while ( $processor->next_token() && 'x' !== $processor->get_modifiable_text() ) { + continue; + } + + $this->assertSame( + array( 'HTML', 'BODY', 'DIV', 'U', 'I', 'CODE', 'DIV', '#text' ), + $processor->get_breadcrumbs(), + 'Should have reported following text with the ancestor chain a browser would produce.' + ); + } + + /** + * Ensures that a closing tag for a formatting element which is not an + * active format is ignored, as directed by the "any other end tag" + * fallback of the adoption agency algorithm. + * + * @ticket 58517 + * + * @covers ::run_adoption_agency_algorithm + * @covers ::in_body_any_other_end_tag + */ + public function test_adoption_agency_ignores_unopened_formatting_end_tag() { + $processor = WP_HTML_Processor::create_fragment( '

textmore' ); + + while ( $processor->next_token() && 'more' !== $processor->get_modifiable_text() ) { + continue; + } + + $this->assertNull( $processor->get_last_error(), 'Should have parsed the entire document without error.' ); + $this->assertSame( + array( 'HTML', 'BODY', 'P', '#text' ), + $processor->get_breadcrumbs(), + 'Should have ignored the stray closing tag and continued inside the P element.' + ); + } + + /** + * Ensures that the adoption agency algorithm expresses its rearrangement + * of the stack of open elements as a properly-nested stream of tokens. + * + * A browser parsing this document produces the following tree, in which + * the P element is re-parented out of the B element it started in, and a + * clone of the B element wraps the P element's earlier content: + * + * 1

23

+ * + * A single-pass parser cannot re-parent content it has already reported. + * Instead, when the misnesting is discovered at the closing B tag, the + * open elements are closed and reopened so that every token which follows + * is reported with browser-accurate breadcrumbs. This test pins down that + * event stream. + * + * @ticket 58517 + * + * @covers ::run_adoption_agency_algorithm + */ + public function test_adoption_agency_event_stream_remains_properly_nested() { + $processor = WP_HTML_Processor::create_fragment( '1

23' ); + + $events = array(); + while ( $processor->next_token() ) { + $events[] = array( + ( $processor->is_tag_closer() ? '-' : '+' ) . $processor->get_token_name(), + implode( ' ', $processor->get_breadcrumbs() ), + ); + } + + $this->assertNull( $processor->get_last_error(), 'Should have parsed the entire document without error.' ); + $this->assertSame( + array( + array( '+B', 'HTML BODY B' ), + array( '+#text', 'HTML BODY B #text' ), + array( '+P', 'HTML BODY B P' ), + array( '+#text', 'HTML BODY B P #text' ), + array( '-P', 'HTML BODY B' ), + array( '-B', 'HTML BODY' ), + array( '+P', 'HTML BODY P' ), + array( '+B', 'HTML BODY P B' ), + array( '-B', 'HTML BODY P' ), + array( '+#text', 'HTML BODY P #text' ), + array( '-P', 'HTML BODY' ), + ), + $events, + 'Should have expressed the adoption as a properly-nested stream of opening and closing events.' + ); + } + + /** + * Ensures that a new A element implicitly closes an open A element, even + * when the open element cannot be reached by generating end tags. + * + * @ticket 58517 + * + * @covers ::run_adoption_agency_algorithm + */ + public function test_a_implicitly_closes_open_a() { + $processor = WP_HTML_Processor::create_fragment( '12' ); + + $this->assertTrue( $processor->next_tag( 'A' ), 'Should have found the first A element.' ); + $this->assertTrue( $processor->next_tag( 'A' ), 'Should have found the second A element.' ); + + $this->assertSame( + array( 'HTML', 'BODY', 'A' ), + $processor->get_breadcrumbs(), + 'Should have closed the first A element before opening the second.' + ); + + $this->assertSame( + '/second', + $processor->get_attribute( 'href' ), + 'Should have matched the second A element.' + ); + } + + /** + * Ensures that formatting elements are reconstructed with stable breadcrumbs + * when seeking backwards and forwards across an adoption boundary. 
+ * + * @ticket 58517 + * + * @covers ::seek + */ + public function test_seeking_across_adoption_produces_stable_breadcrumbs() { + $processor = WP_HTML_Processor::create_fragment( '1

23' ); + + $this->assertTrue( $processor->next_tag( 'P' ), 'Should have found the P element.' ); + $this->assertTrue( $processor->set_bookmark( 'p' ), 'Should have set a bookmark on the P element.' ); + + $first_pass = array(); + while ( $processor->next_token() ) { + $first_pass[] = array( $processor->get_token_name(), $processor->get_breadcrumbs() ); + } + $this->assertNull( $processor->get_last_error(), 'Should have parsed the entire document without error.' ); + + $this->assertTrue( $processor->seek( 'p' ), 'Should have sought back to the P element.' ); + $this->assertSame( + array( 'HTML', 'BODY', 'B', 'P' ), + $processor->get_breadcrumbs(), + 'Should have restored the original breadcrumbs at the bookmarked element.' + ); + + $second_pass = array(); + while ( $processor->next_token() ) { + $second_pass[] = array( $processor->get_token_name(), $processor->get_breadcrumbs() ); + } + + $this->assertSame( $first_pass, $second_pass, 'Should have reported identical tokens after seeking back.' ); + } +} diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index b54fc047ab040..13bb18eeda323 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -195,14 +195,9 @@ public function test_fails_when_encountering_unsupported_markup( $html, $descrip */ public static function data_unsupported_markup() { return array( - 'A with formatting following unclosed A' => array( - 'Click Here', - 'Unclosed formatting requires complicated reconstruction.', - ), - - 'A after unclosed A inside DIV' => array( - '

', - 'A is a formatting element, which requires more complicated reconstruction.', + 'Foster parenting of A inside TABLE' => array( + 'Fostered
', + 'Fostered content requires moving nodes before the TABLE, which is not supported.', ), ); } diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index d87d784dbf2d4..43ad6a90239ef 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -23,21 +23,91 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { const TREE_INDENT = ' '; + /** + * Reason to skip tests which require relocating already-visited nodes. + * + * The HTML Processor visits a document in a single pass and cannot move + * nodes it has already visited. When the adoption agency algorithm runs, + * browsers may re-parent nodes found before the misnesting was discovered; + * this parser reports them where they were originally visited, so the + * constructed tree differs even though the parser state after the + * algorithm matches browsers exactly for everything which follows. + */ + const SKIP_HTML_PARSER_REPARENTS_VISITED_NODES = 'Single-pass parser: the adoption agency algorithm cannot relocate nodes which have already been visited.'; + + /** + * Reason to skip tests in which a FORM element is closed while other + * elements remain open inside of it. + * + * In this case browsers remove the FORM from the stack of open elements + * while its still-open descendants remain in place: the FORM remains an + * ancestor of following content in the DOM even though no new content + * can reach it. A properly-nested token stream cannot express this; + * this parser reports following content outside of the closed FORM, + * mirroring the stack of open elements a browser would maintain. 
+ */ + const SKIP_HTML_PARSER_CANNOT_HOLD_FORM_OPEN = 'Single-pass parser: a FORM closed while its descendants remain open stays in the document as their ancestor, which the token stream cannot express.'; + /** * Skip specific tests that may not be supported or have known issues. */ const SKIP_TESTS = array( - 'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests19/line0488' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests19/line0500' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests2/line0686' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests2/line0697' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'tests2/line0709' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', - 'webkit01/line0231' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'adoption01/line0001' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0014' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0030' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0062' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0108' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0124' => 
self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0141' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0241' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption01/line0281' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'adoption02/line0001' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'html5test-com/line0252' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'template/line1091' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line0237' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line0256' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line0706' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line0784' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line0850' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line0994' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line1015' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line1037' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line1061' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line1086' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line1111' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line1468' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests1/line1484' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests19/line0488' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests19/line0500' => 'Unimplemented: This parser does not add missing attributes to existing HTML or 
BODY tags.', + 'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests19/line1169' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests2/line0118' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests2/line0686' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests2/line0697' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests2/line0709' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests22/line0001' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests22/line0023' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests22/line0069' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests22/line0117' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests26/line0136' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tests6/line0012' => self::SKIP_HTML_PARSER_CANNOT_HOLD_FORM_OPEN, + 'tests8/line0133' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tricky01/line0001' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tricky01/line0019' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tricky01/line0078' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'tricky01/line0146' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'webkit01/line0231' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'webkit01/line0571' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'webkit01/line0586' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'webkit01/line0603' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'webkit02/line0186' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'webkit02/line0204' => 
self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'webkit02/line0224' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, + 'webkit02/line0242' => self::SKIP_HTML_PARSER_REPARENTS_VISITED_NODES, ); /** diff --git a/tests/phpunit/tests/html-api/wpHtmlSupportRequiredActiveFormatReconstruction.php b/tests/phpunit/tests/html-api/wpHtmlSupportRequiredActiveFormatReconstruction.php deleted file mode 100644 index a139850752f35..0000000000000 --- a/tests/phpunit/tests/html-api/wpHtmlSupportRequiredActiveFormatReconstruction.php +++ /dev/null @@ -1,70 +0,0 @@ -One

Two' ); - - // The SOURCE element doesn't trigger reconstruction, and this test asserts that. - $this->assertTrue( - $processor->next_tag( 'SOURCE' ), - 'Should have found the first custom element.' - ); - - $this->assertSame( - array( 'HTML', 'BODY', 'P', 'SOURCE' ), - $processor->get_breadcrumbs(), - 'Should have closed formatting element at first P element.' - ); - - /* - * There are two ways this test could fail. One is to appropriately find the - * second text node but fail to reconstruct the implicitly-closed B element. - * The other way is to fail to abort when encountering the second text node - * because the kind of active format reconstruction isn't supported. - * - * At the time of writing this test, the HTML Processor bails whenever it - * needs to reconstruct active formats, unless there are no active formats. - * To ensure that this test properly works once that support is expanded, - * it's written to verify both circumstances. Once support is added, this - * can be simplified to only contain the first clause of the conditional. - * - * The use of the SOURCE element is important here because most elements - * will also trigger reconstruction, which would conflate the test results - * with the text node triggering reconstruction. The SOURCE element won't - * do this, making it neutral. Therefore, the implicitly-closed B element - * will only be reconstructed by the text node. - */ - - if ( $processor->next_tag( 'SOURCE' ) ) { - $this->assertSame( - array( 'HTML', 'BODY', 'P', 'B', 'SOURCE' ), - $processor->get_breadcrumbs(), - 'Should have reconstructed the implicitly-closed B element.' - ); - } else { - $this->assertSame( - WP_HTML_Processor::ERROR_UNSUPPORTED, - $processor->get_last_error(), - 'Should have aborted for incomplete active format reconstruction when encountering the second text node.' - ); - } - } -}