diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
index 6513db35c1243..6e9c1a3840375 100644
--- a/src/wp-includes/html-api/class-wp-html-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -1523,6 +1523,398 @@ public function serialize_token(): string {
 		return $html;
 	}
 
+	/**
+	 * Replaces the inner HTML of the currently-matched element, if possible.
+	 *
+	 * The processor must be paused on the opening tag of an element which can
+	 * contain content: it cannot be paused on a void element (e.g. an IMG
+	 * element), on a self-contained element whose contents require a special
+	 * tokenizer state (e.g. a SCRIPT or TEXTAREA element), or on any token
+	 * which does not appear in the input HTML text (e.g. a TBODY element
+	 * implied by a TR inside a TABLE).
+	 *
+	 * Unlike the DOM `innerHTML` setter, which operates on a tree, this method
+	 * operates on the HTML text of the document. There are trees which can be
+	 * created through the DOM whose HTML serialization does not reproduce the
+	 * same tree. Content is rejected unless the final document parses with the
+	 * new content fully contained inside the context element and with no
+	 * changes of any kind outside of it.
+	 *
+	 * Rejected content leaves the document unmodified and returns `false`:
+	 *
+	 *  - Content which would close the context element or continue outside of it,
+	 *    e.g. setting `</p>outside` inside a P element.
+	 *  - Content whose elements would implicitly close the context element,
+	 *    e.g. setting `<li>` inside an LI element, or `<a>` inside an A element,
+	 *    because nesting these elements cannot be represented in HTML text.
+	 *  - Content which would modify parts of the document following the context
+	 *    element, e.g. setting `<b>unclosed` inside a DIV element which is
+	 *    followed by more content, because the unclosed B element would wrap
+	 *    that following content when the document is parsed again.
+	 *  - Content which the HTML Processor cannot parse, e.g. markup requiring
+	 *    foster-parenting, such as setting `text` directly inside a TABLE element.
+	 *  - Content inside a SELECT element other than OPTION, OPTGROUP, and HR
+	 *    elements, text, and comments, because the parsing rules for SELECT
+	 *    elements have recently changed in HTML and differ across parsers.
+	 *
+	 * Example:
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<div>old</div><p>kept</p>' );
+	 *     $processor->next_tag( 'DIV' );
+	 *     true === $processor->set_inner_html( '<p>one<p>two' );
+	 *     $processor->get_updated_html() === '<div><p>one<p>two</div><p>kept</p>';
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<a>WordPress</a>' );
+	 *     $processor->next_tag( 'A' );
+	 *     false === $processor->set_inner_html( '<a>links cannot nest</a>' );
+	 *     $processor->get_updated_html() === '<a>WordPress</a>';
+	 *
+	 * After a successful replacement the processor remains paused on the opening
+	 * tag of the context element; proceeding onward parses the new content.
+	 * Bookmarks pointing to tokens within the replaced content are released.
+	 *
+	 * Verification requires reparsing the document from its beginning both with
+	 * and without the replacement; this is an expensive operation which should
+	 * not be repeated in tight loops.
+	 *
+	 * @since 7.1.0
+	 *
+	 * @param string $html Raw HTML to replace the contents of the currently-matched element.
+	 * @return bool Whether the content was accepted and the document updated.
+	 */
+	public function set_inner_html( string $html ): bool {
+		if ( null !== $this->last_error ) {
+			return false;
+		}
+
+		/*
+		 * The processor must be paused on the opening tag of an element
+		 * found in the input HTML text which expects a closing tag.
+		 */
+		if (
+			null === $this->current_element ||
+			'#tag' !== $this->get_token_type() ||
+			$this->is_tag_closer() ||
+			$this->is_virtual() ||
+			true !== $this->expects_closer() ||
+			! isset( $this->bookmarks[ $this->current_element->token->bookmark_name ] )
+		) {
+			return false;
+		}
+
+		// Only fragment parsers with a BODY context can be reproduced for verification.
+		if ( null !== $this->context_node && 'BODY' !== $this->context_node->node_name ) {
+			return false;
+		}
+
+		/*
+		 * Reject content whose parsing is known to diverge from browsers,
+		 * because reparsing with the HTML Processor cannot detect whether
+		 * such content modifies the document outside of the context element.
+		 *
+		 *  - When an HTML or BODY start tag appears in a document, HTML parsers
+		 *    may adopt its attributes onto the existing HTML or BODY elements,
+		 *    modifying elements outside of the context element. The HTML
+		 *    Processor ignores these tags instead of adopting the attributes.
+		 *
+		 *  - The HTML Processor still parses SELECT content under rules which
+		 *    predate the "customizable select element" changes to HTML. It
+		 *    ignores most tags inside a SELECT element, while up-to-date
+		 *    parsers allow many of them, possibly escaping the SELECT and
+		 *    reopening formatting elements beyond the context element. Only
+		 *    tokens treated identically under both revisions of HTML are
+		 *    allowed inside a SELECT element. See https://core.trac.wordpress.org/ticket/63736.
+		 */
+		$context_in_select = in_array( 'SELECT', $this->get_breadcrumbs(), true );
+		$select_depth      = $context_in_select ? 1 : 0;
+
+		$scan = new WP_HTML_Tag_Processor( $html );
+		while ( $scan->next_token() ) {
+			if ( '#tag' !== $scan->get_token_type() ) {
+				continue;
+			}
+
+			$scanned_tag = $scan->get_token_name();
+
+			if ( $scan->is_tag_closer() ) {
+				if ( 'SELECT' === $scanned_tag ) {
+					// A SELECT containing the context element may not be closed.
+					if ( $context_in_select ) {
+						return false;
+					}
+					if ( $select_depth > 0 ) {
+						--$select_depth;
+					}
+				} elseif (
+					$select_depth > 0 &&
+					! in_array( $scanned_tag, array( 'OPTION', 'OPTGROUP', 'HR' ), true )
+				) {
+					return false;
+				}
+				continue;
+			}
+
+			if ( 'HTML' === $scanned_tag || 'BODY' === $scanned_tag ) {
+				return false;
+			}
+
+			if ( 'SELECT' === $scanned_tag ) {
+				// A SELECT element may not appear inside another SELECT element.
+				if ( $select_depth > 0 ) {
+					return false;
+				}
+				++$select_depth;
+				continue;
+			}
+
+			if (
+				$select_depth > 0 &&
+				! in_array( $scanned_tag, array( 'OPTION', 'OPTGROUP', 'HR' ), true )
+			) {
+				return false;
+			}
+		}
+
+		// Apply any pending updates so that the document below is final.
+		$this->get_updated_html();
+
+		$document      = $this->html;
+		$context_token = $this->current_element->token;
+		$context_span  = $this->bookmarks[ $context_token->bookmark_name ];
+		$inner_start   = $context_span->start + $context_span->length;
+
+		/*
+		 * Walk a parser over the existing document to find where the context
+		 * element closes, and record every stack operation which occurs from
+		 * that closing through the end of the document. These operations
+		 * describe the structure of the document outside (after) the context
+		 * element, which the new content must leave fully unmodified.
+		 */
+		$original = $this->create_equivalent_parser( $document );
+		if (
+			null === $original ||
+			! self::advance_to_tag_opener_at( $original, $context_span->start, $context_token->node_name )
+		) {
+			return false;
+		}
+
+		$original_context = $original->current_element->token;
+		$found_closing    = false;
+		while ( $original->next_token() ) {
+			$event = $original->current_element;
+			if ( WP_HTML_Stack_Event::POP === $event->operation && $event->token === $original_context ) {
+				$found_closing = true;
+				break;
+			}
+		}
+
+		if ( ! $found_closing ) {
+			// The document could not be fully parsed: unable to verify the replacement.
+			return false;
+		}
+
+		/*
+		 * The inner content ends where the token responsible for closing the
+		 * context element begins: at its closing tag when explicitly closed,
+		 * at the token which implicitly closed it, or at the end of the
+		 * document when no content closed it.
+		 */
+		$at_end_of_document = (
+			WP_HTML_Tag_Processor::STATE_COMPLETE === $original->parser_state ||
+			WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $original->parser_state ||
+			null === $original->state->current_token
+		);
+
+		$inner_end = $at_end_of_document
+			? strlen( $document )
+			: $original->bookmarks[ $original->state->current_token->bookmark_name ]->start;
+
+		$expected_events = array( self::stack_event_signature( $original, $inner_end ) );
+		while ( $original->next_token() ) {
+			$expected_events[] = self::stack_event_signature( $original, $inner_end );
+		}
+
+		if ( null !== $original->get_last_error() ) {
+			return false;
+		}
+
+		/*
+		 * Reparse the document with the new content in place. The new content
+		 * must remain fully contained within the context element, and from the
+		 * closing of the context element onward, the document must produce
+		 * exactly the same structure as it did without the replacement.
+		 */
+		$candidate_document = substr( $document, 0, $inner_start ) . $html . substr( $document, $inner_end );
+		$candidate_end      = $inner_start + strlen( $html );
+
+		$candidate = $this->create_equivalent_parser( $candidate_document );
+		if (
+			null === $candidate ||
+			! self::advance_to_tag_opener_at( $candidate, $context_span->start, $context_token->node_name )
+		) {
+			return false;
+		}
+
+		$candidate_context = $candidate->current_element->token;
+		$context_depth     = $candidate->get_current_depth();
+		$opened_by_content = array();
+		$found_closing     = false;
+		while ( $candidate->next_token() ) {
+			$event = $candidate->current_element;
+
+			if ( WP_HTML_Stack_Event::POP === $event->operation ) {
+				if ( $event->token === $candidate_context ) {
+					$found_closing = true;
+					break;
+				}
+
+				// Only elements opened by the new content may be closed by it.
+				if ( ! isset( $opened_by_content[ spl_object_id( $event->token ) ] ) ) {
+					return false;
+				}
+			} else {
+				// Every element opened before the context element closes must be inside of it.
+				if ( $candidate->get_current_depth() <= $context_depth ) {
+					return false;
+				}
+				$opened_by_content[ spl_object_id( $event->token ) ] = true;
+			}
+		}
+
+		if ( ! $found_closing ) {
+			return false;
+		}
+
+		$expected_count = count( $expected_events );
+		$expected_index = 0;
+		if ( self::stack_event_signature( $candidate, $candidate_end ) !== $expected_events[ $expected_index++ ] ) {
+			return false;
+		}
+
+		while ( $candidate->next_token() ) {
+			if (
+				$expected_index >= $expected_count ||
+				self::stack_event_signature( $candidate, $candidate_end ) !== $expected_events[ $expected_index++ ]
+			) {
+				return false;
+			}
+		}
+
+		if ( $expected_index !== $expected_count || null !== $candidate->get_last_error() ) {
+			return false;
+		}
+
+		/*
+		 * The replacement is proven safe: apply it to the document.
+		 *
+		 * Bookmarks pointing into the replaced content refer to tokens
+		 * which no longer exist and must be released.
+		 */
+		foreach ( $this->bookmarks as $bookmark_name => $bookmark_span ) {
+			if (
+				$bookmark_span->start < $inner_end &&
+				$bookmark_span->start + $bookmark_span->length > $inner_start
+			) {
+				unset( $this->bookmarks[ $bookmark_name ] );
+			}
+		}
+
+		$this->lexical_updates[] = new WP_HTML_Text_Replacement( $inner_start, $inner_end - $inner_start, $html );
+		$this->get_updated_html();
+
+		return true;
+	}
+
+	/**
+	 * Creates a new HTML Processor over the given document, constructed in
+	 * the same parsing mode as this processor, for verifying modifications.
+	 *
+	 * The base class is used intentionally so that verification follows
+	 * core HTML semantics even when called from a subclass.
+	 *
+	 * @since 7.1.0
+	 *
+	 * @param string $document Full text of the document to parse.
+	 * @return WP_HTML_Processor|null The created processor if successful, otherwise null.
+	 */
+	private function create_equivalent_parser( string $document ): ?WP_HTML_Processor {
+		return null === $this->context_node
+			? WP_HTML_Processor::create_full_parser( $document )
+			: WP_HTML_Processor::create_fragment( $document );
+	}
+
+	/**
+	 * Advances a processor to the opening tag found at the given byte offset
+	 * in its document.
+	 *
+	 * @since 7.1.0
+	 *
+	 * @param WP_HTML_Processor $walker    Processor to advance.
+	 * @param int               $at        Byte offset into the document where the opening tag starts.
+	 * @param string            $node_name Node name the found tag must have.
+	 * @return bool Whether the processor was paused on the described opening tag.
+	 */
+	private static function advance_to_tag_opener_at( WP_HTML_Processor $walker, int $at, string $node_name ): bool {
+		while ( $walker->next_token() ) {
+			$event = $walker->current_element;
+			if (
+				WP_HTML_Stack_Event::PUSH === $event->operation &&
+				'real' === $event->provenance &&
+				isset( $walker->bookmarks[ $event->token->bookmark_name ] ) &&
+				$walker->bookmarks[ $event->token->bookmark_name ]->start === $at
+			) {
+				return $event->token->node_name === $node_name;
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Describes the stack operation a processor is currently paused on, in a
+	 * form which can be compared across two parses of related documents.
+	 *
+	 * The signature contains the operation, the node it operates on, where in
+	 * the document it occurred (relative to the given base offset), and the
+	 * resulting path from the root of the document. Two matching parses
+	 * produce equal signature sequences; any structural divergence produces
+	 * a differing signature.
+	 *
+	 * @since 7.1.0
+	 *
+	 * @param WP_HTML_Processor $walker      Processor paused on a stack event.
+	 * @param int               $base_offset Byte offset from which event locations are measured.
+	 * @return array Comparable description of the stack event.
+	 */
+	private static function stack_event_signature( WP_HTML_Processor $walker, int $base_offset ): array {
+		$at_end_of_document = (
+			WP_HTML_Tag_Processor::STATE_COMPLETE === $walker->parser_state ||
+			WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $walker->parser_state ||
+			null === $walker->state->current_token
+		);
+
+		if ( $at_end_of_document ) {
+			$at     = 'end-of-document';
+			$length = 0;
+		} else {
+			$span   = $walker->bookmarks[ $walker->state->current_token->bookmark_name ];
+			$at     = $span->start - $base_offset;
+			$length = $span->length;
+		}
+
+		$event = $walker->current_element;
+
+		return array(
+			'operation'  => $event->operation,
+			'provenance' => $event->provenance,
+			'node_name'  => $event->token->node_name,
+			'namespace'  => $event->token->namespace,
+			'at'         => $at,
+			'length'     => $length,
+			'path'       => implode( ' > ', $walker->get_breadcrumbs() ),
+		);
+	}
+
 	/**
 	 * Parses next element in the 'initial' insertion mode.
 	 *
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-set-inner-html.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-set-inner-html.php new file mode 100644 index 0000000000000..e630bf7b73930 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-set-inner-html.php @@ -0,0 +1,443 @@ +assertTrue( + $processor->next_tag( $target ), + "Could not find {$target} element in test document: check test setup." + ); + + $this->assertTrue( + $processor->set_inner_html( $new_html ), + 'Should have accepted content which is fully contained within the context element.' + ); + + $this->assertSame( + $expected, + $processor->get_updated_html(), + 'Should have replaced the inner content of the context element and nothing else.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_content_replaced_within_context_element(): array { + return array( + 'Plain text' => array( '
    old content
    after', 'DIV', 'fresh', '
    fresh
    after' ), + 'Empty content' => array( '
    full of stuff
    ', 'DIV', '', '
    ' ), + 'Element markup' => array( '
    abc
    ', 'DIV', '

    replaced

    ', '

    replaced

    ' ), + 'Markup with implied closers' => array( '
    x

    kept

    ', 'DIV', '

    one

    two', '

    one

    two

    kept

    ' ), + 'Same element nested' => array( '
    x
    ', 'DIV', '
    nested
    ', '
    nested
    ' ), + 'Identical content' => array( '
    same
    ', 'DIV', 'same', '
    same
    ' ), + 'A with phrasing content' => array( 'WordPress', 'A', 'the best CMS', 'the best CMS' ), + 'Implicitly-closed LI' => array( '', 'LI', 'replaced', '' ), + 'Implicitly-closed P' => array( '

    one

    two', 'P', 'styled text', '

    styled text

    two' ), + 'Unclosed element at end' => array( '

    dangling', 'P', 'replaced', '

    replaced' ), + 'Comment' => array( '

    x
    ', 'DIV', '', '
    ' ), + 'TD cell content' => array( '
    old
    ', 'TD', 'new', '
    new
    ' ), + 'TABLE rows with implied TBODY' => array( '
    a
    ', 'TABLE', 'b', '
    b
    ' ), + 'SELECT options' => array( '', 'SELECT', '

    escaped' ), + 'LI inside LI' => array( '

    • one
    • two
    ', 'LI', '
  • lists collapse' ), + 'P inside P' => array( '

    one

    ', 'P', '

    paragraphs close paragraphs

    ' ), + 'Heading inside heading' => array( '

    title

    ', 'H1', '

    headings close headings

    ' ), + 'BUTTON inside BUTTON' => array( '', 'BUTTON', '