From d8e1170bc2d2218cbb1f037c54a84176a387acb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 20 May 2026 20:30:58 +0200 Subject: [PATCH] Add native plain text literal URL rewrites --- bin/benchmark-native-apis.php | 213 ++++++++++ extensions/native-apis/README.md | 7 + extensions/native-apis/native_apis_shim.c | 376 ++++++++++++++++++ .../native-apis/playground/blueprint.json | 2 +- extensions/native-apis/src/lib.rs | 25 ++ extensions/native-apis/src/url_text.rs | 229 ++++++++++- .../native-apis/tests/verify-native-apis.php | 24 ++ 7 files changed, 874 insertions(+), 2 deletions(-) diff --git a/bin/benchmark-native-apis.php b/bin/benchmark-native-apis.php index fb22b34d7..24118a1f2 100644 --- a/bin/benchmark-native-apis.php +++ b/bin/benchmark-native-apis.php @@ -748,6 +748,13 @@ URLInTextProcessor::class, 'wp_toolkit_native_api_benchmark_url_in_text_processor' ); + $results[] = wp_toolkit_native_api_benchmark_run( + 'plain-text-literal-url-rewrite', + 'php', + $iterations, + URLInTextProcessor::class, + 'wp_toolkit_native_api_benchmark_php_plain_text_literal_url_rewrite' + ); } if ( wp_toolkit_native_api_benchmark_should_run( $mode, 'native' ) ) { @@ -758,6 +765,13 @@ 'WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor', 'wp_toolkit_native_api_benchmark_native_url_in_text_processor' ); + $results[] = wp_toolkit_native_api_benchmark_run( + 'plain-text-literal-url-rewrite', + 'native', + $iterations, + 'WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor', + 'wp_toolkit_native_api_benchmark_native_plain_text_literal_url_rewrite' + ); } } @@ -2365,6 +2379,179 @@ function wp_toolkit_native_api_benchmark_native_url_in_text_processor() { return $count; } +/** + * Benchmark the PHP plain text literal URL rewrite path. + * + * @return int Number of URLs rewritten. 
+ */ +function wp_toolkit_native_api_benchmark_php_plain_text_literal_url_rewrite() { + static $mappings = null; + + $text = wp_toolkit_native_api_benchmark_plain_text_literal_url_document(); + if ( null === $mappings ) { + $mappings = wp_toolkit_native_api_benchmark_parse_plain_text_literal_mappings( + wp_toolkit_native_api_benchmark_plain_text_literal_url_mapping() + ); + } + + $rewritten = wp_toolkit_native_api_benchmark_php_rewrite_plain_text_literal_urls( $text, $mappings ); + if ( false === $rewritten ) { + throw new RuntimeException( 'PHP plain text literal URL rewrite benchmark did not rewrite the fixture.' ); + } + + $count = substr_count( $rewritten, 'https://new.example/base/' ); + if ( 360 !== $count ) { + throw new RuntimeException( "PHP plain text literal URL rewrite benchmark expected 360 URLs, rewrote {$count}." ); + } + + return $count; +} + +/** + * Benchmark the native plain text literal URL rewrite path. + * + * @return int Number of URLs rewritten. + */ +function wp_toolkit_native_api_benchmark_native_plain_text_literal_url_rewrite() { + if ( ! function_exists( 'wp_native_apis_rewrite_plain_text_literal_urls' ) ) { + throw new RuntimeException( 'Function wp_native_apis_rewrite_plain_text_literal_urls is not available.' ); + } + + $rewritten = wp_native_apis_rewrite_plain_text_literal_urls( + wp_toolkit_native_api_benchmark_plain_text_literal_url_document(), + wp_toolkit_native_api_benchmark_plain_text_literal_url_mapping() + ); + if ( false === $rewritten ) { + throw new RuntimeException( 'Native plain text literal URL rewrite benchmark did not rewrite the fixture.' ); + } + + $count = substr_count( $rewritten, 'https://new.example/base/' ); + if ( 360 !== $count ) { + throw new RuntimeException( "Native plain text literal URL rewrite benchmark expected 360 URLs, rewrote {$count}." ); + } + + return $count; +} + +/** + * Rewrite simple literal source-origin URLs in known plain text. + * + * @param string $text Plain text fixture. 
+ * @param array $mappings Parsed source-origin to target-prefix mappings. + * @return false|string false when the generic parser path must handle it. + */ +function wp_toolkit_native_api_benchmark_php_rewrite_plain_text_literal_urls( $text, $mappings ) { + if ( array() === $mappings || false !== strpbrk( $text, "<>\"'\\{}[]()" ) ) { + return false; + } + + $replacements = array(); + foreach ( $mappings as $mapping ) { + $from = $mapping['from']; + $from_length = strlen( $from ); + $offset = 0; + + while ( true ) { + $position = strpos( $text, $from, $offset ); + if ( false === $position ) { + break; + } + + if ( + ! wp_toolkit_native_api_benchmark_literal_origin_has_valid_left_boundary( $text, $position ) || + ! wp_toolkit_native_api_benchmark_literal_origin_has_valid_right_boundary( $text, $position + $from_length ) + ) { + return false; + } + + $replacements[] = array( $position, $from_length, $mapping['to'] ); + $offset = $position + $from_length; + } + } + + if ( array() === $replacements ) { + return false; + } + + usort( + $replacements, + function ( $a, $b ) { + return $a[0] <=> $b[0]; + } + ); + + $rewritten = ''; + $cursor = 0; + foreach ( $replacements as $replacement ) { + $position = $replacement[0]; + $length = $replacement[1]; + $to = $replacement[2]; + if ( $position < $cursor ) { + return false; + } + + $rewritten .= substr( $text, $cursor, $position - $cursor ); + $rewritten .= $to; + $cursor = $position + $length; + } + $rewritten .= substr( $text, $cursor ); + + return $rewritten; +} + +/** + * Parse compact benchmark mappings into plain text literal rewrite rows. + * + * @param string $compact_mapping Compact source-origin to target-prefix mappings. 
+ * @return array + */ +function wp_toolkit_native_api_benchmark_parse_plain_text_literal_mappings( $compact_mapping ) { + $mappings = array(); + foreach ( explode( "\x1e", $compact_mapping ) as $row ) { + $parts = explode( "\x1f", $row, 2 ); + if ( 2 !== count( $parts ) ) { + continue; + } + + $mappings[] = array( + 'from' => $parts[0], + 'to' => $parts[1], + ); + } + + return $mappings; +} + +/** + * Check the left boundary for a plain text literal source origin. + * + * @param string $text Plain text fixture. + * @param int $position Candidate position. + * @return bool + */ +function wp_toolkit_native_api_benchmark_literal_origin_has_valid_left_boundary( $text, $position ) { + if ( 0 === $position ) { + return true; + } + + return ctype_space( $text[ $position - 1 ] ); +} + +/** + * Check the right boundary for a plain text literal source origin. + * + * @param string $text Plain text fixture. + * @param int $position Candidate end position. + * @return bool + */ +function wp_toolkit_native_api_benchmark_literal_origin_has_valid_right_boundary( $text, $position ) { + if ( $position >= strlen( $text ) ) { + return true; + } + + return '/' === $text[ $position ] || '?' === $text[ $position ] || '#' === $text[ $position ]; +} + /** * Benchmark the XML processor. * @@ -4266,6 +4453,32 @@ function wp_toolkit_native_api_benchmark_url_in_text_document() { return implode( ' ', $items ); } +/** + * Build representative plain text for literal source-origin URL rewrites. + * + * @return string + */ +function wp_toolkit_native_api_benchmark_plain_text_literal_url_document() { + $items = array(); + for ( $i = 0; $i < 120; $i++ ) { + $items[] = sprintf( + 'Post %1$d references http://old.example/posts/%1$d, http://old.example/media/%1$d.jpg and http://old.example/meta/%1$d.', + $i + ); + } + + return implode( ' ', $items ); +} + +/** + * Build compact source-origin to target-prefix mappings for literal rewrites. 
+ * + * @return string + */ +function wp_toolkit_native_api_benchmark_plain_text_literal_url_mapping() { + return "http://old.example\x1fhttps://new.example/base"; +} + /** * Build a representative XML document. * diff --git a/extensions/native-apis/README.md b/extensions/native-apis/README.md index 8b1ae983f..4f0bf7a74 100644 --- a/extensions/native-apis/README.md +++ b/extensions/native-apis/README.md @@ -103,6 +103,12 @@ candidate scanner. The public PHP class still validates candidates with the existing WHATWG parser and uses the PHP regular-expression scanner for non-ASCII text or when native defaults are disabled. +`wp_native_apis_rewrite_plain_text_literal_urls()` is a narrower primitive for +known plain text leaves. It accepts compact `source-origin\x1ftarget-prefix` +mappings separated by `\x1e`, rewrites ASCII case-insensitive HTTP(S) source-origin matches, and +returns `false` instead of guessing when the text contains structured-data delimiters, +an origin match is not bounded as a URL origin, or no mapping matches the text. + ## Build Details The build requires Rust, PHP development headers, `php-config`, and libclang. @@ -378,6 +384,7 @@ caller-shaped workflow. The benchmark harness includes rows for these paths: - XML tag, prefix, and sanitizer summaries through direct source scans. - URL-in-text scans through a direct native plain-text URL candidate processor, with public `URLInTextProcessor` rows preserving WHATWG validation. +- Plain text literal source-origin URL rewrites for parser-owned leaf text. The compact batch APIs return strings with `\x1f` field separators and `\x1e` record separators. They are intended for callers that need incremental processing but can aggregate without building one PHP array per tag or token. 
diff --git a/extensions/native-apis/native_apis_shim.c b/extensions/native-apis/native_apis_shim.c index 86a0736af..5fff1a4ff 100644 --- a/extensions/native-apis/native_apis_shim.c +++ b/extensions/native-apis/native_apis_shim.c @@ -6,6 +6,7 @@ #include "ext/standard/info.h" #include +#include #include typedef struct { @@ -18,6 +19,25 @@ typedef struct { zend_object std; } wp_native_smoke_object; +typedef struct { + char *value; + size_t length; + size_t capacity; +} wp_native_string_buffer; + +typedef struct { + size_t position; + size_t length; + const char *replacement; + size_t replacement_len; +} wp_native_literal_replacement; + +typedef struct { + wp_native_literal_replacement *items; + size_t count; + size_t capacity; +} wp_native_literal_replacement_list; + static zend_class_entry *wp_native_html_tag_processor_ce; static zend_class_entry *wp_native_html_processor_ce; static zend_class_entry *wp_native_xml_processor_ce; @@ -29,6 +49,11 @@ wp_native_ascii_is_space( char c ) { return ' ' == c || '\t' == c || '\n' == c || '\r' == c || '\f' == c; } +static zend_bool +wp_native_ascii_is_php_space( char c ) { + return ' ' == c || '\t' == c || '\n' == c || '\r' == c || '\f' == c || '\v' == c; +} + static zend_bool wp_native_ascii_is_alnum( char c ) { return ( 'a' <= c && 'z' >= c ) || ( 'A' <= c && 'Z' >= c ) || ( '0' <= c && '9' >= c ); @@ -89,6 +114,220 @@ wp_native_ascii_starts_with( const char *value, size_t value_len, const char *pr return 1; } +static zend_bool +wp_native_buffer_reserve( wp_native_string_buffer *buffer, size_t additional ) { + size_t required; + size_t capacity; + char *value; + + if ( additional > ( (size_t) -1 ) - buffer->length ) { + return 0; + } + + required = buffer->length + additional; + if ( required <= buffer->capacity ) { + return 1; + } + + capacity = buffer->capacity ? 
buffer->capacity : 128; + while ( capacity < required ) { + if ( capacity > ( (size_t) -1 ) / 2 ) { + capacity = required; + break; + } + capacity *= 2; + } + + value = safe_erealloc( buffer->value, capacity, sizeof( char ), 0 ); + if ( NULL == value ) { + return 0; + } + + buffer->value = value; + buffer->capacity = capacity; + return 1; +} + +static zend_bool +wp_native_buffer_append( wp_native_string_buffer *buffer, const char *value, size_t length ) { + if ( 0 == length ) { + return 1; + } + + if ( ! wp_native_buffer_reserve( buffer, length ) ) { + return 0; + } + + memcpy( buffer->value + buffer->length, value, length ); + buffer->length += length; + return 1; +} + +static zend_bool +wp_native_literal_replacements_append( + wp_native_literal_replacement_list *list, + size_t position, + size_t length, + const char *replacement, + size_t replacement_len +) { + wp_native_literal_replacement *items; + size_t capacity; + + if ( list->count == list->capacity ) { + capacity = list->capacity ? 
list->capacity * 2 : 4; + items = safe_erealloc( list->items, capacity, sizeof( wp_native_literal_replacement ), 0 ); + if ( NULL == items ) { + return 0; + } + + list->items = items; + list->capacity = capacity; + } + + list->items[ list->count ].position = position; + list->items[ list->count ].length = length; + list->items[ list->count ].replacement = replacement; + list->items[ list->count ].replacement_len = replacement_len; + list->count++; + return 1; +} + +static int +wp_native_compare_literal_replacements( const void *left, const void *right ) { + const wp_native_literal_replacement *left_replacement = (const wp_native_literal_replacement *) left; + const wp_native_literal_replacement *right_replacement = (const wp_native_literal_replacement *) right; + + if ( left_replacement->position < right_replacement->position ) { + return -1; + } + if ( left_replacement->position > right_replacement->position ) { + return 1; + } + return 0; +} + +static const char * +wp_native_ascii_find_case_insensitive( const char *haystack, size_t haystack_len, const char *needle, size_t needle_len ) { + size_t i; + + if ( 0 == needle_len || haystack_len < needle_len ) { + return NULL; + } + + for ( i = 0; i <= haystack_len - needle_len; i++ ) { + if ( wp_native_ascii_starts_with( haystack + i, haystack_len - i, needle, needle_len ) ) { + return haystack + i; + } + } + + return NULL; +} + +static zend_bool +wp_native_has_plain_text_literal_structural_delimiter( const char *text, size_t text_len ) { + size_t i; + + for ( i = 0; i < text_len; i++ ) { + switch ( text[ i ] ) { + case '<': + case '>': + case '"': + case '\'': + case '\\': + case '{': + case '}': + case '[': + case ']': + case '(': + case ')': + return 1; + } + } + + return 0; +} + +static zend_bool +wp_native_plain_text_literal_origin_has_valid_left_boundary( const char *text, size_t position ) { + return 0 == position || wp_native_ascii_is_php_space( text[ position - 1 ] ); +} + +static zend_bool 
+wp_native_plain_text_literal_origin_has_valid_right_boundary( const char *text, size_t text_len, size_t position ) { + return position >= text_len || '/' == text[ position ] || '?' == text[ position ] || '#' == text[ position ]; +} + +static size_t +wp_native_plain_text_literal_http_origin_end( const char *url, size_t url_len ) { + size_t authority_start; + size_t authority_end; + + if ( wp_native_ascii_starts_with( url, url_len, "http://", 7 ) ) { + authority_start = 7; + } else if ( wp_native_ascii_starts_with( url, url_len, "https://", 8 ) ) { + authority_start = 8; + } else { + return 0; + } + + authority_end = authority_start; + while ( authority_end < url_len && '/' != url[ authority_end ] && '?' != url[ authority_end ] && '#' != url[ authority_end ] ) { + if ( '@' == url[ authority_end ] ) { + return 0; + } + authority_end++; + } + + if ( authority_end == authority_start ) { + return 0; + } + + return authority_end; +} + +static zend_bool +wp_native_is_valid_plain_text_literal_mapping( const char *from, size_t from_len, const char *to, size_t to_len ) { + size_t i; + size_t from_origin_end; + size_t to_origin_end; + + if ( 0 == from_len || 0 == to_len ) { + return 0; + } + + for ( i = 0; i < from_len; i++ ) { + if ( (unsigned char) from[ i ] > 0x7f ) { + return 0; + } + } + for ( i = 0; i < to_len; i++ ) { + if ( (unsigned char) to[ i ] > 0x7f ) { + return 0; + } + } + + from_origin_end = wp_native_plain_text_literal_http_origin_end( from, from_len ); + if ( 0 == from_origin_end || from_origin_end != from_len ) { + return 0; + } + + to_origin_end = wp_native_plain_text_literal_http_origin_end( to, to_len ); + if ( 0 == to_origin_end ) { + return 0; + } + if ( to_origin_end < to_len && '/' != to[ to_origin_end ] ) { + return 0; + } + for ( i = to_origin_end; i < to_len; i++ ) { + if ( '?' == to[ i ] || '#' == to[ i ] ) { + return 0; + } + } + + return 1; +} + static zend_bool wp_native_is_trailing_url_punctuation( char c ) { return '.' 
== c || ',' == c || ';' == c || ':' == c || '!' == c || '?' == c || ')' == c || '"' == c || ']' == c || '}' == c; @@ -461,6 +700,137 @@ PHP_METHOD( NativeXMLProcessor, get_tag_local_name ) { RETURN_STRING( object->current_name ); } +PHP_FUNCTION( wp_native_apis_rewrite_plain_text_literal_urls ) { + char *text; + size_t text_len; + char *compact_mapping; + size_t compact_mapping_len; + size_t row_start = 0; + size_t copied_until = 0; + wp_native_literal_replacement_list replacements = { NULL, 0, 0 }; + wp_native_string_buffer buffer = { NULL, 0, 0 }; + + ZEND_PARSE_PARAMETERS_START( 2, 2 ) + Z_PARAM_STRING( text, text_len ) + Z_PARAM_STRING( compact_mapping, compact_mapping_len ) + ZEND_PARSE_PARAMETERS_END(); + + if ( wp_native_has_plain_text_literal_structural_delimiter( text, text_len ) ) { + RETURN_FALSE; + } + + while ( row_start < compact_mapping_len ) { + size_t row_end = row_start; + size_t separator; + const char *from; + const char *to; + size_t from_len; + size_t to_len; + size_t offset = 0; + + while ( row_end < compact_mapping_len && '\x1e' != compact_mapping[ row_end ] ) { + row_end++; + } + + separator = row_start; + while ( separator < row_end && '\x1f' != compact_mapping[ separator ] ) { + separator++; + } + + if ( separator == row_end ) { + row_start = row_end + ( row_end < compact_mapping_len ? 1 : 0 ); + continue; + } + + from = compact_mapping + row_start; + from_len = separator - row_start; + to = compact_mapping + separator + 1; + to_len = row_end - separator - 1; + + if ( ! wp_native_is_valid_plain_text_literal_mapping( from, from_len, to, to_len ) ) { + row_start = row_end + ( row_end < compact_mapping_len ? 1 : 0 ); + continue; + } + + while ( offset + from_len <= text_len ) { + const char *match = wp_native_ascii_find_case_insensitive( text + offset, text_len - offset, from, from_len ); + size_t position; + + if ( NULL == match ) { + break; + } + + position = (size_t) ( match - text ); + if ( + ! 
wp_native_plain_text_literal_origin_has_valid_left_boundary( text, position ) || + ! wp_native_plain_text_literal_origin_has_valid_right_boundary( text, text_len, position + from_len ) + ) { + if ( NULL != replacements.items ) { + efree( replacements.items ); + } + RETURN_FALSE; + } + + if ( ! wp_native_literal_replacements_append( &replacements, position, from_len, to, to_len ) ) { + if ( NULL != replacements.items ) { + efree( replacements.items ); + } + php_error_docref( NULL, E_WARNING, "Unable to allocate literal URL rewrite replacements" ); + RETURN_FALSE; + } + + offset = position + from_len; + } + + row_start = row_end + ( row_end < compact_mapping_len ? 1 : 0 ); + } + + if ( 0 == replacements.count ) { + RETURN_FALSE; + } + + qsort( replacements.items, replacements.count, sizeof( wp_native_literal_replacement ), wp_native_compare_literal_replacements ); + + { + size_t i; + for ( i = 0; i < replacements.count; i++ ) { + wp_native_literal_replacement *replacement = &replacements.items[ i ]; + + if ( replacement->position < copied_until ) { + efree( replacements.items ); + RETURN_FALSE; + } + + if ( + ! wp_native_buffer_append( &buffer, text + copied_until, replacement->position - copied_until ) || + ! wp_native_buffer_append( &buffer, replacement->replacement, replacement->replacement_len ) + ) { + efree( replacements.items ); + if ( NULL != buffer.value ) { + efree( buffer.value ); + } + php_error_docref( NULL, E_WARNING, "Unable to allocate rewritten literal URL text" ); + RETURN_FALSE; + } + + copied_until = replacement->position + replacement->length; + } + } + + if ( ! 
wp_native_buffer_append( &buffer, text + copied_until, text_len - copied_until ) ) { + efree( replacements.items ); + if ( NULL != buffer.value ) { + efree( buffer.value ); + } + php_error_docref( NULL, E_WARNING, "Unable to allocate rewritten literal URL text" ); + RETURN_FALSE; + } + + efree( replacements.items ); + RETVAL_STRINGL( buffer.value, buffer.length ); + efree( buffer.value ); +} + static zend_bool wp_native_url_next( wp_native_smoke_object *object ) { while ( object->cursor < object->source_len ) { @@ -578,6 +948,11 @@ ZEND_BEGIN_ARG_INFO_EX( arginfo_wp_native_create_from_string, 0, 0, 1 ) ZEND_ARG_TYPE_INFO( 0, xml, IS_STRING, 0 ) ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_INFO_EX( arginfo_wp_native_rewrite_plain_text_literal_urls, 0, 0, 2 ) + ZEND_ARG_TYPE_INFO( 0, text, IS_STRING, 0 ) + ZEND_ARG_TYPE_INFO( 0, compact_mapping, IS_STRING, 0 ) +ZEND_END_ARG_INFO() + static const zend_function_entry wp_native_html_tag_processor_methods[] = { PHP_ME( WP_HTML_Native_Tag_Processor, __construct, arginfo_wp_native_string_ctor, ZEND_ACC_PUBLIC ) PHP_ME( WP_HTML_Native_Tag_Processor, next_tag, arginfo_wp_native_next_tag, ZEND_ACC_PUBLIC ) @@ -609,6 +984,7 @@ static const zend_function_entry wp_native_url_processor_methods[] = { static const zend_function_entry wp_native_apis_functions[] = { PHP_FE( wp_native_apis_extension_version, arginfo_wp_native_void ) + PHP_FE( wp_native_apis_rewrite_plain_text_literal_urls, arginfo_wp_native_rewrite_plain_text_literal_urls ) PHP_FE_END }; diff --git a/extensions/native-apis/playground/blueprint.json b/extensions/native-apis/playground/blueprint.json index 19987510c..efe5e326f 100644 --- a/extensions/native-apis/playground/blueprint.json +++ b/extensions/native-apis/playground/blueprint.json @@ -14,7 +14,7 @@ { "step": "writeFile", "path": "/wordpress/native-api-smoke.php", - "data": "

Text

' );\nif ( ! $tag_processor->next_tag( array( 'tag_name' => 'p', 'class_name' => 'target' ) ) || 'P' !== $tag_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Tag_Processor did not find the target paragraph.' );\n}\n\necho \"WP_HTML_Native_Tag_Processor: ok\\n\";\n\n$html_processor = WP_HTML_Native_Processor::create_fragment( '
Docs
' );\nif ( ! is_object( $html_processor ) || ! $html_processor->next_tag( array( 'tag_name' => 'a' ) ) || 'A' !== $html_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Processor did not find the link in a fragment.' );\n}\n\necho \"WP_HTML_Native_Processor: ok\\n\";\n\n$xml_class = 'WordPress\\\\XML\\\\NativeXMLProcessor';\n$xml_processor = $xml_class::create_from_string( '' );\nif ( ! is_object( $xml_processor ) || ! $xml_processor->next_tag( 'item' ) || 'item' !== $xml_processor->get_tag_local_name() ) {\n\tfail_native_api_smoke_test( 'NativeXMLProcessor did not find the item element.' );\n}\n\necho \"WordPress\\\\XML\\\\NativeXMLProcessor: ok\\n\";\n\n$url_class = 'WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor';\n$url_processor = new $url_class( 'Visit example.com/docs now.', 'https://wordpress.org' );\nif ( ! $url_processor->next_url() || 'example.com/docs' !== $url_processor->get_raw_url() || $url_processor->had_protocol() ) {\n\tfail_native_api_smoke_test( 'NativeURLInTextProcessor did not find the bare-domain URL.' );\n}\n\necho \"WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor: ok\\n\";\necho \"PASS: Native API extension classes are available.\\n\";\n" + "data": "

Text

' );\nif ( ! $tag_processor->next_tag( array( 'tag_name' => 'p', 'class_name' => 'target' ) ) || 'P' !== $tag_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Tag_Processor did not find the target paragraph.' );\n}\n\necho \"WP_HTML_Native_Tag_Processor: ok\\n\";\n\n$html_processor = WP_HTML_Native_Processor::create_fragment( '
Docs
' );\nif ( ! is_object( $html_processor ) || ! $html_processor->next_tag( array( 'tag_name' => 'a' ) ) || 'A' !== $html_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Processor did not find the link in a fragment.' );\n}\n\necho \"WP_HTML_Native_Processor: ok\\n\";\n\n$xml_class = 'WordPress\\\\XML\\\\NativeXMLProcessor';\n$xml_processor = $xml_class::create_from_string( '' );\nif ( ! is_object( $xml_processor ) || ! $xml_processor->next_tag( 'item' ) || 'item' !== $xml_processor->get_tag_local_name() ) {\n\tfail_native_api_smoke_test( 'NativeXMLProcessor did not find the item element.' );\n}\n\necho \"WordPress\\\\XML\\\\NativeXMLProcessor: ok\\n\";\n\n$url_class = 'WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor';\n$url_processor = new $url_class( 'Visit example.com/docs now.', 'https://wordpress.org' );\nif ( ! $url_processor->next_url() || 'example.com/docs' !== $url_processor->get_raw_url() || $url_processor->had_protocol() ) {\n\tfail_native_api_smoke_test( 'NativeURLInTextProcessor did not find the bare-domain URL.' );\n}\n\necho \"WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor: ok\\n\";\n\nif ( ! function_exists( 'wp_native_apis_rewrite_plain_text_literal_urls' ) ) {\n\tfail_native_api_smoke_test( 'wp_native_apis_rewrite_plain_text_literal_urls is missing.' );\n}\n\n$rewritten = wp_native_apis_rewrite_plain_text_literal_urls( 'Visit http://old.example/posts/7.', \"http://old.example\\x1fhttps://new.example/base\" );\nif ( 'Visit https://new.example/base/posts/7.' !== $rewritten ) {\n\tfail_native_api_smoke_test( 'wp_native_apis_rewrite_plain_text_literal_urls returned unexpected output.' 
);\n}\n\necho \"wp_native_apis_rewrite_plain_text_literal_urls: ok\\n\";\necho \"PASS: Native API extension classes and functions are available.\\n\";\n" } ] } diff --git a/extensions/native-apis/src/lib.rs b/extensions/native-apis/src/lib.rs index 91ff052f7..ac0e943c9 100644 --- a/extensions/native-apis/src/lib.rs +++ b/extensions/native-apis/src/lib.rs @@ -3,6 +3,8 @@ #[cfg(feature = "php-extension")] use ext_php_rs::prelude::*; #[cfg(feature = "php-extension")] +use ext_php_rs::types::Zval; +#[cfg(feature = "php-extension")] use ext_php_rs::zend::ModuleEntry; #[cfg(feature = "php-extension")] use ext_php_rs::{info_table_end, info_table_row, info_table_start}; @@ -30,6 +32,26 @@ pub fn wp_native_apis_extension_version() -> &'static str { env!("CARGO_PKG_VERSION") } +#[cfg(feature = "php-extension")] +#[php_function] +pub fn wp_native_apis_rewrite_plain_text_literal_urls( + text: String, + compact_mapping: String, +) -> Zval { + match url_text::rewrite_plain_text_literal_urls(&text, &compact_mapping) { + Some(updated_text) => { + let mut zval = Zval::new(); + let _ = zval.set_string(&updated_text, false); + zval + } + None => { + let mut zval = Zval::new(); + zval.set_bool(false); + zval + } + } +} + #[cfg(feature = "php-extension")] #[php_module] pub fn get_module(module: ModuleBuilder) -> ModuleBuilder { @@ -39,5 +61,8 @@ pub fn get_module(module: ModuleBuilder) -> ModuleBuilder { .class::() .class::() .function(wrap_function!(wp_native_apis_extension_version)) + .function(wrap_function!( + wp_native_apis_rewrite_plain_text_literal_urls + )) .info_function(php_module_info) } diff --git a/extensions/native-apis/src/url_text.rs b/extensions/native-apis/src/url_text.rs index 1ba04d223..fac58db4c 100644 --- a/extensions/native-apis/src/url_text.rs +++ b/extensions/native-apis/src/url_text.rs @@ -232,6 +232,166 @@ pub fn find_next_url_text_candidate(text: &str, offset: usize) -> Option Option { + if has_plain_text_literal_structural_delimiter(text.as_bytes()) { + 
return None; + } + + let mappings = parse_plain_text_literal_mappings(compact_mapping); + if mappings.is_empty() { + return None; + } + + let mut replacements = Vec::new(); + for mapping in &mappings { + let mut offset = 0; + while offset + mapping.from.len() <= text.len() { + let Some(relative_position) = + ascii_find_case_insensitive(&text.as_bytes()[offset..], mapping.from.as_bytes()) + else { + break; + }; + + let position = offset + relative_position; + if !plain_text_literal_origin_has_valid_left_boundary(text.as_bytes(), position) + || !plain_text_literal_origin_has_valid_right_boundary( + text.as_bytes(), + position + mapping.from.len(), + ) + { + return None; + } + + replacements.push(PlainTextLiteralReplacement { + start: position, + length: mapping.from.len(), + text: mapping.to.clone(), + }); + offset = position + mapping.from.len(); + } + } + + if replacements.is_empty() { + return None; + } + + replacements.sort_by(|left, right| left.start.cmp(&right.start)); + + let mut output = String::with_capacity(text.len()); + let mut copied = 0; + for replacement in replacements { + if replacement.start < copied { + return None; + } + + output.push_str(&text[copied..replacement.start]); + output.push_str(&replacement.text); + copied = replacement.start + replacement.length; + } + output.push_str(&text[copied..]); + + Some(output) +} + +fn parse_plain_text_literal_mappings(compact_mapping: &str) -> Vec { + compact_mapping + .split('\x1e') + .filter_map(|row| { + let (from, to) = row.split_once('\x1f')?; + if !is_valid_plain_text_literal_mapping(from, to) { + return None; + } + + Some(PlainTextLiteralMapping { + from: from.to_string(), + to: to.to_string(), + }) + }) + .collect() +} + +fn is_valid_plain_text_literal_mapping(from: &str, to: &str) -> bool { + if from.is_empty() || to.is_empty() || !from.is_ascii() || !to.is_ascii() { + return false; + } + + let Some(from_origin_end) = plain_text_literal_http_origin_end(from) else { + return false; + }; + if 
from_origin_end != from.len() { + return false; + } + + let Some(to_origin_end) = plain_text_literal_http_origin_end(to) else { + return false; + }; + if to[to_origin_end..].contains('?') || to[to_origin_end..].contains('#') { + return false; + } + to_origin_end == to.len() || to.as_bytes()[to_origin_end] == b'/' +} + +fn plain_text_literal_http_origin_end(url: &str) -> Option { + let bytes = url.as_bytes(); + let authority_start = if ascii_starts_with(bytes, 0, b"http://") { + 7 + } else if ascii_starts_with(bytes, 0, b"https://") { + 8 + } else { + return None; + }; + + let authority_end = bytes[authority_start..] + .iter() + .position(|byte| matches!(*byte, b'/' | b'?' | b'#')) + .map(|offset| authority_start + offset) + .unwrap_or(bytes.len()); + if authority_end <= authority_start || bytes[authority_start..authority_end].contains(&b'@') { + return None; + } + + Some(authority_end) +} + +fn has_plain_text_literal_structural_delimiter(bytes: &[u8]) -> bool { + bytes.iter().any(|byte| { + matches!( + *byte, + b'<' | b'>' | b'"' | b'\'' | b'\\' | b'{' | b'}' | b'[' | b']' | b'(' | b')' + ) + }) +} + +fn plain_text_literal_origin_has_valid_left_boundary(bytes: &[u8], position: usize) -> bool { + position == 0 || bytes[position - 1].is_ascii_whitespace() +} + +fn plain_text_literal_origin_has_valid_right_boundary(bytes: &[u8], position: usize) -> bool { + position >= bytes.len() || matches!(bytes[position], b'/' | b'?' 
| b'#') +} + +fn ascii_find_case_insensitive(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() || haystack.len() < needle.len() { + return None; + } + + (0..=haystack.len() - needle.len()) + .find(|offset| haystack[*offset..*offset + needle.len()].eq_ignore_ascii_case(needle)) +} + fn parse_url_text_candidate_at(text: &str, start: usize) -> Option { let bytes = text.as_bytes(); let mut had_protocol = false; @@ -578,7 +738,10 @@ fn ascii_starts_with(bytes: &[u8], offset: usize, needle: &[u8]) -> bool { #[cfg(test)] mod tests { - use super::{find_next_url_text_candidate, validate_url_text_candidate, UrlTextCandidate}; + use super::{ + find_next_url_text_candidate, rewrite_plain_text_literal_urls, validate_url_text_candidate, + UrlTextCandidate, + }; #[test] fn finds_http_https_and_bare_domain_candidates() { @@ -651,4 +814,68 @@ mod tests { let mut candidate = find_next_url_text_candidate("Visit example.com", 0).expect("URL"); assert!(!validate_url_text_candidate(&mut candidate, None)); } + + #[test] + fn rewrites_plain_text_literal_url_origins() { + let mapping = "http://old.example\x1fhttps://new.example/base"; + assert_eq!( + Some( + "See https://new.example/base/posts/7 and https://new.example/base/meta." 
+ .to_string() + ), + rewrite_plain_text_literal_urls( + "See http://old.example/posts/7 and http://old.example/meta.", + mapping, + ) + ); + } + + #[test] + fn rewrites_plain_text_literal_origins_case_insensitively() { + let mapping = "http://old.example\x1fhttps://new.example"; + assert_eq!( + Some("See https://new.example/posts/7.".to_string()), + rewrite_plain_text_literal_urls("See HTTP://OLD.EXAMPLE/posts/7.", mapping) + ); + } + + #[test] + fn refuses_structured_text_literal_url_rewrites() { + let mapping = "http://old.example\x1fhttps://new.example"; + assert_eq!( + None, + rewrite_plain_text_literal_urls("{\"url\":\"http://old.example/posts/7\"}", mapping) + ); + } + + #[test] + fn refuses_plain_text_literal_embedded_hosts() { + let mapping = "http://old.example\x1fhttps://new.example"; + assert_eq!( + None, + rewrite_plain_text_literal_urls("See http://old.example.com/posts/7.", mapping) + ); + assert_eq!( + None, + rewrite_plain_text_literal_urls("See xhttp://old.example/posts/7.", mapping) + ); + } + + #[test] + fn refuses_non_origin_literal_mappings() { + assert_eq!( + None, + rewrite_plain_text_literal_urls( + "See http://old.example/posts/7.", + "http://old.example/posts\x1fhttps://new.example", + ) + ); + assert_eq!( + None, + rewrite_plain_text_literal_urls( + "See http://old.example/posts/7.", + "http://old.example\x1fhttps://new.example?query=1", + ) + ); + } } diff --git a/extensions/native-apis/tests/verify-native-apis.php b/extensions/native-apis/tests/verify-native-apis.php index 6290b17d9..8832b6ca5 100644 --- a/extensions/native-apis/tests/verify-native-apis.php +++ b/extensions/native-apis/tests/verify-native-apis.php @@ -1903,6 +1903,30 @@ class_exists( 'WP_HTML_Doctype_Info' ); assert_true( $url_text_processor->set_raw_url( 'example.org/handbook' ), 'Expected native URL-in-text processor to replace current URL.' 
); assert_same( 'Visit https://WordPress.org/plugins, then example.org/handbook.', $url_text_processor->get_updated_text(), 'Expected native URL-in-text replacement serialization.' ); +assert_true( function_exists( 'wp_native_apis_rewrite_plain_text_literal_urls' ), 'Expected native plain text literal URL rewrite function to exist.' ); +assert_same( + 'Visit https://new.example/base/posts/7 and https://new.example/base/meta.', + wp_native_apis_rewrite_plain_text_literal_urls( + 'Visit http://old.example/posts/7 and HTTP://OLD.EXAMPLE/meta.', + "http://old.example\x1fhttps://new.example/base" + ), + 'Expected native plain text literal URL rewrite function to rewrite source origins.' +); +assert_false( + wp_native_apis_rewrite_plain_text_literal_urls( + '{"url":"http://old.example/posts/7"}', + "http://old.example\x1fhttps://new.example" + ), + 'Expected native plain text literal URL rewrite function to refuse structured-looking text.' +); +assert_false( + wp_native_apis_rewrite_plain_text_literal_urls( + 'Visit http://old.example.com/posts/7.', + "http://old.example\x1fhttps://new.example" + ), + 'Expected native plain text literal URL rewrite function to refuse embedded host matches.' +); + fwrite( STDOUT, "Native API extension verification passed.\n" ); /**