From 53566b30d8a72fd392cdbcbcd0299e1332884827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 19 May 2026 01:43:52 +0200 Subject: [PATCH 1/4] Add native batch text URL rewrite API --- extensions/native-apis/src/lib.rs | 13 +- extensions/native-apis/src/url_text.rs | 250 +++++++++++++++++- .../native-apis/tests/verify-native-apis.php | 14 + 3 files changed, 275 insertions(+), 2 deletions(-) diff --git a/extensions/native-apis/src/lib.rs b/extensions/native-apis/src/lib.rs index 91ff052f7..b67b3cbc8 100644 --- a/extensions/native-apis/src/lib.rs +++ b/extensions/native-apis/src/lib.rs @@ -18,7 +18,7 @@ extern "C" fn php_module_info(_module: *mut ModuleEntry) { info_table_row!("html", "registered"); info_table_row!( "url_text", - "registered under WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor" + "registered under WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor; batch rewrite function registered" ); info_table_row!("xml", "registered"); info_table_end!(); @@ -30,6 +30,16 @@ pub fn wp_native_apis_extension_version() -> &'static str { env!("CARGO_PKG_VERSION") } +#[cfg(feature = "php-extension")] +#[php_function] +pub fn wp_native_apis_rewrite_text_url_bases( + text: String, + base_url: Option, + compact_mapping: String, +) -> String { + url_text::rewrite_text_url_bases(&text, base_url.as_deref(), &compact_mapping) +} + #[cfg(feature = "php-extension")] #[php_module] pub fn get_module(module: ModuleBuilder) -> ModuleBuilder { @@ -39,5 +49,6 @@ pub fn get_module(module: ModuleBuilder) -> ModuleBuilder { .class::() .class::() .function(wrap_function!(wp_native_apis_extension_version)) + .function(wrap_function!(wp_native_apis_rewrite_text_url_bases)) .info_function(php_module_info) } diff --git a/extensions/native-apis/src/url_text.rs b/extensions/native-apis/src/url_text.rs index 1ba04d223..3c10bd2ef 100644 --- a/extensions/native-apis/src/url_text.rs +++ b/extensions/native-apis/src/url_text.rs @@ -232,6 +232,48 @@ pub fn find_next_url_text_candidate(text: &str, offset: usize) -> Option, compact_mapping: &str) -> String { + let mappings = parse_rewrite_mappings(compact_mapping); + if mappings.is_empty() { + return text.to_string(); + } + + let base_protocol = base_url.and_then(parse_url_scheme); + let mut output = String::with_capacity(text.len()); + let mut copied = 0; + let mut offset = 0; + let mut changed = false; + + while let Some(mut candidate) = find_next_url_text_candidate(text, offset) { + offset = candidate.starts_at + candidate.length; + if !validate_url_text_candidate(&mut candidate, base_protocol.as_deref()) { + continue; + } + + let Some(replacement) = + replacement_for_candidate(&candidate, base_protocol.as_deref(), &mappings) + else { + continue; + }; + + if replacement == candidate.raw_url { + continue; + } + + output.push_str(&text[copied..candidate.starts_at]); + output.push_str(&replacement); + copied = candidate.starts_at + candidate.length; + changed = true; + } + + if !changed { + return text.to_string(); + } + + output.push_str(&text[copied..]); + output +} + fn parse_url_text_candidate_at(text: &str, start: usize) -> Option { let bytes = text.as_bytes(); let mut had_protocol = false; @@ -348,6 +390,157 @@ fn validate_url_text_candidate( true } +#[derive(Clone, Debug)] +struct RewriteMapping { + from: ParsedHttpUrl, + to: String, +} + +#[derive(Clone, Debug)] +struct ParsedHttpUrl { + original: String, + scheme: String, + authority: String, + path_start: usize, + path_end: usize, + path: String, +} + +fn parse_rewrite_mappings(compact_mapping: &str) -> Vec { + compact_mapping + .split('\x1e') + .filter_map(|row| { + let (from, to) = row.split_once('\x1f')?; + if from.is_empty() || to.is_empty() { + return None; + } + + Some(RewriteMapping { + from: parse_absolute_http_url(from, None)?, + to: to.to_string(), + }) + }) + .collect() +} + +fn replacement_for_candidate( + candidate: &UrlTextCandidate, + base_protocol: Option<&str>, + mappings: &[RewriteMapping], +) -> Option { + let parsed_candidate = parse_absolute_http_url(&candidate.preprocessed_url, base_protocol)?; + for mapping in mappings { + let Some(suffix) = child_url_suffix(&parsed_candidate, &mapping.from) else { + continue; + }; + + let replacement = join_base_and_suffix(&mapping.to, suffix); + if candidate.did_prepend_protocol { + if let Some(stripped) = strip_scheme_authority_prefix(&replacement) { + return Some(stripped.to_string()); + } + } + + if candidate.raw_url.starts_with("//") { + if let Some(colon) = replacement.find(':') { + return Some(replacement[colon + 1..].to_string()); + } + } + + return Some(replacement); + } + + None +} + +fn parse_absolute_http_url(url: &str, base_protocol: Option<&str>) -> Option { + let bytes = url.as_bytes(); + let (scheme, authority_start) = if ascii_starts_with(bytes, 0, b"http://") { + ("http", 7) + } else if ascii_starts_with(bytes, 0, b"https://") { + ("https", 8) + } else if bytes.starts_with(b"//") { + let protocol = base_protocol?; + if !is_http_or_https_scheme(protocol) { + return None; + } + (protocol, 2) + } else { + return None; + }; + + let authority_end = bytes[authority_start..] + .iter() + .position(|byte| matches!(*byte, b'/' | b'?' | b'#')) + .map(|offset| authority_start + offset) + .unwrap_or(bytes.len()); + if authority_end <= authority_start { + return None; + } + + let path_start = authority_end; + let path_end = bytes[path_start..] + .iter() + .position(|byte| matches!(*byte, b'?' | b'#')) + .map(|offset| path_start + offset) + .unwrap_or(bytes.len()); + let path = if path_end > path_start && bytes[path_start] == b'/' { + url[path_start..path_end].to_string() + } else { + "/".to_string() + }; + + Some(ParsedHttpUrl { + original: url.to_string(), + scheme: scheme.to_ascii_lowercase(), + authority: url[authority_start..authority_end].to_ascii_lowercase(), + path_start, + path_end, + path, + }) +} + +fn child_url_suffix<'a>(child: &'a ParsedHttpUrl, parent: &ParsedHttpUrl) -> Option<&'a str> { + if child.scheme != parent.scheme || child.authority != parent.authority { + return None; + } + + let parent_path = parent.path.trim_end_matches('/'); + if parent_path.is_empty() { + return Some(&child.original[child.path_start..]); + } + + let child_path = child.path.trim_end_matches('/'); + if child_path == parent_path { + return Some(&child.original[child.path_end..]); + } + + if child.path.starts_with(parent_path) + && child.path.as_bytes().get(parent_path.len()) == Some(&b'/') + { + return Some(&child.original[child.path_start + parent_path.len()..]); + } + + None +} + +fn join_base_and_suffix(base: &str, suffix: &str) -> String { + if suffix.is_empty() { + return base.to_string(); + } + + if suffix.starts_with('/') || suffix.starts_with('?') || suffix.starts_with('#') { + format!("{}{}", base.trim_end_matches('/'), suffix) + } else { + format!("{base}{suffix}") + } +} + +fn strip_scheme_authority_prefix(url: &str) -> Option<&str> { + let scheme_end = url.find("://")?; + Some(&url[scheme_end + 3..]) +} + #[cfg(feature = "php-extension")] fn url_zval_bool(value: bool) -> Zval { let mut zval = Zval::new(); @@ -578,7 +771,10 @@ fn ascii_starts_with(bytes: &[u8], offset: usize, needle: &[u8]) -> bool { #[cfg(test)] mod tests { - use super::{find_next_url_text_candidate, validate_url_text_candidate, UrlTextCandidate}; + use super::{ + find_next_url_text_candidate, rewrite_text_url_bases, validate_url_text_candidate, + UrlTextCandidate, + }; #[test] fn finds_http_https_and_bare_domain_candidates() { @@ -651,4 +847,56 @@ mod tests { let mut candidate = find_next_url_text_candidate("Visit example.com", 0).expect("URL"); assert!(!validate_url_text_candidate(&mut candidate, None)); } + + #[test] + fn rewrites_absolute_url_bases_in_one_pass() { + let mapping = "http://old.example\x1fhttps://new.example/base"; + assert_eq!( + "Visit https://new.example/base/posts/7?x=1.", + rewrite_text_url_bases( + "Visit http://old.example/posts/7?x=1.", + Some("http://old.example"), + mapping, + ) + ); + } + + #[test] + fn rewrites_bare_domains_without_adding_protocol() { + let mapping = "https://example.com\x1fhttps://new.example"; + assert_eq!( + "Visit new.example/docs.", + rewrite_text_url_bases( + "Visit example.com/docs.", + Some("https://example.com"), + mapping, + ) + ); + } + + #[test] + fn preserves_protocol_relative_urls() { + let mapping = "https://example.com\x1fhttps://new.example"; + assert_eq!( + "Visit //new.example/docs.", + rewrite_text_url_bases( + "Visit //example.com/docs.", + Some("https://example.com"), + mapping, + ) + ); + } + + #[test] + fn leaves_sibling_paths_unchanged() { + let mapping = "https://example.com/base\x1fhttps://new.example/base"; + assert_eq!( + "Visit https://example.com/baseball.", + rewrite_text_url_bases( + "Visit https://example.com/baseball.", + Some("https://example.com"), + mapping, + ) + ); + } } diff --git a/extensions/native-apis/tests/verify-native-apis.php b/extensions/native-apis/tests/verify-native-apis.php index 6290b17d9..e4d10c092 100644 --- a/extensions/native-apis/tests/verify-native-apis.php +++ b/extensions/native-apis/tests/verify-native-apis.php @@ -38,6 +38,20 @@ require_once dirname( __DIR__, 3 ) . '/vendor/autoload.php'; class_exists( 'WP_HTML_Doctype_Info' ); +assert_true( + function_exists( 'wp_native_apis_rewrite_text_url_bases' ), + 'Expected native batch text URL rewrite function to be registered.' +); +assert_same( + 'Visit https://new.example/posts/7.', + wp_native_apis_rewrite_text_url_bases( + 'Visit http://old.example/posts/7.', + 'http://old.example', + "http://old.example\x1fhttps://new.example" + ), + 'Expected native batch text URL rewrite function to replace a URL base.' +); + $tag_processor = new WP_HTML_Native_Tag_Processor( '
Link
' ); assert_false( $tag_processor->paused_at_incomplete_token(), 'Expected native HTML tag processor not to start paused at an incomplete token.' ); assert_true( $tag_processor->next_tag(), 'Expected first HTML tag.' ); From dce2b4a4b5256f550c1374ea3a8036782cede54d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 19 May 2026 02:10:33 +0200 Subject: [PATCH 2/4] Add WASM batch URL rewrite shim --- extensions/native-apis/README.md | 6 +- extensions/native-apis/native_apis_shim.c | 292 ++++++++++++++++++ .../native-apis/playground/blueprint.json | 4 +- 3 files changed, 297 insertions(+), 5 deletions(-) diff --git a/extensions/native-apis/README.md b/extensions/native-apis/README.md index 8b1ae983f..8bc859065 100644 --- a/extensions/native-apis/README.md +++ b/extensions/native-apis/README.md @@ -213,9 +213,9 @@ extensions/native-apis/build-playground-extension.sh The host PHP extension is Rust-backed through `ext-php-rs`. The Playground bundle currently uses `native_apis_shim.c` instead, because Playground's PHP.wasm runtime only exports the PHP C ABI symbols needed by regular C -extensions. The shim registers the native extension classes and verifies the -Playground loading path while the full Rust-backed implementation remains the -host PHP artifact. +extensions. The shim registers the native extension classes and implements the +batch text URL base rewrite primitive used by Reprint while the full +Rust-backed implementation remains the host PHP artifact. The `Native APIs Playground Extension` workflow publishes the bundle after changes land on `trunk`. It stores the release history on the repository's diff --git a/extensions/native-apis/native_apis_shim.c b/extensions/native-apis/native_apis_shim.c index 86a0736af..58c84b101 100644 --- a/extensions/native-apis/native_apis_shim.c +++ b/extensions/native-apis/native_apis_shim.c @@ -506,6 +506,291 @@ wp_native_url_next( wp_native_smoke_object *object ) { return 0; } +typedef struct { + char *value; + size_t length; + size_t capacity; +} wp_native_string_buffer; + +static zend_bool +wp_native_buffer_append( wp_native_string_buffer *buffer, const char *value, size_t length ) { + size_t required; + size_t capacity; + + if ( 0 == length ) { + return 1; + } + + if ( length > ( (size_t) -1 ) - buffer->length - 1 ) { + return 0; + } + + required = buffer->length + length + 1; + if ( required > buffer->capacity ) { + capacity = buffer->capacity ? buffer->capacity : 256; + while ( capacity < required ) { + if ( capacity > ( (size_t) -1 ) / 2 ) { + capacity = required; + break; + } + capacity *= 2; + } + + buffer->value = buffer->value ? erealloc( buffer->value, capacity ) : emalloc( capacity ); + buffer->capacity = capacity; + } + + memcpy( buffer->value + buffer->length, value, length ); + buffer->length += length; + buffer->value[ buffer->length ] = '\0'; + + return 1; +} + +static size_t +wp_native_http_scheme_len( const char *value, size_t value_len ) { + if ( wp_native_ascii_starts_with( value, value_len, "http://", 7 ) ) { + return 7; + } + + if ( wp_native_ascii_starts_with( value, value_len, "https://", 8 ) ) { + return 8; + } + + return 0; +} + +static zend_bool +wp_native_base_boundary_allows_match( const char *candidate, size_t candidate_len, size_t from_len ) { + char next; + + if ( candidate_len == from_len ) { + return 1; + } + + if ( from_len > 0 && '/' == candidate[ from_len - 1 ] ) { + return 1; + } + + next = candidate[ from_len ]; + return '/' == next || '?' == next || '#' == next; +} + +static zend_bool +wp_native_find_url_rewrite( + const char *candidate, + size_t candidate_len, + const char *compact_mapping, + size_t compact_mapping_len, + const char **replacement, + size_t *replacement_len, + size_t *replacement_prefix_len, + size_t *replace_len +) { + size_t row_start = 0; + size_t candidate_scheme_len = wp_native_http_scheme_len( candidate, candidate_len ); + size_t candidate_compare_offset = 0; + zend_bool protocol_relative = 0; + + if ( + 0 == candidate_scheme_len && + candidate_len >= 2 && + '/' == candidate[0] && + '/' == candidate[1] + ) { + candidate_compare_offset = 2; + protocol_relative = 1; + } + + while ( row_start < compact_mapping_len ) { + size_t row_end = row_start; + size_t separator; + const char *from; + const char *to; + size_t from_len; + size_t to_len; + size_t from_scheme_len; + size_t to_scheme_len; + size_t from_compare_offset = 0; + size_t from_compare_len; + size_t to_offset = 0; + + while ( row_end < compact_mapping_len && '\x1e' != compact_mapping[ row_end ] ) { + row_end++; + } + + separator = row_start; + while ( separator < row_end && '\x1f' != compact_mapping[ separator ] ) { + separator++; + } + + if ( separator == row_end ) { + row_start = row_end + ( row_end < compact_mapping_len ? 1 : 0 ); + continue; + } + + from = compact_mapping + row_start; + from_len = separator - row_start; + to = compact_mapping + separator + 1; + to_len = row_end - separator - 1; + + from_scheme_len = wp_native_http_scheme_len( from, from_len ); + to_scheme_len = wp_native_http_scheme_len( to, to_len ); + + if ( 0 == from_len ) { + row_start = row_end + ( row_end < compact_mapping_len ? 1 : 0 ); + continue; + } + + if ( 0 == candidate_scheme_len && from_scheme_len > 0 ) { + from_compare_offset = from_scheme_len; + } + + from_compare_len = from_len - from_compare_offset; + if ( 0 == from_compare_len ) { + row_start = row_end + ( row_end < compact_mapping_len ? 1 : 0 ); + continue; + } + + if ( + candidate_len >= candidate_compare_offset + from_compare_len && + wp_native_ascii_starts_with( + candidate + candidate_compare_offset, + candidate_len - candidate_compare_offset, + from + from_compare_offset, + from_compare_len + ) && + wp_native_base_boundary_allows_match( + candidate + candidate_compare_offset, + candidate_len - candidate_compare_offset, + from_compare_len + ) + ) { + if ( 0 == candidate_scheme_len && to_scheme_len > 0 ) { + to_offset = to_scheme_len; + } + + *replacement = to + to_offset; + *replacement_len = to_len - to_offset; + *replacement_prefix_len = protocol_relative ? 2 : 0; + *replace_len = candidate_compare_offset + from_compare_len; + return 1; + } + + row_start = row_end + ( row_end < compact_mapping_len ? 1 : 0 ); + } + + return 0; +} + +PHP_FUNCTION( wp_native_apis_rewrite_text_url_bases ) { + char *text; + size_t text_len; + char *base_url = NULL; + size_t base_url_len = 0; + char *compact_mapping; + size_t compact_mapping_len; + size_t cursor = 0; + size_t copied_until = 0; + wp_native_string_buffer buffer = { NULL, 0, 0 }; + + ZEND_PARSE_PARAMETERS_START( 3, 3 ) + Z_PARAM_STRING( text, text_len ) + Z_PARAM_STRING_OR_NULL( base_url, base_url_len ) + Z_PARAM_STRING( compact_mapping, compact_mapping_len ) + ZEND_PARSE_PARAMETERS_END(); + + (void) base_url; + (void) base_url_len; + + while ( cursor < text_len ) { + size_t start = cursor; + size_t end; + const char *replacement; + size_t replacement_len; + size_t replacement_prefix_len; + size_t replace_len; + zend_bool is_url_candidate = 0; + + while ( start < text_len && wp_native_ascii_is_space( text[ start ] ) ) { + start++; + } + + end = start; + while ( end < text_len && ! wp_native_ascii_is_space( text[ end ] ) ) { + end++; + } + cursor = end + ( end < text_len ? 1 : 0 ); + + if ( end <= start ) { + continue; + } + + while ( end > start && wp_native_is_trailing_url_punctuation( text[ end - 1 ] ) ) { + end--; + } + + if ( wp_native_http_scheme_len( text + start, end - start ) > 0 ) { + is_url_candidate = 1; + } else if ( NULL != wp_native_find_char( text + start, end - start, '.' ) ) { + is_url_candidate = 1; + } + + if ( + ! is_url_candidate || + ! wp_native_find_url_rewrite( + text + start, + end - start, + compact_mapping, + compact_mapping_len, + &replacement, + &replacement_len, + &replacement_prefix_len, + &replace_len + ) + ) { + continue; + } + + if ( + ! wp_native_buffer_append( &buffer, text + copied_until, start - copied_until ) || + ! wp_native_buffer_append( &buffer, text + start, replacement_prefix_len ) || + ! wp_native_buffer_append( + &buffer, + replacement, + replacement_len > 0 && + replace_len < end - start && + '/' == replacement[ replacement_len - 1 ] && + '/' == text[ start + replace_len ] + ? replacement_len - 1 + : replacement_len + ) || + ! wp_native_buffer_append( &buffer, text + start + replace_len, end - start - replace_len ) + ) { + if ( NULL != buffer.value ) { + efree( buffer.value ); + } + php_error_docref( NULL, E_WARNING, "Unable to allocate rewritten text" ); + RETURN_FALSE; + } + + copied_until = end; + } + + if ( NULL == buffer.value ) { + RETURN_STRINGL( text, text_len ); + } + + if ( ! wp_native_buffer_append( &buffer, text + copied_until, text_len - copied_until ) ) { + efree( buffer.value ); + php_error_docref( NULL, E_WARNING, "Unable to allocate rewritten text" ); + RETURN_FALSE; + } + + RETVAL_STRINGL( buffer.value, buffer.length ); + efree( buffer.value ); +} + PHP_METHOD( NativeURLInTextProcessor, __construct ) { char *text; size_t text_len; @@ -578,6 +863,12 @@ ZEND_BEGIN_ARG_INFO_EX( arginfo_wp_native_create_from_string, 0, 0, 1 ) ZEND_ARG_TYPE_INFO( 0, xml, IS_STRING, 0 ) ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_INFO_EX( arginfo_wp_native_rewrite_text_url_bases, 0, 0, 3 ) + ZEND_ARG_TYPE_INFO( 0, text, IS_STRING, 0 ) + ZEND_ARG_TYPE_INFO( 0, base_url, IS_STRING, 1 ) + ZEND_ARG_TYPE_INFO( 0, compact_mapping, IS_STRING, 0 ) +ZEND_END_ARG_INFO() + static const zend_function_entry wp_native_html_tag_processor_methods[] = { PHP_ME( WP_HTML_Native_Tag_Processor, __construct, arginfo_wp_native_string_ctor, ZEND_ACC_PUBLIC ) PHP_ME( WP_HTML_Native_Tag_Processor, next_tag, arginfo_wp_native_next_tag, ZEND_ACC_PUBLIC ) @@ -609,6 +900,7 @@ static const zend_function_entry wp_native_url_processor_methods[] = { static const zend_function_entry wp_native_apis_functions[] = { PHP_FE( wp_native_apis_extension_version, arginfo_wp_native_void ) + PHP_FE( wp_native_apis_rewrite_text_url_bases, arginfo_wp_native_rewrite_text_url_bases ) PHP_FE_END }; diff --git a/extensions/native-apis/playground/blueprint.json b/extensions/native-apis/playground/blueprint.json index 19987510c..3ed435858 100644 --- a/extensions/native-apis/playground/blueprint.json +++ b/extensions/native-apis/playground/blueprint.json @@ -8,13 +8,13 @@ "meta": { "title": "WordPress Native APIs smoke test", "author": "WordPress", - "description": "Verifies that a WordPress Playground runtime exposes the wp_native_apis PHP.wasm extension classes." + "description": "Verifies that a WordPress Playground runtime exposes the wp_native_apis PHP.wasm extension classes and batch text URL rewrite function." }, "steps": [ { "step": "writeFile", "path": "/wordpress/native-api-smoke.php", - "data": "

Text

' );\nif ( ! $tag_processor->next_tag( array( 'tag_name' => 'p', 'class_name' => 'target' ) ) || 'P' !== $tag_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Tag_Processor did not find the target paragraph.' );\n}\n\necho \"WP_HTML_Native_Tag_Processor: ok\\n\";\n\n$html_processor = WP_HTML_Native_Processor::create_fragment( '
Docs
' );\nif ( ! is_object( $html_processor ) || ! $html_processor->next_tag( array( 'tag_name' => 'a' ) ) || 'A' !== $html_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Processor did not find the link in a fragment.' );\n}\n\necho \"WP_HTML_Native_Processor: ok\\n\";\n\n$xml_class = 'WordPress\\\\XML\\\\NativeXMLProcessor';\n$xml_processor = $xml_class::create_from_string( '' );\nif ( ! is_object( $xml_processor ) || ! $xml_processor->next_tag( 'item' ) || 'item' !== $xml_processor->get_tag_local_name() ) {\n\tfail_native_api_smoke_test( 'NativeXMLProcessor did not find the item element.' );\n}\n\necho \"WordPress\\\\XML\\\\NativeXMLProcessor: ok\\n\";\n\n$url_class = 'WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor';\n$url_processor = new $url_class( 'Visit example.com/docs now.', 'https://wordpress.org' );\nif ( ! $url_processor->next_url() || 'example.com/docs' !== $url_processor->get_raw_url() || $url_processor->had_protocol() ) {\n\tfail_native_api_smoke_test( 'NativeURLInTextProcessor did not find the bare-domain URL.' );\n}\n\necho \"WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor: ok\\n\";\necho \"PASS: Native API extension classes are available.\\n\";\n" + "data": "

Text

' );\nif ( ! $tag_processor->next_tag( array( 'tag_name' => 'p', 'class_name' => 'target' ) ) || 'P' !== $tag_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Tag_Processor did not find the target paragraph.' );\n}\n\necho \"WP_HTML_Native_Tag_Processor: ok\\n\";\n\n$html_processor = WP_HTML_Native_Processor::create_fragment( '
Docs
' );\nif ( ! is_object( $html_processor ) || ! $html_processor->next_tag( array( 'tag_name' => 'a' ) ) || 'A' !== $html_processor->get_tag() ) {\n\tfail_native_api_smoke_test( 'WP_HTML_Native_Processor did not find the link in a fragment.' );\n}\n\necho \"WP_HTML_Native_Processor: ok\\n\";\n\n$xml_class = 'WordPress\\\\XML\\\\NativeXMLProcessor';\n$xml_processor = $xml_class::create_from_string( '' );\nif ( ! is_object( $xml_processor ) || ! $xml_processor->next_tag( 'item' ) || 'item' !== $xml_processor->get_tag_local_name() ) {\n\tfail_native_api_smoke_test( 'NativeXMLProcessor did not find the item element.' );\n}\n\necho \"WordPress\\\\XML\\\\NativeXMLProcessor: ok\\n\";\n\n$url_class = 'WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor';\n$url_processor = new $url_class( 'Visit example.com/docs now.', 'https://wordpress.org' );\nif ( ! $url_processor->next_url() || 'example.com/docs' !== $url_processor->get_raw_url() || $url_processor->had_protocol() ) {\n\tfail_native_api_smoke_test( 'NativeURLInTextProcessor did not find the bare-domain URL.' );\n}\n\necho \"WordPress\\\\DataLiberation\\\\URL\\\\NativeURLInTextProcessor: ok\\n\";\n\nif ( ! function_exists( 'wp_native_apis_rewrite_text_url_bases' ) ) {\n\tfail_native_api_smoke_test( 'wp_native_apis_rewrite_text_url_bases is missing.' );\n}\n\n$rewritten = wp_native_apis_rewrite_text_url_bases( 'Visit http://old.example/posts/7.', 'http://old.example', \"http://old.example\\x1fhttps://new.example\" );\nif ( 'Visit https://new.example/posts/7.' !== $rewritten ) {\n\tfail_native_api_smoke_test( 'wp_native_apis_rewrite_text_url_bases returned unexpected output.' );\n}\n\necho \"wp_native_apis_rewrite_text_url_bases: ok\\n\";\necho \"PASS: Native API extension classes and functions are available.\\n\";\n" } ] } From 5628950c5149f09bd9e1f9ba68dcb25474b45097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 19 May 2026 11:12:49 +0200 Subject: [PATCH 3/4] Run URL rewriting tests with native extension --- .github/workflows/native-apis.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/native-apis.yml b/.github/workflows/native-apis.yml index c49c141f5..9d460693e 100644 --- a/.github/workflows/native-apis.yml +++ b/.github/workflows/native-apis.yml @@ -50,3 +50,8 @@ jobs: - name: Verify native extension run: php -d extension=extensions/native-apis/target/release/libwp_native_apis.so extensions/native-apis/tests/verify-native-apis.php + + - name: Run URL rewriting tests with native extension + run: | + php -d extension=extensions/native-apis/target/release/libwp_native_apis.so -r 'require "vendor/autoload.php"; if ( ! is_subclass_of( "WordPress\\DataLiberation\\URL\\URLInTextProcessor", "WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor" ) ) { fwrite( STDERR, "URLInTextProcessor is not using the native implementation.\n" ); exit( 1 ); }' + php -d extension=extensions/native-apis/target/release/libwp_native_apis.so vendor/bin/phpunit components/DataLiberation/Tests/URLInTextProcessorTest.php components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php components/DataLiberation/Tests/RewriteUrlsTest.php From 684a29c9ab343420d340c1725f3782111dc9b1b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 19 May 2026 11:17:54 +0200 Subject: [PATCH 4/4] Align native URL text processor with PHP tests --- extensions/native-apis/src/url_text.rs | 50 ++++++++++++++++++++------ 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/extensions/native-apis/src/url_text.rs b/extensions/native-apis/src/url_text.rs index 3c10bd2ef..31c7488d6 100644 --- a/extensions/native-apis/src/url_text.rs +++ b/extensions/native-apis/src/url_text.rs @@ -49,7 +49,7 @@ impl NativeUrlInTextProcessor { bytes_already_parsed: 0, current: None, replacements: Vec::new(), - validate_urls: true, + validate_urls: false, base_url, base_protocol, } @@ -152,21 +152,30 @@ impl NativeUrlInTextProcessor { return false; }; + let replacement_text = if candidate.did_prepend_protocol { + new_url + .find("://") + .map(|scheme_end| new_url[scheme_end + 3..].to_string()) + .unwrap_or(new_url) + } else { + new_url + }; + if let Some(replacement) = self .replacements .iter_mut() .find(|replacement| replacement.start == candidate.starts_at) { replacement.length = candidate.length; - replacement.text = new_url.clone(); + replacement.text = replacement_text.clone(); } else { self.replacements.push(UrlTextReplacement { start: candidate.starts_at, length: candidate.length, - text: new_url.clone(), + text: replacement_text.clone(), }); } - candidate.raw_url = new_url; + candidate.raw_url = replacement_text; true } @@ -217,6 +226,11 @@ pub fn find_next_url_text_candidate(text: &str, offset: usize) -> Option bool { return false; }; let tld = &host[last_dot + 1..]; - tld.len() >= 2 - && tld.len() <= 63 - && tld + if tld.len() < 2 || tld.len() > 63 || !host.split('.').all(is_valid_hostname_label) { + return false; + } + + tld.bytes().any(|byte| byte >= 0x80) + || tld .bytes() .all(|byte| byte.is_ascii_alphanumeric() || byte == b'-') - && host.split('.').all(is_valid_hostname_label) } fn is_known_public_domain(tld: &str) -> bool { @@ -750,9 +766,9 @@ fn is_valid_hostname_label(label: &str) -> bool { && bytes.len() <= 63 && bytes[0] != b'-' && bytes[bytes.len() - 1] != b'-' - && bytes - .iter() - .all(|byte| byte.is_ascii_alphanumeric() || *byte == b'-' || *byte == b'%') + && bytes.iter().all(|byte| { + byte.is_ascii_alphanumeric() || *byte == b'-' || *byte == b'%' || *byte >= 0x80 + }) } fn is_hostish_byte(byte: u8) -> bool { @@ -762,6 +778,7 @@ fn is_hostish_byte(byte: u8) -> bool { || byte == b'%' || byte == b'[' || byte == b']' + || byte >= 0x80 } fn ascii_starts_with(bytes: &[u8], offset: usize, needle: &[u8]) -> bool { @@ -815,6 +832,17 @@ mod tests { assert_eq!("http://xn--fsqu00a.xn--0zwm56d", candidate.raw_url); } + #[test] + fn accepts_unicode_hosts() { + let text = "Visit http://例子.测试 and 例子.com/docs"; + let first = find_next_url_text_candidate(text, 0).expect("first URL"); + assert_eq!("http://例子.测试", first.raw_url); + + let second = + find_next_url_text_candidate(text, first.starts_at + first.length).expect("second URL"); + assert_eq!("例子.com/docs", second.raw_url); + } + #[test] fn validates_public_url_candidates_with_base_protocol() { let mut candidate = find_next_url_text_candidate("Visit example.com/docs", 0).expect("URL");