Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions bin/benchmark-native-apis.php
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,13 @@
URLInTextProcessor::class,
'wp_toolkit_native_api_benchmark_url_in_text_processor'
);
$results[] = wp_toolkit_native_api_benchmark_run(
'plain-text-literal-url-rewrite',
'php',
$iterations,
URLInTextProcessor::class,
'wp_toolkit_native_api_benchmark_php_plain_text_literal_url_rewrite'
);
}

if ( wp_toolkit_native_api_benchmark_should_run( $mode, 'native' ) ) {
Expand All @@ -758,6 +765,13 @@
'WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor',
'wp_toolkit_native_api_benchmark_native_url_in_text_processor'
);
$results[] = wp_toolkit_native_api_benchmark_run(
'plain-text-literal-url-rewrite',
'native',
$iterations,
'WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor',
'wp_toolkit_native_api_benchmark_native_plain_text_literal_url_rewrite'
);
}
}

Expand Down Expand Up @@ -2365,6 +2379,179 @@ function wp_toolkit_native_api_benchmark_native_url_in_text_processor() {
return $count;
}

/**
 * Benchmark body for the pure-PHP plain text literal URL rewrite.
 *
 * Builds the fixture document, rewrites it with the parsed fixture mappings
 * and verifies that the result contains exactly 360 occurrences of the
 * target prefix `https://new.example/base/`.
 *
 * @throws RuntimeException When the rewrite returns false or the rewritten
 *                          URL count is not 360.
 * @return int Number of URLs rewritten.
 */
function wp_toolkit_native_api_benchmark_php_plain_text_literal_url_rewrite() {
	// The parsed mappings never change between iterations, so parse them once.
	static $parsed_mappings = null;

	$document = wp_toolkit_native_api_benchmark_plain_text_literal_url_document();

	if ( null === $parsed_mappings ) {
		$compact_mapping = wp_toolkit_native_api_benchmark_plain_text_literal_url_mapping();
		$parsed_mappings = wp_toolkit_native_api_benchmark_parse_plain_text_literal_mappings( $compact_mapping );
	}

	$output = wp_toolkit_native_api_benchmark_php_rewrite_plain_text_literal_urls( $document, $parsed_mappings );
	if ( false === $output ) {
		throw new RuntimeException( 'PHP plain text literal URL rewrite benchmark did not rewrite the fixture.' );
	}

	// Sanity check: count the rewritten URLs by their target prefix.
	$rewritten_urls = substr_count( $output, 'https://new.example/base/' );
	if ( 360 === $rewritten_urls ) {
		return $rewritten_urls;
	}

	throw new RuntimeException( "PHP plain text literal URL rewrite benchmark expected 360 URLs, rewrote {$rewritten_urls}." );
}

/**
 * Benchmark body for the native plain text literal URL rewrite.
 *
 * Passes the fixture document and the compact fixture mapping straight to
 * `wp_native_apis_rewrite_plain_text_literal_urls()` and verifies that the
 * result contains exactly 360 occurrences of `https://new.example/base/`.
 *
 * @throws RuntimeException When the native function is unavailable, when it
 *                          returns false, or when the rewritten URL count is
 *                          not 360.
 * @return int Number of URLs rewritten.
 */
function wp_toolkit_native_api_benchmark_native_plain_text_literal_url_rewrite() {
	// Fail early, before building any fixture, when the extension is not loaded.
	if ( ! function_exists( 'wp_native_apis_rewrite_plain_text_literal_urls' ) ) {
		throw new RuntimeException( 'Function wp_native_apis_rewrite_plain_text_literal_urls is not available.' );
	}

	$document        = wp_toolkit_native_api_benchmark_plain_text_literal_url_document();
	$compact_mapping = wp_toolkit_native_api_benchmark_plain_text_literal_url_mapping();
	$output          = wp_native_apis_rewrite_plain_text_literal_urls( $document, $compact_mapping );

	if ( false === $output ) {
		throw new RuntimeException( 'Native plain text literal URL rewrite benchmark did not rewrite the fixture.' );
	}

	// Sanity check: count the rewritten URLs by their target prefix.
	$rewritten_urls = substr_count( $output, 'https://new.example/base/' );
	if ( 360 === $rewritten_urls ) {
		return $rewritten_urls;
	}

	throw new RuntimeException( "Native plain text literal URL rewrite benchmark expected 360 URLs, rewrote {$rewritten_urls}." );
}

/**
 * Rewrite simple literal source-origin URLs in known plain text.
 *
 * Every occurrence of a mapping's `from` string is replaced with that
 * mapping's `to` string. The function refuses to guess and returns false when:
 *
 * - there are no mappings, or the text contains any of `<>"'\{}[]()`;
 * - an occurrence is not bounded as an origin (see the left/right boundary
 *   helpers: start of text or whitespace before it, end of text or one of
 *   `/`, `?`, `#` after it);
 * - no occurrence is found at all;
 * - occurrences of different mappings overlap.
 *
 * Mappings whose `from` is an empty string are skipped. On PHP 8 `strpos()`
 * accepts an empty needle and returns the search offset, so the scan would
 * never advance and the loop would not terminate.
 *
 * @param string $text     Plain text fixture.
 * @param array  $mappings Parsed source-origin to target-prefix mappings,
 *                         each row shaped as array{from:string,to:string}.
 * @return false|string Rewritten text, or false when the generic parser path
 *                      must handle it.
 */
function wp_toolkit_native_api_benchmark_php_rewrite_plain_text_literal_urls( $text, $mappings ) {
	if ( array() === $mappings || false !== strpbrk( $text, "<>\"'\\{}[]()" ) ) {
		return false;
	}

	// Collect every match first as array( position, length, replacement ).
	$replacements = array();
	foreach ( $mappings as $mapping ) {
		$from        = $mapping['from'];
		$from_length = strlen( $from );

		// An empty origin can never be a meaningful match and would make the
		// scan below loop forever because the offset would not advance.
		if ( 0 === $from_length ) {
			continue;
		}

		$offset = 0;
		while ( true ) {
			$position = strpos( $text, $from, $offset );
			if ( false === $position ) {
				break;
			}

			// A single unbounded occurrence disqualifies the whole text.
			if (
				! wp_toolkit_native_api_benchmark_literal_origin_has_valid_left_boundary( $text, $position ) ||
				! wp_toolkit_native_api_benchmark_literal_origin_has_valid_right_boundary( $text, $position + $from_length )
			) {
				return false;
			}

			$replacements[] = array( $position, $from_length, $mapping['to'] );
			$offset         = $position + $from_length;
		}
	}

	if ( array() === $replacements ) {
		return false;
	}

	// Matches were gathered per mapping; order them by position so the text
	// can be rebuilt in one left-to-right pass.
	usort(
		$replacements,
		function ( $a, $b ) {
			return $a[0] <=> $b[0];
		}
	);

	$rewritten = '';
	$cursor    = 0;
	foreach ( $replacements as $replacement ) {
		$position = $replacement[0];
		$length   = $replacement[1];
		$to       = $replacement[2];

		// A match starting before the cursor overlaps the previous one.
		if ( $position < $cursor ) {
			return false;
		}

		$rewritten .= substr( $text, $cursor, $position - $cursor );
		$rewritten .= $to;
		$cursor     = $position + $length;
	}
	$rewritten .= substr( $text, $cursor );

	return $rewritten;
}

/**
 * Turn a compact mapping string into plain text literal rewrite rows.
 *
 * Records are separated by `\x1e`. Within a record the first `\x1f` splits
 * the source origin from the target prefix; any later `\x1f` stays part of
 * the target prefix. Records without a `\x1f` are ignored.
 *
 * @param string $compact_mapping Compact source-origin to target-prefix mappings.
 * @return array<int,array{from:string,to:string}>
 */
function wp_toolkit_native_api_benchmark_parse_plain_text_literal_mappings( $compact_mapping ) {
	$rows = array();

	foreach ( explode( "\x1e", $compact_mapping ) as $record ) {
		$fields = explode( "\x1f", $record, 2 );

		// Only records that actually contained a field separator have a second field.
		if ( isset( $fields[1] ) ) {
			list( $source_origin, $target_prefix ) = $fields;

			$rows[] = array(
				'from' => $source_origin,
				'to'   => $target_prefix,
			);
		}
	}

	return $rows;
}

/**
 * Tell whether a source-origin match starts at an acceptable left boundary.
 *
 * The boundary is acceptable when the match begins at the very start of the
 * text or when the byte immediately before it is whitespace per `ctype_space()`.
 *
 * @param string $text     Plain text fixture.
 * @param int    $position Candidate position.
 * @return bool
 */
function wp_toolkit_native_api_benchmark_literal_origin_has_valid_left_boundary( $text, $position ) {
	// Short-circuits at offset 0 so the preceding byte is only read when it exists.
	return 0 === $position || ctype_space( $text[ $position - 1 ] );
}

/**
 * Tell whether a source-origin match ends at an acceptable right boundary.
 *
 * The boundary is acceptable when the match runs to the end of the text or
 * when the byte right after it is `/`, `?` or `#`.
 *
 * @param string $text     Plain text fixture.
 * @param int    $position Candidate end position.
 * @return bool
 */
function wp_toolkit_native_api_benchmark_literal_origin_has_valid_right_boundary( $text, $position ) {
	// Nothing follows the match, so the origin ends with the text.
	if ( strlen( $text ) <= $position ) {
		return true;
	}

	return in_array( $text[ $position ], array( '/', '?', '#' ), true );
}

/**
* Benchmark the XML processor.
*
Expand Down Expand Up @@ -4266,6 +4453,32 @@ function wp_toolkit_native_api_benchmark_url_in_text_document() {
return implode( ' ', $items );
}

/**
 * Build the plain text fixture for literal source-origin URL rewrites.
 *
 * The fixture is 120 sentences joined by single spaces. Each sentence
 * mentions three `http://old.example` URLs (posts, media and meta) numbered
 * with the sentence index, which gives 360 URLs in total.
 *
 * @return string
 */
function wp_toolkit_native_api_benchmark_plain_text_literal_url_document() {
	$sentence_format = 'Post %1$d references http://old.example/posts/%1$d, http://old.example/media/%1$d.jpg and http://old.example/meta/%1$d.';

	$sentences = array_map(
		static function ( $index ) use ( $sentence_format ) {
			return sprintf( $sentence_format, $index );
		},
		range( 0, 119 )
	);

	return implode( ' ', $sentences );
}

/**
 * Build the compact mapping used by the literal rewrite benchmarks.
 *
 * The result is a single record: the source origin and the target prefix
 * joined by the `\x1f` field separator.
 *
 * @return string
 */
function wp_toolkit_native_api_benchmark_plain_text_literal_url_mapping() {
	$source_origin = 'http://old.example';
	$target_prefix = 'https://new.example/base';

	return $source_origin . "\x1f" . $target_prefix;
}

/**
* Build a representative XML document.
*
Expand Down
7 changes: 7 additions & 0 deletions extensions/native-apis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ candidate scanner. The public PHP class still validates candidates with the
existing WHATWG parser and uses the PHP regular-expression scanner for non-ASCII
text or when native defaults are disabled.

`wp_native_apis_rewrite_plain_text_literal_urls()` is a narrower primitive for
known plain text leaves. It accepts compact `source-origin\x1ftarget-prefix`
mappings separated by `\x1e`, rewrites exact HTTP(S) source-origin matches, and
returns `false` instead of guessing when the text contains structured-data
delimiters or an origin match is not bounded as a URL origin.

## Build Details

The build requires Rust, PHP development headers, `php-config`, and libclang.
Expand Down Expand Up @@ -378,6 +384,7 @@ caller-shaped workflow. The benchmark harness includes rows for these paths:
- XML tag, prefix, and sanitizer summaries through direct source scans.
- URL-in-text scans through a direct native plain-text URL candidate processor,
with public `URLInTextProcessor` rows preserving WHATWG validation.
- Plain text literal source-origin URL rewrites for parser-owned leaf text.
The compact batch APIs return strings with `\x1f` field separators and `\x1e`
record separators. They are intended for callers that need incremental
processing but can aggregate without building one PHP array per tag or token.
Expand Down
Loading
Loading