Skip to content
Merged
4 changes: 4 additions & 0 deletions src/uu/sort/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ workspace = true
[lib]
path = "src/sort.rs"

[features]
i18n-collator = ["uucore/i18n-collator"]

[dependencies]
bigdecimal = { workspace = true }
binary-heap-plus = { workspace = true }
Expand All @@ -39,6 +42,7 @@ uucore = { workspace = true, features = [
"parser-size",
"version-cmp",
"i18n-decimal",
"i18n-collator",
] }
fluent = { workspace = true }

Expand Down
61 changes: 51 additions & 10 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use chunks::LineData;
use clap::builder::ValueParser;
use clap::{Arg, ArgAction, ArgMatches, Command};
use custom_str_cmp::custom_str_cmp;

use ext_sort::ext_sort;
use fnv::FnvHasher;
use numeric_str_cmp::{NumInfo, NumInfoParseSettings, human_numeric_str_cmp, numeric_str_cmp};
Expand All @@ -47,6 +48,8 @@ use uucore::error::{FromIo, strip_errno};
use uucore::error::{UError, UResult, USimpleError, UUsageError};
use uucore::extendedbigdecimal::ExtendedBigDecimal;
use uucore::format_usage;
#[cfg(feature = "i18n-collator")]
use uucore::i18n::collator::locale_cmp;
use uucore::i18n::decimal::locale_decimal_separator;
use uucore::line_ending::LineEnding;
use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
Expand Down Expand Up @@ -318,7 +321,10 @@ impl GlobalSettings {
/// Precompute some data needed for sorting.
/// This function **must** be called before starting to sort, and `GlobalSettings` may not be altered
/// afterwards.
fn init_precomputed(&mut self) {
///
/// When i18n-collator is enabled, `disable_fast_lexicographic` should be set to true if we're
/// in a UTF-8 locale (to force locale-aware collation instead of byte comparison).
fn init_precomputed(&mut self, disable_fast_lexicographic: bool) {
self.precomputed.needs_tokens = self.selectors.iter().any(|s| s.needs_tokens);
self.precomputed.selections_per_line =
self.selectors.iter().filter(|s| s.needs_selection).count();
Expand All @@ -333,11 +339,15 @@ impl GlobalSettings {
.filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
.count();

self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic();
self.precomputed.fast_lexicographic =
!disable_fast_lexicographic && self.can_use_fast_lexicographic();
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
}

/// Returns true when the fast lexicographic path can be used safely.
/// Note: When i18n-collator is enabled, the caller must have already determined
/// whether locale-aware collation is needed (via checking if we're in a UTF-8 locale).
/// This check is performed in uumain() before init_precomputed() is called.
fn can_use_fast_lexicographic(&self) -> bool {
self.mode == SortMode::Default
&& !self.ignore_case
Expand Down Expand Up @@ -2065,7 +2075,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
emit_debug_warnings(&settings, &global_flags, &legacy_warnings);
}

settings.init_precomputed();
// Initialize locale collation if needed (UTF-8 locales)
// This MUST happen before init_precomputed() to avoid the performance regression
#[cfg(feature = "i18n-collator")]
let needs_locale_collation = uucore::i18n::collator::init_locale_collation();

#[cfg(not(feature = "i18n-collator"))]
let needs_locale_collation = false;

settings.init_precomputed(needs_locale_collation);

let result = exec(&mut files, &settings, output, &mut tmp_dir);
// Wait here if `SIGINT` was received,
Expand Down Expand Up @@ -2446,13 +2464,36 @@ fn compare_by<'a>(
}
SortMode::Month => month_compare(a_str, b_str),
SortMode::Version => version_cmp(a_str, b_str),
SortMode::Default => custom_str_cmp(
a_str,
b_str,
settings.ignore_non_printing,
settings.dictionary_order,
settings.ignore_case,
),
SortMode::Default => {
// Use locale-aware comparison if feature is enabled and no custom flags are set
#[cfg(feature = "i18n-collator")]
{
if settings.ignore_case
|| settings.dictionary_order
|| settings.ignore_non_printing
{
custom_str_cmp(
a_str,
b_str,
settings.ignore_non_printing,
settings.dictionary_order,
settings.ignore_case,
)
} else {
locale_cmp(a_str, b_str)
}
}
#[cfg(not(feature = "i18n-collator"))]
{
custom_str_cmp(
a_str,
b_str,
settings.ignore_non_printing,
settings.dictionary_order,
settings.ignore_case,
)
}
}
};
if cmp != Ordering::Equal {
return if settings.reverse { cmp.reverse() } else { cmp };
Expand Down
39 changes: 39 additions & 0 deletions src/uucore/src/lib/features/i18n/collator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,45 @@ pub fn init_collator(opts: CollatorOptions) {
.expect("Collator already initialized");
}

/// Set up locale-aware string collation when the active locale calls for it.
///
/// The locale encoding is queried first. If it is anything other than UTF-8
/// (for example a C/POSIX locale), no collator is created and `false` is
/// returned, so callers can keep using plain byte comparison. For a UTF-8
/// locale, a collator is requested with `AlternateHandling::Shifted`, chosen
/// to match GNU behavior.
///
/// # Returns
///
/// `false` when the locale encoding is not UTF-8. Otherwise, the result of
/// `try_init_collator` for the shifted options is passed through unchanged;
/// callers treat `true` as "locale-aware collation is active".
///
/// # Example
///
/// ```
/// use uucore::i18n::collator::init_locale_collation;
///
/// if init_locale_collation() {
///     // Using locale-aware collation
/// } else {
///     // Using byte comparison (C/POSIX locale)
/// }
/// ```
pub fn init_locale_collation() -> bool {
    use crate::i18n::{UEncoding, get_locale_encoding};

    let utf8_locale = get_locale_encoding() == UEncoding::Utf8;
    if utf8_locale {
        // Shifted alternate handling is used to match GNU behavior.
        let mut shifted = CollatorOptions::default();
        shifted.alternate_handling = Some(AlternateHandling::Shifted);
        try_init_collator(shifted)
    } else {
        // Non-UTF-8 locale: nothing to initialize, byte comparison suffices.
        false
    }
}

/// Compare both strings with regard to the current locale.
pub fn locale_cmp(left: &[u8], right: &[u8]) -> Ordering {
// If the detected locale is 'C', just do byte-wise comparison
Expand Down
9 changes: 8 additions & 1 deletion src/uucore/src/lib/features/i18n/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ pub enum UEncoding {
Utf8,
}

const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
// Use "und" (the BCP 47 tag for an undetermined language) as the marker for the C/POSIX locale.
// This ensures real locales like "en-US" won't match.
const DEFAULT_LOCALE: Locale = locale!("und");

/// Look at 3 environment variables in the following order
///
Expand All @@ -38,6 +40,11 @@ fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
let mut split = locale_var_str.split(&['.', '@']);

if let Some(simple) = split.next() {
// Handle explicit C and POSIX locales - these should always use byte comparison
if simple == "C" || simple == "POSIX" {
return (DEFAULT_LOCALE, UEncoding::Ascii);
}

// Naively convert the locale name to BCP47 tag format.
//
// See https://en.wikipedia.org/wiki/IETF_language_tag
Expand Down
67 changes: 67 additions & 0 deletions tests/by-util/test_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2463,4 +2463,71 @@ fn test_start_buffer() {
.stdout_only_bytes(&expected);
}

#[test]
fn test_locale_collation_c_locale() {
    // With LC_ALL=C the sort must fall back to plain byte ordering, so the
    // two-byte UTF-8 sequence for "é" lands after every ASCII letter:
    //   A=0x41, E=0x45, a=0x61, e=0x65, z=0x7a, é=0xc3 0xa9
    new_ucmd!()
        .env("LC_ALL", "C")
        .pipe_in("é\ne\nE\na\nA\nz\n")
        .succeeds()
        .stdout_is("A\nE\na\ne\nz\né\n");
}

#[test]
fn test_locale_collation_utf8() {
// Skip if UTF-8 locale is not available
let Ok(locale) = env::var("LOCALE_FR_UTF8") else {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this variable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's an existing pattern in the same file - see line 1636. Used by CI to specify available locale.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure it works?
I don't see where it is set.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right; I switched to en_US.UTF-8. The test now handles both cases, with and without i18n-collator.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I would prefer to use French.
We have locales in the CI.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switched to French.

return;
};
if locale == "none" {
return;
}

// In UTF-8 locale with collation, accented chars sort near base chars
// "é" should sort near "e", not at the end
let input = "z\né\ne\na\n";

let result = new_ucmd!().env("LC_ALL", &locale).pipe_in(input).succeeds();

let output = result.stdout_str();
// In a proper locale, 'a' comes first, then 'e'/'é' together, then 'z'
// The exact order of e vs é depends on locale, but both should come before z
assert!(
output.starts_with("a\n"),
"Expected 'a' first in locale-aware sort, got: {output}"
);
assert!(
output.ends_with("z\n"),
"Expected 'z' last in locale-aware sort, got: {output}"
);
}

#[test]
fn test_locale_collation_shifted_punctuation() {
    // Smoke test for sorting lines that differ only by a space or hyphen
    // ("ab", "a b", "a-b") under the locale named by LOCALE_FR_UTF8.
    // The test is skipped when that variable is unset or set to "none".
    let locale = match env::var("LOCALE_FR_UTF8") {
        Ok(name) if name != "none" => name,
        _ => return,
    };

    let result = new_ucmd!()
        .env("LC_ALL", &locale)
        .pipe_in("ab\na b\na-b\n")
        .succeeds();

    // Only the number of output lines is asserted here; the relative order
    // of the three entries is deliberately left unchecked.
    let output = result.stdout_str();
    let line_count = output.lines().count();
    assert_eq!(line_count, 3, "Expected 3 lines, got: {output}");
}

/* spell-checker: enable */
Loading