uutils · sylvestre · Jan 17, 2026 · Nov 7, 2025 · Nov 7, 2025 · Nov 8, 2025
diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml
@@ -19,6 +19,9 @@ workspace = true
 [lib]
 path = "src/sort.rs"
 
+[features]
+i18n-collator = ["uucore/i18n-collator"]
+
 [dependencies]
 bigdecimal = { workspace = true }
 binary-heap-plus = { workspace = true }
@@ -39,6 +42,7 @@ uucore = { workspace = true, features = [
   "parser-size",
   "version-cmp",
   "i18n-decimal",
+  "i18n-collator",
 ] }
 fluent = { workspace = true }
 

diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs
@@ -23,6 +23,7 @@ use chunks::LineData;
 use clap::builder::ValueParser;
 use clap::{Arg, ArgAction, ArgMatches, Command};
 use custom_str_cmp::custom_str_cmp;
+
 use ext_sort::ext_sort;
 use fnv::FnvHasher;
 use numeric_str_cmp::{NumInfo, NumInfoParseSettings, human_numeric_str_cmp, numeric_str_cmp};
@@ -47,6 +48,8 @@ use uucore::error::{FromIo, strip_errno};
 use uucore::error::{UError, UResult, USimpleError, UUsageError};
 use uucore::extendedbigdecimal::ExtendedBigDecimal;
 use uucore::format_usage;
+#[cfg(feature = "i18n-collator")]
+use uucore::i18n::collator::locale_cmp;
 use uucore::i18n::decimal::locale_decimal_separator;
 use uucore::line_ending::LineEnding;
 use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError};
@@ -318,7 +321,10 @@ impl GlobalSettings {
     /// Precompute some data needed for sorting.
     /// This function **must** be called before starting to sort, and `GlobalSettings` may not be altered
     /// afterwards.
-    fn init_precomputed(&mut self) {
+    ///
+    /// When i18n-collator is enabled, `disable_fast_lexicographic` should be set to true if we're
+    /// in a UTF-8 locale (to force locale-aware collation instead of byte comparison).
+    fn init_precomputed(&mut self, disable_fast_lexicographic: bool) {
         self.precomputed.needs_tokens = self.selectors.iter().any(|s| s.needs_tokens);
         self.precomputed.selections_per_line =
             self.selectors.iter().filter(|s| s.needs_selection).count();
@@ -333,11 +339,15 @@ impl GlobalSettings {
             .filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
             .count();
 
-        self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic();
+        self.precomputed.fast_lexicographic =
+            !disable_fast_lexicographic && self.can_use_fast_lexicographic();
         self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
     }
 
     /// Returns true when the fast lexicographic path can be used safely.
+    /// Note: When i18n-collator is enabled, the caller must have already determined
+    /// whether locale-aware collation is needed (via checking if we're in a UTF-8 locale).
+    /// This check is performed in uumain() before init_precomputed() is called.
     fn can_use_fast_lexicographic(&self) -> bool {
         self.mode == SortMode::Default
             && !self.ignore_case
@@ -2065,7 +2075,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
         emit_debug_warnings(&settings, &global_flags, &legacy_warnings);
     }
 
-    settings.init_precomputed();
+    // Initialize locale collation if needed (UTF-8 locales). This MUST happen before
+    // init_precomputed(), which takes the result to gate the fast lexicographic path.
+    #[cfg(feature = "i18n-collator")]
+    let needs_locale_collation = uucore::i18n::collator::init_locale_collation();
+
+    #[cfg(not(feature = "i18n-collator"))]
+    let needs_locale_collation = false;
+
+    settings.init_precomputed(needs_locale_collation);
 
     let result = exec(&mut files, &settings, output, &mut tmp_dir);
     // Wait here if `SIGINT` was received,
@@ -2446,13 +2464,36 @@ fn compare_by<'a>(
             }
             SortMode::Month => month_compare(a_str, b_str),
             SortMode::Version => version_cmp(a_str, b_str),
-            SortMode::Default => custom_str_cmp(
-                a_str,
-                b_str,
-                settings.ignore_non_printing,
-                settings.dictionary_order,
-                settings.ignore_case,
-            ),
+            SortMode::Default => {
+                // Use locale-aware comparison if feature is enabled and no custom flags are set
+                #[cfg(feature = "i18n-collator")]
+                {
+                    if settings.ignore_case
+                        || settings.dictionary_order
+                        || settings.ignore_non_printing
+                    {
+                        custom_str_cmp(
+                            a_str,
+                            b_str,
+                            settings.ignore_non_printing,
+                            settings.dictionary_order,
+                            settings.ignore_case,
+                        )
+                    } else {
+                        locale_cmp(a_str, b_str)
+                    }
+                }
+                #[cfg(not(feature = "i18n-collator"))]
+                {
+                    custom_str_cmp(
+                        a_str,
+                        b_str,
+                        settings.ignore_non_printing,
+                        settings.dictionary_order,
+                        settings.ignore_case,
+                    )
+                }
+            }
         };
         if cmp != Ordering::Equal {
             return if settings.reverse { cmp.reverse() } else { cmp };

diff --git a/src/uucore/src/lib/features/i18n/collator.rs b/src/uucore/src/lib/features/i18n/collator.rs
@@ -30,6 +30,45 @@ pub fn init_collator(opts: CollatorOptions) {
         .expect("Collator already initialized");
 }
 
+/// Set up locale-aware string comparison when the current locale calls for it.
+///
+/// The locale encoding is queried first. Anything other than UTF-8 (which
+/// includes the C/POSIX locale) is left alone, because plain byte comparison
+/// is what callers are expected to use there. For a UTF-8 locale the collator
+/// is initialized with shifted alternate handling.
+///
+/// # Returns
+///
+/// `false` when the locale encoding is not UTF-8 and nothing was initialized;
+/// otherwise whatever `try_init_collator` reports for the initialization
+/// attempt (presumably `true` on success).
+///
+/// # Example
+///
+/// ```
+/// use uucore::i18n::collator::init_locale_collation;
+///
+/// let locale_aware = init_locale_collation();
+/// if locale_aware {
+///     // Compare with the locale collator
+/// } else {
+///     // Compare bytes directly
+/// }
+/// ```
+pub fn init_locale_collation() -> bool {
+    use crate::i18n::{UEncoding, get_locale_encoding};
+
+    if get_locale_encoding() == UEncoding::Utf8 {
+        // Shifted alternate handling is chosen to match GNU behavior.
+        let mut collator_opts = CollatorOptions::default();
+        collator_opts.alternate_handling = Some(AlternateHandling::Shifted);
+        try_init_collator(collator_opts)
+    } else {
+        // Non-UTF-8 encoding (e.g. C/POSIX): no collator is set up.
+        false
+    }
+}
+
 /// Compare both strings with regard to the current locale.
 pub fn locale_cmp(left: &[u8], right: &[u8]) -> Ordering {
     // If the detected locale is 'C', just do byte-wise comparison

diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs
@@ -20,7 +20,9 @@ pub enum UEncoding {
     Utf8,
 }
 
-const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
+// Use "und" (the BCP 47 "undetermined" language tag) as the marker for the C/POSIX locale.
+// This ensures real locales like "en-US" won't match.
+const DEFAULT_LOCALE: Locale = locale!("und");
 
 /// Look at 3 environment variables in the following order
 ///
@@ -38,6 +40,11 @@ fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
         let mut split = locale_var_str.split(&['.', '@']);
 
         if let Some(simple) = split.next() {
+            // Handle explicit C and POSIX locales - these should always use byte comparison
+            if simple == "C" || simple == "POSIX" {
+                return (DEFAULT_LOCALE, UEncoding::Ascii);
+            }
+
             // Naively convert the locale name to BCP47 tag format.
             //
             // See https://en.wikipedia.org/wiki/IETF_language_tag

diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
@@ -2463,4 +2463,71 @@ fn test_start_buffer() {
         .stdout_only_bytes(&expected);
 }
 
+#[test]
+fn test_locale_collation_c_locale() {
+    // With LC_ALL=C the comparison is expected to be plain byte order, so the
+    // two-byte UTF-8 encoding of 'é' lands after every ASCII letter.
+    let shuffled = "é\ne\nE\na\nA\nz\n";
+
+    // Byte values: A=0x41, E=0x45, a=0x61, e=0x65, z=0x7a, é=0xc3 0xa9
+    let byte_ordered = "A\nE\na\ne\nz\né\n";
+
+    let outcome = new_ucmd!().env("LC_ALL", "C").pipe_in(shuffled).succeeds();
+
+    // The whole of stdout has to match the byte-ordered listing.
+    outcome.stdout_is(byte_ordered);
+}
+
+#[test]
+fn test_locale_collation_utf8() {
+    // Skipped unless LOCALE_FR_UTF8 is set to something other than "none".
+    let locale = match env::var("LOCALE_FR_UTF8") {
+        Ok(name) if name != "none" => name,
+        _ => return,
+    };
+
+    // Locale-aware collation keeps "é" next to "e" instead of pushing it past
+    // "z" the way byte order would.
+    let input = "z\né\ne\na\n";
+
+    let result = new_ucmd!().env("LC_ALL", &locale).pipe_in(input).succeeds();
+    let output = result.stdout_str();
+
+    // How "e" and "é" order relative to each other depends on the locale, so
+    // only the two ends are pinned: "a" opens the output and "z" closes it.
+    let a_leads = output.starts_with("a\n");
+    let z_trails = output.ends_with("z\n");
+    assert!(
+        a_leads,
+        "Expected 'a' first in locale-aware sort, got: {output}"
+    );
+    assert!(
+        z_trails,
+        "Expected 'z' last in locale-aware sort, got: {output}"
+    );
+}
+
+#[test]
+fn test_locale_collation_shifted_punctuation() {
+    // Shifted alternate handling ignores spaces and punctuation at the primary
+    // through tertiary levels: "ab", "a b" and "a-b" tie on their letters, and
+    // "a c" must follow all of them. Plain byte order would instead place
+    // "a c" second, so its position tells the two orderings apart.
+    let Ok(locale) = env::var("LOCALE_FR_UTF8") else {
+        return;
+    };
+    if locale == "none" {
+        return;
+    }
+
+    let input = "ab\na c\na b\na-b\n";
+    let result = new_ucmd!().env("LC_ALL", &locale).pipe_in(input).succeeds();
+
+    let output = result.stdout_str();
+    let mut lines: Vec<&str> = output.lines().collect();
+    assert_eq!(lines.pop(), Some("a c"), "Expected 'a c' last: {output}");
+    lines.sort_unstable();
+    assert_eq!(lines, ["a b", "a-b", "ab"], "Unexpected lines: {output}");
+}
+
 /* spell-checker: enable */