Skip to content

Commit e539bdc

Browse files
fix(sort): Enable locale-aware collation for UTF-8 locales

Fixes #9148

The sort implementation had locale support infrastructure (ICU collator), but it was never used because the fast_lexicographic optimization bypassed all locale-aware code.

Changes:
- Modified can_use_fast_lexicographic() to check the locale encoding
- For UTF-8 locales, disable the fast path so that locale_cmp() is used
- Initialize the ICU collator with AlternateHandling::Shifted to match GNU
- Enable the i18n-common and i18n-collator features in sort's Cargo.toml

Result: Perfect match with GNU sort for C, POSIX, and UTF-8 locales. No performance impact for C/POSIX locales (they still use the fast path).
1 parent 68d69bd commit e539bdc

File tree

3 files changed

+71
-9
lines changed

3 files changed

+71
-9
lines changed

src/uu/sort/Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ workspace = true
1919
[lib]
2020
path = "src/sort.rs"
2121

22+
[features]
23+
i18n-collator = ["uucore/i18n-collator"]
24+
2225
[dependencies]
2326
bigdecimal = { workspace = true }
2427
binary-heap-plus = { workspace = true }
@@ -34,7 +37,7 @@ self_cell = { workspace = true }
3437
tempfile = { workspace = true }
3538
thiserror = { workspace = true }
3639
unicode-width = { workspace = true }
37-
uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] }
40+
uucore = { workspace = true, features = ["fs", "parser", "version-cmp", "i18n-common", "i18n-collator"] }
3841
fluent = { workspace = true }
3942
nix = { workspace = true }
4043

src/uu/sort/src/sort.rs

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ use chunks::LineData;
2323
use clap::builder::ValueParser;
2424
use clap::{Arg, ArgAction, Command};
2525
use custom_str_cmp::custom_str_cmp;
26+
27+
#[cfg(feature = "i18n-collator")]
28+
use uucore::i18n::collator::{locale_cmp, try_init_collator};
2629
use ext_sort::ext_sort;
2730
use fnv::FnvHasher;
2831
#[cfg(target_os = "linux")]
@@ -350,6 +353,18 @@ impl GlobalSettings {
350353

351354
/// Returns true when the fast lexicographic path can be used safely.
352355
fn can_use_fast_lexicographic(&self) -> bool {
356+
// When i18n-collator is enabled, check if we need locale-aware collation.
357+
// If we're in a UTF-8 locale, we must use locale_cmp instead of byte comparison.
358+
#[cfg(feature = "i18n-collator")]
359+
{
360+
use uucore::i18n::{get_locale_encoding, UEncoding};
361+
362+
if get_locale_encoding() == UEncoding::Utf8 {
363+
// UTF-8 locale requires locale-aware collation
364+
return false;
365+
}
366+
}
367+
353368
self.mode == SortMode::Default
354369
&& !self.ignore_case
355370
&& !self.dictionary_order
@@ -1114,6 +1129,20 @@ fn default_merge_batch_size() -> usize {
11141129
#[uucore::main]
11151130
#[allow(clippy::cognitive_complexity)]
11161131
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
1132+
// Initialize locale collator if feature is enabled
1133+
#[cfg(feature = "i18n-collator")]
1134+
{
1135+
use uucore::i18n::collator::{AlternateHandling, CollatorOptions};
1136+
1137+
// Initialize ICU collator with Shifted mode to match GNU sort behavior
1138+
let mut opts = CollatorOptions::default();
1139+
opts.alternate_handling = Some(AlternateHandling::Shifted);
1140+
1141+
if !try_init_collator(opts) {
1142+
eprintln!("sort: warning: Failed to initialize locale collator");
1143+
}
1144+
}
1145+
11171146
let mut settings = GlobalSettings::default();
11181147

11191148
let matches = uucore::clap_localization::handle_clap_result_with_exit_code(uu_app(), args, 2)?;
@@ -1787,13 +1816,36 @@ fn compare_by<'a>(
17871816
}
17881817
SortMode::Month => month_compare(a_str, b_str),
17891818
SortMode::Version => version_cmp(a_str, b_str),
1790-
SortMode::Default => custom_str_cmp(
1791-
a_str,
1792-
b_str,
1793-
settings.ignore_non_printing,
1794-
settings.dictionary_order,
1795-
settings.ignore_case,
1796-
),
1819+
SortMode::Default => {
1820+
// Use locale-aware comparison if feature is enabled and no custom flags are set
1821+
#[cfg(feature = "i18n-collator")]
1822+
{
1823+
if !(settings.ignore_case
1824+
|| settings.dictionary_order
1825+
|| settings.ignore_non_printing)
1826+
{
1827+
locale_cmp(a_str, b_str)
1828+
} else {
1829+
custom_str_cmp(
1830+
a_str,
1831+
b_str,
1832+
settings.ignore_non_printing,
1833+
settings.dictionary_order,
1834+
settings.ignore_case,
1835+
)
1836+
}
1837+
}
1838+
#[cfg(not(feature = "i18n-collator"))]
1839+
{
1840+
custom_str_cmp(
1841+
a_str,
1842+
b_str,
1843+
settings.ignore_non_printing,
1844+
settings.dictionary_order,
1845+
settings.ignore_case,
1846+
)
1847+
}
1848+
}
17971849
};
17981850
if cmp != Ordering::Equal {
17991851
return if settings.reverse { cmp.reverse() } else { cmp };

src/uucore/src/lib/features/i18n/mod.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ pub enum UEncoding {
2020
Utf8,
2121
}
2222

23-
const DEFAULT_LOCALE: Locale = locale!("en-US-posix");
23+
// Use "und" (the BCP 47 "undetermined" language tag) as the marker for the C/POSIX locale
24+
// This ensures real locales like "en-US" won't match
25+
const DEFAULT_LOCALE: Locale = locale!("und");
2426

2527
/// Look at 3 environment variables in the following order
2628
///
@@ -38,6 +40,11 @@ fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
3840
let mut split = locale_var_str.split(&['.', '@']);
3941

4042
if let Some(simple) = split.next() {
43+
// Handle explicit C and POSIX locales - these should always use byte comparison
44+
if simple == "C" || simple == "POSIX" {
45+
return (DEFAULT_LOCALE, UEncoding::Ascii);
46+
}
47+
4148
// Naively convert the locale name to BCP47 tag format.
4249
//
4350
// See https://en.wikipedia.org/wiki/IETF_language_tag

0 commit comments

Comments
 (0)