Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
073f7fc
test: add word joiner and cyrillic kha character tests for fmt
mattsu2020 Nov 19, 2025
36a01a1
feat: Enhance `fmt` to handle invalid UTF-8 input by replacing malfor…
mattsu2020 Nov 19, 2025
2c617d4
chore: add FFFD to spell-checker ignore list in fmt test.
mattsu2020 Nov 19, 2025
db77543
refactor(uu/fmt): switch string outputs to byte slices for efficiency
mattsu2020 Nov 19, 2025
4445c5a
refactor(fmt): replace Option::map_or(false, f) with is_some_and(f)
mattsu2020 Nov 19, 2025
c59f1bc
style(fmt): compact whitespace check in WordSplit iterator to single …
mattsu2020 Nov 19, 2025
6a313a4
fix(test_fmt): align invalid UTF-8 handling with GNU-compatible passt…
mattsu2020 Nov 19, 2025
fee09e6
Merge branch 'main' into fmt_compatibility
mattsu2020 Dec 15, 2025
e36cf9d
Merge branch 'main' into fmt_compatibility
mattsu2020 Dec 24, 2025
94efbb4
Merge branch 'main' into fmt_compatibility
mattsu2020 Dec 27, 2025
6ee9241
refactor(parasplit): extract scan_word_end method to eliminate code d…
mattsu2020 Dec 27, 2025
6348929
feat(fmt): add byte_display_width function for UTF-8 display width ca…
mattsu2020 Dec 28, 2025
8ece9f5
Merge branch 'main' into fmt_compatibility
sylvestre Dec 29, 2025
d2fa979
Merge branch 'main' into fmt_compatibility
mattsu2020 Jan 5, 2026
637bf76
Merge branch 'main' into fmt_compatibility
mattsu2020 Jan 5, 2026
2c5b12d
refactor(fmt/parasplit): introduce DecodedCharInfo struct for unified…
mattsu2020 Jan 10, 2026
7077d69
refactor(fmt): enhance utf8_char_width with constants and comments fo…
mattsu2020 Jan 12, 2026
9182df4
style(fmt): remove extra blank lines in parasplit.rs
mattsu2020 Jan 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions src/uu/fmt/src/parasplit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

// spell-checker:ignore (ToDO) INFTY MULT PSKIP accum aftertab beforetab breakwords fmt's formatline linebreak linebreaking linebreaks linelen maxlength minlength nchars noformat noformatline ostream overlen parasplit plass pmatch poffset posn powf prefixindent punct signum slen sstart tabwidth tlen underlen winfo wlen wordlen wordsplits xanti xprefix

use std::io::{BufRead, Lines};
use std::io::BufRead;
use std::iter::Peekable;
use std::slice::Iter;
use unicode_width::UnicodeWidthChar;
Expand Down Expand Up @@ -78,12 +78,12 @@ pub struct FileLine {
/// Iterator that produces a stream of Lines from a file
pub struct FileLines<'a> {
opts: &'a FmtOptions,
lines: Lines<&'a mut FileOrStdReader>,
reader: &'a mut FileOrStdReader,
}

impl FileLines<'_> {
fn new<'b>(opts: &'b FmtOptions, lines: Lines<&'b mut FileOrStdReader>) -> FileLines<'b> {
FileLines { opts, lines }
fn new<'b>(opts: &'b FmtOptions, reader: &'b mut FileOrStdReader) -> FileLines<'b> {
FileLines { opts, reader }
}

/// returns true if this line should be formatted
Expand Down Expand Up @@ -156,7 +156,19 @@ impl Iterator for FileLines<'_> {
type Item = Line;

fn next(&mut self) -> Option<Line> {
let n = self.lines.next()?.ok()?;
let mut buf = Vec::new();
match self.reader.read_until(b'\n', &mut buf) {
Ok(0) => return None,
Ok(_) => {}
Err(_) => return None,
}
if buf.ends_with(b"\n") {
buf.pop();
if buf.ends_with(b"\r") {
buf.pop();
}
}
let n = String::from_utf8_lossy(&buf).into_owned();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think using from_utf8_lossy is incorrect.

If you look at the output of GNU fmt, you will see that it does not do a lossy conversion; the invalid byte is passed through unchanged:

$ printf "=\xA0=" | fmt -s -w1 | hexdump -X
0000000  3d  a0  3d  0a                                                
0000004

And our output:

$ printf "=\xA0=" | cargo run -q fmt -s -w1 | hexdump -X
0000000  3d  ef  bf  bd  3d  0a                                        
0000006

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed it.


// if this line is entirely whitespace,
// emit a blank line
Expand Down Expand Up @@ -242,7 +254,7 @@ pub struct ParagraphStream<'a> {

impl ParagraphStream<'_> {
pub fn new<'b>(opts: &'b FmtOptions, reader: &'b mut FileOrStdReader) -> ParagraphStream<'b> {
let lines = FileLines::new(opts, reader.lines()).peekable();
let lines = FileLines::new(opts, reader).peekable();
// at the beginning of the file, we might find mail headers
ParagraphStream {
lines,
Expand Down
18 changes: 17 additions & 1 deletion tests/by-util/test_fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// spell-checker:ignore plass samp
// spell-checker:ignore plass samp FFFD
#[cfg(target_os = "linux")]
use std::os::unix::ffi::OsStringExt;
use uutests::new_ucmd;
Expand Down Expand Up @@ -323,6 +323,8 @@ fn test_fmt_unicode_whitespace_handling() {
("non-breaking space", non_breaking_space),
("figure space", figure_space),
("narrow no-break space", narrow_no_break_space),
("word joiner", "\u{2060}"),
("cyrillic kha", "\u{0445}"),
] {
let input = format!("={char}=");
let result = new_ucmd!()
Expand Down Expand Up @@ -397,3 +399,17 @@ fn fmt_reflow_unicode() {
.succeeds()
.stdout_is("漢字漢字\n💐\n日本語の文字\n");
}

#[test]
fn test_fmt_invalid_utf8() {
    // Regression guard: input that is not valid UTF-8 (here a lone 0xA0 byte,
    // which is a non-breaking space in ISO-8859-1) must not cause the line to
    // be dropped. The malformed byte is expected to come out as U+FFFD and to
    // be treated as a non-space character, so the line is not split even
    // with `-s -w1`.
    // NOTE(review): GNU fmt reportedly emits the raw byte unchanged
    // (3d a0 3d 0a); this test pins the replacement-character behavior.
    let expected = "=\u{FFFD}=\n";
    let raw_latin1 = b"=\xA0=";
    new_ucmd!()
        .args(&["-s", "-w1"])
        .pipe_in(raw_latin1)
        .succeeds()
        .stdout_is(expected);
}
Loading