Skip to content

Commit 6db319e

Browse files
fold: fix GNU test fold-zero-width.sh (#9274)
--------- Co-authored-by: Sylvestre Ledru <sylvestre@debian.org>
1 parent d2bd2d3 commit 6db319e

File tree

4 files changed

+443
-42
lines changed

4 files changed

+443
-42
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -535,13 +535,15 @@ filetime.workspace = true
535535
glob.workspace = true
536536
jiff.workspace = true
537537
libc.workspace = true
538+
bytecount.workspace = true
538539
num-prime.workspace = true
539540
pretty_assertions = "1.4.0"
540541
rand.workspace = true
541542
regex.workspace = true
542543
sha1 = { workspace = true, features = ["std"] }
543544
tempfile.workspace = true
544545
time = { workspace = true, features = ["local-offset"] }
546+
unicode-width.workspace = true
545547
unindent = "0.2.3"
546548
uutests.workspace = true
547549
uucore = { workspace = true, features = [

src/uu/fold/src/fold.rs

Lines changed: 140 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@ const TAB_WIDTH: usize = 8;
1919
const NL: u8 = b'\n';
2020
const CR: u8 = b'\r';
2121
const TAB: u8 = b'\t';
22+
// Implementation threshold (8 KiB) to prevent unbounded buffer growth during streaming.
23+
// Chosen as a small, fixed cap: large enough to avoid excessive flushes, but
24+
// small enough to keep memory bounded when the input has no fold points.
25+
const STREAMING_FLUSH_THRESHOLD: usize = 8 * 1024;
2226

2327
mod options {
2428
pub const BYTES: &str = "bytes";
@@ -288,6 +292,10 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize {
288292
}
289293

290294
fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
295+
// Emit one folded line:
296+
// - with `-s`, cut at the last remembered whitespace when possible
297+
// - otherwise, cut at the current buffer end
298+
// The remainder (if any) stays in the buffer for the next line.
291299
let consume = match *ctx.last_space {
292300
Some(index) => index + 1,
293301
None => ctx.output.len(),
@@ -309,6 +317,7 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
309317
*ctx.col_count = compute_col_count(ctx.output, ctx.mode);
310318

311319
if ctx.spaces {
320+
// Rebase the remembered whitespace position into the remaining buffer.
312321
*ctx.last_space = last_space.and_then(|idx| {
313322
if idx < consume {
314323
None
@@ -322,6 +331,36 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
322331
Ok(())
323332
}
324333

334+
fn maybe_flush_unbroken_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
335+
// In streaming mode without `-s`, avoid unbounded buffering by periodically
336+
// flushing long unbroken segments. With `-s` we must keep the buffer so we
337+
// can still break at the last whitespace boundary.
338+
if ctx.spaces || ctx.output.len() < STREAMING_FLUSH_THRESHOLD {
339+
return Ok(());
340+
}
341+
342+
// Write raw bytes without inserting a newline; folding will continue
343+
// based on updated column tracking in the caller.
344+
ctx.writer.write_all(ctx.output)?;
345+
ctx.output.clear();
346+
Ok(())
347+
}
348+
349+
fn push_byte<W: Write>(ctx: &mut FoldContext<'_, W>, byte: u8) -> UResult<()> {
350+
// Append a single byte to the buffer.
351+
ctx.output.push(byte);
352+
maybe_flush_unbroken_output(ctx)
353+
}
354+
355+
fn push_bytes<W: Write>(ctx: &mut FoldContext<'_, W>, bytes: &[u8]) -> UResult<()> {
356+
// Append a byte slice to the buffer and flush if it grows too large.
357+
if bytes.is_empty() {
358+
return Ok(());
359+
}
360+
ctx.output.extend_from_slice(bytes);
361+
maybe_flush_unbroken_output(ctx)
362+
}
363+
325364
fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
326365
let mut idx = 0;
327366
let len = line.len();
@@ -331,15 +370,15 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR
331370
NL => {
332371
*ctx.last_space = None;
333372
emit_output(ctx)?;
334-
break;
373+
idx += 1;
335374
}
336375
CR => {
337-
ctx.output.push(CR);
376+
push_byte(ctx, CR)?;
338377
*ctx.col_count = 0;
339378
idx += 1;
340379
}
341380
0x08 => {
342-
ctx.output.push(0x08);
381+
push_byte(ctx, 0x08)?;
343382
*ctx.col_count = ctx.col_count.saturating_sub(1);
344383
idx += 1;
345384
}
@@ -358,16 +397,23 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR
358397
} else {
359398
*ctx.last_space = None;
360399
}
361-
ctx.output.push(TAB);
400+
push_byte(ctx, TAB)?;
362401
idx += 1;
363402
}
364403
0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => {
365-
ctx.output.push(line[idx]);
404+
push_byte(ctx, line[idx])?;
366405
if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR {
367406
*ctx.last_space = Some(ctx.output.len() - 1);
368407
} else if !ctx.spaces {
369408
*ctx.last_space = None;
370409
}
410+
411+
if ctx.mode == WidthMode::Characters {
412+
*ctx.col_count = ctx.col_count.saturating_add(1);
413+
if *ctx.col_count >= ctx.width {
414+
emit_output(ctx)?;
415+
}
416+
}
371417
idx += 1;
372418
}
373419
_ => {
@@ -405,7 +451,7 @@ fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) ->
405451
let take = remaining.len().min(available);
406452
let base_len = ctx.output.len();
407453

408-
ctx.output.extend_from_slice(&remaining[..take]);
454+
push_bytes(ctx, &remaining[..take])?;
409455
*ctx.col_count += take;
410456

411457
if ctx.spaces {
@@ -430,16 +476,26 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
430476
return process_ascii_line(line.as_bytes(), ctx);
431477
}
432478

479+
process_utf8_chars(line, ctx)
480+
}
481+
482+
fn process_utf8_chars<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> {
433483
let line_bytes = line.as_bytes();
434484
let mut iter = line.char_indices().peekable();
435485

436486
while let Some((byte_idx, ch)) = iter.next() {
437-
// Include combining characters with the base character
438-
while let Some(&(_, next_ch)) = iter.peek() {
439-
if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 {
440-
iter.next();
441-
} else {
442-
break;
487+
// Include combining characters with the base character when we are
488+
// measuring by display columns. In character-counting mode every
489+
// scalar value must advance the counter to match `chars().count()`
490+
// semantics (see `fold_characters_reference` in the tests), so we do
491+
// not coalesce zero-width scalars there.
492+
if ctx.mode == WidthMode::Columns {
493+
while let Some(&(_, next_ch)) = iter.peek() {
494+
if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 {
495+
iter.next();
496+
} else {
497+
break;
498+
}
443499
}
444500
}
445501

@@ -448,23 +504,21 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
448504
if ch == '\n' {
449505
*ctx.last_space = None;
450506
emit_output(ctx)?;
451-
break;
507+
continue;
452508
}
453509

454510
if *ctx.col_count >= ctx.width {
455511
emit_output(ctx)?;
456512
}
457513

458514
if ch == '\r' {
459-
ctx.output
460-
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
515+
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
461516
*ctx.col_count = 0;
462517
continue;
463518
}
464519

465520
if ch == '\x08' {
466-
ctx.output
467-
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
521+
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
468522
*ctx.col_count = ctx.col_count.saturating_sub(1);
469523
continue;
470524
}
@@ -484,8 +538,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
484538
} else {
485539
*ctx.last_space = None;
486540
}
487-
ctx.output
488-
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
541+
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
489542
continue;
490543
}
491544

@@ -506,8 +559,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
506559
*ctx.last_space = Some(ctx.output.len());
507560
}
508561

509-
ctx.output
510-
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
562+
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
511563
*ctx.col_count = ctx.col_count.saturating_add(added);
512564
}
513565

@@ -519,7 +571,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
519571
if byte == NL {
520572
*ctx.last_space = None;
521573
emit_output(ctx)?;
522-
break;
574+
continue;
523575
}
524576

525577
if *ctx.col_count >= ctx.width {
@@ -539,7 +591,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
539591
} else {
540592
None
541593
};
542-
ctx.output.push(byte);
594+
push_byte(ctx, byte)?;
543595
continue;
544596
}
545597
0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1),
@@ -550,7 +602,46 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
550602
_ => *ctx.col_count = ctx.col_count.saturating_add(1),
551603
}
552604

553-
ctx.output.push(byte);
605+
push_byte(ctx, byte)?;
606+
}
607+
608+
Ok(())
609+
}
610+
611+
/// Process buffered bytes, emitting output for valid UTF-8 prefixes and
612+
/// deferring incomplete sequences until more input arrives.
613+
///
614+
/// If the buffer contains invalid UTF-8, it is handled in non-UTF-8 mode and
615+
/// the buffer is fully consumed.
616+
fn process_pending_chunk<W: Write>(
617+
pending: &mut Vec<u8>,
618+
ctx: &mut FoldContext<'_, W>,
619+
) -> UResult<()> {
620+
while !pending.is_empty() {
621+
match std::str::from_utf8(pending) {
622+
Ok(valid) => {
623+
process_utf8_line(valid, ctx)?;
624+
pending.clear();
625+
break;
626+
}
627+
Err(err) => {
628+
if err.error_len().is_some() {
629+
let res = process_non_utf8_line(pending, ctx);
630+
pending.clear();
631+
res?;
632+
break;
633+
}
634+
635+
let valid_up_to = err.valid_up_to();
636+
if valid_up_to == 0 {
637+
break;
638+
}
639+
640+
let valid = std::str::from_utf8(&pending[..valid_up_to]).expect("valid prefix");
641+
process_utf8_line(valid, ctx)?;
642+
pending.drain(..valid_up_to);
643+
}
644+
}
554645
}
555646

556647
Ok(())
@@ -572,20 +663,12 @@ fn fold_file<T: Read, W: Write>(
572663
mode: WidthMode,
573664
writer: &mut W,
574665
) -> UResult<()> {
575-
let mut line = Vec::new();
576666
let mut output = Vec::new();
577667
let mut col_count = 0;
578668
let mut last_space = None;
669+
let mut pending = Vec::with_capacity(8 * 1024);
579670

580-
loop {
581-
if file
582-
.read_until(NL, &mut line)
583-
.map_err_context(|| translate!("fold-error-readline"))?
584-
== 0
585-
{
586-
break;
587-
}
588-
671+
{
589672
let mut ctx = FoldContext {
590673
spaces,
591674
width,
@@ -596,17 +679,32 @@ fn fold_file<T: Read, W: Write>(
596679
last_space: &mut last_space,
597680
};
598681

599-
match std::str::from_utf8(&line) {
600-
Ok(s) => process_utf8_line(s, &mut ctx)?,
601-
Err(_) => process_non_utf8_line(&line, &mut ctx)?,
682+
loop {
683+
let buffer = file
684+
.fill_buf()
685+
.map_err_context(|| translate!("fold-error-readline"))?;
686+
if buffer.is_empty() {
687+
break;
688+
}
689+
pending.extend_from_slice(buffer);
690+
let consumed = buffer.len();
691+
file.consume(consumed);
692+
693+
process_pending_chunk(&mut pending, &mut ctx)?;
602694
}
603695

604-
line.clear();
605-
}
696+
if !pending.is_empty() {
697+
match std::str::from_utf8(&pending) {
698+
Ok(s) => process_utf8_line(s, &mut ctx)?,
699+
Err(_) => process_non_utf8_line(&pending, &mut ctx)?,
700+
}
701+
pending.clear();
702+
}
606703

607-
if !output.is_empty() {
608-
writer.write_all(&output)?;
609-
output.clear();
704+
if !ctx.output.is_empty() {
705+
ctx.writer.write_all(ctx.output)?;
706+
ctx.output.clear();
707+
}
610708
}
611709

612710
Ok(())

0 commit comments

Comments
 (0)