From 8c23920a528f0f7b2cf9bb8645b9b9e365f9f20a Mon Sep 17 00:00:00 2001 From: FidelSch Date: Tue, 20 Jan 2026 17:21:55 -0300 Subject: [PATCH 1/3] fold: read input in chunks This allows processing input that does not contain newlines. --- src/uu/fold/src/fold.rs | 223 ++++++++++++++++++++++++++++------------ 1 file changed, 159 insertions(+), 64 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 2eb97933180..a232ddb70d9 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -7,7 +7,7 @@ use clap::{Arg, ArgAction, Command}; use std::fs::File; -use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout}; +use std::io::{BufReader, BufWriter, Read, Write, stdin, stdout}; use std::path::Path; use unicode_width::UnicodeWidthChar; use uucore::display::Quotable; @@ -19,6 +19,7 @@ const TAB_WIDTH: usize = 8; const NL: u8 = b'\n'; const CR: u8 = b'\r'; const TAB: u8 = b'\t'; +const READ_CHUNK_SIZE: usize = 8192; // 8KB chunks mod options { pub const BYTES: &str = "bytes"; @@ -186,65 +187,105 @@ fn fold_file_bytewise( width: usize, output: &mut W, ) -> UResult<()> { - let mut line = Vec::new(); + let mut buffer = Vec::new(); + let mut chunk_buf = vec![0u8; READ_CHUNK_SIZE]; loop { - if file - .read_until(NL, &mut line) - .map_err_context(|| translate!("fold-error-readline"))? - == 0 - { + let bytes_read = file + .read(&mut chunk_buf) + .map_err_context(|| translate!("fold-error-readline"))?; + + if bytes_read == 0 { + // EOF: process any remaining data as a line without newline + if !buffer.is_empty() { + process_bytewise_line(&buffer, spaces, width, output)?; + } break; } - if line == [NL] { - output.write_all(&[NL])?; - line.truncate(0); - continue; + // Append chunk to buffer + buffer.extend_from_slice(&chunk_buf[..bytes_read]); + + // Process complete lines one at a time + let mut start = 0; + for i in 0..buffer.len() { + if buffer[i] == NL { + // Process line including the newline + process_bytewise_line(&buffer[start..=i], spaces, width, output)?; + start = i + 1; + } } - let len = line.len(); - let mut i = 0; - - while i < len { - let width = if len - i >= width { width } else { len - i }; - let slice = { - let slice = &line[i..i + width]; - if spaces && i + width < len { - match slice - .iter() - .enumerate() - .rev() - .find(|(_, c)| c.is_ascii_whitespace() && **c != CR) - { - Some((m, _)) => &slice[..=m], - None => slice, - } - } else { - slice - } - }; + // Remove processed data from buffer + if start > 0 { + buffer.drain(..start); + } - // Don't duplicate trailing newlines: if the slice is "\n", the - // previous iteration folded just before the end of the line and - // has already printed this newline. - if slice == [NL] { - break; - } + // If buffer is getting too large without a newline, process what we can + // and keep any remainder to maintain proper line handling + if buffer.len() >= READ_CHUNK_SIZE * 2 { + // Process the data but keep any partial line state + let process_len = buffer.len(); + process_bytewise_line(&buffer[..process_len], spaces, width, output)?; + buffer.clear(); + } + } - i += slice.len(); + Ok(()) +} - let at_eol = i >= len; +fn process_bytewise_line( + line: &[u8], + spaces: bool, + width: usize, + output: &mut W, +) -> UResult<()> { + if line.is_empty() { + return Ok(()); + } + + if line == [NL] { + output.write_all(&[NL])?; + return Ok(()); + } - if at_eol { - output.write_all(slice)?; + let len = line.len(); + let mut i = 0; + + while i < len { + let segment_width = if len - i >= width { width } else { len - i }; + let slice = { + let slice = &line[i..i + segment_width]; + if spaces && i + segment_width < len { + match slice + .iter() + .enumerate() + .rev() + .find(|(_, c)| c.is_ascii_whitespace() && **c != CR) + { + Some((m, _)) => &slice[..=m], + None => slice, + } } else { - output.write_all(slice)?; - output.write_all(&[NL])?; + slice } + }; + + // Don't duplicate trailing newlines: if the slice is "\n", the + // previous iteration folded just before the end of the line and + // has already printed this newline. + if slice == [NL] { + break; } - line.truncate(0); + i += slice.len(); + + let at_eol = i >= len; + + output.write_all(slice)?; + if !at_eol { + output.write_all(&[NL])?; + } } Ok(()) @@ -572,36 +613,90 @@ fn fold_file( mode: WidthMode, writer: &mut W, ) -> UResult<()> { - let mut line = Vec::new(); + let mut buffer = Vec::new(); let mut output = Vec::new(); let mut col_count = 0; let mut last_space = None; + let mut chunk_buf = vec![0u8; READ_CHUNK_SIZE]; loop { - if file - .read_until(NL, &mut line) - .map_err_context(|| translate!("fold-error-readline"))? - == 0 - { + let bytes_read = file + .read(&mut chunk_buf) + .map_err_context(|| translate!("fold-error-readline"))?; + + if bytes_read == 0 { + // EOF: process any remaining data as a line without newline + if !buffer.is_empty() { + let mut ctx = FoldContext { + spaces, + width, + mode, + writer, + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; + + match std::str::from_utf8(&buffer) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(&buffer, &mut ctx)?, + } + } break; } - let mut ctx = FoldContext { - spaces, - width, - mode, - writer, - output: &mut output, - col_count: &mut col_count, - last_space: &mut last_space, - }; + buffer.extend_from_slice(&chunk_buf[..bytes_read]); + + // Process complete lines one at a time + let mut start = 0; + for i in 0..buffer.len() { + if buffer[i] == NL { + // Process line including the newline + let line_data = &buffer[start..=i]; + + let mut ctx = FoldContext { + spaces, + width, + mode, + writer, + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; - match std::str::from_utf8(&line) { - Ok(s) => process_utf8_line(s, &mut ctx)?, - Err(_) => process_non_utf8_line(&line, &mut ctx)?, + match std::str::from_utf8(line_data) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(line_data, &mut ctx)?, + } + + start = i + 1; + } } - line.clear(); + // Remove processed data from buffer + if start > 0 { + buffer.drain(..start); + } + + // If buffer is getting too large without a newline, process it anyway + if buffer.len() >= READ_CHUNK_SIZE * 2 { + let mut ctx = FoldContext { + spaces, + width, + mode, + writer, + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; + + match std::str::from_utf8(&buffer) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(&buffer, &mut ctx)?, + } + + buffer.clear(); + } } if !output.is_empty() { From 98da70f7631c6e2326efea6b8bf0fcdc767f10da Mon Sep 17 00:00:00 2001 From: FidelSch Date: Wed, 21 Jan 2026 09:57:25 -0300 Subject: [PATCH 2/3] fold: handle multi-byte characters at chunk borders --- src/uu/fold/src/fold.rs | 75 +++++++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index a232ddb70d9..6e6d92a6de1 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -295,6 +295,49 @@ fn next_tab_stop(col_count: usize) -> usize { col_count + TAB_WIDTH - col_count % TAB_WIDTH } +/// Find the start of an incomplete UTF-8 sequence at the end of a buffer. +/// Returns the index where the incomplete sequence starts, or buffer.len() if complete. +fn find_incomplete_utf8_start(buffer: &[u8]) -> usize { + let len = buffer.len(); + + // Check last 3 bytes looking for an incomplete UTF-8 sequence + // UTF-8 continuation bytes start with 0b10xxxxxx + // UTF-8 start bytes: 0b0xxxxxxx (1-byte), 0b110xxxxx (2-byte), 0b1110xxxx (3-byte), 0b11110xxx (4-byte) + + for i in (len.saturating_sub(3)..len).rev() { + let byte = buffer[i]; + + // Check if this is a UTF-8 start byte + if byte & 0b10000000 == 0 { + // Single-byte character (ASCII), complete + return len; + } else if byte & 0b11000000 == 0b11000000 { + // This is a UTF-8 start byte + let expected_len = if byte & 0b11100000 == 0b11000000 { + 2 + } else if byte & 0b11110000 == 0b11100000 { + 3 + } else if byte & 0b11111000 == 0b11110000 { + 4 + } else { + // Invalid UTF-8 start byte + return len; + }; + + let actual_len = len - i; + if actual_len < expected_len { + // Incomplete sequence found + return i; + } else { + // Complete sequence + return len; + } + } + } + + len +} + fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { match mode { WidthMode::Characters => std::str::from_utf8(buffer) @@ -680,22 +723,26 @@ fn fold_file( // If buffer is getting too large without a newline, process it anyway if buffer.len() >= READ_CHUNK_SIZE * 2 { - let mut ctx = FoldContext { - spaces, - width, - mode, - writer, - output: &mut output, - col_count: &mut col_count, - last_space: &mut last_space, - }; + let process_up_to = find_incomplete_utf8_start(&buffer); - match std::str::from_utf8(&buffer) { - Ok(s) => process_utf8_line(s, &mut ctx)?, - Err(_) => process_non_utf8_line(&buffer, &mut ctx)?, - } + if process_up_to > 0 { + let mut ctx = FoldContext { + spaces, + width, + mode, + writer, + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; - buffer.clear(); + match std::str::from_utf8(&buffer[..process_up_to]) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(&buffer[..process_up_to], &mut ctx)?, + } + + buffer.drain(..process_up_to); + } } } From 0a1ca4794f4088ca60a1b6a5c68f35afd9b02286 Mon Sep 17 00:00:00 2001 From: FidelSch Date: Wed, 21 Jan 2026 09:57:25 -0300 Subject: [PATCH 3/3] fold: handle multi-byte characters at chunk borders --- src/uu/fold/src/fold.rs | 74 +++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index a232ddb70d9..d12188be98b 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -295,6 +295,48 @@ fn next_tab_stop(col_count: usize) -> usize { col_count + TAB_WIDTH - col_count % TAB_WIDTH } +/// Find the start of an incomplete UTF-8 sequence at the end of a buffer. +/// Returns the index where the incomplete sequence starts, or buffer.len() if complete. +fn find_incomplete_utf8_start(buffer: &[u8]) -> usize { + let len = buffer.len(); + + // Check last 3 bytes looking for an incomplete UTF-8 sequence + // UTF-8 continuation bytes start with 0b10xxxxxx + // UTF-8 start bytes: 0b0xxxxxxx (1-byte), 0b110xxxxx (2-byte), 0b1110xxxx (3-byte), 0b11110xxx (4-byte) + + for i in (len.saturating_sub(3)..len).rev() { + let byte = buffer[i]; + + // Check if this is a UTF-8 start byte + if byte & 0b10000000 == 0 { + // Single-byte character (ASCII), complete + return len; + } else if byte & 0b11000000 == 0b11000000 { + // This is a UTF-8 start byte + let expected_len = if byte & 0b11100000 == 0b11000000 { + 2 + } else if byte & 0b11110000 == 0b11100000 { + 3 + } else if byte & 0b11111000 == 0b11110000 { + 4 + } else { + // Invalid UTF-8 start byte + return len; + }; + + let actual_len = len - i; + if actual_len < expected_len { + // Incomplete sequence found + return i; + } + + return len; + } + } + + len +} + fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { match mode { WidthMode::Characters => std::str::from_utf8(buffer) @@ -680,22 +722,26 @@ fn fold_file( // If buffer is getting too large without a newline, process it anyway if buffer.len() >= READ_CHUNK_SIZE * 2 { - let mut ctx = FoldContext { - spaces, - width, - mode, - writer, - output: &mut output, - col_count: &mut col_count, - last_space: &mut last_space, - }; + let process_up_to = find_incomplete_utf8_start(&buffer); - match std::str::from_utf8(&buffer) { - Ok(s) => process_utf8_line(s, &mut ctx)?, - Err(_) => process_non_utf8_line(&buffer, &mut ctx)?, - } + if process_up_to > 0 { + let mut ctx = FoldContext { + spaces, + width, + mode, + writer, + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; - buffer.clear(); + match std::str::from_utf8(&buffer[..process_up_to]) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(&buffer[..process_up_to], &mut ctx)?, + } + + buffer.drain(..process_up_to); + } } }