Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 205 additions & 64 deletions src/uu/fold/src/fold.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

use clap::{Arg, ArgAction, Command};
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout};
use std::io::{BufReader, BufWriter, Read, Write, stdin, stdout};
use std::path::Path;
use unicode_width::UnicodeWidthChar;
use uucore::display::Quotable;
Expand All @@ -19,6 +19,7 @@ const TAB_WIDTH: usize = 8;
const NL: u8 = b'\n';
const CR: u8 = b'\r';
const TAB: u8 = b'\t';
const READ_CHUNK_SIZE: usize = 8192; // 8KB chunks

mod options {
pub const BYTES: &str = "bytes";
Expand Down Expand Up @@ -186,65 +187,105 @@ fn fold_file_bytewise<T: Read, W: Write>(
width: usize,
output: &mut W,
) -> UResult<()> {
let mut line = Vec::new();
let mut buffer = Vec::new();
let mut chunk_buf = vec![0u8; READ_CHUNK_SIZE];

loop {
if file
.read_until(NL, &mut line)
.map_err_context(|| translate!("fold-error-readline"))?
== 0
{
let bytes_read = file
.read(&mut chunk_buf)
.map_err_context(|| translate!("fold-error-readline"))?;

if bytes_read == 0 {
// EOF: process any remaining data as a line without newline
if !buffer.is_empty() {
process_bytewise_line(&buffer, spaces, width, output)?;
}
break;
}

if line == [NL] {
output.write_all(&[NL])?;
line.truncate(0);
continue;
// Append chunk to buffer
buffer.extend_from_slice(&chunk_buf[..bytes_read]);

// Process complete lines one at a time
let mut start = 0;
for i in 0..buffer.len() {
if buffer[i] == NL {
// Process line including the newline
process_bytewise_line(&buffer[start..=i], spaces, width, output)?;
start = i + 1;
}
}

let len = line.len();
let mut i = 0;

while i < len {
let width = if len - i >= width { width } else { len - i };
let slice = {
let slice = &line[i..i + width];
if spaces && i + width < len {
match slice
.iter()
.enumerate()
.rev()
.find(|(_, c)| c.is_ascii_whitespace() && **c != CR)
{
Some((m, _)) => &slice[..=m],
None => slice,
}
} else {
slice
}
};
// Remove processed data from buffer
if start > 0 {
buffer.drain(..start);
}

// Don't duplicate trailing newlines: if the slice is "\n", the
// previous iteration folded just before the end of the line and
// has already printed this newline.
if slice == [NL] {
break;
}
// If the buffer grows past two read chunks without a newline, flush all of it
// as though it were a complete line. Nothing is carried over, so the fold
// position starts again from zero when the next chunk arrives.
if buffer.len() >= READ_CHUNK_SIZE * 2 {
// Fold everything buffered so far; the buffer is emptied right after this call
let process_len = buffer.len();
process_bytewise_line(&buffer[..process_len], spaces, width, output)?;
buffer.clear();
}
}

Ok(())
}

i += slice.len();
fn process_bytewise_line<W: Write>(
line: &[u8],
spaces: bool,
width: usize,
output: &mut W,
) -> UResult<()> {
if line.is_empty() {
return Ok(());
}

let at_eol = i >= len;
if line == [NL] {
output.write_all(&[NL])?;
return Ok(());
}

if at_eol {
output.write_all(slice)?;
let len = line.len();
let mut i = 0;

while i < len {
let segment_width = if len - i >= width { width } else { len - i };
let slice = {
let slice = &line[i..i + segment_width];
if spaces && i + segment_width < len {
match slice
.iter()
.enumerate()
.rev()
.find(|(_, c)| c.is_ascii_whitespace() && **c != CR)
{
Some((m, _)) => &slice[..=m],
None => slice,
}
} else {
output.write_all(slice)?;
output.write_all(&[NL])?;
slice
}
};

// Don't duplicate trailing newlines: if the slice is "\n", the
// previous iteration folded just before the end of the line and
// has already printed this newline.
if slice == [NL] {
break;
}

line.truncate(0);
i += slice.len();

let at_eol = i >= len;

output.write_all(slice)?;
if !at_eol {
output.write_all(&[NL])?;
}
}

Ok(())
Expand All @@ -254,6 +295,48 @@ fn next_tab_stop(col_count: usize) -> usize {
col_count + TAB_WIDTH - col_count % TAB_WIDTH
}

/// Locate a truncated multi-byte UTF-8 sequence at the tail of `buffer`.
///
/// Only the final three bytes are inspected, since a lead byte announces at
/// most a four-byte sequence and a lead byte further back would already have
/// all of its announced bytes present.
///
/// Returns the index of the lead byte when the bytes from it to the end of
/// the buffer are fewer than the lead byte announces. Otherwise returns
/// `buffer.len()`: the tail ends in ASCII, in a full-length sequence, in an
/// invalid lead byte (`0b11111xxx`), or the inspected window holds nothing
/// but continuation bytes.
fn find_incomplete_utf8_start(buffer: &[u8]) -> usize {
    let total = buffer.len();
    let window_start = total.saturating_sub(3);

    // Scanning from the end, skip continuation bytes (0b10xxxxxx) and stop at
    // the first byte that is either ASCII or a lead byte.
    let Some(offset) = buffer[window_start..]
        .iter()
        .rposition(|&b| b & 0b1100_0000 != 0b1000_0000)
    else {
        return total;
    };

    let lead_idx = window_start + offset;

    // The number of leading one bits in a lead byte is the sequence length:
    // 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4. Zero leading ones means
    // ASCII; five or more is not a valid lead byte. Both count as complete.
    let needed: usize = match buffer[lead_idx].leading_ones() {
        2 => 2,
        3 => 3,
        4 => 4,
        _ => return total,
    };

    let present = total - lead_idx;
    if present < needed { lead_idx } else { total }
}

fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize {
match mode {
WidthMode::Characters => std::str::from_utf8(buffer)
Expand Down Expand Up @@ -572,36 +655,94 @@ fn fold_file<T: Read, W: Write>(
mode: WidthMode,
writer: &mut W,
) -> UResult<()> {
let mut line = Vec::new();
let mut buffer = Vec::new();
let mut output = Vec::new();
let mut col_count = 0;
let mut last_space = None;
let mut chunk_buf = vec![0u8; READ_CHUNK_SIZE];

loop {
if file
.read_until(NL, &mut line)
.map_err_context(|| translate!("fold-error-readline"))?
== 0
{
let bytes_read = file
.read(&mut chunk_buf)
.map_err_context(|| translate!("fold-error-readline"))?;

if bytes_read == 0 {
// EOF: process any remaining data as a line without newline
if !buffer.is_empty() {
let mut ctx = FoldContext {
spaces,
width,
mode,
writer,
output: &mut output,
col_count: &mut col_count,
last_space: &mut last_space,
};

match std::str::from_utf8(&buffer) {
Ok(s) => process_utf8_line(s, &mut ctx)?,
Err(_) => process_non_utf8_line(&buffer, &mut ctx)?,
}
}
break;
}

let mut ctx = FoldContext {
spaces,
width,
mode,
writer,
output: &mut output,
col_count: &mut col_count,
last_space: &mut last_space,
};
buffer.extend_from_slice(&chunk_buf[..bytes_read]);

// Process complete lines one at a time
let mut start = 0;
for i in 0..buffer.len() {
if buffer[i] == NL {
// Process line including the newline
let line_data = &buffer[start..=i];

let mut ctx = FoldContext {
spaces,
width,
mode,
writer,
output: &mut output,
col_count: &mut col_count,
last_space: &mut last_space,
};

match std::str::from_utf8(line_data) {
Ok(s) => process_utf8_line(s, &mut ctx)?,
Err(_) => process_non_utf8_line(line_data, &mut ctx)?,
}

start = i + 1;
}
}

match std::str::from_utf8(&line) {
Ok(s) => process_utf8_line(s, &mut ctx)?,
Err(_) => process_non_utf8_line(&line, &mut ctx)?,
// Remove processed data from buffer
if start > 0 {
buffer.drain(..start);
}

line.clear();
// If buffer is getting too large without a newline, process it anyway
if buffer.len() >= READ_CHUNK_SIZE * 2 {
let process_up_to = find_incomplete_utf8_start(&buffer);

if process_up_to > 0 {
let mut ctx = FoldContext {
spaces,
width,
mode,
writer,
output: &mut output,
col_count: &mut col_count,
last_space: &mut last_space,
};

match std::str::from_utf8(&buffer[..process_up_to]) {
Ok(s) => process_utf8_line(s, &mut ctx)?,
Err(_) => process_non_utf8_line(&buffer[..process_up_to], &mut ctx)?,
}

buffer.drain(..process_up_to);
}
}
}

if !output.is_empty() {
Expand Down
Loading