diff --git a/crates/bashkit/src/builtins/awk.rs b/crates/bashkit/src/builtins/awk.rs index cf326f64..1ade947a 100644 --- a/crates/bashkit/src/builtins/awk.rs +++ b/crates/bashkit/src/builtins/awk.rs @@ -17,6 +17,8 @@ use async_trait::async_trait; use regex::Regex; use std::collections::HashMap; + +use super::search_common::build_regex; use std::path::PathBuf; use std::sync::Arc; @@ -686,7 +688,7 @@ impl<'a> AwkParser<'a> { if c == '/' { let pattern = &self.input[start..self.pos]; self.pos += 1; - let regex = Regex::new(pattern) + let regex = build_regex(pattern) .map_err(|e| Error::Execution(format!("awk: invalid regex: {}", e)))?; return Ok(Some(AwkPattern::Regex(regex))); } else if c == '\\' { @@ -2162,7 +2164,7 @@ impl AwkInterpreter { fn eval_expr_as_bool(&mut self, expr: &AwkExpr) -> bool { if let AwkExpr::Regex(pattern) = expr { let line = self.state.get_field(0).as_string(); - if let Ok(re) = Regex::new(pattern) { + if let Ok(re) = build_regex(pattern) { return re.is_match(&line); } return false; @@ -2236,7 +2238,7 @@ impl AwkInterpreter { AwkValue::Number(if lb || rb { 1.0 } else { 0.0 }) } "~" => { - if let Ok(re) = Regex::new(&r.as_string()) { + if let Ok(re) = build_regex(&r.as_string()) { AwkValue::Number(if re.is_match(&l.as_string()) { 1.0 } else { @@ -2247,7 +2249,7 @@ impl AwkInterpreter { } } "!~" => { - if let Ok(re) = Regex::new(&r.as_string()) { + if let Ok(re) = build_regex(&r.as_string()) { AwkValue::Number(if !re.is_match(&l.as_string()) { 1.0 } else { @@ -2363,7 +2365,7 @@ impl AwkInterpreter { } AwkExpr::Match(expr, pattern) => { let s = self.eval_expr(expr).as_string(); - if let Ok(re) = Regex::new(pattern) { + if let Ok(re) = build_regex(pattern) { AwkValue::Number(if re.is_match(&s) { 1.0 } else { 0.0 }) } else { AwkValue::Number(0.0) @@ -2514,7 +2516,7 @@ impl AwkInterpreter { let target = self.eval_expr(&target_expr).as_string(); - if let Ok(re) = Regex::new(&pattern) { + if let Ok(re) = build_regex(&pattern) { let (result, count) = if 
name == "gsub" { let count = re.find_iter(&target).count(); ( @@ -2600,7 +2602,7 @@ impl AwkInterpreter { } else { None }; - if let Ok(re) = Regex::new(&pattern) { + if let Ok(re) = build_regex(&pattern) { if let Some(caps) = re.captures(&s) { let m = caps.get(0).unwrap(); let rstart = m.start() + 1; // awk is 1-indexed @@ -2648,7 +2650,7 @@ impl AwkInterpreter { } else { self.state.get_field(0).as_string() }; - if let Ok(re) = Regex::new(&pattern) { + if let Ok(re) = build_regex(&pattern) { if how == "g" || how == "G" { AwkValue::String(re.replace_all(&target, replacement.as_str()).to_string()) } else { diff --git a/crates/bashkit/src/builtins/grep.rs b/crates/bashkit/src/builtins/grep.rs index b1838ea8..d51c6848 100644 --- a/crates/bashkit/src/builtins/grep.rs +++ b/crates/bashkit/src/builtins/grep.rs @@ -37,9 +37,9 @@ //! grep --line-buffered pattern # line-buffered (no-op) use async_trait::async_trait; -use regex::{Regex, RegexBuilder}; +use regex::Regex; -use super::search_common::parse_numeric_flag_arg; +use super::search_common::{build_regex_opts, parse_numeric_flag_arg}; use super::{Builtin, Context}; use crate::error::{Error, Result}; use crate::interpreter::ExecResult; @@ -295,9 +295,7 @@ impl GrepOptions { combined }; - RegexBuilder::new(&final_pattern) - .case_insensitive(self.ignore_case) - .build() + build_regex_opts(&final_pattern, self.ignore_case) .map_err(|e| Error::Execution(format!("grep: invalid pattern: {}", e))) } } diff --git a/crates/bashkit/src/builtins/search_common.rs b/crates/bashkit/src/builtins/search_common.rs index 4966a17e..dff82e27 100644 --- a/crates/bashkit/src/builtins/search_common.rs +++ b/crates/bashkit/src/builtins/search_common.rs @@ -11,6 +11,29 @@ use regex::{Regex, RegexBuilder}; use crate::error::{Error, Result}; use crate::fs::FileSystem; +/// Default compiled-regex size limit (1 MB). +pub(crate) const REGEX_SIZE_LIMIT: usize = 1_000_000; + +/// Default DFA size limit (1 MB). 
+pub(crate) const REGEX_DFA_SIZE_LIMIT: usize = 1_000_000; + +/// Build a regex with enforced size limits. +pub(crate) fn build_regex(pattern: &str) -> std::result::Result<Regex, regex::Error> { + build_regex_opts(pattern, false) +} + +/// Build a regex with enforced size limits and optional case-insensitivity. +pub(crate) fn build_regex_opts( + pattern: &str, + case_insensitive: bool, +) -> std::result::Result<Regex, regex::Error> { + RegexBuilder::new(pattern) + .case_insensitive(case_insensitive) + .size_limit(REGEX_SIZE_LIMIT) + .dfa_size_limit(REGEX_DFA_SIZE_LIMIT) + .build() +} + /// Recursively collect all files under the given directories in the VFS. /// /// Returns sorted list of file paths (directories are traversed but not included). @@ -60,9 +83,7 @@ pub(crate) fn build_search_regex( pat }; - RegexBuilder::new(&pat) - .case_insensitive(ignore_case) - .build() + build_regex_opts(&pat, ignore_case) .map_err(|e| Error::Execution(format!("{}: invalid pattern: {}", cmd_name, e))) } diff --git a/crates/bashkit/src/builtins/sed.rs b/crates/bashkit/src/builtins/sed.rs index 0dac0794..e60a674e 100644 --- a/crates/bashkit/src/builtins/sed.rs +++ b/crates/bashkit/src/builtins/sed.rs @@ -19,7 +19,9 @@ #![allow(clippy::unwrap_used)] use async_trait::async_trait; -use regex::{Regex, RegexBuilder}; +use regex::Regex; + +use super::search_common::{build_regex, build_regex_opts}; use super::{Builtin, Context, read_text_file}; use crate::error::{Error, Result}; @@ -341,7 +343,7 @@ fn parse_address(s: &str) -> Result<(Option<Address>, &str)> {
Error::Execution("sed: unterminated address regex".to_string()) })?; let pattern = &after_slash[..end2]; - let regex = Regex::new(pattern) + let regex = build_regex(pattern) .map_err(|e| Error::Execution(format!("sed: invalid regex: {}", e)))?; if num == 0 { return Ok((Some(Address::ZeroRegex(regex)), &after_slash[end2 + 1..])); @@ -377,7 +379,7 @@ fn parse_address(s: &str) -> Result<(Option<Address>, &str)> {
.find('/') .ok_or_else(|| Error::Execution("sed: unterminated address regex".to_string()))?; let pattern = &s[1..end + 1]; - let regex = Regex::new(pattern) + let regex = build_regex(pattern) .map_err(|e| Error::Execution(format!("sed: invalid regex: {}", e)))?; let rest = &s[end + 2..]; @@ -398,7 +400,7 @@ fn parse_address(s: &str) -> Result<(Option<Address>, &str)> {
Error::Execution("sed: unterminated address regex".to_string()) })?; let pattern2 = &after_slash[..end2]; - let regex2 = Regex::new(pattern2) + let regex2 = build_regex(pattern2) .map_err(|e| Error::Execution(format!("sed: invalid regex: {}", e)))?; return Ok(( Some(Address::RegexRange(regex, regex2)), @@ -496,9 +498,7 @@ fn parse_sed_command(s: &str, extended_regex: bool) -> Result<(Option<Address>
, }; // Build regex with optional case-insensitive flag let case_insensitive = flags.contains('i'); - let regex = RegexBuilder::new(&pattern) - .case_insensitive(case_insensitive) - .build() + let regex = build_regex_opts(&pattern, case_insensitive) .map_err(|e| Error::Execution(format!("sed: invalid pattern: {}", e)))?; // Convert sed replacement syntax to regex replacement syntax diff --git a/crates/bashkit/tests/regex_limit_tests.rs b/crates/bashkit/tests/regex_limit_tests.rs new file mode 100644 index 00000000..6f435846 --- /dev/null +++ b/crates/bashkit/tests/regex_limit_tests.rs @@ -0,0 +1,110 @@ +//! Regex size limit tests for grep, sed, and awk builtins +//! +//! Verifies that oversized regex patterns are rejected rather than causing +//! resource exhaustion (issue #984). + +use bashkit::Bash; +use std::time::Duration; + +/// Helper: generate a large alternation pattern like "1|2|3|...|N" +fn huge_alternation_pattern(n: usize) -> String { + (1..=n).map(|i| i.to_string()).collect::<Vec<_>>().join("|") +} + +fn test_bash() -> Bash { + Bash::builder() + .limits(bashkit::ExecutionLimits::new().timeout(Duration::from_secs(10))) + .build() +} + +#[tokio::test] +async fn grep_rejects_huge_regex() { + let mut bash = test_bash(); + let pattern = huge_alternation_pattern(50_000); + let script = format!("echo test | grep '{}'", pattern); + match bash.exec(&script).await { + Ok(result) => { + assert_ne!(result.exit_code, 0, "grep should fail with oversized regex"); + } + Err(e) => { + let msg = e.to_string(); + assert!( + msg.contains("size limit") || msg.contains("invalid pattern"), + "error should mention size limit, got: {}", + msg + ); + } + } +} + +#[tokio::test] +async fn grep_accepts_normal_regex() { + let mut bash = Bash::new(); + let result = bash + .exec("echo 'hello world' | grep 'hello'") + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout.trim(), "hello world"); +} + +#[tokio::test] +async fn sed_rejects_huge_regex() { + let mut
bash = test_bash(); + let pattern = huge_alternation_pattern(50_000); + let script = format!("echo test | sed 's/{}/replaced/'", pattern); + match bash.exec(&script).await { + Ok(result) => { + // sed error propagates through pipeline — the key security + // property is it completes quickly without resource exhaustion. + // Depending on how the interpreter handles pipeline errors, + // exit code may or may not be non-zero. + assert!( + result.exit_code != 0 || result.stdout.trim() == "test", + "sed should either fail or pass input through with oversized regex, \ + exit={}, stdout='{}'", + result.exit_code, + result.stdout.trim() + ); + } + Err(e) => { + let msg = e.to_string(); + assert!( + msg.contains("size limit") || msg.contains("invalid"), + "error should mention size limit, got: {}", + msg + ); + } + } +} + +#[tokio::test] +async fn awk_rejects_huge_regex_in_match() { + let mut bash = test_bash(); + let pattern = huge_alternation_pattern(50_000); + let script = format!( + "echo test | awk '{{ if (match($0, \"{}\" )) print }}'", + pattern + ); + match bash.exec(&script).await { + Ok(result) => { + // awk silently handles invalid regex in match() — the key security + // property is it completes quickly without resource exhaustion. 
+ assert!( + result.stdout.trim().is_empty() || result.exit_code != 0, + "awk should not match with oversized regex, \ + exit={}, stdout='{}'", + result.exit_code, + result.stdout.trim() + ); + } + Err(e) => { + let msg = e.to_string(); + assert!( + msg.contains("size limit") || msg.contains("invalid"), + "error should mention size limit, got: {}", + msg + ); + } + } +} diff --git a/crates/bashkit/tests/spec_cases/bash/regex-limit.test.sh b/crates/bashkit/tests/spec_cases/bash/regex-limit.test.sh new file mode 100644 index 00000000..93661823 --- /dev/null +++ b/crates/bashkit/tests/spec_cases/bash/regex-limit.test.sh @@ -0,0 +1,8 @@ +# Regex size/complexity limit tests + +### grep_normal_regex_works +# Normal regex should work fine +echo "hello world" | grep "hello" +### expect +hello world +### end