From 85749d4e052b40be2075c2e2039109ebe83e9341 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:04:37 +0200 Subject: [PATCH 01/10] [refactor] move duplicate report logic under report module --- src/lib.rs | 3 +-- src/{duplicate.rs => report/duplicate_detection.rs} | 0 src/{report.rs => report/duplicate_renderer.rs} | 0 src/report/mod.rs | 5 +++++ 4 files changed, 6 insertions(+), 2 deletions(-) rename src/{duplicate.rs => report/duplicate_detection.rs} (100%) rename src/{report.rs => report/duplicate_renderer.rs} (100%) create mode 100644 src/report/mod.rs diff --git a/src/lib.rs b/src/lib.rs index 0e8bb62..b1108c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,5 @@ pub mod cli; pub mod discovery; -pub mod duplicate; pub mod error; pub mod git; pub mod language; @@ -49,7 +48,7 @@ where time_result(config.verbose, || line::process_source_files(&source_files))?; let (duplicate_blocks, duplicate_detection_duration) = time_value(config.verbose, || { - duplicate::detect_duplicate_blocks(&processed_files) + report::detect_duplicate_blocks(&processed_files) }); let report = report::DuplicateReport { analyzed_files: source_files.len(), diff --git a/src/duplicate.rs b/src/report/duplicate_detection.rs similarity index 100% rename from src/duplicate.rs rename to src/report/duplicate_detection.rs diff --git a/src/report.rs b/src/report/duplicate_renderer.rs similarity index 100% rename from src/report.rs rename to src/report/duplicate_renderer.rs diff --git a/src/report/mod.rs b/src/report/mod.rs new file mode 100644 index 0000000..7b85059 --- /dev/null +++ b/src/report/mod.rs @@ -0,0 +1,5 @@ +mod duplicate_detection; +mod duplicate_renderer; + +pub(crate) use duplicate_detection::detect_duplicate_blocks; +pub use duplicate_renderer::{render_duplicate_report, DuplicateReport, DuplicateReportTimings}; From 90a681fd7f7647d308010699376d54d018340917 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:19:55 +0200 Subject: [PATCH 02/10] [refactor] split language registry and classification modules --- src/language.rs | 518 --------------------------------- src/language/classification.rs | 217 ++++++++++++++ src/language/mod.rs | 6 + src/language/patterns.rs | 246 ++++++++++++++++ src/language/registry.rs | 233 +++++++++++++++ 5 files changed, 702 insertions(+), 518 deletions(-) delete mode 100644 src/language.rs create mode 100644 src/language/classification.rs create mode 100644 src/language/mod.rs create mode 100644 src/language/patterns.rs create mode 100644 src/language/registry.rs diff --git a/src/language.rs b/src/language.rs deleted file mode 100644 index 7997237..0000000 --- a/src/language.rs +++ /dev/null @@ -1,518 +0,0 @@ -use std::collections::HashMap; -use std::sync::OnceLock; - -use crate::model::LineStatus; -use regex::Regex; - -#[derive(Debug, Clone, Copy)] -pub struct LanguageLinePattern { - pub language_name: &'static str, - pub extensions: &'static [&'static str], - pub duplicate_mitigation_pattern: &'static [char], - pub duplicate_mitigation_lines: &'static [&'static str], - pub duplicate_mitigation_regexps: &'static [&'static str], -} - -pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ - LanguageLinePattern { - language_name: "Bash", - extensions: &["bash"], - duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], - duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "C", - extensions: &["c", "h"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &["#else", "#endif"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "C#", - extensions: &["cs"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &["#else", "#endif", "#endregion"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "C++", - extensions: &["cpp", "hpp", "cc", "hh", "cxx", "hxx"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &["#else", "#endif"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "CSS", - extensions: &["css"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Fish", - extensions: &["fish"], - duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], - duplicate_mitigation_lines: &["else", "end"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Go", - extensions: &["go"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "HTML", - extensions: &["html", "htm"], - duplicate_mitigation_pattern: &['/', '<', '>'], - duplicate_mitigation_lines: &[ - "", - "", - "", - "", - "", - "", - ], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Java", - extensions: &["java"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "JavaScript", - extensions: &["js", "jsx", "mjs", "cjs"], - duplicate_mitigation_pattern: &[ - '&', '(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '|', '}', - ], - duplicate_mitigation_lines: &["// @ts-nocheck"], - duplicate_mitigation_regexps: &[ - // Excludes single-line block comments used by generated files and tooling. Example: /* eslint-disable */ - r"^/\*.*\*/$", - // Excludes generated interface field declarations. Example: errors: InvalidInputError[] - r"^[A-Za-z_$][A-Za-z0-9_$]*\??:\s*(?:Scalars\['[A-Za-z]+'\]|[A-Z][A-Za-z0-9_$]*(?:\[\])?|[a-z]+(?:\[\])?|\([^)]*\))(?:\[\])?(?:\s*\|\s*(?:null|number|boolean|string))*[,]?$", - // Excludes generated GraphQL typename marker fields. Example: __typename: 'User' - r"^__typename:\s*'[A-Za-z_$][A-Za-z0-9_$]*'[,]?$", - ], - }, - LanguageLinePattern { - language_name: "Kotlin", - extensions: &["kt", "kts"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Less", - extensions: &["less"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Objective-C", - extensions: &["m", "mm"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &["#else", "#endif"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "PHP", - extensions: &["php", "phtml"], - duplicate_mitigation_pattern: &[ - '(', ')', ',', '/', ':', ';', '<', '>', '?', '[', ']', '{', '}', - ], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "PowerShell", - extensions: &["ps1", "psm1", "psd1"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '|', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Python", - extensions: &["py", "pyw"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Ruby", - extensions: &["rb"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &["end"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Rust", - extensions: &["rs"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &["///", "#[test]"], - duplicate_mitigation_regexps: &[ - // Excludes short path or enum variant fragments. Example: Self::Ready, - r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$", - // Excludes bare identifiers with optional punctuation. Example: value, - r"^[A-Za-z0-9_]+\s*[.,]?$", - // Excludes simple method or field access lines. Example: .clone() - r"^\.?\s*[A-Za-z0-9_]+(?:\(\s*\)?)?$", - // Excludes incomplete let bindings split across lines. Example: let value = - r"^let\s+(?:mut\s+)?[A-Za-z0-9_]+\s*=$", - // Excludes simple public struct field declarations. Example: pub name: String, - r"^pub\s+[A-Za-z0-9_]*\s*:\s*[A-Za-z0-9_]*[,]?$", - // Excludes single-path use imports. Example: use crate::module; - r"^use\s+[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*;$", - ], - }, - LanguageLinePattern { - language_name: "Sass", - extensions: &["sass"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Scala", - extensions: &["scala", "sc"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "SCSS", - extensions: &["scss"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Shell", - extensions: &["sh"], - duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], - duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "SQL", - extensions: &["sql"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';'], - duplicate_mitigation_lines: &["BEGIN", "END"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Swift", - extensions: &["swift"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "TypeScript", - extensions: &["ts", "tsx"], - duplicate_mitigation_pattern: &[ - '&', '(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '|', '}', - ], - duplicate_mitigation_lines: &["// @ts-nocheck"], - duplicate_mitigation_regexps: &[ - // Excludes single-line block comments used by generated files and tooling. Example: /* eslint-disable */ - r"^/\*.*\*/$", - // Excludes generated interface field declarations. Example: errors: InvalidInputError[] - r"^[A-Za-z_$][A-Za-z0-9_$]*\??:\s*(?:Scalars\['[A-Za-z]+'\]|[A-Z][A-Za-z0-9_$]*(?:\[\])?|[a-z]+(?:\[\])?|\([^)]*\))(?:\[\])?(?:\s*\|\s*(?:null|number|boolean|string))*[,]?$", - // Excludes generated GraphQL typename marker fields. Example: __typename: 'User' - r"^__typename:\s*'[A-Za-z_$][A-Za-z0-9_$]*'[,]?$", - ], - }, - LanguageLinePattern { - language_name: "XML", - extensions: &["xml", "xhtml", "svg"], - duplicate_mitigation_pattern: &['/', '<', '>'], - duplicate_mitigation_lines: &[ - "", - "", - "", - "", - "", - "", - ], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "YAML", - extensions: &["yaml", "yml"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &["jobs:", "on:"], - duplicate_mitigation_regexps: &[], - }, - LanguageLinePattern { - language_name: "Zsh", - extensions: &["zsh"], - duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], - duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], - duplicate_mitigation_regexps: &[], - }, -]; - -#[must_use] -pub fn supported_file_extensions() -> Vec { - let mut extensions = Vec::new(); - for language in LANGUAGE_PATTERNS { - for &extension in language.extensions { - if !extensions.iter().any(|selected| selected == extension) { - extensions.push(extension.to_string()); - } - } - } - extensions -} - -#[derive(Debug)] -struct DuplicateMitigationLineRegistry { - by_extension: HashMap<&'static str, DuplicateMitigationPatterns>, -} - -#[derive(Debug, Default)] -struct DuplicateMitigationPatterns { - lines_by_hash: HashMap>, - character_pattern: Vec, - regexps: Vec, -} - -static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock = - OnceLock::new(); - -#[must_use] -pub fn hash_normalized_line(line: &str) -> u128 { - xxhash_rust::xxh3::xxh3_128(line.as_bytes()) -} - -#[must_use] -pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus { - let extension = extension.to_ascii_lowercase(); - let Some(patterns) = registry().by_extension.get(extension.as_str()) else { - return LineStatus::Comparison; - }; - if patterns.matches_line(normalized_line, hash) { - LineStatus::BlockOnly - } else { - LineStatus::Comparison - } -} - -fn registry() -> &'static DuplicateMitigationLineRegistry { - DUPLICATE_MITIGATION_LINE_REGISTRY.get_or_init(|| { - let mut by_extension: HashMap<&'static str, DuplicateMitigationPatterns> = HashMap::new(); - for language in LANGUAGE_PATTERNS { - for extension in language.extensions { - let patterns = by_extension.entry(extension).or_default(); - register_duplicate_mitigation_lines( - &mut patterns.lines_by_hash, - language.duplicate_mitigation_lines, - ); - register_duplicate_mitigation_pattern( - &mut patterns.character_pattern, - language.duplicate_mitigation_pattern, - ); - register_duplicate_mitigation_regexps( - &mut patterns.regexps, - language.duplicate_mitigation_regexps, - ); - } - } - DuplicateMitigationLineRegistry { by_extension } - }) -} - -impl DuplicateMitigationPatterns { - fn matches_line(&self, normalized_line: &str, hash: u128) -> bool { - self.matches_registered_line(normalized_line, hash) - || matches_duplicate_mitigation_pattern(normalized_line, &self.character_pattern) - || matches_duplicate_mitigation_regexps(normalized_line, &self.regexps) - } - - fn matches_registered_line(&self, normalized_line: &str, hash: u128) -> bool { - self.lines_by_hash - .get(&hash) - .is_some_and(|patterns| patterns.contains(&normalized_line)) - } -} - -fn register_duplicate_mitigation_lines( - patterns_by_hash: &mut HashMap>, - lines: &'static [&'static str], -) { - for &line in lines { - patterns_by_hash - .entry(hash_normalized_line(line)) - .or_default() - .push(line); - } -} - -fn register_duplicate_mitigation_pattern( - character_pattern: &mut Vec, - characters: &'static [char], -) { - for &character in characters { - if !character_pattern.contains(&character) { - character_pattern.push(character); - } - } -} - -fn register_duplicate_mitigation_regexps( - regexps: &mut Vec, - patterns: &'static [&'static str], -) { - for &pattern in patterns { - if !regexps.iter().any(|regexp| regexp.as_str() == pattern) { - regexps.push(Regex::new(pattern).expect("duplicate mitigation regexp must compile")); - } - } -} - -fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) -> bool { - !character_pattern.is_empty() - && line - .chars() - .all(|character| character.is_whitespace() || character_pattern.contains(&character)) -} - -fn matches_duplicate_mitigation_regexps(line: &str, regexps: &[Regex]) -> bool { - regexps.iter().any(|regexp| { - regexp - .find(line) - .is_some_and(|matched| matched.start() == 0 && matched.end() == line.len()) - }) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn assigns_block_only_status_from_extension_specific_line_registry() { - let line = ".into_iter()"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); - } - - #[test] - fn assigns_comparison_status_for_meaningful_lines() { - let line = "const value = computeValue(input);"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("ts", line, hash), LineStatus::Comparison); - } - - #[test] - fn verifies_text_after_hash_lookup() { - let hash = hash_normalized_line("}"); - assert_eq!( - classify_line("ts", "not-a-brace", hash), - LineStatus::Comparison - ); - } - - #[test] - fn assigns_block_only_status_from_character_pattern() { - let line = "} \t);"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly); - } - - #[test] - fn assigns_block_only_status_from_regexps() { - let line = ".update()"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); - } - - #[test] - fn regexps_must_match_the_full_line() { - let line = ".update()?.await"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("rs", line, hash), LineStatus::Comparison); - } - - #[test] - fn assigns_block_only_status_for_typescript_codegen_lines() { - let lines = [ - "// @ts-nocheck", - "/* eslint-disable */", - "errors: DeleteViewsError[]", - "__typename: 'DeleteViewsResponse'", - ]; - for line in lines { - let hash = hash_normalized_line(line); - assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly); - } - } - - #[test] - fn assigns_block_only_status_for_yaml_lines() { - let line = "jobs:"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("yaml", line, hash), LineStatus::BlockOnly); - } - - #[test] - fn assigns_comparison_status_for_json_lines() { - let line = "}"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("json", line, hash), LineStatus::Comparison); - } - - #[test] - fn ignores_character_pattern_for_unknown_extensions() { - let line = "});"; - let hash = hash_normalized_line(line); - assert_eq!(classify_line("unknown", line, hash), LineStatus::Comparison); - } - - #[test] - fn empty_character_pattern_does_not_match() { - assert!(!matches_duplicate_mitigation_pattern("}", &[])); - } - - #[test] - fn collects_supported_file_extensions_from_language_patterns() { - let extensions = supported_file_extensions(); - for language in LANGUAGE_PATTERNS { - for extension in language.extensions { - assert!(extensions.iter().any(|selected| selected == extension)); - } - } - } - - #[test] - fn language_patterns_are_sorted_by_name() { - for pair in LANGUAGE_PATTERNS.windows(2) { - assert!( - pair[0].language_name.to_ascii_lowercase() - <= pair[1].language_name.to_ascii_lowercase() - ); - } - } - - #[test] - fn language_patterns_use_unique_extensions() { - let mut languages_by_extension = HashMap::new(); - for language in LANGUAGE_PATTERNS { - for extension in language.extensions { - let previous = languages_by_extension.insert(extension, language.language_name); - assert!( - previous.is_none(), - "{extension} belongs to both {} and {}", - previous.unwrap_or_default(), - language.language_name - ); - } - } - } -} diff --git a/src/language/classification.rs b/src/language/classification.rs new file mode 100644 index 0000000..7528b28 --- /dev/null +++ b/src/language/classification.rs @@ -0,0 +1,217 @@ +use std::collections::HashMap; +use std::sync::OnceLock; + +use regex::Regex; + +use crate::model::LineStatus; + +use super::patterns::LANGUAGE_PATTERNS; + +#[derive(Debug)] +struct DuplicateMitigationLineRegistry { + by_extension: HashMap<&'static str, DuplicateMitigationPatterns>, +} + +#[derive(Debug, Default)] +struct DuplicateMitigationPatterns { + lines_by_hash: HashMap>, + character_pattern: Vec, + regexps: Vec, +} + +static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock = + OnceLock::new(); + +#[must_use] +pub fn hash_normalized_line(line: &str) -> u128 { + xxhash_rust::xxh3::xxh3_128(line.as_bytes()) +} + +#[must_use] +pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus { + let extension = extension.to_ascii_lowercase(); + let Some(patterns) = registry().by_extension.get(extension.as_str()) else { + return LineStatus::Comparison; + }; + if patterns.matches_line(normalized_line, hash) { + LineStatus::BlockOnly + } else { + LineStatus::Comparison + } +} + +fn registry() -> &'static DuplicateMitigationLineRegistry { + DUPLICATE_MITIGATION_LINE_REGISTRY.get_or_init(|| { + let mut by_extension: HashMap<&'static str, DuplicateMitigationPatterns> = HashMap::new(); + for language in LANGUAGE_PATTERNS { + for extension in language.language.extensions { + let patterns = by_extension.entry(extension).or_default(); + register_duplicate_mitigation_lines( + &mut patterns.lines_by_hash, + language.duplicate_mitigation_lines, + ); + register_duplicate_mitigation_pattern( + &mut patterns.character_pattern, + language.duplicate_mitigation_pattern, + ); + register_duplicate_mitigation_regexps( + &mut patterns.regexps, + language.duplicate_mitigation_regexps, + ); + } + } + DuplicateMitigationLineRegistry { by_extension } + }) +} + +impl DuplicateMitigationPatterns { + fn matches_line(&self, normalized_line: &str, hash: u128) -> bool { + self.matches_registered_line(normalized_line, hash) + || matches_duplicate_mitigation_pattern(normalized_line, &self.character_pattern) + || matches_duplicate_mitigation_regexps(normalized_line, &self.regexps) + } + + fn matches_registered_line(&self, normalized_line: &str, hash: u128) -> bool { + self.lines_by_hash + .get(&hash) + .is_some_and(|patterns| patterns.contains(&normalized_line)) + } +} + +fn register_duplicate_mitigation_lines( + patterns_by_hash: &mut HashMap>, + lines: &'static [&'static str], +) { + for &line in lines { + patterns_by_hash + .entry(hash_normalized_line(line)) + .or_default() + .push(line); + } +} + +fn register_duplicate_mitigation_pattern( + character_pattern: &mut Vec, + characters: &'static [char], +) { + for &character in characters { + if !character_pattern.contains(&character) { + character_pattern.push(character); + } + } +} + +fn register_duplicate_mitigation_regexps( + regexps: &mut Vec, + patterns: &'static [&'static str], +) { + for &pattern in patterns { + if !regexps.iter().any(|regexp| regexp.as_str() == pattern) { + regexps.push(Regex::new(pattern).expect("duplicate mitigation regexp must compile")); + } + } +} + +fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) -> bool { + !character_pattern.is_empty() + && line + .chars() + .all(|character| character.is_whitespace() || character_pattern.contains(&character)) +} + +fn matches_duplicate_mitigation_regexps(line: &str, regexps: &[Regex]) -> bool { + regexps.iter().any(|regexp| { + regexp + .find(line) + .is_some_and(|matched| matched.start() == 0 && matched.end() == line.len()) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn assigns_block_only_status_from_extension_specific_line_registry() { + let line = ".into_iter()"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); + } + + #[test] + fn assigns_comparison_status_for_meaningful_lines() { + let line = "const value = computeValue(input);"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("ts", line, hash), LineStatus::Comparison); + } + + #[test] + fn verifies_text_after_hash_lookup() { + let hash = hash_normalized_line("}"); + assert_eq!( + classify_line("ts", "not-a-brace", hash), + LineStatus::Comparison + ); + } + + #[test] + fn assigns_block_only_status_from_character_pattern() { + let line = "} \t);"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly); + } + + #[test] + fn assigns_block_only_status_from_regexps() { + let line = ".update()"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); + } + + #[test] + fn regexps_must_match_the_full_line() { + let line = ".update()?.await"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("rs", line, hash), LineStatus::Comparison); + } + + #[test] + fn assigns_block_only_status_for_typescript_codegen_lines() { + let lines = [ + "// @ts-nocheck", + "/* eslint-disable */", + "errors: DeleteViewsError[]", + "__typename: 'DeleteViewsResponse'", + ]; + for line in lines { + let hash = hash_normalized_line(line); + assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly); + } + } + + #[test] + fn assigns_block_only_status_for_yaml_lines() { + let line = "jobs:"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("yaml", line, hash), LineStatus::BlockOnly); + } + + #[test] + fn assigns_comparison_status_for_json_lines() { + let line = "}"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("json", line, hash), LineStatus::Comparison); + } + + #[test] + fn ignores_character_pattern_for_unknown_extensions() { + let line = "});"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("unknown", line, hash), LineStatus::Comparison); + } + + #[test] + fn empty_character_pattern_does_not_match() { + assert!(!matches_duplicate_mitigation_pattern("}", &[])); + } +} diff --git a/src/language/mod.rs b/src/language/mod.rs new file mode 100644 index 0000000..bd86a84 --- /dev/null +++ b/src/language/mod.rs @@ -0,0 +1,6 @@ +mod classification; +mod patterns; +mod registry; + +pub use classification::{classify_line, hash_normalized_line}; +pub use registry::supported_file_extensions; diff --git a/src/language/patterns.rs b/src/language/patterns.rs new file mode 100644 index 0000000..1dbd12e --- /dev/null +++ b/src/language/patterns.rs @@ -0,0 +1,246 @@ +use super::registry::{ + Language, BASH, C, CSS, C_PLUS_PLUS, C_SHARP, FISH, GO, HTML, JAVA, JAVASCRIPT, KOTLIN, LESS, + OBJECTIVE_C, PHP, POWERSHELL, PYTHON, RUBY, RUST, SASS, SCALA, SCSS, SHELL, SQL, SWIFT, + TYPESCRIPT, XML, YAML, ZSH, +}; + +#[derive(Debug, Clone, Copy)] +pub(super) struct LanguageLinePattern { + pub(super) language: &'static Language, + pub(super) duplicate_mitigation_pattern: &'static [char], + pub(super) duplicate_mitigation_lines: &'static [&'static str], + pub(super) duplicate_mitigation_regexps: &'static [&'static str], +} + +pub(super) const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ + LanguageLinePattern { + language: &BASH, + duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &C, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &C_SHARP, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif", "#endregion"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &C_PLUS_PLUS, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &CSS, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &FISH, + duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &["else", "end"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &GO, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &HTML, + duplicate_mitigation_pattern: &['/', '<', '>'], + duplicate_mitigation_lines: &[ + "", + "", + "", + "", + "", + "", + ], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &JAVA, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &JAVASCRIPT, + duplicate_mitigation_pattern: &[ + '&', '(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '|', '}', + ], + duplicate_mitigation_lines: &["// @ts-nocheck"], + duplicate_mitigation_regexps: &[ + // Excludes single-line block comments used by generated files and tooling. Example: /* eslint-disable */ + r"^/\*.*\*/$", + // Excludes generated interface field declarations. Example: errors: InvalidInputError[] + r"^[A-Za-z_$][A-Za-z0-9_$]*\??:\s*(?:Scalars\['[A-Za-z]+'\]|[A-Z][A-Za-z0-9_$]*(?:\[\])?|[a-z]+(?:\[\])?|\([^)]*\))(?:\[\])?(?:\s*\|\s*(?:null|number|boolean|string))*[,]?$", + // Excludes generated GraphQL typename marker fields. Example: __typename: 'User' + r"^__typename:\s*'[A-Za-z_$][A-Za-z0-9_$]*'[,]?$", + ], + }, + LanguageLinePattern { + language: &KOTLIN, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &LESS, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &OBJECTIVE_C, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &PHP, + duplicate_mitigation_pattern: &[ + '(', ')', ',', '/', ':', ';', '<', '>', '?', '[', ']', '{', '}', + ], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &POWERSHELL, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &PYTHON, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &RUBY, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["end"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &RUST, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["///", "#[test]"], + duplicate_mitigation_regexps: &[ + // Excludes short path or enum variant fragments. Example: Self::Ready, + r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$", + // Excludes bare identifiers with optional punctuation. Example: value, + r"^[A-Za-z0-9_]+\s*[.,]?$", + // Excludes simple method or field access lines. Example: .clone() + r"^\.?\s*[A-Za-z0-9_]+(?:\(\s*\)?)?$", + // Excludes incomplete let bindings split across lines. Example: let value = + r"^let\s+(?:mut\s+)?[A-Za-z0-9_]+\s*=$", + // Excludes simple public struct field declarations. Example: pub name: String, + r"^pub\s+[A-Za-z0-9_]*\s*:\s*[A-Za-z0-9_]*[,]?$", + // Excludes single-path use imports. Example: use crate::module; + r"^use\s+[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*;$", + ], + }, + LanguageLinePattern { + language: &SASS, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &SCALA, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &SCSS, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &SHELL, + duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &SQL, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';'], + duplicate_mitigation_lines: &["BEGIN", "END"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &SWIFT, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &TYPESCRIPT, + duplicate_mitigation_pattern: &[ + '&', '(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '|', '}', + ], + duplicate_mitigation_lines: &["// @ts-nocheck"], + duplicate_mitigation_regexps: &[ + // Excludes single-line block comments used by generated files and tooling. Example: /* eslint-disable */ + r"^/\*.*\*/$", + // Excludes generated interface field declarations. Example: errors: InvalidInputError[] + r"^[A-Za-z_$][A-Za-z0-9_$]*\??:\s*(?:Scalars\['[A-Za-z]+'\]|[A-Z][A-Za-z0-9_$]*(?:\[\])?|[a-z]+(?:\[\])?|\([^)]*\))(?:\[\])?(?:\s*\|\s*(?:null|number|boolean|string))*[,]?$", + // Excludes generated GraphQL typename marker fields. Example: __typename: 'User' + r"^__typename:\s*'[A-Za-z_$][A-Za-z0-9_$]*'[,]?$", + ], + }, + LanguageLinePattern { + language: &XML, + duplicate_mitigation_pattern: &['/', '<', '>'], + duplicate_mitigation_lines: &[ + "", + "", + "", + "", + "", + "", + ], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &YAML, + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["jobs:", "on:"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language: &ZSH, + duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], + duplicate_mitigation_regexps: &[], + }, +]; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn language_patterns_are_sorted_by_language_name() { + for pair in LANGUAGE_PATTERNS.windows(2) { + assert!( + pair[0].language.language_name.to_ascii_lowercase() + <= pair[1].language.language_name.to_ascii_lowercase() + ); + } + } +} diff --git a/src/language/registry.rs b/src/language/registry.rs new file mode 100644 index 0000000..6a89f22 --- /dev/null +++ b/src/language/registry.rs @@ -0,0 +1,233 @@ +#[derive(Debug, Clone, Copy)] +pub(super) struct Language { + pub(super) language_name: &'static str, + pub(super) extensions: &'static [&'static str], +} + +pub(super) static BASH: Language = Language { + language_name: "Bash", + extensions: &["bash"], +}; + +pub(super) static C: Language = Language { + language_name: "C", + extensions: &["c", "h"], +}; + +pub(super) static C_SHARP: Language = Language { + language_name: "C#", + extensions: &["cs"], +}; + +pub(super) static C_PLUS_PLUS: Language = Language { + language_name: "C++", + extensions: &["cpp", "hpp", "cc", "hh", "cxx", "hxx"], +}; + +pub(super) static CSS: Language = Language { + language_name: "CSS", + extensions: &["css"], +}; + +pub(super) static FISH: Language = Language { + language_name: "Fish", + extensions: &["fish"], +}; + +pub(super) static GO: Language = Language { + language_name: "Go", + extensions: &["go"], +}; + +pub(super) static HTML: Language = Language { + language_name: "HTML", + extensions: &["html", "htm"], +}; + +pub(super) static JAVA: Language = Language { + language_name: "Java", + extensions: &["java"], +}; + +pub(super) static JAVASCRIPT: Language = Language { + language_name: "JavaScript", + extensions: &["js", "jsx", "mjs", "cjs"], +}; + +pub(super) static KOTLIN: Language = Language { + language_name: "Kotlin", + extensions: &["kt", "kts"], +}; + +pub(super) static LESS: Language = Language { + language_name: "Less", + extensions: &["less"], +}; + +pub(super) static OBJECTIVE_C: Language = Language { + language_name: "Objective-C", + extensions: &["m", "mm"], +}; + +pub(super) static PHP: Language = Language { + language_name: "PHP", + extensions: &["php", "phtml"], +}; + +pub(super) static POWERSHELL: Language = Language { + language_name: "PowerShell", + extensions: &["ps1", "psm1", "psd1"], +}; + +pub(super) static PYTHON: Language = Language { + language_name: "Python", + extensions: &["py", "pyw"], +}; + +pub(super) static RUBY: Language = Language { + language_name: "Ruby", + extensions: &["rb"], +}; + +pub(super) static RUST: Language = Language { + language_name: "Rust", + extensions: &["rs"], +}; + +pub(super) static SASS: Language = Language { + language_name: "Sass", + extensions: &["sass"], +}; + +pub(super) static SCALA: Language = Language { + language_name: "Scala", + extensions: &["scala", "sc"], +}; + +pub(super) static SCSS: Language = Language { + language_name: "SCSS", + extensions: &["scss"], +}; + +pub(super) static SHELL: Language = Language { + language_name: "Shell", + extensions: &["sh"], +}; + +pub(super) static SQL: Language = Language { + language_name: "SQL", + extensions: &["sql"], +}; + +pub(super) static SWIFT: Language = Language { + language_name: "Swift", + extensions: &["swift"], +}; + +pub(super) static TYPESCRIPT: Language = Language { + language_name: "TypeScript", + extensions: &["ts", "tsx"], +}; + +pub(super) static XML: Language = Language { + language_name: "XML", + extensions: &["xml", "xhtml", "svg"], +}; + +pub(super) static YAML: Language = Language { + language_name: "YAML", + extensions: &["yaml", "yml"], +}; + +pub(super) static ZSH: Language = Language { + language_name: "Zsh", + extensions: &["zsh"], +}; + +pub(super) const LANGUAGES: &[&Language] = &[ + &BASH, + &C, + &C_SHARP, + &C_PLUS_PLUS, + &CSS, + &FISH, + &GO, + &HTML, + &JAVA, + &JAVASCRIPT, + &KOTLIN, + &LESS, + &OBJECTIVE_C, + &PHP, + &POWERSHELL, + &PYTHON, + &RUBY, + &RUST, + &SASS, + &SCALA, + &SCSS, + &SHELL, + &SQL, + &SWIFT, + &TYPESCRIPT, + &XML, + &YAML, + &ZSH, +]; + +#[must_use] +pub fn supported_file_extensions() -> Vec { + let mut extensions = Vec::new(); + for language in LANGUAGES { + debug_assert!(!language.language_name.is_empty()); + for &extension in language.extensions { + if !extensions.iter().any(|selected| selected == extension) { + extensions.push(extension.to_string()); + } + } + } + extensions +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + + #[test] + fn collects_supported_file_extensions_from_language_registry() { + let extensions = supported_file_extensions(); + for language in LANGUAGES { + for extension in language.extensions { + assert!(extensions.iter().any(|selected| selected == extension)); + } + } + } + + #[test] + fn languages_are_sorted_by_name() { + for pair in LANGUAGES.windows(2) { + assert!( + pair[0].language_name.to_ascii_lowercase() + <= pair[1].language_name.to_ascii_lowercase() + ); + } + } + + #[test] + fn languages_use_unique_extensions() { + let mut languages_by_extension = HashMap::new(); + for language in LANGUAGES { + for extension in language.extensions { + let previous = languages_by_extension.insert(extension, language.language_name); + assert!( + previous.is_none(), + "{extension} belongs to both {} and {}", + previous.unwrap_or_default(), + language.language_name + ); + } + } + } +} From 9727d05fedec73e6e7c758d0154872cdf5af7675 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:21:36 +0200 Subject: [PATCH 03/10] [chore] bump CodeM8 minor version to 0.7.0 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 732ecee..5c9939c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -113,7 +113,7 @@ checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "codem8" -version = "0.6.0" +version = "0.7.0" dependencies = [ "clap", "ignore", diff --git a/Cargo.toml b/Cargo.toml index 171f9db..4631915 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codem8" -version = "0.6.0" +version = "0.7.0" edition = "2021" rust-version = "1.85" license = "MIT" From a32dd55687a32eff0c62477bef45bda5e2e20f7d Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:26:40 +0200 Subject: [PATCH 04/10] [refactor] split CLI parsing and help modules --- src/{cli.rs => cli/args.rs} | 180 +----------------------------------- src/cli/help.rs | 90 ++++++++++++++++++ src/cli/mod.rs | 64 +++++++++++++ src/cli/version.rs | 53 +++++++++++ 4 files changed, 208 insertions(+), 179 deletions(-) rename src/{cli.rs => cli/args.rs} (65%) create mode 100644 src/cli/help.rs create mode 100644 src/cli/mod.rs create mode 100644 src/cli/version.rs diff --git a/src/cli.rs b/src/cli/args.rs similarity index 65% rename from src/cli.rs rename to src/cli/args.rs index 1c71e1b..b90e285 100644 --- a/src/cli.rs +++ b/src/cli/args.rs @@ -1,78 +1,11 @@ -use std::fmt::Write as _; use std::path::PathBuf; use clap::{ArgAction, Parser}; +use super::CliConfig; use crate::error::{CodeM8Error, Result}; use crate::language::supported_file_extensions; -const CARGO_LOCK: &str = include_str!("../Cargo.lock"); -const HELP_TEXT_BODY: &str = "\ -USAGE: - codem8 help - codem8 --report-duplicate [OPTIONS] - -COMMANDS: - help - Display this detailed documentation. - -REQUIRED REPORT SWITCHES: - --report-duplicate - Analyze source files and print a duplicate code report. - -OPTIONS: - -file-extension= - Comma-separated source file extensions to analyze. - Defaults to all extensions registered in LANGUAGE_PATTERNS. - Examples: -file-extension=ts,tsx,js,jsx - - -files= - Comma-separated explicit files to analyze instead of recursively - discovering files from the current directory. - Example: -files=src/a.ts,src/b.js - - -git-branch - Analyze files changed on the current local Git branch compared to the - origin base branch, including committed, staged, unstaged, and untracked - files. Cannot be combined with -files. - - -verbose - Include duplicate block metrics in report output. - -DUPLICATE REPORT PURPOSE: - The duplicate report helps you find repeated code that may be worth - refactoring, reviewing, or consolidating. It lists each duplicated block with - the files and line ranges where it appears, making it easier to compare the - repeated code and decide whether it should stay duplicated. - -EXAMPLES: - codem8 --report-duplicate - codem8 --report-duplicate -file-extension=ts,tsx,js,jsx - codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js - codem8 --report-duplicate -git-branch -"; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -struct CargoLockPackage<'a> { - name: &'a str, - version: &'a str, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum CliCommand { - Help, - ReportDuplicate(CliConfig), -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct CliConfig { - pub report_duplicate: bool, - pub verbose: bool, - pub file_extensions: Vec, - pub files: Option>, - pub git_branch: bool, -} - #[derive(Debug, Parser)] #[command(name = "codem8", disable_help_flag = true, disable_version_flag = true)] struct ClapCli { @@ -98,37 +31,6 @@ struct ClapCli { files: Vec>, } -#[must_use] -pub fn help_text() -> String { - let version = codem8_version_from_cargo_lock().unwrap_or("unknown"); - let mut output = String::new(); - let _ = writeln!( - output, - "CodeM8 {version} - deterministic source code analysis reports." - ); - output.push('\n'); - output.push_str(HELP_TEXT_BODY); - output -} - -/// Parses command-line arguments into a CLI command. -/// -/// # Errors -/// -/// Returns an error when the arguments are invalid, repeated, or missing the -/// required report switch. -pub fn parse_command(args: I) -> Result -where - I: IntoIterator, - S: Into, -{ - let args = args.into_iter().map(Into::into).collect::>(); - if args.len() == 1 && is_help_argument(&args[0]) { - return Ok(CliCommand::Help); - } - parse_args(args).map(CliCommand::ReportDuplicate) -} - /// Parses command-line arguments into a validated CLI configuration. /// /// # Errors @@ -241,10 +143,6 @@ pub fn parse_file_list(value: &str) -> Result> { Ok(files) } -fn is_help_argument(arg: &str) -> bool { - matches!(arg, "help" | "-h") -} - fn normalized_clap_args(args: I) -> Result> where I: IntoIterator, @@ -273,86 +171,10 @@ fn normalized_clap_arg(arg: String) -> Result { } } -fn codem8_version_from_cargo_lock() -> Option<&'static str> { - cargo_lock_packages(CARGO_LOCK) - .find(|package| package.name == "codem8") - .map(|package| package.version) -} - -fn cargo_lock_packages(lockfile: &str) -> impl Iterator> { - lockfile.split("[[package]]").filter_map(cargo_lock_package) -} - -fn cargo_lock_package(section: &str) -> Option> { - let name = cargo_lock_value(section, "name")?; - let version = cargo_lock_value(section, "version")?; - Some(CargoLockPackage { name, version }) -} - -fn cargo_lock_value<'a>(section: &'a str, key: &str) -> Option<&'a str> { - let prefix = format!("{key} = \""); - section - .lines() - .map(str::trim) - .find_map(|line| line.strip_prefix(&prefix)?.strip_suffix('"')) -} - #[cfg(test)] mod tests { use super::*; - #[test] - fn parses_help_command() { - let command = parse_command(["help"]).expect("help parses"); - assert_eq!(command, CliCommand::Help); - } - - #[test] - fn parses_short_help_option() { - let command = parse_command(["-h"]).expect("short help parses"); - assert_eq!(command, CliCommand::Help); - } - - #[test] - fn exposes_detailed_help_text() { - let help = help_text(); - assert!(help.contains("USAGE:")); - assert!(help.contains("--report-duplicate")); - assert!(help.contains("-verbose")); - assert!(help.contains("-file-extension=")); - assert!(help.contains("-files=")); - assert!(help.contains("-git-branch")); - assert!(!help.contains("--verbose")); - assert!(!help.contains("--file-extension=")); - assert!(!help.contains("--files=")); - assert!(!help.contains("--git-branch")); - assert!(help.contains("helps you find repeated code")); - assert!(!help.contains("Duplicate weight")); - } - - #[test] - fn help_text_includes_version_from_cargo_lock() { - let version = codem8_version_from_cargo_lock().expect("codem8 version exists"); - assert!(help_text().starts_with(&format!("CodeM8 {version} - "))); - } - - #[test] - fn extracts_package_versions_from_cargo_lock_sections() { - let lockfile = r#" -[[package]] -name = "dependency" -version = "1.2.3" - -[[package]] -name = "codem8" -version = "0.4.2" -"#; - let package = cargo_lock_packages(lockfile) - .find(|package| package.name == "codem8") - .expect("package exists"); - assert_eq!(package.version, "0.4.2"); - } - #[test] fn parses_default_duplicate_report_config() { let config = parse_args(["--report-duplicate"]).expect("config parses"); diff --git a/src/cli/help.rs b/src/cli/help.rs new file mode 100644 index 0000000..5f95a66 --- /dev/null +++ b/src/cli/help.rs @@ -0,0 +1,90 @@ +use std::fmt::Write as _; + +use super::version::codem8_version_from_cargo_lock; + +const HELP_TEXT_BODY: &str = "\ +USAGE: + codem8 help + codem8 --report-duplicate [OPTIONS] + +COMMANDS: + help + Display this detailed documentation. + +REQUIRED REPORT SWITCHES: + --report-duplicate + Analyze source files and print a duplicate code report. + +OPTIONS: + -file-extension= + Comma-separated source file extensions to analyze. + Defaults to all extensions registered in LANGUAGE_PATTERNS. + Examples: -file-extension=ts,tsx,js,jsx + + -files= + Comma-separated explicit files to analyze instead of recursively + discovering files from the current directory. + Example: -files=src/a.ts,src/b.js + + -git-branch + Analyze files changed on the current local Git branch compared to the + origin base branch, including committed, staged, unstaged, and untracked + files. Cannot be combined with -files. + + -verbose + Include duplicate block metrics in report output. + +DUPLICATE REPORT PURPOSE: + The duplicate report helps you find repeated code that may be worth + refactoring, reviewing, or consolidating. It lists each duplicated block with + the files and line ranges where it appears, making it easier to compare the + repeated code and decide whether it should stay duplicated. + +EXAMPLES: + codem8 --report-duplicate + codem8 --report-duplicate -file-extension=ts,tsx,js,jsx + codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js + codem8 --report-duplicate -git-branch +"; + +#[must_use] +pub fn help_text() -> String { + let version = codem8_version_from_cargo_lock().unwrap_or("unknown"); + let mut output = String::new(); + let _ = writeln!( + output, + "CodeM8 {version} - deterministic source code analysis reports." + ); + output.push('\n'); + output.push_str(HELP_TEXT_BODY); + output +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::cli::version::codem8_version_from_cargo_lock; + + #[test] + fn exposes_detailed_help_text() { + let help = help_text(); + assert!(help.contains("USAGE:")); + assert!(help.contains("--report-duplicate")); + assert!(help.contains("-verbose")); + assert!(help.contains("-file-extension=")); + assert!(help.contains("-files=")); + assert!(help.contains("-git-branch")); + assert!(!help.contains("--verbose")); + assert!(!help.contains("--file-extension=")); + assert!(!help.contains("--files=")); + assert!(!help.contains("--git-branch")); + assert!(help.contains("helps you find repeated code")); + assert!(!help.contains("Duplicate weight")); + } + + #[test] + fn help_text_includes_version_from_cargo_lock() { + let version = codem8_version_from_cargo_lock().expect("codem8 version exists"); + assert!(help_text().starts_with(&format!("CodeM8 {version} - "))); + } +} diff --git a/src/cli/mod.rs b/src/cli/mod.rs new file mode 100644 index 0000000..a16bf25 --- /dev/null +++ b/src/cli/mod.rs @@ -0,0 +1,64 @@ +use std::path::PathBuf; + +mod args; +mod help; +mod version; + +pub use args::{parse_args, parse_file_extensions, parse_file_list}; +pub use help::help_text; + +use crate::error::Result; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CliCommand { + Help, + ReportDuplicate(CliConfig), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CliConfig { + pub report_duplicate: bool, + pub verbose: bool, + pub file_extensions: Vec, + pub files: Option>, + pub git_branch: bool, +} + +/// Parses command-line arguments into a CLI command. +/// +/// # Errors +/// +/// Returns an error when the arguments are invalid, repeated, or missing the +/// required report switch. +pub fn parse_command(args: I) -> Result +where + I: IntoIterator, + S: Into, +{ + let args = args.into_iter().map(Into::into).collect::>(); + if args.len() == 1 && is_help_argument(&args[0]) { + return Ok(CliCommand::Help); + } + parse_args(args).map(CliCommand::ReportDuplicate) +} + +fn is_help_argument(arg: &str) -> bool { + matches!(arg, "help" | "-h") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_help_command() { + let command = parse_command(["help"]).expect("help parses"); + assert_eq!(command, CliCommand::Help); + } + + #[test] + fn parses_short_help_option() { + let command = parse_command(["-h"]).expect("short help parses"); + assert_eq!(command, CliCommand::Help); + } +} diff --git a/src/cli/version.rs b/src/cli/version.rs new file mode 100644 index 0000000..8829c32 --- /dev/null +++ b/src/cli/version.rs @@ -0,0 +1,53 @@ +const CARGO_LOCK: &str = include_str!("../../Cargo.lock"); + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct CargoLockPackage<'a> { + name: &'a str, + version: &'a str, +} + +pub(super) fn codem8_version_from_cargo_lock() -> Option<&'static str> { + cargo_lock_packages(CARGO_LOCK) + .find(|package| package.name == "codem8") + .map(|package| package.version) +} + +fn cargo_lock_packages(lockfile: &str) -> impl Iterator> { + lockfile.split("[[package]]").filter_map(cargo_lock_package) +} + +fn cargo_lock_package(section: &str) -> Option> { + let name = cargo_lock_value(section, "name")?; + let version = cargo_lock_value(section, "version")?; + Some(CargoLockPackage { name, version }) +} + +fn cargo_lock_value<'a>(section: &'a str, key: &str) -> Option<&'a str> { + let prefix = format!("{key} = \""); + section + .lines() + .map(str::trim) + .find_map(|line| line.strip_prefix(&prefix)?.strip_suffix('"')) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn extracts_package_versions_from_cargo_lock_sections() { + let lockfile = r#" +[[package]] +name = "dependency" +version = "1.2.3" + +[[package]] +name = "codem8" +version = "0.4.2" +"#; + let package = cargo_lock_packages(lockfile) + .find(|package| package.name == "codem8") + .expect("package exists"); + assert_eq!(package.version, "0.4.2"); + } +} From ddba2c6eb308ff28e3cb94534d5820af609e60c4 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:35:58 +0200 Subject: [PATCH 05/10] [refactor] move git branch filtering into discovery module --- src/discovery.rs | 300 ------------------------------------- src/discovery/explicit.rs | 142 ++++++++++++++++++ src/{ => discovery}/git.rs | 0 src/discovery/mod.rs | 41 +++++ src/discovery/recursive.rs | 161 ++++++++++++++++++++ src/lib.rs | 3 +- 6 files changed, 345 insertions(+), 302 deletions(-) delete mode 100644 src/discovery.rs create mode 100644 src/discovery/explicit.rs rename src/{ => discovery}/git.rs (100%) create mode 100644 src/discovery/mod.rs create mode 100644 src/discovery/recursive.rs diff --git a/src/discovery.rs b/src/discovery.rs deleted file mode 100644 index b985495..0000000 --- a/src/discovery.rs +++ /dev/null @@ -1,300 +0,0 @@ -use std::collections::HashSet; -use std::fs; -use std::path::{Path, PathBuf}; -use std::sync::mpsc; - -use ignore::{DirEntry, WalkBuilder, WalkState}; - -use crate::error::{CodeM8Error, Result}; -use crate::model::SourceFile; -use crate::paths::{format_path, normalize_display_path}; - -const IGNORED_DIRECTORIES: &[&str] = &[ - ".git", - "node_modules", - "target", - "dist", - "build", - "coverage", - ".next", - ".nuxt", - ".svelte-kit", - ".idea", - ".vscode", -]; - -/// Discovers source files that match the selected extensions. -/// -/// # Errors -/// -/// Returns an error when explicit files are invalid or when walking the file -/// tree fails. -pub fn discover_source_files( - current_dir: &Path, - extensions: &[String], - explicit_files: Option<&[PathBuf]>, -) -> Result> { - let mut source_files = if let Some(files) = explicit_files { - discover_explicit_files(current_dir, extensions, files)? - } else { - discover_recursive_files(current_dir, extensions)? - }; - source_files.sort_by(|left, right| { - format_path(&left.display_path).cmp(&format_path(&right.display_path)) - }); - Ok(source_files) -} - -fn discover_recursive_files(root: &Path, extensions: &[String]) -> Result> { - let root = root.to_path_buf(); - let extensions = extensions.to_vec(); - let (source_tx, source_rx) = mpsc::channel(); - let (error_tx, error_rx) = mpsc::channel(); - let walker = WalkBuilder::new(&root) - .hidden(false) - .ignore(true) - .git_ignore(true) - .git_global(true) - .git_exclude(true) - .require_git(false) - .parents(true) - .filter_entry(should_walk_entry) - .build_parallel(); - walker.run(|| { - let root = root.clone(); - let extensions = extensions.clone(); - let source_tx = source_tx.clone(); - let error_tx = error_tx.clone(); - Box::new(move |entry| match entry { - Ok(entry) => { - let Some(source_file) = source_file_from_entry(&root, &extensions, &entry) else { - return WalkState::Continue; - }; - if source_tx.send(source_file).is_err() { - return WalkState::Quit; - } - WalkState::Continue - } - Err(error) => { - let _ = error_tx.send(walk_error(&root, &error)); - WalkState::Quit - } - }) - }); - drop(source_tx); - drop(error_tx); - if let Some(error) = error_rx.into_iter().next() { - return Err(error); - } - Ok(source_rx.into_iter().collect()) -} - -fn source_file_from_entry( - root: &Path, - extensions: &[String], - entry: &DirEntry, -) -> Option { - let file_type = entry.file_type()?; - if !file_type.is_file() { - return None; - } - let path = entry.path(); - let extension = selected_extension(path, extensions)?; - let display_path = path - .strip_prefix(root) - .map_or_else(|_| normalize_display_path(path), normalize_display_path); - Some(SourceFile { - path: path.to_path_buf(), - display_path, - extension, - }) -} - -fn walk_error(root: &Path, error: &ignore::Error) -> CodeM8Error { - CodeM8Error::new(format!( - "could not walk directory {}: {error}", - format_path(root) - )) -} - -fn should_walk_entry(entry: &DirEntry) -> bool { - let Some(file_type) = entry.file_type() else { - return true; - }; - if !file_type.is_dir() || entry.depth() == 0 { - return true; - } - let directory_name = entry.file_name().to_string_lossy().to_ascii_lowercase(); - !IGNORED_DIRECTORIES.contains(&directory_name.as_str()) -} - -fn discover_explicit_files( - current_dir: &Path, - extensions: &[String], - files: &[PathBuf], -) -> Result> { - let mut source_files = Vec::new(); - let mut seen_paths = HashSet::new(); - for file in files { - let absolute_input = file.is_absolute(); - let path = if absolute_input { - file.clone() - } else { - current_dir.join(file) - }; - let metadata = fs::symlink_metadata(&path).map_err(|_| { - CodeM8Error::new(format!( - "explicit file does not exist: {}", - format_path(file) - )) - })?; - if metadata.file_type().is_symlink() { - return Err(CodeM8Error::new(format!( - "explicit file is a symbolic link and will not be followed: {}", - format_path(file) - ))); - } - if metadata.is_dir() { - return Err(CodeM8Error::new(format!( - "explicit file is a directory: {}", - format_path(file) - ))); - } - if !metadata.is_file() { - return Err(CodeM8Error::new(format!( - "explicit path is not a file: {}", - format_path(file) - ))); - } - let Some(extension) = selected_extension(&path, extensions) else { - continue; - }; - let canonical_path = fs::canonicalize(&path) - .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", &error))?; - if !seen_paths.insert(canonical_path.clone()) { - continue; - } - let display_path = if absolute_input { - canonical_path - .strip_prefix(current_dir) - .map_or_else(|_| normalize_display_path(file), normalize_display_path) - } else { - normalize_display_path(file) - }; - source_files.push(SourceFile { - path: canonical_path, - display_path, - extension, - }); - } - Ok(source_files) -} - -fn selected_extension(path: &Path, extensions: &[String]) -> Option { - let extension = path.extension()?.to_str()?.to_ascii_lowercase(); - extensions - .iter() - .any(|selected| selected.eq_ignore_ascii_case(&extension)) - .then_some(extension) -} - -#[cfg(test)] -mod tests { - use std::fs; - use std::sync::atomic::{AtomicUsize, Ordering}; - - use super::*; - - static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); - - fn temp_dir(name: &str) -> PathBuf { - let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); - let path = std::env::temp_dir().join(format!( - "codem8-discovery-{name}-{}-{id}", - std::process::id() - )); - if path.exists() { - fs::remove_dir_all(&path).expect("remove stale test directory"); - } - fs::create_dir_all(&path).expect("create test directory"); - path - } - - #[test] - fn recursively_discovers_matching_extensions_and_ignores_common_directories() { - let root = temp_dir("recursive"); - fs::create_dir_all(root.join("src")).expect("create src"); - fs::create_dir_all(root.join("target")).expect("create target"); - fs::write(root.join("src").join("a.TS"), "").expect("write ts"); - fs::write(root.join("src").join("b.js"), "").expect("write js"); - fs::write(root.join("target").join("ignored.ts"), "").expect("write ignored"); - let files = discover_source_files(&root, &["ts".to_string()], None).expect("discover"); - assert_eq!(files.len(), 1); - assert_eq!(format_path(&files[0].display_path), "src/a.TS"); - fs::remove_dir_all(root).expect("cleanup"); - } - - #[test] - fn recursive_discovery_respects_gitignore_without_requiring_git_repository() { - let root = temp_dir("gitignore"); - fs::create_dir_all(root.join("src")).expect("create src"); - fs::create_dir_all(root.join("generated")).expect("create generated"); - fs::write(root.join(".gitignore"), "generated/\n").expect("write gitignore"); - fs::write(root.join("src").join("a.ts"), "").expect("write source ts"); - fs::write(root.join("generated").join("ignored.ts"), "").expect("write ignored ts"); - let files = discover_source_files(&root, &["ts".to_string()], None).expect("discover"); - assert_eq!(files.len(), 1); - assert_eq!(format_path(&files[0].display_path), "src/a.ts"); - fs::remove_dir_all(root).expect("cleanup"); - } - - #[test] - fn explicit_files_skip_unselected_extensions() { - let root = temp_dir("explicit-skip"); - fs::write(root.join("a.ts"), "").expect("write ts"); - fs::write(root.join("b.js"), "").expect("write js"); - let files = discover_source_files( - &root, - &["ts".to_string()], - Some(&[PathBuf::from("a.ts"), PathBuf::from("b.js")]), - ) - .expect("discover"); - assert_eq!(files.len(), 1); - assert_eq!(format_path(&files[0].display_path), "a.ts"); - fs::remove_dir_all(root).expect("cleanup"); - } - - #[test] - fn explicit_files_deduplicate_resolved_paths() { - let root = temp_dir("explicit-dedup"); - fs::write(root.join("a.ts"), "").expect("write ts"); - let absolute = fs::canonicalize(root.join("a.ts")).expect("canonicalize ts"); - let files = discover_source_files( - &root, - &["ts".to_string()], - Some(&[ - PathBuf::from("a.ts"), - PathBuf::from(".").join("a.ts"), - absolute.clone(), - ]), - ) - .expect("discover"); - assert_eq!(files.len(), 1); - assert_eq!(files[0].path, absolute); - assert_eq!(format_path(&files[0].display_path), "a.ts"); - fs::remove_dir_all(root).expect("cleanup"); - } - - #[test] - fn explicit_files_reject_directories() { - let root = temp_dir("explicit-directory"); - fs::create_dir_all(root.join("src")).expect("create explicit directory"); - let error = - discover_source_files(&root, &["ts".to_string()], Some(&[PathBuf::from("src")])) - .expect_err("directory explicit file fails"); - assert!(error - .to_string() - .contains("explicit file is a directory: src")); - fs::remove_dir_all(root).expect("cleanup"); - } -} diff --git a/src/discovery/explicit.rs b/src/discovery/explicit.rs new file mode 100644 index 0000000..ddc5a8e --- /dev/null +++ b/src/discovery/explicit.rs @@ -0,0 +1,142 @@ +use std::collections::HashSet; +use std::fs; +use std::path::{Path, PathBuf}; + +use super::selected_extension; +use crate::error::{CodeM8Error, Result}; +use crate::model::SourceFile; +use crate::paths::{format_path, normalize_display_path}; + +pub(super) fn discover_explicit_files( + current_dir: &Path, + extensions: &[String], + files: &[PathBuf], +) -> Result> { + let mut source_files = Vec::new(); + let mut seen_paths = HashSet::new(); + for file in files { + let absolute_input = file.is_absolute(); + let path = if absolute_input { + file.clone() + } else { + current_dir.join(file) + }; + let metadata = fs::symlink_metadata(&path).map_err(|_| { + CodeM8Error::new(format!( + "explicit file does not exist: {}", + format_path(file) + )) + })?; + if metadata.file_type().is_symlink() { + return Err(CodeM8Error::new(format!( + "explicit file is a symbolic link and will not be followed: {}", + format_path(file) + ))); + } + if metadata.is_dir() { + return Err(CodeM8Error::new(format!( + "explicit file is a directory: {}", + format_path(file) + ))); + } + if !metadata.is_file() { + return Err(CodeM8Error::new(format!( + "explicit path is not a file: {}", + format_path(file) + ))); + } + let Some(extension) = selected_extension(&path, extensions) else { + continue; + }; + let canonical_path = fs::canonicalize(&path) + .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", &error))?; + if !seen_paths.insert(canonical_path.clone()) { + continue; + } + let display_path = if absolute_input { + canonical_path + .strip_prefix(current_dir) + .map_or_else(|_| normalize_display_path(file), normalize_display_path) + } else { + normalize_display_path(file) + }; + source_files.push(SourceFile { + path: canonical_path, + display_path, + extension, + }); + } + Ok(source_files) +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + + fn temp_dir(name: &str) -> PathBuf { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = std::env::temp_dir().join(format!( + "codem8-discovery-explicit-{name}-{}-{id}", + std::process::id() + )); + if path.exists() { + fs::remove_dir_all(&path).expect("remove stale test directory"); + } + fs::create_dir_all(&path).expect("create test directory"); + path + } + + #[test] + fn explicit_files_skip_unselected_extensions() { + let root = temp_dir("skip"); + fs::write(root.join("a.ts"), "").expect("write ts"); + fs::write(root.join("b.js"), "").expect("write js"); + let files = discover_explicit_files( + &root, + &["ts".to_string()], + &[PathBuf::from("a.ts"), PathBuf::from("b.js")], + ) + .expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } + + #[test] + fn explicit_files_deduplicate_resolved_paths() { + let root = temp_dir("dedup"); + fs::write(root.join("a.ts"), "").expect("write ts"); + let absolute = fs::canonicalize(root.join("a.ts")).expect("canonicalize ts"); + let files = discover_explicit_files( + &root, + &["ts".to_string()], + &[ + PathBuf::from("a.ts"), + PathBuf::from(".").join("a.ts"), + absolute.clone(), + ], + ) + .expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(files[0].path, absolute); + assert_eq!(format_path(&files[0].display_path), "a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } + + #[test] + fn explicit_files_reject_directories() { + let root = temp_dir("directory"); + fs::create_dir_all(root.join("src")).expect("create explicit directory"); + let error = discover_explicit_files(&root, &["ts".to_string()], &[PathBuf::from("src")]) + .expect_err("directory explicit file fails"); + assert!(error + .to_string() + .contains("explicit file is a directory: src")); + fs::remove_dir_all(root).expect("cleanup"); + } +} diff --git a/src/git.rs b/src/discovery/git.rs similarity index 100% rename from src/git.rs rename to src/discovery/git.rs diff --git a/src/discovery/mod.rs b/src/discovery/mod.rs new file mode 100644 index 0000000..70e2a1c --- /dev/null +++ b/src/discovery/mod.rs @@ -0,0 +1,41 @@ +use std::path::{Path, PathBuf}; + +mod explicit; +mod git; +mod recursive; + +pub(crate) use git::changed_files_against_origin; + +use crate::error::Result; +use crate::model::SourceFile; +use crate::paths::format_path; + +/// Discovers source files that match the selected extensions. +/// +/// # Errors +/// +/// Returns an error when explicit files are invalid or when walking the file +/// tree fails. +pub fn discover_source_files( + current_dir: &Path, + extensions: &[String], + explicit_files: Option<&[PathBuf]>, +) -> Result> { + let mut source_files = if let Some(files) = explicit_files { + explicit::discover_explicit_files(current_dir, extensions, files)? + } else { + recursive::discover_recursive_files(current_dir, extensions)? + }; + source_files.sort_by(|left, right| { + format_path(&left.display_path).cmp(&format_path(&right.display_path)) + }); + Ok(source_files) +} + +fn selected_extension(path: &Path, extensions: &[String]) -> Option { + let extension = path.extension()?.to_str()?.to_ascii_lowercase(); + extensions + .iter() + .any(|selected| selected.eq_ignore_ascii_case(&extension)) + .then_some(extension) +} diff --git a/src/discovery/recursive.rs b/src/discovery/recursive.rs new file mode 100644 index 0000000..40c7a20 --- /dev/null +++ b/src/discovery/recursive.rs @@ -0,0 +1,161 @@ +use std::path::Path; +use std::sync::mpsc; + +use ignore::{DirEntry, WalkBuilder, WalkState}; + +use super::selected_extension; +use crate::error::{CodeM8Error, Result}; +use crate::model::SourceFile; +use crate::paths::{format_path, normalize_display_path}; + +const IGNORED_DIRECTORIES: &[&str] = &[ + ".git", + "node_modules", + "target", + "dist", + "build", + "coverage", + ".next", + ".nuxt", + ".svelte-kit", + ".idea", + ".vscode", +]; + +pub(super) fn discover_recursive_files( + root: &Path, + extensions: &[String], +) -> Result> { + let root = root.to_path_buf(); + let extensions = extensions.to_vec(); + let (source_tx, source_rx) = mpsc::channel(); + let (error_tx, error_rx) = mpsc::channel(); + let walker = WalkBuilder::new(&root) + .hidden(false) + .ignore(true) + .git_ignore(true) + .git_global(true) + .git_exclude(true) + .require_git(false) + .parents(true) + .filter_entry(should_walk_entry) + .build_parallel(); + walker.run(|| { + let root = root.clone(); + let extensions = extensions.clone(); + let source_tx = source_tx.clone(); + let error_tx = error_tx.clone(); + Box::new(move |entry| match entry { + Ok(entry) => { + let Some(source_file) = source_file_from_entry(&root, &extensions, &entry) else { + return WalkState::Continue; + }; + if source_tx.send(source_file).is_err() { + return WalkState::Quit; + } + WalkState::Continue + } + Err(error) => { + let _ = error_tx.send(walk_error(&root, &error)); + WalkState::Quit + } + }) + }); + drop(source_tx); + drop(error_tx); + if let Some(error) = error_rx.into_iter().next() { + return Err(error); + } + Ok(source_rx.into_iter().collect()) +} + +fn source_file_from_entry( + root: &Path, + extensions: &[String], + entry: &DirEntry, +) -> Option { + let file_type = entry.file_type()?; + if !file_type.is_file() { + return None; + } + let path = entry.path(); + let extension = selected_extension(path, extensions)?; + let display_path = path + .strip_prefix(root) + .map_or_else(|_| normalize_display_path(path), normalize_display_path); + Some(SourceFile { + path: path.to_path_buf(), + display_path, + extension, + }) +} + +fn walk_error(root: &Path, error: &ignore::Error) -> CodeM8Error { + CodeM8Error::new(format!( + "could not walk directory {}: {error}", + format_path(root) + )) +} + +fn should_walk_entry(entry: &DirEntry) -> bool { + let Some(file_type) = entry.file_type() else { + return true; + }; + if !file_type.is_dir() || entry.depth() == 0 { + return true; + } + let directory_name = entry.file_name().to_string_lossy().to_ascii_lowercase(); + !IGNORED_DIRECTORIES.contains(&directory_name.as_str()) +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::PathBuf; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + + fn temp_dir(name: &str) -> PathBuf { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = std::env::temp_dir().join(format!( + "codem8-discovery-recursive-{name}-{}-{id}", + std::process::id() + )); + if path.exists() { + fs::remove_dir_all(&path).expect("remove stale test directory"); + } + fs::create_dir_all(&path).expect("create test directory"); + path + } + + #[test] + fn recursively_discovers_matching_extensions_and_ignores_common_directories() { + let root = temp_dir("recursive"); + fs::create_dir_all(root.join("src")).expect("create src"); + fs::create_dir_all(root.join("target")).expect("create target"); + fs::write(root.join("src").join("a.TS"), "").expect("write ts"); + fs::write(root.join("src").join("b.js"), "").expect("write js"); + fs::write(root.join("target").join("ignored.ts"), "").expect("write ignored"); + let files = discover_recursive_files(&root, &["ts".to_string()]).expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "src/a.TS"); + fs::remove_dir_all(root).expect("cleanup"); + } + + #[test] + fn recursive_discovery_respects_gitignore_without_requiring_git_repository() { + let root = temp_dir("gitignore"); + fs::create_dir_all(root.join("src")).expect("create src"); + fs::create_dir_all(root.join("generated")).expect("create generated"); + fs::write(root.join(".gitignore"), "generated/\n").expect("write gitignore"); + fs::write(root.join("src").join("a.ts"), "").expect("write source ts"); + fs::write(root.join("generated").join("ignored.ts"), "").expect("write ignored ts"); + let files = discover_recursive_files(&root, &["ts".to_string()]).expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "src/a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } +} diff --git a/src/lib.rs b/src/lib.rs index b1108c0..692f991 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,6 @@ pub mod cli; pub mod discovery; pub mod error; -pub mod git; pub mod language; pub mod line; pub mod model; @@ -34,7 +33,7 @@ where let should_report_scanned_files = config.git_branch || config.files.is_some(); let (source_files, discovery_duration) = time_result(config.verbose, || { let git_branch_files = if config.git_branch { - Some(git::changed_files_against_origin(current_dir)?) + Some(discovery::changed_files_against_origin(current_dir)?) } else { None }; From 5b17e8ed4b76dfbf3f0a271b4f5ab8a016621b1a Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:42:09 +0200 Subject: [PATCH 06/10] [docs] add tests to agent verification commands --- AGENTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AGENTS.md b/AGENTS.md index 0a2428b..d692c61 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -8,6 +8,7 @@ Run the repository verification commands from the workspace root and fix any iss ```bash cargo fmt --all -- --check +cargo test cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo cargo build --locked --all-targets ``` From 5246b3f4bb298adf2d05e7600c7dcd42449d35d4 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:49:47 +0200 Subject: [PATCH 07/10] [fix] add Rust ampersand duplicate mitigation pattern --- src/language/patterns.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/language/patterns.rs b/src/language/patterns.rs index 1dbd12e..09e7831 100644 --- a/src/language/patterns.rs +++ b/src/language/patterns.rs @@ -135,7 +135,9 @@ pub(super) const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ }, LanguageLinePattern { language: &RUST, - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_pattern: &[ + '&', '(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}', + ], duplicate_mitigation_lines: &["///", "#[test]"], duplicate_mitigation_regexps: &[ // Excludes short path or enum variant fragments. Example: Self::Ready, From c93cc81e232f810d1e24d55a01beceb868872a73 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 16:53:15 +0200 Subject: [PATCH 08/10] [fix] ignore Rust assert macro openers in duplicate mitigation --- src/language/classification.rs | 8 ++++++++ src/language/patterns.rs | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/language/classification.rs b/src/language/classification.rs index 7528b28..aa57005 100644 --- a/src/language/classification.rs +++ b/src/language/classification.rs @@ -138,6 +138,14 @@ mod tests { assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); } + #[test] + fn assigns_block_only_status_for_rust_assert_macro_openers() { + for line in ["assert!(", "assert_eq!("] { + let hash = hash_normalized_line(line); + assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); + } + } + #[test] fn assigns_comparison_status_for_meaningful_lines() { let line = "const value = computeValue(input);"; diff --git a/src/language/patterns.rs b/src/language/patterns.rs index 09e7831..657f893 100644 --- a/src/language/patterns.rs +++ b/src/language/patterns.rs @@ -138,7 +138,7 @@ pub(super) const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ duplicate_mitigation_pattern: &[ '&', '(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}', ], - duplicate_mitigation_lines: &["///", "#[test]"], + duplicate_mitigation_lines: &["///", "#[test]", "assert!(", "assert_eq!("], duplicate_mitigation_regexps: &[ // Excludes short path or enum variant fragments. Example: Self::Ready, r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$", From edd89ea33df929845ed0f8d267bb5c80e1174bf4 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 17:06:14 +0200 Subject: [PATCH 09/10] [docs] document short help entrypoint --- src/cli/help.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cli/help.rs b/src/cli/help.rs index 5f95a66..a73cdf7 100644 --- a/src/cli/help.rs +++ b/src/cli/help.rs @@ -5,10 +5,12 @@ use super::version::codem8_version_from_cargo_lock; const HELP_TEXT_BODY: &str = "\ USAGE: codem8 help + codem8 -h codem8 --report-duplicate [OPTIONS] COMMANDS: help + -h Display this detailed documentation. REQUIRED REPORT SWITCHES: @@ -69,6 +71,8 @@ mod tests { fn exposes_detailed_help_text() { let help = help_text(); assert!(help.contains("USAGE:")); + assert!(help.contains("codem8 -h")); + assert!(help.contains(" -h")); assert!(help.contains("--report-duplicate")); assert!(help.contains("-verbose")); assert!(help.contains("-file-extension=")); From be5325b8b944a00a5d9190edfc40c78cb3108ca4 Mon Sep 17 00:00:00 2001 From: b4prog Date: Fri, 26 Jun 2026 17:09:39 +0200 Subject: [PATCH 10/10] [fix] canonicalize explicit file display root --- src/discovery/explicit.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/discovery/explicit.rs b/src/discovery/explicit.rs index ddc5a8e..4805f0b 100644 --- a/src/discovery/explicit.rs +++ b/src/discovery/explicit.rs @@ -12,6 +12,8 @@ pub(super) fn discover_explicit_files( extensions: &[String], files: &[PathBuf], ) -> Result> { + let canonical_current_dir = fs::canonicalize(current_dir) + .map_err(|error| CodeM8Error::io(current_dir, "canonicalize current directory", &error))?; let mut source_files = Vec::new(); let mut seen_paths = HashSet::new(); for file in files { @@ -55,7 +57,7 @@ pub(super) fn discover_explicit_files( } let display_path = if absolute_input { canonical_path - .strip_prefix(current_dir) + .strip_prefix(&canonical_current_dir) .map_or_else(|_| normalize_display_path(file), normalize_display_path) } else { normalize_display_path(file) @@ -128,6 +130,18 @@ mod tests { fs::remove_dir_all(root).expect("cleanup"); } + #[test] + fn absolute_explicit_files_are_displayed_relative_to_normalized_current_dir() { + let root = temp_dir("normalized-current-dir"); + fs::write(root.join("a.ts"), "").expect("write ts"); + let absolute = fs::canonicalize(root.join("a.ts")).expect("canonicalize ts"); + let files = discover_explicit_files(&root.join("."), &["ts".to_string()], &[absolute]) + .expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } + #[test] fn explicit_files_reject_directories() { let root = temp_dir("directory");