Skip to content

Commit 24a9e53

Browse files
committed
feat: add multiline string support and improve filter rules
- Detect multiline/raw strings so that comment markers embedded in them are not counted as comments (Rust and Python)
- Keep the `packages` dir (it often contains source code in monorepos)
- Keep `env`/`.env` dirs (they may contain non-config files)
- Remove outdated performance benchmarks from README
1 parent f922ab8 commit 24a9e53

3 files changed

Lines changed: 270 additions & 13 deletions

File tree

README.md

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -111,16 +111,6 @@ function_pattern = "^\\s*def\\s+\\w+"
111111
complexity_keywords = ["if", "for", "while"]
112112
```
113113

114-
## Performance
115-
116-
Benchmarks on a large codebase (~100k files):
117-
118-
| Tool | Time |
119-
|------|------|
120-
| cloc (Perl) | ~60s |
121-
| code-stats (Python) | ~45s |
122-
| **codelens (Rust)** | **~1.5s** |
123-
124114
## License
125115

126116
MIT

crates/codelens-core/src/analyzer/file.rs

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,17 @@ use crate::language::{Language, LanguageRegistry};
1111
use super::complexity::ComplexityAnalyzer;
1212
use super::stats::{FileStats, LineStats};
1313

14+
/// Describes a string literal that was opened on one line and is still open
/// on the lines that follow.
///
/// It is produced when a line opens a string without closing it, and is then
/// consulted on every subsequent line to find where the string ends and how
/// those lines should be tallied.
#[derive(Debug, Clone, PartialEq, Eq)]
struct StringDelimiter {
    /// Text that terminates the string, e.g. `"`, `"""`, or `"##` for a Rust raw string.
    end_pattern: String,
    /// `true` for raw strings: backslashes are literal and never escape the terminator.
    is_raw: bool,
    /// `true` for Python docstrings, whose lines are counted as comments instead of code.
    is_docstring: bool,
}
24+
1425
/// Analyzes individual source files.
1526
pub struct FileAnalyzer {
1627
registry: Arc<LanguageRegistry>,
@@ -92,6 +103,8 @@ impl FileAnalyzer {
92103
let mut stats = LineStats::default();
93104
let mut in_block_comment = false;
94105
let mut block_comment_end = "";
106+
let mut in_multiline_string = false;
107+
let mut string_delimiter: Option<StringDelimiter> = None;
95108

96109
for line in content.lines() {
97110
stats.total += 1;
@@ -103,6 +116,23 @@ impl FileAnalyzer {
103116
continue;
104117
}
105118

119+
// Inside multiline string
120+
if in_multiline_string {
121+
if let Some(ref delim) = string_delimiter {
122+
// Docstrings count as comments, regular strings as code
123+
if delim.is_docstring {
124+
stats.comment += 1;
125+
} else {
126+
stats.code += 1;
127+
}
128+
if self.line_ends_string(line, delim) {
129+
in_multiline_string = false;
130+
string_delimiter = None;
131+
}
132+
}
133+
continue;
134+
}
135+
106136
// Inside block comment
107137
if in_block_comment {
108138
stats.comment += 1;
@@ -120,6 +150,32 @@ impl FileAnalyzer {
120150
continue;
121151
}
122152

153+
// Check if line starts a multiline string
154+
// starts_multiline_string only returns Some if the string is NOT closed on the same line
155+
if let Some(delim) = self.starts_multiline_string(line, lang) {
156+
// Docstrings count as comments, regular strings as code
157+
if delim.is_docstring {
158+
stats.comment += 1;
159+
} else {
160+
stats.code += 1;
161+
}
162+
in_multiline_string = true;
163+
string_delimiter = Some(delim);
164+
continue;
165+
}
166+
167+
// Check for single-line Python docstring ("""...""" on one line)
168+
if lang.name == "Python" {
169+
if let Some(is_docstring) = self.is_single_line_docstring(trimmed) {
170+
if is_docstring {
171+
stats.comment += 1;
172+
} else {
173+
stats.code += 1;
174+
}
175+
continue;
176+
}
177+
}
178+
123179
// Check for block comment start
124180
let mut found_block_start = false;
125181
for (start, end) in &lang.block_comments {
@@ -177,6 +233,188 @@ impl FileAnalyzer {
177233
stats
178234
}
179235

236+
/// Check if a line starts a multiline string literal.
237+
/// Returns the delimiter info if a multiline string starts on this line.
238+
fn starts_multiline_string(&self, line: &str, lang: &Language) -> Option<StringDelimiter> {
239+
// Check for Rust raw strings: r#"..."# or r##"..."##
240+
if lang.name == "Rust" {
241+
if let Some(delim) = self.detect_rust_raw_string_start(line) {
242+
return Some(delim);
243+
}
244+
}
245+
246+
// Check for Python triple-quoted strings (including docstrings)
247+
if lang.name == "Python" {
248+
for pattern in &["\"\"\"", "'''"] {
249+
if let Some(pos) = line.find(pattern) {
250+
let before = &line[..pos];
251+
if !self.is_in_string(before, lang) {
252+
let after = &line[pos + 3..];
253+
// Check if it closes on the same line
254+
if after.find(pattern).is_none() {
255+
// Docstring: no assignment before the triple quotes
256+
let is_docstring = !before.contains('=');
257+
return Some(StringDelimiter {
258+
end_pattern: pattern.to_string(),
259+
is_raw: false,
260+
is_docstring,
261+
});
262+
}
263+
}
264+
}
265+
}
266+
}
267+
268+
// Check for regular multiline strings (string not closed on same line)
269+
let mut in_string = false;
270+
let mut string_char = '"';
271+
let mut escape_next = false;
272+
273+
let chars: Vec<char> = line.chars().collect();
274+
let mut i = 0;
275+
while i < chars.len() {
276+
let c = chars[i];
277+
278+
if escape_next {
279+
escape_next = false;
280+
i += 1;
281+
continue;
282+
}
283+
284+
if c == '\\' && in_string {
285+
escape_next = true;
286+
i += 1;
287+
continue;
288+
}
289+
290+
if (c == '"' || c == '\'') && !in_string {
291+
// Calculate byte position for string slice
292+
let byte_pos: usize = chars[..i].iter().map(|ch| ch.len_utf8()).sum();
293+
let before = &line[..byte_pos];
294+
if !self.is_in_string(before, lang) {
295+
in_string = true;
296+
string_char = c;
297+
}
298+
} else if c == string_char && in_string {
299+
in_string = false;
300+
}
301+
302+
i += 1;
303+
}
304+
305+
if in_string {
306+
return Some(StringDelimiter {
307+
end_pattern: string_char.to_string(),
308+
is_raw: false,
309+
is_docstring: false,
310+
});
311+
}
312+
313+
None
314+
}
315+
316+
/// Detect Rust raw string start (r#"..."# or r##"..."##, etc.)
317+
fn detect_rust_raw_string_start(&self, line: &str) -> Option<StringDelimiter> {
318+
let bytes = line.as_bytes();
319+
let len = bytes.len();
320+
let mut i = 0;
321+
322+
while i < len {
323+
// Look for 'r' followed by optional '#' and '"'
324+
if bytes[i] == b'r' && i + 1 < len {
325+
let start = i;
326+
i += 1;
327+
328+
// Count the number of '#' after 'r'
329+
let mut hash_count = 0;
330+
while i < len && bytes[i] == b'#' {
331+
hash_count += 1;
332+
i += 1;
333+
}
334+
335+
// Check for opening '"'
336+
if i < len && bytes[i] == b'"' {
337+
// Check that 'r' is not part of an identifier
338+
if start == 0 || !bytes[start - 1].is_ascii_alphanumeric() {
339+
// Build the closing pattern: "# repeated hash_count times
340+
let end_pattern = format!("\"{}", "#".repeat(hash_count));
341+
342+
// Check if it closes on the same line
343+
let after_quote = &line[i + 1..];
344+
if after_quote.find(&end_pattern).is_none() {
345+
return Some(StringDelimiter {
346+
end_pattern,
347+
is_raw: true,
348+
is_docstring: false,
349+
});
350+
}
351+
}
352+
}
353+
}
354+
i += 1;
355+
}
356+
357+
None
358+
}
359+
360+
/// Check if a line ends the current multiline string.
361+
fn line_ends_string(&self, line: &str, delim: &StringDelimiter) -> bool {
362+
if delim.is_raw {
363+
// For raw strings, just look for the closing pattern
364+
line.contains(&delim.end_pattern)
365+
} else {
366+
// For regular strings, need to handle escapes
367+
let mut chars = line.chars().peekable();
368+
let target: Vec<char> = delim.end_pattern.chars().collect();
369+
370+
while let Some(c) = chars.next() {
371+
if c == '\\' {
372+
// Skip escaped character
373+
chars.next();
374+
continue;
375+
}
376+
377+
if !target.is_empty() && c == target[0] {
378+
// Check if this matches the closing pattern
379+
let mut matched = true;
380+
for expected in target.iter().skip(1) {
381+
if chars.next() != Some(*expected) {
382+
matched = false;
383+
break;
384+
}
385+
}
386+
if matched {
387+
return true;
388+
}
389+
}
390+
}
391+
false
392+
}
393+
}
394+
395+
/// Check if a line is a single-line Python docstring.
396+
/// Returns Some(true) if it's a docstring, Some(false) if it's a regular string assignment,
397+
/// None if it doesn't contain a complete triple-quoted string.
398+
fn is_single_line_docstring(&self, trimmed: &str) -> Option<bool> {
399+
for pattern in &["\"\"\"", "'''"] {
400+
if let Some(start_pos) = trimmed.find(pattern) {
401+
let after_start = &trimmed[start_pos + 3..];
402+
// Check if it closes on the same line
403+
if let Some(end_pos) = after_start.find(pattern) {
404+
// Make sure there's nothing significant after the closing quotes
405+
let after_end = after_start[end_pos + 3..].trim();
406+
if after_end.is_empty() || after_end.starts_with('#') {
407+
// It's a complete triple-quoted string on one line
408+
let before = &trimmed[..start_pos];
409+
// Docstring: no assignment before the triple quotes
410+
return Some(!before.contains('='));
411+
}
412+
}
413+
}
414+
}
415+
None
416+
}
417+
180418
/// Check if a string position is likely inside a string literal.
181419
fn is_in_string(&self, text: &str, _lang: &Language) -> bool {
182420
// Simplified check: count unescaped quotes
@@ -270,4 +508,34 @@ mod tests {
270508
assert_eq!(stats.comment, 4);
271509
assert_eq!(stats.blank, 0);
272510
}
511+
512+
#[test]
fn test_count_lines_multiline_string() {
    // A regular string literal spanning three lines; the middle line begins with `//`
    // and must be tallied as string content rather than as a comment.
    let source = "let s = \"hello\n// not a comment\nworld\";\n";

    let analyzer = FileAnalyzer::new(Arc::new(LanguageRegistry::empty()), &Config::default());
    let rust = make_rust_lang();
    let counts = analyzer.count_lines(source, &rust);

    assert_eq!(counts.total, 3);
    assert_eq!(counts.code, 3, "All lines should be code (inside string)");
    assert_eq!(counts.comment, 0, "No comments - // is inside string");
    assert_eq!(counts.blank, 0);
}
526+
527+
#[test]
fn test_count_lines_raw_string() {
    // A raw string literal spanning four lines whose interior contains both a `//`
    // and a `/* */` marker; neither may be tallied as a comment.
    let source = "let s = r#\"hello\n// not a comment\n/* also not */\nworld\"#;\n";

    let analyzer = FileAnalyzer::new(Arc::new(LanguageRegistry::empty()), &Config::default());
    let rust = make_rust_lang();
    let counts = analyzer.count_lines(source, &rust);

    assert_eq!(counts.total, 4);
    assert_eq!(counts.code, 4, "All lines should be code (inside raw string)");
    assert_eq!(counts.comment, 0, "No comments - everything is inside raw string");
    assert_eq!(counts.blank, 0);
}
273541
}

crates/codelens-core/src/filter/smart.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ impl SmartExclude {
6868
}
6969

7070
// Common build/dependency directories
71+
// Note: "packages" is NOT excluded as it often contains source code
72+
// in monorepos (Python, Rust, JS workspaces)
7173
matches!(
7274
name,
7375
"node_modules"
@@ -78,14 +80,11 @@ impl SmartExclude {
7880
| "out"
7981
| "bin"
8082
| "obj"
81-
| "packages"
8283
| "bower_components"
8384
| "jspm_packages"
8485
| ".bundle"
8586
| "venv"
8687
| ".venv"
87-
| "env"
88-
| ".env"
8988
| "virtualenv"
9089
)
9190
}

0 commit comments

Comments
 (0)