Skip to content

Commit 083862b

Browse files
script3rclaudehappy-otter
committed
Pre-compile constant patterns and add line offset cache
Two performance optimizations for large codebases:

1. Pre-compile constant detection regexes in PatternSet
   - collect_constants() was compiling 1-2 regexes per file
   - Now compiled once at startup and reused
   - ~35% faster algorithm_detection benchmark

2. Add LineCache for O(log n) line/column lookups
   - line_col_from_offset() was O(n) from file start for each match
   - LineCache builds line offsets once (O(n)), then binary search (O(log n))
   - Significant improvement for files with many matches

Combined benchmark improvements:
- algorithm_detection/python: 130µs → 85µs (~35% faster)
- full_pipeline/python: 2.75ms → 2.54ms (~8% faster)
- directory_scale/1000 files: 235ms → 190ms (~19% faster)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
1 parent c37ee19 commit 083862b

File tree

2 files changed

+119
-37
lines changed

2 files changed

+119
-37
lines changed

src/patterns.rs

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,21 @@ pub struct IncludeSetWithOwners {
8181
pub library_indices: Vec<usize>,
8282
}
8383

84+
/// Pre-compiled constant detection regexes for a single language.
///
/// `PatternSet::constant_patterns` stores one of these per `Language`.
85+
#[derive(Debug, Clone)]
86+
pub struct ConstantPatterns {
87+
/// Compiled regexes, in the order their sources are listed in
/// `build_constant_patterns`. Every source defined there has two capture
/// groups: group 1 matches the constant's identifier and group 2 the text
/// of its value.
pub regexes: Vec<Regex>,
88+
}
89+
8490
/// Library definitions together with per-language pre-compiled regex collections.
#[derive(Debug, Clone)]
8591
pub struct PatternSet {
8692
/// Library definitions; `supports_language` checks each entry's `languages` list.
pub libraries: Vec<Library>,
8793
/// One `RegexSet` per language.
/// NOTE(review): presumably the include/import patterns, judging by the name — confirm against the constructor.
pub include_sets: HashMap<Language, RegexSet>,
8894
/// One `RegexSet` per language.
/// NOTE(review): presumably the API-usage patterns, judging by the name — confirm against the constructor.
pub api_sets: HashMap<Language, RegexSet>,
8995
/// Pre-compiled include patterns per language with library ownership for find_library_anchors
9096
pub include_sets_with_owners: HashMap<Language, IncludeSetWithOwners>,
97+
/// Pre-compiled constant detection patterns per language, built by `build_constant_patterns`
98+
pub constant_patterns: HashMap<Language, ConstantPatterns>,
9199
}
92100

93101
#[derive(Debug, Clone)]
@@ -224,14 +232,81 @@ impl PatternSet {
224232
}
225233
}
226234

235+
// Pre-compile constant detection patterns per language
236+
let constant_patterns = Self::build_constant_patterns()?;
237+
227238
Ok(Self {
228239
libraries,
229240
include_sets,
230241
api_sets,
231242
include_sets_with_owners,
243+
constant_patterns,
232244
})
233245
}
234246

247+
/// Build pre-compiled constant detection patterns for each language
248+
fn build_constant_patterns() -> Result<HashMap<Language, ConstantPatterns>> {
249+
let patterns_by_lang: &[(Language, &[&str])] = &[
250+
(
251+
Language::C,
252+
&[
253+
r"(?m)^\s*#\s*define\s+([A-Za-z_][A-Za-z0-9_]*)\s+([^\n]+)$",
254+
r"(?m)^\s*(?:static\s+)?const\s+[^=;]+?\b([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^;]+);",
255+
],
256+
),
257+
(
258+
Language::Cpp,
259+
&[
260+
r"(?m)^\s*#\s*define\s+([A-Za-z_][A-Za-z0-9_]*)\s+([^\n]+)$",
261+
r"(?m)^\s*(?:static\s+)?const\s+[^=;]+?\b([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^;]+);",
262+
],
263+
),
264+
(
265+
Language::Objc,
266+
&[
267+
r"(?m)^\s*#\s*define\s+([A-Za-z_][A-Za-z0-9_]*)\s+([^\n]+)$",
268+
r"(?m)^\s*(?:static\s+)?const\s+[^=;]+?\b([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^;]+);",
269+
],
270+
),
271+
(
272+
Language::Java,
273+
&[
274+
r"(?m)^\s*(?:public|private|protected)?\s*(?:static\s+)?final\s+(?:int|long|String)\s+([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^;]+);",
275+
],
276+
),
277+
(
278+
Language::Go,
279+
&[r"(?m)^\s*const\s+([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^\n]+)$"],
280+
),
281+
(
282+
Language::Python,
283+
&[r"(?m)^\s*([A-Z_][A-Z0-9_]*)\s*=\s*([^#\n]+)"],
284+
),
285+
(
286+
Language::Php,
287+
&[
288+
r"(?m)^\s*const\s+([A-Z_][A-Z0-9_]*)\s*=\s*([^;]+);",
289+
r#"define\(\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*,\s*([^)]+)\)"#,
290+
],
291+
),
292+
(
293+
Language::Rust,
294+
&[r"(?m)^\s*const\s+([A-Za-z_][A-Za-z0-9_]*)\s*:[^=]+=\s*([^;]+);"],
295+
),
296+
];
297+
298+
let mut result = HashMap::new();
299+
for (lang, patterns) in patterns_by_lang {
300+
let regexes = patterns
301+
.iter()
302+
.map(|p| Regex::new(p))
303+
.collect::<std::result::Result<Vec<_>, _>>()
304+
.context("compile constant patterns")?;
305+
result.insert(*lang, ConstantPatterns { regexes });
306+
}
307+
Ok(result)
308+
}
309+
235310
pub fn supports_language(&self, lang: Language) -> bool {
236311
self.libraries.iter().any(|l| l.languages.contains(&lang))
237312
}

src/scan.rs

Lines changed: 44 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
use ahash::{AHashMap as HashMap, AHashSet as HashSet};
22
use anyhow::{Context, Result};
3-
use regex::Regex;
43
use tree_sitter::{Language as TsLanguage, Node, Parser, Point, Tree};
54

65
use crate::patterns::{Language, ParameterPattern, PatternSet};
@@ -190,7 +189,9 @@ pub fn find_algorithms<'a>(
190189
primitive_by_alg.insert(alg.name.clone(), primitive.clone());
191190
}
192191
}
193-
let constants = collect_constants(lang, content);
192+
let constants = collect_constants(lang, content, patterns);
193+
// Build line cache for fast line/column lookups (O(n) once, O(log n) per lookup)
194+
let line_cache = LineCache::new(content);
194195
// Collect raw hits
195196
let mut hits_by_alg: HashMap<&str, Vec<(AlgorithmHit<'a>, bool)>> = HashMap::new();
196197
for node in code_symbol_nodes(lang, tree.root_node()) {
@@ -260,7 +261,7 @@ pub fn find_algorithms<'a>(
260261
}
261262
let Point { row, column } = node.start_position();
262263
let (line, column) = match_offset
263-
.map(|offset| line_col_from_offset(content, node.start_byte() + offset))
264+
.map(|offset| line_cache.line_col(node.start_byte() + offset))
264265
.unwrap_or((row + 1, column + 1));
265266
let hit = AlgorithmHit {
266267
algorithm_name: &alg.name,
@@ -316,7 +317,7 @@ pub fn find_algorithms<'a>(
316317
}
317318
for re in &alg.symbol_regexes {
318319
for m in re.find_iter(content) {
319-
let (line, column) = line_col_from_offset(content, m.start());
320+
let (line, column) = line_cache.line_col(m.start());
320321
if seen_on_line.insert((&alg.name, line)) {
321322
let mut metadata = HashMap::new();
322323
if let Some(primitive) = &alg.primitive {
@@ -373,22 +374,38 @@ pub fn dedupe_more_specific_hits<'a>(hits: Vec<AlgorithmHit<'a>>) -> Vec<Algorit
373374
.collect()
374375
}
375376

376-
fn line_col_from_offset(content: &str, byte_idx: usize) -> (usize, usize) {
377-
// 1-based line, column
378-
let bytes = content.as_bytes();
379-
let mut line = 1usize;
380-
let mut col = 1usize;
381-
let mut i = 0usize;
382-
while i < byte_idx && i < bytes.len() {
383-
if bytes[i] == b'\n' {
384-
line += 1;
385-
col = 1;
386-
} else {
387-
col += 1;
377+
/// Index of line-start byte offsets for fast line/column lookup.
///
/// Building the index scans the content once (O(n)); each lookup afterwards
/// is a binary search over the recorded line starts (O(log n)).
struct LineCache {
    /// Byte offset at which each line begins, in ascending order.
    /// `line_starts[0]` is always 0 (line 1 starts at byte 0);
    /// `line_starts[k]` is the offset just past the k-th `\n`.
    line_starts: Vec<usize>,
    /// Length of the indexed content in bytes; lookups are clamped to it.
    content_len: usize,
}

impl LineCache {
    /// Record the start offset of every line in `content`.
    fn new(content: &str) -> Self {
        let line_starts = std::iter::once(0)
            .chain(content.match_indices('\n').map(|(newline_at, _)| newline_at + 1))
            .collect();
        Self {
            line_starts,
            content_len: content.len(),
        }
    }

    /// Convert a byte offset to a 1-based `(line, column)` pair.
    ///
    /// Columns are counted in bytes, not characters. An offset past the end
    /// of the content is clamped to the end, so it resolves to the position
    /// just after the last byte instead of a column beyond the content.
    fn line_col(&self, byte_idx: usize) -> (usize, usize) {
        let byte_idx = byte_idx.min(self.content_len);
        // Number of line starts at or before the offset. This is at least 1
        // because `line_starts[0]` is 0, and it equals the 1-based line number.
        let line = self.line_starts.partition_point(|&start| start <= byte_idx);
        let col = byte_idx - self.line_starts[line - 1] + 1;
        (line, col)
    }
}
393410

394411
fn extract_parameter(pp: &ParameterPattern, text: &str) -> Option<serde_json::Value> {
@@ -403,28 +420,18 @@ fn extract_parameter(pp: &ParameterPattern, text: &str) -> Option<serde_json::Va
403420
None
404421
}
405422

406-
fn collect_constants(lang: Language, content: &str) -> HashMap<String, String> {
423+
fn collect_constants(
424+
lang: Language,
425+
content: &str,
426+
patterns: &PatternSet,
427+
) -> HashMap<String, String> {
407428
let mut constants = HashMap::new();
408-
let patterns: &[&str] = match lang {
409-
Language::C | Language::Cpp | Language::Objc => &[
410-
r"(?m)^\s*#\s*define\s+([A-Za-z_][A-Za-z0-9_]*)\s+([^\n]+)$",
411-
r"(?m)^\s*(?:static\s+)?const\s+[^=;]+?\b([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^;]+);",
412-
],
413-
Language::Java => &[
414-
r"(?m)^\s*(?:public|private|protected)?\s*(?:static\s+)?final\s+(?:int|long|String)\s+([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^;]+);",
415-
],
416-
Language::Go => &[r"(?m)^\s*const\s+([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([^\n]+)$"],
417-
Language::Python => &[r"(?m)^\s*([A-Z_][A-Z0-9_]*)\s*=\s*([^#\n]+)"],
418-
Language::Php => &[
419-
r"(?m)^\s*const\s+([A-Z_][A-Z0-9_]*)\s*=\s*([^;]+);",
420-
r#"define\(\s*['"]([A-Z_][A-Z0-9_]*)['"]\s*,\s*([^)]+)\)"#,
421-
],
422-
Language::Rust => &[r"(?m)^\s*const\s+([A-Za-z_][A-Za-z0-9_]*)\s*:[^=]+=\s*([^;]+);"],
423-
_ => &[],
429+
430+
let Some(const_patterns) = patterns.constant_patterns.get(&lang) else {
431+
return constants;
424432
};
425433

426-
for pattern in patterns {
427-
let re = Regex::new(pattern).expect("valid regex");
434+
for re in &const_patterns.regexes {
428435
for caps in re.captures_iter(content) {
429436
let Some(name) = caps.get(1).map(|m| m.as_str()) else {
430437
continue;

0 commit comments

Comments
 (0)