Skip to content

Commit c37ee19

Browse files
script3r, claude, and happy-otter
committed
Fix per-file RegexSet recompilation in find_library_anchors
The `find_library_anchors` function was creating a new `RegexSet` on every file scan by collecting include patterns and compiling them fresh. With ~1000 patterns, this added significant overhead per file.

Fix: Pre-compile the include `RegexSet` per language during `PatternSet` initialization and store library ownership indices. The function now uses the pre-compiled set directly.

Performance improvement:
- library_anchors/python: 228µs → 25µs (~9x faster)
- full_pipeline/python: 3.07ms → 2.75ms (~10% faster)
- directory_scale/files/1000: 246ms → 235ms (~5% faster)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
1 parent 46ef618 commit c37ee19

2 files changed

Lines changed: 36 additions & 15 deletions

File tree

src/patterns.rs

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,21 @@ struct RawParameterPattern {
7373
default_value: Option<toml::Value>,
7474
}
7575

76+
/// Pre-compiled include patterns with ownership tracking for find_library_anchors
77+
#[derive(Debug, Clone)]
78+
pub struct IncludeSetWithOwners {
79+
pub regex_set: RegexSet,
80+
/// Maps each regex index to its library index in PatternSet::libraries
81+
pub library_indices: Vec<usize>,
82+
}
83+
7684
#[derive(Debug, Clone)]
7785
pub struct PatternSet {
7886
pub libraries: Vec<Library>,
7987
pub include_sets: HashMap<Language, RegexSet>,
8088
pub api_sets: HashMap<Language, RegexSet>,
89+
/// Pre-compiled include patterns per language with library ownership for find_library_anchors
90+
pub include_sets_with_owners: HashMap<Language, IncludeSetWithOwners>,
8191
}
8292

8393
#[derive(Debug, Clone)]
@@ -173,11 +183,16 @@ impl PatternSet {
173183
}
174184
let mut include_patterns: HashMap<Language, Vec<String>> = HashMap::new();
175185
let mut api_patterns: HashMap<Language, Vec<String>> = HashMap::new();
176-
for lib in &libraries {
186+
// Track which library owns each include pattern (for find_library_anchors)
187+
let mut include_owners: HashMap<Language, Vec<usize>> = HashMap::new();
188+
189+
for (lib_idx, lib) in libraries.iter().enumerate() {
177190
for lang in &lib.languages {
178191
let include_entry = include_patterns.entry(*lang).or_default();
192+
let owners_entry = include_owners.entry(*lang).or_default();
179193
for re in &lib.include_regexes {
180194
include_entry.push(re.as_str().to_string());
195+
owners_entry.push(lib_idx);
181196
}
182197
let api_entry = api_patterns.entry(*lang).or_default();
183198
for re in &lib.api_regexes {
@@ -188,9 +203,19 @@ impl PatternSet {
188203

189204
let mut include_sets = HashMap::new();
190205
let mut api_sets = HashMap::new();
206+
let mut include_sets_with_owners = HashMap::new();
207+
191208
for (lang, patterns) in include_patterns {
192209
if !patterns.is_empty() {
193-
include_sets.insert(lang, RegexSet::new(patterns)?);
210+
let regex_set = RegexSet::new(&patterns)?;
211+
include_sets.insert(lang, regex_set.clone());
212+
include_sets_with_owners.insert(
213+
lang,
214+
IncludeSetWithOwners {
215+
regex_set,
216+
library_indices: include_owners.remove(&lang).unwrap_or_default(),
217+
},
218+
);
194219
}
195220
}
196221
for (lang, patterns) in api_patterns {
@@ -203,6 +228,7 @@ impl PatternSet {
203228
libraries,
204229
include_sets,
205230
api_sets,
231+
include_sets_with_owners,
206232
})
207233
}
208234

src/scan.rs

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use ahash::{AHashMap as HashMap, AHashSet as HashSet};
22
use anyhow::{Context, Result};
3-
use regex::{Regex, RegexSet};
3+
use regex::Regex;
44
use tree_sitter::{Language as TsLanguage, Node, Parser, Point, Tree};
55

66
use crate::patterns::{Language, ParameterPattern, PatternSet};
@@ -105,9 +105,8 @@ pub fn find_library_anchors<'a>(
105105
patterns: &'a PatternSet,
106106
) -> Vec<LibraryHit<'a>> {
107107
let mut hits = Vec::new();
108-
let mut include_patterns = Vec::new();
109-
let mut include_owners = Vec::new();
110108

109+
// Handle libraries without include patterns (fallback to api_regexes)
111110
for lib in &patterns.libraries {
112111
if !lib.languages.contains(&lang) {
113112
continue;
@@ -121,27 +120,23 @@ pub fn find_library_anchors<'a>(
121120
column: 1,
122121
});
123122
}
124-
continue;
125-
}
126-
for re in &lib.include_regexes {
127-
include_patterns.push(re.as_str().to_string());
128-
include_owners.push(&lib.name);
129123
}
130124
}
131125

132-
if include_patterns.is_empty() {
126+
// Use pre-compiled include set with ownership tracking
127+
let Some(include_set_with_owners) = patterns.include_sets_with_owners.get(&lang) else {
133128
return hits;
134-
}
129+
};
135130

136-
let include_set = RegexSet::new(&include_patterns).expect("valid include regexes");
137131
for node in import_like_nodes(lang, tree.root_node()) {
138132
let text = node.utf8_text(content.as_bytes()).unwrap_or("");
139-
let matches = include_set.matches(text);
133+
let matches = include_set_with_owners.regex_set.matches(text);
140134
if matches.matched_any() {
141135
let Point { row, column } = node.start_position();
142136
for idx in matches.iter() {
137+
let lib_idx = include_set_with_owners.library_indices[idx];
143138
hits.push(LibraryHit {
144-
library_name: include_owners[idx],
139+
library_name: &patterns.libraries[lib_idx].name,
145140
line: row + 1,
146141
column: column + 1,
147142
});

0 commit comments

Comments
 (0)