Skip to content

Commit 3adc971

Browse files
authored
fix(config): honor include/exclude globs in file collection (#981) (#994)
* fix(config): honor include/exclude globs in file collection (#981)

  config.include and config.exclude were declared in DEFAULTS but never consumed by either engine, so users' glob filters in .codegraphrc.json had no effect. Both engines now compile the globs once and filter collected paths (relative to project root, forward-slash normalized) during initial walks and incremental fast-path rebuilds.

  - New src/shared/globs.ts with compileGlobs + matchesAny (extracted from features/boundaries.ts so the collector and boundaries share one implementation)
  - TS collector: passesIncludeExclude applied in collectFiles recursion and tryFastCollect so config changes take effect on incremental builds
  - Rust collector: globset-based filter wired through collect_files and try_fast_collect; BuildConfig gains include/exclude fields
  - Integration tests (wasm + native parity) cover exclude reject, include limit, combined filters, and empty-config default behavior

  Fixes #981

  Impact: 19 functions changed, 20 affected

* chore: sync Cargo.lock for globset dependency (#981)

* fix(globs): enforce path-component boundary for `**/<literal>` patterns (#994)

  The `globToRegex` WASM-side glob compiler consumed the `/` after `**` without adding a directory-boundary group, so `**/index.ts` compiled to `^.*index\.ts$` and matched `barindex.ts`. The Rust `globset` crate enforces the boundary, so the two engines disagreed on these patterns. Compile `**/` as `(?:[^/]+/)*` — zero or more complete directory segments — keeping parity with globset. Bare `**` (e.g. trailing in `dir/**`) still compiles to `.*` so `dir/**` keeps matching `dir/a/b`. Adds regression tests for `**/<literal>` and `dir/**`.

  Impact: 1 functions changed, 8 affected

* fix(config): surface GlobSetBuilder::build errors instead of silently disabling filters (#994)

  If `GlobSetBuilder::build()` returned `Err`, `build_glob_set` silently returned `None` and all include/exclude filters were disabled — users would see unexpected files in the graph with no clue why. Mirror the per-pattern error path and log the failure via `eprintln!` before falling back to `None`.

  Impact: 1 functions changed, 8 affected
1 parent fb82f5b commit 3adc971

11 files changed

Lines changed: 567 additions & 40 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/codegraph-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ tree-sitter-haskell = "0.23"
3737
tree-sitter-ocaml = "0.24"
3838
rayon = "1"
3939
ignore = "0.4"
40+
globset = "0.4"
4041
sha2 = "0.10"
4142
# `bundled` embeds a second SQLite copy (better-sqlite3 already bundles one).
4243
# This is intentional: Windows CI lacks a system SQLite, and WAL coordination

crates/codegraph-core/src/build_pipeline.rs

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -549,15 +549,32 @@ fn collect_source_files(
549549
&db_files,
550550
&journal.changed,
551551
&journal.removed,
552+
&config.include,
553+
&config.exclude,
552554
)
553555
} else {
554-
file_collector::collect_files(root_dir, &config.ignore_dirs)
556+
file_collector::collect_files(
557+
root_dir,
558+
&config.ignore_dirs,
559+
&config.include,
560+
&config.exclude,
561+
)
555562
}
556563
} else {
557-
file_collector::collect_files(root_dir, &config.ignore_dirs)
564+
file_collector::collect_files(
565+
root_dir,
566+
&config.ignore_dirs,
567+
&config.include,
568+
&config.exclude,
569+
)
558570
}
559571
} else {
560-
file_collector::collect_files(root_dir, &config.ignore_dirs)
572+
file_collector::collect_files(
573+
root_dir,
574+
&config.ignore_dirs,
575+
&config.include,
576+
&config.exclude,
577+
)
561578
}
562579
}
563580

crates/codegraph-core/src/config.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,17 @@ use serde::Deserialize;
1010
#[derive(Debug, Clone, Deserialize, Default)]
1111
#[serde(rename_all = "camelCase")]
1212
pub struct BuildConfig {
13+
/// Glob patterns limiting which source files are included.
14+
/// When non-empty, a file must match at least one pattern.
15+
/// Matched against paths relative to the project root.
16+
#[serde(default)]
17+
pub include: Vec<String>,
18+
19+
/// Glob patterns excluding source files from the build.
20+
/// Matched against paths relative to the project root.
21+
#[serde(default)]
22+
pub exclude: Vec<String>,
23+
1324
/// Additional directory names to ignore during file collection.
1425
#[serde(default)]
1526
pub ignore_dirs: Vec<String>,
@@ -129,12 +140,16 @@ mod tests {
129140
fn deserialize_empty_config() {
130141
let config: BuildConfig = serde_json::from_str("{}").unwrap();
131142
assert!(config.ignore_dirs.is_empty());
143+
assert!(config.include.is_empty());
144+
assert!(config.exclude.is_empty());
132145
assert!(config.build.incremental);
133146
}
134147

135148
#[test]
136149
fn deserialize_full_config() {
137150
let json = r#"{
151+
"include": ["src/**/*.ts"],
152+
"exclude": ["**/*.test.ts", "**/*.spec.ts"],
138153
"ignoreDirs": ["vendor", "tmp"],
139154
"build": {
140155
"incremental": false,
@@ -145,6 +160,8 @@ mod tests {
145160
}
146161
}"#;
147162
let config: BuildConfig = serde_json::from_str(json).unwrap();
163+
assert_eq!(config.include, vec!["src/**/*.ts"]);
164+
assert_eq!(config.exclude, vec!["**/*.test.ts", "**/*.spec.ts"]);
148165
assert_eq!(config.ignore_dirs, vec!["vendor", "tmp"]);
149166
assert!(!config.build.incremental);
150167
assert_eq!(config.build.drift_threshold, 0.2);

crates/codegraph-core/src/file_collector.rs

Lines changed: 177 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
//! (from BurntSushi/ripgrep) for gitignore-aware traversal.
66
77
use crate::parser_registry::LanguageKind;
8+
use globset::{Glob, GlobSet, GlobSetBuilder};
89
use std::collections::HashSet;
910
use std::path::Path;
1011

@@ -44,10 +45,77 @@ pub struct CollectResult {
4445
pub directories: HashSet<String>,
4546
}
4647

48+
/// Compile a list of glob patterns into a `GlobSet`.
49+
///
50+
/// Invalid patterns are logged via `eprintln!` and skipped so a single bad
51+
/// entry in config can't take down the whole build.
52+
fn build_glob_set(patterns: &[String]) -> Option<GlobSet> {
53+
if patterns.is_empty() {
54+
return None;
55+
}
56+
let mut builder = GlobSetBuilder::new();
57+
let mut added = 0usize;
58+
for p in patterns {
59+
match Glob::new(p) {
60+
Ok(g) => {
61+
builder.add(g);
62+
added += 1;
63+
}
64+
Err(e) => {
65+
eprintln!("codegraph: ignoring invalid glob pattern {p:?}: {e}");
66+
}
67+
}
68+
}
69+
if added == 0 {
70+
return None;
71+
}
72+
match builder.build() {
73+
Ok(set) => Some(set),
74+
Err(e) => {
75+
// Failing to build the GlobSet disables *all* include/exclude
76+
// filters, which silently changes what files the build sees.
77+
// Surface the error so users can correct their config instead of
78+
// being confused by ignored filters.
79+
eprintln!("codegraph: failed to build glob set: {e}");
80+
None
81+
}
82+
}
83+
}
84+
85+
/// `true` when the relative path passes the configured include/exclude filters.
86+
///
87+
/// `rel_path` must be relative to the project root and normalized to forward
88+
/// slashes. Mirrors `passesIncludeExclude` in `src/domain/graph/builder/helpers.ts`
89+
/// so both engines accept or reject the same set of files.
90+
pub fn passes_include_exclude(
91+
rel_path: &str,
92+
include: Option<&GlobSet>,
93+
exclude: Option<&GlobSet>,
94+
) -> bool {
95+
if let Some(set) = include {
96+
if !set.is_match(rel_path) {
97+
return false;
98+
}
99+
}
100+
if let Some(set) = exclude {
101+
if set.is_match(rel_path) {
102+
return false;
103+
}
104+
}
105+
true
106+
}
107+
47108
/// Collect all source files under `root_dir`, respecting gitignore and ignore dirs.
48109
///
49110
/// `extra_ignore_dirs` are additional directory names to skip (from config `ignoreDirs`).
50-
pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectResult {
111+
/// `include_patterns` / `exclude_patterns` are file-level glob filters applied after
112+
/// the extension check, matched against paths relative to `root_dir`.
113+
pub fn collect_files(
114+
root_dir: &str,
115+
extra_ignore_dirs: &[String],
116+
include_patterns: &[String],
117+
exclude_patterns: &[String],
118+
) -> CollectResult {
51119
// Build an owned set of ignore dirs to avoid leaking memory.
52120
// The closure captures this owned set, so lifetimes are satisfied without Box::leak.
53121
let ignore_set: HashSet<String> = DEFAULT_IGNORE_DIRS
@@ -58,6 +126,10 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes
58126

59127
let ext_set: HashSet<&str> = SUPPORTED_EXTENSIONS.iter().copied().collect();
60128

129+
let include_set = build_glob_set(include_patterns);
130+
let exclude_set = build_glob_set(exclude_patterns);
131+
let root_path = Path::new(root_dir);
132+
61133
let mut files = Vec::new();
62134
let mut directories = HashSet::new();
63135

@@ -105,6 +177,19 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes
105177
}
106178
}
107179

180+
// Apply file-level include/exclude globs against the relative path.
181+
if include_set.is_some() || exclude_set.is_some() {
182+
let rel = path
183+
.strip_prefix(root_path)
184+
.ok()
185+
.and_then(|p| p.to_str())
186+
.map(|s| s.replace('\\', "/"))
187+
.unwrap_or_else(|| normalize_path(path));
188+
if !passes_include_exclude(&rel, include_set.as_ref(), exclude_set.as_ref()) {
189+
continue;
190+
}
191+
}
192+
108193
let abs = normalize_path(path);
109194
if let Some(parent) = path.parent() {
110195
directories.insert(normalize_path(parent));
@@ -117,12 +202,18 @@ pub fn collect_files(root_dir: &str, extra_ignore_dirs: &[String]) -> CollectRes
117202

118203
/// Reconstruct file list from DB file_hashes + journal deltas (fast path).
119204
///
205+
/// Applies `include_patterns` / `exclude_patterns` so incremental builds honor
206+
/// config changes — the paths in the DB were collected under an earlier config
207+
/// that may have had different glob filters.
208+
///
120209
/// Always returns a `CollectResult`; the caller decides whether the fast path applies.
121210
pub fn try_fast_collect(
122211
root_dir: &str,
123212
db_files: &[String],
124213
journal_changed: &[String],
125214
journal_removed: &[String],
215+
include_patterns: &[String],
216+
exclude_patterns: &[String],
126217
) -> CollectResult {
127218
let mut file_set: HashSet<String> = db_files.iter().cloned().collect();
128219

@@ -134,12 +225,22 @@ pub fn try_fast_collect(
134225
file_set.insert(changed.clone());
135226
}
136227

228+
let include_set = build_glob_set(include_patterns);
229+
let exclude_set = build_glob_set(exclude_patterns);
230+
let has_filters = include_set.is_some() || exclude_set.is_some();
231+
137232
// Convert relative paths to absolute and compute directories
138233
let root = Path::new(root_dir);
139234
let mut files = Vec::with_capacity(file_set.len());
140235
let mut directories = HashSet::new();
141236

142237
for rel_path in &file_set {
238+
if has_filters {
239+
let norm = rel_path.replace('\\', "/");
240+
if !passes_include_exclude(&norm, include_set.as_ref(), exclude_set.as_ref()) {
241+
continue;
242+
}
243+
}
143244
let abs = root.join(rel_path);
144245
let abs_str = normalize_path(&abs);
145246
if let Some(parent) = abs.parent() {
@@ -171,7 +272,7 @@ mod tests {
171272
fs::write(src.join("readme.md"), "# Hello").unwrap();
172273
fs::write(src.join("util.js"), "module.exports = {};").unwrap();
173274

174-
let result = collect_files(tmp.to_str().unwrap(), &[]);
275+
let result = collect_files(tmp.to_str().unwrap(), &[], &[], &[]);
175276
let names: HashSet<String> = result
176277
.files
177278
.iter()
@@ -200,13 +301,61 @@ mod tests {
200301
fs::create_dir_all(&src).unwrap();
201302
fs::write(src.join("app.ts"), "").unwrap();
202303

203-
let result = collect_files(tmp.to_str().unwrap(), &[]);
304+
let result = collect_files(tmp.to_str().unwrap(), &[], &[], &[]);
204305
assert_eq!(result.files.len(), 1);
205306
assert!(result.files[0].contains("app.ts"));
206307

207308
let _ = fs::remove_dir_all(&tmp);
208309
}
209310

311+
#[test]
312+
fn collect_honors_exclude_globs() {
313+
let tmp = std::env::temp_dir().join("codegraph_collect_exclude_test");
314+
let _ = fs::remove_dir_all(&tmp);
315+
let src = tmp.join("src");
316+
fs::create_dir_all(&src).unwrap();
317+
fs::write(src.join("app.ts"), "").unwrap();
318+
fs::write(src.join("app.test.ts"), "").unwrap();
319+
fs::write(src.join("util.ts"), "").unwrap();
320+
321+
let exclude = vec!["**/*.test.ts".to_string()];
322+
let result = collect_files(tmp.to_str().unwrap(), &[], &[], &exclude);
323+
let names: HashSet<String> = result
324+
.files
325+
.iter()
326+
.filter_map(|f| Path::new(f).file_name().map(|n| n.to_str().unwrap().to_string()))
327+
.collect();
328+
assert!(names.contains("app.ts"));
329+
assert!(names.contains("util.ts"));
330+
assert!(!names.contains("app.test.ts"), "exclude glob should reject matching files");
331+
332+
let _ = fs::remove_dir_all(&tmp);
333+
}
334+
335+
#[test]
336+
fn collect_honors_include_globs() {
337+
let tmp = std::env::temp_dir().join("codegraph_collect_include_test");
338+
let _ = fs::remove_dir_all(&tmp);
339+
let src = tmp.join("src");
340+
let tests = tmp.join("tests");
341+
fs::create_dir_all(&src).unwrap();
342+
fs::create_dir_all(&tests).unwrap();
343+
fs::write(src.join("app.ts"), "").unwrap();
344+
fs::write(tests.join("spec.ts"), "").unwrap();
345+
346+
let include = vec!["src/**".to_string()];
347+
let result = collect_files(tmp.to_str().unwrap(), &[], &include, &[]);
348+
let names: HashSet<String> = result
349+
.files
350+
.iter()
351+
.filter_map(|f| Path::new(f).file_name().map(|n| n.to_str().unwrap().to_string()))
352+
.collect();
353+
assert!(names.contains("app.ts"));
354+
assert!(!names.contains("spec.ts"), "include glob should reject non-matching files");
355+
356+
let _ = fs::remove_dir_all(&tmp);
357+
}
358+
210359
#[test]
211360
fn fast_collect_applies_deltas() {
212361
let root = "/project";
@@ -218,7 +367,7 @@ mod tests {
218367
let changed = vec!["src/d.ts".to_string()];
219368
let removed = vec!["src/b.ts".to_string()];
220369

221-
let result = try_fast_collect(root, &db_files, &changed, &removed);
370+
let result = try_fast_collect(root, &db_files, &changed, &removed, &[], &[]);
222371
assert_eq!(result.files.len(), 3); // a, c, d
223372
let names: HashSet<&str> = result
224373
.files
@@ -230,4 +379,28 @@ mod tests {
230379
assert!(names.contains("c.ts"));
231380
assert!(names.contains("d.ts"));
232381
}
382+
383+
#[test]
384+
fn fast_collect_honors_exclude_globs() {
385+
let root = "/project";
386+
let db_files = vec![
387+
"src/a.ts".to_string(),
388+
"src/a.test.ts".to_string(),
389+
"src/b.ts".to_string(),
390+
];
391+
let exclude = vec!["**/*.test.ts".to_string()];
392+
393+
let result = try_fast_collect(root, &db_files, &[], &[], &[], &exclude);
394+
let names: HashSet<&str> = result
395+
.files
396+
.iter()
397+
.map(|f| f.rsplit('/').next().unwrap_or(f))
398+
.collect();
399+
assert!(names.contains("a.ts"));
400+
assert!(names.contains("b.ts"));
401+
assert!(
402+
!names.contains("a.test.ts"),
403+
"fast path must filter out excluded files so incremental builds honor config changes"
404+
);
405+
}
233406
}

0 commit comments

Comments
 (0)