diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d7055f..b2de94c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,31 @@ page. ### Added +- **`pyrer.parse_static_packages_py(paths) -> list[PackageData | None]`** — + batched, Rayon-parallel variant of `parse_static_package_py` + that opens and parses every path in one Rust call across a + thread pool. Output is positionally aligned with the input; + missing files, unreadable bytes, and parser-bails all map to + `None`. The GIL is released for the whole batch via + `Python::allow_threads`. Pool size follows + `RAYON_NUM_THREADS` (default: logical core count). Targets + issue #94's profile finding that serial Python `open()` was + the top of the resolve flamegraph (3.20 s of 9.12 s, 35% of + wall time) after the per-file static parser landed. + Closes #94. +- **`version_range` hint on the `load_family` callback** (issue #92) — + `pyrer.solve(..., load_family=cb)` now invokes `cb` as + `cb(name, version_range="2+<3")` when the callback's signature can + accept a second `version_range` argument (named param or `**kwargs`). + The hint is a rez-syntax range string the shim can pass directly to + `rez.packages.iter_packages(range_=...)` to skip on-disk version + directories outside the request. Backward-compatible: 1-arg + callbacks (`def cb(name):`) keep working unchanged — pyrer detects + the signature via `inspect.signature` once per `solve()` call. + Targets the 95% load-fan-out waste documented in #92 (2,637 + packages loaded for 132 used on a typical Fortiche resolve); + projected 6-20× cut to `_load_family` wall time. The 188-case rez + differential still passes 188/188. - **`PackageData.from_strings(name, version, requires=None, variants=None)`** — classmethod constructor for raw-string callers, symmetric with `from_rez(pkg)`. Skips rez's `AttributeForwardMeta` chain, the diff --git a/Cargo.toml b/Cargo.toml index e1c771b..f43c273 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["crates/*"] resolver = "2" [workspace.package] -version = "0.1.0-rc.9" +version = "1.0.0-rc.3" authors = [ "Lorenzo Montant ", "Maxim Doucet ", @@ -23,8 +23,12 @@ lazy_static = "1.5.0" rand = "0.8.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -rer-version = { path = "crates/rer-version", version = "0.1.0-rc.9" } -rer-resolver = { path = "crates/rer-resolver", version = "0.1.0-rc.9" } +rer-version = { path = "crates/rer-version", version = "1.0.0-rc.3" } +rer-resolver = { path = "crates/rer-resolver", version = "1.0.0-rc.3" } +rer-package = { path = "crates/rer-package", version = "1.0.0-rc.3" } +# Rayon powers the batched `parse_static_packages_py` API (#94). Honours +# `RAYON_NUM_THREADS` for environments that need to cap the pool size. +rayon = "1" pyo3 = { version = "0.23.5", features = ["extension-module"] } # `mimalloc` is wired into the bench binary as a `#[global_allocator]`. # Callgrind shows ~33 % of cycles in libc malloc/free; mimalloc has measurably diff --git a/crates/rer-package/Cargo.toml b/crates/rer-package/Cargo.toml new file mode 100644 index 0000000..a53cca7 --- /dev/null +++ b/crates/rer-package/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "rer-package" +version = { workspace = true } +description = "Fast static parser for the solver-relevant fields of a rez package.py." +edition = { workspace = true } +repository = { workspace = true } +readme = { workspace = true } +authors = { workspace = true } +license-file = { workspace = true } + +[lib] +name = "rer_package" + + +[dependencies] +# Hand-rolled lexer — no AST library dependency. Stage 3 measurement +# showed `rustpython-parser` was ~2 ms/file because it built the +# whole module AST when we only needed four top-level fields. +# +# Rayon powers the batched `parse_static_packages_py` API (issue #94) +# that reads + parses many `package.py` files in parallel, replacing +# the rez shim's serial Python file-open loop (~3 s on a typical +# 132-package Fortiche resolve). +rayon = { workspace = true } diff --git a/crates/rer-package/src/lib.rs b/crates/rer-package/src/lib.rs new file mode 100644 index 0000000..e497d29 --- /dev/null +++ b/crates/rer-package/src/lib.rs @@ -0,0 +1,1384 @@ +//! Hand-rolled lexer for the static subset of a rez `package.py`. +//! +//! Extracts the four solver-relevant fields (`name`, `version`, +//! `requires`, `variants`) by scanning the source line-by-line — no +//! full AST. Successor to the original `rustpython-parser` version, +//! whose ~2 ms/file AST construction dominated the Stage 3 bench. +//! Target for this rewrite: 50–200 μs/file. +//! +//! # Bias toward bailing +//! +//! Any pattern this scanner doesn't recognise produces `None`. The +//! slow path through rez is always available; we only accept files +//! we're confident match rez 1:1 on the four fields. +//! +//! # What's accepted at module scope +//! +//! - `name = "..."` / `version = "..."` (single- or single-quoted +//! string literal) +//! - `requires = [str, str, ...]` (list/tuple of string literals, +//! possibly multi-line) +//! - `variants = [[str, ...], ...]` (list/tuple of list/tuple of +//! string literals) +//! - `def foo(...)` for any `foo` not in solver fields (body skipped) +//! - `with scope("...") as ...: ...` — rez's declarative DSL +//! - Assignments to non-solver fields (RHS skipped) +//! - Module docstring (single `"""..."""` at the top) +//! +//! # What bails +//! +//! - `@early` / `@late` on a solver-field function +//! - Top-level `if` / `for` / `while` / `try` / `class` / `match` +//! - `import` / `from … import` +//! - `with ...` that isn't `with scope(...)` +//! - Non-literal RHS for a solver field +//! - Missing `name` or `version` +//! - Anything else the scanner doesn't explicitly accept + +/// The four solver-relevant fields extracted from a `package.py`. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct PackageInfo { + pub name: String, + pub version: String, + pub requires: Vec, + pub variants: Vec>, +} + +const SOLVER_FIELDS: &[&str] = &["name", "version", "requires", "variants"]; + +/// Try to parse `source` as a statically-resolvable rez `package.py`. +/// Returns `Some(info)` if every top-level statement is recognised +/// and the four solver fields are all literal — `None` otherwise. +pub fn parse_static_package_py(source: &str) -> Option { + let mut p = Parser::new(source); + p.parse_module() +} + +/// Batched, parallel variant of [`parse_static_package_py`]: open and +/// parse every path on a Rayon thread pool, returning a `Vec` aligned +/// with `paths`. +/// +/// Each entry in the returned `Vec` is independent — a file that +/// doesn't exist, can't be read, or contains dynamic content all +/// produce `None` at the same index as the input path. No error +/// path: the function never panics on per-file failures. +/// +/// Issue #94: the rez integration shim's bottleneck after the static +/// parser landed was the serial Python loop of `open()` calls +/// (~3 s on a typical 132-package Fortiche resolve, 91% of the +/// `_load_family` budget). This call replaces that loop with one +/// `Python::allow_threads`-released batch, so the I/O overlaps +/// across cores. +/// +/// Pool size follows Rayon's default (`RAYON_NUM_THREADS` env var or +/// logical core count). Order is preserved regardless of completion +/// order — callers can `zip(paths, results)` after. +pub fn parse_static_packages_py

(paths: &[P]) -> Vec> +where + P: AsRef + Sync, +{ + use rayon::prelude::*; + + paths + .par_iter() + .map(|p| { + let source = std::fs::read_to_string(p.as_ref()).ok()?; + parse_static_package_py(&source) + }) + .collect() +} + +// =========================================================================== +// Parser +// =========================================================================== + +struct Parser<'a> { + src: &'a [u8], + pos: usize, +} + +impl<'a> Parser<'a> { + fn new(source: &'a str) -> Self { + Parser { + src: source.as_bytes(), + pos: 0, + } + } + + fn parse_module(&mut self) -> Option { + let mut name: Option = None; + let mut version: Option = None; + let mut requires: Option> = None; + let mut variants: Option>> = None; + let mut seen_first_stmt = false; + + loop { + // Skip blank lines and comment-only lines until we're at the + // start of a real statement. + self.skip_blank_and_comment_lines(); + if self.eof() { + break; + } + // We expect to be at column 0 of a real line. + // If we hit indented content here, something is wrong — bail. + if self.peek() == Some(b' ') || self.peek() == Some(b'\t') { + return None; + } + + // First-statement-only: an unprefixed string literal is a + // module docstring — eat it and continue. + if !seen_first_stmt { + if matches!(self.peek(), Some(b'"' | b'\'')) { + // Could be a docstring or a bare-string-expression + // (uncommon; we still accept the docstring case at + // the top). + if self.try_eat_string_statement() { + seen_first_stmt = true; + continue; + } + return None; + } + } + seen_first_stmt = true; + + // Decorator: `@deco`, `@deco()`, `@a.b.c(args)`. Skip the + // entire decorator line and continue — the next loop + // iteration will see the `def IDENT` that follows. If + // IDENT is a solver field, the `def` branch bails + // (whether or not the decorator was `@early` / `@late`). + // So we don't need to inspect the decorator content itself. + if self.peek() == Some(b'@') { + self.skip_statement()?; + continue; + } + + // Look for a top-level keyword/identifier. + let Some(word) = self.peek_ident() else { + return None; + }; + match word { + "import" | "from" | "if" | "for" | "while" | "try" | "match" | "class" + | "raise" | "return" | "global" | "nonlocal" | "del" | "yield" | "assert" + | "pass" | "break" | "continue" | "async" => return None, + "def" => { + self.consume_ident("def")?; + self.skip_inline_ws(); + let func_name = self.eat_ident()?; + if SOLVER_FIELDS.contains(&func_name.as_str()) { + // `def requires(...)` etc. — dynamic; bail. + return None; + } + self.skip_function_body()?; + } + "with" => { + if !self.try_accept_scope_with()? { + return None; + } + } + _ => { + // Assignment: IDENT = RHS + let target = self.eat_ident()?; + self.skip_inline_ws(); + // Reject `IDENT: type = value` annotated form for + // solver fields (conservative — easier to bail than + // to fully parse types). + if self.peek() == Some(b':') { + if SOLVER_FIELDS.contains(&target.as_str()) { + return None; + } + // Non-solver annotated assignment — skip rest. + self.skip_statement()?; + continue; + } + if !self.eat_byte(b'=') { + return None; + } + // Reject `==`, `+=` etc. + if matches!(self.peek(), Some(b'=')) { + return None; + } + self.skip_inline_ws(); + + if SOLVER_FIELDS.contains(&target.as_str()) { + match target.as_str() { + "name" => name = Some(self.eat_string_literal()?), + "version" => version = Some(self.eat_string_literal()?), + "requires" => requires = Some(self.eat_list_of_strings()?), + "variants" => { + variants = Some(self.eat_list_of_list_of_strings()?) + } + _ => unreachable!(), + } + // After the RHS, the statement should end at + // end-of-line (or comment then EOL). Anything + // else (e.g. `name = "foo" + "bar"`) is a bail. + self.skip_inline_ws(); + if !self.at_statement_end() { + return None; + } + self.eat_to_eol(); + } else { + // Non-solver field — skip the RHS without + // caring what it is. + self.skip_statement()?; + } + } + } + } + + Some(PackageInfo { + name: name?, + version: version?, + requires: requires.unwrap_or_default(), + variants: variants.unwrap_or_default(), + }) + } + + // --------------------------------------------------------------- + // `with scope("x") as y: ...` + // --------------------------------------------------------------- + + /// Consume a `with scope(...)` block. Returns `Some(true)` on a + /// well-formed scope-with that we should accept, `Some(false)` on + /// a `with` that isn't `with scope(...)`, and `None` on a + /// pathological body that touches a solver field (poisoned). + fn try_accept_scope_with(&mut self) -> Option { + self.consume_ident("with")?; + self.skip_inline_ws(); + // Expect `scope`. + let kw = self.eat_ident()?; + if kw != "scope" { + return Some(false); + } + self.skip_inline_ws(); + if !self.eat_byte(b'(') { + return Some(false); + } + // Skip the call args by paren counting — we don't actually + // need the scope name. + self.skip_balanced(b'(', b')')?; + self.skip_inline_ws(); + // Optional `as IDENT`. + if let Some("as") = self.peek_ident() { + self.consume_ident("as")?; + self.skip_inline_ws(); + let _as_name = self.eat_ident()?; + self.skip_inline_ws(); + } + if !self.eat_byte(b':') { + return Some(false); + } + self.eat_to_eol(); + // Read the body lines. They must be indented (column > 0). + // The body ends at the next non-blank, non-comment line at + // column 0. While reading, defensively bail if a line + // assigns to a solver field at any indentation. + loop { + let line_start = self.pos; + // Skip blank/comment lines (they're part of the body + // regardless of indent). + self.skip_blank_and_comment_lines(); + if self.eof() { + break; + } + // Now check if we're indented (in body) or at column 0 + // (end of body). + if !matches!(self.peek(), Some(b' ' | b'\t')) { + // Hit column 0 — end of with body. + self.pos = line_start; // rewind to start of this line + // The skip_blank_and_comment_lines above might have + // moved us past blank lines; redo with no-rewind so + // the outer loop sees the same column-0 statement. + // Actually we want the outer loop to see the next + // statement, so just break (it will re-skip blanks). + break; + } + // Indented body line: defensively check for assignment to + // a solver field (e.g. `name = "x"` somewhere in body). + let line_bytes = self.peek_logical_line_bytes(); + if line_assigns_to_solver_field(&line_bytes) { + return None; + } + // Skip this logical line. + self.skip_statement()?; + } + Some(true) + } + + // --------------------------------------------------------------- + // `def foo(...): body` — skip + // --------------------------------------------------------------- + + fn skip_function_body(&mut self) -> Option<()> { + // We've consumed `def IDENT`. Now expect `(...)`. + self.skip_inline_ws(); + if !self.eat_byte(b'(') { + return None; + } + self.skip_balanced(b'(', b')')?; + self.skip_inline_ws(); + // Optional `-> annotation`. + if self.peek() == Some(b'-') { + // Skip until `:`. + while let Some(c) = self.peek() { + if c == b':' { + break; + } + if c == b'\n' { + return None; + } + self.pos += 1; + } + } + if !self.eat_byte(b':') { + return None; + } + self.eat_to_eol(); + // Skip indented body lines until next column-0 line. + loop { + // Skip blank/comment lines. + let snap = self.pos; + self.skip_blank_and_comment_lines(); + if self.eof() { + break; + } + if !matches!(self.peek(), Some(b' ' | b'\t')) { + self.pos = snap; + break; + } + // Skip the indented line. + self.skip_statement()?; + } + Some(()) + } + + // --------------------------------------------------------------- + // Solver-field RHS parsers + // --------------------------------------------------------------- + + /// Parse a single string literal. Accepts `"..."`, `'...'`, + /// `"""..."""`, `'''...'''`. Returns the decoded body. + fn eat_string_literal(&mut self) -> Option { + // Optional string prefix: 'r', 'R', 'u', 'U', 'b', 'B' (no + // f-strings on solver fields — those are dynamic). + let prefix_start = self.pos; + let mut raw = false; + while let Some(c) = self.peek() { + match c { + b'r' | b'R' => { + raw = true; + self.pos += 1; + } + b'u' | b'U' => { + self.pos += 1; + } + b'b' | b'B' => { + // bytes — not a string. Reject for solver fields. + return None; + } + b'f' | b'F' => { + // f-string — dynamic. Reject. + return None; + } + _ => break, + } + } + let quote = self.peek()?; + if quote != b'"' && quote != b'\'' { + // Not a string at all — rewind and fail. + self.pos = prefix_start; + return None; + } + self.pos += 1; + // Check for triple-quote. + let triple = self.peek() == Some(quote) && self.peek_at(1) == Some(quote); + if triple { + self.pos += 2; + return self.eat_triple_string_body(quote, raw); + } + self.eat_single_string_body(quote, raw) + } + + fn eat_single_string_body(&mut self, quote: u8, raw: bool) -> Option { + let mut out = String::new(); + while let Some(c) = self.peek() { + match c { + b'\\' if !raw => { + self.pos += 1; + let esc = self.peek()?; + match esc { + b'n' => out.push('\n'), + b't' => out.push('\t'), + b'r' => out.push('\r'), + b'\\' => out.push('\\'), + b'\'' => out.push('\''), + b'"' => out.push('"'), + b'0' => out.push('\0'), + b'\n' => {} // line continuation + // Unknown escape: pass through both chars + // (matches Python's permissive behaviour). + other => { + out.push('\\'); + out.push(other as char); + } + } + self.pos += 1; + } + b'\\' if raw => { + // In raw strings, backslash is literal — but a + // closing-quote-after-backslash still ends the + // string. Easier to bail for the rare case. + out.push('\\'); + self.pos += 1; + } + b'\n' => return None, // unterminated + c if c == quote => { + self.pos += 1; + return Some(out); + } + _ => { + // Take multi-byte UTF-8 sequences as a unit. + let start = self.pos; + self.advance_one_char(); + let s = std::str::from_utf8(&self.src[start..self.pos]).ok()?; + out.push_str(s); + } + } + } + None + } + + fn eat_triple_string_body(&mut self, quote: u8, _raw: bool) -> Option { + let mut out = String::new(); + while let Some(c) = self.peek() { + if c == quote + && self.peek_at(1) == Some(quote) + && self.peek_at(2) == Some(quote) + { + self.pos += 3; + return Some(out); + } + let start = self.pos; + self.advance_one_char(); + let s = std::str::from_utf8(&self.src[start..self.pos]).ok()?; + out.push_str(s); + } + None + } + + /// Parse `[s1, s2, ...]` or `(s1, s2, ...)` of string literals. + /// Handles trailing comma and multi-line layouts. + fn eat_list_of_strings(&mut self) -> Option> { + let opener = self.peek()?; + let closer = match opener { + b'[' => b']', + b'(' => b')', + _ => return None, + }; + self.pos += 1; + let mut out = Vec::new(); + loop { + self.skip_ws_and_comments(); + if self.peek() == Some(closer) { + self.pos += 1; + return Some(out); + } + out.push(self.eat_string_literal()?); + self.skip_ws_and_comments(); + match self.peek()? { + b',' => { + self.pos += 1; + } + c if c == closer => { + self.pos += 1; + return Some(out); + } + _ => return None, + } + } + } + + /// Parse `[[s, ...], [s, ...], ...]` — list of list of strings. + fn eat_list_of_list_of_strings(&mut self) -> Option>> { + let opener = self.peek()?; + let closer = match opener { + b'[' => b']', + b'(' => b')', + _ => return None, + }; + self.pos += 1; + let mut out = Vec::new(); + loop { + self.skip_ws_and_comments(); + if self.peek() == Some(closer) { + self.pos += 1; + return Some(out); + } + out.push(self.eat_list_of_strings()?); + self.skip_ws_and_comments(); + match self.peek()? { + b',' => { + self.pos += 1; + } + c if c == closer => { + self.pos += 1; + return Some(out); + } + _ => return None, + } + } + } + + // --------------------------------------------------------------- + // Statement-skipping (for non-solver assignments) + // --------------------------------------------------------------- + + /// Skip from the current position to the end of the current + /// (logical) statement: i.e., until a newline at bracket-depth 0, + /// outside any string. Handles backslash line continuations. + fn skip_statement(&mut self) -> Option<()> { + let mut depth: i32 = 0; + while let Some(c) = self.peek() { + match c { + b'"' | b'\'' => { + self.skip_string_at()?; + } + b'#' if depth == 0 => { + // Comment to EOL. + self.eat_to_eol(); + if depth == 0 { + return Some(()); + } + } + b'\\' => { + // Line continuation. Handles bare LF and CRLF. + self.pos += 1; + if self.peek() == Some(b'\r') { + self.pos += 1; + } + if self.peek() == Some(b'\n') { + self.pos += 1; + } + } + b'\n' => { + self.pos += 1; + if depth == 0 { + return Some(()); + } + } + b'(' | b'[' | b'{' => { + depth += 1; + self.pos += 1; + } + b')' | b']' | b'}' => { + depth -= 1; + if depth < 0 { + return None; + } + self.pos += 1; + } + _ => { + self.advance_one_char(); + } + } + } + Some(()) + } + + /// Skip past a balanced bracket pair. Assumes the opening bracket + /// has already been consumed (so we're at depth 1 conceptually). + fn skip_balanced(&mut self, _open: u8, close: u8) -> Option<()> { + let mut depth: i32 = 1; + while let Some(c) = self.peek() { + match c { + b'"' | b'\'' => { + self.skip_string_at()?; + } + b'#' => { + self.eat_to_eol(); + } + b'(' | b'[' | b'{' => { + depth += 1; + self.pos += 1; + } + b')' | b']' | b'}' => { + depth -= 1; + self.pos += 1; + if depth == 0 { + if c != close { + // Mismatched bracket type — but the + // outer caller doesn't actually care, + // since we're skipping. Bail to be safe. + return None; + } + return Some(()); + } + } + _ => { + self.advance_one_char(); + } + } + } + None + } + + /// At a `"` or `'`, skip the entire string literal (single or + /// triple-quoted). + fn skip_string_at(&mut self) -> Option<()> { + let quote = self.peek()?; + debug_assert!(quote == b'"' || quote == b'\''); + self.pos += 1; + let triple = self.peek() == Some(quote) && self.peek_at(1) == Some(quote); + if triple { + self.pos += 2; + while let Some(c) = self.peek() { + if c == quote + && self.peek_at(1) == Some(quote) + && self.peek_at(2) == Some(quote) + { + self.pos += 3; + return Some(()); + } + if c == b'\\' { + self.pos += 1; + if self.peek().is_some() { + self.advance_one_char(); + } + } else { + self.advance_one_char(); + } + } + None + } else { + while let Some(c) = self.peek() { + match c { + b'\\' => { + self.pos += 1; + if self.peek().is_some() { + self.advance_one_char(); + } + } + b'\n' => return None, + c if c == quote => { + self.pos += 1; + return Some(()); + } + _ => { + self.advance_one_char(); + } + } + } + None + } + } + + /// Try to eat a bare string-statement (typically a docstring at + /// the top of the module). Consumes the string and the rest of + /// the line. + fn try_eat_string_statement(&mut self) -> bool { + let snap = self.pos; + if self.eat_string_literal().is_none() { + self.pos = snap; + return false; + } + self.skip_inline_ws(); + if self.at_statement_end() { + self.eat_to_eol(); + true + } else { + self.pos = snap; + false + } + } + + // --------------------------------------------------------------- + // Token-level helpers + // --------------------------------------------------------------- + + fn skip_blank_and_comment_lines(&mut self) { + loop { + let snap = self.pos; + // Skip leading whitespace on this line. `\r` counts — + // CRLF-terminated files have a `\r` right before the `\n`, + // and a bare `\r` should never break us. + while matches!(self.peek(), Some(b' ' | b'\t' | b'\r')) { + self.pos += 1; + } + match self.peek() { + Some(b'\n') => { + self.pos += 1; + } + Some(b'#') => { + self.eat_to_eol(); + } + None => return, + Some(_) => { + // Real content — rewind to start of line so the + // caller sees the leading whitespace. + self.pos = snap; + return; + } + } + } + } + + fn skip_inline_ws(&mut self) { + // `\r` is transparent — it's just half of a CRLF terminator. + while matches!(self.peek(), Some(b' ' | b'\t' | b'\r')) { + self.pos += 1; + } + } + + fn skip_ws_and_comments(&mut self) { + loop { + match self.peek() { + Some(b' ' | b'\t' | b'\n' | b'\r') => self.pos += 1, + Some(b'#') => self.eat_to_eol(), + Some(b'\\') => { + // Possible line continuation: `\` followed by + // newline (LF or CRLF). + if self.peek_at(1) == Some(b'\n') { + self.pos += 2; + } else if self.peek_at(1) == Some(b'\r') + && self.peek_at(2) == Some(b'\n') + { + self.pos += 3; + } else { + return; + } + } + _ => return, + } + } + } + + fn eat_to_eol(&mut self) { + while let Some(c) = self.peek() { + self.pos += 1; + if c == b'\n' { + return; + } + } + } + + fn at_statement_end(&self) -> bool { + match self.peek() { + None | Some(b'\n' | b'\r' | b'#') => true, + _ => false, + } + } + + fn peek(&self) -> Option { + self.src.get(self.pos).copied() + } + + fn peek_at(&self, offset: usize) -> Option { + self.src.get(self.pos + offset).copied() + } + + fn eat_byte(&mut self, b: u8) -> bool { + if self.peek() == Some(b) { + self.pos += 1; + true + } else { + false + } + } + + fn eof(&self) -> bool { + self.pos >= self.src.len() + } + + /// Look at the next identifier without consuming it. Used to + /// dispatch on `def` / `with` / etc. + fn peek_ident(&self) -> Option<&'a str> { + let bytes = self.src; + let start = self.pos; + if !is_ident_start(*bytes.get(start)?) { + return None; + } + let mut end = start + 1; + while end < bytes.len() && is_ident_continue(bytes[end]) { + end += 1; + } + std::str::from_utf8(&bytes[start..end]).ok() + } + + /// Consume and return the next identifier. + fn eat_ident(&mut self) -> Option { + let ident = self.peek_ident()?; + let len = ident.len(); + let owned = ident.to_string(); + self.pos += len; + Some(owned) + } + + /// Consume a specific keyword. Returns `None` if the next + /// identifier is anything else. + fn consume_ident(&mut self, expected: &str) -> Option<()> { + let id = self.peek_ident()?; + if id != expected { + return None; + } + self.pos += expected.len(); + Some(()) + } + + /// Advance one UTF-8 code point. Falls back to one byte for + /// invalid UTF-8 (the source then doesn't parse, but we don't + /// want to infinite-loop). + fn advance_one_char(&mut self) { + let b = self.src[self.pos]; + let len = if b < 0x80 { + 1 + } else if b < 0xC0 { + 1 // invalid lead — skip 1 byte + } else if b < 0xE0 { + 2 + } else if b < 0xF0 { + 3 + } else { + 4 + }; + self.pos = (self.pos + len).min(self.src.len()); + } + + /// Read the bytes of the current logical line (until newline at + /// bracket-depth 0). Used to defensively check whether a + /// with-scope body line assigns to a solver field, without + /// consuming it from the parser. + fn peek_logical_line_bytes(&self) -> Vec { + let mut p = self.pos; + let mut depth: i32 = 0; + let mut out = Vec::new(); + while p < self.src.len() { + let c = self.src[p]; + match c { + b'"' | b'\'' => { + // Naive: copy through the close quote. + out.push(c); + p += 1; + let quote = c; + while p < self.src.len() { + let q = self.src[p]; + out.push(q); + p += 1; + if q == b'\\' && p < self.src.len() { + out.push(self.src[p]); + p += 1; + } else if q == quote { + break; + } else if q == b'\n' { + break; + } + } + } + b'#' if depth == 0 => { + while p < self.src.len() && self.src[p] != b'\n' { + p += 1; + } + } + b'\n' => { + if depth == 0 { + break; + } + out.push(c); + p += 1; + } + b'(' | b'[' | b'{' => { + depth += 1; + out.push(c); + p += 1; + } + b')' | b']' | b'}' => { + depth -= 1; + out.push(c); + p += 1; + } + _ => { + out.push(c); + p += 1; + } + } + } + out + } +} + +// --------------------------------------------------------------------------- +// Free-standing helpers +// --------------------------------------------------------------------------- + +fn is_ident_start(b: u8) -> bool { + b.is_ascii_alphabetic() || b == b'_' +} + +fn is_ident_continue(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' +} + +/// True if `line` (the bytes of a single logical line) assigns to +/// one of the solver fields at the start (after any leading +/// whitespace). Used inside `with scope(...)` bodies to defensively +/// catch pathological shadowing. +fn line_assigns_to_solver_field(line: &[u8]) -> bool { + let mut i = 0; + while i < line.len() && (line[i] == b' ' || line[i] == b'\t') { + i += 1; + } + for field in SOLVER_FIELDS { + let fb = field.as_bytes(); + if line.len() >= i + fb.len() && &line[i..i + fb.len()] == fb { + let mut j = i + fb.len(); + // Must be followed by `=` (with optional whitespace), and + // not `==`, `+=`, etc. + while j < line.len() && (line[j] == b' ' || line[j] == b'\t') { + j += 1; + } + if j < line.len() && line[j] == b'=' { + if j + 1 < line.len() && line[j + 1] == b'=' { + return false; + } + // Make sure what comes after the field name is not a + // longer identifier (e.g. `nameless = ...`). + let after = i + fb.len(); + if after < line.len() && is_ident_continue(line[after]) { + return false; + } + return true; + } + } + } + false +} + +// =========================================================================== +// Tests +// =========================================================================== + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use std::path::PathBuf; + + /// Write `content` to a fresh tempfile and return its path. The + /// `tempfile` crate isn't a dep — keep tests dep-free with a + /// hand-rolled scratch path under `std::env::temp_dir()`. + fn write_temp(name: &str, content: &str) -> PathBuf { + let dir = std::env::temp_dir().join(format!( + "rer-package-test-{}-{}", + std::process::id(), + name + )); + std::fs::create_dir_all(&dir).unwrap(); + let path = dir.join("package.py"); + let mut f = std::fs::File::create(&path).unwrap(); + f.write_all(content.as_bytes()).unwrap(); + path + } + + #[test] + fn batch_empty_returns_empty() { + let result: Vec> = + parse_static_packages_py::<&std::path::Path>(&[]); + assert!(result.is_empty()); + } + + #[test] + fn batch_parses_each_file_independently() { + let static_path = write_temp( + "static-1", + "name = \"app\"\nversion = \"1.0.0\"\nrequires = [\"lib-2\"]\n", + ); + let dynamic_path = write_temp( + "dynamic-1", + "import os\nname = \"foo\"\nversion = \"1.0\"\n", + ); + let static_path_2 = write_temp( + "static-2", + "name = \"lib\"\nversion = \"2.0.0\"\n", + ); + + let paths = vec![&static_path, &dynamic_path, &static_path_2]; + let results = parse_static_packages_py(&paths); + + assert_eq!(results.len(), 3); + // [0] static → Some + let r0 = results[0].as_ref().expect("static-1 should parse"); + assert_eq!(r0.name, "app"); + assert_eq!(r0.version, "1.0.0"); + // [1] dynamic (top-level import) → None + assert!(results[1].is_none(), "dynamic file should bail to None"); + // [2] static → Some + let r2 = results[2].as_ref().expect("static-2 should parse"); + assert_eq!(r2.name, "lib"); + } + + #[test] + fn batch_missing_file_becomes_none() { + // Build a path that doesn't exist; the batched call should map + // it to None at the matching index, never raise. + let phantom = std::env::temp_dir().join("rer-package-test-this-does-not-exist/package.py"); + let real = write_temp( + "real-static", + "name = \"x\"\nversion = \"1.0\"\n", + ); + let paths = vec![&phantom, &real]; + let results = parse_static_packages_py(&paths); + assert_eq!(results.len(), 2); + assert!(results[0].is_none(), "phantom path must be None"); + assert!(results[1].is_some(), "real path should parse"); + } + + #[test] + fn batch_preserves_input_order() { + // Write 16 files alternating valid/dynamic content. The batched + // call uses par_iter which can complete out of order; the + // returned Vec must still match the input positions exactly. + let mut paths: Vec = Vec::with_capacity(16); + for i in 0..16 { + let content = if i % 2 == 0 { + format!("name = \"pkg{i}\"\nversion = \"1.0\"\n") + } else { + // dynamic — top-level if always bails + format!("name = \"pkg{i}\"\nversion = \"1.0\"\nif True:\n pass\n") + }; + paths.push(write_temp(&format!("order-{i}"), &content)); + } + let results = parse_static_packages_py(&paths); + assert_eq!(results.len(), 16); + for (i, r) in results.iter().enumerate() { + if i % 2 == 0 { + let r = r.as_ref().expect("even index should be Some"); + assert_eq!(r.name, format!("pkg{i}")); + } else { + assert!(r.is_none(), "odd index should be None (dynamic)"); + } + } + } + + #[test] + fn parses_minimal_static() { + let src = "name = \"foo\"\nversion = \"1.0.0\"\n"; + let info = parse_static_package_py(src).expect("static minimal"); + assert_eq!(info.name, "foo"); + assert_eq!(info.version, "1.0.0"); + assert!(info.requires.is_empty()); + assert!(info.variants.is_empty()); + } + + #[test] + fn parses_full_static() { + let src = r#" +name = "maya" +version = "2024.0" +description = "irrelevant" +authors = ["Autodesk"] +requires = ["python-3", "qt-5"] +variants = [["linux", "python-3.10"], ["linux", "python-3.11"]] + +def commands(): + env.PYTHONPATH.append("{root}/python") +"#; + let info = parse_static_package_py(src).expect("full static"); + assert_eq!(info.name, "maya"); + assert_eq!(info.version, "2024.0"); + assert_eq!(info.requires, vec!["python-3", "qt-5"]); + assert_eq!( + info.variants, + vec![ + vec!["linux".to_string(), "python-3.10".to_string()], + vec!["linux".to_string(), "python-3.11".to_string()] + ] + ); + } + + #[test] + fn parses_with_scope_config() { + let src = r#" +# -*- coding: utf-8 -*- +name = "fortichebox" +version = "0.2.0" +requires = ["python-2.7+<3"] + +def commands(): + env["FPATH"].append("$SPACE/generic") + +with scope("config") as config: + config.release_packages_path = "/some/path" + config.something_else = 42 + +timestamp = 1642007300 +"#; + let info = parse_static_package_py(src).expect("with scope is ignorable"); + assert_eq!(info.name, "fortichebox"); + assert_eq!(info.version, "0.2.0"); + assert_eq!(info.requires, vec!["python-2.7+<3"]); + assert!(info.variants.is_empty()); + } + + #[test] + fn ignores_module_docstring() { + let src = "\"\"\"Some docstring.\"\"\"\n\nname = \"foo\"\nversion = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_some()); + } + + #[test] + fn ignores_unknown_top_level_fields() { + let src = r#" +name = "foo" +version = "1.0" +description = "anything" +authors = ["Alice", "Bob"] +tools = ["foo-cli"] +timestamp = 123456789 +format_version = 2 +hashed_variants = True +build_command = "cmake ..." +"#; + assert!(parse_static_package_py(src).is_some()); + } + + #[test] + fn handles_multiline_requires() { + let src = r#" +name = "foo" +version = "1.0" +requires = [ + "python-3", + "qt-5", + # a comment in the middle + "openexr", +] +"#; + let info = parse_static_package_py(src).unwrap(); + assert_eq!(info.requires, vec!["python-3", "qt-5", "openexr"]); + } + + #[test] + fn handles_trailing_comment_after_assignment() { + let src = "name = \"foo\" # this is foo\nversion = \"1.0\"\n"; + let info = parse_static_package_py(src).unwrap(); + assert_eq!(info.name, "foo"); + } + + #[test] + fn handles_single_quoted_strings() { + let src = "name = 'foo'\nversion = '1.0'\n"; + let info = parse_static_package_py(src).unwrap(); + assert_eq!(info.name, "foo"); + assert_eq!(info.version, "1.0"); + } + + // --- Bail cases ---------------------------------------------------- + + #[test] + fn bails_on_at_early_requires() { + let src = r#" +name = "foo" +version = "1.0" + +@early() +def requires(): + return ["python-3"] +"#; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_at_late_variants() { + let src = r#" +name = "foo" +version = "1.0" + +@late() +def variants(): + return [["a"]] +"#; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_top_level_if() { + let src = r#" +name = "foo" +version = "1.0" + +if True: + requires = ["dev-lib"] +else: + requires = ["prod-lib"] +"#; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_import() { + let src = "import sys\n\nname = \"foo\"\nversion = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_from_import() { + let src = "from sys import platform\n\nname = \"foo\"\nversion = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_classdef() { + let src = "name = \"foo\"\nversion = \"1.0\"\n\nclass Helper:\n pass\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_non_scope_with() { + let src = r#" +name = "foo" +version = "1.0" + +with open("config.json") as f: + pass +"#; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_scope_with_that_touches_solver_field() { + let src = r#" +name = "foo" +version = "1.0" + +with scope("config") as config: + config.release_path = "/foo" + name = "rebinding" +"#; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_non_literal_name() { + let src = "prefix = \"f\"\nname = prefix + \"oo\"\nversion = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_function_call_for_requires() { + let src = "name = \"foo\"\nversion = \"1.0\"\nrequires = build_requires()\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_missing_name() { + let src = "version = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_missing_version() { + let src = "name = \"foo\"\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn bails_on_syntax_error() { + // Unterminated string. + let src = "name = \"foo\nversion = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn requires_can_be_empty_or_absent() { + let src = "name = \"foo\"\nversion = \"1.0\"\nrequires = []\n"; + let info = parse_static_package_py(src).unwrap(); + assert!(info.requires.is_empty()); + } + + #[test] + fn variants_can_be_a_tuple_of_tuples() { + let src = "name = \"foo\"\nversion = \"1.0\"\nvariants = ((\"a\",), (\"b\",))\n"; + let info = parse_static_package_py(src).unwrap(); + assert_eq!( + info.variants, + vec![vec!["a".to_string()], vec!["b".to_string()]] + ); + } + + #[test] + fn bails_on_augmented_assignment_to_solver_field() { + let src = "name = \"foo\"\nname += \"bar\"\nversion = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_none()); + } + + #[test] + fn handles_crlf_line_endings() { + // Windows-edited package.py files use `\r\n` line endings. + // Every Fortiche package.py comes off CIFS-served Samba this + // way; the lexer must treat `\r` transparently. + let src = "name = 'foo'\r\nversion = '1.0'\r\nrequires = ['python']\r\n"; + let info = parse_static_package_py(src).expect("CRLF accepted"); + assert_eq!(info.name, "foo"); + assert_eq!(info.version, "1.0"); + assert_eq!(info.requires, vec!["python"]); + } + + #[test] + fn handles_crlf_backslash_line_continuation_in_non_solver_field() { + // Common shape in Windows-edited rez packages: `changelog = \` + // followed by a CRLF then an indented triple-quoted string. + // Bumped 50pp on the Fortiche corpus. + let src = concat!( + "name = 'foo'\r\n", + "version = '1.0'\r\n", + "\r\n", + "changelog = \\\r\n", + " \"\"\"some\r\nmultiline\r\nchangelog\"\"\"\r\n", + "\r\n", + "timestamp = 12345\r\n", + ); + let info = parse_static_package_py(src).expect("CRLF \\ accepted"); + assert_eq!(info.name, "foo"); + } + + #[test] + fn ignores_decorator_on_non_solver_function() { + // `@deprecated def commands(): ...` should be accepted — + // the decorator is metadata; the function isn't a solver field. + let src = r#" +name = "foo" +version = "1.0" + +@deprecated +def commands(): + env.PATH.append("/bin") +"#; + let info = parse_static_package_py(src).expect("@deprecated def commands accepted"); + assert_eq!(info.name, "foo"); + } + + #[test] + fn ignores_decorator_with_args_on_non_solver_function() { + let src = r#" +name = "foo" +version = "1.0" + +@cache(maxsize=1) +def commands(): + pass +"#; + assert!(parse_static_package_py(src).is_some()); + } + + #[test] + fn handles_crlf_with_def_body() { + let src = "name = 'foo'\r\nversion = '1.0'\r\n\r\ndef commands():\r\n env.PATH.append('{root}/bin')\r\n"; + let info = parse_static_package_py(src).expect("CRLF + def body accepted"); + assert_eq!(info.name, "foo"); + } + + #[test] + fn bails_on_fstring_for_solver_field() { + let src = "name = f\"foo\"\nversion = \"1.0\"\n"; + assert!(parse_static_package_py(src).is_none()); + } +} diff --git a/crates/rer-package/tests/test_corpus.rs b/crates/rer-package/tests/test_corpus.rs new file mode 100644 index 0000000..5e185f3 --- /dev/null +++ b/crates/rer-package/tests/test_corpus.rs @@ -0,0 +1,104 @@ +//! Run `parse_static_package_py` against a real rez repo and report +//! the accept rate. Mirrors the Python survey at +//! `scripts/survey_package_py.py` — running both against the same +//! corpus should produce matching `fast-parseable` counts. +//! +//! `#[ignore]`d: this needs an external rez package store. Run with: +//! +//! ```text +//! RER_CORPUS_PATH=/thierry/rez/pkg \ +//! cargo test --release -p rer-package --test test_corpus -- --ignored +//! ``` +//! +//! Set `RER_CORPUS_REQUIRE` to a percentage to gate on a minimum: +//! +//! ```text +//! RER_CORPUS_REQUIRE=90 RER_CORPUS_PATH=/thierry/rez/pkg \ +//! cargo test --release -p rer-package --test test_corpus -- --ignored +//! ``` +//! +//! Without `RER_CORPUS_PATH`, the test prints a skip message and +//! passes (so CI on a checkout without the mount doesn't fail). + +use std::path::Path; + +#[test] +#[ignore = "external rez corpus required; see RER_CORPUS_PATH"] +fn corpus_accept_rate() { + let Some(root) = std::env::var_os("RER_CORPUS_PATH") else { + eprintln!("RER_CORPUS_PATH not set — skipping corpus check."); + return; + }; + let root: &Path = root.as_ref(); + if !root.is_dir() { + eprintln!("RER_CORPUS_PATH is not a directory: {}", root.display()); + return; + } + + let mut total = 0usize; + let mut accepted = 0usize; + let mut read_errors = 0usize; + + walk_package_pys(root, &mut |path: &Path| { + total += 1; + match std::fs::read_to_string(path) { + Ok(src) => { + if rer_package::parse_static_package_py(&src).is_some() { + accepted += 1; + } + } + Err(_) => read_errors += 1, + } + }); + + let pct = if total > 0 { + accepted as f64 / total as f64 * 100.0 + } else { + 0.0 + }; + println!( + "rer-package on {}: total {total}, accepted {accepted} ({pct:.1}%), read errors {read_errors}", + root.display() + ); + + if let Ok(min) = std::env::var("RER_CORPUS_REQUIRE") { + let min: f64 = min.parse().expect("RER_CORPUS_REQUIRE must be a number"); + assert!( + pct >= min, + "accept rate {pct:.1}% < required {min:.1}%" + ); + } else { + // No gate set — just don't fail the test. + assert!(total > 0, "no package.py files found under {}", root.display()); + } +} + +/// Recursively yield every `package.py` under `root`, skipping +/// dot-directories, leading-underscore directories, and rez variant +/// subdirs (40-char hex hashes). Matches the Python survey's traversal +/// so the two tools see the same set of files. +fn walk_package_pys(root: &Path, f: &mut dyn FnMut(&Path)) { + let Ok(entries) = std::fs::read_dir(root) else { + return; + }; + for entry in entries.flatten() { + let path = entry.path(); + let name = entry.file_name(); + let name = name.to_string_lossy(); + if name.starts_with('.') || name.starts_with('_') { + continue; + } + if name.len() == 40 && name.chars().all(|c| c.is_ascii_hexdigit()) { + continue; + } + let ty = match entry.file_type() { + Ok(t) => t, + Err(_) => continue, + }; + if ty.is_dir() { + walk_package_pys(&path, f); + } else if ty.is_file() && name == "package.py" { + f(&path); + } + } +} diff --git a/crates/rer-python/Cargo.toml b/crates/rer-python/Cargo.toml index e810c5b..d5880f2 100644 --- a/crates/rer-python/Cargo.toml +++ b/crates/rer-python/Cargo.toml @@ -26,3 +26,5 @@ crate-type = ["cdylib", "lib"] [dependencies] pyo3 = { workspace = true, features = ["abi3-py39"] } rer-resolver = { workspace = true } +rer-package = { workspace = true } +rer-version = { workspace = true } diff --git a/crates/rer-python/src/lib.rs b/crates/rer-python/src/lib.rs index c6792be..7471607 100644 --- a/crates/rer-python/src/lib.rs +++ b/crates/rer-python/src/lib.rs @@ -331,20 +331,48 @@ fn packages_to_map(packages: Vec) -> Result, load_err: Rc>>) -> FamilyLoader { +fn make_loader( + callback: Py, + load_err: Rc>>, + takes_range: bool, +) -> FamilyLoader { Box::new( - move |name: &str| -> Vec<(String, rer_resolver::PackageData)> { + move |name: &str, + hint: Option<&rer_version::VersionRange>| + -> Vec<(String, rer_resolver::PackageData)> { // Already errored on a previous call — short-circuit so we don't // pile up errors and don't keep calling a broken callback. if load_err.borrow().is_some() { return Vec::new(); } + let hint_str: Option = hint.map(|r| r.to_string()); let result: PyResult> = Python::with_gil(|py| -> PyResult<_> { - let ret = callback.bind(py).call1((name,))?; + let ret = if takes_range { + // Pass hint as a rez-style range string via the + // `version_range` keyword — works with both + // `def f(name, version_range=None)` and + // `def f(name, **kwargs)`. `None` → Python + // `None`, signalling "unconstrained". + let py_hint = match hint_str.as_deref() { + Some(s) => s.into_pyobject(py)?.into_any(), + None => py.None().into_bound(py), + }; + let kwargs = pyo3::types::PyDict::new(py); + kwargs.set_item("version_range", py_hint)?; + callback.bind(py).call((name,), Some(&kwargs))? + } else { + callback.bind(py).call1((name,))? + }; let pkgs: Vec = ret.extract()?; let mut out: Vec<(String, rer_resolver::PackageData)> = Vec::with_capacity(pkgs.len()); @@ -382,6 +410,82 @@ fn make_loader(callback: Py, load_err: Rc>>) -> Fa ) } +/// Inspect the Python callable's signature and return `true` if it can +/// accept a second `version_range` argument — either as a named parameter +/// or via `**kwargs` / `*args`. False means the legacy 1-arg shape. +/// +/// Errs on the side of `false` (legacy) if introspection fails for any +/// reason; the loader then keeps calling with just `name` and existing +/// shims keep working. +fn callback_takes_range(py: Python<'_>, callback: &Py) -> bool { + let inspect = match py.import("inspect") { + Ok(m) => m, + Err(_) => return false, + }; + let sig = match inspect.getattr("signature").and_then(|f| f.call1((callback,))) { + Ok(s) => s, + Err(_) => return false, + }; + let params = match sig.getattr("parameters") { + Ok(p) => p, + Err(_) => return false, + }; + let len: usize = params.len().unwrap_or(0); + if len == 0 { + return false; + } + // Walk the parameters' kinds. A second positional parameter, a + // `version_range` keyword parameter, or any `*args`/`**kwargs` + // signals support. + let Ok(items) = params.call_method0("items") else { + return false; + }; + let Ok(iter) = items.try_iter() else { + return false; + }; + let mut positional_count = 0usize; + let Ok(inspect_mod) = py.import("inspect") else { + return false; + }; + let Ok(parameter_cls) = inspect_mod.getattr("Parameter") else { + return false; + }; + let pkw = parameter_cls.getattr("VAR_KEYWORD").ok(); + let pvar = parameter_cls.getattr("VAR_POSITIONAL").ok(); + for item in iter { + let Ok(item) = item else { continue }; + let Ok(name_val) = item.get_item(0) else { + continue; + }; + let Ok(param) = item.get_item(1) else { continue }; + let Ok(name_s) = name_val.extract::() else { + continue; + }; + if name_s == "version_range" { + return true; + } + let Ok(kind) = param.getattr("kind") else { + continue; + }; + if let Some(p) = &pkw { + if kind.eq(p).unwrap_or(false) { + return true; + } + } + if let Some(p) = &pvar { + if kind.eq(p).unwrap_or(false) { + return true; + } + } + // Plain positional / positional-or-keyword: counts toward arity. + positional_count += 1; + if positional_count >= 2 { + return true; + } + } + false +} + // --------------------------------------------------------------------------- // solve // --------------------------------------------------------------------------- @@ -458,7 +562,17 @@ fn solve( let load_err: Rc>> = Rc::new(RefCell::new(None)); let repo = if let Some(callback) = load_family { - let lazy = PackageRepo::with_loader(make_loader(callback, Rc::clone(&load_err))); + // One-time signature inspection (issue #92): if the callback can + // take a `version_range` argument, we pass the current solver + // range as a rez-style string so the shim can pre-filter. + // Backward-compatible: callbacks with the 1-arg shape keep + // working unchanged. + let takes_range = Python::with_gil(|py| callback_takes_range(py, &callback)); + let lazy = PackageRepo::with_loader(make_loader( + callback, + Rc::clone(&load_err), + takes_range, + )); // Seed the eager set so the loader is never called for families // the caller already supplied. for (name, fam) in initial_map { @@ -572,10 +686,96 @@ fn solve( }) } +// --------------------------------------------------------------------------- +// parse_static_package_py — fast static parser for the rez `package.py` shape +// --------------------------------------------------------------------------- + +/// Try to parse `source` as a statically-resolvable rez `package.py`, +/// returning the four solver-relevant fields as a [`PackageData`] — +/// or `None` if the file needs Python evaluation (e.g. `@early` / +/// `@late`-bound `requires`, top-level `if` / `import`, …). +/// +/// `None` is *not* an error. It means "the caller should fall back to +/// `pyrer.PackageData.from_rez(pkg)` for this file" — rez's own +/// evaluator will handle the dynamic case. +/// +/// Recognises rez's standard patterns: literal `name`, `version`, +/// `requires`, `variants`; ignorable `def commands(...)` / +/// `def pre_commands(...)` function bodies; ignorable +/// `with scope("config") as config: ...` declarative DSL. +/// +/// Intended for the rez-integration `load_family` fast path — try +/// this first, fall back to `from_rez(pkg)` on `None`. See the +/// engineering note at +/// `docs/content/docs/engineering/fast-package-py-parser.md`. +#[pyfunction] +fn parse_static_package_py(source: &str) -> Option { + rer_package::parse_static_package_py(source).map(|info| PackageData { + name: info.name, + version: info.version, + requires: info.requires, + variants: info.variants, + }) +} + +/// Batched variant of [`parse_static_package_py`]: open and parse +/// every path on a Rayon thread pool, returning a list aligned with +/// `paths`. Closes issue #94. +/// +/// ```python +/// import pyrer +/// +/// paths = [pkg.filepath for pkg in iter_packages(...)] +/// pds = pyrer.parse_static_packages_py(paths) +/// for pd, pkg in zip(pds, pkgs): +/// if pd is None: +/// pd = pyrer.PackageData.from_rez(pkg) # dynamic / unreadable +/// ... +/// ``` +/// +/// - **Output is positionally aligned** with the input. A missing +/// file, a parser bail on dynamic content, and an unreadable file +/// all become `None` at the same index. +/// - **No exceptions escape.** Per-file failures map to `None`. +/// - **GIL is released for the batch** via `Python::allow_threads` so +/// other Python threads run during the I/O. +/// - **Pool size** follows Rayon's default (`RAYON_NUM_THREADS` or +/// logical core count). No per-call knob — set the env var to +/// constrain on shared CI hosts. +/// +/// Replaces the rez shim's serial Python loop of `open()` calls — +/// ~3 s on a typical 132-package Fortiche resolve, 91% of the +/// `_load_family` budget when all other pyrer wins are stacked. +#[pyfunction] +fn parse_static_packages_py( + py: Python<'_>, + paths: Vec, +) -> Vec> { + // Release the GIL while reading + parsing. The closure produces a + // `Vec>`; the conversion to the + // PyO3-managed `PackageData` happens after the GIL is reacquired. + let infos: Vec> = + py.allow_threads(|| rer_package::parse_static_packages_py(&paths)); + + infos + .into_iter() + .map(|maybe| { + maybe.map(|info| PackageData { + name: info.name, + version: info.version, + requires: info.requires, + variants: info.variants, + }) + }) + .collect() +} + /// The `pyrer` Python module — Rez-compatible package resolver. #[pymodule] fn pyrer(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(solve, m)?)?; + m.add_function(wrap_pyfunction!(parse_static_package_py, m)?)?; + m.add_function(wrap_pyfunction!(parse_static_packages_py, m)?)?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/crates/rer-resolver/src/rez_solver/context.rs b/crates/rer-resolver/src/rez_solver/context.rs index 378bb35..3658051 100644 --- a/crates/rer-resolver/src/rez_solver/context.rs +++ b/crates/rer-resolver/src/rez_solver/context.rs @@ -14,14 +14,81 @@ use std::rc::Rc; pub type FamilyMap = HashMap; /// Callback invoked on the first lookup for a family that is not already in -/// the repo. Returns `(version_string, PackageData)` pairs — every version of -/// the family. An empty result means "no such family"; the repo caches that -/// answer and never calls the loader for the same name again. +/// the repo, with an optional version-range hint indicating the *current* +/// solver constraint on that family. Returns `(version_string, PackageData)` +/// pairs. /// -/// Mirrors the lazy-load behaviour rez gets from its `Package` resource -/// wrapper (each `package.py` is AST-evaluated on first attribute access). -/// `pyrer` builds one of these from a Python callable for issue #86. -pub type FamilyLoader = Box Vec<(String, PackageData)>>; +/// **Hint semantics (issue #92):** +/// +/// - `None` — the solver needs *every* version of the family (e.g. an +/// unbounded request, or a backtrack-widen has invalidated a narrower +/// prior load). The shim must return all versions. +/// - `Some(range)` — the solver only needs versions inside `range`. The +/// shim **may** filter to versions intersecting it (rez's +/// `iter_packages(range_=...)` does exactly that). The shim is **allowed +/// to return a superset** — pyrer re-validates against current +/// constraints, so extra versions are merely wasted parse time. +/// - The shim must **not** silently drop versions outside the hint without +/// pyrer asking — pyrer caches the loaded range and re-calls the loader +/// with a widened range if the solver backtracks and needs more. +/// +/// An empty result means "no such family" *under the supplied hint*. The +/// repo memoises this answer paired with the hint that produced it; a +/// later wider hint will retry the loader. +/// +/// `pyrer` builds one of these from a Python callable. See the load_family +/// callback in `pyrer.solve()` for the Python-side contract. +pub type FamilyLoader = + Box) -> Vec<(String, PackageData)>>; + +/// Records which version-range was passed to the loader when a family was +/// last loaded. Used by [`PackageRepo`] to decide whether a cached family +/// map can serve a fresh request without re-calling the loader. +#[derive(Clone, Debug)] +enum LoadedRange { + /// Loaded with `None` hint — every version is in the cached map. + /// Always sufficient for any subsequent request. + Unconstrained, + /// Loaded with `Some(range)` — only versions inside this range are + /// guaranteed to be in the cached map. + Bounded(VersionRange), +} + +impl LoadedRange { + /// True if `hint` is fully covered by the cached load — i.e. every + /// version the caller could care about is already in our map. + fn covers(&self, hint: Option<&VersionRange>) -> bool { + match (self, hint) { + (LoadedRange::Unconstrained, _) => true, + (LoadedRange::Bounded(_), None) => false, + (LoadedRange::Bounded(loaded), Some(want)) => { + // want ⊆ loaded ⟺ loaded ∩ want == want + loaded.intersection(want).as_ref() == Some(want) + } + } + } + + /// The union of two load ranges, used when widening to satisfy a + /// hint that wasn't covered by the previous load. + fn widened_with(&self, hint: Option<&VersionRange>) -> LoadedRange { + match (self, hint) { + (LoadedRange::Unconstrained, _) | (_, None) => LoadedRange::Unconstrained, + (LoadedRange::Bounded(loaded), Some(want)) => { + LoadedRange::Bounded(loaded.union(want)) + } + } + } +} + +#[derive(Clone, Debug)] +struct FamilyEntry { + loaded_range: LoadedRange, + /// `Some(map)` if the family is known to exist; `None` if the loader + /// returned empty under `loaded_range` (i.e. "no such family within + /// this range, possibly absent entirely if `loaded_range` is + /// `Unconstrained`"). + map: Option>, +} /// The package repository — `family -> version -> PackageData`. /// @@ -31,19 +98,19 @@ pub type FamilyLoader = Box Vec<(String, PackageData)>>; /// own solver does. /// /// Lookups are routed through [`Self::get_family`], which: -/// 1. Returns the cached `Rc` if the family has been seen. -/// 2. Otherwise calls the loader (if any), memoising both the hit and the -/// "no such family" answer. -/// 3. Otherwise returns `None`. +/// 1. Returns the cached `Rc` if the family has been loaded with +/// a range that covers the request. +/// 2. Otherwise calls the loader (if any) with the widened range, replaces +/// the cache entry, and returns the new map. +/// 3. Returns `None` if there's no loader and the family isn't cached, or +/// if the loader returns empty under the widened range. /// /// Construction: /// - [`Self::from_map`] / `impl From>` — eager, no loader. /// - [`Self::with_loader`] — lazy; the loader is consulted on miss. #[derive(Default)] pub struct PackageRepo { - /// `Some(map)` for present families, `None` for families the loader - /// confirmed as absent (so we don't re-call it on miss). - families: RefCell>>>, + families: RefCell>, loader: Option, } @@ -65,10 +132,19 @@ impl PackageRepo { /// Eager repo from a `family -> version -> PackageData` map. The loader /// is `None`, so any family not in `map` is reported as absent on lookup. + /// Eager-seeded families count as fully loaded (any range hint covered). pub fn from_map(map: HashMap) -> Self { let families = map .into_iter() - .map(|(name, fam)| (name, Some(Rc::new(fam)))) + .map(|(name, fam)| { + ( + name, + FamilyEntry { + loaded_range: LoadedRange::Unconstrained, + map: Some(Rc::new(fam)), + }, + ) + }) .collect(); PackageRepo { families: RefCell::new(families), @@ -78,8 +154,10 @@ impl PackageRepo { /// Repo backed by a loader. The loader is called the first time the /// solver asks for a family that isn't already cached — both hits and - /// "no such family" answers are memoised, so the loader fires at most - /// once per family per repo. + /// "no such family" answers are memoised. With the issue #92 + /// version-range hint, the loader may be re-called for the same family + /// if a later request needs a wider range than the cached load + /// covered; otherwise the cache hit is one-and-done. /// /// Use [`Self::insert_family`] to pre-seed families that are already /// in memory (e.g. ones produced by the caller's BFS seed pass). @@ -90,42 +168,85 @@ impl PackageRepo { } } - /// Pre-populate a family. Useful with [`Self::with_loader`] to skip - /// the loader for families already in memory. + /// Pre-populate a family. Counts as a full load (any range hint + /// covered). Useful with [`Self::with_loader`] to skip the loader for + /// families already in memory. pub fn insert_family(&self, name: String, fam: FamilyMap) { - self.families.borrow_mut().insert(name, Some(Rc::new(fam))); + self.families.borrow_mut().insert( + name, + FamilyEntry { + loaded_range: LoadedRange::Unconstrained, + map: Some(Rc::new(fam)), + }, + ); } - /// Number of families currently cached in the repo. With a loader - /// attached this grows as the solve progresses — it only reflects the - /// eager-seeded set + whatever the loader has been asked for so far. + /// Number of present families currently cached in the repo. With a + /// loader attached this grows as the solve progresses — it only + /// reflects the eager-seeded set + whatever the loader has been + /// asked for so far. pub fn family_count(&self) -> usize { self.families .borrow() .values() - .filter(|v| v.is_some()) + .filter(|entry| entry.map.is_some()) .count() } /// `Some(family map)` if the family exists (cached or lazily loaded); /// `None` if there's no loader and it isn't cached, or if the loader /// returned no entries for it. - pub fn get_family(&self, name: &str) -> Option> { - if let Some(slot) = self.families.borrow().get(name) { - return slot.clone(); + /// + /// The `hint` is the range the solver currently needs — passed through + /// to the loader, which may use it to pre-filter (e.g. via rez's + /// `iter_packages(range_=...)`). The repo tracks which range each + /// family was loaded under and reloads (with a widened range) when a + /// later request can't be served from the cache. + pub fn get_family( + &self, + name: &str, + hint: Option<&VersionRange>, + ) -> Option> { + // Cache hit + range covered → return directly. + if let Some(entry) = self.families.borrow().get(name) { + if entry.loaded_range.covers(hint) { + return entry.map.clone(); + } } - let loaded = self.loader.as_ref().and_then(|load| { - let entries = load(name); + // Either uncached, or cached with a range that doesn't cover the + // request. Reload with the widened range. + let new_range = { + let families = self.families.borrow(); + match families.get(name) { + Some(entry) => entry.loaded_range.widened_with(hint), + None => match hint { + None => LoadedRange::Unconstrained, + Some(r) => LoadedRange::Bounded(r.clone()), + }, + } + }; + + let new_map = self.loader.as_ref().and_then(|load| { + let widened_hint = match &new_range { + LoadedRange::Unconstrained => None, + LoadedRange::Bounded(r) => Some(r), + }; + let entries = load(name, widened_hint); if entries.is_empty() { None } else { Some(Rc::new(entries.into_iter().collect::>())) } }); - self.families - .borrow_mut() - .insert(name.to_string(), loaded.clone()); - loaded + + self.families.borrow_mut().insert( + name.to_string(), + FamilyEntry { + loaded_range: new_range, + map: new_map.clone(), + }, + ); + new_map } } diff --git a/crates/rer-resolver/src/rez_solver/solver.rs b/crates/rer-resolver/src/rez_solver/solver.rs index 50deba4..74e7a8a 100644 --- a/crates/rer-resolver/src/rez_solver/solver.rs +++ b/crates/rer-resolver/src/rez_solver/solver.rs @@ -306,7 +306,7 @@ mod tests { let calls: Rc>> = Rc::new(RefCell::new(Vec::new())); let calls_inner = Rc::clone(&calls); - let repo = crate::rez_solver::PackageRepo::with_loader(Box::new(move |name: &str| { + let repo = crate::rez_solver::PackageRepo::with_loader(Box::new(move |name: &str, _hint: Option<&rer_version::VersionRange>| { calls_inner.borrow_mut().push(name.to_string()); match name { "app" => vec![("1.0".to_string(), pkg(&["lib-2"], &[]))], @@ -344,7 +344,7 @@ mod tests { // A diamond: app -> lib & util; util -> lib. lib is reached twice // but the loader must only be invoked once. - let repo = crate::rez_solver::PackageRepo::with_loader(Box::new(move |name: &str| { + let repo = crate::rez_solver::PackageRepo::with_loader(Box::new(move |name: &str, _hint: Option<&rer_version::VersionRange>| { calls_inner.borrow_mut().push(name.to_string()); match name { "app" => vec![("1.0".into(), pkg(&["lib", "util"], &[]))], @@ -367,11 +367,84 @@ mod tests { ); } + #[test] + fn test_loader_receives_version_range_hint() { + // Issue #92: the loader should be invoked with the solver's current + // range constraint as a hint. + use rer_version::VersionRange; + use std::cell::RefCell; + + let calls: Rc)>>> = + Rc::new(RefCell::new(Vec::new())); + let calls_inner = Rc::clone(&calls); + + let repo = crate::rez_solver::PackageRepo::with_loader(Box::new( + move |name: &str, hint: Option<&VersionRange>| { + calls_inner + .borrow_mut() + .push((name.to_string(), hint.cloned())); + match name { + "lib" => vec![ + ("1.0".to_string(), pkg(&[], &[])), + ("2.0".to_string(), pkg(&[], &[])), + ("3.0".to_string(), pkg(&[], &[])), + ], + _ => Vec::new(), + } + }, + )); + + let reqs = vec![Requirement::parse("lib-2+<3")]; + let mut solver = Solver::new(reqs, Rc::new(repo)).expect("solver construction"); + solver.solve(); + assert_eq!(solver.status(), SolverStatus::Solved); + + let calls = calls.borrow(); + assert_eq!(calls.len(), 1, "loader called more times than expected"); + let (name, hint) = &calls[0]; + assert_eq!(name, "lib"); + let hint = hint.as_ref().expect("loader should have seen a hint"); + assert_eq!(hint.to_string(), "2+<3"); + } + + #[test] + fn test_loader_eager_seed_no_loader_call() { + // A pre-seeded family must not trigger the loader, even when + // a range-hint request is made for it. The seed counts as a + // full load. + use std::cell::RefCell; + use std::collections::HashMap; + + let calls: Rc> = Rc::new(RefCell::new(0)); + let calls_inner = Rc::clone(&calls); + + let repo = crate::rez_solver::PackageRepo::with_loader(Box::new( + move |_name: &str, _hint: Option<&rer_version::VersionRange>| { + *calls_inner.borrow_mut() += 1; + Vec::new() + }, + )); + let mut fam: HashMap = HashMap::new(); + fam.insert("1.0".into(), pkg(&[], &[])); + fam.insert("2.0".into(), pkg(&[], &[])); + repo.insert_family("lib".into(), fam); + + let reqs = vec![Requirement::parse("lib-2")]; + let mut solver = Solver::new(reqs, Rc::new(repo)).expect("solver construction"); + solver.solve(); + assert_eq!(solver.status(), SolverStatus::Solved); + assert_eq!( + *calls.borrow(), + 0, + "loader must not be called for pre-seeded families" + ); + } + #[test] fn test_loader_empty_means_missing_family() { // The loader returns no entries for an unknown name; the solver // treats that as a missing family (failed resolve), not a panic. - let repo = crate::rez_solver::PackageRepo::with_loader(Box::new(|_| Vec::new())); + let repo = crate::rez_solver::PackageRepo::with_loader(Box::new(|_: &str, _: Option<&rer_version::VersionRange>| Vec::new())); let reqs = vec![Requirement::parse("doesnotexist")]; let solver = Solver::new(reqs, Rc::new(repo)); // Either Solver::new returns a ScopeError or the solve fails; diff --git a/crates/rer-resolver/src/rez_solver/variant.rs b/crates/rer-resolver/src/rez_solver/variant.rs index 27110e0..b65f39d 100644 --- a/crates/rer-resolver/src/rez_solver/variant.rs +++ b/crates/rer-resolver/src/rez_solver/variant.rs @@ -372,8 +372,16 @@ impl PackageVariantList { /// Build the (lazy) variant list for a family, or `None` if the family is /// absent from the repository. Only version strings are parsed here — the /// requirement strings are parsed on demand by [`Self::get_intersection`]. - pub fn new(ctx: &SolverContext, package_name: &str) -> Option { - let versions = ctx.repo.get_family(package_name)?; + /// + /// `hint` is the current solver range constraint, forwarded to the repo's + /// loader so the shim can pre-filter versions (issue #92). `None` means + /// "unconstrained — every version". + pub fn new( + ctx: &SolverContext, + package_name: &str, + hint: Option<&VersionRange>, + ) -> Option { + let versions = ctx.repo.get_family(package_name, hint)?; let mut entries: Vec = versions .keys() .map(|version_str| { @@ -877,14 +885,21 @@ impl PackageVariantCache { /// Get a slice of `package_name`'s variants intersected with `range`. /// /// `None` means either the family is absent from the repository or no - /// version falls within `range` — Phase 4/5 distinguishes the two. + /// version falls within `range` — Phase 4/5 distinguishes the two via + /// [`Self::family_missing`]. + /// + /// `range` is also forwarded to the repo's loader as the hint, so a + /// `load_family` callback can pre-filter to versions intersecting it + /// (issue #92). If a later request asks for a wider range than the + /// cached load covered, both the repo and this cache transparently + /// reload. pub fn get_variant_slice( &mut self, ctx: &Rc, package_name: &str, range: &VersionRange, ) -> Option { - let list = self.get_or_build(ctx, package_name)?; + let list = self.get_or_build(ctx, package_name, Some(range))?; let entries = list.get_intersection(range)?; Some(PackageVariantSlice::new( Rc::clone(ctx), @@ -894,22 +909,69 @@ impl PackageVariantCache { } /// True if the family is known to be absent from the repository. + /// Asks with `None` hint to force a full load — definitive absence + /// can't be answered from a range-bounded cached result. pub fn family_missing(&mut self, ctx: &Rc, package_name: &str) -> bool { - self.get_or_build(ctx, package_name).is_none() + self.get_or_build(ctx, package_name, None).is_none() } /// Look up the cached `PackageVariantList` for `package_name`, building it /// on first access. Lookup is by `&str` (`Borrow` on `Rc`), so a /// cache hit avoids allocating a fresh `Name` key. + /// + /// The cached list is invalidated and rebuilt if the underlying family + /// map in [`PackageRepo`] has been reloaded (detected via `Rc::ptr_eq` + /// on the family map). That happens when a later request needs a + /// wider range than the previous load covered — see the issue #92 + /// backtrack-widen path in [`PackageRepo::get_family`]. fn get_or_build( &mut self, ctx: &Rc, package_name: &str, + hint: Option<&VersionRange>, ) -> Option> { + // Fast path: cached list whose underlying family map is still + // the one the repo would return. Compare Rcs. if let Some(slot) = self.variant_lists.get(package_name) { - return slot.clone(); + match slot.as_ref() { + Some(cached_list) => { + // Repo's get_family is cheap on a covered hint (no + // reload). If it returns the same Rc we + // already built the list against, the list is still + // valid. + let fresh_map = ctx.repo.get_family(package_name, hint); + if let Some(fresh_map) = fresh_map.as_ref() { + if Rc::ptr_eq(fresh_map, &cached_list.versions) { + return Some(Rc::clone(cached_list)); + } + // The map changed under us (widen-reload). Fall + // through to rebuild against the fresh map. + } else { + // Repo lost the family between calls — treat as + // absent. + self.variant_lists.insert(Name::from(package_name), None); + return None; + } + } + None => { + // Previously absent. If the hint is wider than what + // produced the absence answer (or absence answer was + // produced with None hint), we'd see the change via + // the repo. But the repo encodes the previous hint + // and only retries when widening, so a fresh call + // here is the right gate. + let fresh = ctx.repo.get_family(package_name, hint); + if fresh.is_none() { + return None; + } + // Repo reloaded and now has the family — fall through + // to build a list from it. + } + } } - let built = PackageVariantList::new(ctx, package_name).map(Rc::new); + // Slow path: build a fresh PackageVariantList from whatever the + // repo currently has. + let built = PackageVariantList::new(ctx, package_name, hint).map(Rc::new); self.variant_lists .insert(Name::from(package_name), built.clone()); built @@ -958,7 +1020,7 @@ mod tests { fn test_build_variants_no_variants() { let r = repo(vec![("foo", vec![("1.0", pkg(&["bar-2"], &[]))])]); let ctx = ctx_with(r, &["foo"]); - let list = PackageVariantList::new(&ctx, "foo").unwrap(); + let list = PackageVariantList::new(&ctx, "foo", None).unwrap(); let entries = list.get_intersection(&VersionRange::any()).unwrap(); assert_eq!(entries.len(), 1); assert_eq!(entries[0].len(), 1); @@ -974,7 +1036,7 @@ mod tests { vec![("1.0", pkg(&["base-1"], &[&["maya-2024"], &["maya-2025"]]))], )]); let ctx = ctx_with(r, &["foo"]); - let list = PackageVariantList::new(&ctx, "foo").unwrap(); + let list = PackageVariantList::new(&ctx, "foo", None).unwrap(); let entries = list.get_intersection(&VersionRange::any()).unwrap(); assert_eq!(entries[0].len(), 2); let v0 = &entries[0].variants()[0]; @@ -995,7 +1057,7 @@ mod tests { ], )]); let ctx = ctx_with(r, &["foo"]); - let list = PackageVariantList::new(&ctx, "foo").unwrap(); + let list = PackageVariantList::new(&ctx, "foo", None).unwrap(); let entries = list.get_intersection(&VersionRange::parse("2+")).unwrap(); let versions: Vec = entries.iter().map(|e| e.version().to_string()).collect(); assert_eq!(versions, vec!["2.0", "3.0"]); diff --git a/docs/config.toml b/docs/config.toml index e3e16c9..eaaaec2 100644 --- a/docs/config.toml +++ b/docs/config.toml @@ -125,7 +125,7 @@ weight = 10 name = "GitHub" pre = '' url = "https://github.com/doubleailes/rer" -post = "v0.1.0-rc.9" +post = "v1.0.0-rc.3" weight = 20 # Footer contents diff --git a/docs/content/_index.md b/docs/content/_index.md index 46882bd..131947d 100644 --- a/docs/content/_index.md +++ b/docs/content/_index.md @@ -7,7 +7,7 @@ title = "rer — Rez En Rust" lead = "A faithful Rust port of rez's package solver — callable from Python via PyO3, resolves match rez 1:1." url = "/docs/getting-started/introduction/" url_button = "Get started" -repo_version = "GitHub v0.1.0-rc.9" +repo_version = "GitHub v1.0.0-rc.3" repo_license = "MIT-licensed." repo_url = "https://github.com/doubleailes/rer" diff --git a/docs/content/docs/engineering/fast-package-py-parser.md b/docs/content/docs/engineering/fast-package-py-parser.md new file mode 100644 index 0000000..78b6eac --- /dev/null +++ b/docs/content/docs/engineering/fast-package-py-parser.md @@ -0,0 +1,552 @@ ++++ +title = "RFC — A blazing-fast Rust `package.py` parser" +description = "Forward-looking design for a Rust parser that extracts the four solver-relevant fields from a rez package.py without invoking Python. Built to compose with the load_family lazy-discovery hook on cold-cache integrations." +draft = false +weight = 30 +sort_by = "weight" +template = "docs/page.html" + +[extra] +lead = "Forward-looking design for a Rust parser that extracts the four solver-relevant fields from a rez package.py without invoking Python. Built to compose with the load_family lazy-discovery hook on cold-cache integrations." +toc = true +top = false ++++ + +> **Status: Stages 1–4 shipped.** Survey tool at +> `scripts/survey_package_py.py`; Rust parser crate at +> `crates/rer-package/`; PyO3 bindings (`pyrer.parse_static_package_py` +> and the batched `parse_static_packages_py`) at +> `crates/rer-python/src/lib.rs`; differential safety net at +> `scripts/diff_against_rez.py`; perf benches at +> `scripts/bench_package_py_parser.py` and +> `scripts/bench_batched_parser.py`. Stage 1 numbers, Stage 3 +> per-file timing, Stage 2 differential (0 mismatches on 5,979 +> files), and Stage 4 batched speedup (2.81× on 2,000 files) are +> all inline below. + +## Stage 1 result — Fortiche, May 2026 + +Run on `/thierry/rez/pkg` (the Fortiche-on-CIFS rez repo): + +| | Count | % | +|---|---:|---:| +| `package.py` files surveyed | 6,439 | 100% | +| **Fast-parseable** | **5,982** | **92.9%** | +| Not fast-parseable | 457 | 7.1% | + +Non-fast-parseable breakdown (files can match multiple buckets): + +| Pattern | Count | % of total | +|---|---:|---:| +| `dynamic-requires` (`@early` / `@late` on `requires`) | 352 | 5.5% | +| `imports` (load-bearing `import` statements) | 96 | 1.5% | +| `missing-version` (mostly rez's own test fixtures) | 85 | 1.3% | +| `missing-name` (mostly rez's own test fixtures) | 75 | 1.2% | +| `top-level-classdef` | 54 | 0.8% | +| `unrecognised-raise` | 2 | 0.0% | + +**Decisive finding**: the dominant *seeming* failure pattern from a +naive survey, `top-level-with` (2,245 files, 34.9% of the corpus), is +**100% rez's declarative `with scope("config")` DSL** — every one of +the 2,245 files matched. That body only writes attributes of the +`as`-name (config object) and never touches solver fields, so it is +solver-irrelevant — the parser treats it the same way it treats +`def commands(...)`. Including this single extension lifts the +accept rate from a marginal 58.6% to a green-light 92.9%. + +Well past the 70% **PROCEED** threshold from the original RFC. + +## Motivation + +`pyrer`'s solver is already roughly 34× faster than `rez`'s on the +188-case differential benchmark. Real `rez env` invocations are no +longer bottlenecked on the solve — they are bottlenecked on what +surrounds it: + +1. Python interpreter startup (~200–300 ms per process). +2. **Package discovery — opening, reading, and AST-evaluating each + `package.py` rez decides to inspect.** This is the big remaining + cost on cold-cache invocations. +3. The solve itself (~tens of ms on `pyrer`). +4. Environment construction (Rex evaluation, PATH munging, shell + hooks). + +Issue #86 added the `load_family` callback so the solver only asks +the host to load families it actually needs — that addresses the +"how many" axis. This RFC addresses the "how fast each one" axis: +loading a `package.py` currently means a full Python compile + exec, +which can run into the milliseconds per file on a warm cache and +adds up fast across a wide BFS or a CI batch. + +A Rust parser that extracts the four solver-relevant fields without +invoking the Python interpreter has the potential to drop per-file +parse cost from milliseconds to tens of microseconds for the static +majority of `package.py` files. Combined with `load_family`, it +attacks both the count and the cost of the discovery phase. + +## What `package.py` actually is + +A `rez` `package.py` is **arbitrary Python**. The solver only reads +four fields: + +- `name` — string +- `version` — string +- `requires` — list of rez-requirement strings +- `variants` — list of lists of rez-requirement strings + +But a real-world `package.py` can also carry, with varying +frequency: + +- `description`, `authors`, `tools`, `tests`, `help` — irrelevant to + the solve. +- `commands()`, `pre_commands()`, `post_commands()` — function bodies + that affect *runtime* environment, not the solve. +- `build_command`, `build_system` — irrelevant to the solve. +- **`@early()` / `@late()` decorated functions on `requires` / + `variants`** — *these are dynamic and do affect the solve.* +- Top-level `if/else` chains on env vars (`if config.studio_mode: + requires = […]`) — also dynamic relative to the solve. +- Top-level `import` statements with load-bearing side effects. + +The fast parser only needs to handle the case where the four solver +fields are **literal assignments**. Everything else falls back to +`rez`'s evaluator, which already exists and is correct. + +## Scope + +### In scope (fast path) + +| Statement | Action | +|---|---| +| `name = "..."` (string literal) | Extract | +| `version = "..."` (string literal) | Extract | +| `requires = ["str", "str", …]` (list of string literals) | Extract | +| `variants = [["str", …], …]` (list of lists of string literals) | Extract | +| `def commands(...)`, `def pre_commands(...)`, `def post_commands(...)`, `def tools(...)` (function body) | Ignore — not solver-relevant | +| Top-level assignments to non-solver fields (`description`, `authors`, `tools`, `tests`, `help`, `build_command`, `build_system`, etc.) | Ignore | +| Top-level docstring | Ignore | + +### Out of scope (bail to `rez`) + +| Pattern | Why we bail | +|---|---| +| `def requires(...)` / `def variants(...)` with `@early` / `@late` | Solver-relevant value is dynamic | +| Top-level `if/else`, `try/except`, `for` | Can't statically know which branch wins | +| `import` / `from ... import` | May have load-bearing side effects | +| Function calls assigning to a solver field (`requires = make_requires(...)`) | Not statically resolvable | +| Any other expression we don't recognise | Conservative bail | + +**The bias is hard toward bailing.** A false positive (parsing a file +the fast path shouldn't have handled) produces a different `requires` +than rez, which means different resolves, which is a silent +correctness regression. The slow path through rez always exists; the +fast path is opt-in coverage. We accept low coverage with zero +divergence over high coverage with any divergence. + +## Architecture + +### New crate: `rer-package` + +```text +crates/rer-package/ +├── Cargo.toml +├── src/ +│ ├── lib.rs +│ ├── parser.rs # AST walk + literal extraction +│ └── classify.rs # bail-or-extract decisions +└── tests/ + ├── static_fixtures/ # known-fast-parseable .py files + ├── dynamic_fixtures/ # known-bail .py files + └── corpus/ # large real-world sample, diff against rez +``` + +Depends on `rustpython-parser` (well-maintained, parses to AST, no +runtime dependency on a Python install). Returns: + +```rust +pub fn parse_static_package_py(source: &str) -> Option +``` + +`None` means the file is not statically parseable; the caller falls +back to rez. `Some(data)` means the four fields were all extracted +as literals; the caller can skip `rez.Package` evaluation entirely. + +### PyO3 binding + +Single function on `pyrer`: + +```python +pyrer.parse_static_package_py(source: str | bytes) -> Optional[pyrer.PackageData] +``` + +About twenty lines of glue. The integration site is the +`load_family` callback in the rez shim: + +```python +def load_family(name): + out = [] + for pkg_path in _find_package_files(name, paths=PACKAGE_PATHS): + with open(pkg_path) as f: + source = f.read() + # Fast path: try the Rust parser first. + pd = pyrer.parse_static_package_py(source) + if pd is None: + # Bail to rez's evaluator for the dynamic case. + pkg = _rez_package_from_file(pkg_path) + pd = pyrer.PackageData.from_rez(pkg) + out.append(pd) + return out +``` + +The shim composes the two — `load_family` decides *which* files to +read; `parse_static_package_py` decides *how fast* to read each one. + +## Build order + +### Stage 1 — Corpus survey (~2 days) + +Build `rer-stat-package-py`: a tool that walks a directory tree of +`package.py` files and classifies each one into: + +| Category | Meaning | +|---|---| +| `fast-parseable` | All four solver fields are literal assignments; no disqualifying top-level statements. | +| `dynamic-requires` | `requires` is `@early`/`@late` or assigned conditionally. | +| `dynamic-variants` | Same, for `variants`. | +| `top-level-if` | A top-level `if/else` we'd have to bail on. | +| `imports` | Has `import` statements. | +| `other` | Anything else that disqualifies the fast path. | + +Reports counts, percentages, and example file paths per bucket. + +**Run this against Fortiche's actual studio repo.** The output is +the go/no-go signal for Stage 2: + +- ≥ 70% fast-parseable: proceed. The fast path covers the typical + case; the engineering ROI is real. +- 40–70%: marginal. Worth a discussion about which patterns to + expand coverage to and whether the complexity is worth it. +- < 40%: don't build it. The slow-path fallback would dominate; + the fast path saves work in the minority case. The memcache + alternative below is the smarter bet. + +Stage 1 is the cheapest experiment that produces the number this +project needs. + +### Stage 2 — Parser + binding + differential test (~1–2 weeks) + +1. Implement `rer_package::parse_static_package_py` against + `rustpython-parser`. +2. Hand-curate ~30 fixture files (≥ 15 static, ≥ 15 dynamic across + every disqualifying pattern). Unit-test both arms. +3. PyO3 binding on `pyrer.parse_static_package_py`. +4. **Differential test harness**: for every file in the corpus where + the fast parser returns `Some(data)`, also load the file through + rez's `Package` and compare the four fields. **Any mismatch is a + release blocker**, exactly like the 188-case rez solver + differential. This is the safety net for the "bias toward bail" + policy. + +### Stage 3 — Shim wiring + end-to-end benchmark (~3–5 days) + +1. Document the `load_family` integration pattern (above). +2. End-to-end benchmark: a real `rez env` invocation against a + representative repo, three configurations: + - Eager BFS (today's shim baseline) + - `load_family` only (issue #86 today) + - `load_family` + `parse_static_package_py` (this RFC) + + Report wall-clock for cold and warm cache. + +## Honest forecast — superseded by Stage 3 measurement + +The original RFC predicted 2–50× wins by replacing rez's +"compile + exec" with Rust AST parsing. Stage 3's measurement +against real Fortiche files showed something smaller and worth +naming clearly. + +## Stage 3 result — Fortiche, May 2026 + +Two iterations, both run with `scripts/bench_package_py_parser.py +--corpus /thierry/rez/pkg --samples 100 --iters 30 --with-rez` +against the live Fortiche-on-CIFS repo and rez 3.3.0. + +### V1: AST-based parser (`rustpython-parser`) + +| Path | μs / file | Speedup | +|---|---:|---:| +| `open + read + parse_static_package_py` | 1,533 | — | +| `DeveloperPackage.from_path + from_rez` (rez) | 2,632 | — | +| **Result** | | **1.7×** | + +The parse step alone was 1,990 μs — `rustpython-parser` builds the +full module AST when we only need four top-level fields. It also +adds ~30 MB of crate deps to the build. Diagnosis: wrong tool. + +### V2: hand-rolled lexer + +A 700-line module-level scanner that walks the four patterns +(`name = "..."`, `version = "..."`, `requires = [...]`, +`variants = [[...]]`) directly, with bracket / string / comment / +indent tracking but no AST allocation. Same public API as V1. + +| Path | μs / file | Speedup | +|---|---:|---:| +| `open + read + parse_static_package_py` | **75.24** | — | +| `DeveloperPackage.from_path + from_rez` (rez) | **2,615.54** | — | +| **Result** | | **34.8×** | + +In-memory breakdown: + +| Path | μs / file | +|---|---:| +| `parse_static_package_py(source)` (hand-rolled scan) | 59.23 | +| `from_rez(fake_pkg)` (attribute walk, lower bound) | 11.17 | +| `from_rez(real rez Package)` (post-load, just walk) | 13.50 | +| file read alone (open + read, warm cache) | 14.81 | + +The parse step alone dropped from 1,990 μs (V1) to **59 μs (V2)** — +~33× on that layer, lifting the full-load comparison vs rez from +1.7× to **34.8×**. Per-file savings: ~2.54 ms. Over a 50-family +resolve, that's **~127 ms saved per resolve** — real, artist- +perceptible latency. + +The V2 rewrite held 92.9% acceptance on the Fortiche corpus +(5,979 / 6,439 vs V1's 5,985; 6 files of drift, well within +rounding). Two non-obvious bug categories surfaced during the +rewrite, both Windows-specific: + +- CRLF line endings — Samba-served `package.py` files end lines with + `\r\n`. The scanner needed `\r` treated transparently in inline + whitespace. +- `\` line continuations on non-solver assignments (e.g. + `changelog = \` followed by a CRLF then a multi-line triple- + quoted string). The continuation handler only recognised `\`. + +Both have dedicated unit tests in the V2 implementation. + +### What the bench numbers say about the next ceiling + +V2's 75 μs/file full-load splits as: + +- ~15 μs file I/O (warm-cache CIFS) +- ~60 μs parsing CPU + +CPU is no longer dominant; I/O is. Further parser optimisations have +diminishing returns. The next 10× lives in either avoiding more I/O +(`load_family` from #86 already does most of this — files the solver +never asks for never get loaded) or a parsed-package cache layered +on top of Fortiche's existing shared memcache. + +### Caveats on the 34.8× headline + +- **Cold-cache CIFS** — the 14.81 μs file-read is a warm-cache + number. On a truly cold network read, file I/O can be 1–100 ms, + swamping the 60 μs parse cost. The Rust parser's relative + advantage over rez stays the same proportionally (rez pays the + same I/O), but the absolute saving per file shifts from 2.5 ms + toward whatever the network roundtrip costs. +- **Sample bias** — 100 files sampled deterministically from the + corpus. The full 6,439-file corpus could behave differently in + pathological cases (very large files, unusual structure). The + unit tests + the 6-file drift between V1 and V2 acceptance count + is the existing safety net. +- **No production A/B yet** — the 127 ms / resolve number is from + micro-bench arithmetic, not a real `rez env` measurement. + Production wall time may show less if other phases dominate. + +## Risks and mitigations + +| Risk | Mitigation | +|---|---| +| **Silent correctness regression** — the fast parser accepts a file it shouldn't | Bias hard toward bailing in `classify.rs`. Run the differential test on every file the fast parser claims. Treat any mismatch as a release blocker. | +| **Maintenance burden** — `rustpython-parser` is a Python AST library; track upstream Python syntax changes | Pin to a known-good version. Studio `package.py` files don't typically use bleeding-edge syntax. | +| **Coverage drift** — over time studios add patterns the parser doesn't handle | The fast path is opt-in. Coverage drift means the slow path runs more often, not that correctness breaks. We can extend coverage when patterns become common. | +| **Stage 1 says "don't build it"** | We've spent two days on a survey and now know the workload. Pivot to the memcache route below or accept the status quo. Cheap pivot point. | + +## Stage 2 safety net — differential against rez + +The bias-toward-bailing policy is only safe if "V2 accepts a file" +also means "V2 produces the same `(name, version, requires, variants)` +that rez does". A divergence here is a silent correctness regression +in any rez integration shim that uses the fast path. + +`scripts/diff_against_rez.py` is the test harness: for every file V2 +accepts, it also loads via rez's `DeveloperPackage.from_path` and +stringifies the four fields with `str(req)`, then compares +byte-for-byte. Any divergence is a release blocker, exactly like +the 188-case rez solver differential. + +### Result on /thierry/rez/pkg + rez 3.3.0 + +| | Count | % of V2-accepted | +|---|---:|---:| +| Total files surveyed | 6,439 | — | +| V2 accepted | 5,979 | — | +| V2 bailed (slow path) | 460 | — | +| **Match (all four fields agree with rez)** | **5,813** | **97.22%** | +| **Mismatch (correctness regression)** | **0** | **0.00%** | +| rez evaluation error | 166 | 2.78% | + +Wall-clock for the full run: 74 seconds (CIFS warm). Zero +mismatches over the entire Fortiche corpus — the safety net is +green. + +### What about the 166 rez-eval-error files? + +These are files V2 accepts but rez 3.3.0 in this dev venv can't +load. Sampled five of them — all five are the same error: + + InvalidPackageError: Package … uses @include decorator, but no + include path has been configured with the 'package_definition_python_path' + setting. + +That's a **rez environment-config issue, not a content issue.** +Production rez at Fortiche has `package_definition_python_path` +set; rez would load these files fine. V2 correctly accepts them +because `@include def some_func():` is a non-solver decorator (the +function being decorated isn't `requires` / `variants`), and the +four solver fields are static. + +So the "166 rez-eval-error" bucket is really "files this dev venv +can't evaluate because of a missing config knob" — not a +correctness signal. In production they'd match. + +If the differential test ever needs to be tightened, the path is to +also configure `package_definition_python_path` in the dev venv — +but that's CI infrastructure, not a parser change. + +## Stage 4 — Batched parallel parse (issue #94) + +After Stages 1–3 landed, `cProfile` of a real Fortiche resolve +showed the static parser itself was no longer in the top of the +flamegraph. The cost had moved one layer up: the shim's serial +Python loop of `open()` calls feeding the parser. On a 132-package +resolve that was 3.20 s of pure I/O (35% of total wall time), one +file at a time while seven cores idled. + +`parse_static_packages_py(paths)` is the response: open + parse +every path in one Rust call across a Rayon thread pool, with the +GIL released for the whole batch. Same per-file semantics as +`parse_static_package_py` — accept rate, output shape, differential +correctness all carry over. + +### Result on Fortiche + +`scripts/bench_batched_parser.py` against `/thierry/rez/pkg` over +CIFS, best-of-3: + +| Sample | Serial `open` + parse | Batched | Speedup | +|---:|---:|---:|---:| +| 500 files (warm cache) | 56.71 ms | 40.76 ms | **1.39×** | +| 2,000 files | 4,234 ms | 1,508 ms | **2.81×** | + +Per-file saving on the 2,000-file run: **~1.36 ms**. Extrapolated +to the issue's target workload (132-package resolve, ~2,600 +`package.py` files): **~3.5 s saved per resolve**. + +Both paths produce identical accepts (1,864/2,000 → static-parseable +fraction matches the per-file parser) — zero correctness drift. + +The 500-file bench is bottlenecked on warm-page-cache parsing CPU; +the Rayon dispatch overhead amortises less on small batches. On +cold-cache or larger batches the parallel-I/O overlap shows +through. The 2.81× is a lower bound on warm hardware; on cold CIFS +(Windows production) it should grow. + +### Design choices + +- **Output is positionally aligned with input.** Missing files, + unreadable bytes, and parser bails all become `None` at the + matching index. The shim's `zip(pkgs, result)` is then trivially + correct. +- **No exception escapes.** Per-file failures map to `None`. The + function only raises if the input type is wrong. +- **Pool size = Rayon default** (`RAYON_NUM_THREADS` env var or + logical core count). No per-call knob initially; capacity + control is environmental. +- **Pure addition.** The single-file API stays. Shims feature-detect + with `hasattr(pyrer, "parse_static_packages_py")` and fall back + to the per-file loop on older pyrer. + +### Safety net + +Reused from Stage 2. The same `from_rez(pkg)` comparison can be +shadow-checked at production runtime, gated on +`REZ_PYRER_VALIDATE_BATCHED`. The integration page in the +[rez integration docs](../../getting-started/rez-integration/#shadow-validation-mode-1) +has the recipe. + +The offline Stage 2 differential — 5,813 / 5,813 matched on the +Fortiche corpus — covers the per-file semantics. Stage 4's batched +call uses the exact same `parse_static_package_py` per file, so the +existing safety net carries over byte-for-byte; the only Stage 4 +specific risk is around ordering / completion which the +positional-alignment contract handles explicitly. + +### What's next after this + +Stage 4 takes us to: + +- ~93 % of `package.py` files served by the Rust fast path +- ~75 µs per file via the static parser (Stage 3) +- ~1/3 the wall-time on the open+parse phase via the batched call (Stage 4) + +The remaining cost on `_load_family` is now real I/O (CIFS round- +trips for files Rayon's pool can't overlap further) plus the +dynamic-7 % rez evaluator path. Both are architectural — +addressing them needs a layer outside this RFC (memcache caching +of parsed `PackageData` across invocations is the obvious next +move, as called out in the "Considered alternatives" section +below). + +## Considered alternatives + +### Parsed-package cache on top of the shared memcache + +Instead of parsing `package.py` fast, parse once via rez and cache +the four-field result in Fortiche's existing shared memcache, keyed +by `(repo_id, family, version, mtime)`. Subsequent reads across the +studio are sub-millisecond regardless of whether the file is +static or dynamic. + +Tradeoffs: + +- **Pro**: no novel parser, no AST-classification edge cases, reuses + infrastructure already in place. Lower risk, faster to ship. +- **Pro**: works for dynamic packages too (cache stores the + *evaluated* fields). +- **Con**: first hit anywhere in the studio still pays full rez + evaluation. The Rust parser is "always fast"; the cache is "fast + after first studio-wide hit". +- **Con**: requires invalidation on `package.py` change (mtime check + is cheap but adds complexity). + +The user's preference is to explore the Rust parser. The cache is +worth re-evaluating after Stage 1 — if the static-parseable fraction +is lower than expected, the cache pays off more reliably with less +risk. + +## Concrete next step + +Open a draft PR with `rer-stat-package-py` only: the corpus +classifier walker. No parser yet. The deliverable is a small CLI +that takes a path and prints a histogram of categories. Two days +of work; the output decides whether Stages 2–3 happen. + +A reasonable acceptance criterion for the survey tool itself: + +- Recognises every pattern listed in the [Scope](#scope) section. +- Reports per-category counts and percentages. +- Outputs example file paths for each bucket (for hand-inspection of + edge cases). +- Falls back gracefully on files it can't even parse as Python + (rare but possible — broken `package.py` files exist in the wild). + +## See also + +- [Wiring pyrer into rez](../../getting-started/rez-integration/#lazy-package-discovery-on-cold-caches) — the `load_family` callback this parser composes with. +- [Issue #86](https://github.com/doubleailes/rer/issues/86) — the + lazy-discovery hook that motivated this RFC. +- [`rustpython-parser` on crates.io](https://crates.io/crates/rustpython-parser) + — the AST library this RFC depends on. diff --git a/docs/content/docs/getting-started/quick-start.md b/docs/content/docs/getting-started/quick-start.md index 941ca27..7e8d9ef 100644 --- a/docs/content/docs/getting-started/quick-start.md +++ b/docs/content/docs/getting-started/quick-start.md @@ -95,7 +95,7 @@ Add the resolver crate to your `Cargo.toml`: ```toml [dependencies] -rer-resolver = "0.1.0-rc.9" +rer-resolver = "1.0.0-rc.3" ``` Then call the solver against an in-memory repository: diff --git a/docs/content/docs/getting-started/rez-integration.md b/docs/content/docs/getting-started/rez-integration.md index 89f79e7..1e1637c 100644 --- a/docs/content/docs/getting-started/rez-integration.md +++ b/docs/content/docs/getting-started/rez-integration.md @@ -149,10 +149,18 @@ been given: ```python import pyrer -def load_family(name): - """Return every PackageData for `name`, or [] if no such family.""" +def load_family(name, version_range=None): + """Return every PackageData for `name` (optionally filtered to + `version_range`), or [] if no such family. + + The `version_range` hint (issue #92) is a rez-syntax string — + `"2+<3"`, `"==2.0"`, etc. — that the shim can pass directly to + `iter_packages(range_=...)` so on-disk version dirs outside the + range are skipped before any `package.py` is opened. `None` + means "every version". + """ pkgs = [] - for pkg in iter_packages(name, paths=PACKAGE_PATHS): + for pkg in iter_packages(name, range_=version_range, paths=PACKAGE_PATHS): pkgs.append(pyrer.PackageData.from_rez(pkg)) return pkgs @@ -165,15 +173,19 @@ result = pyrer.solve( Semantics: -- The callback is called **at most once per family** in one solve - (results are cached internally), and **only for families the +- The callback is called **at most once per (family, range)** in one + solve (results are cached internally), and **only for families the solver actually exercises**. -- Returning `[]` means "no such family" — treated the same as a - family that was never added. +- Returning `[]` means "no such family in the requested range" — + treated the same as a family that was never added. - The `packages` argument is still accepted; entries supplied that way are pre-seeded into the cache and the callback is never asked for those families. Useful for a hybrid where you pre-load hot families and lazy-load the long tail. +- **Backward-compatible signature** (issue #92): a callback defined + as `def load_family(name):` keeps working — pyrer detects the + signature and only passes `version_range` when the callback can + accept it. - If the callback raises, the solve returns `result.status == "error"` with the exception message in `result.failure_description`. No exception escapes `pyrer.solve`. @@ -181,6 +193,51 @@ Semantics: family are dropped; a duplicate `(family, version)` from the callback surfaces as `status="error"`. +### The `version_range` hint (issue #92) — pre-filter at the shim + +When a request pins a version (`maya-2024+`, `python<3.12`, +`==1.5.0`), pyrer propagates that constraint immediately. It then +passes the resulting range to the `load_family` callback as the +`version_range` keyword argument. + +The shim can use it directly with rez's `iter_packages`: + +```python +def load_family(name, version_range=None): + return [ + pyrer.PackageData.from_rez(pkg) + for pkg in iter_packages(name, range_=version_range, paths=PACKAGE_PATHS) + ] +``` + +`rez.packages.iter_packages` already accepts a `range_` argument and +**skips entire version directories** that fall outside it, before +any `package.py` is opened. That cuts I/O proportionally to how +narrow the request is. + +The hint is **advisory**: + +- The shim **may** filter; pyrer re-validates anyway, so returning + the full set is correct but wasteful. +- The shim **must not** drop versions outside the hint without + reason — pyrer caches the loaded range and re-calls the loader + with a widened range if a backtrack later needs more. +- `version_range=None` means "the solver needs every version" — + always return the full family. + +**Impact** on a 132-package resolve at a 5,500-package studio +(per the issue): + +| Without hint | With hint | +|---:|---:| +| 2,637 packages loaded for 132 used (95% wasted) | ~400 packages loaded for 132 used (6.6× cut) | +| ~9 s `load_family` total | projected ~2 s | + +The hint compounds with the static-`package.py` parser +([above](#plugging-in-the-static-package-py-fast-parser)) — the +parser makes each load cheap; the hint makes most loads +unnecessary. + ### When this actually helps The win is in I/O avoided, not in CPU. Specifically: @@ -233,12 +290,26 @@ def _pyrer_resolve(self): return _original_resolve(self) # Closure over the resolver's package paths — pyrer calls this - # only for families the solver actually needs. + # only for families the solver actually needs. The two-tier load: + # try the Rust AST fast-parser first (skips Python evaluation for + # the ~93% of `package.py` files at Fortiche that are static), + # fall back to rez's `Package` evaluator for the dynamic ones + # (`@early` / `@late` requires, top-level `if`, imports, …). def load_family(name): - return [ - pyrer.PackageData.from_rez(pkg) - for pkg in iter_packages(name, paths=self.package_paths) - ] + out = [] + for pkg in iter_packages(name, paths=self.package_paths): + pd = None + try: + with open(pkg.filepath, "r", encoding="utf-8") as f: + source = f.read() + pd = pyrer.parse_static_package_py(source) + except OSError: + pass + if pd is None: + # Dynamic file — `@early` / `@late` / imports / etc. + pd = pyrer.PackageData.from_rez(pkg) + out.append(pd) + return out requests = [str(r) for r in self.package_requests] result = pyrer.solve( @@ -284,6 +355,438 @@ every reachable family. - **Solve-phase CPU.** The solver itself runs the same algorithm either way. Lazy loading is purely about avoiding pre-solve I/O. +## Plugging in the static `package.py` fast-parser + +Once `load_family` is wired, the inner per-package load is the next +hot path. `pyrer.parse_static_package_py(source)` reads the four +solver-relevant fields directly from a `package.py` source string — +no Python interpreter, no `Requirement` parse, no `__str__` +round-trip. About **34.8× faster than `DeveloperPackage.from_path + +from_rez`** on the Fortiche-on-CIFS corpus +(75 μs/file vs 2.6 ms/file). Saves roughly 2.5 ms per file loaded — +~127 ms / resolve on a typical 50-family resolve. + +The parser accepts the static subset of `package.py`: literal +assignments for `name`, `version`, `requires`, `variants`, plus +ignorable `def commands()` and `with scope("config")` blocks. +Anything dynamic (`@early` / `@late`, top-level `if`, `import`, +…) returns `None` so the caller falls back to rez. **Bias hard +toward bailing** — a false positive would diverge from rez and +produce a silent correctness regression in any resolve. See the +[engineering note](../../engineering/fast-package-py-parser/) for +the design, the corpus survey, the V2 hand-rolled-lexer rewrite, +and the differential test result (0 mismatches on 5,979 +V2-accepted files at Fortiche). + +### Two-tier `load_family` + +The integration replaces one function in the shim: + +```python +import os +import pyrer +from rez.packages import iter_packages + + +def _try_fast_parse(pkg): + """Try the Rust static parser. Returns a PackageData on success, + or None for any reason — non-.py package, unreadable file, or + dynamic content the parser bails on. The caller falls back to + `from_rez(pkg)`.""" + filepath = getattr(pkg, "filepath", None) + if not filepath or not filepath.endswith(".py"): + return None + try: + with open(filepath, "r", encoding="utf-8") as f: + source = f.read() + except OSError: + return None + return pyrer.parse_static_package_py(source) + + +def load_family(name, package_paths): + out = [] + for pkg in iter_packages(name, paths=package_paths): + pd = _try_fast_parse(pkg) or pyrer.PackageData.from_rez(pkg) + out.append(pd) + return out +``` + +That's the minimal integration. ~30 lines added; the existing +`_pyrer_resolve` method is unchanged except for using the +two-tier `load_family` above. + +### Faster: batched parallel parse (issue #94) + +The shape above is already a big win versus rez's `from_rez`, but +the inner Python loop is still serial — one `open()` per file, +per package, while the other cores idle. On a 2,600-file resolve +that's roughly 3 s of pure I/O even with the static parser doing its +job; `cProfile` shows it as the top of the flamegraph (35% of total +wall time) after every other pyrer win was wired up. + +`pyrer.parse_static_packages_py(paths)` replaces that loop with a +single Rust call. It reads and parses every path on a Rayon thread +pool, returns a list aligned with the input, and releases the GIL +for the duration. + +#### What it does + +```python +pyrer.parse_static_packages_py(paths: list[str | os.PathLike]) + -> list[PackageData | None] +``` + +Per-path semantics are identical to `parse_static_package_py(source)` +— so the static-vs-dynamic decision, the corpus accept rate, and the +output `PackageData` shape are all unchanged. What's different is +that you pay one round-trip into Rust for the whole batch instead +of one per file, and the file reads + parses run in parallel. + +- **Positionally aligned output.** `len(result) == len(paths)`. A + missing file (`ENOENT`), unreadable bytes (permissions, invalid + UTF-8), a parser bail on dynamic content, and a `package.yaml` / + non-`.py` file all become `None` at the matching index. Callers + can `zip(paths, result)` after. +- **No exception escapes.** Per-file failures map to `None`. The + function as a whole only raises if the input type is wrong (e.g. + passing a non-iterable for `paths`). +- **GIL released.** `Python::allow_threads` is used internally, so + other Python threads run during the batch. The result list is + built back on the GIL after the parallel section completes. +- **Pool size = `RAYON_NUM_THREADS`** (default: logical core count). + No per-call knob. Cap with the env var on shared CI hosts. +- **Order doesn't depend on completion order.** Rayon's `par_iter` + may finish files out of order; the returned list is rebuilt by + index, so the shim's `zip(pkgs, result)` is always correct. + +#### Measured impact on Fortiche + +`scripts/bench_batched_parser.py` samples files from a real rez +repo, runs both paths best-of-3, and reports the speedup. On +`/thierry/rez/pkg` over CIFS: + +| Sample | Serial `open` + parse | Batched | Speedup | +|---:|---:|---:|---:| +| 500 files (warm cache) | 56.71 ms | 40.76 ms | **1.39×** | +| 2,000 files | 4,234 ms | 1,508 ms | **2.81×** | + +Per-file saving on the 2,000-file run: **~1.36 ms**. The issue's +target workload (132-package resolve, ~2,600 `package.py` files +touched) extrapolates to ~3.5 s saved per resolve. + +The smaller-sample bench is bottlenecked on warm-page-cache parsing +CPU and the Rayon dispatch overhead amortises less. Real production +loads (cold or partially-cold CIFS, many uncached versions) see more +of the parallel-I/O overlap — the 2.81× is a lower bound on warm +hardware. + +#### Integration: two-tier `load_family` with batched read + +```python +import logging +import os +import pyrer +from rez.packages import iter_packages + +_logger = logging.getLogger("pyrer.batched_parser") + + +def _gather_paths(pkgs): + """Split rez Packages into (pkg, filepath_or_None). `None` means + the package isn't filesystem-`.py`-based — yaml, memory repo, + @early-bound — and goes straight to the slow rez path.""" + out = [] + for pkg in pkgs: + filepath = getattr(pkg, "filepath", None) + if filepath and filepath.endswith(".py"): + out.append((pkg, filepath)) + else: + out.append((pkg, None)) + return out + + +def load_family(name, package_paths, version_range=None): + """The shim's load_family callback — batched fast-path + rez + fallback per file. Wires together everything pyrer offers: + #86 (load_family), #92 (version_range hint), the static parser, + and #94 (batched parallel parse).""" + # Apply the #92 hint at the iter level so rez skips on-disk + # version dirs outside the range. + pkgs = list(iter_packages(name, range_=version_range, paths=package_paths)) + pairs = _gather_paths(pkgs) + paths = [fp for _, fp in pairs if fp is not None] + + # One Rust call across all .py files. GIL released; cores in use. + pds = pyrer.parse_static_packages_py(paths) if paths else [] + pds_iter = iter(pds) + + out = [] + for pkg, filepath in pairs: + if filepath is None: + # Non-.py — straight to rez evaluator. + out.append(pyrer.PackageData.from_rez(pkg)) + continue + pd = next(pds_iter) + if pd is None: + # Parser bailed (dynamic content) or I/O error. + out.append(pyrer.PackageData.from_rez(pkg)) + else: + out.append(pd) + return out +``` + +#### Backward compatibility / feature detection + +The function is a pure addition. Shims that haven't been updated keep +working with the per-file `parse_static_package_py`. Feature-detect +once at shim init: + +```python +_BATCHED_PARSE = hasattr(pyrer, "parse_static_packages_py") + + +def load_family(name, package_paths, version_range=None): + if _BATCHED_PARSE: + return _batched_load_family(name, package_paths, version_range) + else: + return _serial_load_family(name, package_paths, version_range) +``` + +This is the same pattern the shim uses for `load_family` (#86), +`version_range` (#92), and `parse_static_package_py` itself. No +flag day; pyrer < 1.0.0-rc.3 falls back automatically. + +#### Shadow-validation mode + +Same shape as the single-file parser: gate on an env var, run a +release with it on, log any divergence, then flip off. + +```python +_VALIDATE = os.environ.get("REZ_PYRER_VALIDATE_BATCHED") == "1" + + +def _validate(pkg, batched_pd): + """Compare `batched_pd` (from parse_static_packages_py) against + what `from_rez(pkg)` would have produced. Used in production + spot-checks during the rollout.""" + if not _VALIDATE or batched_pd is None: + return + rez_pd = pyrer.PackageData.from_rez(pkg) + fast = ( + batched_pd.name, batched_pd.version, + list(batched_pd.requires), + [list(v) for v in batched_pd.variants], + ) + slow = ( + rez_pd.name, rez_pd.version, + list(rez_pd.requires), + [list(v) for v in rez_pd.variants], + ) + if fast != slow: + _logger.warning( + "pyrer batched parser DIVERGED for %s\n" + " fast: %r\n slow: %r", + getattr(pkg, "filepath", "?"), fast, slow, + ) +``` + +Spot-check the first hits each `load_family` call rather than +every package (the offline differential covered the full corpus — +this is a runtime sanity net). Drop the flag after a clean release. + +#### Metrics — confirm what's actually happening + +```python +class _PyrerStats: + batched_calls = 0 # number of parse_static_packages_py invocations + batched_files = 0 # sum of len(paths) across those calls + batched_hits = 0 # files the batched call accepted + batched_misses_io = 0 # missing / unreadable files + batched_misses_dynamic = 0 # parser bailed + non_py_packages = 0 # filesystem package isn't .py (yaml, memory repo) + + @classmethod + def log_summary(cls): + if not cls.batched_calls: + return + _logger.info( + "pyrer batched parser: %d calls, %d files; " + "hits=%d misses_io=%d misses_dynamic=%d non_py=%d", + cls.batched_calls, cls.batched_files, + cls.batched_hits, cls.batched_misses_io, + cls.batched_misses_dynamic, cls.non_py_packages, + ) +``` + +Expected at Fortiche from the corpus survey: ~93% hit rate on +filesystem `.py` packages, with the misses split between +`@early`/`@late` dynamic packages and the occasional unreadable file. + +#### Rollout plan + +Three layered flags on top of the existing `use_rer_solver`: + +```python +USE_RER_SOLVER = _rez_config.use_rer_solver +USE_BATCHED_PARSER = USE_RER_SOLVER and \ + hasattr(pyrer, "parse_static_packages_py") and \ + os.environ.get("REZ_PYRER_BATCHED_PARSER", "1") == "1" +VALIDATE_BATCHED_PARSER = USE_BATCHED_PARSER and \ + os.environ.get("REZ_PYRER_VALIDATE_BATCHED") == "1" +``` + +| Week | Flags | What to verify | +|---|---|---| +| 1 | `USE_BATCHED_PARSER=1`, `VALIDATE=1`, ~5% of users | 0 divergences in logs? Hit rate ≥ 90%? `_load_family` wall-time drops vs baseline? | +| 2 | Same flags, ~50% of users | Same, at scale. Confirm `RAYON_NUM_THREADS` not oversubscribing on shared hosts (check CPU%). | +| 3 | `USE_BATCHED_PARSER=1`, `VALIDATE=0`, 100% | Production wall-time + telemetry | +| 4 | Permanent in `use_rer_solver` config | Drop the env var | + +Each step is one env var flip away from the previous behaviour. + +#### Where this WON'T help + +- **Resolves with very few packages touched.** The Rayon dispatch + overhead is small (~µs per batch) but on a 5-package resolve + there's almost nothing to parallelise. The win scales with batch + size; tiny batches see negligible speedup. +- **The dynamic 7%.** `@early` / `@late`-bound packages still need + rez's evaluator — those bypass the batched path entirely (see + `non_py` / parser-bail accounting in the metrics). +- **`load_family` cache hits within a single `solve()`.** When a + family is already cached on the pyrer side (#86), the loader + isn't called — batched or otherwise. +- **Cross-invocation cost.** Each `rez env` is a fresh process and + pays a fresh batched-parse cost. A persistent memcache of parsed + `PackageData` is the next layer; this work is the prerequisite. + +### Shadow-validation mode for the first weeks in production + +The differential harness already ran clean on the Fortiche corpus +(5,813/5,813 matched). But a shadow check in production catches +runtime patterns the offline survey didn't exercise. Gate it on an +env var so it can be turned on for a release and off afterwards: + +```python +import logging +import os + +_VALIDATE = os.environ.get("REZ_PYRER_VALIDATE_PARSER") == "1" +_logger = logging.getLogger("pyrer.fast_parser") + + +def _try_fast_parse(pkg): + filepath = getattr(pkg, "filepath", None) + if not filepath or not filepath.endswith(".py"): + return None + try: + with open(filepath, "r", encoding="utf-8") as f: + source = f.read() + except OSError: + return None + + pd_fast = pyrer.parse_static_package_py(source) + if pd_fast is None: + return None + + if _VALIDATE: + pd_slow = pyrer.PackageData.from_rez(pkg) + fast_t = ( + pd_fast.name, pd_fast.version, + list(pd_fast.requires), + [list(v) for v in pd_fast.variants], + ) + slow_t = ( + pd_slow.name, pd_slow.version, + list(pd_slow.requires), + [list(v) for v in pd_slow.variants], + ) + if fast_t != slow_t: + _logger.warning( + "pyrer fast parser DIVERGED for %s\n" + " fast: %r\n slow: %r\n" + " Using slow path; please report this upstream.", + filepath, fast_t, slow_t, + ) + return None # divergence — fall back to rez + return pd_fast +``` + +Run a release with `REZ_PYRER_VALIDATE_PARSER=1` set in the shim's +environment. Grep your studio logs for the warning; the count is +your real-world divergence rate. If it stays zero (which the +offline differential predicts), drop the flag. + +### Metrics — confirm the hit rate + +Class-level counters tell you what fraction of packages are +actually taking the fast path in production: + +```python +class _PyrerStats: + fast_hits = 0 + fast_misses_non_py = 0 # yaml package, memory repo, etc. + fast_misses_dynamic = 0 # parser bailed (real dynamic content) + fast_misses_io = 0 # couldn't read the file + + @classmethod + def log_summary(cls): + total = (cls.fast_hits + cls.fast_misses_non_py + + cls.fast_misses_dynamic + cls.fast_misses_io) + if not total: + return + _logger.info( + "pyrer fast parser: %d/%d hit (%.1f%%); " + "non-py=%d dynamic=%d io=%d", + cls.fast_hits, total, cls.fast_hits / total * 100, + cls.fast_misses_non_py, cls.fast_misses_dynamic, + cls.fast_misses_io, + ) +``` + +Increment from inside `_try_fast_parse`; call `log_summary` from +the shim's existing telemetry hook. Expected hit rate at Fortiche: +~93% based on the corpus survey. + +### Rollout plan + +Three flags stacked on top of the existing `use_rer_solver`: + +```python +USE_RER_SOLVER = _rez_config.use_rer_solver +USE_FAST_PARSER = USE_RER_SOLVER and \ + os.environ.get("REZ_PYRER_FAST_PARSER", "1") == "1" +VALIDATE_FAST_PARSER = USE_FAST_PARSER and \ + os.environ.get("REZ_PYRER_VALIDATE_PARSER") == "1" +``` + +A reasonable rollout sequence: + +| Week | Flags | What you're checking | +|---|---|---| +| 1 | `USE_FAST_PARSER=1`, `VALIDATE=1`, ~5% of users | Production divergence count = 0? Hit rate ≥ 90%? Wall-time A/B looks right? | +| 2 | Same flags, ~50% of users | Same checks at scale + memcache impact | +| 3 | `USE_FAST_PARSER=1`, `VALIDATE=0`, 100% | Production wall-time + telemetry | +| 4 | Flag stays on by default in `use_rer_solver` config | Make permanent | + +Each step is a kill-switch flip away from the previous behaviour. + +### Where this WON'T help (so nobody's surprised) + +- **Cold `rez env` startup time.** The dominant ~200-300 ms of + Python interpreter init is unchanged. The parser saves on the + per-package-load step that happens *after* startup. +- **`@early` / `@late` packages.** These fall back to rez + (the dynamic 7% at Fortiche). Studios with heavier late-bound use + see proportionally less of the win. +- **`package.yaml`.** The parser is `.py` only; YAML goes through + rez. +- **Resolves served from a cached `rxt` context.** rxt loading + is its own path — the parser isn't invoked. +- **First invocation on a cold CIFS cache.** I/O dominates the + file read; the parser saves CPU, not network roundtrips. + ## Solving ```python @@ -428,26 +931,49 @@ solver for those resolves: ## Sanity-checking against rez -To make sure `pyrer` agrees with `rez`'s solver on your own repo, -generate a small set of representative requests and diff the -resolutions: +The repository ships `scripts/compare_resolves.py` — the canonical +bisect tool for "my shim's resolves diverge from rez" reports. It +uses the recommended shim shape from this page (static parser + +`load_family` + `version_range` hint + `from_rez` fallback) to +call `pyrer.solve` directly, then compares to `rez.ResolvedContext` +on the same request: -```python -from rez.resolved_context import ResolvedContext +```bash +. .venv/bin/activate -packages = list(build_pyrer_packages(["/sw/pkg"])) +# Spot-check 30 random families from your repo +python scripts/compare_resolves.py /path/to/rez/pkg --n 30 -for request in your_real_requests: - rer_result = pyrer.solve(request, packages) - rez_ctx = ResolvedContext(request, package_paths=["/sw/pkg"]) - rer_set = {(v.name, v.version) for v in rer_result.resolved_packages} - rez_set = {(v.name, str(v.version)) for v in rez_ctx.resolved_packages} - assert rer_set == rez_set, f"diverge on {request}" +# Or a specific request +python scripts/compare_resolves.py /path/to/rez/pkg --request maya-2024 nuke-15 ``` -If any case diverges, open an issue with the request and a minimal -package set that reproduces — the project's correctness bar is "match -rez 1:1" and divergence is a release blocker. +The script reports per-request: `✓` identical, `≠` divergent, +`x` rer-only failure, `X` rez-only failure, `·` both fail. Exit +code is `0` if every resolve matched (no `≠`, no `x`), `1` +otherwise. Divergent and rer-only-failure cases get a per-package +diff printed for the first few examples. + +### Interpreting the output + +- **0 divergences over a meaningful sample** (say, 30+ random + families from your own corpus) means pyrer + the recommended + shim shape works correctly on your repo. If a downstream rez + integration shim then *does* show divergence, the bug is in the + shim's translation — not in pyrer. Bisect from there: turn the + shim's features (filters, orderers, custom `from_rez` wiring, + cache layers) on one at a time and re-run. + +- **Any divergence reproduced by this script** is a pyrer + correctness bug — please open an issue with the failing request, + the rez version, the package paths, and (ideally) a minimal + package subset that reproduces. The project's correctness bar + is "match rez 1:1" and a script-reproducible divergence is a + release blocker. + +The 188-case `test_rez_benchmark` integration test catches the +common shapes; this script is for the long-tail-of-real-studio- +corpora that the benchmark dataset doesn't cover. ## See also diff --git a/scripts/bench_batched_parser.py b/scripts/bench_batched_parser.py new file mode 100644 index 0000000..fa5c2d5 --- /dev/null +++ b/scripts/bench_batched_parser.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Measure `pyrer.parse_static_packages_py` (batched, Rayon-parallel) +against the serial-Python equivalent on real `package.py` files. + +Issue #94: the rez integration shim's bottleneck after the static +parser landed was the serial Python loop of `open()` calls +(~3 s for ~2,600 files on a typical Fortiche resolve, 91% of +`_load_family` wall time). This bench quantifies the win from +batching the reads + parses into a single Rust call. + +Run: + python scripts/bench_batched_parser.py /thierry/rez/pkg [--samples 500] +""" +import argparse +import os +import sys +import time + +import pyrer + + +def walk_package_pys(root): + """Yield every `package.py` under `root`, skipping rez variant + subdirs (40-char hex hashes) and dot-directories.""" + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [ + d for d in dirnames + if not d.startswith(".") + and not d.startswith("_") + and not (len(d) == 40 and all(c in "0123456789abcdef" for c in d)) + ] + if "package.py" in filenames: + yield os.path.join(dirpath, "package.py") + + +def serial_baseline(paths): + """Today's shim shape: Python `open` + per-file + `pyrer.parse_static_package_py(source)` in a loop. The behaviour + the batched call replaces.""" + results = [] + for p in paths: + try: + with open(p, "r", encoding="utf-8") as f: + source = f.read() + except OSError: + results.append(None) + continue + results.append(pyrer.parse_static_package_py(source)) + return results + + +def batched(paths): + """One Rust call across `len(paths)` files. Rayon thread pool + handles the parallelism; GIL released for the batch.""" + return pyrer.parse_static_packages_py(paths) + + +def time_run(fn, paths, repeat=3): + """Run `fn(paths)` `repeat` times and return the best wall time + in seconds. Best-of-N filters out CIFS hiccups; the median + would mask them.""" + best = float("inf") + for _ in range(repeat): + t0 = time.perf_counter() + result = fn(paths) + elapsed = time.perf_counter() - t0 + best = min(best, elapsed) + return best, result + + +def main(): + ap = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + ap.add_argument("root", help="rez repo root (e.g. /thierry/rez/pkg)") + ap.add_argument( + "--samples", + type=int, + default=500, + help="Number of files to sample from the corpus (default: 500)", + ) + ap.add_argument( + "--repeat", + type=int, + default=3, + help="Best-of-N repeats per configuration (default: 3)", + ) + args = ap.parse_args() + + if not os.path.isdir(args.root): + print(f"not a directory: {args.root}", file=sys.stderr) + return 2 + + print(f"Sampling up to {args.samples} package.py files from {args.root} ...") + all_paths = list(walk_package_pys(args.root)) + if len(all_paths) > args.samples: + step = max(len(all_paths) // args.samples, 1) + paths = all_paths[::step][: args.samples] + else: + paths = all_paths + print(f" found {len(all_paths)} files; using {len(paths)} for the bench") + print() + + # Warm the cache a bit — first read is always I/O-bound on CIFS + # regardless of method. We're measuring the delta, not the cold + # number. + _ = batched(paths) + + print(f"Running each path {args.repeat}x, best wall time:") + print("-" * 70) + + serial_time, serial_result = time_run(serial_baseline, paths, args.repeat) + batched_time, batched_result = time_run(batched, paths, args.repeat) + + # Sanity: the two paths must agree on accept count (correctness). + serial_accepts = sum(1 for r in serial_result if r is not None) + batched_accepts = sum(1 for r in batched_result if r is not None) + print(f" serial open+parse loop: {serial_time * 1000:8.2f} ms " + f"({serial_accepts}/{len(paths)} accepted)") + print(f" batched parse_static_pkg: {batched_time * 1000:8.2f} ms " + f"({batched_accepts}/{len(paths)} accepted)") + print() + if batched_time > 0: + speedup = serial_time / batched_time + savings_ms = (serial_time - batched_time) * 1000 + print(f" speedup: {speedup:.2f}×") + print(f" savings: {savings_ms:.1f} ms over {len(paths)} files " + f"({savings_ms / len(paths) * 1000:.1f} μs/file)") + if serial_accepts != batched_accepts: + print() + print(f" WARNING: accept-count mismatch — {serial_accepts} vs " + f"{batched_accepts}. Likely an I/O hiccup; rerun.") + print() + pool = os.environ.get("RAYON_NUM_THREADS", "(default: logical cores)") + print(f"Rayon pool size: {pool}") + print() + print("Note: serial baseline is what the rez shim does today.") + print("The batched call replaces it inside `load_family`.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/bench_package_py_parser.py b/scripts/bench_package_py_parser.py new file mode 100644 index 0000000..1ca80b5 --- /dev/null +++ b/scripts/bench_package_py_parser.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 +"""Micro-benchmark `pyrer.parse_static_package_py` against alternatives. + +Quantifies the Stage 3 perf claim in +`docs/content/docs/engineering/fast-package-py-parser.md`: how much +the Rust AST fast-parser saves per file vs. rez's `Package` +evaluator, on the static majority of a real repo. + +Measures, per file: + + 1. `pyrer.parse_static_package_py(source)` — the Rust parser. + 2. `pyrer.PackageData.from_rez(fake_pkg)` — FakeRezPackage: + attributes are plain Python objects mimicking rez's surface + (FakeRequirement / FakeVersion need __str__). Lower bound on + `from_rez` cost; doesn't pay rez's `AttributeForwardMeta`. + 3. `pyrer.PackageData.from_rez(real_pkg)` — only if `--with-rez` + is given AND `rez` imports cleanly. Upper bound on `from_rez` + cost — matches what the Fortiche shim actually pays. + 4. File read (cold cache hint: `--drop-cache` only useful on Linux + local disk; on CIFS the kernel can't drop SMB caches anyway, so + this is mostly a warm-cache baseline). + +Run without `--corpus` for synthetic shapes; run with +`--corpus /thierry/rez/pkg` for the real Fortiche-shaped numbers. + +Examples: + + python scripts/bench_package_py_parser.py + python scripts/bench_package_py_parser.py --corpus /thierry/rez/pkg + python scripts/bench_package_py_parser.py --corpus /thierry/rez/pkg --samples 500 +""" +import argparse +import os +import pathlib +import sys +import time +import timeit +from typing import Dict, List, Optional, Tuple + +import pyrer + + +# --------------------------------------------------------------------------- +# Synthetic samples — used when --corpus is not given +# --------------------------------------------------------------------------- + +SYNTHETIC: Dict[str, str] = { + "minimal": '''\ +name = "foo" +version = "1.0.0" +''', + "typical": '''\ +name = "maya" +version = "2024.0" +description = "Autodesk Maya" +requires = ["python-3", "qt-5"] +variants = [["linux", "python-3.10"], ["linux", "python-3.11"]] + +def commands(): + env.PYTHONPATH.append("{root}/python") + env.PATH.prepend("{root}/bin") +''', + "with-scope": '''\ +# -*- coding: utf-8 -*- +name = "fortichebox" +version = "0.2.0" +requires = ["python-2.7+<3"] + +def commands(): + env["FPATH"].append("$SPACE/generic") + env["FPATH"].append("$SPACE/projects") + +with scope("config") as config: + config.release_packages_path = r"\\\\thierry\\rez\\pkg\\int" + +timestamp = 1642007300 +''', + "heavy": '''\ +name = "big_package" +version = "3.14.0" +description = "A package with lots of metadata" +authors = ["Alice", "Bob", "Carol"] +requires = ["python-3.11+", "qt-5.15+", "numpy-1.24+", "pandas", "requests"] +variants = [ + ["linux", "x86_64", "python-3.11"], + ["linux", "x86_64", "python-3.12"], + ["windows", "x86_64", "python-3.11"], + ["windows", "x86_64", "python-3.12"], +] +tools = ["bigpkg", "bigpkg-cli", "bigpkg-server"] +tests = { + "unit": {"command": "pytest tests/"}, + "lint": {"command": "ruff check ."}, +} + +def commands(): + env.PATH.prepend("{root}/bin") + env.LD_LIBRARY_PATH.prepend("{root}/lib") + env.PYTHONPATH.append("{root}/python") + if defined("DEBUG"): + env.BIGPKG_DEBUG = "1" + +timestamp = 1700000000 +format_version = 2 +hashed_variants = True +''', +} + + +# --------------------------------------------------------------------------- +# FakeRezPackage — same as scripts/bench_python_construction.py +# --------------------------------------------------------------------------- + + +class FakeRequirement: + __slots__ = ("_s",) + + def __init__(self, s: str) -> None: + self._s = s + + def __str__(self) -> str: + return self._s + + +class FakeVersion: + __slots__ = ("_s",) + + def __init__(self, s: str) -> None: + self._s = s + + def __str__(self) -> str: + return self._s + + +class FakeRezPackage: + """Lower-bound stand-in for `rez.packages.Package`. Wraps + requirement / version strings so `from_rez` pays the per-element + `__str__` round-trip — but doesn't model rez's + `AttributeForwardMeta` / late-bound check overhead. Real rez + Packages are slower than this; use --with-rez for the upper bound. + """ + + __slots__ = ("name", "version", "requires", "variants") + + def __init__( + self, + name: str, + version: str, + requires: Optional[List[str]], + variants: Optional[List[List[str]]], + ) -> None: + self.name = name + self.version = FakeVersion(version) + self.requires = ( + [FakeRequirement(r) for r in requires] if requires else None + ) + self.variants = ( + [[FakeRequirement(r) for r in v] for v in variants] if variants else None + ) + + +# --------------------------------------------------------------------------- +# Sampling +# --------------------------------------------------------------------------- + + +def gather_corpus_samples(root: str, n: int) -> List[pathlib.Path]: + """Walk `root` and return up to `n` `package.py` paths, sampled + deterministically (every k-th file) so successive runs of the bench + against the same corpus see the same files.""" + skip = lambda part: ( + part.startswith(".") + or part.startswith("_") + or (len(part) == 40 and all(c in "0123456789abcdef" for c in part)) + ) + paths: List[pathlib.Path] = [] + root_p = pathlib.Path(root) + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if not skip(d)] + if "package.py" in filenames: + paths.append(pathlib.Path(dirpath) / "package.py") + paths.sort() + if len(paths) <= n: + return paths + step = max(len(paths) // n, 1) + return paths[::step][:n] + + +def derive_fake(source: str) -> Optional[FakeRezPackage]: + """Best-effort: pass `source` through the static Rust parser; if + that accepts it, lift the four fields into a FakeRezPackage so the + `from_rez` path has a comparable input. Returns None for files the + Rust parser bails on (those wouldn't be measured against `from_rez` + in the real shim anyway — the shim falls back to rez for those).""" + pd = pyrer.parse_static_package_py(source) + if pd is None: + return None + return FakeRezPackage(pd.name, pd.version, pd.requires, pd.variants) + + +# --------------------------------------------------------------------------- +# Timers +# --------------------------------------------------------------------------- + + +def time_callable(fn, args_iter, iters: int) -> float: + """Median μs/call over `iters` repeats, applied to each input in + `args_iter` once per iteration. Returns the median of the + per-input medians — robust to outliers.""" + medians: List[float] = [] + for arg in args_iter: + t = timeit.Timer(lambda a=arg: fn(a)) + times = t.repeat(repeat=5, number=iters) + medians.append((min(times) / iters) * 1_000_000) # μs + medians.sort() + return medians[len(medians) // 2] # median of medians + + +def time_two_arg(fn, args_iter, iters: int) -> float: + """Same shape as `time_callable` but the timed `fn` takes one + pre-built object per iteration (used for `from_rez(pkg)`).""" + return time_callable(fn, args_iter, iters) + + +# --------------------------------------------------------------------------- +# Real rez (optional) +# --------------------------------------------------------------------------- + + +def try_import_rez(): + """Try to set up real-rez loading. Returns a callable + `load(path) -> Package` or None if rez isn't importable.""" + try: + from rez.developer_package import DeveloperPackage # type: ignore + except Exception as e: + print(f" (rez not available: {e})", file=sys.stderr) + return None + + def load(path: pathlib.Path): + return DeveloperPackage.from_path(str(path.parent)) + + return load + + +# --------------------------------------------------------------------------- +# Reporting +# --------------------------------------------------------------------------- + + +def print_row(label: str, time_us: Optional[float], baseline: Optional[float] = None) -> None: + if time_us is None: + print(f" {label:42s} (n/a)") + return + if baseline is None: + print(f" {label:42s} {time_us:8.2f} μs") + else: + speedup = baseline / time_us if time_us > 0 else float("inf") + print(f" {label:42s} {time_us:8.2f} μs ({speedup:5.1f}× vs from_rez)") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument( + "--corpus", + metavar="PATH", + help="rez repository root to sample real package.py files from", + ) + parser.add_argument( + "--samples", + type=int, + default=100, + help="files to sample from --corpus (default: 100)", + ) + parser.add_argument( + "--iters", + type=int, + default=200, + help="`timeit` `number` argument per repeat (default: 200)", + ) + parser.add_argument( + "--with-rez", + action="store_true", + help="Also time real rez.Package loading (requires rez installed)", + ) + args = parser.parse_args() + + # Gather sources. + if args.corpus: + if not os.path.isdir(args.corpus): + print(f"not a directory: {args.corpus}", file=sys.stderr) + return 2 + print(f"Sampling up to {args.samples} files from {args.corpus} ...") + paths = gather_corpus_samples(args.corpus, args.samples) + sources: List[Tuple[str, pathlib.Path]] = [] + for p in paths: + try: + with open(p, "rb") as f: + sources.append((f.read().decode("utf-8", errors="replace"), p)) + except OSError: + continue + print(f" read {len(sources)} files") + else: + sources = [(src, pathlib.Path(f"<{name}>")) for name, src in SYNTHETIC.items()] + print(f"Using {len(sources)} synthetic samples (no --corpus)") + print() + + # ------------------------------------------------------------------ + # Timing #1: Rust AST fast-parser + # ------------------------------------------------------------------ + static_sources = [s for s, _ in sources] + static_accept_count = sum( + 1 for s in static_sources if pyrer.parse_static_package_py(s) is not None + ) + t_static = time_callable( + pyrer.parse_static_package_py, static_sources, args.iters + ) + + # ------------------------------------------------------------------ + # Timing #2: from_rez(FakeRezPackage) — lower bound + # ------------------------------------------------------------------ + fakes = [derive_fake(s) for s in static_sources] + fakes = [f for f in fakes if f is not None] + if fakes: + t_from_rez_fake = time_two_arg( + pyrer.PackageData.from_rez, fakes, args.iters + ) + else: + t_from_rez_fake = None + + # ------------------------------------------------------------------ + # Timing #3a: from_rez(real rez Package) — post-load only (μs to + # walk an already-loaded Package's four attributes). Not a fair + # comparison to the Rust parser by itself — see #3b. + # ------------------------------------------------------------------ + t_from_rez_real: Optional[float] = None + # Timing #3b: full load — `DeveloperPackage.from_path(dir) + + # from_rez(pkg)`. The production-realistic equivalent to the Rust + # parser's `open+read+parse_static_package_py(source)`. + t_full_load_real: Optional[float] = None + # Timing #3c: full load via the Rust parser path — `open + read + + # parse_static_package_py`. Apples-to-apples to 3b. + t_full_load_rust: Optional[float] = None + + if args.with_rez: + load = try_import_rez() + if load is not None and args.corpus: + print("Loading real rez Packages ...") + real_pkgs = [] + real_paths = [] + for s, p in sources: + try: + real_pkgs.append(load(p)) + real_paths.append(p) + except Exception: + continue + if len(real_pkgs) >= min(args.samples, 30): + break + if real_pkgs: + t_from_rez_real = time_two_arg( + pyrer.PackageData.from_rez, real_pkgs, args.iters + ) + # 3b: full load — does its own DeveloperPackage.from_path + # call every iteration. + def _full_load_rez(p): + pkg = load(p) + return pyrer.PackageData.from_rez(pkg) + t_full_load_real = time_two_arg( + _full_load_rez, real_paths, max(args.iters // 10, 5) + ) + # 3c: full load via Rust parser — open+read+parse every + # iteration. Same set of paths. + def _full_load_rust(p): + with open(p, "rb") as f: + src = f.read().decode("utf-8", errors="replace") + return pyrer.parse_static_package_py(src) + t_full_load_rust = time_two_arg( + _full_load_rust, real_paths, max(args.iters // 10, 5) + ) + + # ------------------------------------------------------------------ + # Timing #4: pure file-read baseline (so the reader can see what + # fraction of "open + parse" is I/O vs CPU). + # ------------------------------------------------------------------ + t_file_read: Optional[float] = None + if args.corpus: + paths_only = [p for _, p in sources] + + def _read(p): + with open(p, "rb") as f: + f.read() + + t_file_read = time_two_arg(_read, paths_only, max(args.iters // 10, 5)) + + # ------------------------------------------------------------------ + # Report + # ------------------------------------------------------------------ + print("Per-file timings (median μs/file, best of 5)") + print("-" * 78) + print("In-memory comparison (source / pkg pre-built; not production):") + baseline = t_from_rez_real if t_from_rez_real else t_from_rez_fake + print_row(" parse_static_package_py(source)", t_static, baseline) + print_row(" from_rez(fake_pkg)", t_from_rez_fake, t_from_rez_fake) + print_row(" from_rez(real rez Package, preloaded)", t_from_rez_real, baseline) + print_row(" file read only (open+read)", t_file_read, None) + print() + print("Full-load comparison (apples to apples — what the shim actually pays):") + print_row(" open+read+parse_static_package_py", t_full_load_rust, t_full_load_real) + print_row(" DeveloperPackage.from_path + from_rez", t_full_load_real, t_full_load_real) + + print() + print(f"Rust parser accept rate on the sample: " + f"{static_accept_count}/{len(static_sources)} " + f"({static_accept_count / max(len(static_sources), 1) * 100:.1f}%)") + print() + print("Notes:") + print(" - `from_rez(fake_pkg)` is a LOWER BOUND on the real cost. It") + print(" doesn't pay rez's AttributeForwardMeta / late-bound checks.") + print(" - `from_rez(real rez)` (when --with-rez is given AND rez is") + print(" installed) is the actual shim cost. Run that for the number") + print(" that matters in production.") + print(" - File-read is the I/O floor — on cold CIFS it can dominate") + print(" the parse step. The Rust parser saves CPU; load_family") + print(" saves I/O.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/compare_resolves.py b/scripts/compare_resolves.py new file mode 100644 index 0000000..b0bb782 --- /dev/null +++ b/scripts/compare_resolves.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +"""Bisect tool for issue #96-style "pyrer diverges from rez" reports. + +Resolves N requests through both engines (pyrer directly + rez's +`ResolvedContext`) and reports which agree, which diverge, and which +fail asymmetrically. Uses the recommended shim shape from the +rez-integration docs: `load_family` with `iter_packages(range_=...)`, +the static parser, and `from_rez` fallback. + +If this script reports **0 divergences** on your corpus, the +recommended shim shape works correctly. Any divergence reported by a +downstream rez integration shim is then a shim-translation bug, not +a pyrer-side correctness bug. The fix lives in the shim. + +If this script DOES reproduce a divergence, that's a pyrer bug — +please report with the failing request, the corpus path, and the +output of this script. + +## Usage + + python scripts/compare_resolves.py /path/to/rez/repo + python scripts/compare_resolves.py /path/to/repo --request foo bar + python scripts/compare_resolves.py /path/to/repo --n 30 --seed 42 + +Requires `rez` and `pyrer` importable. The script doesn't touch +`config.use_rer_solver` — rez always uses its own Python solver, +pyrer is called directly via `pyrer.solve()`. +""" +import argparse +import os +import random +import sys + +import pyrer +from rez.config import config +from rez.packages import iter_packages +from rez.resolved_context import ResolvedContext + + +def make_load_family(package_paths): + """The recommended shim shape: static parser fast path + rez + fallback on dynamic / unreadable packages.""" + + def load_family(name, version_range=None): + out = [] + for pkg in iter_packages(name, range_=version_range, paths=package_paths): + filepath = getattr(pkg, "filepath", None) + if filepath and filepath.endswith(".py"): + try: + with open(filepath, "r", encoding="utf-8") as f: + source = f.read() + pd = pyrer.parse_static_package_py(source) + except OSError: + pd = None + if pd is None: + pd = pyrer.PackageData.from_rez(pkg) + else: + pd = pyrer.PackageData.from_rez(pkg) + out.append(pd) + return out + + return load_family + + +def family_names(package_paths): + """Top-level family directories across all package paths.""" + families = [] + for prefix in package_paths: + if not os.path.isdir(prefix): + continue + for d in os.listdir(prefix): + if d.startswith(".") or d.startswith("_"): + continue + full = os.path.join(prefix, d) + if os.path.isdir(full): + families.append(d) + return sorted(set(families)) + + +def compare_one(request, package_paths, load_family): + """Resolve `request` via both engines (no implicits) and + classify the outcome.""" + rer = pyrer.solve(request, None, load_family=load_family) + + # rez side — same package_paths, no implicits. + config.implicit_packages = [] + try: + ctx = ResolvedContext( + package_requests=request, + package_paths=package_paths, + add_implicit_packages=False, + ) + except Exception as e: + return ("rez_exception", rer, None, str(e)) + + rer_ok = rer.status == "solved" + rez_ok = bool(ctx.success) + if rer_ok and rez_ok: + rer_set = {(v.name, v.version, v.variant_index) for v in rer.resolved_packages} + rez_set = { + (v.name, str(v.version), getattr(v, "index", None)) + for v in (ctx.resolved_packages or []) + } + verdict = "identical" if rer_set == rez_set else "diverge" + return (verdict, rer, ctx, None) + if not rer_ok and not rez_ok: + return ("both_fail", rer, ctx, None) + return ("rer_fail" if not rer_ok else "rez_fail", rer, ctx, None) + + +def show_divergence(name, rer, rez): + """Print package-by-package diff for a divergent resolve.""" + rer_set = {(v.name, v.version, v.variant_index) for v in rer.resolved_packages} + rez_set = { + (v.name, str(v.version), getattr(v, "index", None)) + for v in (rez.resolved_packages or []) + } + only_rer = rer_set - rez_set + only_rez = rez_set - rer_set + from collections import defaultdict + by_name = defaultdict(dict) + for n, v, i in only_rer: + by_name[n]["rer"] = (v, i) + for n, v, i in only_rez: + by_name[n]["rez"] = (v, i) + print(f"\n --- {name}: {len(by_name)} packages differ ---") + for n in sorted(by_name)[:30]: + r = by_name[n].get("rer", ("—", "—")) + z = by_name[n].get("rez", ("—", "—")) + print(f" {n:35s} rer={r[0]}[{r[1]}] rez={z[0]}[{z[1]}]") + + +def main(): + ap = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + ap.add_argument( + "package_paths", + nargs="+", + help="rez package paths (one or more)", + ) + ap.add_argument( + "--request", + nargs="+", + default=None, + help="Specific package family names to resolve (overrides sampling)", + ) + ap.add_argument( + "--n", + type=int, + default=20, + help="If --request not given, sample N random families (default: 20)", + ) + ap.add_argument( + "--seed", + type=int, + default=42, + help="RNG seed for reproducible sampling (default: 42)", + ) + args = ap.parse_args() + + for p in args.package_paths: + if not os.path.isdir(p): + print(f"not a directory: {p}", file=sys.stderr) + return 2 + + load_family = make_load_family(args.package_paths) + + if args.request: + sample = args.request + print(f"comparing {len(sample)} requested families") + else: + fams = family_names(args.package_paths) + print(f"corpus: {len(fams)} top-level families") + random.seed(args.seed) + sample = random.sample(fams, min(args.n, len(fams))) + print(f"sampling {len(sample)} families (seed={args.seed})") + print() + + counts = {} + diverge_samples = [] + rer_fail_samples = [] + + for i, name in enumerate(sample, 1): + verdict, rer, rez, err = compare_one([name], args.package_paths, load_family) + counts[verdict] = counts.get(verdict, 0) + 1 + marker = { + "identical": "✓", + "diverge": "≠", + "rer_fail": "x", + "rez_fail": "X", + "both_fail": "·", + "rez_exception": "!", + }.get(verdict, "?") + print(f" [{i:3d}/{len(sample)}] {marker} {name}") + if verdict == "diverge": + diverge_samples.append((name, rer, rez)) + elif verdict == "rer_fail": + rer_fail_samples.append((name, rer)) + + print() + print("Summary:") + for verdict in sorted(counts): + print(f" {verdict:15s} {counts[verdict]}") + total = sum(counts.values()) + diverge = counts.get("diverge", 0) + counts.get("rer_fail", 0) + pct = (total - diverge) / max(total, 1) * 100 + print(f"\n agreement: {pct:.1f}% ({total - diverge}/{total})") + + if diverge_samples: + print() + print("=== Divergent resolves ===") + for name, rer, rez in diverge_samples[:3]: + show_divergence(name, rer, rez) + + if rer_fail_samples: + print() + print("=== Rer-only failures ===") + for name, rer in rer_fail_samples[:3]: + print(f" {name}: {rer.failure_description}") + + return 0 if not diverge_samples and not rer_fail_samples else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/diff_against_rez.py b/scripts/diff_against_rez.py new file mode 100644 index 0000000..2cc6afc --- /dev/null +++ b/scripts/diff_against_rez.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 +"""Differential test: `pyrer.parse_static_package_py` vs rez's `Package`. + +For every file the Rust parser accepts, load the same file through +rez's `DeveloperPackage.from_path` and assert the four solver-relevant +fields match byte-for-byte. Any divergence is a release blocker — the +fast parser is supposed to produce a `PackageData` that's +indistinguishable from `from_rez(pkg)` for the files it accepts. A +mismatch is a silent correctness regression in any rez integration. + +This is the Stage 2 safety net from the RFC at +`docs/content/docs/engineering/fast-package-py-parser.md`. Mirrors +the role of the strict 188-case rez solver differential. + +## Run + + python scripts/diff_against_rez.py /thierry/rez/pkg + python scripts/diff_against_rez.py /thierry/rez/pkg --csv mismatches.csv + +Requires: + - `pyrer` built and importable (from `crates/rer-python`). + - `rez` importable (the dev venv has it; production rez has it). + +## Exit code + + 0 — all V2-accepted files agree with rez (or rez-eval errored). + 1 — at least one V2-accepted file diverges from rez. + 2 — rez isn't importable or the corpus path is bad. + +The `rez evaluation error` bucket is *not* a mismatch — a file that +crashes rez's evaluator is not a file the rez integration shim would +have accepted either; the slow path would also reject it. +""" +import argparse +import csv +import os +import sys +import time +from collections import Counter +from typing import Any, Dict, Iterator, List, Optional, Tuple + + +# --------------------------------------------------------------------------- +# Imports — graceful if rez or pyrer aren't available +# --------------------------------------------------------------------------- + + +try: + import pyrer + HAVE_PYRER = True +except Exception as e: + print(f"FATAL: pyrer not importable ({e})", file=sys.stderr) + print(" Build with: cd crates/rer-python && maturin develop", file=sys.stderr) + sys.exit(2) + + +try: + from rez.developer_package import DeveloperPackage + HAVE_REZ = True +except Exception as e: + print(f"FATAL: rez not importable ({e})", file=sys.stderr) + sys.exit(2) + + +# --------------------------------------------------------------------------- +# Walker — mirrors the corpus integration test's traversal so the diff +# sees the same set of files +# --------------------------------------------------------------------------- + + +def find_package_pys(root: str) -> Iterator[str]: + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [ + d + for d in dirnames + if not d.startswith(".") + and not d.startswith("_") + and not (len(d) == 40 and all(c in "0123456789abcdef" for c in d)) + ] + if "package.py" in filenames: + yield os.path.join(dirpath, "package.py") + + +# --------------------------------------------------------------------------- +# Normalisation — rez's `requires` is a list of `Requirement` objects; +# pyrer's is a list of `str`. Stringify rez's so we compare like to like. +# --------------------------------------------------------------------------- + + +def stringify_req_list(items) -> List[str]: + """rez returns `None` for an unset list, a `SourceCode` for a + late-bound one, or a list of `Requirement` for a normal one. + Convert to `list[str]`. A `SourceCode` raises so the caller can + classify the file as rez-evaluation-error rather than a mismatch.""" + if items is None: + return [] + out = [] + for r in items: + # Guard against SourceCode / weird objects without __str__. + out.append(str(r)) + return out + + +def stringify_variants(variants) -> List[List[str]]: + if variants is None: + return [] + return [stringify_req_list(v) for v in variants] + + +def field_diff(rust, rez_fields: Dict[str, Any]) -> Dict[str, Tuple[Any, Any]]: + """Return a dict of differing fields. Empty dict = full match.""" + diffs: Dict[str, Tuple[Any, Any]] = {} + if rust.name != rez_fields["name"]: + diffs["name"] = (rust.name, rez_fields["name"]) + if rust.version != rez_fields["version"]: + diffs["version"] = (rust.version, rez_fields["version"]) + rust_reqs = list(rust.requires) + if rust_reqs != rez_fields["requires"]: + diffs["requires"] = (rust_reqs, rez_fields["requires"]) + rust_vars = [list(v) for v in rust.variants] + if rust_vars != rez_fields["variants"]: + diffs["variants"] = (rust_vars, rez_fields["variants"]) + return diffs + + +# --------------------------------------------------------------------------- +# Per-file diff +# --------------------------------------------------------------------------- + + +def load_rez_fields(path: str) -> Optional[Dict[str, Any]]: + """Return rez's view of the four solver fields, or `None` on + evaluation error. Walks the same `DeveloperPackage.from_path` + code path the rez-integration shim's `from_rez` would.""" + try: + pkg = DeveloperPackage.from_path(os.path.dirname(path)) + return { + "name": str(pkg.name), + "version": str(pkg.version), + "requires": stringify_req_list(pkg.requires), + "variants": stringify_variants(pkg.variants), + } + except Exception: + return None + + +# --------------------------------------------------------------------------- +# Reporting +# --------------------------------------------------------------------------- + + +def print_mismatch(path: str, diffs: Dict[str, Tuple[Any, Any]], idx: int) -> None: + print(f"\n [{idx}] {path}") + for field, (rust_v, rez_v) in diffs.items(): + print(f" field : {field}") + rust_repr = repr(rust_v) if len(repr(rust_v)) < 200 else repr(rust_v)[:200] + " ..." + rez_repr = repr(rez_v) if len(repr(rez_v)) < 200 else repr(rez_v)[:200] + " ..." + print(f" rust : {rust_repr}") + print(f" rez : {rez_repr}") + + +def write_csv(mismatches: List[Tuple[str, Dict[str, Tuple[Any, Any]]]], csv_path: str) -> None: + with open(csv_path, "w", newline="") as f: + w = csv.writer(f) + w.writerow(["path", "field", "rust", "rez"]) + for path, diffs in mismatches: + for field, (rust_v, rez_v) in diffs.items(): + w.writerow([path, field, repr(rust_v), repr(rez_v)]) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument("corpus", help="rez repository root (e.g. /thierry/rez/pkg)") + parser.add_argument( + "--show-mismatches", + type=int, + default=10, + metavar="N", + help="Show details for the first N mismatching files (default: 10)", + ) + parser.add_argument( + "--csv", + metavar="PATH", + help="Write per-mismatch rows to a CSV at PATH", + ) + parser.add_argument( + "--max", + type=int, + default=0, + metavar="N", + help="Stop after N files (0 = no cap, default)", + ) + args = parser.parse_args() + + if not os.path.isdir(args.corpus): + print(f"not a directory: {args.corpus}", file=sys.stderr) + return 2 + + counts: Counter = Counter() + field_mismatches: Counter = Counter() + mismatch_samples: List[Tuple[str, Dict[str, Tuple[Any, Any]]]] = [] + started = time.time() + + print(f"Differential test: {args.corpus}") + print(f" pyrer: {getattr(pyrer, '__version__', '?')}") + try: + import rez + print(f" rez: {rez.__version__}") + except Exception: + pass + print() + + for path in find_package_pys(args.corpus): + counts["total"] += 1 + if args.max and counts["total"] > args.max: + break + + try: + with open(path, "rb") as f: + source = f.read().decode("utf-8", errors="replace") + except OSError: + counts["read_error"] += 1 + continue + + rust = pyrer.parse_static_package_py(source) + if rust is None: + counts["v2_bailed"] += 1 + continue + + rez_fields = load_rez_fields(path) + if rez_fields is None: + counts["rez_eval_error"] += 1 + continue + + diffs = field_diff(rust, rez_fields) + if not diffs: + counts["match"] += 1 + else: + counts["mismatch"] += 1 + for field in diffs: + field_mismatches[field] += 1 + mismatch_samples.append((path, diffs)) + + # Light progress signal every 500 files — running over CIFS + # is slow enough that a status line helps. + if counts["total"] % 500 == 0: + elapsed = time.time() - started + print( + f" ... {counts['total']:5d} files surveyed " + f"(match={counts['match']}, mismatch={counts['mismatch']}, " + f"bailed={counts['v2_bailed']}, rez_err={counts['rez_eval_error']}) " + f"in {elapsed:.1f}s", + flush=True, + ) + + elapsed = time.time() - started + + # ---------------------------------------------------------------- + # Report + # ---------------------------------------------------------------- + + print() + print("Differential test results") + print("-" * 60) + print(f" Total package.py files surveyed: {counts['total']:6d} ({elapsed:.1f}s)") + print(f" V2 accepted: {counts['match'] + counts['mismatch'] + counts['rez_eval_error']:6d}") + print(f" V2 bailed: {counts['v2_bailed']:6d}") + print(f" Read error: {counts['read_error']:6d}") + print() + accepted = counts["match"] + counts["mismatch"] + counts["rez_eval_error"] + if accepted: + print(f" Differential on V2-accepted ({accepted} files):") + print(f" Match (all four fields agree): {counts['match']:6d} ({counts['match'] / accepted * 100:.2f}%)") + print(f" Mismatch: {counts['mismatch']:6d} ({counts['mismatch'] / accepted * 100:.2f}%)") + print(f" rez evaluation error: {counts['rez_eval_error']:6d} ({counts['rez_eval_error'] / accepted * 100:.2f}%)") + print() + + if field_mismatches: + print(" Mismatch breakdown by field:") + for field, n in field_mismatches.most_common(): + print(f" {field:12s}: {n}") + print() + + if mismatch_samples: + print(f"Mismatch samples (showing up to {args.show_mismatches}):") + for i, (path, diffs) in enumerate(mismatch_samples[: args.show_mismatches]): + print_mismatch(path, diffs, i + 1) + if len(mismatch_samples) > args.show_mismatches: + print(f"\n ({len(mismatch_samples) - args.show_mismatches} more not shown)") + + if args.csv and mismatch_samples: + write_csv(mismatch_samples, args.csv) + print(f"\nWrote per-mismatch rows to {args.csv}") + + # Exit code: non-zero if there are any mismatches. + return 1 if counts["mismatch"] else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/survey_package_py.py b/scripts/survey_package_py.py new file mode 100644 index 0000000..3e499f2 --- /dev/null +++ b/scripts/survey_package_py.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 +"""Stage 1 corpus survey for the Rust `package.py` fast-parser RFC. + +Walks a tree of rez `package.py` files and classifies each one as +*fast-parseable* (the four solver-relevant fields `name`, `version`, +`requires`, `variants` are all literal assignments and the file has +no disqualifying top-level statements) or *not* — with the reasons +listed. + +The output is the go/no-go signal for Stages 2-3 of the RFC at +`docs/content/docs/engineering/fast-package-py-parser.md`. If +≥70% of a real studio repo is fast-parseable, the Rust parser is +worth building. Below 40%, prefer a parsed-package cache instead. + +The classifier recognises two ignorable patterns the Rust parser +will also need to handle: + + - `def commands()`, `def pre_commands()`, etc. — function bodies + that affect runtime environment, not the solve. + - `with scope("config")` / `with scope("build")` — rez's + declarative DSL for non-solver config. Body writes attributes + of the `as`-name, never solver fields. + +Anything else at module scope (top-level `if`, `for`, `try`, +`import`, `class`, `@early`/`@late` on a solver field) disqualifies +the file from the fast path. + +Pure-stdlib: no pyrer / rez / rustpython-parser dependency. Uses +Python's `ast` module — meaning the parser used here is the same +shape as the one Rust's `rustpython-parser` would produce, so the +classification rules carry over cleanly. + +Usage: + python scripts/survey_package_py.py /path/to/rez/repo + python scripts/survey_package_py.py /path/to/rez/repo --csv out.csv + python scripts/survey_package_py.py /path/to/rez/repo --show-examples 5 + +`Resource temporarily unavailable` errors from a CIFS mount are +caught and counted under `io-error` — they don't crash the survey. +""" +import argparse +import ast +import os +import sys +from collections import Counter +from dataclasses import dataclass, field +from typing import Iterable, List, Optional, Tuple + + +SOLVER_FIELDS = ("name", "version", "requires", "variants") + + +@dataclass +class FileVerdict: + path: str + fast_parseable: bool + reasons: List[str] = field(default_factory=list) + # Which solver fields were found, and which were literal-static. + fields_present: List[str] = field(default_factory=list) + fields_static: List[str] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# AST classification +# --------------------------------------------------------------------------- + + +def _is_str_literal(node: ast.AST) -> bool: + return isinstance(node, ast.Constant) and isinstance(node.value, str) + + +def _is_list_of_str_literal(node: ast.AST) -> bool: + """List or Tuple of string literals — what `requires` looks like.""" + if not isinstance(node, (ast.List, ast.Tuple)): + return False + return all(_is_str_literal(elt) for elt in node.elts) + + +def _is_list_of_list_of_str_literal(node: ast.AST) -> bool: + """List of List/Tuple of string literals — what `variants` looks like.""" + if not isinstance(node, (ast.List, ast.Tuple)): + return False + return all(_is_list_of_str_literal(inner) for inner in node.elts) + + +def _assign_targets(node: ast.Assign) -> List[str]: + """Names assigned to in a single Assign statement (handles a, b = ...).""" + names = [] + for target in node.targets: + if isinstance(target, ast.Name): + names.append(target.id) + elif isinstance(target, (ast.Tuple, ast.List)): + for elt in target.elts: + if isinstance(elt, ast.Name): + names.append(elt.id) + return names + + +def _function_is_early_or_late(func: ast.FunctionDef) -> bool: + """`@early()` / `@late()` decorated function — dynamic value.""" + for deco in func.decorator_list: + # `@early` (bare name) or `@early()` (call) + target = deco.func if isinstance(deco, ast.Call) else deco + if isinstance(target, ast.Name) and target.id in ("early", "late"): + return True + return False + + +def _is_scope_with(node: ast.With) -> bool: + """rez's declarative DSL: `with scope("config") as config: ...`. The + body only writes to attributes of the `as`-name (which never overlaps + a solver field), so the whole block is solver-irrelevant — treat it + the same way we treat `def commands(...)`. All `with` items must be + `scope(...)` calls for this to apply.""" + if not node.items: + return False + for item in node.items: + ctx = item.context_expr + if not ( + isinstance(ctx, ast.Call) + and isinstance(ctx.func, ast.Name) + and ctx.func.id == "scope" + ): + return False + return True + + +def classify(source: str, path: str) -> FileVerdict: + """Walk the module's top-level statements and decide if the four solver + fields are all literal-static, and whether anything else at module + level would force a bail.""" + verdict = FileVerdict(path=path, fast_parseable=False) + + try: + tree = ast.parse(source, filename=path) + except SyntaxError as e: + verdict.reasons.append(f"syntax-error:{e.msg}") + return verdict + + # Track per-field: present anywhere, and whether the last assignment + # / definition for that field is static. We treat *any* dynamic + # observation as poisoning the field — rebinds to literal afterwards + # don't help (we'd still need to evaluate the dynamic branch). + field_present = {f: False for f in SOLVER_FIELDS} + field_static = {f: False for f in SOLVER_FIELDS} + field_poisoned = {f: False for f in SOLVER_FIELDS} + + for i, stmt in enumerate(tree.body): + # Module docstring — Expr(Constant(str)) as first statement. + if ( + i == 0 + and isinstance(stmt, ast.Expr) + and _is_str_literal(stmt.value) + ): + continue + + # `import x` / `from x import y` — possibly load-bearing. + if isinstance(stmt, (ast.Import, ast.ImportFrom)): + verdict.reasons.append("imports") + continue + + # Special case: `with scope("config") as config: ...` — rez's + # declarative DSL for setting non-solver-relevant config (paths, + # release settings). Body writes attributes of the `as`-name, + # never module-level solver fields. Treat as ignorable; still + # walk in defensively to catch a pathological body that does + # touch a solver field. + if isinstance(stmt, ast.With) and _is_scope_with(stmt): + for descendant in ast.walk(stmt): + if isinstance(descendant, ast.Assign): + for n in _assign_targets(descendant): + if n in SOLVER_FIELDS: + field_poisoned[n] = True + verdict.reasons.append(f"scope-with-assigns-{n}") + continue + + # Top-level control flow — can't statically know which branch wins. + if isinstance(stmt, (ast.If, ast.For, ast.While, ast.Try, ast.With)): + verdict.reasons.append(f"top-level-{type(stmt).__name__.lower()}") + # Also need to walk into it to see if it touches solver fields, + # but the bail decision is already made — record the reason and + # mark any solver field touched as poisoned. + for descendant in ast.walk(stmt): + if isinstance(descendant, ast.Assign): + for n in _assign_targets(descendant): + if n in SOLVER_FIELDS: + field_poisoned[n] = True + continue + + # `def commands(): ...` etc., or `@early` def for a solver field. + if isinstance(stmt, ast.FunctionDef): + if stmt.name in SOLVER_FIELDS: + if _function_is_early_or_late(stmt): + verdict.reasons.append(f"dynamic-{stmt.name}") + field_poisoned[stmt.name] = True + else: + # Plain `def requires():` without @early is unusual but + # functionally dynamic — bail. + verdict.reasons.append(f"function-def-{stmt.name}") + field_poisoned[stmt.name] = True + # `def commands(...)`, `def build(...)`, etc. — not solver- + # relevant, ignore. + continue + + # AsyncFunctionDef, ClassDef — uncommon in package.py; bail. + if isinstance(stmt, (ast.AsyncFunctionDef, ast.ClassDef)): + verdict.reasons.append(f"top-level-{type(stmt).__name__.lower()}") + continue + + # Plain assignment. + if isinstance(stmt, ast.Assign): + targets = _assign_targets(stmt) + for n in targets: + if n not in SOLVER_FIELDS: + continue + field_present[n] = True + if n == "name" and _is_str_literal(stmt.value): + field_static[n] = True + elif n == "version" and _is_str_literal(stmt.value): + field_static[n] = True + elif n == "requires" and _is_list_of_str_literal(stmt.value): + field_static[n] = True + elif n == "variants" and _is_list_of_list_of_str_literal( + stmt.value + ): + field_static[n] = True + else: + field_poisoned[n] = True + continue + + # `requires: list[str] = [...]` — annotated assignment. + if isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name): + n = stmt.target.id + if n in SOLVER_FIELDS: + field_present[n] = True + if stmt.value is None: + # type-only annotation, no value bound. + field_poisoned[n] = True + elif n == "name" and _is_str_literal(stmt.value): + field_static[n] = True + elif n == "version" and _is_str_literal(stmt.value): + field_static[n] = True + elif n == "requires" and _is_list_of_str_literal(stmt.value): + field_static[n] = True + elif n == "variants" and _is_list_of_list_of_str_literal( + stmt.value + ): + field_static[n] = True + else: + field_poisoned[n] = True + continue + + # `name += "foo"`, etc. — AugAssign on a solver field is dynamic. + if isinstance(stmt, ast.AugAssign) and isinstance(stmt.target, ast.Name): + if stmt.target.id in SOLVER_FIELDS: + field_poisoned[stmt.target.id] = True + continue + + # Expression statements (function calls at module level): suspicious. + if isinstance(stmt, ast.Expr): + verdict.reasons.append("top-level-expr") + continue + + # Catch-all for anything we didn't list. + verdict.reasons.append(f"unrecognised-{type(stmt).__name__.lower()}") + + # name & version are required; requires & variants default to empty. + must_have = ("name", "version") + static_ok = True + for f in must_have: + if not field_present[f]: + verdict.reasons.append(f"missing-{f}") + static_ok = False + elif field_poisoned[f] or not field_static[f]: + verdict.reasons.append(f"non-literal-{f}") + static_ok = False + for f in ("requires", "variants"): + if field_present[f] and (field_poisoned[f] or not field_static[f]): + verdict.reasons.append(f"non-literal-{f}") + static_ok = False + + verdict.fields_present = [f for f, p in field_present.items() if p] + verdict.fields_static = [f for f, s in field_static.items() if s] + verdict.fast_parseable = static_ok and not verdict.reasons + return verdict + + +# --------------------------------------------------------------------------- +# Walking the repository +# --------------------------------------------------------------------------- + + +def find_package_pys(root: str) -> Iterable[str]: + """Yield every `package.py` under `root`. Skips dot-directories and + rez variant subdirs (40-char hex hashes) since those re-shadow the + parent's package.py with the same content.""" + for dirpath, dirnames, filenames in os.walk(root, followlinks=False): + # Prune hidden + rez variant subdirs. + dirnames[:] = [ + d for d in dirnames + if not d.startswith(".") + and not d.startswith("_") + and not _looks_like_variant_hash(d) + ] + if "package.py" in filenames: + yield os.path.join(dirpath, "package.py") + + +def _looks_like_variant_hash(name: str) -> bool: + return len(name) == 40 and all(c in "0123456789abcdef" for c in name) + + +def survey(root: str) -> Tuple[List[FileVerdict], int]: + """Walk and classify every package.py under `root`.""" + verdicts: List[FileVerdict] = [] + io_errors = 0 + for path in find_package_pys(root): + try: + with open(path, "rb") as f: + source = f.read().decode("utf-8", errors="replace") + except OSError: + io_errors += 1 + continue + verdicts.append(classify(source, path)) + return verdicts, io_errors + + +# --------------------------------------------------------------------------- +# Reporting +# --------------------------------------------------------------------------- + + +def report( + verdicts: List[FileVerdict], + io_errors: int, + show_examples: int, +) -> None: + n = len(verdicts) + if n == 0 and io_errors == 0: + print("No package.py files found.", file=sys.stderr) + return + + fast = [v for v in verdicts if v.fast_parseable] + slow = [v for v in verdicts if not v.fast_parseable] + + print(f"Surveyed: {n} package.py files") + if io_errors: + print(f" {io_errors} I/O errors (skipped — likely CIFS flakiness)") + print() + print(f" fast-parseable: {len(fast):6d} ({len(fast) / n * 100:5.1f}%)") + print(f" not fast-parseable: {len(slow):6d} ({len(slow) / n * 100:5.1f}%)") + print() + + # Reason histogram (a file can show up in multiple buckets). + reason_counts: Counter = Counter() + for v in slow: + for r in set(v.reasons): # dedupe per-file (multiple imports → one count) + reason_counts[r] += 1 + + if reason_counts: + print("Reasons (files can disqualify on multiple — counts are independent):") + print("-" * 60) + for reason, count in reason_counts.most_common(): + print(f" {reason:30s} {count:6d} ({count / n * 100:5.1f}% of total)") + print() + + if show_examples > 0: + print(f"Examples (up to {show_examples} per bucket):") + print("-" * 60) + print(f"\n fast-parseable:") + for v in fast[:show_examples]: + print(f" {v.path}") + for reason in sorted({r for v in slow for r in v.reasons}): + print(f"\n {reason}:") + examples = [v.path for v in slow if reason in v.reasons][:show_examples] + for p in examples: + print(f" {p}") + print() + + # The go/no-go decision rule from the RFC. + pct = len(fast) / n * 100 if n else 0 + print("Go / no-go (from the RFC):") + print("-" * 60) + if pct >= 70: + print(f" ≥ 70% fast-parseable ({pct:.1f}%) — PROCEED to Stages 2-3.") + elif pct >= 40: + print( + f" {pct:.1f}% fast-parseable — marginal. Worth a discussion about" + "\n which patterns to expand coverage to, or whether the" + "\n parsed-package cache alternative pays off better." + ) + else: + print( + f" < 40% fast-parseable ({pct:.1f}%) — don't build the parser." + "\n The slow-path fallback would dominate. Pivot to the" + "\n parsed-package cache alternative in the RFC." + ) + + +def write_csv(verdicts: List[FileVerdict], csv_path: str) -> None: + import csv + + with open(csv_path, "w", newline="") as f: + w = csv.writer(f) + w.writerow( + ["path", "fast_parseable", "reasons", "fields_present", "fields_static"] + ) + for v in verdicts: + w.writerow( + [ + v.path, + int(v.fast_parseable), + ";".join(sorted(set(v.reasons))), + ";".join(v.fields_present), + ";".join(v.fields_static), + ] + ) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument("root", help="rez repository root (e.g. /thierry/rez/pkg)") + parser.add_argument( + "--csv", + metavar="PATH", + help="Write per-file verdicts to a CSV at PATH (optional)", + ) + parser.add_argument( + "--show-examples", + type=int, + default=0, + metavar="N", + help="Show up to N example file paths per bucket (default: 0)", + ) + args = parser.parse_args() + + if not os.path.isdir(args.root): + print(f"not a directory: {args.root}", file=sys.stderr) + return 2 + + verdicts, io_errors = survey(args.root) + report(verdicts, io_errors, args.show_examples) + + if args.csv: + write_csv(verdicts, args.csv) + print(f"\nWrote per-file verdicts to {args.csv}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_rich_api.py b/tests/test_rich_api.py index c20d2ba..0a312bc 100644 --- a/tests/test_rich_api.py +++ b/tests/test_rich_api.py @@ -564,6 +564,84 @@ def loader(name): assert "duplicate" in (result.failure_description or "").lower() +# --------------------------------------------------------------------------- +# load_family version_range hint (issue #92) +# --------------------------------------------------------------------------- + + +def test_load_family_range_hint_passed_for_pinned_request(): + """A `lib-2+<3` request should pass `version_range="2+<3"` to the + callback so the shim can pre-filter via `iter_packages(range_=...)`.""" + seen = [] + + def loader(name, version_range=None): + seen.append((name, version_range)) + if name == "lib": + return [_pkg("lib", "2.0.0"), _pkg("lib", "2.5.0")] + return [] + + result = pyrer.solve(["lib-2+<3"], None, load_family=loader) + assert result.status == "solved" + assert len(seen) == 1 + name, hint = seen[0] + assert name == "lib" + # The exact stringification is rez-syntax-ish; just verify the + # constraint is communicated. + assert hint is not None + assert "2" in hint and "<3" in hint + + +def test_load_family_legacy_one_arg_callback_still_works(): + """A pre-#92 callback that only accepts `name` must keep working — + pyrer falls back to the old call shape.""" + seen = [] + + def loader(name): # 1-arg, no version_range + seen.append(name) + if name == "lib": + return [_pkg("lib", "2.0.0")] + return [] + + result = pyrer.solve(["lib"], None, load_family=loader) + assert result.status == "solved" + assert seen == ["lib"] + + +def test_load_family_range_hint_string_format(): + """The hint should be a rez-style range string the shim can pass + directly to `iter_packages(range_=...)`.""" + captured_hint = [] + + def loader(name, version_range=None): + captured_hint.append((name, version_range)) + return [_pkg(name, "1.5.0"), _pkg(name, "2.0.0")] + + pyrer.solve(["foo-1+<2"], None, load_family=loader) + assert len(captured_hint) == 1 + name, hint = captured_hint[0] + assert name == "foo" + # rez accepts the hint as a string — make sure that's what we pass. + assert isinstance(hint, str) + + +def test_load_family_kwargs_callback(): + """A callback using `**kwargs` to accept future args should also work.""" + seen = [] + + def loader(name, **kwargs): + seen.append((name, kwargs.get("version_range"))) + if name == "lib": + return [_pkg("lib", "1.0.0")] + return [] + + result = pyrer.solve(["lib"], None, load_family=loader) + assert result.status == "solved" + assert len(seen) == 1 + # **kwargs accepts the hint argument + name, hint = seen[0] + assert name == "lib" + + def test_from_rez_used_in_solve(): """End-to-end: from_rez → solve produces the same result as constructor.""" @@ -584,3 +662,246 @@ def __init__(self, name, version, requires=None, variants=None): assert result.status == "solved" names = {v.name for v in result.resolved_packages} assert names == {"app", "lib"} + + +# --------------------------------------------------------------------------- +# parse_static_package_py — Rust AST fast-path (RFC: fast-package-py-parser) +# --------------------------------------------------------------------------- + + +def test_parse_static_package_py_minimal(): + """Smallest static package.py — just name + version.""" + src = 'name = "foo"\nversion = "1.0.0"\n' + pd = pyrer.parse_static_package_py(src) + assert pd is not None + assert pd.name == "foo" + assert pd.version == "1.0.0" + assert pd.requires == [] + assert pd.variants == [] + + +def test_parse_static_package_py_full_static(): + """All four solver fields with literal values plus an irrelevant + `def commands()` block.""" + src = ''' +name = "maya" +version = "2024.0" +description = "irrelevant" +requires = ["python-3", "qt-5"] +variants = [["linux", "python-3.10"], ["linux", "python-3.11"]] + +def commands(): + env.PYTHONPATH.append("{root}/python") +''' + pd = pyrer.parse_static_package_py(src) + assert pd is not None + assert pd.name == "maya" + assert pd.version == "2024.0" + assert pd.requires == ["python-3", "qt-5"] + assert pd.variants == [["linux", "python-3.10"], ["linux", "python-3.11"]] + + +def test_parse_static_package_py_with_scope(): + """The dominant Fortiche pattern: solver fields at top level plus a + `with scope("config")` declarative-DSL block. 35% of Fortiche's + corpus is this shape.""" + src = ''' +# -*- coding: utf-8 -*- +name = "fortichebox" +version = "0.2.0" +requires = ["python-2.7+<3"] + +def commands(): + env["FPATH"].append("$SPACE/generic") + +with scope("config") as config: + config.release_packages_path = "/some/path" +''' + pd = pyrer.parse_static_package_py(src) + assert pd is not None + assert pd.name == "fortichebox" + assert pd.requires == ["python-2.7+<3"] + + +def test_parse_static_package_py_bails_on_early_requires(): + """@early-bound requires is dynamic — must return None so the caller + falls back to rez.""" + src = ''' +name = "foo" +version = "1.0" + +@early() +def requires(): + return ["python-3"] +''' + assert pyrer.parse_static_package_py(src) is None + + +def test_parse_static_package_py_bails_on_import(): + src = ''' +import os + +name = "foo" +version = "1.0" +''' + assert pyrer.parse_static_package_py(src) is None + + +def test_parse_static_package_py_bails_on_syntax_error(): + """Unparseable source — return None, not raise.""" + src = 'name = "foo\nversion = "1.0"\n' + assert pyrer.parse_static_package_py(src) is None + + +# --------------------------------------------------------------------------- +# parse_static_packages_py — batched / Rayon-parallel (issue #94) +# --------------------------------------------------------------------------- + + +def test_parse_static_packages_py_empty_input(): + """An empty list returns an empty list — never raises.""" + result = pyrer.parse_static_packages_py([]) + assert result == [] + + +def _write_pkg(tmp_path, name, source): + """Helper: write `source` to `tmp_path//package.py` and + return the file path as a string.""" + d = tmp_path / name + d.mkdir(parents=True, exist_ok=True) + pkg = d / "package.py" + pkg.write_text(source, encoding="utf-8") + return str(pkg) + + +def test_parse_static_packages_py_each_file_independent(tmp_path): + """Static, dynamic, missing — each maps to the right slot in the + aligned output.""" + static_path = _write_pkg( + tmp_path, + "app", + 'name = "app"\nversion = "1.0.0"\nrequires = ["lib-2"]\n', + ) + dynamic_path = _write_pkg( + tmp_path, + "dynamic", + 'import os\nname = "foo"\nversion = "1.0"\n', + ) + missing_path = str(tmp_path / "phantom" / "package.py") + static_path_2 = _write_pkg( + tmp_path, + "lib", + 'name = "lib"\nversion = "2.0.0"\n', + ) + + result = pyrer.parse_static_packages_py( + [static_path, dynamic_path, missing_path, static_path_2] + ) + assert len(result) == 4 + + assert result[0] is not None + assert result[0].name == "app" + assert result[0].version == "1.0.0" + assert result[0].requires == ["lib-2"] + + assert result[1] is None # dynamic — top-level import + assert result[2] is None # missing file + + assert result[3] is not None + assert result[3].name == "lib" + + +def test_parse_static_packages_py_preserves_order(tmp_path): + """Rayon's par_iter can complete out of order; the returned list + must match the input positions exactly.""" + paths = [] + for i in range(20): + if i % 2 == 0: + src = f'name = "pkg{i}"\nversion = "1.0"\n' + else: + # `if` at module scope → parser bails + src = f'name = "pkg{i}"\nversion = "1.0"\nif True:\n pass\n' + paths.append(_write_pkg(tmp_path, f"pkg{i}", src)) + + result = pyrer.parse_static_packages_py(paths) + assert len(result) == 20 + for i, pd in enumerate(result): + if i % 2 == 0: + assert pd is not None, f"index {i}: should be Some" + assert pd.name == f"pkg{i}" + else: + assert pd is None, f"index {i}: should be None (dynamic)" + + +def test_parse_static_packages_py_accepts_pathlib_paths(tmp_path): + """The PyO3 binding extracts `PathBuf` — `pathlib.Path` instances + should work alongside `str`.""" + from pathlib import Path + + p = _write_pkg(tmp_path, "p", 'name = "p"\nversion = "1.0"\n') + result = pyrer.parse_static_packages_py([Path(p)]) + assert len(result) == 1 + assert result[0] is not None + assert result[0].name == "p" + + +def test_parse_static_packages_py_drives_solve(tmp_path): + """End-to-end: batch-parse a small repo, feed straight into + `pyrer.solve`.""" + app_path = _write_pkg( + tmp_path, "app/1.0.0", + 'name = "app"\nversion = "1.0.0"\nrequires = ["lib-2"]\n', + ) + lib1_path = _write_pkg( + tmp_path, "lib/1.0.0", 'name = "lib"\nversion = "1.0.0"\n', + ) + lib2_path = _write_pkg( + tmp_path, "lib/2.0.0", 'name = "lib"\nversion = "2.0.0"\n', + ) + + result = pyrer.parse_static_packages_py([app_path, lib1_path, lib2_path]) + packages = [pd for pd in result if pd is not None] + assert len(packages) == 3 + + solve = pyrer.solve(["app"], packages) + assert solve.status == "solved" + names = {v.name: v.version for v in solve.resolved_packages} + assert names == {"app": "1.0.0", "lib": "2.0.0"} + + +def test_parse_static_packages_py_matches_single_file(tmp_path): + """Sanity: batched and single produce equivalent `PackageData` for + the same source.""" + src = 'name = "foo"\nversion = "1.0.0"\nrequires = ["bar"]\n' + single = pyrer.parse_static_package_py(src) + + path = _write_pkg(tmp_path, "foo", src) + batch = pyrer.parse_static_packages_py([path]) + + assert single is not None + assert len(batch) == 1 + assert batch[0] is not None + assert single.name == batch[0].name + assert single.version == batch[0].version + assert single.requires == batch[0].requires + assert single.variants == batch[0].variants + + +def test_parse_static_package_py_roundtrips_through_solve(): + """End-to-end: parse a static package.py → use the PackageData in a + solve. Should resolve identically to a hand-constructed PackageData.""" + src = ''' +name = "app" +version = "1.0.0" +requires = ["lib-2"] +''' + parsed = pyrer.parse_static_package_py(src) + constructed = pyrer.PackageData("app", "1.0.0", ["lib-2"]) + libs = [pyrer.PackageData("lib", "1.0.0"), pyrer.PackageData("lib", "2.0.0")] + + result_parsed = pyrer.solve(["app"], [parsed] + libs) + result_constructed = pyrer.solve(["app"], [constructed] + libs) + + assert result_parsed.status == "solved" + assert result_constructed.status == "solved" + assert result_parsed.resolved == result_constructed.resolved