From 09bfd79ce984df6c1dfb33f1a5611b941350e23b Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 15:56:41 -0500 Subject: [PATCH 01/58] docs(node): Phase 3 subtree-withholding plan with Task 0 spike findings --- ...6-05-phase3-subtree-content-withholding.md | 710 ++++++++++++++++++ 1 file changed, 710 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md diff --git a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md new file mode 100644 index 0000000..6aaa474 --- /dev/null +++ b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md @@ -0,0 +1,710 @@ +# Phase 3: Subtree Content Withholding (mode B) Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make a mode-`b` subtree visibility rule actually withhold that subtree's file content on clone/fetch over the node's HTTP git read path, while keeping every commit and tree SHA intact, so a non-reader sees the directory structure and blob SHAs but never the private bytes. + +**Architecture:** The authorization decision already exists as the pure `visibility_check` (one decision per path). Phase 3 adds two node-side pieces: (1) a blob-OID resolver that, given a repo's refs plus the caller's rules, returns the set of blob object IDs the caller may not read (a blob is withheld only if it appears at no allowed path); and (2) a filtered `upload-pack` serve path that builds the response pack excluding those OIDs. 
The two existing read handlers (`git_info_refs`, `git_upload_pack`) keep their current whole-repo 404 gate unchanged and gain a filtered serve branch when, and only when, the caller has at least one withheld blob. Trees and commits are always sent in full, so SHAs stay intact; only blob content is omitted. + +**Tech Stack:** Rust, axum, the system `git` CLI (shelled out, as the codebase already does in `git/store.rs` and `git/smart_http.rs`), `tempfile` for fixture repos in tests. + +**Scope boundary:** This plan covers the node-side enforcement and the security guarantee (private blob bytes are never placed in the served pack), proven by inspecting the produced pack. It deliberately does NOT cover: the `git-remote-gitlawb` client-side change that lets a non-reader get a *clean* partial checkout (a stock `git clone` of a repo with a withheld blob will fail at checkout on the missing object; that UX work is a separate follow-up plan), filtered-pack caching, or incremental-fetch (`have`-line) hardening beyond what falls out naturally. Those are listed under "Out of scope / follow-ups" at the end. + +--- + +## File Structure + +- **Create:** `crates/gitlawb-node/src/git/visibility_pack.rs`: the blob-OID resolver (`withheld_blob_oids`) and its tests. One responsibility: decide which blob OIDs to withhold for a caller. +- **Modify:** `crates/gitlawb-node/src/git/mod.rs`: add `pub mod visibility_pack;`. +- **Modify:** `crates/gitlawb-node/src/git/smart_http.rs`: add `upload_pack_excluding` (filtered serve) alongside the existing `upload_pack`, plus a small `pack_object_ids` test helper. +- **Modify:** `crates/gitlawb-node/src/api/repos.rs`: in `git_upload_pack` (around line 368-407) branch to the filtered serve when the caller has withheld blobs; `git_info_refs` (around line 308-365) needs no functional change but gets a confirming test. 
+- **Modify (test oracle only):** `crates/gitlawb-node/src/visibility.rs`: no logic change; `visibility_check` is reused as-is by the resolver. + +--- + +## Task 0: Spike: pin the filtered-serve mechanism + +This is the one genuinely uncertain piece: how to make `git upload-pack` (or `git pack-objects`) produce a clone/fetch response that omits a specific set of blob OIDs while still sending the trees that reference them, and how to frame that as a valid `application/x-git-upload-pack-result` body. Everything downstream depends on a single function signature, not on the mechanism, so this task nails the mechanism by experiment and records the result. No production code is committed in this task. + +**Files:** +- Scratch only (a throwaway shell script and a temp repo). Findings are written back into this plan's "Task 0 Findings" block below. + +- [ ] **Step 1: Build a fixture repo with a public and a private file** + +Run: +```bash +cd "$(mktemp -d)" && export FIX=$PWD +git init -q work && cd work +git config user.email t@t && git config user.name t +mkdir -p public secret +echo "public bytes" > public/a.txt +echo "TOP SECRET" > secret/b.txt +git add . && git commit -qm init +SECRET_OID=$(git rev-parse HEAD:secret/b.txt) +PUBLIC_OID=$(git rev-parse HEAD:public/a.txt) +echo "secret blob=$SECRET_OID public blob=$PUBLIC_OID" +cd .. 
&& git clone -q --bare work bare.git +``` + +- [ ] **Step 2: Produce a pack that excludes the secret blob OID** + +Run (mechanism candidate: explicit object list to `pack-objects`): +```bash +cd "$FIX/bare.git" +# Every object reachable from all refs, as "oid [path]" lines: +git rev-list --objects --all > /tmp/all_objs.txt +# Drop the secret blob's line, keep only the OID column: +grep -v "^$SECRET_OID" /tmp/all_objs.txt | awk '{print $1}' > /tmp/keep_oids.txt +# Build a pack of exactly those objects: +git pack-objects --stdout < /tmp/keep_oids.txt > /tmp/filtered.pack +# Index the pack (verify-pack reads the .idx), then confirm the secret blob is absent and the public blob present: +git index-pack /tmp/filtered.pack >/dev/null && git verify-pack -v /tmp/filtered.idx | grep -E "$SECRET_OID|$PUBLIC_OID" || echo "secret absent (expected: only public line prints)" +``` +Expected: the public OID prints, the secret OID does not. This proves the OID-exclusion mechanism. + +- [ ] **Step 3: Determine the upload-pack response framing** + +Run, capturing the exact bytes a real clone request/response uses, so the framing in Task 2 is correct rather than guessed: +```bash +cd "$FIX/bare.git" +git config uploadpack.allowFilter true +# Capture a normal v2 clone's request body and response shape: +GIT_TRACE_PACKET=1 git -c protocol.version=2 clone -q --bare "$FIX/bare.git" "$FIX/clone1.git" 2>/tmp/trace.txt +# Inspect the fetch command + response sections (look for "packfile", sideband 0001/0002, flush 0000): +grep -E "fetch|want|packfile|0000|ACK|NAK|ready" /tmp/trace.txt | head -40 +``` +Record from the trace: (a) whether the node should target protocol v2 or v0, (b) the exact section markers around the packfile, (c) whether sideband-64k framing is in use.
+ +- [ ] **Step 4: Decide the serve implementation and write findings** + +Choose the implementation for `upload_pack_excluding` based on Steps 1-3, preferring the lowest-risk option that the trace confirms works: + +- **Option A (preferred): delegate to `git upload-pack` with an injected mandatory filter.** Set `uploadpack.allowFilter=true`, rewrite the client's fetch request to carry `filter sparse:oid=<blob-ish>` (v2) where the spec blob excludes the denied paths, and let `git upload-pack` build and frame the entire response. Lowest framing risk; depends on `sparse:oid` negation behaving (verify in Step 2 variant). +- **Option B (fallback): hand-build the pack.** Parse `want` OIDs from the request body, run `git rev-list --objects <wants>` minus the withheld OIDs, pipe to `git pack-objects --stdout`, and frame the result per the markers captured in Step 3. + +Write the chosen option, the exact `git` invocation(s), and the framing bytes into the "Task 0 Findings" block below. The downstream tasks reference `upload_pack_excluding(repo_path, request_body, withheld_oids) -> Result<Response>` regardless of which option is recorded here. + +- [ ] **Step 5: No commit** + +This task records findings only; there is nothing to commit. + +### Task 0 Findings + +Executed 2026-06-06. Results: + +- **Mechanism chosen:** Option B (hand-built pack). `sparse:oid` negation was not needed; explicit OID exclusion via `rev-list` + `pack-objects` is deterministic and self-contained. +- **Exact git invocation(s):** + - `git rev-list --objects --all` (in repo dir) to enumerate reachable objects as `oid [path]` lines. + - Filter out withheld OIDs (first whitespace column), feed remaining OIDs newline-delimited to `git pack-objects --stdout`. + - Verified exclusion by `git index-pack <pack>` then `git verify-pack -v <idx>`: secret blob absent, public blob present. Confirmed. +- **Protocol version targeted:** v2 packfile section.
The serve hand-frames the body, so no `GIT_PROTOCOL`/`-c protocol.version` flag is passed to our own process; we emit the v2 `packfile` section bytes directly. +- **Response framing (captured by driving `git upload-pack --stateless-rpc` with `GIT_PROTOCOL=version=2`):** + - `pkt_line("packfile\n")` (plain control pkt-line, not a sideband band). + - Then sideband-64k bands: `0x02` = progress (optional, we omit), `0x01` = pack data whose payload begins `PACK...`. + - Pack data chunked under the pkt-line limit, each chunk prefixed with `0x01`. + - Terminated by `0000` flush. + - This matches the plan's Option B framing in Task 2 exactly; no adjustment needed. +- **Confirmed:** served pack contains PUBLIC_OID, excludes SECRET_OID. + +--- + +## Task 1: Blob-OID resolver: withhold a private subtree's blobs for a non-reader + +**Files:** +- Create: `crates/gitlawb-node/src/git/visibility_pack.rs` +- Modify: `crates/gitlawb-node/src/git/mod.rs` (add module) + +- [ ] **Step 1: Register the module** + +In `crates/gitlawb-node/src/git/mod.rs`, add the line in alphabetical position (after `pub mod store;`): +```rust +pub mod visibility_pack; +``` + +- [ ] **Step 2: Write the failing test (non-reader withholds only the private blob)** + +Create `crates/gitlawb-node/src/git/visibility_pack.rs` with the test module first: +```rust +//! Resolve which blob OIDs must be withheld from a caller because every path +//! at which the blob appears is denied by the repo's visibility rules. Trees +//! and commits are never withheld (mode B keeps SHAs intact); only blob +//! content is held back. 
+ +use crate::db::{VisibilityMode, VisibilityRule}; +use crate::git::store; +use crate::visibility::{visibility_check, Decision}; +use anyhow::{Context, Result}; +use std::collections::HashSet; +use std::path::Path; + +#[cfg(test)] +mod tests { + use super::*; + use chrono::Utc; + use std::process::Command; + use tempfile::TempDir; + + fn rule(path_glob: &str, readers: &[&str]) -> VisibilityRule { + VisibilityRule { + id: "x".into(), + repo_id: "r1".into(), + path_glob: path_glob.into(), + mode: VisibilityMode::B, + reader_dids: readers.iter().map(|s| s.to_string()).collect(), + created_by: "did:key:zOwner".into(), + created_at: Utc::now(), + } + } + + const OWNER: &str = "did:key:zOwner"; + + /// Build a bare repo with public/a.txt and secret/b.txt at one commit. + /// Returns (tempdir, bare_path, secret_blob_oid, public_blob_oid). + fn fixture() -> (TempDir, std::path::PathBuf, String, String) { + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let run = |args: &[&str], dir: &Path| { + let ok = Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(); + assert!(ok, "git {args:?} failed"); + }; + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"public bytes\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"TOP SECRET\n").unwrap(); + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let secret = oid("secret/b.txt"); + let public = oid("public/a.txt"); + run( + &["clone", "-q", "--bare", 
work.to_str().unwrap(), bare.to_str().unwrap()], + td.path(), + ); + (td, bare, secret, public) + } + + #[test] + fn non_reader_withholds_only_the_private_blob() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zStranger")).unwrap(); + assert!(withheld.contains(&secret), "secret blob must be withheld"); + assert!(!withheld.contains(&public), "public blob must NOT be withheld"); + } + + #[test] + fn owner_withholds_nothing() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, Some(OWNER)).unwrap(); + assert!(withheld.is_empty(), "owner sees everything"); + let _ = (secret, public); + } + + #[test] + fn listed_reader_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zFriend")).unwrap(); + assert!(withheld.is_empty(), "listed reader sees the subtree"); + } + + #[test] + fn no_subtree_rules_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let withheld = withheld_blob_oids(&bare, &[], true, OWNER, None).unwrap(); + assert!(withheld.is_empty(), "public repo, no rules, nothing withheld"); + } +} +``` + +- [ ] **Step 3: Run the test to verify it fails** + +Run: `cargo test -p gitlawb-node visibility_pack:: -- --nocapture` +Expected: FAIL to compile with "cannot find function `withheld_blob_oids`". + +- [ ] **Step 4: Implement `withheld_blob_oids`** + +Add above the `#[cfg(test)]` block in `visibility_pack.rs`: +```rust +/// List every (blob_oid, "/repo/relative/path") pair reachable from any branch +/// ref in `repo_path`. 
Uses `git ls-tree -r` per ref so each path a blob lives +/// at is represented (the same blob content can appear at several paths). Paths +/// are returned with a leading "/" to match the glob form used by visibility +/// rules ("/secret/**"). +fn blob_paths(repo_path: &Path) -> Result<Vec<(String, String)>> { + let refs = store::list_refs(repo_path).context("list_refs failed")?; + let mut out = Vec::new(); + for (refname, _oid) in refs { + if !refname.starts_with("refs/heads/") && !refname.starts_with("refs/tags/") { + continue; + } + let listing = std::process::Command::new("git") + .args(["ls-tree", "-r", &refname]) + .current_dir(repo_path) + .output() + .context("git ls-tree -r failed")?; + if !listing.status.success() { + continue; + } + for line in String::from_utf8_lossy(&listing.stdout).lines() { + // "<mode> blob <oid>\t<path>" + let Some((meta, path)) = line.split_once('\t') else { + continue; + }; + let mut parts = meta.split_whitespace(); + let _mode = parts.next(); + let kind = parts.next(); + let oid = parts.next(); + if kind == Some("blob") { + if let Some(oid) = oid { + out.push((oid.to_string(), format!("/{path}"))); + } + } + } + } + Ok(out) +} + +/// Blob OIDs the caller may not read. A blob is withheld only if visibility +/// denies the caller at *every* path the blob appears at; a blob that is also +/// reachable through an allowed path is sent (its content is public elsewhere). +/// +/// The whole-repo "/" gate is handled by the caller before this function runs: +/// if "/" denies, the caller gets a 404 and never reaches the filtered serve. +pub fn withheld_blob_oids( + repo_path: &Path, + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Result<HashSet<String>> { + let mut denied: HashSet<String> = HashSet::new(); + let mut allowed: HashSet<String> = HashSet::new(); + for (oid, path) in blob_paths(repo_path)?
{ + match visibility_check(rules, is_public, owner_did, caller, &path) { + Decision::Deny => { + denied.insert(oid); + } + Decision::Allow => { + allowed.insert(oid); + } + } + } + Ok(denied.difference(&allowed).cloned().collect()) +} +``` + +- [ ] **Step 5: Run the tests to verify they pass** + +Run: `cargo test -p gitlawb-node visibility_pack::` +Expected: PASS (4 tests). + +- [ ] **Step 6: Commit** + +```bash +git add crates/gitlawb-node/src/git/visibility_pack.rs crates/gitlawb-node/src/git/mod.rs +git commit -m "feat(node): resolve withheld blob OIDs for path-scoped visibility" +``` + +--- + +## Task 2: Filtered upload-pack serve (`upload_pack_excluding`) + +**Files:** +- Modify: `crates/gitlawb-node/src/git/smart_http.rs` + +Implement using the mechanism recorded in **Task 0 Findings**. The code below is written for **Option B (hand-built pack)** because it is self-contained and deterministic; if Task 0 recorded Option A, implement that instead behind the identical signature and adjust the test in Step 2 only where it inspects framing (the object-content assertion stays). + +- [ ] **Step 1: Add the test module with a pack-inspection helper and the failing test** + +At the bottom of `smart_http.rs`, add a `#[cfg(test)] mod tests` containing the pack-inspection helper (lists the OIDs inside a raw pack so tests can assert membership) and the first failing test: +```rust +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + /// List OIDs in a pack by writing it to a temp dir and running verify-pack. + pub(super) fn pack_object_ids(pack: &[u8]) -> std::collections::HashSet<String> { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("test.pack"); + std::fs::write(&path, pack).unwrap(); + // index-pack creates the matching .idx next to the pack.
+ let ok = Command::new("git") + .args(["index-pack", path.to_str().unwrap()]) + .status() + .unwrap() + .success(); + assert!(ok, "index-pack failed"); + let out = Command::new("git") + .args(["verify-pack", "-v", path.to_str().unwrap()]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout) + .lines() + .filter_map(|l| l.split_whitespace().next()) + .filter(|t| t.len() == 40 && t.chars().all(|c| c.is_ascii_hexdigit())) + .map(|s| s.to_string()) + .collect() + } + + #[tokio::test] + async fn filtered_serve_excludes_withheld_blob() { + // Build a bare repo, capture the secret + public blob OIDs. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git").args(args).current_dir(dir).status().unwrap().success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git").args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work).output().unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret = oid("secret/b.txt"); + let public = oid("public/a.txt"); + g(&["clone", "-q", "--bare", work.to_str().unwrap(), bare.to_str().unwrap()], td.path()); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret.clone()); + + let pack = build_filtered_pack(&bare, &withheld).unwrap(); + let ids = pack_object_ids(&pack); + assert!(ids.contains(&public), "public blob must be in the pack"); + assert!(!ids.contains(&secret), "secret blob must NOT be in the pack"); + 
} +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `cargo test -p gitlawb-node smart_http::tests::filtered_serve_excludes_withheld_blob` +Expected: FAIL to compile with "cannot find function `build_filtered_pack`". + +- [ ] **Step 3: Implement `build_filtered_pack` and `upload_pack_excluding`** + +Add to `smart_http.rs` (above the `#[cfg(test)]` block). `build_filtered_pack` is the deterministic core (unit-tested in Step 1); `upload_pack_excluding` frames it as an HTTP response using the markers recorded in Task 0 Findings: +```rust +use std::collections::HashSet; + +/// Build a packfile containing every object reachable from all refs EXCEPT the +/// given blob OIDs. Commits and trees are always included, so SHAs stay intact; +/// only the named blobs are dropped. +pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet<String>) -> Result<Vec<u8>> { + // All reachable objects as "oid [path]" lines. + let rev = std::process::Command::new("git") + .args(["rev-list", "--objects", "--all"]) + .current_dir(repo_path) + .output()?; + if !rev.status.success() { + bail!("git rev-list failed: {}", String::from_utf8_lossy(&rev.stderr)); + } + let mut keep = Vec::new(); + for line in String::from_utf8_lossy(&rev.stdout).lines() { + let oid = line.split_whitespace().next().unwrap_or(""); + if oid.is_empty() || withheld.contains(oid) { + continue; + } + keep.push(oid.to_string()); + } + let mut child = std::process::Command::new("git") + .args(["pack-objects", "--stdout"]) + .current_dir(repo_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + { + use std::io::Write as _; + let mut stdin = child.stdin.take().expect("stdin"); + stdin.write_all(keep.join("\n").as_bytes())?; + stdin.write_all(b"\n")?; + } + let out = child.wait_with_output()?; + if !out.status.success() { + bail!("git pack-objects failed: {}", String::from_utf8_lossy(&out.stderr)); + } + Ok(out.stdout) +} + +/// Serve a clone/fetch with the withheld blobs
removed from the response pack. +/// Framing follows Task 0 Findings; the body wraps `build_filtered_pack` output +/// in the upload-pack `packfile` section with sideband-64k, terminated by flush. +pub async fn upload_pack_excluding( + repo_path: &Path, + _request_body: Bytes, + withheld: &HashSet<String>, +) -> Result<Response> { + let pack = build_filtered_pack(repo_path, withheld)?; + let mut body = Vec::new(); + body.extend_from_slice(&pkt_line("packfile\n")); + // sideband-64k: band 1 carries pack data, chunked under the pkt-line limit. + for chunk in pack.chunks(65515) { + let mut framed = Vec::with_capacity(chunk.len() + 1); + framed.push(0x01); + framed.extend_from_slice(chunk); + let len = framed.len() + 4; + body.extend_from_slice(format!("{len:04x}").as_bytes()); + body.extend_from_slice(&framed); + } + body.extend_from_slice(b"0000"); + Ok(Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/x-git-upload-pack-result") + .header("Cache-Control", "no-cache") + .body(Body::from(body))?) +} +``` +> If Task 0 recorded **Option A**, replace the two functions above with the injected-filter delegation to `git upload-pack`, keeping the `build_filtered_pack` name as a thin wrapper so the Step 1 test still drives the OID-exclusion guarantee. + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `cargo test -p gitlawb-node smart_http::tests::filtered_serve_excludes_withheld_blob` +Expected: PASS.
+ +- [ ] **Step 5: Commit** + +```bash +git add crates/gitlawb-node/src/git/smart_http.rs +git commit -m "feat(node): filtered upload-pack serve that omits withheld blobs" +``` + +--- + +## Task 3: Wire filtered serve into the upload-pack handler + +**Files:** +- Modify: `crates/gitlawb-node/src/api/repos.rs` (`git_upload_pack`, lines ~368-407) + +- [ ] **Step 1: Add the imports** + +At the top of `repos.rs`, in the existing `use crate::git::{...}` group, add `visibility_pack`: +```rust +use crate::git::{smart_http, store, visibility_pack}; +``` +(If `store` is not already in that group, keep whatever is there and append `visibility_pack`.) + +- [ ] **Step 2: Branch to the filtered serve** + +In `git_upload_pack`, the current body computes `rules`, runs the whole-repo `visibility_check(..., "/")` 404 gate, acquires `disk_path`, then calls `smart_http::upload_pack(&disk_path, body)`. Keep the 404 gate and the `acquire` exactly as they are. Replace only the single serve call: +```rust + let disk_path = state + .repo_store + .acquire(&record.owner_did, &record.name) + .await + .map_err(|e| AppError::Git(e.to_string()))?; + let body_len = body.len(); + + let withheld = + visibility_pack::withheld_blob_oids(&disk_path, &rules, record.is_public, &record.owner_did, caller) + .map_err(|e| AppError::Git(e.to_string()))?; + + let resp = if withheld.is_empty() { + smart_http::upload_pack(&disk_path, body).await + } else { + tracing::info!(repo = %name, caller = ?caller, withheld = withheld.len(), "serving filtered pack"); + smart_http::upload_pack_excluding(&disk_path, body, &withheld).await + } + .map_err(|e| { + let msg = e.to_string(); + if msg.contains("bad line length") || msg.contains("protocol error") { + tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); + AppError::BadRequest(msg) + } else { + tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); + AppError::Git(msg) + } + })?; +``` +Leave the 
`crate::metrics::record_fetch(...)` line and everything after it unchanged. + +- [ ] **Step 3: Verify the crate builds and existing tests pass** + +Run: `cargo test -p gitlawb-node` +Expected: PASS, including the Phase 1 whole-repo visibility tests (no regression). The new fast-path (`withheld.is_empty()`) must keep public and fully-authorized clones byte-identical to before. + +- [ ] **Step 4: Commit** + +```bash +git add crates/gitlawb-node/src/api/repos.rs +git commit -m "feat(node): serve filtered pack when caller has withheld subtree blobs" +``` + +--- + +## Task 4: End-to-end clone test through a real git client + +**Files:** +- Modify: `crates/gitlawb-node/src/git/smart_http.rs` (extend `mod tests`) + +This proves the served body is a clone a real `git` accepts and that the private bytes are absent from the resulting object store, which is the security guarantee. + +- [ ] **Step 1: Write the failing end-to-end test** + +Add to `smart_http.rs` `mod tests`: +```rust + #[tokio::test] + async fn client_clone_lacks_withheld_blob_bytes() { + use axum::body::to_bytes; + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git").args(args).current_dir(dir).status().unwrap().success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let secret_oid = { + let o = Command::new("git").args(["rev-parse", "HEAD:secret/b.txt"]) + .current_dir(&work).output().unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + g(&["clone", "-q", "--bare", 
work.to_str().unwrap(), bare.to_str().unwrap()], td.path()); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret_oid.clone()); + + let resp = upload_pack_excluding(&bare, Bytes::new(), &withheld).await.unwrap(); + let body = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let ids = pack_object_ids(&extract_pack(&body)); + assert!(!ids.contains(&secret_oid), "withheld blob must be absent from served pack"); + } + + /// Strip the upload-pack `packfile` section framing, returning the raw pack. + /// Mirrors how a client de-frames the sideband-64k band-1 stream. + fn extract_pack(body: &[u8]) -> Vec<u8> { + let mut out = Vec::new(); + let mut i = 0; + while i + 4 <= body.len() { + let len = usize::from_str_radix( + std::str::from_utf8(&body[i..i + 4]).unwrap_or("0000"), + 16, + ) + .unwrap_or(0); + if len == 0 { + i += 4; + continue; + } + let chunk = &body[i + 4..i + len]; + // band 1 = pack data; skip "packfile\n" control line and other bands. + if chunk.first() == Some(&0x01) { + out.extend_from_slice(&chunk[1..]); + } + i += len; + } + out + } +``` +> If Task 0 chose Option A (delegated framing), `extract_pack` may need adjusting to the exact bands git emits; use the trace from Task 0 Step 3 to confirm. + +- [ ] **Step 2: Run the test to verify it fails (then passes once framing is right)** + +Run: `cargo test -p gitlawb-node smart_http::tests::client_clone_lacks_withheld_blob_bytes` +Expected: initially may FAIL if framing constants are off; iterate `extract_pack` / framing against Task 0 findings until PASS. Success criterion: the withheld OID is absent from the served pack.
+ +- [ ] **Step 3: Commit** + +```bash +git add crates/gitlawb-node/src/git/smart_http.rs +git commit -m "test(node): end-to-end assert served pack omits withheld blob" +``` + +--- + +## Task 5: Confirm `info/refs` does not leak and stays consistent + +**Files:** +- Modify: `crates/gitlawb-node/src/api/repos.rs` (no logic change to `git_info_refs`; add a confirming comment only if needed) + +The ref advertisement lists commit tips, not blob content, so a mode-B subtree does not require hiding any ref: a non-reader still clones the same commits, just without the private blobs. This task records that decision so a future reader does not "fix" it by gating `info/refs` on subtree rules. + +- [ ] **Step 1: Add a clarifying comment** + +In `git_info_refs`, next to the existing whole-repo gate (the `if service == "git-upload-pack"` block around line 330), append one line after the existing comment: +```rust + // Subtree (mode B) rules do not gate the advertisement: refs expose commit + // tips only, and blob withholding happens in the upload-pack pack build. +``` + +- [ ] **Step 2: Verify nothing else changed** + +Run: `git diff crates/gitlawb-node/src/api/repos.rs` +Expected: only the one comment line added in `git_info_refs`; the whole-repo 404 gate is untouched. + +- [ ] **Step 3: Commit** + +```bash +git add crates/gitlawb-node/src/api/repos.rs +git commit -m "docs(node): note why info/refs is not gated on subtree visibility" +``` + +--- + +## Task 6: Full verification gate + +**Files:** none (verification only) + +- [ ] **Step 1: Format** + +Run: `cargo fmt --all && cargo fmt --all --check` +Expected: clean (no diff). + +- [ ] **Step 2: Lint** + +Run: `cargo clippy --all-targets -- -D warnings` +Expected: no warnings. + +- [ ] **Step 3: Full test suite** + +Run: `cargo test -p gitlawb-node` +Expected: all pass, including Phase 1 visibility tests and the new `visibility_pack` and `smart_http` tests. 
+ +- [ ] **Step 4: Manual smoke (optional but recommended)** + +Set a subtree rule on a local repo via `gl visibility`, clone as a non-reader through the node, and confirm the private file's bytes are absent (`git cat-file -p HEAD:secret/b.txt` fails or the file is missing) while the tree entry / SHA is still listed (`git ls-tree HEAD secret/`). + +--- + +## Out of scope / follow-ups (separate plans) + +1. **`git-remote-gitlawb` partial-clone UX.** Make a non-reader's clone produce a clean partial checkout rather than a checkout error on the missing blob: the helper requests partial-clone semantics and treats withheld blobs as deliberately absent. Without this, a stock `git clone` of a repo with a withheld blob succeeds at fetch but errors at checkout. The security guarantee (bytes never sent) holds regardless; this is purely UX. +2. **Filtered-pack caching.** `build_filtered_pack` recomputes per request. If hot, cache by (repo, tip-OIDs, withheld-set) and invalidate on push. +3. **Incremental fetch (`have` lines).** This plan targets the clone case. Confirm and, if needed, harden the filtered serve for fetches that send `have` lines so withheld blobs are never sent incrementally either. +4. **Replication-path enforcement (Phase 2).** Still blocked on the maintainer A/B decision; unrelated to this HTTP-path work. 
+``` From 8ab6de9bb2aa905ec127dc308c9e261422234ab1 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:00:43 -0500 Subject: [PATCH 02/58] feat(node): resolve withheld blob OIDs for path-scoped visibility --- crates/gitlawb-node/src/git/mod.rs | 1 + .../gitlawb-node/src/git/visibility_pack.rs | 191 ++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 crates/gitlawb-node/src/git/visibility_pack.rs diff --git a/crates/gitlawb-node/src/git/mod.rs b/crates/gitlawb-node/src/git/mod.rs index 4dcd233..49259d5 100644 --- a/crates/gitlawb-node/src/git/mod.rs +++ b/crates/gitlawb-node/src/git/mod.rs @@ -3,3 +3,4 @@ pub mod repo_store; pub mod smart_http; pub mod store; pub mod tigris; +pub mod visibility_pack; diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs new file mode 100644 index 0000000..bf3c45f --- /dev/null +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -0,0 +1,191 @@ +//! Resolve which blob OIDs must be withheld from a caller because every path +//! at which the blob appears is denied by the repo's visibility rules. Trees +//! and commits are never withheld (mode B keeps SHAs intact); only blob +//! content is held back. + +use crate::db::VisibilityRule; +use crate::git::store; +use crate::visibility::{visibility_check, Decision}; +use anyhow::{Context, Result}; +use std::collections::HashSet; +use std::path::Path; + +#[allow(dead_code)] +/// List every (blob_oid, "/repo/relative/path") pair reachable from any branch +/// ref in `repo_path`. Uses `git ls-tree -r` per ref so each path a blob lives +/// at is represented (the same blob content can appear at several paths). Paths +/// are returned with a leading "/" to match the glob form used by visibility +/// rules ("/secret/**"). 
+fn blob_paths(repo_path: &Path) -> Result> { + let refs = store::list_refs(repo_path).context("list_refs failed")?; + let mut out = Vec::new(); + for (refname, _oid) in refs { + if !refname.starts_with("refs/heads/") && !refname.starts_with("refs/tags/") { + continue; + } + let listing = std::process::Command::new("git") + .args(["ls-tree", "-r", &refname]) + .current_dir(repo_path) + .output() + .context("git ls-tree -r failed")?; + if !listing.status.success() { + continue; + } + for line in String::from_utf8_lossy(&listing.stdout).lines() { + // " blob \t" + let Some((meta, path)) = line.split_once('\t') else { + continue; + }; + let mut parts = meta.split_whitespace(); + let _mode = parts.next(); + let kind = parts.next(); + let oid = parts.next(); + if kind == Some("blob") { + if let Some(oid) = oid { + out.push((oid.to_string(), format!("/{path}"))); + } + } + } + } + Ok(out) +} + +#[allow(dead_code)] +/// Blob OIDs the caller may not read. A blob is withheld only if visibility +/// denies the caller at *every* path the blob appears at; a blob that is also +/// reachable through an allowed path is sent (its content is public elsewhere). +/// +/// The whole-repo "/" gate is handled by the caller before this function runs: +/// if "/" denies, the caller gets a 404 and never reaches the filtered serve. +pub fn withheld_blob_oids( + repo_path: &Path, + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Result> { + let mut denied: HashSet = HashSet::new(); + let mut allowed: HashSet = HashSet::new(); + for (oid, path) in blob_paths(repo_path)? 
{ + match visibility_check(rules, is_public, owner_did, caller, &path) { + Decision::Deny => { + denied.insert(oid); + } + Decision::Allow => { + allowed.insert(oid); + } + } + } + Ok(denied.difference(&allowed).cloned().collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::VisibilityMode; + use chrono::Utc; + use std::process::Command; + use tempfile::TempDir; + + fn rule(path_glob: &str, readers: &[&str]) -> VisibilityRule { + VisibilityRule { + id: "x".into(), + repo_id: "r1".into(), + path_glob: path_glob.into(), + mode: VisibilityMode::B, + reader_dids: readers.iter().map(|s| s.to_string()).collect(), + created_by: "did:key:zOwner".into(), + created_at: Utc::now(), + } + } + + const OWNER: &str = "did:key:zOwner"; + + /// Build a bare repo with public/a.txt and secret/b.txt at one commit. + /// Returns (tempdir, bare_path, secret_blob_oid, public_blob_oid). + fn fixture() -> (TempDir, std::path::PathBuf, String, String) { + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let run = |args: &[&str], dir: &Path| { + let ok = Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(); + assert!(ok, "git {args:?} failed"); + }; + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"public bytes\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"TOP SECRET\n").unwrap(); + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let secret = oid("secret/b.txt"); + let public = 
oid("public/a.txt"); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + (td, bare, secret, public) + } + + #[test] + fn non_reader_withholds_only_the_private_blob() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zStranger")).unwrap(); + assert!(withheld.contains(&secret), "secret blob must be withheld"); + assert!( + !withheld.contains(&public), + "public blob must NOT be withheld" + ); + } + + #[test] + fn owner_withholds_nothing() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, Some(OWNER)).unwrap(); + assert!(withheld.is_empty(), "owner sees everything"); + let _ = (secret, public); + } + + #[test] + fn listed_reader_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zFriend")).unwrap(); + assert!(withheld.is_empty(), "listed reader sees the subtree"); + } + + #[test] + fn no_subtree_rules_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let withheld = withheld_blob_oids(&bare, &[], true, OWNER, None).unwrap(); + assert!( + withheld.is_empty(), + "public repo, no rules, nothing withheld" + ); + } +} From 00179836996774d2d2068a27240d2cdbca625a57 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:03:12 -0500 Subject: [PATCH 03/58] feat(node): filtered upload-pack serve that omits withheld blobs --- crates/gitlawb-node/src/git/smart_http.rs | 166 ++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/crates/gitlawb-node/src/git/smart_http.rs 
b/crates/gitlawb-node/src/git/smart_http.rs index 6a00107..e39b747 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -3,6 +3,7 @@ use axum::body::Body; use axum::http::StatusCode; use axum::response::Response; use bytes::Bytes; +use std::collections::HashSet; use std::path::Path; use std::process::Stdio; use tokio::io::AsyncWriteExt; @@ -120,3 +121,168 @@ fn pkt_line(data: &str) -> Vec { let len = data.len() + 4; format!("{len:04x}{data}").into_bytes() } + +/// Build a packfile containing every object reachable from all refs EXCEPT the +/// given blob OIDs. Commits and trees are always included, so SHAs stay intact; +/// only the named blobs are dropped. +// #[allow(dead_code)] removed when wired into the upload-pack handler in the next task. +#[allow(dead_code)] +pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Result> { + // All reachable objects as "oid [path]" lines. + let rev = std::process::Command::new("git") + .args(["rev-list", "--objects", "--all"]) + .current_dir(repo_path) + .output()?; + if !rev.status.success() { + bail!( + "git rev-list failed: {}", + String::from_utf8_lossy(&rev.stderr) + ); + } + let mut keep = Vec::new(); + for line in String::from_utf8_lossy(&rev.stdout).lines() { + let oid = line.split_whitespace().next().unwrap_or(""); + if oid.is_empty() || withheld.contains(oid) { + continue; + } + keep.push(oid.to_string()); + } + let mut child = std::process::Command::new("git") + .args(["pack-objects", "--stdout"]) + .current_dir(repo_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + { + use std::io::Write as _; + let mut stdin = child.stdin.take().expect("stdin"); + stdin.write_all(keep.join("\n").as_bytes())?; + stdin.write_all(b"\n")?; + } + let out = child.wait_with_output()?; + if !out.status.success() { + bail!( + "git pack-objects failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(out.stdout) +} + 
+/// Serve a clone/fetch with the withheld blobs removed from the response pack. +/// Framing: the body wraps `build_filtered_pack` output in the upload-pack +/// `packfile` section with sideband-64k band 1, terminated by flush. +// #[allow(dead_code)] removed when wired into the upload-pack handler in the next task. +#[allow(dead_code)] +pub async fn upload_pack_excluding( + repo_path: &Path, + _request_body: Bytes, + withheld: &HashSet, +) -> Result { + let pack = build_filtered_pack(repo_path, withheld)?; + let mut body = Vec::new(); + body.extend_from_slice(&pkt_line("packfile\n")); + // sideband-64k: band 1 carries pack data, chunked under the pkt-line limit. + for chunk in pack.chunks(65515) { + let mut framed = Vec::with_capacity(chunk.len() + 1); + framed.push(0x01); + framed.extend_from_slice(chunk); + let len = framed.len() + 4; + body.extend_from_slice(format!("{len:04x}").as_bytes()); + body.extend_from_slice(&framed); + } + body.extend_from_slice(b"0000"); + Ok(Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/x-git-upload-pack-result") + .header("Cache-Control", "no-cache") + .body(Body::from(body))?) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + /// List OIDs in a pack by writing it to a temp dir and running verify-pack. + pub(super) fn pack_object_ids(pack: &[u8]) -> std::collections::HashSet { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("test.pack"); + std::fs::write(&path, pack).unwrap(); + // index-pack creates the matching .idx next to the pack. 
+ let ok = Command::new("git") + .args(["index-pack", path.to_str().unwrap()]) + .status() + .unwrap() + .success(); + assert!(ok, "index-pack failed"); + let out = Command::new("git") + .args(["verify-pack", "-v", path.to_str().unwrap()]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout) + .lines() + .filter_map(|l| l.split_whitespace().next()) + .filter(|t| t.len() == 40 && t.chars().all(|c| c.is_ascii_hexdigit())) + .map(|s| s.to_string()) + .collect() + } + + #[tokio::test] + async fn filtered_serve_excludes_withheld_blob() { + // Build a bare repo, capture the secret + public blob OIDs. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret = oid("secret/b.txt"); + let public = oid("public/a.txt"); + g( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret.clone()); + + let pack = build_filtered_pack(&bare, &withheld).unwrap(); + let ids = pack_object_ids(&pack); + assert!(ids.contains(&public), "public blob must be in the pack"); + assert!( + 
!ids.contains(&secret), + "secret blob must NOT be in the pack" + ); + } +} From e292b79fc0014aa849332129559267ffc2f5ea26 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:05:24 -0500 Subject: [PATCH 04/58] feat(node): serve filtered pack when caller has withheld subtree blobs --- crates/gitlawb-node/src/api/repos.rs | 40 +++++++++++++------ crates/gitlawb-node/src/git/smart_http.rs | 4 -- .../gitlawb-node/src/git/visibility_pack.rs | 2 - 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 0993d4b..fa9810a 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -12,7 +12,7 @@ use uuid::Uuid; use crate::cert; use crate::error::{AppError, Result}; -use crate::git::{smart_http, store}; +use crate::git::{smart_http, store, visibility_pack}; use crate::state::AppState; use crate::visibility::{visibility_check, Decision}; use crate::webhooks; @@ -392,18 +392,32 @@ pub async fn git_upload_pack( .await .map_err(|e| AppError::Git(e.to_string()))?; let body_len = body.len(); - let resp = smart_http::upload_pack(&disk_path, body) - .await - .map_err(|e| { - let msg = e.to_string(); - if msg.contains("bad line length") || msg.contains("protocol error") { - tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); - AppError::BadRequest(msg) - } else { - tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); - AppError::Git(msg) - } - })?; + + let withheld = visibility_pack::withheld_blob_oids( + &disk_path, + &rules, + record.is_public, + &record.owner_did, + caller, + ) + .map_err(|e| AppError::Git(e.to_string()))?; + + let resp = if withheld.is_empty() { + smart_http::upload_pack(&disk_path, body).await + } else { + tracing::info!(repo = %name, caller = ?caller, withheld = withheld.len(), "serving filtered pack"); + 
smart_http::upload_pack_excluding(&disk_path, body, &withheld).await + } + .map_err(|e| { + let msg = e.to_string(); + if msg.contains("bad line length") || msg.contains("protocol error") { + tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); + AppError::BadRequest(msg) + } else { + tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); + AppError::Git(msg) + } + })?; crate::metrics::record_fetch(&format!("{owner}/{name}")); crate::metrics::observe_pack_size(body_len as f64); Ok(resp) diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index e39b747..a2ac294 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -125,8 +125,6 @@ fn pkt_line(data: &str) -> Vec { /// Build a packfile containing every object reachable from all refs EXCEPT the /// given blob OIDs. Commits and trees are always included, so SHAs stay intact; /// only the named blobs are dropped. -// #[allow(dead_code)] removed when wired into the upload-pack handler in the next task. -#[allow(dead_code)] pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Result> { // All reachable objects as "oid [path]" lines. let rev = std::process::Command::new("git") @@ -173,8 +171,6 @@ pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Resu /// Serve a clone/fetch with the withheld blobs removed from the response pack. /// Framing: the body wraps `build_filtered_pack` output in the upload-pack /// `packfile` section with sideband-64k band 1, terminated by flush. -// #[allow(dead_code)] removed when wired into the upload-pack handler in the next task. 
-#[allow(dead_code)] pub async fn upload_pack_excluding( repo_path: &Path, _request_body: Bytes, diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index bf3c45f..d386415 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -10,7 +10,6 @@ use anyhow::{Context, Result}; use std::collections::HashSet; use std::path::Path; -#[allow(dead_code)] /// List every (blob_oid, "/repo/relative/path") pair reachable from any branch /// ref in `repo_path`. Uses `git ls-tree -r` per ref so each path a blob lives /// at is represented (the same blob content can appear at several paths). Paths @@ -50,7 +49,6 @@ fn blob_paths(repo_path: &Path) -> Result> { Ok(out) } -#[allow(dead_code)] /// Blob OIDs the caller may not read. A blob is withheld only if visibility /// denies the caller at *every* path the blob appears at; a blob that is also /// reachable through an allowed path is sent (its content is public elsewhere). 
From 1474744c0ac026e9fbc88612cb2736d3b3a9406f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:07:52 -0500 Subject: [PATCH 05/58] test(node): end-to-end assert served pack omits withheld blob --- crates/gitlawb-node/src/git/smart_http.rs | 79 +++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index a2ac294..1a59886 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -281,4 +281,83 @@ mod tests { "secret blob must NOT be in the pack" ); } + + #[tokio::test] + async fn client_clone_lacks_withheld_blob_bytes() { + use axum::body::to_bytes; + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let secret_oid = { + let o = Command::new("git") + .args(["rev-parse", "HEAD:secret/b.txt"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + g( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret_oid.clone()); + + let resp = upload_pack_excluding(&bare, Bytes::new(), &withheld) + .await + .unwrap(); + let body = 
to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let ids = pack_object_ids(&extract_pack(&body)); + assert!( + !ids.contains(&secret_oid), + "withheld blob must be absent from served pack" + ); + } + + /// Strip the upload-pack `packfile` section framing, returning the raw pack. + /// Mirrors how a client de-frames the sideband-64k band-1 stream. + fn extract_pack(body: &[u8]) -> Vec { + let mut out = Vec::new(); + let mut i = 0; + while i + 4 <= body.len() { + let len = + usize::from_str_radix(std::str::from_utf8(&body[i..i + 4]).unwrap_or("0000"), 16) + .unwrap_or(0); + if len == 0 { + i += 4; + continue; + } + let chunk = &body[i + 4..i + len]; + // band 1 = pack data; skip "packfile\n" control line and other bands. + if chunk.first() == Some(&0x01) { + out.extend_from_slice(&chunk[1..]); + } + i += len; + } + out + } } From 694fddb5622f7811bbf0a80a81dea93f09eb4a51 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:08:14 -0500 Subject: [PATCH 06/58] docs(node): note why info/refs is not gated on subtree visibility --- crates/gitlawb-node/src/api/repos.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index fa9810a..ca5ed05 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -330,6 +330,8 @@ pub async fn git_info_refs( if service == "git-upload-pack" { let rules = state.db.list_visibility_rules(&record.id).await?; let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + // Subtree (mode B) rules do not gate the advertisement: refs expose commit + // tips only, and blob withholding happens in the upload-pack pack build. 
if visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") == Decision::Deny { From 9413e641e83203215ef280bd1621a6cecc93a6ac Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:27:29 -0500 Subject: [PATCH 07/58] fix(node): frame filtered serve as protocol v0 and prove it with a real clone upload_pack_excluding emitted a v2 packfile section, but info_refs advertises v0, so real clients negotiated v0 and rejected the response with 'expected ACK/NAK, got packfile'. Frame the v0 stateless-rpc shape instead (NAK, then the pack via side-band-64k when offered). Add an end-to-end test that stands up info_refs + upload_pack_excluding and runs a real git partial clone, asserting the withheld blob's bytes never reach the client while its tree entry and SHA stay visible. A stock full clone cannot consume the pack (it is not closed under reachability, so fetch fails the connectivity check); a partial clone is required. --- crates/gitlawb-node/src/git/smart_http.rs | 235 ++++++++++++++++++++-- 1 file changed, 214 insertions(+), 21 deletions(-) diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index 1a59886..c92cb79 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -169,26 +169,49 @@ pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Resu } /// Serve a clone/fetch with the withheld blobs removed from the response pack. -/// Framing: the body wraps `build_filtered_pack` output in the upload-pack -/// `packfile` section with sideband-64k band 1, terminated by flush. +/// +/// The framing is git protocol v0 (`NAK` then the pack), matching the v0 ref +/// advertisement that `info_refs` emits (it runs `git upload-pack +/// --advertise-refs` without `GIT_PROTOCOL=version=2`, so clients negotiate v0). +/// If `info_refs` ever advertises v2, this serve path must learn v2 framing too. 
+/// +/// Because the pack deliberately omits blobs that the sent trees still +/// reference, the pack is not closed under reachability. A stock full clone +/// rejects it at fetch time ("remote did not send all necessary objects"); only +/// a partial clone (the client passes `--filter`, marking a promisor remote) +/// accepts the pack with the private blobs absent. Tree and commit SHAs stay +/// intact either way. The clean partial-clone client UX is a separate follow-up +/// (git-remote-gitlawb); the security guarantee (private bytes never leave the +/// node) holds regardless of client. pub async fn upload_pack_excluding( repo_path: &Path, - _request_body: Bytes, + request_body: Bytes, withheld: &HashSet, ) -> Result { let pack = build_filtered_pack(repo_path, withheld)?; + + // The client lists its capabilities on the first `want` line. Honor + // side-band-64k when offered (every modern smart-HTTP client offers it); + // otherwise stream the raw pack after NAK. + let sideband = memmem(&request_body, b"side-band-64k"); + let mut body = Vec::new(); - body.extend_from_slice(&pkt_line("packfile\n")); - // sideband-64k: band 1 carries pack data, chunked under the pkt-line limit. - for chunk in pack.chunks(65515) { - let mut framed = Vec::with_capacity(chunk.len() + 1); - framed.push(0x01); - framed.extend_from_slice(chunk); - let len = framed.len() + 4; - body.extend_from_slice(format!("{len:04x}").as_bytes()); - body.extend_from_slice(&framed); + body.extend_from_slice(&pkt_line("NAK\n")); + if sideband { + // Band 1 carries pack data, chunked under the pkt-line size limit. 
+ for chunk in pack.chunks(65515) { + let mut framed = Vec::with_capacity(chunk.len() + 1); + framed.push(0x01); + framed.extend_from_slice(chunk); + let len = framed.len() + 4; + body.extend_from_slice(format!("{len:04x}").as_bytes()); + body.extend_from_slice(&framed); + } + body.extend_from_slice(b"0000"); + } else { + body.extend_from_slice(&pack); } - body.extend_from_slice(b"0000"); + Ok(Response::builder() .status(StatusCode::OK) .header("Content-Type", "application/x-git-upload-pack-result") @@ -196,6 +219,17 @@ pub async fn upload_pack_excluding( .body(Body::from(body))?) } +/// True if `needle` occurs anywhere in `haystack`. Small substring scan used to +/// detect a client capability token in the upload-pack request body. +fn memmem(haystack: &[u8], needle: &[u8]) -> bool { + if needle.is_empty() || haystack.len() < needle.len() { + return needle.is_empty(); + } + haystack + .windows(needle.len()) + .any(|window| window == needle) +} + #[cfg(test)] mod tests { use super::*; @@ -305,14 +339,16 @@ mod tests { g(&["config", "user.name", "t"], &work); g(&["add", "."], &work); g(&["commit", "-qm", "init"], &work); - let secret_oid = { + let oid = |p: &str| { let o = Command::new("git") - .args(["rev-parse", "HEAD:secret/b.txt"]) + .args(["rev-parse", &format!("HEAD:{p}")]) .current_dir(&work) .output() .unwrap(); String::from_utf8_lossy(&o.stdout).trim().to_string() }; + let secret_oid = oid("secret/b.txt"); + let public_oid = oid("public/a.txt"); g( &[ "clone", @@ -327,19 +363,27 @@ mod tests { let mut withheld = std::collections::HashSet::new(); withheld.insert(secret_oid.clone()); - let resp = upload_pack_excluding(&bare, Bytes::new(), &withheld) - .await - .unwrap(); + // A realistic v0 request advertises side-band-64k, so the serve frames + // the pack in band 1 (the path real clients exercise). 
+ let req = Bytes::from_static( + b"0098want 0000000000000000000000000000000000000000 \ + side-band-64k ofs-delta agent=git/2\n00000009done\n", + ); + let resp = upload_pack_excluding(&bare, req, &withheld).await.unwrap(); let body = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); let ids = pack_object_ids(&extract_pack(&body)); + assert!( + ids.contains(&public_oid), + "public blob must be present in served pack" + ); assert!( !ids.contains(&secret_oid), "withheld blob must be absent from served pack" ); } - /// Strip the upload-pack `packfile` section framing, returning the raw pack. - /// Mirrors how a client de-frames the sideband-64k band-1 stream. + /// Strip the v0 upload-pack framing (NAK line + sideband-64k bands), + /// returning the raw pack. Mirrors how a client de-frames the band-1 stream. fn extract_pack(body: &[u8]) -> Vec { let mut out = Vec::new(); let mut i = 0; @@ -352,7 +396,7 @@ mod tests { continue; } let chunk = &body[i + 4..i + len]; - // band 1 = pack data; skip "packfile\n" control line and other bands. + // band 1 = pack data; skip the NAK line and any other bands. if chunk.first() == Some(&0x01) { out.extend_from_slice(&chunk[1..]); } @@ -360,4 +404,153 @@ mod tests { } out } + + /// End-to-end: a real `git` client clones through `info_refs` + + /// `upload_pack_excluding` and ends up without the withheld blob's bytes + /// while still seeing its tree entry (SHA). Uses a partial clone + /// (`--filter`) because a pack that omits a referenced blob is only + /// accepted by a promisor-aware client; a stock full clone is refused at + /// fetch time by the connectivity check. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn real_git_partial_clone_omits_withheld_blob() { + use axum::extract::{Query, State}; + use axum::routing::{get, post}; + use axum::Router; + use std::collections::HashMap; + use std::sync::Arc; + + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret_oid = oid("secret/b.txt"); + let public_oid = oid("public/a.txt"); + g( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + #[derive(Clone)] + struct St { + repo: std::path::PathBuf, + withheld: HashSet, + } + let state = Arc::new(St { + repo: bare.clone(), + withheld: HashSet::from([secret_oid.clone()]), + }); + + async fn refs( + State(st): State>, + Query(q): Query>, + ) -> Response { + let service = q.get("service").cloned().unwrap_or_default(); + info_refs(&st.repo, &service).await.unwrap() + } + async fn pack(State(st): State>, body: Bytes) -> Response { + upload_pack_excluding(&st.repo, body, &st.withheld) + .await + .unwrap() + } + + let app = Router::new() + .route("/repo.git/info/refs", get(refs)) + 
.route("/repo.git/git-upload-pack", post(pack)) + .with_state(state); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + let server = tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + + let dest = td.path().join("clone"); + let url = format!("http://127.0.0.1:{port}/repo.git"); + let dest_s = dest.to_str().unwrap().to_string(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args([ + "-c", + "protocol.version=2", + "clone", + "--filter=blob:none", + "--no-checkout", + "-q", + &url, + &dest_s, + ]) + .output() + .unwrap() + }) + .await + .unwrap(); + + assert!( + out.status.success(), + "clone failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // Enumerate exactly the objects the clone physically received (no + // promisor lazy-fetch): the public blob is present, the withheld blob is + // not. This asserts on the bytes that actually crossed the wire. + let local = Command::new("git") + .args(["cat-file", "--batch-all-objects", "--batch-check"]) + .current_dir(&dest) + .output() + .unwrap(); + let local = String::from_utf8_lossy(&local.stdout); + assert!( + local.contains(&public_oid), + "public blob should be present in the clone" + ); + assert!( + !local.contains(&secret_oid), + "withheld blob bytes must be absent from the clone" + ); + + // The tree entry (and SHA) for the private file is still visible. 
+ let tree = Command::new("git") + .args(["ls-tree", "-r", "HEAD"]) + .current_dir(&dest) + .output() + .unwrap(); + let tree = String::from_utf8_lossy(&tree.stdout); + assert!( + tree.contains(&secret_oid) && tree.contains("secret/b.txt"), + "the private path and its blob SHA must remain visible: {tree}" + ); + + server.abort(); + } } From 72487af44d739db2a89c8c22124f1e2db16c1eb7 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:27:59 -0500 Subject: [PATCH 08/58] docs(node): correct Phase 3 caveat (full clone refused at fetch, not checkout) --- .../plans/2026-06-05-phase3-subtree-content-withholding.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md index 6aaa474..0ddda81 100644 --- a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md +++ b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md @@ -8,7 +8,9 @@ **Tech Stack:** Rust, axum, the system `git` CLI (shelled out, as the codebase already does in `git/store.rs` and `git/smart_http.rs`), `tempfile` for fixture repos in tests. -**Scope boundary:** This plan covers the node-side enforcement and the security guarantee (private blob bytes are never placed in the served pack), proven by inspecting the produced pack. It deliberately does NOT cover: the `git-remote-gitlawb` client-side change that lets a non-reader get a *clean* partial checkout (a stock `git clone` of a repo with a withheld blob will fail at checkout on the missing object; that UX work is a separate follow-up plan), filtered-pack caching, or incremental-fetch (`have`-line) hardening beyond what falls out naturally. Those are listed under "Out of scope / follow-ups" at the end. 
+**Scope boundary:** This plan covers the node-side enforcement and the security guarantee (private blob bytes are never placed in the served pack), proven by inspecting the produced pack and by a real `git` partial clone. It deliberately does NOT cover: the `git-remote-gitlawb` client-side change that lets a non-reader get a *clean* clone without passing `--filter` (see the corrected client-behavior note below), filtered-pack caching, or incremental-fetch (`have`-line) hardening beyond what falls out naturally. Those are listed under "Out of scope / follow-ups" at the end. + +**Corrected client behavior (verified during execution, supersedes an earlier assumption in this plan):** a served pack that omits a blob still referenced by a sent tree is not closed under reachability. A stock *full* `git clone` therefore rejects it at *fetch* time with "remote did not send all necessary objects" (the connectivity check), NOT at checkout. Only a *partial* clone (the client passes `--filter`, which marks a promisor remote and relaxes that check) accepts the pack with the private blob absent; tree and commit SHAs stay intact. The security guarantee (private bytes never leave the node) holds for every client. Making a normal `git clone` Just Work without `--filter` is the git-remote-gitlawb follow-up. --- @@ -703,7 +705,7 @@ Set a subtree rule on a local repo via `gl visibility`, clone as a non-reader th ## Out of scope / follow-ups (separate plans) -1. **`git-remote-gitlawb` partial-clone UX.** Make a non-reader's clone produce a clean partial checkout rather than a checkout error on the missing blob: the helper requests partial-clone semantics and treats withheld blobs as deliberately absent. Without this, a stock `git clone` of a repo with a withheld blob succeeds at fetch but errors at checkout. The security guarantee (bytes never sent) holds regardless; this is purely UX. +1. 
**`git-remote-gitlawb` partial-clone UX.** Make a non-reader's clone Just Work without the user passing `--filter`: the helper requests partial-clone semantics, advertises the `filter` capability cleanly (so there is no "filtering not recognized by server, ignoring" warning), and treats withheld blobs as deliberately absent. Without this, a stock full `git clone` of a repo with a withheld blob is refused at fetch time ("remote did not send all necessary objects"); only `git clone --filter=...` succeeds. The security guarantee (bytes never sent) holds regardless; this is purely UX. 2. **Filtered-pack caching.** `build_filtered_pack` recomputes per request. If hot, cache by (repo, tip-OIDs, withheld-set) and invalidate on push. 3. **Incremental fetch (`have` lines).** This plan targets the clone case. Confirm and, if needed, harden the filtered serve for fetches that send `have` lines so withheld blobs are never sent incrementally either. 4. **Replication-path enforcement (Phase 2).** Still blocked on the maintainer A/B decision; unrelated to this HTTP-path work. From b0af815bf7fe9a12cd5a40d78eea5ca9e0dbc49e Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:40:41 -0500 Subject: [PATCH 09/58] test(node): prove incremental fetch still withholds; document negotiation choice Add a real-git test that partial-clones, pushes a new commit server-side, then fetches: the new object arrives and the withheld blob stays absent. This pins down that ignoring have/want negotiation (always sending a self-contained pack of all refs minus withheld, with NAK) is correct for both clone and fetch; the only cost is a fetch re-sends the full object set. Refactor the real-git tests onto a shared server harness and document the negotiation decision in code and in the plan's follow-ups. 
--- crates/gitlawb-node/src/git/smart_http.rs | 262 +++++++++++++----- ...6-05-phase3-subtree-content-withholding.md | 2 +- 2 files changed, 195 insertions(+), 69 deletions(-) diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index c92cb79..0609f98 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -183,6 +183,14 @@ pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Resu /// intact either way. The clean partial-clone client UX is a separate follow-up /// (git-remote-gitlawb); the security guarantee (private bytes never leave the /// node) holds regardless of client. +/// +/// Negotiation is intentionally ignored: rather than honoring the client's +/// `want`/`have` lines, this always sends a self-contained pack of every object +/// across all refs minus the withheld blobs, and replies `NAK`. A fresh clone +/// and an incremental fetch are both correct (the client de-duplicates objects +/// it already has); the cost is that a fetch re-sends the full object set +/// instead of a thin delta. Honoring negotiation for smaller fetch packs is an +/// optimization follow-up, not a correctness requirement. pub async fn upload_pack_excluding( repo_path: &Path, request_body: Bytes, @@ -405,40 +413,80 @@ mod tests { out } - /// End-to-end: a real `git` client clones through `info_refs` + - /// `upload_pack_excluding` and ends up without the withheld blob's bytes - /// while still seeing its tree entry (SHA). Uses a partial clone - /// (`--filter`) because a pack that omits a referenced blob is only - /// accepted by a promisor-aware client; a stock full clone is refused at - /// fetch time by the connectivity check. 
- #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn real_git_partial_clone_omits_withheld_blob() { - use axum::extract::{Query, State}; + // Shared harness for the real-git server tests: a minimal smart-HTTP server + // backed by the real info_refs + upload_pack_excluding. + + #[derive(Clone)] + struct FilterState { + repo: std::path::PathBuf, + withheld: HashSet, + } + + async fn refs_handler( + axum::extract::State(st): axum::extract::State>, + axum::extract::Query(q): axum::extract::Query>, + ) -> Response { + let service = q.get("service").cloned().unwrap_or_default(); + info_refs(&st.repo, &service).await.unwrap() + } + + async fn pack_handler( + axum::extract::State(st): axum::extract::State>, + body: Bytes, + ) -> Response { + upload_pack_excluding(&st.repo, body, &st.withheld) + .await + .unwrap() + } + + /// Spawn the server for `bare`, withholding `withheld`. Returns the clone URL + /// and the server task (abort it when done). + async fn spawn_filter_server( + bare: std::path::PathBuf, + withheld: HashSet, + ) -> (String, tokio::task::JoinHandle<()>) { use axum::routing::{get, post}; - use axum::Router; - use std::collections::HashMap; - use std::sync::Arc; + let state = std::sync::Arc::new(FilterState { + repo: bare, + withheld, + }); + let app = axum::Router::new() + .route("/repo.git/info/refs", get(refs_handler)) + .route("/repo.git/git-upload-pack", post(pack_handler)) + .with_state(state); + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + let handle = tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + (format!("http://127.0.0.1:{port}/repo.git"), handle) + } - let td = TempDir::new().unwrap(); + fn run_git(args: &[&str], dir: &std::path::Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + /// Build a work repo (public/a.txt, secret/b.txt) and a bare 
clone of it. + /// Returns (work, bare, secret_blob_oid, public_blob_oid). + fn fixture_with_secret( + td: &TempDir, + ) -> (std::path::PathBuf, std::path::PathBuf, String, String) { let work = td.path().join("work"); let bare = td.path().join("bare.git"); - let g = |args: &[&str], dir: &std::path::Path| { - assert!(Command::new("git") - .args(args) - .current_dir(dir) - .status() - .unwrap() - .success()); - }; std::fs::create_dir_all(work.join("secret")).unwrap(); std::fs::create_dir_all(work.join("public")).unwrap(); std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); - g(&["init", "-q"], &work); - g(&["config", "user.email", "t@t"], &work); - g(&["config", "user.name", "t"], &work); - g(&["add", "."], &work); - g(&["commit", "-qm", "init"], &work); + run_git(&["init", "-q"], &work); + run_git(&["config", "user.email", "t@t"], &work); + run_git(&["config", "user.name", "t"], &work); + run_git(&["add", "."], &work); + run_git(&["commit", "-qm", "init"], &work); let oid = |p: &str| { let o = Command::new("git") .args(["rev-parse", &format!("HEAD:{p}")]) @@ -449,7 +497,7 @@ mod tests { }; let secret_oid = oid("secret/b.txt"); let public_oid = oid("public/a.txt"); - g( + run_git( &[ "clone", "-q", @@ -459,43 +507,34 @@ mod tests { ], td.path(), ); + (work, bare, secret_oid, public_oid) + } - #[derive(Clone)] - struct St { - repo: std::path::PathBuf, - withheld: HashSet, - } - let state = Arc::new(St { - repo: bare.clone(), - withheld: HashSet::from([secret_oid.clone()]), - }); + /// Enumerate exactly the objects a repo physically has (no promisor lazy + /// fetch), so tests assert on what bytes actually crossed the wire. 
+ fn local_object_ids(repo: &std::path::Path) -> String { + let out = Command::new("git") + .args(["cat-file", "--batch-all-objects", "--batch-check"]) + .current_dir(repo) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).into_owned() + } - async fn refs( - State(st): State>, - Query(q): Query>, - ) -> Response { - let service = q.get("service").cloned().unwrap_or_default(); - info_refs(&st.repo, &service).await.unwrap() - } - async fn pack(State(st): State>, body: Bytes) -> Response { - upload_pack_excluding(&st.repo, body, &st.withheld) - .await - .unwrap() - } + /// End-to-end: a real `git` client clones through `info_refs` + + /// `upload_pack_excluding` and ends up without the withheld blob's bytes + /// while still seeing its tree entry (SHA). Uses a partial clone + /// (`--filter`) because a pack that omits a referenced blob is only + /// accepted by a promisor-aware client; a stock full clone is refused at + /// fetch time by the connectivity check. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn real_git_partial_clone_omits_withheld_blob() { + let td = TempDir::new().unwrap(); + let (_work, bare, secret_oid, public_oid) = fixture_with_secret(&td); - let app = Router::new() - .route("/repo.git/info/refs", get(refs)) - .route("/repo.git/git-upload-pack", post(pack)) - .with_state(state); - - let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); - let port = listener.local_addr().unwrap().port(); - let server = tokio::spawn(async move { - axum::serve(listener, app).await.unwrap(); - }); + let (url, server) = spawn_filter_server(bare, HashSet::from([secret_oid.clone()])).await; let dest = td.path().join("clone"); - let url = format!("http://127.0.0.1:{port}/repo.git"); let dest_s = dest.to_str().unwrap().to_string(); let out = tokio::task::spawn_blocking(move || { Command::new("git") @@ -521,15 +560,8 @@ mod tests { String::from_utf8_lossy(&out.stderr) ); - // Enumerate exactly the objects the clone 
physically received (no - // promisor lazy-fetch): the public blob is present, the withheld blob is - // not. This asserts on the bytes that actually crossed the wire. - let local = Command::new("git") - .args(["cat-file", "--batch-all-objects", "--batch-check"]) - .current_dir(&dest) - .output() - .unwrap(); - let local = String::from_utf8_lossy(&local.stdout); + // The public blob is present in the clone, the withheld blob is not. + let local = local_object_ids(&dest); assert!( local.contains(&public_oid), "public blob should be present in the clone" @@ -553,4 +585,98 @@ mod tests { server.abort(); } + + /// End-to-end: an incremental `git fetch` after a partial clone still works + /// and still withholds the private blob. The serve path ignores the client's + /// have/want negotiation and always sends a self-contained pack of all refs + /// minus the withheld blobs (it replies NAK, so the client treats it as "no + /// common commits" and accepts the full set). This is correct, just not + /// bandwidth-optimal; thin-pack/negotiation is an optimization follow-up. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn real_git_fetch_after_partial_clone_still_withholds() { + let td = TempDir::new().unwrap(); + let (work, bare, secret_oid, _public_oid) = fixture_with_secret(&td); + let branch = { + let o = Command::new("git") + .args(["symbolic-ref", "--short", "HEAD"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + + let (url, server) = + spawn_filter_server(bare.clone(), HashSet::from([secret_oid.clone()])).await; + + // Partial-clone the initial state. 
+ let dest = td.path().join("clone"); + let dest_s = dest.to_str().unwrap().to_string(); + let url_c = url.clone(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args([ + "-c", + "protocol.version=2", + "clone", + "--filter=blob:none", + "--no-checkout", + "-q", + &url_c, + &dest_s, + ]) + .output() + .unwrap() + }) + .await + .unwrap(); + assert!( + out.status.success(), + "clone failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // Add a new public commit on the server side. + std::fs::write(work.join("public/c.txt"), b"v2\n").unwrap(); + run_git(&["add", "."], &work); + run_git(&["commit", "-qm", "c2"], &work); + let new_oid = { + let o = Command::new("git") + .args(["rev-parse", "HEAD:public/c.txt"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + run_git(&["push", "-q", bare.to_str().unwrap(), &branch], &work); + + // Incremental fetch: the client has c1 and asks for the update. + let dest_f = dest.clone(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args(["-c", "protocol.version=2", "fetch", "-q", "origin"]) + .current_dir(&dest_f) + .output() + .unwrap() + }) + .await + .unwrap(); + assert!( + out.status.success(), + "fetch failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // The new commit's blob arrived; the withheld blob is still absent. 
+ let local = local_object_ids(&dest); + assert!( + local.contains(&new_oid), + "the new commit's blob must be fetched" + ); + assert!( + !local.contains(&secret_oid), + "withheld blob must remain absent after fetch" + ); + + server.abort(); + } } diff --git a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md index 0ddda81..d79c0df 100644 --- a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md +++ b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md @@ -707,6 +707,6 @@ Set a subtree rule on a local repo via `gl visibility`, clone as a non-reader th 1. **`git-remote-gitlawb` partial-clone UX.** Make a non-reader's clone Just Work without the user passing `--filter`: the helper requests partial-clone semantics, advertises the `filter` capability cleanly (so there is no "filtering not recognized by server, ignoring" warning), and treats withheld blobs as deliberately absent. Without this, a stock full `git clone` of a repo with a withheld blob is refused at fetch time ("remote did not send all necessary objects"); only `git clone --filter=...` succeeds. The security guarantee (bytes never sent) holds regardless; this is purely UX. 2. **Filtered-pack caching.** `build_filtered_pack` recomputes per request. If hot, cache by (repo, tip-OIDs, withheld-set) and invalidate on push. -3. **Incremental fetch (`have` lines).** This plan targets the clone case. Confirm and, if needed, harden the filtered serve for fetches that send `have` lines so withheld blobs are never sent incrementally either. +3. **Incremental fetch efficiency.** Verified during execution: an incremental `git fetch` after a partial clone is already correct and still withholds the private blob (covered by `real_git_fetch_after_partial_clone_still_withholds`). 
The serve ignores the client's `have`/`want` negotiation and always sends a self-contained pack of all refs minus the withheld blobs, replying `NAK`; the client de-duplicates, so nothing breaks. The only cost is that a fetch re-sends the full object set instead of a thin delta. Honoring negotiation to produce smaller fetch packs is the optimization left here. 4. **Replication-path enforcement (Phase 2).** Still blocked on the maintainer A/B decision; unrelated to this HTTP-path work. ``` From 85a97118c55b28c538700d63eda42e818e04786f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 18:36:32 -0500 Subject: [PATCH 10/58] refactor(node): address CodeRabbit review on PR #28 Move the two blocking git shell-outs in the filtered upload-pack path off the async worker thread, matching the tokio::process / spawn_blocking usage already in this file: build_filtered_pack (rev-list + pack-objects) and withheld_blob_oids (per-ref ls-tree) now run inside spawn_blocking so a large repo cannot stall the tokio runtime. Behavior is unchanged. Also fix the Task 0 findings block in the Phase 3 plan: it still recorded v2 packfile framing, which is the exact path that failed against a real client and was corrected to v0. The block now documents the shipped v0 contract. Drop a stray trailing code fence flagged by markdownlint (MD040). The speculative ls-tree timeout and the public/no-rules fast-path from the review are intentionally left out: the timeout guards against adversarial repos we do not yet host, and the fast-path is a micro-optimization not worth the extra branch right now. 
--- crates/gitlawb-node/src/api/repos.rs | 29 ++++++++++++++----- crates/gitlawb-node/src/git/smart_http.rs | 13 +++++++-- ...6-05-phase3-subtree-content-withholding.md | 13 ++++----- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index ca5ed05..4522e8d 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -395,14 +395,27 @@ pub async fn git_upload_pack( .map_err(|e| AppError::Git(e.to_string()))?; let body_len = body.len(); - let withheld = visibility_pack::withheld_blob_oids( - &disk_path, - &rules, - record.is_public, - &record.owner_did, - caller, - ) - .map_err(|e| AppError::Git(e.to_string()))?; + // withheld_blob_oids walks every ref with blocking `git ls-tree`; keep that + // off the async worker thread. + let withheld = { + let path = disk_path.clone(); + let rules = rules.clone(); + let owner_did = record.owner_did.clone(); + let caller_owned = caller.map(str::to_string); + let is_public = record.is_public; + tokio::task::spawn_blocking(move || { + visibility_pack::withheld_blob_oids( + &path, + &rules, + is_public, + &owner_did, + caller_owned.as_deref(), + ) + }) + .await + .map_err(|e| AppError::Git(e.to_string()))? + .map_err(|e| AppError::Git(e.to_string()))? 
+ }; let resp = if withheld.is_empty() { smart_http::upload_pack(&disk_path, body).await diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index 0609f98..80374fb 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use axum::body::Body; use axum::http::StatusCode; use axum::response::Response; @@ -196,7 +196,16 @@ pub async fn upload_pack_excluding( request_body: Bytes, withheld: &HashSet, ) -> Result { - let pack = build_filtered_pack(repo_path, withheld)?; + // build_filtered_pack shells out to git (rev-list, pack-objects) with + // blocking std::process I/O; run it off the async worker so a large repo's + // pack build does not stall the tokio runtime. + let pack = { + let repo_path = repo_path.to_path_buf(); + let withheld = withheld.clone(); + tokio::task::spawn_blocking(move || build_filtered_pack(&repo_path, &withheld)) + .await + .context("filtered-pack build task panicked")?? + }; // The client lists its capabilities on the first `want` line. Honor // side-band-64k when offered (every modern smart-HTTP client offers it); diff --git a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md index d79c0df..453ca94 100644 --- a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md +++ b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md @@ -99,13 +99,11 @@ Executed 2026-06-06. Results: - `git rev-list --objects --all` (in repo dir) to enumerate reachable objects as `oid [path]` lines. - Filter out withheld OIDs (first whitespace column), feed remaining OIDs newline-delimited to `git pack-objects --stdout`. - Verified exclusion by `git index-pack ` then `git verify-pack -v `: secret blob absent, public blob present. Confirmed. 
-- **Protocol version targeted:** v2 packfile section. The serve hand-frames the body, so no `GIT_PROTOCOL`/`-c protocol.version` flag is passed to our own process; we emit the v2 `packfile` section bytes directly. -- **Response framing (captured by driving `git upload-pack --stateless-rpc` with `GIT_PROTOCOL=version=2`):** - - `pkt_line("packfile\n")` (plain control pkt-line, not a sideband band). - - Then sideband-64k bands: `0x02` = progress (optional, we omit), `0x01` = pack data whose payload begins `PACK...`. - - Pack data chunked under the pkt-line limit, each chunk prefixed with `0x01`. - - Terminated by `0000` flush. - - This matches the plan's Option B framing in Task 2 exactly; no adjustment needed. +- **Protocol version targeted:** v0. `info_refs` runs `git upload-pack --advertise-refs` with no `GIT_PROTOCOL=version=2`, so it advertises v0 and clients negotiate v0; the serve path must hand-frame a v0 response. (An earlier draft of this block recorded v2 framing; that path was implemented, failed against a real client with "expected ACK/NAK, got 'packfile'", and was corrected to v0. The record below reflects the shipped v0 contract.) +- **Response framing (v0):** + - `pkt_line("NAK\n")` first (no `packfile\n` control line; that is v2 only). + - If the client offered `side-band-64k`: band 1 (`0x01`) carries pack data whose payload begins `PACK...`, chunked under the pkt-line size limit (65515), each chunk prefixed with `0x01`; terminated by a `0000` flush. + - If no side-band was offered: the raw pack bytes follow `NAK\n` directly, with no flush. - **Confirmed:** served pack contains PUBLIC_OID, excludes SECRET_OID. --- @@ -709,4 +707,3 @@ Set a subtree rule on a local repo via `gl visibility`, clone as a non-reader th 2. **Filtered-pack caching.** `build_filtered_pack` recomputes per request. If hot, cache by (repo, tip-OIDs, withheld-set) and invalidate on push. 3. 
**Incremental fetch efficiency.** Verified during execution: an incremental `git fetch` after a partial clone is already correct and still withholds the private blob (covered by `real_git_fetch_after_partial_clone_still_withholds`). The serve ignores the client's `have`/`want` negotiation and always sends a self-contained pack of all refs minus the withheld blobs, replying `NAK`; the client de-duplicates, so nothing breaks. The only cost is that a fetch re-sends the full object set instead of a thin delta. Honoring negotiation to produce smaller fetch packs is the optimization left here. 4. **Replication-path enforcement (Phase 2).** Still blocked on the maintainer A/B decision; unrelated to this HTTP-path work. -``` From c27e8dc992e7dcbb9045b68f8d942c228b18c9cd Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 22:06:32 -0500 Subject: [PATCH 11/58] chore(node): drop planning doc from PR per maintainer request kevincodex1 asked to keep the superpowers planning docs out of the repo. The Phase 3 plan was scaffolding for this change, not something the project needs to carry. Removing it leaves only the code and tests in the PR. --- ...6-05-phase3-subtree-content-withholding.md | 709 ------------------ 1 file changed, 709 deletions(-) delete mode 100644 docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md diff --git a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md b/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md deleted file mode 100644 index 453ca94..0000000 --- a/docs/superpowers/plans/2026-06-05-phase3-subtree-content-withholding.md +++ /dev/null @@ -1,709 +0,0 @@ -# Phase 3: Subtree Content Withholding (mode B) Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. 
Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Make a mode-`b` subtree visibility rule actually withhold that subtree's file content on clone/fetch over the node's HTTP git read path, while keeping every commit and tree SHA intact, so a non-reader sees the directory structure and blob SHAs but never the private bytes. - -**Architecture:** The authorization decision already exists as the pure `visibility_check` (one decision per path). Phase 3 adds two node-side pieces: (1) a blob-OID resolver that, given a repo's refs plus the caller's rules, returns the set of blob object IDs the caller may not read (a blob is withheld only if it appears at no allowed path); and (2) a filtered `upload-pack` serve path that builds the response pack excluding those OIDs. The two existing read handlers (`git_info_refs`, `git_upload_pack`) keep their current whole-repo 404 gate unchanged and gain a filtered serve branch when, and only when, the caller has at least one withheld blob. Trees and commits are always sent in full, so SHAs stay intact; only blob content is omitted. - -**Tech Stack:** Rust, axum, the system `git` CLI (shelled out, as the codebase already does in `git/store.rs` and `git/smart_http.rs`), `tempfile` for fixture repos in tests. - -**Scope boundary:** This plan covers the node-side enforcement and the security guarantee (private blob bytes are never placed in the served pack), proven by inspecting the produced pack and by a real `git` partial clone. It deliberately does NOT cover: the `git-remote-gitlawb` client-side change that lets a non-reader get a *clean* clone without passing `--filter` (see the corrected client-behavior note below), filtered-pack caching, or incremental-fetch (`have`-line) hardening beyond what falls out naturally. Those are listed under "Out of scope / follow-ups" at the end. 
- -**Corrected client behavior (verified during execution, supersedes an earlier assumption in this plan):** a served pack that omits a blob still referenced by a sent tree is not closed under reachability. A stock *full* `git clone` therefore rejects it at *fetch* time with "remote did not send all necessary objects" (the connectivity check), NOT at checkout. Only a *partial* clone (the client passes `--filter`, which marks a promisor remote and relaxes that check) accepts the pack with the private blob absent; tree and commit SHAs stay intact. The security guarantee (private bytes never leave the node) holds for every client. Making a normal `git clone` Just Work without `--filter` is the git-remote-gitlawb follow-up. - ---- - -## File Structure - -- **Create:** `crates/gitlawb-node/src/git/visibility_pack.rs`: the blob-OID resolver (`withheld_blob_oids`) and its tests. One responsibility: decide which blob OIDs to withhold for a caller. -- **Modify:** `crates/gitlawb-node/src/git/mod.rs`: add `pub mod visibility_pack;`. -- **Modify:** `crates/gitlawb-node/src/git/smart_http.rs`: add `upload_pack_excluding` (filtered serve) alongside the existing `upload_pack`, plus a small `pack_object_ids` test helper. -- **Modify:** `crates/gitlawb-node/src/api/repos.rs`: in `git_upload_pack` (around line 368-407) branch to the filtered serve when the caller has withheld blobs; `git_info_refs` (around line 308-365) needs no functional change but gets a confirming test. -- **Modify (test oracle only):** `crates/gitlawb-node/src/visibility.rs`: no logic change; `visibility_check` is reused as-is by the resolver. - ---- - -## Task 0: Spike: pin the filtered-serve mechanism - -This is the one genuinely uncertain piece: how to make `git upload-pack` (or `git pack-objects`) produce a clone/fetch response that omits a specific set of blob OIDs while still sending the trees that reference them, and how to frame that as a valid `application/x-git-upload-pack-result` body. 
Everything downstream depends on a single function signature, not on the mechanism, so this task nails the mechanism by experiment and records the result. No production code is committed in this task. - -**Files:** -- Scratch only (a throwaway shell script and a temp repo). Findings are written back into this plan's "Task 0 Findings" block below. - -- [ ] **Step 1: Build a fixture repo with a public and a private file** - -Run: -```bash -cd "$(mktemp -d)" && export FIX=$PWD -git init -q work && cd work -git config user.email t@t && git config user.name t -mkdir -p public secret -echo "public bytes" > public/a.txt -echo "TOP SECRET" > secret/b.txt -git add . && git commit -qm init -SECRET_OID=$(git rev-parse HEAD:secret/b.txt) -PUBLIC_OID=$(git rev-parse HEAD:public/a.txt) -echo "secret blob=$SECRET_OID public blob=$PUBLIC_OID" -cd .. && git clone -q --bare work bare.git -``` - -- [ ] **Step 2: Produce a pack that excludes the secret blob OID** - -Run (mechanism candidate: explicit object list to `pack-objects`): -```bash -cd "$FIX/bare.git" -# Every object reachable from all refs, as "oid [path]" lines: -git rev-list --objects --all > /tmp/all_objs.txt -# Drop the secret blob's line, keep only the OID column: -grep -v "^$SECRET_OID" /tmp/all_objs.txt | awk '{print $1}' > /tmp/keep_oids.txt -# Build a pack of exactly those objects: -git pack-objects --stdout < /tmp/keep_oids.txt > /tmp/filtered.pack -# Confirm the secret blob is absent and the public blob present: -git verify-pack -v /tmp/filtered.pack | grep -E "$SECRET_OID|$PUBLIC_OID" || echo "secret absent (expected: only public line prints)" -``` -Expected: the public OID prints, the secret OID does not. This proves the OID-exclusion mechanism. 
- -- [ ] **Step 3: Determine the upload-pack response framing** - -Run, capturing the exact bytes a real clone request/response uses, so the framing in Task 3 is correct rather than guessed: -```bash -cd "$FIX/bare.git" -git config uploadpack.allowFilter true -# Capture a normal v2 clone's request body and response shape: -GIT_TRACE_PACKET=1 git -c protocol.version=2 clone -q --bare "$FIX/bare.git" "$FIX/clone1.git" 2>/tmp/trace.txt -# Inspect the fetch command + response sections (look for "packfile", sideband 0001/0002, flush 0000): -grep -E "fetch|want|packfile|0000|ACK|NAK|ready" /tmp/trace.txt | head -40 -``` -Record from the trace: (a) whether the node should target protocol v2 or v0, (b) the exact section markers around the packfile, (c) whether sideband-64k framing is in use. - -- [ ] **Step 4: Decide the serve implementation and write findings** - -Choose the implementation for `upload_pack_excluding` based on Steps 1-3, preferring the lowest-risk option that the trace confirms works: - -- **Option A (preferred): delegate to `git upload-pack` with an injected mandatory filter.** Set `uploadpack.allowFilter=true`, rewrite the client's fetch request to carry `filter sparse:oid=` (v2) where the spec blob excludes the denied paths, and let `git upload-pack` build and frame the entire response. Lowest framing risk; depends on `sparse:oid` negation behaving (verify in Step 2 variant). -- **Option B (fallback): hand-build the pack.** Parse `want` OIDs from the request body, run `git rev-list --objects ` minus the withheld OIDs, pipe to `git pack-objects --stdout`, and frame the result per the markers captured in Step 3. - -Write the chosen option, the exact `git` invocation(s), and the framing bytes into the "Task 0 Findings" block below. The downstream tasks reference `upload_pack_excluding(repo_path, request_body, withheld_oids) -> Result` regardless of which option is recorded here. 
- -- [ ] **Step 5: No commit** - -This task records findings only; there is nothing to commit. - -### Task 0 Findings - -Executed 2026-06-06. Results: - -- **Mechanism chosen:** Option B (hand-built pack). `sparse:oid` negation was not needed; explicit OID exclusion via `rev-list` + `pack-objects` is deterministic and self-contained. -- **Exact git invocation(s):** - - `git rev-list --objects --all` (in repo dir) to enumerate reachable objects as `oid [path]` lines. - - Filter out withheld OIDs (first whitespace column), feed remaining OIDs newline-delimited to `git pack-objects --stdout`. - - Verified exclusion by `git index-pack ` then `git verify-pack -v `: secret blob absent, public blob present. Confirmed. -- **Protocol version targeted:** v0. `info_refs` runs `git upload-pack --advertise-refs` with no `GIT_PROTOCOL=version=2`, so it advertises v0 and clients negotiate v0; the serve path must hand-frame a v0 response. (An earlier draft of this block recorded v2 framing; that path was implemented, failed against a real client with "expected ACK/NAK, got 'packfile'", and was corrected to v0. The record below reflects the shipped v0 contract.) -- **Response framing (v0):** - - `pkt_line("NAK\n")` first (no `packfile\n` control line; that is v2 only). - - If the client offered `side-band-64k`: band 1 (`0x01`) carries pack data whose payload begins `PACK...`, chunked under the pkt-line size limit (65515), each chunk prefixed with `0x01`; terminated by a `0000` flush. - - If no side-band was offered: the raw pack bytes follow `NAK\n` directly, with no flush. -- **Confirmed:** served pack contains PUBLIC_OID, excludes SECRET_OID. 
- ---- - -## Task 1: Blob-OID resolver: withhold a private subtree's blobs for a non-reader - -**Files:** -- Create: `crates/gitlawb-node/src/git/visibility_pack.rs` -- Modify: `crates/gitlawb-node/src/git/mod.rs` (add module) - -- [ ] **Step 1: Register the module** - -In `crates/gitlawb-node/src/git/mod.rs`, add the line in alphabetical position (after `pub mod store;`): -```rust -pub mod visibility_pack; -``` - -- [ ] **Step 2: Write the failing test (non-reader withholds only the private blob)** - -Create `crates/gitlawb-node/src/git/visibility_pack.rs` with the test module first: -```rust -//! Resolve which blob OIDs must be withheld from a caller because every path -//! at which the blob appears is denied by the repo's visibility rules. Trees -//! and commits are never withheld (mode B keeps SHAs intact); only blob -//! content is held back. - -use crate::db::{VisibilityMode, VisibilityRule}; -use crate::git::store; -use crate::visibility::{visibility_check, Decision}; -use anyhow::{Context, Result}; -use std::collections::HashSet; -use std::path::Path; - -#[cfg(test)] -mod tests { - use super::*; - use chrono::Utc; - use std::process::Command; - use tempfile::TempDir; - - fn rule(path_glob: &str, readers: &[&str]) -> VisibilityRule { - VisibilityRule { - id: "x".into(), - repo_id: "r1".into(), - path_glob: path_glob.into(), - mode: VisibilityMode::B, - reader_dids: readers.iter().map(|s| s.to_string()).collect(), - created_by: "did:key:zOwner".into(), - created_at: Utc::now(), - } - } - - const OWNER: &str = "did:key:zOwner"; - - /// Build a bare repo with public/a.txt and secret/b.txt at one commit. - /// Returns (tempdir, bare_path, secret_blob_oid, public_blob_oid). 
- fn fixture() -> (TempDir, std::path::PathBuf, String, String) { - let td = TempDir::new().unwrap(); - let work = td.path().join("work"); - let bare = td.path().join("bare.git"); - let run = |args: &[&str], dir: &Path| { - let ok = Command::new("git") - .args(args) - .current_dir(dir) - .status() - .unwrap() - .success(); - assert!(ok, "git {args:?} failed"); - }; - std::fs::create_dir_all(work.join("public")).unwrap(); - std::fs::create_dir_all(work.join("secret")).unwrap(); - std::fs::write(work.join("public/a.txt"), b"public bytes\n").unwrap(); - std::fs::write(work.join("secret/b.txt"), b"TOP SECRET\n").unwrap(); - run(&["init", "-q"], &work); - run(&["config", "user.email", "t@t"], &work); - run(&["config", "user.name", "t"], &work); - run(&["add", "."], &work); - run(&["commit", "-qm", "init"], &work); - let oid = |path: &str| { - let out = Command::new("git") - .args(["rev-parse", &format!("HEAD:{path}")]) - .current_dir(&work) - .output() - .unwrap(); - String::from_utf8_lossy(&out.stdout).trim().to_string() - }; - let secret = oid("secret/b.txt"); - let public = oid("public/a.txt"); - run( - &["clone", "-q", "--bare", work.to_str().unwrap(), bare.to_str().unwrap()], - td.path(), - ); - (td, bare, secret, public) - } - - #[test] - fn non_reader_withholds_only_the_private_blob() { - let (_td, bare, secret, public) = fixture(); - let rules = [rule("/secret/**", &["did:key:zFriend"])]; - let withheld = - withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zStranger")).unwrap(); - assert!(withheld.contains(&secret), "secret blob must be withheld"); - assert!(!withheld.contains(&public), "public blob must NOT be withheld"); - } - - #[test] - fn owner_withholds_nothing() { - let (_td, bare, secret, public) = fixture(); - let rules = [rule("/secret/**", &["did:key:zFriend"])]; - let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, Some(OWNER)).unwrap(); - assert!(withheld.is_empty(), "owner sees everything"); - let _ = (secret, public); - } - - 
#[test] - fn listed_reader_withholds_nothing() { - let (_td, bare, _secret, _public) = fixture(); - let rules = [rule("/secret/**", &["did:key:zFriend"])]; - let withheld = - withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zFriend")).unwrap(); - assert!(withheld.is_empty(), "listed reader sees the subtree"); - } - - #[test] - fn no_subtree_rules_withholds_nothing() { - let (_td, bare, _secret, _public) = fixture(); - let withheld = withheld_blob_oids(&bare, &[], true, OWNER, None).unwrap(); - assert!(withheld.is_empty(), "public repo, no rules, nothing withheld"); - } -} -``` - -- [ ] **Step 3: Run the test to verify it fails** - -Run: `cargo test -p gitlawb-node visibility_pack:: -- --nocapture` -Expected: FAIL to compile with "cannot find function `withheld_blob_oids`". - -- [ ] **Step 4: Implement `withheld_blob_oids`** - -Add above the `#[cfg(test)]` block in `visibility_pack.rs`: -```rust -/// List every (blob_oid, "/repo/relative/path") pair reachable from any branch -/// ref in `repo_path`. Uses `git ls-tree -r` per ref so each path a blob lives -/// at is represented (the same blob content can appear at several paths). Paths -/// are returned with a leading "/" to match the glob form used by visibility -/// rules ("/secret/**"). 
-fn blob_paths(repo_path: &Path) -> Result> { - let refs = store::list_refs(repo_path).context("list_refs failed")?; - let mut out = Vec::new(); - for (refname, _oid) in refs { - if !refname.starts_with("refs/heads/") && !refname.starts_with("refs/tags/") { - continue; - } - let listing = std::process::Command::new("git") - .args(["ls-tree", "-r", &refname]) - .current_dir(repo_path) - .output() - .context("git ls-tree -r failed")?; - if !listing.status.success() { - continue; - } - for line in String::from_utf8_lossy(&listing.stdout).lines() { - // " blob \t" - let Some((meta, path)) = line.split_once('\t') else { - continue; - }; - let mut parts = meta.split_whitespace(); - let _mode = parts.next(); - let kind = parts.next(); - let oid = parts.next(); - if kind == Some("blob") { - if let Some(oid) = oid { - out.push((oid.to_string(), format!("/{path}"))); - } - } - } - } - Ok(out) -} - -/// Blob OIDs the caller may not read. A blob is withheld only if visibility -/// denies the caller at *every* path the blob appears at; a blob that is also -/// reachable through an allowed path is sent (its content is public elsewhere). -/// -/// The whole-repo "/" gate is handled by the caller before this function runs: -/// if "/" denies, the caller gets a 404 and never reaches the filtered serve. -pub fn withheld_blob_oids( - repo_path: &Path, - rules: &[VisibilityRule], - is_public: bool, - owner_did: &str, - caller: Option<&str>, -) -> Result> { - let mut denied: HashSet = HashSet::new(); - let mut allowed: HashSet = HashSet::new(); - for (oid, path) in blob_paths(repo_path)? { - match visibility_check(rules, is_public, owner_did, caller, &path) { - Decision::Deny => { - denied.insert(oid); - } - Decision::Allow => { - allowed.insert(oid); - } - } - } - Ok(denied.difference(&allowed).cloned().collect()) -} -``` - -- [ ] **Step 5: Run the tests to verify they pass** - -Run: `cargo test -p gitlawb-node visibility_pack::` -Expected: PASS (4 tests). 
- -- [ ] **Step 6: Commit** - -```bash -git add crates/gitlawb-node/src/git/visibility_pack.rs crates/gitlawb-node/src/git/mod.rs -git commit -m "feat(node): resolve withheld blob OIDs for path-scoped visibility" -``` - ---- - -## Task 2: Filtered upload-pack serve (`upload_pack_excluding`) - -**Files:** -- Modify: `crates/gitlawb-node/src/git/smart_http.rs` - -Implement using the mechanism recorded in **Task 0 Findings**. The code below is written for **Option B (hand-built pack)** because it is self-contained and deterministic; if Task 0 recorded Option A, implement that instead behind the identical signature and adjust the test in Step 2 only where it inspects framing (the object-content assertion stays). - -- [ ] **Step 1: Add the test module with a pack-inspection helper and the failing test** - -At the bottom of `smart_http.rs`, add a `#[cfg(test)] mod tests` containing the pack-inspection helper (lists the OIDs inside a raw pack so tests can assert membership) and the first failing test: -```rust -#[cfg(test)] -mod tests { - use super::*; - use std::process::Command; - use tempfile::TempDir; - - /// List OIDs in a pack by writing it to a temp dir and running verify-pack. - pub(super) fn pack_object_ids(pack: &[u8]) -> std::collections::HashSet { - let dir = TempDir::new().unwrap(); - let path = dir.path().join("test.pack"); - std::fs::write(&path, pack).unwrap(); - // index-pack creates the matching .idx next to the pack. 
- let ok = Command::new("git") - .args(["index-pack", path.to_str().unwrap()]) - .status() - .unwrap() - .success(); - assert!(ok, "index-pack failed"); - let out = Command::new("git") - .args(["verify-pack", "-v", path.to_str().unwrap()]) - .output() - .unwrap(); - String::from_utf8_lossy(&out.stdout) - .lines() - .filter_map(|l| l.split_whitespace().next()) - .filter(|t| t.len() == 40 && t.chars().all(|c| c.is_ascii_hexdigit())) - .map(|s| s.to_string()) - .collect() - } - - #[tokio::test] - async fn filtered_serve_excludes_withheld_blob() { - // Build a bare repo, capture the secret + public blob OIDs. - let td = TempDir::new().unwrap(); - let work = td.path().join("work"); - let bare = td.path().join("bare.git"); - let g = |args: &[&str], dir: &std::path::Path| { - assert!(Command::new("git").args(args).current_dir(dir).status().unwrap().success()); - }; - std::fs::create_dir_all(work.join("secret")).unwrap(); - std::fs::create_dir_all(work.join("public")).unwrap(); - std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); - std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); - g(&["init", "-q"], &work); - g(&["config", "user.email", "t@t"], &work); - g(&["config", "user.name", "t"], &work); - g(&["add", "."], &work); - g(&["commit", "-qm", "init"], &work); - let oid = |p: &str| { - let o = Command::new("git").args(["rev-parse", &format!("HEAD:{p}")]) - .current_dir(&work).output().unwrap(); - String::from_utf8_lossy(&o.stdout).trim().to_string() - }; - let secret = oid("secret/b.txt"); - let public = oid("public/a.txt"); - g(&["clone", "-q", "--bare", work.to_str().unwrap(), bare.to_str().unwrap()], td.path()); - - let mut withheld = std::collections::HashSet::new(); - withheld.insert(secret.clone()); - - let pack = build_filtered_pack(&bare, &withheld).unwrap(); - let ids = pack_object_ids(&pack); - assert!(ids.contains(&public), "public blob must be in the pack"); - assert!(!ids.contains(&secret), "secret blob must NOT be in the pack"); - 
} -``` - -- [ ] **Step 2: Run the test to verify it fails** - -Run: `cargo test -p gitlawb-node smart_http::tests::filtered_serve_excludes_withheld_blob` -Expected: FAIL to compile with "cannot find function `build_filtered_pack`". - -- [ ] **Step 3: Implement `build_filtered_pack` and `upload_pack_excluding`** - -Add to `smart_http.rs` (above the `#[cfg(test)]` block). `build_filtered_pack` is the deterministic core (unit-tested in Step 1); `upload_pack_excluding` frames it as an HTTP response using the markers recorded in Task 0 Findings: -```rust -use std::collections::HashSet; - -/// Build a packfile containing every object reachable from all refs EXCEPT the -/// given blob OIDs. Commits and trees are always included, so SHAs stay intact; -/// only the named blobs are dropped. -pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Result> { - // All reachable objects as "oid [path]" lines. - let rev = std::process::Command::new("git") - .args(["rev-list", "--objects", "--all"]) - .current_dir(repo_path) - .output()?; - if !rev.status.success() { - bail!("git rev-list failed: {}", String::from_utf8_lossy(&rev.stderr)); - } - let mut keep = Vec::new(); - for line in String::from_utf8_lossy(&rev.stdout).lines() { - let oid = line.split_whitespace().next().unwrap_or(""); - if oid.is_empty() || withheld.contains(oid) { - continue; - } - keep.push(oid.to_string()); - } - let mut child = std::process::Command::new("git") - .args(["pack-objects", "--stdout"]) - .current_dir(repo_path) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn()?; - { - use std::io::Write as _; - let mut stdin = child.stdin.take().expect("stdin"); - stdin.write_all(keep.join("\n").as_bytes())?; - stdin.write_all(b"\n")?; - } - let out = child.wait_with_output()?; - if !out.status.success() { - bail!("git pack-objects failed: {}", String::from_utf8_lossy(&out.stderr)); - } - Ok(out.stdout) -} - -/// Serve a clone/fetch with the withheld blobs 
removed from the response pack. -/// Framing follows Task 0 Findings; the body wraps `build_filtered_pack` output -/// in the upload-pack `packfile` section with sideband-64k, terminated by flush. -pub async fn upload_pack_excluding( - repo_path: &Path, - _request_body: Bytes, - withheld: &HashSet, -) -> Result { - let pack = build_filtered_pack(repo_path, withheld)?; - let mut body = Vec::new(); - body.extend_from_slice(&pkt_line("packfile\n")); - // sideband-64k: band 1 carries pack data, chunked under the pkt-line limit. - for chunk in pack.chunks(65515) { - let mut framed = Vec::with_capacity(chunk.len() + 1); - framed.push(0x01); - framed.extend_from_slice(chunk); - let len = framed.len() + 4; - body.extend_from_slice(format!("{len:04x}").as_bytes()); - body.extend_from_slice(&framed); - } - body.extend_from_slice(b"0000"); - Ok(Response::builder() - .status(StatusCode::OK) - .header("Content-Type", "application/x-git-upload-pack-result") - .header("Cache-Control", "no-cache") - .body(Body::from(body))?) -} -``` -> If Task 0 recorded **Option A**, replace the two functions above with the injected-filter delegation to `git upload-pack`, keeping the `build_filtered_pack` name as a thin wrapper so the Step 1 test still drives the OID-exclusion guarantee. - -- [ ] **Step 4: Run the tests to verify they pass** - -Run: `cargo test -p gitlawb-node smart_http::tests::filtered_serve_excludes_withheld_blob` -Expected: PASS. 
- -- [ ] **Step 5: Commit** - -```bash -git add crates/gitlawb-node/src/git/smart_http.rs -git commit -m "feat(node): filtered upload-pack serve that omits withheld blobs" -``` - ---- - -## Task 3: Wire filtered serve into the upload-pack handler - -**Files:** -- Modify: `crates/gitlawb-node/src/api/repos.rs` (`git_upload_pack`, lines ~368-407) - -- [ ] **Step 1: Add the imports** - -At the top of `repos.rs`, in the existing `use crate::git::{...}` group, add `visibility_pack`: -```rust -use crate::git::{smart_http, store, visibility_pack}; -``` -(If `store` is not already in that group, keep whatever is there and append `visibility_pack`.) - -- [ ] **Step 2: Branch to the filtered serve** - -In `git_upload_pack`, the current body computes `rules`, runs the whole-repo `visibility_check(..., "/")` 404 gate, acquires `disk_path`, then calls `smart_http::upload_pack(&disk_path, body)`. Keep the 404 gate and the `acquire` exactly as they are. Replace only the single serve call: -```rust - let disk_path = state - .repo_store - .acquire(&record.owner_did, &record.name) - .await - .map_err(|e| AppError::Git(e.to_string()))?; - let body_len = body.len(); - - let withheld = - visibility_pack::withheld_blob_oids(&disk_path, &rules, record.is_public, &record.owner_did, caller) - .map_err(|e| AppError::Git(e.to_string()))?; - - let resp = if withheld.is_empty() { - smart_http::upload_pack(&disk_path, body).await - } else { - tracing::info!(repo = %name, caller = ?caller, withheld = withheld.len(), "serving filtered pack"); - smart_http::upload_pack_excluding(&disk_path, body, &withheld).await - } - .map_err(|e| { - let msg = e.to_string(); - if msg.contains("bad line length") || msg.contains("protocol error") { - tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); - AppError::BadRequest(msg) - } else { - tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); - AppError::Git(msg) - } - })?; -``` -Leave the 
`crate::metrics::record_fetch(...)` line and everything after it unchanged. - -- [ ] **Step 3: Verify the crate builds and existing tests pass** - -Run: `cargo test -p gitlawb-node` -Expected: PASS, including the Phase 1 whole-repo visibility tests (no regression). The new fast-path (`withheld.is_empty()`) must keep public and fully-authorized clones byte-identical to before. - -- [ ] **Step 4: Commit** - -```bash -git add crates/gitlawb-node/src/api/repos.rs -git commit -m "feat(node): serve filtered pack when caller has withheld subtree blobs" -``` - ---- - -## Task 4: End-to-end clone test through a real git client - -**Files:** -- Modify: `crates/gitlawb-node/src/git/smart_http.rs` (extend `mod tests`) - -This proves the served body is a clone a real `git` accepts and that the private bytes are absent from the resulting object store, which is the security guarantee. - -- [ ] **Step 1: Write the failing end-to-end test** - -Add to `smart_http.rs` `mod tests`: -```rust - #[tokio::test] - async fn client_clone_lacks_withheld_blob_bytes() { - use axum::body::to_bytes; - let td = TempDir::new().unwrap(); - let work = td.path().join("work"); - let bare = td.path().join("bare.git"); - let g = |args: &[&str], dir: &std::path::Path| { - assert!(Command::new("git").args(args).current_dir(dir).status().unwrap().success()); - }; - std::fs::create_dir_all(work.join("secret")).unwrap(); - std::fs::create_dir_all(work.join("public")).unwrap(); - std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); - std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); - g(&["init", "-q"], &work); - g(&["config", "user.email", "t@t"], &work); - g(&["config", "user.name", "t"], &work); - g(&["add", "."], &work); - g(&["commit", "-qm", "init"], &work); - let secret_oid = { - let o = Command::new("git").args(["rev-parse", "HEAD:secret/b.txt"]) - .current_dir(&work).output().unwrap(); - String::from_utf8_lossy(&o.stdout).trim().to_string() - }; - g(&["clone", "-q", "--bare", 
work.to_str().unwrap(), bare.to_str().unwrap()], td.path()); - - let mut withheld = std::collections::HashSet::new(); - withheld.insert(secret_oid.clone()); - - let resp = upload_pack_excluding(&bare, Bytes::new(), &withheld).await.unwrap(); - let body = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); - let ids = pack_object_ids(&extract_pack(&body)); - assert!(!ids.contains(&secret_oid), "withheld blob must be absent from served pack"); - } - - /// Strip the upload-pack `packfile` section framing, returning the raw pack. - /// Mirrors how a client de-frames the sideband-64k band-1 stream. - fn extract_pack(body: &[u8]) -> Vec { - let mut out = Vec::new(); - let mut i = 0; - while i + 4 <= body.len() { - let len = usize::from_str_radix( - std::str::from_utf8(&body[i..i + 4]).unwrap_or("0000"), - 16, - ) - .unwrap_or(0); - if len == 0 { - i += 4; - continue; - } - let chunk = &body[i + 4..i + len]; - // band 1 = pack data; skip "packfile\n" control line and other bands. - if chunk.first() == Some(&0x01) { - out.extend_from_slice(&chunk[1..]); - } - i += len; - } - out - } -``` -> If Task 0 chose Option A (delegated framing), `extract_pack` may need adjusting to the exact bands git emits; use the trace from Task 0 Step 3 to confirm. - -- [ ] **Step 2: Run the test to verify it fails (then passes once framing is right)** - -Run: `cargo test -p gitlawb-node smart_http::tests::client_clone_lacks_withheld_blob_bytes` -Expected: initially may FAIL if framing constants are off; iterate `extract_pack` / framing against Task 0 findings until PASS. Success criterion: the withheld OID is absent from the served pack. 
- -- [ ] **Step 3: Commit** - -```bash -git add crates/gitlawb-node/src/git/smart_http.rs -git commit -m "test(node): end-to-end assert served pack omits withheld blob" -``` - ---- - -## Task 5: Confirm `info/refs` does not leak and stays consistent - -**Files:** -- Modify: `crates/gitlawb-node/src/api/repos.rs` (no logic change to `git_info_refs`; add a confirming comment only if needed) - -The ref advertisement lists commit tips, not blob content, so a mode-B subtree does not require hiding any ref: a non-reader still clones the same commits, just without the private blobs. This task records that decision so a future reader does not "fix" it by gating `info/refs` on subtree rules. - -- [ ] **Step 1: Add a clarifying comment** - -In `git_info_refs`, next to the existing whole-repo gate (the `if service == "git-upload-pack"` block around line 330), append one line after the existing comment: -```rust - // Subtree (mode B) rules do not gate the advertisement: refs expose commit - // tips only, and blob withholding happens in the upload-pack pack build. -``` - -- [ ] **Step 2: Verify nothing else changed** - -Run: `git diff crates/gitlawb-node/src/api/repos.rs` -Expected: only the one comment line added in `git_info_refs`; the whole-repo 404 gate is untouched. - -- [ ] **Step 3: Commit** - -```bash -git add crates/gitlawb-node/src/api/repos.rs -git commit -m "docs(node): note why info/refs is not gated on subtree visibility" -``` - ---- - -## Task 6: Full verification gate - -**Files:** none (verification only) - -- [ ] **Step 1: Format** - -Run: `cargo fmt --all && cargo fmt --all --check` -Expected: clean (no diff). - -- [ ] **Step 2: Lint** - -Run: `cargo clippy --all-targets -- -D warnings` -Expected: no warnings. - -- [ ] **Step 3: Full test suite** - -Run: `cargo test -p gitlawb-node` -Expected: all pass, including Phase 1 visibility tests and the new `visibility_pack` and `smart_http` tests. 
- -- [ ] **Step 4: Manual smoke (optional but recommended)** - -Set a subtree rule on a local repo via `gl visibility`, clone as a non-reader through the node, and confirm the private file's bytes are absent (`git cat-file -p HEAD:secret/b.txt` fails or the file is missing) while the tree entry / SHA is still listed (`git ls-tree HEAD secret/`). - ---- - -## Out of scope / follow-ups (separate plans) - -1. **`git-remote-gitlawb` partial-clone UX.** Make a non-reader's clone Just Work without the user passing `--filter`: the helper requests partial-clone semantics, advertises the `filter` capability cleanly (so there is no "filtering not recognized by server, ignoring" warning), and treats withheld blobs as deliberately absent. Without this, a stock full `git clone` of a repo with a withheld blob is refused at fetch time ("remote did not send all necessary objects"); only `git clone --filter=...` succeeds. The security guarantee (bytes never sent) holds regardless; this is purely UX. -2. **Filtered-pack caching.** `build_filtered_pack` recomputes per request. If hot, cache by (repo, tip-OIDs, withheld-set) and invalidate on push. -3. **Incremental fetch efficiency.** Verified during execution: an incremental `git fetch` after a partial clone is already correct and still withholds the private blob (covered by `real_git_fetch_after_partial_clone_still_withholds`). The serve ignores the client's `have`/`want` negotiation and always sends a self-contained pack of all refs minus the withheld blobs, replying `NAK`; the client de-duplicates, so nothing breaks. The only cost is that a fetch re-sends the full object set instead of a thin delta. Honoring negotiation to produce smaller fetch packs is the optimization left here. -4. **Replication-path enforcement (Phase 2).** Still blocked on the maintainer A/B decision; unrelated to this HTTP-path work. 
From 0c8a1b7ffd7b1a6eb932ef267e162381c34d84f8 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 6 Jun 2026 22:08:35 -0500 Subject: [PATCH 12/58] chore: gitignore local planning docs (docs/superpowers/) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 404c87b..a36d8f7 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ keys/ # Logs *.log .openclaude-profile.json + +# Local planning / scratch docs (never commit) +docs/superpowers/ From 70bcefdee2e4a4f5d2326299bfd7bf15260cadd5 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 16:08:12 -0500 Subject: [PATCH 13/58] feat(node): withheld-paths endpoint reporting a caller's denied globs --- crates/gitlawb-node/src/api/visibility.rs | 27 +++++++++++++++ crates/gitlawb-node/src/server.rs | 4 +++ crates/gitlawb-node/src/visibility.rs | 41 +++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/crates/gitlawb-node/src/api/visibility.rs b/crates/gitlawb-node/src/api/visibility.rs index 531c724..5a36648 100644 --- a/crates/gitlawb-node/src/api/visibility.rs +++ b/crates/gitlawb-node/src/api/visibility.rs @@ -185,6 +185,33 @@ pub async fn list_visibility( }))) } +/// GET /api/v1/repos/{owner}/{repo}/withheld-paths +/// +/// Returns only the path globs the (optionally authenticated) caller is denied, +/// so a clean-clone client can sparse-exclude them. Unlike `list_visibility` +/// this is not owner-gated and never exposes reader_dids. +pub async fn withheld_paths( + State(state): State, + auth: Option>, + Path((owner, repo)): Path<(String, String)>, +) -> Result> { + let record = state + .db + .get_repo(&owner, &repo) + .await? 
+ .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; + + let rules = state.db.list_visibility_rules(&record.id).await?; + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + let withheld = + crate::visibility::withheld_globs(&rules, record.is_public, &record.owner_did, caller); + + Ok(Json(serde_json::json!({ + "repo": format!("{owner}/{repo}"), + "withheld": withheld, + }))) +} + #[cfg(test)] mod tests { use super::validate_path_glob; diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index 4a8ec37..9baea20 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -352,6 +352,10 @@ pub fn build_router(state: AppState) -> Router { "/{owner}/{repo}/git-upload-pack", post(repos::git_upload_pack), ) + .route( + "/api/v1/repos/{owner}/{repo}/withheld-paths", + axum::routing::get(visibility::withheld_paths), + ) .layer(DefaultBodyLimit::disable()) .layer(RequestBodyLimitLayer::new(pack_limit)) .layer(middleware::from_fn(auth::optional_signature)); diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index b246dbf..6cc6445 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -96,6 +96,29 @@ pub fn visibility_check( } } +/// The subtree path globs that `caller` (None = anonymous) may NOT read, given +/// the repo's rules. Whole-repo ("/") rules are excluded: a denied whole-repo +/// read is handled by the 404 gate before a clone ever starts. Each remaining +/// rule is reported when `visibility_check` denies the caller at the glob's +/// representative path. Used by the clean-clone client to sparse-exclude the +/// private paths from checkout. 
+pub fn withheld_globs( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Vec { + rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + let probe = glob_prefix(&r.path_glob); + visibility_check(rules, is_public, owner_did, caller, probe) == Decision::Deny + }) + .map(|r| r.path_glob.clone()) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -116,6 +139,24 @@ mod tests { const OWNER: &str = "did:key:z6MkOwner"; + #[test] + fn withheld_globs_lists_only_denied_subtrees() { + let rules = [ + rule("/secret/**", VisibilityMode::B, &["did:key:z6MkFriend"]), + rule("/docs/**", VisibilityMode::B, &["did:key:z6MkStranger"]), + ]; + // Stranger is denied /secret but allowed /docs. + let mut got = withheld_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + got.sort(); + assert_eq!(got, vec!["/secret/**".to_string()]); + // Owner is denied nothing. + assert!(withheld_globs(&rules, true, OWNER, Some(OWNER)).is_empty()); + // Anonymous is denied both. + let mut anon = withheld_globs(&rules, true, OWNER, None); + anon.sort(); + assert_eq!(anon, vec!["/docs/**".to_string(), "/secret/**".to_string()]); + } + #[test] fn no_rules_public_allows_anonymous() { assert_eq!( From 3e1a2038fa758fae7e27735d7dfe7c783cb81698 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 16:15:20 -0500 Subject: [PATCH 14/58] feat(gl): gl clone with promisor + sparse-exclude for private subtrees --- crates/gl/src/clone.rs | 248 +++++++++++++++++++++++++++++++++++++++++ crates/gl/src/main.rs | 5 + 2 files changed, 253 insertions(+) create mode 100644 crates/gl/src/clone.rs diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs new file mode 100644 index 0000000..9b1ef3e --- /dev/null +++ b/crates/gl/src/clone.rs @@ -0,0 +1,248 @@ +//! `gl clone`: clean partial clone of a gitlawb repo with private subtrees. +//! +//! 
A repo may withhold blob content under some path globs from the caller +//! (Phase 3). The resulting pack is not closed under reachability, so a stock +//! `git clone` is refused at fetch. This command clones as a promisor +//! (`--filter=blob:none`) and sparse-excludes the caller's withheld globs, +//! producing a clean checkout: public files present, private paths absent. + +use anyhow::{bail, Context, Result}; +use clap::Args; +use serde_json::Value; +use std::path::Path; +use std::process::Command; + +use crate::http::NodeClient; +use crate::identity::load_keypair_from_dir; + +#[derive(Args)] +pub struct CloneArgs { + /// Repo to clone: gitlawb:/// or /. + pub repo: String, + + /// Destination directory (default: the repo name). + pub dir: Option, + + /// Branch to check out (default: the remote's default branch). + #[arg(long)] + pub branch: Option, + + #[arg(long, default_value = "https://node.gitlawb.com", env = "GITLAWB_NODE")] + pub node: String, +} + +/// Run a git command inside `dir`, erroring with stderr on failure. +fn git(dir: &Path, args: &[&str]) -> Result<()> { + let out = Command::new("git") + .args(args) + .current_dir(dir) + .output() + .with_context(|| format!("running git {args:?}"))?; + if !out.status.success() { + bail!( + "git {args:?} failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(()) +} + +/// Run a git command not tied to a working tree (e.g. `clone`). +fn git_global(args: &[&str]) -> Result<()> { + let out = Command::new("git") + .args(args) + .output() + .with_context(|| format!("running git {args:?}"))?; + if !out.status.success() { + bail!( + "git {args:?} failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(()) +} + +/// Clone `remote_url` into `dest`, excluding `withheld_globs` from checkout. +/// `dest` must not already exist. With nothing withheld this is a plain full +/// clone. 
With globs withheld it clones as a promisor (`--filter=blob:none`, +/// marking the repo a promisor so the node's non-closed pack is accepted) +/// without checkout, sparse-excludes each glob, then checks out so the absent +/// blobs are never materialized. `--no-cone` is required for negated excludes. +pub fn setup_partial_clone( + dest: &Path, + remote_url: &str, + withheld_globs: &[String], + branch: Option<&str>, +) -> Result<()> { + let dest_str = dest + .to_str() + .context("destination path is not valid UTF-8")?; + + if withheld_globs.is_empty() { + match branch { + Some(b) => git_global(&["clone", "-q", "--branch", b, remote_url, dest_str])?, + None => git_global(&["clone", "-q", remote_url, dest_str])?, + } + return Ok(()); + } + + git_global(&[ + "clone", + "-q", + "--filter=blob:none", + "--no-checkout", + remote_url, + dest_str, + ])?; + git(dest, &["sparse-checkout", "init", "--no-cone"])?; + let mut spec = String::from("/*\n"); + for g in withheld_globs { + // "/secret/**" -> "!/secret/" + let dir = g.trim_end_matches("**").trim_end_matches('/'); + spec.push('!'); + spec.push_str(dir); + spec.push_str("/\n"); + } + std::fs::write(dest.join(".git/info/sparse-checkout"), spec) + .context("writing sparse-checkout spec")?; + + match branch { + Some(b) => git(dest, &["checkout", "-q", b])?, + None => { + let out = Command::new("git") + .args(["remote", "show", "origin"]) + .current_dir(dest) + .output()?; + let text = String::from_utf8_lossy(&out.stdout); + let head = text + .lines() + .find_map(|l| l.trim().strip_prefix("HEAD branch: ")) + .map(|s| s.to_string()) + .context("could not determine default branch")?; + git(dest, &["checkout", "-q", &head])?; + } + } + Ok(()) +} + +/// Parse `repo` into (gitlawb_url, owner, name). Accepts a full +/// `gitlawb:///` URL or a bare `/`. The owner DID may +/// itself contain colons but no slash, so split on the first slash. 
+fn parse_repo(repo: &str) -> Result<(String, String, String)> { + let stripped = repo.strip_prefix("gitlawb://").unwrap_or(repo); + let (owner, name) = stripped + .trim_end_matches('/') + .split_once('/') + .context("repo must be / or gitlawb:///")?; + if owner.is_empty() || name.is_empty() { + bail!("repo must be / or gitlawb:///"); + } + Ok(( + format!("gitlawb://{owner}/{name}"), + owner.to_string(), + name.to_string(), + )) +} + +/// Ask the node which globs are withheld for this caller. Any error or non-2xx +/// is treated as "nothing withheld" so public repos clone normally. +async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Vec { + let kp = load_keypair_from_dir(None).ok(); + let signed = kp.is_some(); + let client = NodeClient::new(node, kp); + let path = format!("/api/v1/repos/{owner}/{name}/withheld-paths"); + let resp = if signed { + client.get_signed(&path).await + } else { + client.get(&path).await + }; + let resp = match resp { + Ok(r) if r.status().is_success() => r, + _ => return Vec::new(), + }; + let body: Value = resp.json().await.unwrap_or_default(); + body.get("withheld") + .and_then(|w| w.as_array()) + .map(|a| { + a.iter() + .filter_map(|x| x.as_str().map(String::from)) + .collect() + }) + .unwrap_or_default() +} + +pub async fn run(args: CloneArgs) -> Result<()> { + let (url, owner, name) = parse_repo(&args.repo)?; + let dest_name = args.dir.unwrap_or_else(|| name.clone()); + let dest = std::path::PathBuf::from(&dest_name); + if dest.exists() { + bail!("destination '{dest_name}' already exists"); + } + + let withheld = fetch_withheld(&args.node, &owner, &name).await; + if withheld.is_empty() { + println!("Cloning {url} into {dest_name}"); + } else { + println!( + "Cloning {url} into {dest_name} ({} private path(s) excluded)", + withheld.len() + ); + } + + setup_partial_clone(&dest, &url, &withheld, args.branch.as_deref())?; + println!("Done. 
Cloned into {dest_name}"); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + fn g(args: &[&str], dir: &Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + #[test] + fn setup_partial_clone_excludes_withheld_path() { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(origin.join("secret")).unwrap(); + std::fs::create_dir_all(origin.join("public")).unwrap(); + std::fs::write(origin.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(origin.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + // file:// so --filter is honored (local-path clones ignore it). 
+ let dest = td.path().join("dest"); + let url = format!("file://{}", bare.display()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], None).unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public file present"); + assert!( + !dest.join("secret/b.txt").exists(), + "withheld path must be excluded from checkout" + ); + } +} diff --git a/crates/gl/src/main.rs b/crates/gl/src/main.rs index 0af7398..1c1a50d 100644 --- a/crates/gl/src/main.rs +++ b/crates/gl/src/main.rs @@ -7,6 +7,7 @@ mod agent; mod bounty; mod cert; mod changelog; +mod clone; mod doctor; mod http; mod identity; @@ -57,6 +58,9 @@ enum Commands { /// Register this agent with a gitlawb node Register(register::RegisterArgs), + /// Clone a gitlawb repo, handling private subtrees cleanly + Clone(clone::CloneArgs), + /// Manage repositories Repo(repo::RepoArgs), @@ -150,6 +154,7 @@ async fn main() -> Result<()> { match cli.command { Commands::Identity { cmd } => identity::run(cmd).await, Commands::Register(args) => register::run(args).await, + Commands::Clone(args) => clone::run(args).await, Commands::Repo(args) => repo::run(args).await, Commands::Issue(args) => issue::run(args).await, Commands::Pr(args) => pr::run(args).await, From 2fd4fe131c893bd71751f6d61e914084fc2dd8af Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 16:15:30 -0500 Subject: [PATCH 15/58] test(gl): gl clone repo-argument parsing --- crates/gl/src/clone.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 9b1ef3e..d06135e 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -245,4 +245,22 @@ mod tests { "withheld path must be excluded from checkout" ); } + + #[test] + fn parse_repo_accepts_url_and_bare() { + let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); + assert_eq!(url, "gitlawb://did:key:zAbc/myrepo"); + assert_eq!((o.as_str(), 
n.as_str()), ("did:key:zAbc", "myrepo")); + + let (url2, o2, n2) = parse_repo("did:key:zAbc/myrepo").unwrap(); + assert_eq!(url2, "gitlawb://did:key:zAbc/myrepo"); + assert_eq!((o2.as_str(), n2.as_str()), ("did:key:zAbc", "myrepo")); + } + + #[test] + fn parse_repo_rejects_malformed() { + assert!(parse_repo("noslash").is_err()); + assert!(parse_repo("gitlawb://owner/").is_err()); + assert!(parse_repo("/name").is_err()); + } } From af972499a94a8a644389ceef92fac34047866224 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sun, 7 Jun 2026 17:24:00 -0500 Subject: [PATCH 16/58] fix(visibility): gate withheld-paths and handle nested/exact globs Three fixes from the PR #33 review: - withheld_paths now applies the whole-repo "/" read gate (returns repo-not-found when the caller cannot read the root), matching the git read endpoints. Without it the endpoint disclosed a private repo's existence and path layout to unauthorized callers. The withheld_globs doc already assumed this gate existed; now it does. - A nested allow under a denied parent (e.g. "/secret/public/**" allowed, "/secret/**" denied) was over-withheld: the client sparse-excluded the whole parent and hid paths the caller may read. The endpoint now also returns a "reinclude" list (allowed globs strictly under a denied one) and gl clone re-includes them in the sparse spec after the excludes. - Wildcard-free globs like "/docs/private" match both the exact path and a subtree (per glob_matches), but the client only emitted the subtree exclude. sparse_patterns now emits both "/docs/private" and "/docs/private/". Verified the exclude-then-reinclude sparse ordering checks out cleanly with real git, plus unit tests for reincluded_globs, the nested re-include, the exact-path exclude, and sparse_patterns. 
--- crates/gitlawb-node/src/api/visibility.rs | 21 ++- crates/gitlawb-node/src/visibility.rs | 68 ++++++++++ crates/gl/src/clone.rs | 148 +++++++++++++++++++--- 3 files changed, 214 insertions(+), 23 deletions(-) diff --git a/crates/gitlawb-node/src/api/visibility.rs b/crates/gitlawb-node/src/api/visibility.rs index 5a36648..6665b9e 100644 --- a/crates/gitlawb-node/src/api/visibility.rs +++ b/crates/gitlawb-node/src/api/visibility.rs @@ -187,9 +187,11 @@ pub async fn list_visibility( /// GET /api/v1/repos/{owner}/{repo}/withheld-paths /// -/// Returns only the path globs the (optionally authenticated) caller is denied, -/// so a clean-clone client can sparse-exclude them. Unlike `list_visibility` -/// this is not owner-gated and never exposes reader_dids. +/// Returns the path globs the (optionally authenticated) caller is denied +/// (`withheld`) plus any more-specific globs that are allowed underneath a +/// denied one (`reinclude`), so a clean-clone client can sparse-exclude the +/// denied subtrees while re-including the allowed nested paths. Unlike +/// `list_visibility` this is not owner-gated and never exposes reader_dids. pub async fn withheld_paths( State(state): State, auth: Option>, @@ -203,12 +205,25 @@ pub async fn withheld_paths( let rules = state.db.list_visibility_rules(&record.id).await?; let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + + // Whole-repo read gate: a caller who cannot read "/" gets repo-not-found, + // matching the git read endpoints, so this never discloses a private repo's + // existence or its path layout to an unauthorized caller. 
+ if crate::visibility::visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") + == crate::visibility::Decision::Deny + { + return Err(AppError::RepoNotFound(format!("{owner}/{repo}"))); + } + let withheld = crate::visibility::withheld_globs(&rules, record.is_public, &record.owner_did, caller); + let reinclude = + crate::visibility::reincluded_globs(&rules, record.is_public, &record.owner_did, caller); Ok(Json(serde_json::json!({ "repo": format!("{owner}/{repo}"), "withheld": withheld, + "reinclude": reinclude, }))) } diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index 6cc6445..345f41d 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -119,6 +119,55 @@ pub fn withheld_globs( .collect() } +/// The allowed globs that sit strictly underneath a denied glob. A clean-clone +/// client sparse-excludes everything in `withheld_globs`, which would also hide +/// these nested allowed paths; re-including them restores the caller's access. +/// Example: with `/secret/**` denied and `/secret/public/**` allowed for the +/// same caller, `/secret/public/**` is returned here so the client re-includes +/// it after excluding `/secret/`. 
+pub fn reincluded_globs( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Vec { + let denied: Vec<&str> = rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + visibility_check( + rules, + is_public, + owner_did, + caller, + glob_prefix(&r.path_glob), + ) == Decision::Deny + }) + .map(|r| glob_prefix(&r.path_glob)) + .collect(); + + rules + .iter() + .filter(|r| r.path_glob != "/") + .filter(|r| { + visibility_check( + rules, + is_public, + owner_did, + caller, + glob_prefix(&r.path_glob), + ) == Decision::Allow + }) + .filter(|r| { + let p = glob_prefix(&r.path_glob); + denied + .iter() + .any(|d| *d != p && p.starts_with(&format!("{d}/"))) + }) + .map(|r| r.path_glob.clone()) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -157,6 +206,25 @@ mod tests { assert_eq!(anon, vec!["/docs/**".to_string(), "/secret/**".to_string()]); } + #[test] + fn reincluded_globs_restores_allowed_nested_path() { + let rules = [ + rule("/secret/**", VisibilityMode::B, &["did:key:z6MkFriend"]), + rule( + "/secret/public/**", + VisibilityMode::B, + &["did:key:z6MkFriend", "did:key:z6MkStranger"], + ), + ]; + // Stranger is denied /secret/** but allowed the nested /secret/public/**. + let withheld = withheld_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + assert_eq!(withheld, vec!["/secret/**".to_string()]); + let reinc = reincluded_globs(&rules, true, OWNER, Some("did:key:z6MkStranger")); + assert_eq!(reinc, vec!["/secret/public/**".to_string()]); + // Owner is denied nothing, so there is nothing to re-include. 
+ assert!(reincluded_globs(&rules, true, OWNER, Some(OWNER)).is_empty()); + } + #[test] fn no_rules_public_allows_anonymous() { assert_eq!( diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index d06135e..7bb481b 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -62,6 +62,18 @@ fn git_global(args: &[&str]) -> Result<()> { Ok(()) } +/// Sparse-checkout pattern(s) for a visibility glob. A subtree glob +/// (`/secret/**`) maps to the directory `/secret/`. A wildcard-free glob +/// (`/docs/private`) matches both the exact path and a subtree at that path +/// (mirroring the node's `glob_matches`), so it maps to both `/docs/private` +/// and `/docs/private/`. Callers prefix these with `!` to exclude. +fn sparse_patterns(glob: &str) -> Vec { + match glob.strip_suffix("/**") { + Some(base) => vec![format!("{base}/")], + None => vec![glob.to_string(), format!("{glob}/")], + } +} + /// Clone `remote_url` into `dest`, excluding `withheld_globs` from checkout. /// `dest` must not already exist. With nothing withheld this is a plain full /// clone. With globs withheld it clones as a promisor (`--filter=blob:none`, @@ -72,6 +84,7 @@ pub fn setup_partial_clone( dest: &Path, remote_url: &str, withheld_globs: &[String], + reinclude_globs: &[String], branch: Option<&str>, ) -> Result<()> { let dest_str = dest @@ -95,13 +108,22 @@ pub fn setup_partial_clone( dest_str, ])?; git(dest, &["sparse-checkout", "init", "--no-cone"])?; + // Non-cone sparse-checkout, gitignore-style and order-sensitive: include + // everything, exclude the withheld globs, then re-include any allowed globs + // nested under an excluded one (later patterns win). 
let mut spec = String::from("/*\n"); for g in withheld_globs { - // "/secret/**" -> "!/secret/" - let dir = g.trim_end_matches("**").trim_end_matches('/'); - spec.push('!'); - spec.push_str(dir); - spec.push_str("/\n"); + for pat in sparse_patterns(g) { + spec.push('!'); + spec.push_str(&pat); + spec.push('\n'); + } + } + for g in reinclude_globs { + for pat in sparse_patterns(g) { + spec.push_str(&pat); + spec.push('\n'); + } } std::fs::write(dest.join(".git/info/sparse-checkout"), spec) .context("writing sparse-checkout spec")?; @@ -144,9 +166,11 @@ fn parse_repo(repo: &str) -> Result<(String, String, String)> { )) } -/// Ask the node which globs are withheld for this caller. Any error or non-2xx -/// is treated as "nothing withheld" so public repos clone normally. -async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Vec { +/// Ask the node which globs are withheld for this caller and which allowed globs +/// nested under them must be re-included. Returns `(withheld, reinclude)`. Any +/// error or non-2xx is treated as "nothing withheld" so public repos clone +/// normally. 
+async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Vec) { let kp = load_keypair_from_dir(None).ok(); let signed = kp.is_some(); let client = NodeClient::new(node, kp); @@ -158,17 +182,20 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Vec { }; let resp = match resp { Ok(r) if r.status().is_success() => r, - _ => return Vec::new(), + _ => return (Vec::new(), Vec::new()), }; let body: Value = resp.json().await.unwrap_or_default(); - body.get("withheld") - .and_then(|w| w.as_array()) - .map(|a| { - a.iter() - .filter_map(|x| x.as_str().map(String::from)) - .collect() - }) - .unwrap_or_default() + let globs = |field: &str| -> Vec { + body.get(field) + .and_then(|w| w.as_array()) + .map(|a| { + a.iter() + .filter_map(|x| x.as_str().map(String::from)) + .collect() + }) + .unwrap_or_default() + }; + (globs("withheld"), globs("reinclude")) } pub async fn run(args: CloneArgs) -> Result<()> { @@ -179,7 +206,7 @@ pub async fn run(args: CloneArgs) -> Result<()> { bail!("destination '{dest_name}' already exists"); } - let withheld = fetch_withheld(&args.node, &owner, &name).await; + let (withheld, reinclude) = fetch_withheld(&args.node, &owner, &name).await; if withheld.is_empty() { println!("Cloning {url} into {dest_name}"); } else { @@ -189,7 +216,7 @@ pub async fn run(args: CloneArgs) -> Result<()> { ); } - setup_partial_clone(&dest, &url, &withheld, args.branch.as_deref())?; + setup_partial_clone(&dest, &url, &withheld, &reinclude, args.branch.as_deref())?; println!("Done. Cloned into {dest_name}"); Ok(()) } @@ -237,7 +264,7 @@ mod tests { // file:// so --filter is honored (local-path clones ignore it). 
let dest = td.path().join("dest"); let url = format!("file://{}", bare.display()); - setup_partial_clone(&dest, &url, &["/secret/**".to_string()], None).unwrap(); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); assert!(dest.join("public/a.txt").exists(), "public file present"); assert!( @@ -246,6 +273,87 @@ mod tests { ); } + /// Build a bare remote with the given files (relative path -> contents), + /// committed on one branch. Returns (tempdir, file:// url). + fn bare_remote(files: &[(&str, &[u8])]) -> (TempDir, String) { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + for (path, contents) in files { + let full = origin.join(path); + std::fs::create_dir_all(full.parent().unwrap()).unwrap(); + std::fs::write(full, contents).unwrap(); + } + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + let url = format!("file://{}", bare.display()); + (td, url) + } + + #[test] + fn reinclude_restores_allowed_nested_path() { + let (td, url) = bare_remote(&[ + ("public/a.txt", b"pub\n"), + ("secret/private/p.txt", b"PRIV\n"), + ("secret/public/s.txt", b"SHARED\n"), + ]); + let dest = td.path().join("dest"); + setup_partial_clone( + &dest, + &url, + &["/secret/**".to_string()], + &["/secret/public/**".to_string()], + None, + ) + .unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + dest.join("secret/public/s.txt").exists(), + "allowed nested path must be re-included" + ); + assert!( + !dest.join("secret/private/p.txt").exists(), + "denied nested path must stay excluded" + ); + } + + #[test] + fn exact_path_glob_is_excluded() { + // A wildcard-free glob must exclude the exact 
file, not just a subtree. + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("docs/private", b"SECRET\n")]); + let dest = td.path().join("dest"); + setup_partial_clone(&dest, &url, &["/docs/private".to_string()], &[], None).unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + !dest.join("docs/private").exists(), + "exact-path withheld file must be excluded" + ); + } + + #[test] + fn sparse_patterns_subtree_and_exact() { + assert_eq!(sparse_patterns("/secret/**"), vec!["/secret/".to_string()]); + assert_eq!( + sparse_patterns("/docs/private"), + vec!["/docs/private".to_string(), "/docs/private/".to_string()] + ); + } + #[test] fn parse_repo_accepts_url_and_bare() { let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); From 8f7060ad349754569ac821618670a5d6088184e3 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 09:12:25 -0500 Subject: [PATCH 17/58] fix(gl): reject multi-slash repo input and stop failing open on withheld-path errors split_once('/') accepted owner/name/extra, smuggling a path segment into the repo name that then flowed into the API path and clone URL; reject it. fetch_withheld swallowed every network/auth/5xx/JSON error into an empty result, dropping to a stock clone that the node refuses once blobs are withheld. Now only 404/501 (endpoint unsupported) fall back to empty; the rest propagate so the real cause surfaces. 
--- crates/gl/src/clone.rs | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 7bb481b..3f9af70 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -156,7 +156,7 @@ fn parse_repo(repo: &str) -> Result<(String, String, String)> { .trim_end_matches('/') .split_once('/') .context("repo must be / or gitlawb:///")?; - if owner.is_empty() || name.is_empty() { + if owner.is_empty() || name.is_empty() || name.contains('/') { bail!("repo must be / or gitlawb:///"); } Ok(( @@ -167,10 +167,13 @@ fn parse_repo(repo: &str) -> Result<(String, String, String)> { } /// Ask the node which globs are withheld for this caller and which allowed globs -/// nested under them must be re-included. Returns `(withheld, reinclude)`. Any -/// error or non-2xx is treated as "nothing withheld" so public repos clone -/// normally. -async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Vec) { +/// nested under them must be re-included. Returns `(withheld, reinclude)`. A +/// node that does not implement the endpoint (404/501) yields empties so public +/// repos on older nodes still clone normally. Other failures (network, auth, +/// 5xx, malformed JSON) are propagated: failing open here would silently fall +/// back to a stock clone, which the node refuses once blobs are withheld, hiding +/// the real cause behind a confusing fetch error. 
+async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Result<(Vec, Vec)> { let kp = load_keypair_from_dir(None).ok(); let signed = kp.is_some(); let client = NodeClient::new(node, kp); @@ -182,9 +185,14 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Ve }; let resp = match resp { Ok(r) if r.status().is_success() => r, - _ => return (Vec::new(), Vec::new()), + Ok(r) if matches!(r.status().as_u16(), 404 | 501) => return Ok((Vec::new(), Vec::new())), + Ok(r) => bail!("withheld-paths lookup failed: {}", r.status()), + Err(err) => return Err(err).context("fetching withheld paths"), }; - let body: Value = resp.json().await.unwrap_or_default(); + let body: Value = resp + .json() + .await + .context("parsing withheld-paths response")?; let globs = |field: &str| -> Vec { body.get(field) .and_then(|w| w.as_array()) @@ -195,7 +203,7 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> (Vec, Ve }) .unwrap_or_default() }; - (globs("withheld"), globs("reinclude")) + Ok((globs("withheld"), globs("reinclude"))) } pub async fn run(args: CloneArgs) -> Result<()> { @@ -206,7 +214,7 @@ pub async fn run(args: CloneArgs) -> Result<()> { bail!("destination '{dest_name}' already exists"); } - let (withheld, reinclude) = fetch_withheld(&args.node, &owner, &name).await; + let (withheld, reinclude) = fetch_withheld(&args.node, &owner, &name).await?; if withheld.is_empty() { println!("Cloning {url} into {dest_name}"); } else { @@ -370,5 +378,7 @@ mod tests { assert!(parse_repo("noslash").is_err()); assert!(parse_repo("gitlawb://owner/").is_err()); assert!(parse_repo("/name").is_err()); + // An extra slash would otherwise smuggle a path segment into the name. 
+ assert!(parse_repo("owner/name/extra").is_err()); } } From 26e65f53451200f92386047a1f605d748f45ce8e Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 09:54:57 -0500 Subject: [PATCH 18/58] feat(node): replicable_objects filter for replication enforcement --- .../gitlawb-node/src/git/visibility_pack.rs | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index d386415..693a7e7 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -77,6 +77,16 @@ pub fn withheld_blob_oids( Ok(denied.difference(&allowed).cloned().collect()) } +/// Objects that may replicate to the public: everything not in `withheld`. +/// Order-preserving. The single seam every replication site (IPFS, Pinata) +/// passes its object list through; option B would later reroute the withheld +/// ones through encrypt-then-pin instead of dropping them. 
+pub fn replicable_objects(all: Vec, withheld: &HashSet) -> Vec { + all.into_iter() + .filter(|oid| !withheld.contains(oid)) + .collect() +} + #[cfg(test)] mod tests { use super::*; @@ -186,4 +196,20 @@ mod tests { "public repo, no rules, nothing withheld" ); } + + #[test] + fn replicable_objects_drops_withheld_keeps_rest() { + let all = vec!["aaa".to_string(), "bbb".to_string(), "ccc".to_string()]; + let withheld: HashSet = ["bbb".to_string()].into_iter().collect(); + let got = replicable_objects(all, &withheld); + assert_eq!(got, vec!["aaa".to_string(), "ccc".to_string()]); + } + + #[test] + fn replicable_objects_empty_withheld_keeps_all() { + let all = vec!["aaa".to_string(), "bbb".to_string()]; + let withheld: HashSet = HashSet::new(); + let got = replicable_objects(all.clone(), &withheld); + assert_eq!(got, all); + } } From eb7c7641bebf4624f6ad18892dd850a2f896fd47 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 09:57:43 -0500 Subject: [PATCH 19/58] test(node): pin anonymous-caller contract of withheld_blob_oids --- crates/gitlawb-node/src/git/visibility_pack.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index 693a7e7..c9c6d6b 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -156,6 +156,24 @@ mod tests { (td, bare, secret, public) } + #[test] + fn anonymous_caller_withholds_only_private_blob() { + let (_td, bare, secret_oid, public_oid) = fixture(); + let rules = [rule("/secret/**", &[])]; + // caller = None models the public / any peer: what must not replicate. 
+ let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "secret blob must be withheld" + ); + assert!( + !withheld.contains(&public_oid), + "public blob must replicate" + ); + // Trees and commits are never withheld; the set holds only the secret blob. + assert_eq!(withheld.len(), 1, "only the secret blob OID is withheld"); + } + #[test] fn non_reader_withholds_only_the_private_blob() { let (_td, bare, secret, public) = fixture(); From c2c287ebf69ee233558adb89dd48058bfc83c905 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 10:00:12 -0500 Subject: [PATCH 20/58] feat(node): IPFS pinning skips withheld blob OIDs --- crates/gitlawb-node/src/api/repos.rs | 9 +++++++-- crates/gitlawb-node/src/ipfs_pin.rs | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 4522e8d..1ddf44b 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -614,8 +614,13 @@ pub async fn git_receive_pack( let repo_path_clone = disk_path.clone(); let db_clone = state.db.clone(); tokio::spawn(async move { - let pinned = - crate::ipfs_pin::pin_new_objects(&ipfs_api, &repo_path_clone, &db_clone).await; + let pinned = crate::ipfs_pin::pin_new_objects( + &ipfs_api, + &repo_path_clone, + &db_clone, + &std::collections::HashSet::new(), + ) + .await; if !pinned.is_empty() { tracing::info!(count = pinned.len(), "pinned git objects to IPFS"); for (sha, cid) in &pinned { diff --git a/crates/gitlawb-node/src/ipfs_pin.rs b/crates/gitlawb-node/src/ipfs_pin.rs index 831f1ad..96d6abd 100644 --- a/crates/gitlawb-node/src/ipfs_pin.rs +++ b/crates/gitlawb-node/src/ipfs_pin.rs @@ -7,6 +7,8 @@ //! If `ipfs_api` is empty the functions are no-ops, so the node works fine //! without a local IPFS daemon. 
+use std::collections::HashSet; + use anyhow::Result; use gitlawb_core::cid::Cid; @@ -78,6 +80,7 @@ pub async fn pin_new_objects( ipfs_api: &str, repo_path: &std::path::Path, db: &crate::db::Db, + withheld: &HashSet, ) -> Vec<(String, String)> { if ipfs_api.is_empty() { return vec![]; @@ -92,6 +95,8 @@ pub async fn pin_new_objects( } }; + let object_list = crate::git::visibility_pack::replicable_objects(object_list, withheld); + let mut pinned = Vec::new(); for sha in object_list { From d305af738c4b4e2fda262bd57e4959755fcc543c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 10:03:05 -0500 Subject: [PATCH 21/58] feat(node): Pinata pinning skips withheld blob OIDs --- crates/gitlawb-node/src/api/repos.rs | 1 + crates/gitlawb-node/src/pinata.rs | 3 +++ 2 files changed, 4 insertions(+) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 1ddf44b..1efde93 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -666,6 +666,7 @@ pub async fn git_receive_pack( &pinata_jwt, &repo_path_clone, &db_clone, + &std::collections::HashSet::new(), ) .await; diff --git a/crates/gitlawb-node/src/pinata.rs b/crates/gitlawb-node/src/pinata.rs index ee9d416..90bddad 100644 --- a/crates/gitlawb-node/src/pinata.rs +++ b/crates/gitlawb-node/src/pinata.rs @@ -7,6 +7,7 @@ //! no-op, so nodes without Pinata backing work fine. use anyhow::Result; +use std::collections::HashSet; /// Pin a single git object's raw bytes on Pinata (v3 API). 
/// @@ -76,6 +77,7 @@ pub async fn pin_new_objects( jwt: &str, repo_path: &std::path::Path, db: &crate::db::Db, + withheld: &HashSet<String>, ) -> Vec<(String, String)> { if jwt.is_empty() { return vec![]; @@ -92,6 +94,7 @@ pub async fn pin_new_objects( return vec![]; } }; + let object_list = crate::git::visibility_pack::replicable_objects(object_list, withheld); let mut pinned = Vec::new(); From e670ca3557b39ca785287c411d557cde06597ec6 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 10:12:52 -0500 Subject: [PATCH 22/58] feat(node): enforce visibility on push replication (IPFS/Pinata/gossip/Arweave) --- crates/gitlawb-node/src/api/repos.rs | 202 +++++++++++++++++---------- 1 file changed, 125 insertions(+), 77 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 1efde93..767d76a 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -608,8 +608,44 @@ pub async fn git_receive_pack( } } - // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty) - { + // Replication enforcement (Phase 2): decide once per push whether the public + // may read this repo at all and, if so, which blob OIDs must not leave the + // node. `withheld == None` means replicate nothing (private / mode A / + // undetermined): skip every pin so even commit and tree objects (which + // withheld_blob_oids never lists) stay local. `announce` gates the + // network-facing announcements. Fail closed: a private or undetermined repo + // never leaks.
+ let rules_opt = state.db.list_visibility_rules(&record.id).await.ok(); + let announce = match &rules_opt { + Some(rules) => { + visibility_check(rules, record.is_public, &record.owner_did, None, "/") + == Decision::Allow + } + None => false, + }; + let withheld: Option<std::collections::HashSet<String>> = if !announce { + None + } else { + match &rules_opt { + Some(rules) if rules.is_empty() => Some(std::collections::HashSet::new()), + Some(rules) => crate::git::visibility_pack::withheld_blob_oids( + &disk_path, + rules, + record.is_public, + &record.owner_did, + None, + ) + .map_err(|e| { + tracing::warn!(err = %e, "withheld_blob_oids failed; skipping replication for this push") + }) + .ok(), + None => None, + } + }; + + // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty). + // Skipped entirely when the public cannot read the repo (withheld == None). + if let Some(withheld_ipfs) = withheld.clone() { let ipfs_api = state.config.ipfs_api.clone(); let repo_path_clone = disk_path.clone(); let db_clone = state.db.clone(); @@ -618,7 +654,7 @@ pub async fn git_receive_pack( &ipfs_api, &repo_path_clone, &db_clone, - &std::collections::HashSet::new(), + &withheld_ipfs, ) .await; if !pinned.is_empty() { @@ -659,16 +695,22 @@ pub async fn git_receive_pack( let owner_did_for_arweave = record.owner_did.clone(); let self_public_url = state.config.public_url.clone(); let node_keypair = Arc::clone(&state.node_keypair); + let withheld_pinata = withheld; tokio::spawn(async move { - let pinned = crate::pinata::pin_new_objects( - &http_client, - &pinata_upload_url, - &pinata_jwt, - &repo_path_clone, - &db_clone, - &std::collections::HashSet::new(), - ) - .await; + let pinned = match &withheld_pinata { + Some(withheld) => { + crate::pinata::pin_new_objects( + &http_client, + &pinata_upload_url, + &pinata_jwt, + &repo_path_clone, + &db_clone, + withheld, + ) + .await + } + None => Vec::new(), + }; if !pinned.is_empty() { tracing::info!(count = pinned.len(), "pinned git objects to Pinata"); @@
-687,77 +729,82 @@ pub async fn git_receive_pack( .await; } - if let Some(p2p) = &p2p_handle { - p2p.publish_ref_update(crate::p2p::RefUpdateEvent { - node_did: node_did_str.clone(), - pusher_did: pusher_did_clone.clone(), - repo: repo_slug.clone(), - ref_name: ref_name.clone(), - old_sha: "".to_string(), - new_sha: new_sha.clone(), - timestamp: chrono::Utc::now().to_rfc3339(), - cert_id: None, - cid: cid.map(|s| s.to_string()), - }) - .await; + if announce { + if let Some(p2p) = &p2p_handle { + p2p.publish_ref_update(crate::p2p::RefUpdateEvent { + node_did: node_did_str.clone(), + pusher_did: pusher_did_clone.clone(), + repo: repo_slug.clone(), + ref_name: ref_name.clone(), + old_sha: "".to_string(), + new_sha: new_sha.clone(), + timestamp: chrono::Utc::now().to_rfc3339(), + cert_id: None, + cid: cid.map(|s| s.to_string()), + }) + .await; + } } } // HTTP peer notification — notify all known peers to pull from us. // This is the reliable fallback when Gossipsub p2p is not yet connected. - if let Ok(peers) = db_for_peers.list_peers().await { - for peer in peers { - if peer.http_url.is_empty() { - continue; - } - let peer_url = peer.http_url.trim_end_matches('/'); - if let Some(self_url) = self_public_url.as_deref() { - if peer_url == self_url.trim_end_matches('/') { + // Suppressed for repos the public cannot read. 
+ if announce { + if let Ok(peers) = db_for_peers.list_peers().await { + for peer in peers { + if peer.http_url.is_empty() { continue; } - } - let path = "/api/v1/sync/notify"; - let notify_url = format!("{peer_url}{path}"); - let body = serde_json::json!({ - "repo": repo_slug.clone(), - "ref_name": ref_updates_clone.first().map(|(r, _)| r).unwrap_or(&String::new()), - "new_sha": ref_updates_clone.first().map(|(_, s)| s).unwrap_or(&String::new()), - "node_did": node_did_str.clone(), - "pusher_did": pusher_did_clone.clone(), - "old_sha": "0000000000000000000000000000000000000000", - "timestamp": chrono::Utc::now().to_rfc3339(), - }); - let body_bytes = match serde_json::to_vec(&body) { - Ok(bytes) => bytes, - Err(e) => { - tracing::warn!(peer = %peer.did, err = %e, "failed to serialize peer sync notify"); - continue; - } - }; - let signed = gitlawb_core::http_sig::sign_request( - node_keypair.as_ref(), - "POST", - path, - &body_bytes, - ); - match http_client - .post(&notify_url) - .header("Content-Type", "application/json") - .header("Content-Digest", signed.content_digest) - .header("Signature-Input", signed.signature_input) - .header("Signature", signed.signature) - .body(body_bytes) - .send() - .await - { - Ok(r) if r.status().is_success() => { - tracing::info!(peer = %peer.did, repo = %repo_slug, "notified peer to sync") - } - Ok(r) => { - tracing::warn!(peer = %peer.did, status = %r.status(), "peer sync notify returned error") + let peer_url = peer.http_url.trim_end_matches('/'); + if let Some(self_url) = self_public_url.as_deref() { + if peer_url == self_url.trim_end_matches('/') { + continue; + } } - Err(e) => { - tracing::warn!(peer = %peer.did, err = %e, "failed to notify peer") + let path = "/api/v1/sync/notify"; + let notify_url = format!("{peer_url}{path}"); + let body = serde_json::json!({ + "repo": repo_slug.clone(), + "ref_name": ref_updates_clone.first().map(|(r, _)| r).unwrap_or(&String::new()), + "new_sha": ref_updates_clone.first().map(|(_, s)|
s).unwrap_or(&String::new()), + "node_did": node_did_str.clone(), + "pusher_did": pusher_did_clone.clone(), + "old_sha": "0000000000000000000000000000000000000000", + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + let body_bytes = match serde_json::to_vec(&body) { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(peer = %peer.did, err = %e, "failed to serialize peer sync notify"); + continue; + } + }; + let signed = gitlawb_core::http_sig::sign_request( + node_keypair.as_ref(), + "POST", + path, + &body_bytes, + ); + match http_client + .post(&notify_url) + .header("Content-Type", "application/json") + .header("Content-Digest", signed.content_digest) + .header("Signature-Input", signed.signature_input) + .header("Signature", signed.signature) + .body(body_bytes) + .send() + .await + { + Ok(r) if r.status().is_success() => { + tracing::info!(peer = %peer.did, repo = %repo_slug, "notified peer to sync") + } + Ok(r) => { + tracing::warn!(peer = %peer.did, status = %r.status(), "peer sync notify returned error") + } + Err(e) => { + tracing::warn!(peer = %peer.did, err = %e, "failed to notify peer") + } } } } @@ -781,8 +828,9 @@ pub async fn git_receive_pack( timestamp: now_ts.clone(), }); - // Arweave permanent anchoring — fire for each ref update - if !irys_url.is_empty() { + // Arweave permanent anchoring — fire for each ref update. + // Suppressed for repos the public cannot read (public permanent ledger).
+ if announce && !irys_url.is_empty() { for (ref_name, new_sha) in &ref_updates_clone { let cid = cid_map.get(new_sha).cloned(); let anchor = crate::arweave::RefAnchor { From 949d131c5dfa7bec40687e31363f7b94bb383dc7 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 10:29:54 -0500 Subject: [PATCH 23/58] test(node): announce gate matches anonymous repo readability --- crates/gitlawb-node/src/visibility.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index b246dbf..1107de7 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -242,4 +242,24 @@ mod tests { Decision::Allow ); } + + // Mirrors the gossip-announce gate in git_receive_pack: announce iff an + // anonymous caller can read "/". + #[test] + fn announce_gate_matches_public_readability() { + let announce = |rules: &[VisibilityRule], is_public: bool| { + visibility_check(rules, is_public, OWNER, None, "/") == Decision::Allow + }; + // Public repo, no rules → announce. + assert!(announce(&[], true)); + // Legacy private repo (is_public false, no rules) → silent. + assert!(!announce(&[], false)); + // Mode A whole-repo rule with no public readers → silent. + assert!(!announce(&[rule("/", VisibilityMode::A, &[])], true)); + // Mode B public repo with a private subtree → still announce. + assert!(announce( + &[rule("/secret/**", VisibilityMode::B, &[])], + true + )); + } } From 083293d507d56bbd58b041a20d0037df7ccfbc2f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 8 Jun 2026 17:51:01 -0500 Subject: [PATCH 24/58] fix(node): run withheld_blob_oids off the async worker on push The receive-pack replication chokepoint called withheld_blob_oids directly on the tokio worker, where its blocking git ls-tree walk can stall the runtime for repos with many refs. 
Wrap it in spawn_blocking to match the upload-pack serve path. --- crates/gitlawb-node/src/api/repos.rs | 35 +++++++++++++++++++--------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 767d76a..884d7ca 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -628,17 +628,30 @@ pub async fn git_receive_pack( } else { match &rules_opt { Some(rules) if rules.is_empty() => Some(std::collections::HashSet::new()), - Some(rules) => crate::git::visibility_pack::withheld_blob_oids( - &disk_path, - rules, - record.is_public, - &record.owner_did, - None, - ) - .map_err(|e| { - tracing::warn!(err = %e, "withheld_blob_oids failed; skipping replication for this push") - }) - .ok(), + // withheld_blob_oids walks every ref with blocking `git ls-tree`; + // keep that off the async worker thread. + Some(rules) => { + let path = disk_path.clone(); + let rules = rules.clone(); + let owner_did = record.owner_did.clone(); + let is_public = record.is_public; + tokio::task::spawn_blocking(move || { + crate::git::visibility_pack::withheld_blob_oids( + &path, &rules, is_public, &owner_did, None, + ) + }) + .await + .map_err(|e| { + tracing::warn!(err = %e, "withheld_blob_oids task panicked; skipping replication for this push") + }) + .ok() + .and_then(|r| { + r.map_err(|e| { + tracing::warn!(err = %e, "withheld_blob_oids failed; skipping replication for this push") + }) + .ok() + }) + } None => None, } }; From 95a223e3bbc520dcb74ef46df5e18f981b16a623 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:47:39 -0500 Subject: [PATCH 25/58] test(gl): guard three-level nested visibility in sparse clone Add a regression test for deny /secret, allow /secret/public, deny /secret/public/admin and clarify the sparse-checkout comment. 
git does not re-traverse an explicitly excluded directory, so emitting all excludes before re-includes keeps the deepest deny in force; the broader parent re-include does not resurrect it. --- crates/gl/src/clone.rs | 48 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 3f9af70..5b9403f 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -108,9 +108,12 @@ pub fn setup_partial_clone( dest_str, ])?; git(dest, &["sparse-checkout", "init", "--no-cone"])?; - // Non-cone sparse-checkout, gitignore-style and order-sensitive: include - // everything, exclude the withheld globs, then re-include any allowed globs - // nested under an excluded one (later patterns win). + // Non-cone sparse-checkout, gitignore-style: include everything, exclude the + // withheld globs, then re-include any allowed globs nested under an excluded + // one. Emitting all excludes before the re-includes is safe even for deeper + // re-denials (deny /secret, allow /secret/public, deny /secret/public/admin): + // git does not re-traverse an explicitly excluded directory, so a broader + // parent re-include never resurrects a more specific excluded subtree. let mut spec = String::from("/*\n"); for g in withheld_globs { for pat in sparse_patterns(g) { @@ -339,6 +342,45 @@ mod tests { ); } + #[test] + fn three_level_alternating_nesting_respects_specificity() { + // deny /secret, allow /secret/public, deny /secret/public/admin. + // The deepest deny must win even though a shallower allow re-includes + // its parent: order patterns by depth, not all-excludes-then-includes. 
+ let (td, url) = bare_remote(&[ + ("public/a.txt", b"pub\n"), + ("secret/private/p.txt", b"PRIV\n"), + ("secret/public/s.txt", b"SHARED\n"), + ("secret/public/admin/k.txt", b"ADMIN\n"), + ]); + let dest = td.path().join("dest"); + setup_partial_clone( + &dest, + &url, + &[ + "/secret/**".to_string(), + "/secret/public/admin/**".to_string(), + ], + &["/secret/public/**".to_string()], + None, + ) + .unwrap(); + + assert!(dest.join("public/a.txt").exists(), "public present"); + assert!( + dest.join("secret/public/s.txt").exists(), + "allowed middle path must be re-included" + ); + assert!( + !dest.join("secret/private/p.txt").exists(), + "denied sibling must stay excluded" + ); + assert!( + !dest.join("secret/public/admin/k.txt").exists(), + "deepest denied path must stay excluded despite the shallower re-include" + ); + } + #[test] fn exact_path_glob_is_excluded() { // A wildcard-free glob must exclude the exact file, not just a subtree. From 720f8ef6878a651cb0ce906f6e814c69ed60648f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:56:10 -0500 Subject: [PATCH 26/58] fix(gl): robust default-branch detection and strict withheld-paths schema Read the default branch from the local origin/HEAD symref clone already set, instead of parsing the localized, network-dependent output of git remote show origin. Deserialize the withheld-paths body into a typed struct so a missing or mistyped withheld/reinclude field is a hard error rather than silently becoming an empty list, which would mask a server regression behind a later clone failure. 
--- crates/gl/src/clone.rs | 66 ++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 5b9403f..b5fe39d 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -8,7 +8,7 @@ use anyhow::{bail, Context, Result}; use clap::Args; -use serde_json::Value; +use serde::Deserialize; use std::path::Path; use std::process::Command; @@ -134,17 +134,25 @@ pub fn setup_partial_clone( match branch { Some(b) => git(dest, &["checkout", "-q", b])?, None => { + // Read the default branch from the local `origin/HEAD` symref that + // clone just set, instead of parsing `git remote show origin`, whose + // "HEAD branch:" line is localized and needs a network round-trip. let out = Command::new("git") - .args(["remote", "show", "origin"]) + .args(["symbolic-ref", "--short", "refs/remotes/origin/HEAD"]) .current_dir(dest) .output()?; - let text = String::from_utf8_lossy(&out.stdout); - let head = text - .lines() - .find_map(|l| l.trim().strip_prefix("HEAD branch: ")) - .map(|s| s.to_string()) - .context("could not determine default branch")?; - git(dest, &["checkout", "-q", &head])?; + if !out.status.success() { + bail!( + "could not determine default branch: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + let symref = String::from_utf8_lossy(&out.stdout); + let head = symref + .trim() + .strip_prefix("origin/") + .context("unexpected origin/HEAD format")?; + git(dest, &["checkout", "-q", head])?; } } Ok(()) @@ -192,21 +200,21 @@ async fn fetch_withheld(node: &str, owner: &str, name: &str) -> Result<(Vec bail!("withheld-paths lookup failed: {}", r.status()), Err(err) => return Err(err).context("fetching withheld paths"), }; - let body: Value = resp + let body: WithheldPathsResponse = resp .json() .await .context("parsing withheld-paths response")?; - let globs = |field: &str| -> Vec { - body.get(field) - .and_then(|w| w.as_array()) - .map(|a| { - a.iter() - 
.filter_map(|x| x.as_str().map(String::from)) - .collect() - }) - .unwrap_or_default() - }; - Ok((globs("withheld"), globs("reinclude"))) + Ok((body.withheld, body.reinclude)) +} + +/// The node's `/withheld-paths` 200 body. Both fields are always emitted as JSON +/// arrays; deserializing into this struct (rather than poking at a `Value`) makes +/// a missing or mistyped field a hard error instead of silently becoming `[]`, +/// which would mask a server regression behind a confusing later clone failure. +#[derive(Deserialize)] +struct WithheldPathsResponse { + withheld: Vec<String>, + reinclude: Vec<String>, } pub async fn run(args: CloneArgs) -> Result<()> { @@ -404,6 +412,22 @@ mod tests { ); } + #[test] + fn withheld_response_requires_both_fields() { + let ok: WithheldPathsResponse = + serde_json::from_str(r#"{"withheld":["/secret/**"],"reinclude":[]}"#).unwrap(); + assert_eq!(ok.withheld, vec!["/secret/**".to_string()]); + assert!(ok.reinclude.is_empty()); + + // A missing field is a schema mismatch: it must error, not default to []. + assert!(serde_json::from_str::<WithheldPathsResponse>(r#"{"withheld":[]}"#).is_err()); + // A wrong-typed field must error too.
+ assert!(serde_json::from_str::<WithheldPathsResponse>( + r#"{"withheld":"nope","reinclude":[]}"# + ) + .is_err()); + } + #[test] fn parse_repo_accepts_url_and_bare() { let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); From 53902435d77295b88ecc1ec7922854c2d7e5ad59 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:37:30 -0500 Subject: [PATCH 27/58] feat(node): classify mirror mode from origin withheld-paths --- crates/gitlawb-node/src/sync.rs | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index cdcfd3e..348568a 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -18,6 +18,32 @@ use tracing::{info, warn}; use crate::config::Config; use crate::db::Db; +/// How to mirror a repo, decided from the origin's `withheld-paths` answer. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum MirrorMode { + /// No withheld content: a normal full mirror. + Plain, + /// Withheld content present: a promisor mirror that tolerates the blobs the + /// origin omits for an anonymous caller. + Promisor, +} + +/// Decide the mirror mode from the origin's `withheld-paths` response. +/// +/// `Some(non-empty)` → the repo has a private subtree → `Promisor`. +/// `Some(empty)` → fully public → `Plain`. +/// `None` → the lookup 404'd or failed. Attempt a `Plain` mirror; a +/// mode-A repo also 404s the git read endpoint, so the clone +/// fails and nothing is mirrored (fail-closed at the git +/// layer), while a public repo on a peer that predates the +/// `withheld-paths` route still gets mirrored. +fn classify_mirror(withheld: Option<Vec<String>>) -> MirrorMode { + match withheld { + Some(globs) if !globs.is_empty() => MirrorMode::Promisor, + _ => MirrorMode::Plain, + } +} + /// Start the background sync worker.
Returns immediately; the worker runs /// as a detached tokio task that exits cleanly when `shutdown_rx` flips /// to `true`. @@ -174,3 +200,28 @@ async fn fetch_repo(local_path: &Path, remote_url: &str) -> anyhow::Result<()> { } Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn classify_promisor_when_withheld_nonempty() { + let mode = classify_mirror(Some(vec!["/secret/**".to_string()])); + assert!(matches!(mode, MirrorMode::Promisor)); + } + + #[test] + fn classify_plain_when_withheld_empty() { + let mode = classify_mirror(Some(vec![])); + assert!(matches!(mode, MirrorMode::Plain)); + } + + #[test] + fn classify_plain_when_lookup_failed() { + // None == 404 / network error / parse failure: attempt a plain mirror + // and let the git read endpoint fail-close a mode-A repo. + let mode = classify_mirror(None); + assert!(matches!(mode, MirrorMode::Plain)); + } +} From 469a1a43cb7671daa068d33b6d9e072600ade0b4 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:40:11 -0500 Subject: [PATCH 28/58] feat(node): promisor-aware mirror clone for withheld repos --- crates/gitlawb-node/src/sync.rs | 105 +++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index 348568a..c93c084 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -132,7 +132,7 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { let result = if local_path.exists() { fetch_repo(&local_path, &remote_url).await } else { - clone_repo(&remote_url, &local_path).await + clone_repo(&remote_url, &local_path, MirrorMode::Plain).await }; match result { @@ -160,14 +160,20 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { } /// Mirror-clone a repo from a remote URL into a local bare repo. 
-async fn clone_repo(remote_url: &str, local_path: &Path) -> anyhow::Result<()> { +/// `Promisor` mode adds `--filter=blob:limit=10g`, which marks the repo a git +/// promisor (so a pack with origin-omitted withheld blobs is accepted) while +/// the huge size limit means every blob the origin *does* send is kept. +async fn clone_repo(remote_url: &str, local_path: &Path, mode: MirrorMode) -> anyhow::Result<()> { + let local_str = local_path.to_str().unwrap_or("."); + let mut args = vec!["clone", "--mirror"]; + if mode == MirrorMode::Promisor { + args.push("--filter=blob:limit=10g"); + } + args.push(remote_url); + args.push(local_str); + let out = tokio::process::Command::new("git") - .args([ - "clone", - "--mirror", - remote_url, - local_path.to_str().unwrap_or("."), - ]) + .args(&args) .output() .await .map_err(|e| anyhow::anyhow!("git clone failed to spawn: {e}"))?; @@ -204,6 +210,8 @@ async fn fetch_repo(local_path: &Path, remote_url: &str) -> anyhow::Result<()> { #[cfg(test)] mod tests { use super::*; + use std::process::Command; + use tempfile::TempDir; #[test] fn classify_promisor_when_withheld_nonempty() { @@ -224,4 +232,85 @@ mod tests { let mode = classify_mirror(None); assert!(matches!(mode, MirrorMode::Plain)); } + + fn g(args: &[&str], dir: &Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + /// Build a bare remote containing `files`, committed on one branch. + /// Returns (tempdir, file:// url). file:// makes git honor --filter. 
+ fn bare_remote(files: &[(&str, &[u8])]) -> (TempDir, String) { + let td = TempDir::new().unwrap(); + let origin = td.path().join("origin"); + let bare = td.path().join("bare.git"); + for (path, contents) in files { + let full = origin.join(path); + std::fs::create_dir_all(full.parent().unwrap()).unwrap(); + std::fs::write(full, contents).unwrap(); + } + g(&["init", "-q"], &origin); + g(&["config", "user.email", "t@t"], &origin); + g(&["config", "user.name", "t"], &origin); + g(&["add", "."], &origin); + g(&["commit", "-qm", "init"], &origin); + g( + &["clone", "-q", "--bare", origin.to_str().unwrap(), bare.to_str().unwrap()], + td.path(), + ); + let url = format!("file://{}", bare.display()); + (td, url) + } + + fn git_config(repo: &Path, key: &str) -> String { + let out = Command::new("git") + .args(["-C", repo.to_str().unwrap(), "config", "--get", key]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + } + + fn object_count(repo: &Path) -> usize { + let out = Command::new("git") + .args([ + "-C", + repo.to_str().unwrap(), + "cat-file", + "--batch-all-objects", + "--batch-check=%(objectname)", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout) + .lines() + .filter(|l| !l.trim().is_empty()) + .count() + } + + #[tokio::test] + async fn promisor_clone_marks_promisor_and_keeps_objects() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); + // No withholding on a plain bare origin, so every object is present: + // 1 commit + 1 root tree + 2 subtrees + 2 blobs = 6. 
+ assert_eq!(object_count(&dest), 6); + } + + #[tokio::test] + async fn plain_clone_is_not_promisor() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Plain).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); + assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); + } } From 97c82d55350ed515446cfaf3bef51079b993a179 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:43:15 -0500 Subject: [PATCH 29/58] feat(node): promisor-aware mirror fetch via origin remote --- crates/gitlawb-node/src/sync.rs | 73 ++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index c93c084..b043149 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -130,7 +130,7 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { let remote_url = format!("{}/{}", origin_url, item.repo); let result = if local_path.exists() { - fetch_repo(&local_path, &remote_url).await + fetch_repo(&local_path, &remote_url, MirrorMode::Plain).await } else { clone_repo(&remote_url, &local_path, MirrorMode::Plain).await }; @@ -159,6 +159,20 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { } } +/// Run a git subprocess, returning an error with stderr on non-zero exit. +async fn git_run(args: &[&str]) -> anyhow::Result<()> { + let out = tokio::process::Command::new("git") + .args(args) + .output() + .await + .map_err(|e| anyhow::anyhow!("git failed to spawn: {e}"))?; + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr); + return Err(anyhow::anyhow!("git {args:?} failed: {stderr}")); + } + Ok(()) +} + /// Mirror-clone a repo from a remote URL into a local bare repo. 
/// `Promisor` mode adds `--filter=blob:limit=10g`, which marks the repo a git /// promisor (so a pack with origin-omitted withheld blobs is accepted) while @@ -185,26 +199,26 @@ async fn clone_repo(remote_url: &str, local_path: &Path, mode: MirrorMode) -> an Ok(()) } -/// Fetch all refs from the remote into an existing mirror repo. -async fn fetch_repo(local_path: &Path, remote_url: &str) -> anyhow::Result<()> { - let out = tokio::process::Command::new("git") - .args([ - "-C", - local_path.to_str().unwrap_or("."), - "fetch", - "--prune", - remote_url, - "+refs/*:refs/*", - ]) - .output() - .await - .map_err(|e| anyhow::anyhow!("git fetch failed to spawn: {e}"))?; +/// Fetch all refs from the remote into an existing mirror repo. Refreshes the +/// stored `origin` URL (the peer's URL may have changed), applies promisor +/// config when `Promisor` (covers a repo that became mode-B after a plain +/// initial mirror), and fetches via the `origin` remote so any stored promisor +/// settings are honored. 
+async fn fetch_repo(local_path: &Path, remote_url: &str, mode: MirrorMode) -> anyhow::Result<()> { + let local_str = local_path.to_str().unwrap_or("."); - if !out.status.success() { - let stderr = String::from_utf8_lossy(&out.stderr); - return Err(anyhow::anyhow!("git fetch failed: {stderr}")); + git_run(&["-C", local_str, "remote", "set-url", "origin", remote_url]).await?; + + if mode == MirrorMode::Promisor { + git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; + git_run(&[ + "-C", local_str, + "config", "remote.origin.partialclonefilter", "blob:limit=10g", + ]) + .await?; } - Ok(()) + + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await } #[cfg(test)] @@ -313,4 +327,25 @@ mod tests { assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); assert_eq!(git_config(&dest, "remote.origin.mirror"), "true"); } + + #[tokio::test] + async fn promisor_fetch_updates_existing_mirror() { + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + let before = object_count(&dest); + + // Add a second commit to the origin working tree and push to the bare + // (the working repo has no named remote, so push via the file:// URL). 
+ let origin = td.path().join("origin"); + std::fs::write(origin.join("public/c.txt"), b"more\n").unwrap(); + g(&["add", "."], &origin); + g(&["commit", "-qm", "second"], &origin); + g(&["push", "-q", &url, "HEAD"], &origin); + + fetch_repo(&dest, &url, MirrorMode::Promisor).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + assert!(object_count(&dest) > before, "fetch pulled the new commit"); + } } From 1e8ae3c2b5c235fae72bb7464cfcd0673a9a8ea3 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 09:46:53 -0500 Subject: [PATCH 30/58] feat(node): classify and mirror peers per withheld-paths --- crates/gitlawb-node/src/sync.rs | 36 +++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index b043149..54dcb65 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -63,12 +63,13 @@ async fn run( shutdown_rx: &mut tokio::sync::watch::Receiver, ) { let machine_id = std::env::var("FLY_MACHINE_ID").ok(); + let client = reqwest::Client::new(); info!("sync worker started (auto_sync=true)"); let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); loop { tokio::select! 
{ _ = interval.tick() => { - process_batch(&db, &config, machine_id.as_deref()).await; + process_batch(&db, &config, machine_id.as_deref(), &client).await; } _ = shutdown_rx.changed() => { if *shutdown_rx.borrow() { @@ -80,7 +81,7 @@ async fn run( } } -async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { +async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>, client: &reqwest::Client) { let items = match db.dequeue_pending_syncs(10).await { Ok(v) => v, Err(e) => { @@ -129,10 +130,13 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { // (no .git suffix — the server routes don't include it) let remote_url = format!("{}/{}", origin_url, item.repo); + let withheld = fetch_withheld(client, &origin_url, owner_short, repo_name).await; + let mode = classify_mirror(withheld); + let result = if local_path.exists() { - fetch_repo(&local_path, &remote_url, MirrorMode::Plain).await + fetch_repo(&local_path, &remote_url, mode).await } else { - clone_repo(&remote_url, &local_path, MirrorMode::Plain).await + clone_repo(&remote_url, &local_path, mode).await }; match result { @@ -159,6 +163,30 @@ async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>) { } } +/// Query the origin's anonymous `withheld-paths` endpoint. Returns the withheld +/// glob list on a 2xx, or `None` on any non-success / network / parse error +/// (treated as "unknown" by `classify_mirror`). +async fn fetch_withheld( + client: &reqwest::Client, + origin_url: &str, + owner: &str, + repo: &str, +) -> Option> { + let url = format!("{origin_url}/api/v1/repos/{owner}/{repo}/withheld-paths"); + let resp = client.get(&url).send().await.ok()?; + if !resp.status().is_success() { + return None; + } + let body: serde_json::Value = resp.json().await.ok()?; + let globs = body + .get("withheld")? + .as_array()? 
+ .iter() + .filter_map(|v| v.as_str().map(str::to_string)) + .collect(); + Some(globs) +} + /// Run a git subprocess, returning an error with stderr on non-zero exit. async fn git_run(args: &[&str]) -> anyhow::Result<()> { let out = tokio::process::Command::new("git") From f1e2207301f4b4136a5129cda5cfc33bbae2232c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 10:52:27 -0500 Subject: [PATCH 31/58] style: cargo fmt --- crates/gitlawb-node/src/sync.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index 54dcb65..718cd69 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -81,7 +81,12 @@ async fn run( } } -async fn process_batch(db: &Db, config: &Config, machine_id: Option<&str>, client: &reqwest::Client) { +async fn process_batch( + db: &Db, + config: &Config, + machine_id: Option<&str>, + client: &reqwest::Client, +) { let items = match db.dequeue_pending_syncs(10).await { Ok(v) => v, Err(e) => { @@ -240,8 +245,11 @@ async fn fetch_repo(local_path: &Path, remote_url: &str, mode: MirrorMode) -> an if mode == MirrorMode::Promisor { git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; git_run(&[ - "-C", local_str, - "config", "remote.origin.partialclonefilter", "blob:limit=10g", + "-C", + local_str, + "config", + "remote.origin.partialclonefilter", + "blob:limit=10g", ]) .await?; } @@ -301,7 +309,13 @@ mod tests { g(&["add", "."], &origin); g(&["commit", "-qm", "init"], &origin); g( - &["clone", "-q", "--bare", origin.to_str().unwrap(), bare.to_str().unwrap()], + &[ + "clone", + "-q", + "--bare", + origin.to_str().unwrap(), + bare.to_str().unwrap(), + ], td.path(), ); let url = format!("file://{}", bare.display()); From 25390bfed9240e90757cb917b3b74144a8b885a8 Mon Sep 17 00:00:00 2001 From: beardthelion 
<56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 11:38:38 -0500 Subject: [PATCH 32/58] fix(node): bound sync withheld lookup and rehydrate on promisor->plain Give the sync worker's HTTP client a 30s timeout so a stalled peer withheld-paths lookup cannot hang the worker loop. When a repo that was mirrored as a promisor (mode B) becomes fully public, fetch_repo now clears remote.origin.promisor and partialclonefilter and refetches, so the once-withheld blobs are backfilled instead of the mirror staying permanently partial. --- crates/gitlawb-node/src/sync.rs | 109 +++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 17 deletions(-) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index 718cd69..df41470 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -63,7 +63,11 @@ async fn run( shutdown_rx: &mut tokio::sync::watch::Receiver, ) { let machine_id = std::env::var("FLY_MACHINE_ID").ok(); - let client = reqwest::Client::new(); + // Bound each withheld-paths lookup so a stalled peer cannot hang the worker. + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); info!("sync worker started (auto_sync=true)"); let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); loop { @@ -206,6 +210,29 @@ async fn git_run(args: &[&str]) -> anyhow::Result<()> { Ok(()) } +/// Run a git subprocess, ignoring a non-zero exit. Used for idempotent +/// `config --unset`, which exits non-zero when the key is already absent. +async fn git_run_lenient(args: &[&str]) { + let _ = tokio::process::Command::new("git") + .args(args) + .output() + .await; +} + +/// Read a single git config value; `None` if unset or on error. 
+async fn git_config_get(repo: &str, key: &str) -> Option { + let out = tokio::process::Command::new("git") + .args(["-C", repo, "config", "--get", key]) + .output() + .await + .ok()?; + if !out.status.success() { + return None; + } + let value = String::from_utf8_lossy(&out.stdout).trim().to_string(); + (!value.is_empty()).then_some(value) +} + /// Mirror-clone a repo from a remote URL into a local bare repo. /// `Promisor` mode adds `--filter=blob:limit=10g`, which marks the repo a git /// promisor (so a pack with origin-omitted withheld blobs is accepted) while @@ -233,28 +260,60 @@ async fn clone_repo(remote_url: &str, local_path: &Path, mode: MirrorMode) -> an } /// Fetch all refs from the remote into an existing mirror repo. Refreshes the -/// stored `origin` URL (the peer's URL may have changed), applies promisor -/// config when `Promisor` (covers a repo that became mode-B after a plain -/// initial mirror), and fetches via the `origin` remote so any stored promisor -/// settings are honored. +/// stored `origin` URL (the peer's URL may have changed) and fetches via the +/// `origin` remote so any stored promisor settings are honored. +/// +/// `Promisor` applies the promisor config first (covers a repo that became +/// mode-B after a plain initial mirror). `Plain` on a mirror that was previously +/// a promisor (the repo went private -> public) clears the partial-clone config +/// and `--refetch`es, so the once-withheld, now-public blobs are backfilled +/// rather than left permanently missing. 
async fn fetch_repo(local_path: &Path, remote_url: &str, mode: MirrorMode) -> anyhow::Result<()> { let local_str = local_path.to_str().unwrap_or("."); git_run(&["-C", local_str, "remote", "set-url", "origin", remote_url]).await?; - if mode == MirrorMode::Promisor { - git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; - git_run(&[ - "-C", - local_str, - "config", - "remote.origin.partialclonefilter", - "blob:limit=10g", - ]) - .await?; + match mode { + MirrorMode::Promisor => { + git_run(&["-C", local_str, "config", "remote.origin.promisor", "true"]).await?; + git_run(&[ + "-C", + local_str, + "config", + "remote.origin.partialclonefilter", + "blob:limit=10g", + ]) + .await?; + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await + } + MirrorMode::Plain => { + let was_promisor = git_config_get(local_str, "remote.origin.promisor") + .await + .as_deref() + == Some("true"); + if was_promisor { + git_run_lenient(&[ + "-C", + local_str, + "config", + "--unset", + "remote.origin.promisor", + ]) + .await; + git_run_lenient(&[ + "-C", + local_str, + "config", + "--unset", + "remote.origin.partialclonefilter", + ]) + .await; + git_run(&["-C", local_str, "fetch", "--refetch", "--prune", "origin"]).await + } else { + git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await + } + } } - - git_run(&["-C", local_str, "fetch", "--prune", "origin"]).await } #[cfg(test)] @@ -390,4 +449,20 @@ mod tests { assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); assert!(object_count(&dest) > before, "fetch pulled the new commit"); } + + #[tokio::test] + async fn plain_fetch_clears_promisor_config_on_transition() { + // Repo started mode-B (promisor mirror), then went fully public, so the + // next sync classifies Plain. fetch_repo must drop the partial-clone + // config and refetch instead of leaving the mirror a promisor forever. 
+ let (td, url) = bare_remote(&[("public/a.txt", b"pub\n")]); + let dest = td.path().join("mirror.git"); + clone_repo(&url, &dest, MirrorMode::Promisor).await.unwrap(); + assert_eq!(git_config(&dest, "remote.origin.promisor"), "true"); + + fetch_repo(&dest, &url, MirrorMode::Plain).await.unwrap(); + + assert_eq!(git_config(&dest, "remote.origin.promisor"), ""); + assert_eq!(git_config(&dest, "remote.origin.partialclonefilter"), ""); + } } From 370abae526191584de9be48af5f28ea740a23dca Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 13:41:12 -0500 Subject: [PATCH 33/58] feat(core): ed25519 to x25519 key conversion for envelope crypto --- Cargo.lock | 123 ++++++++++++++++++++++++++++ crates/gitlawb-core/Cargo.toml | 3 + crates/gitlawb-core/src/encrypt.rs | 47 +++++++++++ crates/gitlawb-core/src/identity.rs | 6 ++ crates/gitlawb-core/src/lib.rs | 1 + 5 files changed, 180 insertions(+) create mode 100644 crates/gitlawb-core/src/encrypt.rs diff --git a/Cargo.lock b/Cargo.lock index fc8e18a..c2a4595 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,16 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" +[[package]] +name = "aead" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" +dependencies = [ + "crypto-common", + "generic-array", +] + [[package]] name = "ahash" version = "0.8.12" @@ -1975,6 +1985,30 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818" +dependencies = [ + 
"cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "chacha20poly1305" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35" +dependencies = [ + "aead", + "chacha20", + "cipher", + "poly1305", + "zeroize", +] + [[package]] name = "chrono" version = "0.4.44" @@ -2001,6 +2035,17 @@ dependencies = [ "unsigned-varint 0.8.0", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", + "zeroize", +] + [[package]] name = "clap" version = "4.5.60" @@ -2283,9 +2328,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", + "rand_core 0.6.4", "typenum", ] +[[package]] +name = "crypto_box" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16182b4f39a82ec8a6851155cc4c0cda3065bb1db33651726a29e1951de0f009" +dependencies = [ + "aead", + "crypto_secretbox", + "curve25519-dalek", + "salsa20", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto_secretbox" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d6cf87adf719ddf43a805e92c6870a531aedda35ff640442cbaf8674e141e1" +dependencies = [ + "aead", + "cipher", + "generic-array", + "poly1305", + "salsa20", + "subtle", + "zeroize", +] + [[package]] name = "curve25519-dalek" version = "4.1.3" @@ -3219,8 +3294,11 @@ version = "0.3.9" dependencies = [ "anyhow", "base64", + "chacha20poly1305", "chrono", "cid", + "crypto_box", + "curve25519-dalek", "ed25519-dalek", "hex", "multibase", @@ -3836,6 +3914,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "inout" +version = "0.1.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -4678,6 +4765,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "opaque-debug" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" + [[package]] name = "openssl-probe" version = "0.2.1" @@ -4927,6 +5020,17 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "poly1305" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf" +dependencies = [ + "cpufeatures", + "opaque-debug", + "universal-hash", +] + [[package]] name = "potential_utf" version = "0.1.4" @@ -5642,6 +5746,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6518fc26bced4d53678a22d6e423e9d8716377def84545fe328236e3af070e7f" +[[package]] +name = "salsa20" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213" +dependencies = [ + "cipher", +] + [[package]] name = "schannel" version = "0.1.29" @@ -6884,6 +6997,16 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "universal-hash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" +dependencies = [ + "crypto-common", + "subtle", +] + [[package]] name = "unsigned-varint" 
version = "0.7.2" diff --git a/crates/gitlawb-core/Cargo.toml b/crates/gitlawb-core/Cargo.toml index 486a5aa..f479e1a 100644 --- a/crates/gitlawb-core/Cargo.toml +++ b/crates/gitlawb-core/Cargo.toml @@ -23,6 +23,9 @@ chrono = { workspace = true } uuid = { workspace = true } zeroize = { version = "1", features = ["derive"] } pkcs8 = { version = "0.10", features = ["pem", "std"] } +curve25519-dalek = "4" +crypto_box = { version = "0.9", features = ["std"] } +chacha20poly1305 = "0.10" [dev-dependencies] tokio = { workspace = true } diff --git a/crates/gitlawb-core/src/encrypt.rs b/crates/gitlawb-core/src/encrypt.rs new file mode 100644 index 0000000..c75add4 --- /dev/null +++ b/crates/gitlawb-core/src/encrypt.rs @@ -0,0 +1,47 @@ +//! Envelope encryption for withheld blobs (Option B). A random content key +//! encrypts the blob (XChaCha20-Poly1305); the content key is wrapped to each +//! recipient via an X25519 box keyed from their Ed25519 `did:key`. The node +//! seals with public keys only; readers open with their own private key. + +use anyhow::{Context, Result}; +use ed25519_dalek::VerifyingKey; + +/// X25519 public key (Montgomery u) for an Ed25519 verifying key. +fn x25519_public(vk: &VerifyingKey) -> Result<[u8; 32]> { + use curve25519_dalek::edwards::CompressedEdwardsY; + let edwards = CompressedEdwardsY::from_slice(vk.as_bytes()) + .ok() + .and_then(|c| c.decompress()) + .context("verifying key is not a valid edwards point")?; + Ok(edwards.to_montgomery().to_bytes()) +} + +/// X25519 secret scalar for an Ed25519 seed (SHA-512 of seed, lower 32, clamped). 
+fn x25519_secret_from_seed(seed: &[u8; 32]) -> [u8; 32] { + use sha2::{Digest, Sha512}; + let h = Sha512::digest(seed); + let mut s = [0u8; 32]; + s.copy_from_slice(&h[..32]); + s[0] &= 248; + s[31] &= 127; + s[31] |= 64; + s +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::identity::Keypair; + + #[test] + fn ed25519_to_x25519_keypair_agrees() { + // The X25519 public derived from the Ed25519 public must equal the + // X25519 public of the X25519 secret derived from the same seed. + let kp = Keypair::generate(); + let seed = kp.seed_bytes(); + let xpub_from_public = x25519_public(&kp.verifying_key()).unwrap(); + let xsec = x25519_secret_from_seed(&seed); + let xpub_from_secret = crypto_box::SecretKey::from(xsec).public_key().to_bytes(); + assert_eq!(xpub_from_public, xpub_from_secret); + } +} diff --git a/crates/gitlawb-core/src/identity.rs b/crates/gitlawb-core/src/identity.rs index 96d50b9..9d3fea1 100644 --- a/crates/gitlawb-core/src/identity.rs +++ b/crates/gitlawb-core/src/identity.rs @@ -52,6 +52,12 @@ impl Keypair { URL_SAFE_NO_PAD.encode(sig.to_bytes()) } + /// The raw 32-byte Ed25519 seed. Used to derive the X25519 secret for + /// envelope decryption (see `crate::encrypt`). + pub fn seed_bytes(&self) -> [u8; 32] { + self.signing_key.to_bytes() + } + /// Export the signing key as raw 32-byte seed (wrapped in Zeroizing). 
pub fn to_seed(&self) -> Zeroizing<[u8; 32]> { Zeroizing::new(self.signing_key.to_bytes()) diff --git a/crates/gitlawb-core/src/lib.rs b/crates/gitlawb-core/src/lib.rs index a608be1..a9e91f6 100644 --- a/crates/gitlawb-core/src/lib.rs +++ b/crates/gitlawb-core/src/lib.rs @@ -1,6 +1,7 @@ pub mod cert; pub mod cid; pub mod did; +pub mod encrypt; pub mod error; pub mod http_sig; pub mod identity; From e6378f5391f5f4d4fafa6b6be89d991a1ca57e8c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 13:44:11 -0500 Subject: [PATCH 34/58] feat(core): seal/open per-blob encryption envelopes --- Cargo.lock | 2 + crates/gitlawb-core/Cargo.toml | 2 +- crates/gitlawb-core/src/encrypt.rs | 147 +++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index c2a4595..02679ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2339,6 +2339,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16182b4f39a82ec8a6851155cc4c0cda3065bb1db33651726a29e1951de0f009" dependencies = [ "aead", + "chacha20", "crypto_secretbox", "curve25519-dalek", "salsa20", @@ -2353,6 +2354,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d6cf87adf719ddf43a805e92c6870a531aedda35ff640442cbaf8674e141e1" dependencies = [ "aead", + "chacha20", "cipher", "generic-array", "poly1305", diff --git a/crates/gitlawb-core/Cargo.toml b/crates/gitlawb-core/Cargo.toml index f479e1a..4468d0c 100644 --- a/crates/gitlawb-core/Cargo.toml +++ b/crates/gitlawb-core/Cargo.toml @@ -24,7 +24,7 @@ uuid = { workspace = true } zeroize = { version = "1", features = ["derive"] } pkcs8 = { version = "0.10", features = ["pem", "std"] } curve25519-dalek = "4" -crypto_box = { version = "0.9", features = ["std"] } +crypto_box = { version = "0.9", features = ["std", "chacha20"] } chacha20poly1305 = "0.10" [dev-dependencies] diff --git a/crates/gitlawb-core/src/encrypt.rs 
b/crates/gitlawb-core/src/encrypt.rs index c75add4..aad6913 100644 --- a/crates/gitlawb-core/src/encrypt.rs +++ b/crates/gitlawb-core/src/encrypt.rs @@ -3,6 +3,7 @@ //! recipient via an X25519 box keyed from their Ed25519 `did:key`. The node //! seals with public keys only; readers open with their own private key. +use crate::identity::Keypair; use anyhow::{Context, Result}; use ed25519_dalek::VerifyingKey; @@ -28,6 +29,129 @@ fn x25519_secret_from_seed(seed: &[u8; 32]) -> [u8; 32] { s } +use base64::{engine::general_purpose::STANDARD as B64, Engine}; +use chacha20poly1305::{ + aead::{Aead, KeyInit}, + XChaCha20Poly1305, XNonce, +}; +use crypto_box::{ + aead::{AeadCore, OsRng}, + ChaChaBox, PublicKey as XPublic, SecretKey as XSecret, +}; +use rand::RngCore; +use serde::{Deserialize, Serialize}; + +const MAGIC: &[u8] = b"GLENC"; +const VERSION: u8 = 1; + +#[derive(Serialize, Deserialize)] +struct Recipient { + kid: String, // base64 recipient ed25519 pubkey (32B) + eph: String, // base64 ephemeral x25519 pubkey (32B) + nonce: String, // base64 box nonce (24B) + wrap: String, // base64 wrapped content key +} + +#[derive(Serialize, Deserialize)] +struct Header { + alg: String, + nonce: String, // base64 body nonce (24B) + recipients: Vec, +} + +/// Encrypt `plaintext` so any of `recipients` (Ed25519 keys) can decrypt. 
+pub fn seal_blob(plaintext: &[u8], recipients: &[VerifyingKey]) -> Result> { + if recipients.is_empty() { + return Err(anyhow::anyhow!("seal_blob: no recipients")); + } + let mut content_key = [0u8; 32]; + OsRng.fill_bytes(&mut content_key); + let body_cipher = XChaCha20Poly1305::new_from_slice(&content_key) + .map_err(|e| anyhow::anyhow!("content key: {e}"))?; + let mut body_nonce = [0u8; 24]; + OsRng.fill_bytes(&mut body_nonce); + let body = body_cipher + .encrypt(XNonce::from_slice(&body_nonce), plaintext) + .map_err(|e| anyhow::anyhow!("body encrypt: {e}"))?; + + let mut wrapped = Vec::with_capacity(recipients.len()); + for vk in recipients { + let recip_x = XPublic::from(x25519_public(vk)?); + let eph = XSecret::generate(&mut OsRng); + let abox = ChaChaBox::new(&recip_x, &eph); + let nonce = ChaChaBox::generate_nonce(&mut OsRng); + let ct = abox + .encrypt(&nonce, &content_key[..]) + .map_err(|e| anyhow::anyhow!("wrap: {e}"))?; + wrapped.push(Recipient { + kid: B64.encode(vk.as_bytes()), + eph: B64.encode(eph.public_key().as_bytes()), + nonce: B64.encode(nonce), + wrap: B64.encode(ct), + }); + } + + let header = Header { + alg: "xchacha20poly1305".into(), + nonce: B64.encode(body_nonce), + recipients: wrapped, + }; + let header_json = serde_json::to_vec(&header).context("encode header")?; + + let mut out = Vec::new(); + out.extend_from_slice(MAGIC); + out.push(VERSION); + out.extend_from_slice(&(header_json.len() as u32).to_le_bytes()); + out.extend_from_slice(&header_json); + out.extend_from_slice(&body); + Ok(out) +} + +/// Decrypt an envelope with `keypair`. Errors if not a recipient or on auth fail. 
+pub fn open_blob(envelope: &[u8], keypair: &Keypair) -> Result> { + let mut p = 0; + if envelope.len() < MAGIC.len() + 1 + 4 || &envelope[..MAGIC.len()] != MAGIC { + return Err(anyhow::anyhow!("bad envelope magic")); + } + p += MAGIC.len(); + if envelope[p] != VERSION { + return Err(anyhow::anyhow!("unsupported envelope version")); + } + p += 1; + let hlen = u32::from_le_bytes(envelope[p..p + 4].try_into().unwrap()) as usize; + p += 4; + let header: Header = + serde_json::from_slice(envelope.get(p..p + hlen).context("truncated header")?) + .context("decode header")?; + let body = &envelope[p + hlen..]; + + let my_kid = B64.encode(keypair.verifying_key().as_bytes()); + let my_x = XSecret::from(x25519_secret_from_seed(&keypair.seed_bytes())); + + let entry = header + .recipients + .iter() + .find(|r| r.kid == my_kid) + .context("not a recipient of this envelope")?; + let eph = XPublic::from(<[u8; 32]>::try_from(B64.decode(&entry.eph)?.as_slice())?); + let nonce = B64.decode(&entry.nonce)?; + let wrap = B64.decode(&entry.wrap)?; + let abox = ChaChaBox::new(&eph, &my_x); + let content_key = abox + .decrypt( + crypto_box::aead::generic_array::GenericArray::from_slice(&nonce), + wrap.as_slice(), + ) + .map_err(|_| anyhow::anyhow!("content-key unwrap failed"))?; + + let body_cipher = XChaCha20Poly1305::new_from_slice(&content_key) + .map_err(|e| anyhow::anyhow!("content key: {e}"))?; + let body_nonce = B64.decode(&header.nonce)?; + body_cipher + .decrypt(XNonce::from_slice(&body_nonce), body) + .map_err(|_| anyhow::anyhow!("body decrypt failed")) +} + #[cfg(test)] mod tests { use super::*; @@ -44,4 +168,27 @@ mod tests { let xpub_from_secret = crypto_box::SecretKey::from(xsec).public_key().to_bytes(); assert_eq!(xpub_from_public, xpub_from_secret); } + + #[test] + fn seal_open_round_trip_for_recipients() { + let owner = Keypair::generate(); + let reader_a = Keypair::generate(); + let reader_b = Keypair::generate(); + let msg = b"private blob contents"; + + let env = 
seal_blob(msg, &[owner.verifying_key(), reader_a.verifying_key()]).unwrap(); + + assert_eq!(open_blob(&env, &owner).unwrap(), msg); + assert_eq!(open_blob(&env, &reader_a).unwrap(), msg); + assert!(open_blob(&env, &reader_b).is_err(), "non-recipient must fail"); + } + + #[test] + fn tampered_envelope_fails() { + let owner = Keypair::generate(); + let mut env = seal_blob(b"hi", &[owner.verifying_key()]).unwrap(); + let last = env.len() - 1; + env[last] ^= 0x01; + assert!(open_blob(&env, &owner).is_err()); + } } From 01b1e8d116de8f6805ae0d7b404dfe4a0bccdc6d Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 13:49:41 -0500 Subject: [PATCH 35/58] feat(node): resolve per-blob encryption recipients (least-privilege) --- .../gitlawb-node/src/git/visibility_pack.rs | 55 ++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index c9c6d6b..f480d40 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -7,7 +7,7 @@ use crate::db::VisibilityRule; use crate::git::store; use crate::visibility::{visibility_check, Decision}; use anyhow::{Context, Result}; -use std::collections::HashSet; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::path::Path; /// List every (blob_oid, "/repo/relative/path") pair reachable from any branch @@ -87,6 +87,43 @@ pub fn replicable_objects(all: Vec, withheld: &HashSet) -> Vec Result>> { + let withheld = withheld_blob_oids(repo_path, rules, is_public, owner_did, None)?; + if withheld.is_empty() { + return Ok(HashMap::new()); + } + let mut candidates: BTreeSet = BTreeSet::new(); + for r in rules { + for d in &r.reader_dids { + candidates.insert(d.clone()); + } + } + let mut out: HashMap> = HashMap::new(); + for (oid, path) in blob_paths(repo_path)? 
{ + if !withheld.contains(&oid) { + continue; + } + let entry = out.entry(oid).or_default(); + entry.insert(owner_did.to_string()); + for did in &candidates { + if visibility_check(rules, is_public, owner_did, Some(did), &path) == Decision::Allow { + entry.insert(did.clone()); + } + } + } + Ok(out) +} + #[cfg(test)] mod tests { use super::*; @@ -230,4 +267,20 @@ mod tests { let got = replicable_objects(all.clone(), &withheld); assert_eq!(got, all); } + + #[test] + fn recipients_are_owner_plus_allowed_readers_only() { + let (_td, repo, secret_oid, public_oid) = fixture(); + let reader = "did:key:zReader"; + let rules = vec![rule("/secret/**", &[reader])]; + let map = withheld_blob_recipients(&repo, &rules, true, OWNER).unwrap(); + + let recips = map.get(&secret_oid).expect("secret blob has recipients"); + assert!(recips.contains(OWNER)); + assert!(recips.contains(reader)); + assert!( + !map.contains_key(&public_oid), + "public blob is not encrypted" + ); + } } From b126d6a05e3302ea64ba5160fa0eb47663ab87a8 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 13:55:46 -0500 Subject: [PATCH 36/58] feat(node): encrypted_blobs table and recipient-scoped queries --- crates/gitlawb-node/src/db/mod.rs | 94 +++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 6af5ee8..21570ce 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -720,6 +720,21 @@ const MIGRATIONS: &[Migration] = &[ "CREATE INDEX IF NOT EXISTS idx_visibility_rules_repo ON visibility_rules(repo_id)", ], }, + Migration { + version: 4, + name: "encrypted_blobs", + stmts: &[ + r#"CREATE TABLE IF NOT EXISTS encrypted_blobs ( + repo_id TEXT NOT NULL, + oid TEXT NOT NULL, + cid TEXT NOT NULL, + recipients TEXT NOT NULL, + created_at TEXT NOT NULL, + PRIMARY KEY (repo_id, oid) + )"#, + "CREATE INDEX IF NOT EXISTS 
idx_encrypted_blobs_repo ON encrypted_blobs(repo_id)", + ], + }, ]; // ── Repos ───────────────────────────────────────────────────────────────────── @@ -1621,6 +1636,85 @@ impl Db { Ok(()) } + pub async fn record_encrypted_blob( + &self, + repo_id: &str, + oid: &str, + cid: &str, + recipients: &[String], + ) -> Result<()> { + let recipients_json = serde_json::to_string(recipients)?; + sqlx::query( + "INSERT INTO encrypted_blobs (repo_id, oid, cid, recipients, created_at) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (repo_id, oid) DO UPDATE SET cid = EXCLUDED.cid, recipients = EXCLUDED.recipients", + ) + .bind(repo_id) + .bind(oid) + .bind(cid) + .bind(recipients_json) + .bind(Utc::now().to_rfc3339()) + .execute(&self.pool) + .await?; + Ok(()) + } + + /// (oid, cid) for every encrypted blob in the repo that `caller` may decrypt. + pub async fn list_encrypted_blobs_for( + &self, + repo_id: &str, + caller: &str, + ) -> Result> { + let rows = sqlx::query("SELECT oid, cid, recipients FROM encrypted_blobs WHERE repo_id = $1") + .bind(repo_id) + .fetch_all(&self.pool) + .await?; + let mut out = Vec::new(); + for row in rows { + let oid: String = row.get("oid"); + let cid: String = row.get("cid"); + let recipients: String = row.get("recipients"); + let recipients: Vec = serde_json::from_str(&recipients).unwrap_or_default(); + if recipients.iter().any(|d| d == caller) { + out.push((oid, cid)); + } + } + Ok(out) + } + + /// The CID of one encrypted blob, only if `caller` is a recipient. 
+ pub async fn encrypted_blob_cid( + &self, + repo_id: &str, + oid: &str, + caller: &str, + ) -> Result> { + let row = sqlx::query("SELECT cid, recipients FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2") + .bind(repo_id) + .bind(oid) + .fetch_optional(&self.pool) + .await?; + let Some(row) = row else { return Ok(None) }; + let recipients: String = row.get("recipients"); + let recipients: Vec = serde_json::from_str(&recipients).unwrap_or_default(); + if recipients.iter().any(|d| d == caller) { + Ok(Some(row.get("cid"))) + } else { + Ok(None) + } + } + + /// Whether an encrypted blob row exists (recipient-agnostic), to avoid + /// re-pinning on subsequent pushes. + pub async fn has_encrypted_blob(&self, repo_id: &str, oid: &str) -> Result { + let row = sqlx::query("SELECT 1 AS x FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2") + .bind(repo_id) + .bind(oid) + .fetch_optional(&self.pool) + .await?; + Ok(row.is_some()) + } + pub async fn list_pinned_cids(&self) -> Result> { let rows = sqlx::query( "SELECT sha256_hex, cid, pinned_at, pinata_cid FROM pinned_cids ORDER BY pinned_at DESC", From 0f9aaa986ee22666bcd114180aaa32f803648f64 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:00:02 -0500 Subject: [PATCH 37/58] feat(node): encrypt-then-pin withheld blobs at push (IPFS) At the push chokepoint, after pinning withheld objects, resolve each withheld blob's recipient DIDs and seal it to their Ed25519 keys with seal_blob, pinning the ciphertext to IPFS and recording it in encrypted_blobs. Best-effort per blob: failures are logged and skipped, never pinned in plaintext. Pinata replication is unchanged; B1 encrypts to IPFS only. Adds ed25519-dalek as a direct dependency of gitlawb-node (it was only declared in the workspace Cargo.toml). 
--- Cargo.lock | 1 + crates/gitlawb-node/Cargo.toml | 1 + crates/gitlawb-node/src/api/repos.rs | 27 +++++++++ crates/gitlawb-node/src/encrypted_pin.rs | 59 +++++++++++++++++++ .../gitlawb-node/src/git/visibility_pack.rs | 12 +++- crates/gitlawb-node/src/main.rs | 1 + 6 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 crates/gitlawb-node/src/encrypted_pin.rs diff --git a/Cargo.lock b/Cargo.lock index 02679ed..89900f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3334,6 +3334,7 @@ dependencies = [ "cid", "clap", "dirs-next", + "ed25519-dalek", "futures", "gitlawb-core", "hex", diff --git a/crates/gitlawb-node/Cargo.toml b/crates/gitlawb-node/Cargo.toml index 9cc3ba1..61f63b0 100644 --- a/crates/gitlawb-node/Cargo.toml +++ b/crates/gitlawb-node/Cargo.toml @@ -11,6 +11,7 @@ path = "src/main.rs" [dependencies] gitlawb-core = { path = "../gitlawb-core" } +ed25519-dalek = { workspace = true } tokio = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 884d7ca..dfdfcba 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -662,6 +662,10 @@ pub async fn git_receive_pack( let ipfs_api = state.config.ipfs_api.clone(); let repo_path_clone = disk_path.clone(); let db_clone = state.db.clone(); + let rules_for_enc = rules_opt.clone(); + let repo_id = record.id.clone(); + let owner_did = record.owner_did.clone(); + let is_public = record.is_public; tokio::spawn(async move { let pinned = crate::ipfs_pin::pin_new_objects( &ipfs_api, @@ -676,6 +680,29 @@ pub async fn git_receive_pack( tracing::info!(sha = %sha, %cid, "pinned"); } } + + // Option B1: encrypt-then-pin the withheld blobs so authorized + // readers can recover them when the origin cannot serve them. 
+ if let Some(rules) = rules_for_enc.filter(|r| !r.is_empty()) { + let p = repo_path_clone.clone(); + let owner = owner_did.clone(); + let recip = tokio::task::spawn_blocking(move || { + crate::git::visibility_pack::withheld_blob_recipients( + &p, &rules, is_public, &owner, + ) + }) + .await; + if let Ok(Ok(recipients)) = recip { + crate::encrypted_pin::encrypt_and_pin( + &ipfs_api, + &repo_path_clone, + &db_clone, + &repo_id, + &recipients, + ) + .await; + } + } }); } diff --git a/crates/gitlawb-node/src/encrypted_pin.rs b/crates/gitlawb-node/src/encrypted_pin.rs new file mode 100644 index 0000000..dc5c09f --- /dev/null +++ b/crates/gitlawb-node/src/encrypted_pin.rs @@ -0,0 +1,59 @@ +//! Encrypt-then-pin for withheld blobs (Option B1). Each withheld blob is sealed +//! to its recipient DIDs and the envelope pinned to IPFS, recorded in +//! `encrypted_blobs`. Best-effort per blob: a failure is logged and skipped, +//! never pinned in plaintext. + +use std::collections::{BTreeSet, HashMap}; +use std::path::Path; +use std::str::FromStr; + +use ed25519_dalek::VerifyingKey; +use gitlawb_core::did::Did; +use gitlawb_core::encrypt::seal_blob; + +use crate::db::Db; + +/// Resolve a DID string to its Ed25519 verifying key, or None if it carries no +/// inline key (e.g. did:web / did:gitlawb). +fn did_to_key(did: &str) -> Option { + Did::from_str(did).ok()?.to_verifying_key().ok() +} + +/// Encrypt and pin every withheld blob. `recipients` maps blob oid -> DID set. 
+pub async fn encrypt_and_pin( + ipfs_api: &str, + repo_path: &Path, + db: &Db, + repo_id: &str, + recipients: &HashMap>, +) { + for (oid, dids) in recipients { + if db.has_encrypted_blob(repo_id, oid).await.unwrap_or(false) { + continue; + } + let keys: Vec = dids.iter().filter_map(|d| did_to_key(d)).collect(); + if keys.is_empty() { + tracing::warn!(oid = %oid, "no resolvable recipient keys; skipping encrypted pin"); + continue; + } + let data = match crate::git::store::read_object(repo_path, oid) { + Ok(Some((_t, bytes))) => bytes, + _ => continue, + }; + let envelope = match seal_blob(&data, &keys) { + Ok(e) => e, + Err(e) => { + tracing::warn!(oid = %oid, err = %e, "seal_blob failed; skipping"); + continue; + } + }; + let cid = match crate::ipfs_pin::pin_git_object(ipfs_api, oid, &envelope).await { + Ok(c) if !c.is_empty() => c, + _ => continue, + }; + let dids_vec: Vec = dids.iter().cloned().collect(); + if let Err(e) = db.record_encrypted_blob(repo_id, oid, &cid, &dids_vec).await { + tracing::warn!(oid = %oid, err = %e, "record_encrypted_blob failed"); + } + } +} diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index f480d40..e32e084 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -87,7 +87,6 @@ pub fn replicable_objects(all: Vec, withheld: &HashSet) -> Vec Date: Wed, 10 Jun 2026 14:05:04 -0500 Subject: [PATCH 38/58] feat(node): authenticated discovery and fetch for encrypted blobs --- crates/gitlawb-node/src/api/encrypted.rs | 53 ++++++++++++++++++++++++ crates/gitlawb-node/src/api/mod.rs | 1 + crates/gitlawb-node/src/ipfs_pin.rs | 13 ++++++ crates/gitlawb-node/src/server.rs | 8 ++++ 4 files changed, 75 insertions(+) create mode 100644 crates/gitlawb-node/src/api/encrypted.rs diff --git a/crates/gitlawb-node/src/api/encrypted.rs b/crates/gitlawb-node/src/api/encrypted.rs new file mode 100644 index 0000000..8b692ab --- /dev/null +++ 
b/crates/gitlawb-node/src/api/encrypted.rs @@ -0,0 +1,53 @@ +//! Authenticated discovery + fetch for encrypted withheld blobs (Option B1). + +use axum::extract::{Extension, Path, State}; +use axum::Json; + +use crate::auth::AuthenticatedDid; +use crate::error::{AppError, Result}; +use crate::state::AppState; + +/// GET /api/v1/repos/{owner}/{repo}/encrypted-blobs +/// Returns [{oid, cid}] for encrypted blobs the caller may decrypt. +pub async fn list_encrypted_blobs( + State(state): State, + auth: Option>, + Path((owner, repo)): Path<(String, String)>, +) -> Result> { + let caller = auth.as_ref().map(|e| e.0 .0.as_str()).unwrap_or(""); + let record = state + .db + .get_repo(&owner, &repo) + .await? + .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; + let rows = state.db.list_encrypted_blobs_for(&record.id, caller).await?; + let blobs: Vec<_> = rows + .into_iter() + .map(|(oid, cid)| serde_json::json!({ "oid": oid, "cid": cid })) + .collect(); + Ok(Json(serde_json::json!({ "blobs": blobs }))) +} + +/// GET /api/v1/repos/{owner}/{repo}/encrypted-blob/{oid} +/// Returns raw envelope bytes if the caller is a recipient. +pub async fn get_encrypted_blob( + State(state): State, + auth: Option>, + Path((owner, repo, oid)): Path<(String, String, String)>, +) -> Result> { + let caller = auth.as_ref().map(|e| e.0 .0.as_str()).unwrap_or(""); + let record = state + .db + .get_repo(&owner, &repo) + .await? + .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; + let cid = state + .db + .encrypted_blob_cid(&record.id, &oid, caller) + .await? 
+ .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}/{oid}")))?; + let bytes = crate::ipfs_pin::cat(&state.config.ipfs_api, &cid) + .await + .map_err(|e| AppError::Git(e.to_string()))?; + Ok(bytes) +} diff --git a/crates/gitlawb-node/src/api/mod.rs b/crates/gitlawb-node/src/api/mod.rs index 2595c48..7f01365 100644 --- a/crates/gitlawb-node/src/api/mod.rs +++ b/crates/gitlawb-node/src/api/mod.rs @@ -3,6 +3,7 @@ pub mod arweave; pub mod bounties; pub mod certs; pub mod changelog; +pub mod encrypted; pub mod events; pub mod ipfs; pub mod issues; diff --git a/crates/gitlawb-node/src/ipfs_pin.rs b/crates/gitlawb-node/src/ipfs_pin.rs index 96d6abd..9bdaade 100644 --- a/crates/gitlawb-node/src/ipfs_pin.rs +++ b/crates/gitlawb-node/src/ipfs_pin.rs @@ -72,6 +72,19 @@ pub async fn pin_git_object(ipfs_api: &str, sha256_hex: &str, data: &[u8]) -> Re Ok(cid) } +/// Fetch raw bytes for a CID from the local Kubo node (`/api/v0/cat`). +pub async fn cat(ipfs_api: &str, cid: &str) -> Result> { + if ipfs_api.is_empty() { + return Err(anyhow::anyhow!("IPFS not configured")); + } + let url = format!("{}/api/v0/cat?arg={}", ipfs_api.trim_end_matches('/'), cid); + let resp = reqwest::Client::new().post(&url).send().await?; + if !resp.status().is_success() { + return Err(anyhow::anyhow!("ipfs cat {cid}: {}", resp.status())); + } + Ok(resp.bytes().await?.to_vec()) +} + /// List all git objects in the given bare repo and pin any that are not yet /// recorded in `pinned_cids`. 
/// diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index 9baea20..9d643b9 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -356,6 +356,14 @@ pub fn build_router(state: AppState) -> Router { "/api/v1/repos/{owner}/{repo}/withheld-paths", axum::routing::get(visibility::withheld_paths), ) + .route( + "/api/v1/repos/{owner}/{repo}/encrypted-blobs", + axum::routing::get(crate::api::encrypted::list_encrypted_blobs), + ) + .route( + "/api/v1/repos/{owner}/{repo}/encrypted-blob/{oid}", + axum::routing::get(crate::api::encrypted::get_encrypted_blob), + ) .layer(DefaultBodyLimit::disable()) .layer(RequestBodyLimitLayer::new(pack_limit)) .layer(middleware::from_fn(auth::optional_signature)); From 767f0df3c03eb17e76fab4786a3e03f433a2fd14 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:08:52 -0500 Subject: [PATCH 39/58] feat(gl): transparent recovery of authorized encrypted blobs on clone --- crates/gl/src/clone.rs | 152 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index b5fe39d..514c0cb 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -217,6 +217,105 @@ struct WithheldPathsResponse { reinclude: Vec, } +/// After the base clone, recover encrypted blobs the caller is authorized for +/// that are missing locally: fetch the envelope, decrypt with the caller's key, +/// install as a loose object. Returns the repo-relative paths recovered. +/// Best-effort; logs and continues on any per-blob failure. 
+async fn recover_encrypted_blobs( + node: &str, + owner: &str, + name: &str, + dest: &Path, + keypair: &gitlawb_core::identity::Keypair, +) -> Result> { + use gitlawb_core::encrypt::open_blob; + use std::collections::HashMap; + use std::io::Write; + + let dest_str = dest.to_str().context("dest path not utf-8")?; + let client = NodeClient::new(node, Some(keypair.clone())); + + let resp = match client + .get_signed(&format!("/api/v1/repos/{owner}/{name}/encrypted-blobs")) + .await + { + Ok(r) if r.status().is_success() => r, + _ => return Ok(vec![]), + }; + let body: serde_json::Value = resp.json().await.context("parsing encrypted-blobs")?; + let blobs = body + .get("blobs") + .and_then(|b| b.as_array()) + .cloned() + .unwrap_or_default(); + if blobs.is_empty() { + return Ok(vec![]); + } + + // Map oid -> repo-relative path from the cloned tree. + let ls = Command::new("git") + .args(["-C", dest_str, "ls-tree", "-r", "HEAD"]) + .output()?; + let mut oid_to_path: HashMap = HashMap::new(); + for line in String::from_utf8_lossy(&ls.stdout).lines() { + if let Some((meta, path)) = line.split_once('\t') { + if let Some(oid) = meta.split_whitespace().nth(2) { + oid_to_path.insert(oid.to_string(), path.to_string()); + } + } + } + + let mut recovered = Vec::new(); + for entry in blobs { + let Some(oid) = entry.get("oid").and_then(|o| o.as_str()) else { + continue; + }; + // Skip if already present locally. 
+ let present = Command::new("git") + .args(["-C", dest_str, "cat-file", "-e", oid]) + .status() + .map(|s| s.success()) + .unwrap_or(false); + if present { + continue; + } + let env_resp = match client + .get_signed(&format!("/api/v1/repos/{owner}/{name}/encrypted-blob/{oid}")) + .await + { + Ok(r) if r.status().is_success() => r, + _ => continue, + }; + let Ok(envelope) = env_resp.bytes().await else { + continue; + }; + let plaintext = match open_blob(&envelope, keypair) { + Ok(p) => p, + Err(e) => { + eprintln!("warning: could not decrypt {oid}: {e}"); + continue; + } + }; + // Install as a loose object; verify the OID matches. + let mut child = Command::new("git") + .args(["-C", dest_str, "hash-object", "-w", "-t", "blob", "--stdin"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn()?; + child.stdin.take().unwrap().write_all(&plaintext)?; + let out = child.wait_with_output()?; + let written = String::from_utf8_lossy(&out.stdout).trim().to_string(); + if written == oid { + if let Some(p) = oid_to_path.get(oid) { + recovered.push(p.clone()); + } + } else { + eprintln!("warning: recovered blob {oid} hashed to {written}; discarding"); + } + } + Ok(recovered) +} + pub async fn run(args: CloneArgs) -> Result<()> { let (url, owner, name) = parse_repo(&args.repo)?; let dest_name = args.dir.unwrap_or_else(|| name.clone()); @@ -236,6 +335,30 @@ pub async fn run(args: CloneArgs) -> Result<()> { } setup_partial_clone(&dest, &url, &withheld, &reinclude, args.branch.as_deref())?; + + if let Ok(keypair) = load_keypair_from_dir(None) { + if let Ok(paths) = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair).await { + if !paths.is_empty() { + // Re-include recovered paths if this was a sparse clone, then + // materialize them in the working tree. 
+ let spec = dest.join(".git/info/sparse-checkout"); + if spec.exists() { + if let Ok(mut s) = std::fs::read_to_string(&spec) { + for p in &paths { + s.push_str(&format!("/{p}\n")); + } + let _ = std::fs::write(&spec, s); + } + } + let _ = git(&dest, &["checkout", "--", "."]); + println!( + "Recovered {} private file(s) you are authorized to read", + paths.len() + ); + } + } + } + println!("Done. Cloned into {dest_name}"); Ok(()) } @@ -447,4 +570,33 @@ mod tests { // An extra slash would otherwise smuggle a path segment into the name. assert!(parse_repo("owner/name/extra").is_err()); } + + #[test] + fn recovered_blob_installs_with_matching_oid() { + use gitlawb_core::encrypt::{open_blob, seal_blob}; + use gitlawb_core::identity::Keypair; + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = std::process::Command::new("git") + .args(["-C", dest.to_str().unwrap(), "rev-parse", "HEAD:secret/b.txt"]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let reader = Keypair::generate(); + let env = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let plaintext = open_blob(&env, &reader).unwrap(); + let mut child = std::process::Command::new("git") + .args(["-C", dest.to_str().unwrap(), "hash-object", "-w", "-t", "blob", "--stdin"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn() + .unwrap(); + use std::io::Write; + child.stdin.take().unwrap().write_all(&plaintext).unwrap(); + let out = child.wait_with_output().unwrap(); + assert_eq!(String::from_utf8_lossy(&out.stdout).trim(), oid); + } } From 87a3f361cc1a33992ecc91bb8e1df28c938ebddf Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:11:01 -0500 Subject: [PATCH 40/58] style: 
cargo fmt --- crates/gitlawb-core/src/encrypt.rs | 5 +++- crates/gitlawb-node/src/api/encrypted.rs | 5 +++- crates/gitlawb-node/src/db/mod.rs | 21 +++++++++------- crates/gitlawb-node/src/encrypted_pin.rs | 5 +++- .../gitlawb-node/src/git/visibility_pack.rs | 4 +++- crates/gl/src/clone.rs | 24 +++++++++++++++---- 6 files changed, 47 insertions(+), 17 deletions(-) diff --git a/crates/gitlawb-core/src/encrypt.rs b/crates/gitlawb-core/src/encrypt.rs index aad6913..b626581 100644 --- a/crates/gitlawb-core/src/encrypt.rs +++ b/crates/gitlawb-core/src/encrypt.rs @@ -180,7 +180,10 @@ mod tests { assert_eq!(open_blob(&env, &owner).unwrap(), msg); assert_eq!(open_blob(&env, &reader_a).unwrap(), msg); - assert!(open_blob(&env, &reader_b).is_err(), "non-recipient must fail"); + assert!( + open_blob(&env, &reader_b).is_err(), + "non-recipient must fail" + ); } #[test] diff --git a/crates/gitlawb-node/src/api/encrypted.rs b/crates/gitlawb-node/src/api/encrypted.rs index 8b692ab..8374925 100644 --- a/crates/gitlawb-node/src/api/encrypted.rs +++ b/crates/gitlawb-node/src/api/encrypted.rs @@ -20,7 +20,10 @@ pub async fn list_encrypted_blobs( .get_repo(&owner, &repo) .await? 
.ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; - let rows = state.db.list_encrypted_blobs_for(&record.id, caller).await?; + let rows = state + .db + .list_encrypted_blobs_for(&record.id, caller) + .await?; let blobs: Vec<_> = rows .into_iter() .map(|(oid, cid)| serde_json::json!({ "oid": oid, "cid": cid })) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 21570ce..5aa176c 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -1665,10 +1665,11 @@ impl Db { repo_id: &str, caller: &str, ) -> Result> { - let rows = sqlx::query("SELECT oid, cid, recipients FROM encrypted_blobs WHERE repo_id = $1") - .bind(repo_id) - .fetch_all(&self.pool) - .await?; + let rows = + sqlx::query("SELECT oid, cid, recipients FROM encrypted_blobs WHERE repo_id = $1") + .bind(repo_id) + .fetch_all(&self.pool) + .await?; let mut out = Vec::new(); for row in rows { let oid: String = row.get("oid"); @@ -1689,11 +1690,13 @@ impl Db { oid: &str, caller: &str, ) -> Result> { - let row = sqlx::query("SELECT cid, recipients FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2") - .bind(repo_id) - .bind(oid) - .fetch_optional(&self.pool) - .await?; + let row = sqlx::query( + "SELECT cid, recipients FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2", + ) + .bind(repo_id) + .bind(oid) + .fetch_optional(&self.pool) + .await?; let Some(row) = row else { return Ok(None) }; let recipients: String = row.get("recipients"); let recipients: Vec = serde_json::from_str(&recipients).unwrap_or_default(); diff --git a/crates/gitlawb-node/src/encrypted_pin.rs b/crates/gitlawb-node/src/encrypted_pin.rs index dc5c09f..b1004a0 100644 --- a/crates/gitlawb-node/src/encrypted_pin.rs +++ b/crates/gitlawb-node/src/encrypted_pin.rs @@ -52,7 +52,10 @@ pub async fn encrypt_and_pin( _ => continue, }; let dids_vec: Vec = dids.iter().cloned().collect(); - if let Err(e) = db.record_encrypted_blob(repo_id, oid, &cid, &dids_vec).await 
{ + if let Err(e) = db + .record_encrypted_blob(repo_id, oid, &cid, &dids_vec) + .await + { tracing::warn!(oid = %oid, err = %e, "record_encrypted_blob failed"); } } diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index e32e084..90ca772 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -288,7 +288,9 @@ mod tests { use gitlawb_core::encrypt::{open_blob, seal_blob}; use gitlawb_core::identity::Keypair; let (_td, repo, secret_oid, _public) = fixture(); - let (_t, bytes) = crate::git::store::read_object(&repo, &secret_oid).unwrap().unwrap(); + let (_t, bytes) = crate::git::store::read_object(&repo, &secret_oid) + .unwrap() + .unwrap(); let reader = Keypair::generate(); let env = seal_blob(&bytes, &[reader.verifying_key()]).unwrap(); assert_eq!(open_blob(&env, &reader).unwrap(), bytes); diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 514c0cb..7023796 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -280,7 +280,9 @@ async fn recover_encrypted_blobs( continue; } let env_resp = match client - .get_signed(&format!("/api/v1/repos/{owner}/{name}/encrypted-blob/{oid}")) + .get_signed(&format!( + "/api/v1/repos/{owner}/{name}/encrypted-blob/{oid}" + )) .await { Ok(r) if r.status().is_success() => r, @@ -337,7 +339,8 @@ pub async fn run(args: CloneArgs) -> Result<()> { setup_partial_clone(&dest, &url, &withheld, &reinclude, args.branch.as_deref())?; if let Ok(keypair) = load_keypair_from_dir(None) { - if let Ok(paths) = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair).await { + if let Ok(paths) = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair).await + { if !paths.is_empty() { // Re-include recovered paths if this was a sparse clone, then // materialize them in the working tree. 
@@ -580,7 +583,12 @@ mod tests { setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); let oid = { let out = std::process::Command::new("git") - .args(["-C", dest.to_str().unwrap(), "rev-parse", "HEAD:secret/b.txt"]) + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) .output() .unwrap(); String::from_utf8_lossy(&out.stdout).trim().to_string() @@ -589,7 +597,15 @@ mod tests { let env = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); let plaintext = open_blob(&env, &reader).unwrap(); let mut child = std::process::Command::new("git") - .args(["-C", dest.to_str().unwrap(), "hash-object", "-w", "-t", "blob", "--stdin"]) + .args([ + "-C", + dest.to_str().unwrap(), + "hash-object", + "-w", + "-t", + "blob", + "--stdin", + ]) .stdin(std::process::Stdio::piped()) .stdout(std::process::Stdio::piped()) .spawn() From 2bf1da87e8faadd8e21be5367131f5a60c1cbe1f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:18:23 -0500 Subject: [PATCH 41/58] fix(node): re-seal encrypted blob when recipient set changes Skip re-pinning only when an existing envelope already covers exactly the current recipients. A reader added to a rule after the first pin now gets a re-seal on the next push instead of being permanently locked out. Reader removal stays non-retroactive (the old envelope is already public). --- crates/gitlawb-node/src/db/mod.rs | 26 ++++++++++++++++-------- crates/gitlawb-node/src/encrypted_pin.rs | 11 ++++++++-- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 5aa176c..bf081d8 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -1707,15 +1707,23 @@ impl Db { } } - /// Whether an encrypted blob row exists (recipient-agnostic), to avoid - /// re-pinning on subsequent pushes. 
- pub async fn has_encrypted_blob(&self, repo_id: &str, oid: &str) -> Result { - let row = sqlx::query("SELECT 1 AS x FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2") - .bind(repo_id) - .bind(oid) - .fetch_optional(&self.pool) - .await?; - Ok(row.is_some()) + /// The recipient DID list stored for an encrypted blob, or None if there is + /// no row. Used to decide whether a re-seal is needed (recipients changed). + pub async fn encrypted_blob_recipients( + &self, + repo_id: &str, + oid: &str, + ) -> Result>> { + let row = + sqlx::query("SELECT recipients FROM encrypted_blobs WHERE repo_id = $1 AND oid = $2") + .bind(repo_id) + .bind(oid) + .fetch_optional(&self.pool) + .await?; + Ok(row.map(|r| { + let recipients: String = r.get("recipients"); + serde_json::from_str::>(&recipients).unwrap_or_default() + })) } pub async fn list_pinned_cids(&self) -> Result> { diff --git a/crates/gitlawb-node/src/encrypted_pin.rs b/crates/gitlawb-node/src/encrypted_pin.rs index b1004a0..6ca1382 100644 --- a/crates/gitlawb-node/src/encrypted_pin.rs +++ b/crates/gitlawb-node/src/encrypted_pin.rs @@ -28,8 +28,15 @@ pub async fn encrypt_and_pin( recipients: &HashMap>, ) { for (oid, dids) in recipients { - if db.has_encrypted_blob(repo_id, oid).await.unwrap_or(false) { - continue; + // Skip only if an existing envelope already covers exactly these + // recipients. If the recipient set changed (e.g. a reader was added to + // the rule), re-seal so the new reader can recover the blob. Reader + // removal is not retroactive: the old envelope is already public. 
+ if let Ok(Some(stored)) = db.encrypted_blob_recipients(repo_id, oid).await { + let stored: BTreeSet = stored.into_iter().collect(); + if &stored == dids { + continue; + } } let keys: Vec = dids.iter().filter_map(|d| did_to_key(d)).collect(); if keys.is_empty() { From 2dca0674abf5830960ea5e5d96fa63d2cf972fa8 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 18:52:51 -0500 Subject: [PATCH 42/58] feat(node): unscoped encrypted-blob listing for replication --- crates/gitlawb-node/src/db/mod.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index bf081d8..447fbe2 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -1683,6 +1683,30 @@ impl Db { Ok(out) } + /// (oid, cid, recipients) for every encrypted blob in the repo, unscoped by + /// caller. This is the replication view used by peer mirrors (Option B2), + /// distinct from the recipient-scoped `list_encrypted_blobs_for`. It returns + /// only ciphertext metadata; no plaintext or key material is involved. + pub async fn list_all_encrypted_blobs( + &self, + repo_id: &str, + ) -> Result)>> { + let rows = + sqlx::query("SELECT oid, cid, recipients FROM encrypted_blobs WHERE repo_id = $1") + .bind(repo_id) + .fetch_all(&self.pool) + .await?; + let mut out = Vec::new(); + for row in rows { + let oid: String = row.get("oid"); + let cid: String = row.get("cid"); + let recipients: String = row.get("recipients"); + let recipients: Vec = serde_json::from_str(&recipients).unwrap_or_default(); + out.push((oid, cid, recipients)); + } + Ok(out) + } + /// The CID of one encrypted blob, only if `caller` is a recipient. 
pub async fn encrypted_blob_cid( &self, From 52b0fc63c21f0a4e01cc0f5c1d3dbd48a1da61d6 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 18:52:51 -0500 Subject: [PATCH 43/58] feat(node): encrypted-blobs/replicate endpoint for peer mirrors --- crates/gitlawb-node/src/api/encrypted.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/crates/gitlawb-node/src/api/encrypted.rs b/crates/gitlawb-node/src/api/encrypted.rs index 8374925..6e19bd5 100644 --- a/crates/gitlawb-node/src/api/encrypted.rs +++ b/crates/gitlawb-node/src/api/encrypted.rs @@ -54,3 +54,27 @@ pub async fn get_encrypted_blob( .map_err(|e| AppError::Git(e.to_string()))?; Ok(bytes) } + +/// GET /api/v1/repos/{owner}/{repo}/encrypted-blobs/replicate +/// Returns [{oid, cid, recipients}] for every encrypted blob in the repo, for +/// peer-mirror replication (Option B2). Not recipient-scoped: recipient DIDs are +/// already public via the IPFS-pinned envelopes, so this exposes only ciphertext +/// metadata (content-addressed OIDs/CIDs and recipient DIDs), never plaintext. +pub async fn replicate_encrypted_blobs( + State(state): State, + Path((owner, repo)): Path<(String, String)>, +) -> Result> { + let record = state + .db + .get_repo(&owner, &repo) + .await? 
+ .ok_or_else(|| AppError::RepoNotFound(format!("{owner}/{repo}")))?; + let rows = state.db.list_all_encrypted_blobs(&record.id).await?; + let blobs: Vec<_> = rows + .into_iter() + .map(|(oid, cid, recipients)| { + serde_json::json!({ "oid": oid, "cid": cid, "recipients": recipients }) + }) + .collect(); + Ok(Json(serde_json::json!({ "blobs": blobs }))) +} From 1f6f9ca4a453fe5e5d7b0957179b7c194f912b5a Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 18:52:51 -0500 Subject: [PATCH 44/58] feat(node): route encrypted-blobs/replicate under git_read_routes --- crates/gitlawb-node/src/server.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index 9d643b9..31ce4b4 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -364,6 +364,10 @@ pub fn build_router(state: AppState) -> Router { "/api/v1/repos/{owner}/{repo}/encrypted-blob/{oid}", axum::routing::get(crate::api::encrypted::get_encrypted_blob), ) + .route( + "/api/v1/repos/{owner}/{repo}/encrypted-blobs/replicate", + axum::routing::get(crate::api::encrypted::replicate_encrypted_blobs), + ) .layer(DefaultBodyLimit::disable()) .layer(RequestBodyLimitLayer::new(pack_limit)) .layer(middleware::from_fn(auth::optional_signature)); From 9caeffba697219d1647061ae6c57bc3138ad0a3f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 19:13:10 -0500 Subject: [PATCH 45/58] feat(node): peer mirrors replicate encrypted withheld blobs (Option B2) --- crates/gitlawb-node/src/sync.rs | 196 ++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index df41470..ca3e7bf 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -10,6 +10,7 @@ //! 3. 
If it exists → `git fetch --prune` from the origin. //! 4. Mark done or failed. +use std::collections::HashMap; use std::path::Path; use std::sync::Arc; @@ -44,6 +45,52 @@ fn classify_mirror(withheld: Option>) -> MirrorMode { } } +/// One encrypted blob as advertised by an origin's `encrypted-blobs/replicate` +/// endpoint (Option B2). Ciphertext metadata only. +#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize)] +struct ReplicaBlob { + oid: String, + cid: String, + #[serde(default)] + recipients: Vec, +} + +/// The shape of the `encrypted-blobs/replicate` JSON response. +#[derive(Debug, serde::Deserialize)] +struct ReplicateResponse { + #[serde(default)] + blobs: Vec, +} + +/// Decide which of the origin's encrypted blobs this mirror must (re)replicate. +/// +/// `have` maps each already-stored blob's oid to its stored recipient DIDs. A +/// remote blob is returned when the mirror has no row for that oid, or when the +/// stored recipient set differs from the remote one (the origin re-sealed after a +/// reader-set change; same semantics as B1). Recipient order is ignored. +fn blobs_needing_replication( + remote: &[ReplicaBlob], + have: &HashMap>, +) -> Vec { + remote + .iter() + .filter(|b| match have.get(&b.oid) { + None => true, + Some(stored) => !same_recipients(stored, &b.recipients), + }) + .cloned() + .collect() +} + +/// Order-insensitive equality of two recipient DID lists. +fn same_recipients(a: &[String], b: &[String]) -> bool { + let mut a: Vec<&String> = a.iter().collect(); + let mut b: Vec<&String> = b.iter().collect(); + a.sort(); + b.sort(); + a == b +} + /// Start the background sync worker. Returns immediately; the worker runs /// as a detached tokio task that exits cleanly when `shutdown_rx` flips /// to `true`. @@ -160,6 +207,20 @@ async fn process_batch( machine_id, ) .await; + // Option B2: carry the encrypted withheld-blob envelopes too, so an + // authorized reader can recover private content from this mirror if + // the origin dies. 
`item.repo` is the slug "{owner_short}/{name}", + // which is the id upsert_mirror_repo wrote (the local repo_id). + replicate_encrypted_blobs( + client, + &origin_url, + owner_short, + repo_name, + db, + &item.repo, + &config.ipfs_api, + ) + .await; let _ = db.mark_sync_done(&item.id).await; crate::metrics::record_sync_processed("done"); } @@ -196,6 +257,86 @@ async fn fetch_withheld( Some(globs) } +/// Replicate the origin's encrypted withheld blobs onto this mirror (Option B2). +/// +/// After the git objects are mirrored, fetch the origin's replication listing, +/// then for each blob the mirror does not already hold (or whose recipients +/// changed) pull the ciphertext envelope over IPFS, pin it locally, and record +/// the `encrypted_blobs` row keyed by this mirror's local `repo_id`. +/// +/// Best-effort and idempotent: any per-blob failure is logged and skipped, to be +/// retried on the next sync. Confidentiality is never at risk; the mirror only +/// ever handles ciphertext and never decrypts. Cleanly a no-op when IPFS is +/// unconfigured, the origin reports no encrypted blobs, or the replicate endpoint +/// is absent (older peer) or unreachable. 
+async fn replicate_encrypted_blobs( + client: &reqwest::Client, + origin_url: &str, + owner: &str, + repo: &str, + db: &Db, + repo_id: &str, + ipfs_api: &str, +) { + if ipfs_api.is_empty() { + return; + } + + let url = format!("{origin_url}/api/v1/repos/{owner}/{repo}/encrypted-blobs/replicate"); + let resp = match client.get(&url).send().await { + Ok(r) if r.status().is_success() => r, + _ => return, + }; + let parsed: ReplicateResponse = match resp.json().await { + Ok(p) => p, + Err(e) => { + warn!(repo = %repo, err = %e, "failed to parse encrypted-blobs/replicate response"); + return; + } + }; + if parsed.blobs.is_empty() { + return; + } + + let have: HashMap> = match db.list_all_encrypted_blobs(repo_id).await { + Ok(rows) => rows + .into_iter() + .map(|(oid, _cid, recipients)| (oid, recipients)) + .collect(), + Err(e) => { + warn!(repo = %repo, err = %e, "failed to list local encrypted blobs for replication"); + return; + } + }; + + for blob in blobs_needing_replication(&parsed.blobs, &have) { + let envelope = match crate::ipfs_pin::cat(ipfs_api, &blob.cid).await { + Ok(bytes) => bytes, + Err(e) => { + warn!(oid = %blob.oid, cid = %blob.cid, err = %e, "failed to fetch encrypted envelope over IPFS; will retry next sync"); + continue; + } + }; + match crate::ipfs_pin::pin_git_object(ipfs_api, &blob.oid, &envelope).await { + Ok(cid) if !cid.is_empty() => { + if cid != blob.cid { + warn!(oid = %blob.oid, expected = %blob.cid, got = %cid, "replicated envelope CID mismatch; skipping record"); + continue; + } + if let Err(e) = db + .record_encrypted_blob(repo_id, &blob.oid, &cid, &blob.recipients) + .await + { + warn!(oid = %blob.oid, err = %e, "failed to record replicated encrypted blob"); + } + } + _ => { + warn!(oid = %blob.oid, "failed to pin replicated encrypted envelope; will retry next sync"); + } + } + } +} + /// Run a git subprocess, returning an error with stderr on non-zero exit. 
async fn git_run(args: &[&str]) -> anyhow::Result<()> { let out = tokio::process::Command::new("git") @@ -342,6 +483,61 @@ mod tests { assert!(matches!(mode, MirrorMode::Plain)); } + fn rb(oid: &str, cid: &str, recipients: &[&str]) -> ReplicaBlob { + ReplicaBlob { + oid: oid.to_string(), + cid: cid.to_string(), + recipients: recipients.iter().map(|s| s.to_string()).collect(), + } + } + + #[test] + fn replicate_stores_new_blob() { + let remote = vec![rb("oid1", "cidA", &["did:key:zA"])]; + let have = HashMap::new(); + assert_eq!(blobs_needing_replication(&remote, &have), remote); + } + + #[test] + fn replicate_skips_already_present_same_recipients() { + let remote = vec![rb("oid1", "cidA", &["did:key:zA", "did:key:zB"])]; + let mut have = HashMap::new(); + // stored in a different order: must still count as present + have.insert( + "oid1".to_string(), + vec!["did:key:zB".to_string(), "did:key:zA".to_string()], + ); + assert!(blobs_needing_replication(&remote, &have).is_empty()); + } + + #[test] + fn replicate_restores_on_recipient_change() { + let remote = vec![rb("oid1", "cidB", &["did:key:zA", "did:key:zC"])]; + let mut have = HashMap::new(); + have.insert("oid1".to_string(), vec!["did:key:zA".to_string()]); + assert_eq!(blobs_needing_replication(&remote, &have), remote); + } + + #[test] + fn replicate_empty_remote_is_noop() { + assert!(blobs_needing_replication(&[], &HashMap::new()).is_empty()); + } + + #[test] + fn replicate_response_parses() { + let json = r#"{"blobs":[{"oid":"o1","cid":"c1","recipients":["did:key:zA"]}]}"#; + let parsed: ReplicateResponse = serde_json::from_str(json).unwrap(); + assert_eq!(parsed.blobs.len(), 1); + assert_eq!(parsed.blobs[0].oid, "o1"); + assert_eq!(parsed.blobs[0].recipients, vec!["did:key:zA".to_string()]); + } + + #[test] + fn replicate_response_empty_blobs_parses() { + let parsed: ReplicateResponse = serde_json::from_str(r#"{"blobs":[]}"#).unwrap(); + assert!(parsed.blobs.is_empty()); + } + fn g(args: &[&str], dir: &Path) 
{ assert!(Command::new("git") .args(args) From c65ef9e2d2120a3acb01d9e1dca9c75d8bb34c88 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:51:09 -0500 Subject: [PATCH 46/58] feat(node): anchor_encrypted_manifest for Option B3 Arweave manifests --- crates/gitlawb-node/src/arweave.rs | 152 +++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/crates/gitlawb-node/src/arweave.rs b/crates/gitlawb-node/src/arweave.rs index a88f31f..5027bc8 100644 --- a/crates/gitlawb-node/src/arweave.rs +++ b/crates/gitlawb-node/src/arweave.rs @@ -103,6 +103,99 @@ pub async fn anchor_ref_update( Ok(tx_id) } +/// A per-push manifest of the blobs encrypted this push (Option B3). The +/// `blobs` slice is `(oid, cid, recipients)` tuples. Anchored directly to +/// Arweave as its JSON body so the discovery index survives total node loss. +pub struct EncryptedManifest<'a> { + pub repo: &'a str, + pub owner_did: &'a str, + pub node_did: &'a str, + pub timestamp: &'a str, + pub blobs: &'a [(String, String, Vec<String>)], +} + +/// Anchor a per-push encrypted-blob manifest to Arweave via Irys. The manifest +/// JSON body is the payload (not a CID pointer to IPFS), so the index is +/// permanent and self-contained. Recipient DIDs are already public via the +/// pinned envelopes, so the manifest carries no new secret. +/// +/// Returns the Irys/Arweave transaction ID, or `Ok("")` when `irys_url` is empty +/// (anchoring disabled) or there are no blobs to anchor.
+pub async fn anchor_encrypted_manifest( + client: &reqwest::Client, + irys_url: &str, + manifest: &EncryptedManifest<'_>, +) -> Result<String> { + if irys_url.is_empty() || manifest.blobs.is_empty() { + return Ok(String::new()); + } + + let blobs_json: Vec<serde_json::Value> = manifest + .blobs + .iter() + .map(|(oid, cid, recipients)| json!({ "oid": oid, "cid": cid, "recipients": recipients })) + .collect(); + + let payload = json!({ + "schema": "gitlawb/encrypted-manifest/v1", + "repo": manifest.repo, + "owner_did": manifest.owner_did, + "node_did": manifest.node_did, + "timestamp": manifest.timestamp, + "blobs": blobs_json, + }); + + let body = serde_json::to_vec(&payload)?; + let url = format!("{}/upload", irys_url.trim_end_matches('/')); + + let resp = client + .post(&url) + .header("Content-Type", "application/json") + .header("x-irys-tags", build_manifest_tags_header(manifest)) + .body(body) + .send() + .await + .map_err(|e| anyhow::anyhow!("Irys upload failed: {e}"))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + return Err(anyhow::anyhow!("Irys returned {status}: {body}")); + } + + let json: serde_json::Value = resp + .json() + .await + .map_err(|e| anyhow::anyhow!("failed to parse Irys response: {e}"))?; + + let tx_id = json["id"] + .as_str() + .ok_or_else(|| anyhow::anyhow!("no 'id' in Irys response: {json}"))? + .to_string(); + + tracing::info!( + repo = %manifest.repo, + tx_id = %tx_id, + blobs = manifest.blobs.len(), + "anchored encrypted manifest to Arweave" + ); + + Ok(tx_id) +} + +/// Build the Irys tag header for an encrypted-blob manifest. `Repo` and `Schema` +/// are the tags the `gl` recovery query filters on.
+fn build_manifest_tags_header(manifest: &EncryptedManifest<'_>) -> String { + [ + "App-Name:gitlawb".to_string(), + "Schema:gitlawb/encrypted-manifest/v1".to_string(), + format!("Repo:{}", sanitize_tag(manifest.repo)), + format!("Owner-DID:{}", sanitize_tag(manifest.owner_did)), + format!("Node-DID:{}", sanitize_tag(manifest.node_did)), + ] + .join(",") +} + /// Arweave permanent URL for a given Irys transaction ID. pub fn arweave_url(tx_id: &str) -> String { format!("https://arweave.net/{tx_id}") @@ -193,6 +286,65 @@ mod tests { ); } + #[tokio::test] + async fn test_manifest_anchor_noop_when_url_empty() { + let client = reqwest::Client::new(); + let blobs = vec![("oid1".to_string(), "cid1".to_string(), vec!["did:key:zA".to_string()])]; + let m = EncryptedManifest { + repo: "alice/r", + owner_did: "did:key:zO", + node_did: "did:key:zN", + timestamp: "2026-06-11T00:00:00Z", + blobs: &blobs, + }; + assert_eq!(anchor_encrypted_manifest(&client, "", &m).await.unwrap(), ""); + } + + #[tokio::test] + async fn test_manifest_anchor_noop_when_no_blobs() { + let client = reqwest::Client::new(); + let blobs: Vec<(String, String, Vec<String>)> = vec![]; + let m = EncryptedManifest { + repo: "alice/r", + owner_did: "did:key:zO", + node_did: "did:key:zN", + timestamp: "2026-06-11T00:00:00Z", + blobs: &blobs, + }; + // Non-empty URL, but no blobs: still a no-op.
+ assert_eq!( + anchor_encrypted_manifest(&client, "https://example.invalid", &m) + .await + .unwrap(), + "" + ); + } + + #[tokio::test] + async fn test_manifest_anchor_success() { + let mut server = mockito::Server::new_async().await; + let _mock = server + .mock("POST", "/upload") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"id":"MANIFESTTX123","timestamp":1710000000000,"version":"1.0.0"}"#) + .create_async() + .await; + + let client = reqwest::Client::new(); + let blobs = vec![("oid1".to_string(), "cid1".to_string(), vec!["did:key:zA".to_string()])]; + let m = EncryptedManifest { + repo: "alice/r", + owner_did: "did:key:zO", + node_did: "did:key:zN", + timestamp: "2026-06-11T00:00:00Z", + blobs: &blobs, + }; + let r = anchor_encrypted_manifest(&client, &server.url(), &m).await; + assert_eq!(r.unwrap(), "MANIFESTTX123"); + _mock.assert_async().await; + } + #[test] fn test_sanitize_tag() { assert_eq!(sanitize_tag("alice/myrepo"), "alice/myrepo"); From 7ec7aba2eacd05e11ee339512b25538a07a0615b Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:52:14 -0500 Subject: [PATCH 47/58] feat(node): encrypt_and_pin returns the per-push sealed delta --- crates/gitlawb-node/src/encrypted_pin.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/gitlawb-node/src/encrypted_pin.rs b/crates/gitlawb-node/src/encrypted_pin.rs index 6ca1382..50797b5 100644 --- a/crates/gitlawb-node/src/encrypted_pin.rs +++ b/crates/gitlawb-node/src/encrypted_pin.rs @@ -20,13 +20,16 @@ fn did_to_key(did: &str) -> Option { } /// Encrypt and pin every withheld blob. `recipients` maps blob oid -> DID set. +/// Returns `(oid, cid, recipients)` for each blob actually sealed and recorded +/// this call (the per-push delta), used by Option B3 to anchor a manifest. 
pub async fn encrypt_and_pin( ipfs_api: &str, repo_path: &Path, db: &Db, repo_id: &str, recipients: &HashMap>, -) { +) -> Vec<(String, String, Vec)> { + let mut sealed = Vec::new(); for (oid, dids) in recipients { // Skip only if an existing envelope already covers exactly these // recipients. If the recipient set changed (e.g. a reader was added to @@ -64,6 +67,9 @@ pub async fn encrypt_and_pin( .await { tracing::warn!(oid = %oid, err = %e, "record_encrypted_blob failed"); + continue; } + sealed.push((oid.clone(), cid.clone(), dids_vec)); } + sealed } From eb70bf6cde556b29676f1d41e05f205f73e8c4a9 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:54:14 -0500 Subject: [PATCH 48/58] feat(node): anchor encrypted-blob manifest on push (Option B3) --- crates/gitlawb-node/src/api/repos.rs | 42 +++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index dfdfcba..1880848 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -666,6 +666,10 @@ pub async fn git_receive_pack( let repo_id = record.id.clone(); let owner_did = record.owner_did.clone(); let is_public = record.is_public; + let irys_url = state.config.irys_url.clone(); + let http_client = std::sync::Arc::clone(&state.http_client); + let node_did_str = state.node_did.to_string(); + let repo_name = record.name.clone(); tokio::spawn(async move { let pinned = crate::ipfs_pin::pin_new_objects( &ipfs_api, @@ -693,7 +697,7 @@ pub async fn git_receive_pack( }) .await; if let Ok(Ok(recipients)) = recip { - crate::encrypted_pin::encrypt_and_pin( + let delta = crate::encrypted_pin::encrypt_and_pin( &ipfs_api, &repo_path_clone, &db_clone, @@ -701,6 +705,42 @@ pub async fn git_receive_pack( &recipients, ) .await; + + // Option B3: anchor a per-push manifest of the blobs sealed + // this push to Arweave, so the 
oid->cid index survives total + // node loss. Best-effort; never fails the push. + if !delta.is_empty() && !irys_url.is_empty() { + let owner_short = + owner_did.split(':').next_back().unwrap_or(&owner_did); + let repo_slug = format!("{owner_short}/{repo_name}"); + let ts = chrono::Utc::now().to_rfc3339(); + let manifest = crate::arweave::EncryptedManifest { + repo: &repo_slug, + owner_did: &owner_did, + node_did: &node_did_str, + timestamp: &ts, + blobs: &delta, + }; + match crate::arweave::anchor_encrypted_manifest( + &http_client, + &irys_url, + &manifest, + ) + .await + { + Ok(tx) if !tx.is_empty() => tracing::info!( + repo = %repo_slug, + tx_id = %tx, + "anchored encrypted manifest to Arweave" + ), + Ok(_) => {} + Err(e) => tracing::warn!( + repo = %repo_slug, + err = %e, + "encrypted manifest anchor failed" + ), + } + } } } }); From 8038c9d7e89d85aa88d0d288b1ee621ebb43f96a Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:56:07 -0500 Subject: [PATCH 49/58] feat(gl): Arweave/IPFS gateway recovery for encrypted blobs (Option B3) --- crates/gl/src/clone.rs | 232 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 7023796..f19eb50 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -318,6 +318,177 @@ async fn recover_encrypted_blobs( Ok(recovered) } +/// One blob entry in an Arweave-anchored encrypted manifest. The manifest also +/// carries a `recipients` field per blob, but `gl` does not need it: authorization +/// is enforced by whether `open_blob` can decrypt with the caller's key. Unknown +/// JSON fields are ignored by serde, so `recipients` is simply not declared here. +#[derive(Deserialize)] +struct ManifestBlob { + oid: String, + cid: String, +} + +/// An Arweave-anchored per-push encrypted manifest (Option B3). 
+#[derive(Deserialize)] +struct Manifest { + #[serde(default)] + timestamp: String, + #[serde(default)] + blobs: Vec<ManifestBlob>, +} + +/// Extract transaction ids from an Arweave GraphQL `transactions` response. +fn parse_tx_ids(v: &serde_json::Value) -> Vec<String> { + v.get("data") + .and_then(|d| d.get("transactions")) + .and_then(|t| t.get("edges")) + .and_then(|e| e.as_array()) + .map(|edges| { + edges + .iter() + .filter_map(|edge| { + edge.get("node") + .and_then(|n| n.get("id")) + .and_then(|i| i.as_str()) + .map(String::from) + }) + .collect() + }) + .unwrap_or_default() +} + +/// Merge per-push manifests into a single `oid -> cid` map, latest-wins by the +/// manifest `timestamp` (RFC3339, compared lexicographically; a later push that +/// re-sealed a blob overrides the earlier entry). +fn merge_manifests(manifests: Vec<Manifest>) -> std::collections::HashMap<String, String> { + let mut best: std::collections::HashMap<String, (String, String)> = + std::collections::HashMap::new(); // oid -> (cid, timestamp) + for m in manifests { + for b in m.blobs { + match best.get(&b.oid) { + Some((_, ts)) if ts.as_str() >= m.timestamp.as_str() => {} + _ => { + best.insert(b.oid, (b.cid, m.timestamp.clone())); + } + } + } + } + best.into_iter().map(|(oid, (cid, _))| (oid, cid)).collect() +} + +/// Option B3 fallback recovery, with no dependency on a gitlawb node API. Query +/// the Arweave gateway for this repo's encrypted manifests, merge them, and for +/// each blob still missing locally that the caller can decrypt, pull the envelope +/// from a public IPFS gateway, decrypt, and install it as a loose object. Returns +/// the repo-relative paths recovered. Best-effort; silent when gateways are +/// unreachable, leaving the clone exactly as node-based recovery left it.
+async fn recover_from_arweave( + arweave_gateway: &str, + ipfs_gateway: &str, + owner: &str, + name: &str, + dest: &Path, + keypair: &gitlawb_core::identity::Keypair, +) -> Result<Vec<String>> { + use gitlawb_core::encrypt::open_blob; + use std::collections::HashMap; + use std::io::Write; + + let dest_str = dest.to_str().context("dest path not utf-8")?; + let owner_short = owner.split(':').next_back().unwrap_or(owner); + let slug = format!("{owner_short}/{name}"); + let ag = arweave_gateway.trim_end_matches('/'); + let ig = ipfs_gateway.trim_end_matches('/'); + let client = reqwest::Client::new(); + + // 1. Discover manifest transaction ids via Arweave GraphQL. + let query = r#"query($repo:String!){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100){edges{node{id}}}}"#; + let gql_body = serde_json::json!({ "query": query, "variables": { "repo": slug } }); + let resp = match client.post(format!("{ag}/graphql")).json(&gql_body).send().await { + Ok(r) if r.status().is_success() => r, + _ => return Ok(vec![]), + }; + let gql: serde_json::Value = match resp.json().await { + Ok(v) => v, + Err(_) => return Ok(vec![]), + }; + let tx_ids = parse_tx_ids(&gql); + if tx_ids.is_empty() { + return Ok(vec![]); + } + + // 2. Fetch and parse each manifest body, then merge latest-wins per oid. + let mut manifests = Vec::new(); + for tx in tx_ids { + let m = match client.get(format!("{ag}/{tx}")).send().await { + Ok(r) if r.status().is_success() => r, + _ => continue, + }; + if let Ok(parsed) = m.json::<Manifest>().await { + manifests.push(parsed); + } + } + let oid_cid = merge_manifests(manifests); + if oid_cid.is_empty() { + return Ok(vec![]); + } + + // Map oid -> repo-relative path from the cloned tree.
+ let ls = Command::new("git") + .args(["-C", dest_str, "ls-tree", "-r", "HEAD"]) + .output()?; + let mut oid_to_path: HashMap<String, String> = HashMap::new(); + for line in String::from_utf8_lossy(&ls.stdout).lines() { + if let Some((meta, path)) = line.split_once('\t') { + if let Some(oid) = meta.split_whitespace().nth(2) { + oid_to_path.insert(oid.to_string(), path.to_string()); + } + } + } + + // 3. Recover each missing blob the caller can decrypt. + let mut recovered = Vec::new(); + for (oid, cid) in oid_cid { + let present = Command::new("git") + .args(["-C", dest_str, "cat-file", "-e", &oid]) + .status() + .map(|s| s.success()) + .unwrap_or(false); + if present { + continue; + } + let env_resp = match client.get(format!("{ig}/ipfs/{cid}")).send().await { + Ok(r) if r.status().is_success() => r, + _ => continue, + }; + let Ok(envelope) = env_resp.bytes().await else { + continue; + }; + // open_blob succeeds only if this caller is a recipient: this is the + // authorization gate (no node, no DID check needed).
+ let plaintext = match open_blob(&envelope, keypair) { + Ok(p) => p, + Err(_) => continue, + }; + let mut child = Command::new("git") + .args(["-C", dest_str, "hash-object", "-w", "-t", "blob", "--stdin"]) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn()?; + child.stdin.take().unwrap().write_all(&plaintext)?; + let out = child.wait_with_output()?; + let written = String::from_utf8_lossy(&out.stdout).trim().to_string(); + if written == oid { + if let Some(p) = oid_to_path.get(&oid) { + recovered.push(p.clone()); + } + } else { + eprintln!("warning: recovered blob {oid} hashed to {written}; discarding"); + } + } + Ok(recovered) +} + pub async fn run(args: CloneArgs) -> Result<()> { let (url, owner, name) = parse_repo(&args.repo)?; let dest_name = args.dir.unwrap_or_else(|| name.clone()); @@ -554,6 +725,67 @@ mod tests { .is_err()); } + #[test] + fn parse_tx_ids_extracts_node_ids() { + let v: serde_json::Value = serde_json::from_str( + r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}},{"node":{"id":"TX2"}}]}}}"#, + ) + .unwrap(); + assert_eq!(parse_tx_ids(&v), vec!["TX1".to_string(), "TX2".to_string()]); + } + + #[test] + fn parse_tx_ids_empty_on_no_edges() { + let v: serde_json::Value = + serde_json::from_str(r#"{"data":{"transactions":{"edges":[]}}}"#).unwrap(); + assert!(parse_tx_ids(&v).is_empty()); + } + + #[test] + fn manifest_parses_and_ignores_recipients() { + let m: Manifest = serde_json::from_str( + r#"{"timestamp":"2026-06-11T00:00:00Z","blobs":[{"oid":"o1","cid":"c1","recipients":["did:key:zA"]}]}"#, + ) + .unwrap(); + assert_eq!(m.timestamp, "2026-06-11T00:00:00Z"); + assert_eq!(m.blobs.len(), 1); + assert_eq!(m.blobs[0].oid, "o1"); + assert_eq!(m.blobs[0].cid, "c1"); + } + + #[test] + fn merge_manifests_latest_wins_per_oid() { + let older = Manifest { + timestamp: "2026-06-10T00:00:00Z".to_string(), + blobs: vec![ManifestBlob { oid: "o1".to_string(), cid: "cidOLD".to_string() }], + }; + let newer = 
Manifest { + timestamp: "2026-06-11T00:00:00Z".to_string(), + blobs: vec![ + ManifestBlob { oid: "o1".to_string(), cid: "cidNEW".to_string() }, + ManifestBlob { oid: "o2".to_string(), cid: "cid2".to_string() }, + ], + }; + let merged = merge_manifests(vec![older, newer]); + assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); + assert_eq!(merged.get("o2").map(String::as_str), Some("cid2")); + } + + #[test] + fn merge_manifests_is_order_independent() { + let older = Manifest { + timestamp: "2026-06-10T00:00:00Z".to_string(), + blobs: vec![ManifestBlob { oid: "o1".to_string(), cid: "cidOLD".to_string() }], + }; + let newer = Manifest { + timestamp: "2026-06-11T00:00:00Z".to_string(), + blobs: vec![ManifestBlob { oid: "o1".to_string(), cid: "cidNEW".to_string() }], + }; + // Newer first, older second: newer must still win. + let merged = merge_manifests(vec![newer, older]); + assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); + } + #[test] fn parse_repo_accepts_url_and_bare() { let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); From 0631df76bd0d38cb7ded8ca5dca7fb7b207bc8f0 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:57:25 -0500 Subject: [PATCH 50/58] feat(gl): transparent Arweave fallback recovery on clone (Option B3) --- crates/gl/src/clone.rs | 57 +++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index f19eb50..62e6198 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -29,6 +29,15 @@ pub struct CloneArgs { #[arg(long, default_value = "https://node.gitlawb.com", env = "GITLAWB_NODE")] pub node: String, + + /// Arweave gateway for B3 manifest discovery/fetch when a node cannot supply + /// the encrypted-blob mapping. 
+ #[arg(long, default_value = "https://arweave.net", env = "GITLAWB_ARWEAVE_GATEWAY")] + pub arweave_gateway: String, + + /// Public IPFS gateway for fetching encrypted envelopes during B3 recovery. + #[arg(long, default_value = "https://dweb.link", env = "GITLAWB_IPFS_GATEWAY")] + pub ipfs_gateway: String, } /// Run a git command inside `dir`, erroring with stderr on failure. @@ -510,26 +519,40 @@ pub async fn run(args: CloneArgs) -> Result<()> { setup_partial_clone(&dest, &url, &withheld, &reinclude, args.branch.as_deref())?; if let Ok(keypair) = load_keypair_from_dir(None) { - if let Ok(paths) = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair).await - { - if !paths.is_empty() { - // Re-include recovered paths if this was a sparse clone, then - // materialize them in the working tree. - let spec = dest.join(".git/info/sparse-checkout"); - if spec.exists() { - if let Ok(mut s) = std::fs::read_to_string(&spec) { - for p in &paths { - s.push_str(&format!("/{p}\n")); - } - let _ = std::fs::write(&spec, s); + // Node-based recovery first (B1/B2), then the B3 Arweave/IPFS gateway + // fallback for any authorized blobs the node could not supply. + let mut paths = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair) + .await + .unwrap_or_default(); + let from_arweave = recover_from_arweave( + &args.arweave_gateway, + &args.ipfs_gateway, + &owner, + &name, + &dest, + &keypair, + ) + .await + .unwrap_or_default(); + paths.extend(from_arweave); + + if !paths.is_empty() { + // Re-include recovered paths if this was a sparse clone, then + // materialize them in the working tree. 
+ let spec = dest.join(".git/info/sparse-checkout"); + if spec.exists() { + if let Ok(mut s) = std::fs::read_to_string(&spec) { + for p in &paths { + s.push_str(&format!("/{p}\n")); } + let _ = std::fs::write(&spec, s); } - let _ = git(&dest, &["checkout", "--", "."]); - println!( - "Recovered {} private file(s) you are authorized to read", - paths.len() - ); } + let _ = git(&dest, &["checkout", "--", "."]); + println!( + "Recovered {} private file(s) you are authorized to read", + paths.len() + ); } } From 3d11e4d46076d9dd12958eccfc313b10c1f84101 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 20:58:13 -0500 Subject: [PATCH 51/58] style: cargo fmt --- crates/gitlawb-node/src/api/repos.rs | 3 +- crates/gitlawb-node/src/arweave.rs | 17 +++++++++-- crates/gl/src/clone.rs | 44 +++++++++++++++++++++++----- 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 1880848..658b38b 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -710,8 +710,7 @@ pub async fn git_receive_pack( // this push to Arweave, so the oid->cid index survives total // node loss. Best-effort; never fails the push. 
if !delta.is_empty() && !irys_url.is_empty() { - let owner_short = - owner_did.split(':').next_back().unwrap_or(&owner_did); + let owner_short = owner_did.split(':').next_back().unwrap_or(&owner_did); let repo_slug = format!("{owner_short}/{repo_name}"); let ts = chrono::Utc::now().to_rfc3339(); let manifest = crate::arweave::EncryptedManifest { diff --git a/crates/gitlawb-node/src/arweave.rs b/crates/gitlawb-node/src/arweave.rs index 5027bc8..c6cdd3d 100644 --- a/crates/gitlawb-node/src/arweave.rs +++ b/crates/gitlawb-node/src/arweave.rs @@ -289,7 +289,11 @@ mod tests { #[tokio::test] async fn test_manifest_anchor_noop_when_url_empty() { let client = reqwest::Client::new(); - let blobs = vec![("oid1".to_string(), "cid1".to_string(), vec!["did:key:zA".to_string()])]; + let blobs = vec![( + "oid1".to_string(), + "cid1".to_string(), + vec!["did:key:zA".to_string()], + )]; let m = EncryptedManifest { repo: "alice/r", owner_did: "did:key:zO", @@ -297,7 +301,10 @@ mod tests { timestamp: "2026-06-11T00:00:00Z", blobs: &blobs, }; - assert_eq!(anchor_encrypted_manifest(&client, "", &m).await.unwrap(), ""); + assert_eq!( + anchor_encrypted_manifest(&client, "", &m).await.unwrap(), + "" + ); } #[tokio::test] @@ -332,7 +339,11 @@ mod tests { .await; let client = reqwest::Client::new(); - let blobs = vec![("oid1".to_string(), "cid1".to_string(), vec!["did:key:zA".to_string()])]; + let blobs = vec![( + "oid1".to_string(), + "cid1".to_string(), + vec!["did:key:zA".to_string()], + )]; let m = EncryptedManifest { repo: "alice/r", owner_did: "did:key:zO", diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 62e6198..40a0ee3 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -32,11 +32,19 @@ pub struct CloneArgs { /// Arweave gateway for B3 manifest discovery/fetch when a node cannot supply /// the encrypted-blob mapping. 
- #[arg(long, default_value = "https://arweave.net", env = "GITLAWB_ARWEAVE_GATEWAY")] + #[arg( + long, + default_value = "https://arweave.net", + env = "GITLAWB_ARWEAVE_GATEWAY" + )] pub arweave_gateway: String, /// Public IPFS gateway for fetching encrypted envelopes during B3 recovery. - #[arg(long, default_value = "https://dweb.link", env = "GITLAWB_IPFS_GATEWAY")] + #[arg( + long, + default_value = "https://dweb.link", + env = "GITLAWB_IPFS_GATEWAY" + )] pub ipfs_gateway: String, } @@ -413,7 +421,12 @@ async fn recover_from_arweave( // 1. Discover manifest transaction ids via Arweave GraphQL. let query = r#"query($repo:String!){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100){edges{node{id}}}}"#; let gql_body = serde_json::json!({ "query": query, "variables": { "repo": slug } }); - let resp = match client.post(format!("{ag}/graphql")).json(&gql_body).send().await { + let resp = match client + .post(format!("{ag}/graphql")) + .json(&gql_body) + .send() + .await + { Ok(r) if r.status().is_success() => r, _ => return Ok(vec![]), }; @@ -780,13 +793,22 @@ mod tests { fn merge_manifests_latest_wins_per_oid() { let older = Manifest { timestamp: "2026-06-10T00:00:00Z".to_string(), - blobs: vec![ManifestBlob { oid: "o1".to_string(), cid: "cidOLD".to_string() }], + blobs: vec![ManifestBlob { + oid: "o1".to_string(), + cid: "cidOLD".to_string(), + }], }; let newer = Manifest { timestamp: "2026-06-11T00:00:00Z".to_string(), blobs: vec![ - ManifestBlob { oid: "o1".to_string(), cid: "cidNEW".to_string() }, - ManifestBlob { oid: "o2".to_string(), cid: "cid2".to_string() }, + ManifestBlob { + oid: "o1".to_string(), + cid: "cidNEW".to_string(), + }, + ManifestBlob { + oid: "o2".to_string(), + cid: "cid2".to_string(), + }, ], }; let merged = merge_manifests(vec![older, newer]); @@ -798,11 +820,17 @@ mod tests { fn merge_manifests_is_order_independent() { let older = Manifest 
{ timestamp: "2026-06-10T00:00:00Z".to_string(), - blobs: vec![ManifestBlob { oid: "o1".to_string(), cid: "cidOLD".to_string() }], + blobs: vec![ManifestBlob { + oid: "o1".to_string(), + cid: "cidOLD".to_string(), + }], }; let newer = Manifest { timestamp: "2026-06-11T00:00:00Z".to_string(), - blobs: vec![ManifestBlob { oid: "o1".to_string(), cid: "cidNEW".to_string() }], + blobs: vec![ManifestBlob { + oid: "o1".to_string(), + cid: "cidNEW".to_string(), + }], }; // Newer first, older second: newer must still win. let merged = merge_manifests(vec![newer, older]); From 5d95271c11ba18a38467d891ccb26c268b43dcb6 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 10 Jun 2026 21:04:59 -0500 Subject: [PATCH 52/58] fix(gl): bound Arweave recovery gateway requests with a 30s timeout --- crates/gl/src/clone.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 40a0ee3..0631608 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -416,7 +416,13 @@ async fn recover_from_arweave( let slug = format!("{owner_short}/{name}"); let ag = arweave_gateway.trim_end_matches('/'); let ig = ipfs_gateway.trim_end_matches('/'); - let client = reqwest::Client::new(); + // Bound every gateway request: this runs on every clone, so a slow or hung + // public gateway must not stall it. Best-effort recovery, so a timeout just + // skips the affected blob. + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); // 1. Discover manifest transaction ids via Arweave GraphQL. 
let query = r#"query($repo:String!){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100){edges{node{id}}}}"#; From 2131b0c5049cb02ec0d04af76015730f87e7db32 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 11 Jun 2026 01:39:21 -0500 Subject: [PATCH 53/58] test(gl): read-path recovery tests for Arweave fallback; quiet promisor present-check Add two hermetic integration tests for recover_from_arweave that drive the full read path over mocked Arweave GraphQL + IPFS gateways: discover the manifest, fetch it, fetch the envelope, decrypt, and install the withheld blob. One covers an authorized recipient (blob installed), the other a non-recipient (nothing recovered). Both simulate origin death by removing the promisor remote and enable uploadpack.allowFilter so the blob is truly withheld over file://. Also harden the local presence check in recover_from_arweave with GIT_NO_LAZY_FETCH=1 and .output() so the expected 'missing object' case does not trigger a wasted promisor fetch or leak git stderr to the user. --- crates/gl/src/clone.rs | 196 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 2 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 0631608..ddc1e2d 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -477,10 +477,15 @@ async fn recover_from_arweave( // 3. Recover each missing blob the caller can decrypt. let mut recovered = Vec::new(); for (oid, cid) in oid_cid { + // Local presence check. GIT_NO_LAZY_FETCH stops git from making a wasted + // promisor fetch attempt (we are recovering precisely because the promisor + // cannot supply the blob), and `.output()` captures git's "missing object" + // stderr so that expected case does not leak a confusing error to the user. 
let present = Command::new("git") .args(["-C", dest_str, "cat-file", "-e", &oid]) - .status() - .map(|s| s.success()) + .env("GIT_NO_LAZY_FETCH", "1") + .output() + .map(|o| o.status.success()) .unwrap_or(false); if present { continue; @@ -843,6 +848,193 @@ mod tests { assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); } + /// Read-path end-to-end over a mocked Arweave + IPFS gateway: discover the + /// manifest via GraphQL, fetch it, fetch the envelope, decrypt with the + /// caller's key, and install the previously-withheld blob. + #[tokio::test] + async fn recover_from_arweave_installs_authorized_blob() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + // Make the bare honor `--filter=blob:none` over file:// so the withheld + // blob is genuinely omitted from the local store, not just unchecked-out. + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + assert!( + !dest.join("secret/b.txt").exists(), + "secret starts withheld" + ); + + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + + // Simulate origin death: drop the promisor remote so `cat-file -e` cannot + // lazily fetch the withheld blob. This is exactly the B3 premise (the node + // can no longer serve it), and forces recovery to go through Arweave/IPFS. 
+ std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + + let cid = "testcid123"; + let mut server = mockito::Server::new_async().await; + let _gql = server + .mock("POST", "/graphql") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}}]}}}"#) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _tx = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .unwrap(); + assert_eq!(paths, vec!["secret/b.txt".to_string()]); + + let present = Command::new("git") + .args(["-C", dest.to_str().unwrap(), "cat-file", "-e", &oid]) + .env("GIT_NO_LAZY_FETCH", "1") + .output() + .unwrap() + .status + .success(); + assert!( + present, + "authorized reader's blob must be installed locally" + ); + } + + /// A caller who is not a recipient cannot decrypt the envelope, so nothing is + /// recovered even though the manifest and envelope are reachable. 
+ #[tokio::test] + async fn recover_from_arweave_skips_unauthorized() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + + // Simulate origin death (see the authorized test) so the withheld blob + // cannot be lazily fetched from the promisor remote. + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + // Sealed to a different reader; the caller below is not a recipient. 
+ let authorized = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[authorized.verifying_key()]).unwrap(); + let intruder = Keypair::generate(); + + let cid = "testcid123"; + let mut server = mockito::Server::new_async().await; + let _gql = server + .mock("POST", "/graphql") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}}]}}}"#) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _tx = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &intruder, + ) + .await + .unwrap(); + assert!(paths.is_empty(), "non-recipient must recover nothing"); + + let present = Command::new("git") + .args(["-C", dest.to_str().unwrap(), "cat-file", "-e", &oid]) + .env("GIT_NO_LAZY_FETCH", "1") + .output() + .unwrap() + .status + .success(); + assert!(!present, "non-recipient must not install the blob"); + } + #[test] fn parse_repo_accepts_url_and_bare() { let (url, o, n) = parse_repo("gitlawb://did:key:zAbc/myrepo").unwrap(); From a939a65ece253273880f9e4d5241b0d239fd6cbb Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 11 Jun 2026 06:43:03 -0500 Subject: [PATCH 54/58] fix: surface corrupt recipients JSON and silent recovery I/O failures db: parse_recipients now propagates a descriptive error instead of defaulting corrupt recipients JSON to an empty list, which would have denied authorized readers and handed peers incomplete metadata. 
gl: clone recovery now warns when the sparse-checkout file cannot be read or written, or when the post-recovery checkout fails, instead of silently discarding those errors and claiming files were recovered. --- crates/gitlawb-node/src/db/mod.rs | 27 ++++++++++++++++++++------- crates/gl/src/clone.rs | 25 ++++++++++++++++++++----- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 447fbe2..edddb2e 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -1659,6 +1659,16 @@ impl Db { Ok(()) } + /// Deserialize the stored recipients JSON. Corruption is surfaced as an + /// error rather than silently treated as an empty recipient list, which + /// would deny access to every legitimate reader and hand peers incomplete + /// replication metadata. + fn parse_recipients(repo_id: &str, oid: &str, raw: &str) -> Result<Vec<String>> { + serde_json::from_str(raw).with_context(|| { + format!("corrupt recipients JSON in encrypted_blobs (repo_id={repo_id}, oid={oid})") + }) + } + /// (oid, cid) for every encrypted blob in the repo that `caller` may decrypt.
pub async fn list_encrypted_blobs_for( &self, @@ -1675,7 +1685,7 @@ impl Db { let oid: String = row.get("oid"); let cid: String = row.get("cid"); let recipients: String = row.get("recipients"); - let recipients: Vec = serde_json::from_str(&recipients).unwrap_or_default(); + let recipients = Self::parse_recipients(repo_id, &oid, &recipients)?; if recipients.iter().any(|d| d == caller) { out.push((oid, cid)); } @@ -1701,7 +1711,7 @@ impl Db { let oid: String = row.get("oid"); let cid: String = row.get("cid"); let recipients: String = row.get("recipients"); - let recipients: Vec = serde_json::from_str(&recipients).unwrap_or_default(); + let recipients = Self::parse_recipients(repo_id, &oid, &recipients)?; out.push((oid, cid, recipients)); } Ok(out) @@ -1723,7 +1733,7 @@ impl Db { .await?; let Some(row) = row else { return Ok(None) }; let recipients: String = row.get("recipients"); - let recipients: Vec = serde_json::from_str(&recipients).unwrap_or_default(); + let recipients = Self::parse_recipients(repo_id, oid, &recipients)?; if recipients.iter().any(|d| d == caller) { Ok(Some(row.get("cid"))) } else { @@ -1744,10 +1754,13 @@ impl Db { .bind(oid) .fetch_optional(&self.pool) .await?; - Ok(row.map(|r| { - let recipients: String = r.get("recipients"); - serde_json::from_str::>(&recipients).unwrap_or_default() - })) + match row { + None => Ok(None), + Some(r) => { + let recipients: String = r.get("recipients"); + Ok(Some(Self::parse_recipients(repo_id, oid, &recipients)?)) + } + } } pub async fn list_pinned_cids(&self) -> Result> { diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index ddc1e2d..93e998d 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -565,14 +565,29 @@ pub async fn run(args: CloneArgs) -> Result<()> { // materialize them in the working tree. 
let spec = dest.join(".git/info/sparse-checkout"); if spec.exists() { - if let Ok(mut s) = std::fs::read_to_string(&spec) { - for p in &paths { - s.push_str(&format!("/{p}\n")); + match std::fs::read_to_string(&spec) { + Ok(mut s) => { + for p in &paths { + s.push_str(&format!("/{p}\n")); + } + if let Err(e) = std::fs::write(&spec, &s) { + eprintln!( + "warning: failed to update sparse-checkout, recovered files may not appear: {e}" + ); + } + } + Err(e) => { + eprintln!( + "warning: failed to read sparse-checkout, recovered files may not appear: {e}" + ); } - let _ = std::fs::write(&spec, s); } } - let _ = git(&dest, &["checkout", "--", "."]); + if let Err(e) = git(&dest, &["checkout", "--", "."]) { + eprintln!( + "warning: checkout after recovery failed, recovered files may not appear: {e}" + ); + } println!( "Recovered {} private file(s) you are authorized to read", paths.len() From d88b1179835a4caa0b6c3afa403c74855bfc920d Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 12 Jun 2026 12:13:08 -0500 Subject: [PATCH 55/58] feat(core): blind recipient identities in withheld-blob envelopes Envelope v2 drops the cleartext recipient public key (kid) from each wrapped-key header entry, so a party holding the public IPFS/Arweave copy can no longer enumerate who is authorized to decrypt a withheld blob. The version is bumped to 2 and v1 envelopes are rejected. open_blob now selects the reader's entry by trial decryption (the AEAD tag authenticates exactly one entry) instead of matching on the public key. Recipient count is still visible; identity is not. Scope is envelope-only, the node DB recipients column and peer replication metadata are unchanged. 
Co-authored-by: CommandCodeBot --- crates/gitlawb-core/src/encrypt.rs | 81 +++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 17 deletions(-) diff --git a/crates/gitlawb-core/src/encrypt.rs b/crates/gitlawb-core/src/encrypt.rs index b626581..72004a3 100644 --- a/crates/gitlawb-core/src/encrypt.rs +++ b/crates/gitlawb-core/src/encrypt.rs @@ -42,11 +42,10 @@ use rand::RngCore; use serde::{Deserialize, Serialize}; const MAGIC: &[u8] = b"GLENC"; -const VERSION: u8 = 1; +const VERSION: u8 = 2; #[derive(Serialize, Deserialize)] struct Recipient { - kid: String, // base64 recipient ed25519 pubkey (32B) eph: String, // base64 ephemeral x25519 pubkey (32B) nonce: String, // base64 box nonce (24B) wrap: String, // base64 wrapped content key @@ -84,7 +83,6 @@ pub fn seal_blob(plaintext: &[u8], recipients: &[VerifyingKey]) -> Result Result> { .context("decode header")?; let body = &envelope[p + hlen..]; - let my_kid = B64.encode(keypair.verifying_key().as_bytes()); let my_x = XSecret::from(x25519_secret_from_seed(&keypair.seed_bytes())); - let entry = header - .recipients - .iter() - .find(|r| r.kid == my_kid) - .context("not a recipient of this envelope")?; - let eph = XPublic::from(<[u8; 32]>::try_from(B64.decode(&entry.eph)?.as_slice())?); - let nonce = B64.decode(&entry.nonce)?; - let wrap = B64.decode(&entry.wrap)?; - let abox = ChaChaBox::new(&eph, &my_x); - let content_key = abox - .decrypt( + // Identities are blinded: no entry says which recipient it belongs to, so + // try each one. The ChaChaBox AEAD tag authenticates, so exactly the + // reader's own entry unwraps; every other entry fails cleanly. 
+ let mut content_key: Option<Vec<u8>> = None; + for entry in &header.recipients { + let eph = match B64 + .decode(&entry.eph) + .ok() + .and_then(|b| <[u8; 32]>::try_from(b.as_slice()).ok()) + { + Some(b) => XPublic::from(b), + None => continue, + }; + let nonce = match B64.decode(&entry.nonce) { + Ok(n) => n, + Err(_) => continue, + }; + let wrap = match B64.decode(&entry.wrap) { + Ok(w) => w, + Err(_) => continue, + }; + let abox = ChaChaBox::new(&eph, &my_x); + if let Ok(ck) = abox.decrypt( crypto_box::aead::generic_array::GenericArray::from_slice(&nonce), wrap.as_slice(), - ) - .map_err(|_| anyhow::anyhow!("content-key unwrap failed"))?; + ) { + content_key = Some(ck); + break; + } + } + let content_key = content_key.context("not a recipient of this envelope")?; let body_cipher = XChaCha20Poly1305::new_from_slice(&content_key) .map_err(|e| anyhow::anyhow!("content key: {e}"))?; @@ -194,4 +207,38 @@ mod tests { env[last] ^= 0x01; assert!(open_blob(&env, &owner).is_err()); } + + #[test] + fn v2_header_contains_no_recipient_pubkey() { + // The blinded envelope header must not carry any recipient's public key.
+ let reader = Keypair::generate(); + let env = seal_blob(b"private blob contents", &[reader.verifying_key()]).unwrap(); + + // Slice out the header bytes using the envelope framing: + // MAGIC | version(1B) | header_len(4B LE) | header_json | body + let mut p = MAGIC.len() + 1; // skip MAGIC + version byte + let hlen = u32::from_le_bytes(env[p..p + 4].try_into().unwrap()) as usize; + p += 4; + let header = &env[p..p + hlen]; + let header_str = String::from_utf8_lossy(header); + + let pubkey_b64 = B64.encode(reader.verifying_key().as_bytes()); + assert!( + !header_str.contains(&pubkey_b64), + "recipient public key must not appear in the blinded header" + ); + } + + #[test] + fn v1_envelope_is_rejected() { + let reader = Keypair::generate(); + let mut env = seal_blob(b"hi", &[reader.verifying_key()]).unwrap(); + // Flip the version byte (immediately after MAGIC) from 2 to 1. + env[MAGIC.len()] = 1; + let err = open_blob(&env, &reader).unwrap_err(); + assert!( + err.to_string().contains("unsupported envelope version"), + "expected version-rejection error, got: {err}" + ); + } } From 8c15cf5306904b658a852f48e3d8659b43ece1e7 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 12 Jun 2026 12:30:50 -0500 Subject: [PATCH 56/58] fix(core): reject malformed envelope nonces instead of panicking open_blob fed attacker-controlled envelopes to GenericArray::from_slice and XNonce::from_slice, which panic on a wrong-length input. Validate both the per-recipient box nonce and the body nonce to 24 bytes before use: skip a recipient entry whose nonce is malformed, and return an error for a malformed body nonce, so the public recovery path surfaces an error rather than panicking. 
--- crates/gitlawb-core/src/encrypt.rs | 55 +++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/crates/gitlawb-core/src/encrypt.rs b/crates/gitlawb-core/src/encrypt.rs index 72004a3..0336270 100644 --- a/crates/gitlawb-core/src/encrypt.rs +++ b/crates/gitlawb-core/src/encrypt.rs @@ -138,9 +138,15 @@ pub fn open_blob(envelope: &[u8], keypair: &Keypair) -> Result<Vec<u8>> { Some(b) => XPublic::from(b), None => continue, }; - let nonce = match B64.decode(&entry.nonce) { - Ok(n) => n, - Err(_) => continue, + // from_slice panics on a wrong length, and the envelope is attacker + // controlled, so validate the 24-byte box nonce before using it. + let nonce = match B64 + .decode(&entry.nonce) + .ok() + .and_then(|n| <[u8; 24]>::try_from(n.as_slice()).ok()) + { + Some(n) => n, + None => continue, }; let wrap = match B64.decode(&entry.wrap) { Ok(w) => w, @@ -159,7 +165,11 @@ pub fn open_blob(envelope: &[u8], keypair: &Keypair) -> Result<Vec<u8>> { let body_cipher = XChaCha20Poly1305::new_from_slice(&content_key) .map_err(|e| anyhow::anyhow!("content key: {e}"))?; - let body_nonce = B64.decode(&header.nonce)?; + let body_nonce = B64 + .decode(&header.nonce) + .ok() + .and_then(|n| <[u8; 24]>::try_from(n.as_slice()).ok()) + .context("invalid body nonce")?; body_cipher .decrypt(XNonce::from_slice(&body_nonce), body) .map_err(|_| anyhow::anyhow!("body decrypt failed")) @@ -241,4 +251,41 @@ mod tests { "expected version-rejection error, got: {err}" ); } + + #[test] + fn malformed_nonce_returns_err_not_panic() { + // from_slice panics on wrong-length input; a crafted envelope on the + // public recovery path must surface an error, never panic. + let reader = Keypair::generate(); + let env = seal_blob(b"private blob contents", &[reader.verifying_key()]).unwrap(); + + // Split the envelope framing into header JSON and body.
+ let mut p = MAGIC.len() + 1; + let hlen = u32::from_le_bytes(env[p..p + 4].try_into().unwrap()) as usize; + p += 4; + let header_bytes = &env[p..p + hlen]; + let body = &env[p + hlen..]; + + let reframe = |header: &serde_json::Value| -> Vec<u8> { + let hj = serde_json::to_vec(header).unwrap(); + let mut out = Vec::new(); + out.extend_from_slice(MAGIC); + out.push(VERSION); + out.extend_from_slice(&(hj.len() as u32).to_le_bytes()); + out.extend_from_slice(&hj); + out.extend_from_slice(body); + out + }; + let bad_nonce = serde_json::Value::String(B64.encode([0u8; 12])); + + // Corrupted per-recipient nonce: entry is skipped, no match. + let mut header: serde_json::Value = serde_json::from_slice(header_bytes).unwrap(); + header["recipients"][0]["nonce"] = bad_nonce.clone(); + assert!(open_blob(&reframe(&header), &reader).is_err()); + + // Corrupted body nonce: unwrap succeeds, body nonce is rejected. + let mut header: serde_json::Value = serde_json::from_slice(header_bytes).unwrap(); + header["nonce"] = bad_nonce; + assert!(open_blob(&reframe(&header), &reader).is_err()); + } } From e85fab0df60ffb8e52de0ea8db5ad1abc4f4fa5f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 12 Jun 2026 12:50:51 -0500 Subject: [PATCH 57/58] feat(node): blind recipient identities on the B2 replication surface The encrypted-blobs/replicate endpoint shipped every blob's full recipient DID list to mirrors, which persisted it, so any mirroring peer learned the reader set. The v2 envelope blinding already removed recipient public keys from the pinned envelopes, so the comment justifying this (DIDs are already public) was no longer true. /replicate now returns {oid, cid} only. Mirrors detect a re-seal by the CID changing (the OID is stable across re-seals) instead of comparing recipient sets, and store no recipient identities. Origin-side authz and the at-rest recipients column are unchanged; this blinds only the peer-facing surface.
--- crates/gitlawb-node/src/api/encrypted.rs | 35 ++++++++++--- crates/gitlawb-node/src/sync.rs | 64 ++++++++++-------------- 2 files changed, 55 insertions(+), 44 deletions(-) diff --git a/crates/gitlawb-node/src/api/encrypted.rs b/crates/gitlawb-node/src/api/encrypted.rs index 6e19bd5..20827fb 100644 --- a/crates/gitlawb-node/src/api/encrypted.rs +++ b/crates/gitlawb-node/src/api/encrypted.rs @@ -56,10 +56,11 @@ pub async fn get_encrypted_blob( } /// GET /api/v1/repos/{owner}/{repo}/encrypted-blobs/replicate -/// Returns [{oid, cid, recipients}] for every encrypted blob in the repo, for -/// peer-mirror replication (Option B2). Not recipient-scoped: recipient DIDs are -/// already public via the IPFS-pinned envelopes, so this exposes only ciphertext -/// metadata (content-addressed OIDs/CIDs and recipient DIDs), never plaintext. +/// Returns [{oid, cid}] for every encrypted blob in the repo, for peer-mirror +/// replication (Option B2). Recipient identities are deliberately withheld: the +/// v2 envelopes no longer carry recipient public keys, so peers must not learn +/// the reader set either. A mirror detects a re-seal by the CID changing (the +/// OID is stable across re-seals). Ciphertext metadata only, never plaintext. pub async fn replicate_encrypted_blobs( State(state): State, Path((owner, repo)): Path<(String, String)>, @@ -72,9 +73,29 @@ pub async fn replicate_encrypted_blobs( let rows = state.db.list_all_encrypted_blobs(&record.id).await?; let blobs: Vec<_> = rows .into_iter() - .map(|(oid, cid, recipients)| { - serde_json::json!({ "oid": oid, "cid": cid, "recipients": recipients }) - }) + .map(|(oid, cid, _recipients)| replicate_blob_json(oid, cid)) .collect(); Ok(Json(serde_json::json!({ "blobs": blobs }))) } + +/// Serialize one blob for the replication wire. Recipient identities are +/// intentionally absent so a mirror never learns the reader set. 
+fn replicate_blob_json(oid: String, cid: String) -> serde_json::Value { + serde_json::json!({ "oid": oid, "cid": cid }) +} + +#[cfg(test)] +mod tests { + use super::replicate_blob_json; + + #[test] + fn replicate_blob_json_omits_recipients() { + let v = replicate_blob_json("oid1".into(), "cidA".into()); + assert_eq!(v["oid"], "oid1"); + assert_eq!(v["cid"], "cidA"); + assert!( + v.get("recipients").is_none(), + "replication wire must not carry recipient identities" + ); + } +} diff --git a/crates/gitlawb-node/src/sync.rs b/crates/gitlawb-node/src/sync.rs index ca3e7bf..4b02603 100644 --- a/crates/gitlawb-node/src/sync.rs +++ b/crates/gitlawb-node/src/sync.rs @@ -46,13 +46,12 @@ fn classify_mirror(withheld: Option>) -> MirrorMode { } /// One encrypted blob as advertised by an origin's `encrypted-blobs/replicate` -/// endpoint (Option B2). Ciphertext metadata only. +/// endpoint (Option B2). Ciphertext metadata only; recipient identities are +/// withheld from peers, so a re-seal is detected by the CID changing. #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize)] struct ReplicaBlob { oid: String, cid: String, - #[serde(default)] - recipients: Vec, } /// The shape of the `encrypted-blobs/replicate` JSON response. @@ -64,33 +63,26 @@ struct ReplicateResponse { /// Decide which of the origin's encrypted blobs this mirror must (re)replicate. /// -/// `have` maps each already-stored blob's oid to its stored recipient DIDs. A +/// `have` maps each already-stored blob's oid to the CID the mirror pinned. A /// remote blob is returned when the mirror has no row for that oid, or when the -/// stored recipient set differs from the remote one (the origin re-sealed after a -/// reader-set change; same semantics as B1). Recipient order is ignored. +/// stored CID differs from the advertised one. 
A re-seal regenerates the +/// envelope (new content key, nonce, and per-recipient wraps), so the CID +/// changes while the OID stays stable; comparing CIDs detects a re-seal without +/// the mirror ever holding recipient identities. fn blobs_needing_replication( remote: &[ReplicaBlob], - have: &HashMap<String, Vec<String>>, + have: &HashMap<String, String>, ) -> Vec<ReplicaBlob> { remote .iter() .filter(|b| match have.get(&b.oid) { None => true, - Some(stored) => !same_recipients(stored, &b.recipients), + Some(stored_cid) => stored_cid != &b.cid, }) .cloned() .collect() } -/// Order-insensitive equality of two recipient DID lists. -fn same_recipients(a: &[String], b: &[String]) -> bool { - let mut a: Vec<&String> = a.iter().collect(); - let mut b: Vec<&String> = b.iter().collect(); - a.sort(); - b.sort(); - a == b -} - /// Start the background sync worker. Returns immediately; the worker runs /// as a detached tokio task that exits cleanly when `shutdown_rx` flips /// to `true`. @@ -260,9 +252,10 @@ async fn fetch_withheld( /// Replicate the origin's encrypted withheld blobs onto this mirror (Option B2). /// /// After the git objects are mirrored, fetch the origin's replication listing, -/// then for each blob the mirror does not already hold (or whose recipients -/// changed) pull the ciphertext envelope over IPFS, pin it locally, and record -/// the `encrypted_blobs` row keyed by this mirror's local `repo_id`. +/// then for each blob the mirror does not already hold (or whose CID changed, +/// i.e. the origin re-sealed) pull the ciphertext envelope over IPFS, pin it +/// locally, and record the `encrypted_blobs` row keyed by this mirror's local +/// `repo_id`. The mirror stores no recipient identities. /// /// Best-effort and idempotent: any per-blob failure is logged and skipped, to be /// retried on the next sync.
Confidentiality is never at risk; the mirror only @@ -298,10 +291,10 @@ async fn replicate_encrypted_blobs( return; } - let have: HashMap<String, Vec<String>> = match db.list_all_encrypted_blobs(repo_id).await { + let have: HashMap<String, String> = match db.list_all_encrypted_blobs(repo_id).await { Ok(rows) => rows .into_iter() - .map(|(oid, _cid, recipients)| (oid, recipients)) + .map(|(oid, cid, _recipients)| (oid, cid)) .collect(), Err(e) => { warn!(repo = %repo, err = %e, "failed to list local encrypted blobs for replication"); @@ -324,7 +317,7 @@ async fn replicate_encrypted_blobs( continue; } if let Err(e) = db - .record_encrypted_blob(repo_id, &blob.oid, &cid, &blob.recipients) + .record_encrypted_blob(repo_id, &blob.oid, &cid, &[]) .await { warn!(oid = %blob.oid, err = %e, "failed to record replicated encrypted blob"); @@ -483,38 +476,34 @@ mod tests { assert!(matches!(mode, MirrorMode::Plain)); } - fn rb(oid: &str, cid: &str, recipients: &[&str]) -> ReplicaBlob { + fn rb(oid: &str, cid: &str) -> ReplicaBlob { ReplicaBlob { oid: oid.to_string(), cid: cid.to_string(), - recipients: recipients.iter().map(|s| s.to_string()).collect(), } } #[test] fn replicate_stores_new_blob() { - let remote = vec![rb("oid1", "cidA", &["did:key:zA"])]; + let remote = vec![rb("oid1", "cidA")]; let have = HashMap::new(); assert_eq!(blobs_needing_replication(&remote, &have), remote); } #[test] - fn replicate_skips_already_present_same_recipients() { - let remote = vec![rb("oid1", "cidA", &["did:key:zA", "did:key:zB"])]; + fn replicate_skips_already_present_same_cid() { + let remote = vec![rb("oid1", "cidA")]; let mut have = HashMap::new(); - // stored in a different order: must still count as present - have.insert( - "oid1".to_string(), - vec!["did:key:zB".to_string(), "did:key:zA".to_string()], - ); + have.insert("oid1".to_string(), "cidA".to_string()); assert!(blobs_needing_replication(&remote, &have).is_empty()); } #[test] - fn replicate_restores_on_recipient_change() { - let remote = vec![rb("oid1", "cidB",
&["did:key:zA", "did:key:zC"])]; + fn replicate_restores_on_cid_change() { + // The origin re-sealed: same oid, new envelope, new cid. + let remote = vec![rb("oid1", "cidB")]; let mut have = HashMap::new(); - have.insert("oid1".to_string(), vec!["did:key:zA".to_string()]); + have.insert("oid1".to_string(), "cidA".to_string()); assert_eq!(blobs_needing_replication(&remote, &have), remote); } @@ -525,11 +514,12 @@ mod tests { #[test] fn replicate_response_parses() { + // An older origin may still send a recipients field; it must be ignored. let json = r#"{"blobs":[{"oid":"o1","cid":"c1","recipients":["did:key:zA"]}]}"#; let parsed: ReplicateResponse = serde_json::from_str(json).unwrap(); assert_eq!(parsed.blobs.len(), 1); assert_eq!(parsed.blobs[0].oid, "o1"); - assert_eq!(parsed.blobs[0].recipients, vec!["did:key:zA".to_string()]); + assert_eq!(parsed.blobs[0].cid, "c1"); } #[test] From a7315d8d0c948eec4d98ca0921b66bfd3bdf5ca6 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:51:11 -0500 Subject: [PATCH 58/58] feat(node): stop anchoring recipient identities to Arweave The B3 encrypted-blob manifest anchored {oid, cid, recipients} to Arweave, a permanent public record of every blob's reader set. The v2 envelope blinding already removed recipient keys from the pinned envelopes, so the comment justifying this (recipient DIDs are already public) was no longer true. The manifest now anchors {oid, cid} only. The gl reader already ignores the recipients field (it recovers by trial decryption), so no reader changes. 
--- crates/gitlawb-node/src/arweave.rs | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/crates/gitlawb-node/src/arweave.rs b/crates/gitlawb-node/src/arweave.rs index c6cdd3d..cf13947 100644 --- a/crates/gitlawb-node/src/arweave.rs +++ b/crates/gitlawb-node/src/arweave.rs @@ -104,8 +104,9 @@ pub async fn anchor_ref_update( } /// A per-push manifest of the blobs encrypted this push (Option B3). The -/// `blobs` slice is `(oid, cid, recipients)` tuples. Anchored directly to -/// Arweave as its JSON body so the discovery index survives total node loss. +/// `blobs` slice is `(oid, cid, recipients)` tuples; only `oid` and `cid` are +/// anchored. Anchored directly to Arweave as its JSON body so the discovery +/// index survives total node loss. pub struct EncryptedManifest<'a> { pub repo: &'a str, pub owner_did: &'a str, @@ -116,8 +117,9 @@ pub struct EncryptedManifest<'a> { /// Anchor a per-push encrypted-blob manifest to Arweave via Irys. The manifest /// JSON body is the payload (not a CID pointer to IPFS), so the index is -/// permanent and self-contained. Recipient DIDs are already public via the -/// pinned envelopes, so the manifest carries no new secret. +/// permanent and self-contained. Recipient identities are deliberately omitted: +/// the anchor is permanent and public, and the v2 envelopes no longer expose +/// recipients, so the reader set must not be written to Arweave either. /// /// Returns the Irys/Arweave transaction ID, or `Ok("")` when `irys_url` is empty /// (anchoring disabled) or there are no blobs to anchor. 
@@ -133,7 +135,7 @@ pub async fn anchor_encrypted_manifest( let blobs_json: Vec = manifest .blobs .iter() - .map(|(oid, cid, recipients)| json!({ "oid": oid, "cid": cid, "recipients": recipients })) + .map(|(oid, cid, _recipients)| manifest_blob_json(oid, cid)) .collect(); let payload = json!({ @@ -183,6 +185,13 @@ pub async fn anchor_encrypted_manifest( Ok(tx_id) } +/// Serialize one blob for the Arweave manifest. Recipient identities are +/// intentionally absent so the permanent public anchor never records who can +/// read a blob. +fn manifest_blob_json(oid: &str, cid: &str) -> serde_json::Value { + json!({ "oid": oid, "cid": cid }) +} + /// Build the Irys tag header for an encrypted-blob manifest. `Repo` and `Schema` /// are the tags the `gl` recovery query filters on. fn build_manifest_tags_header(manifest: &EncryptedManifest<'_>) -> String { @@ -356,6 +365,17 @@ mod tests { _mock.assert_async().await; } + #[test] + fn manifest_blob_json_omits_recipients() { + let v = manifest_blob_json("oid1", "cidA"); + assert_eq!(v["oid"], "oid1"); + assert_eq!(v["cid"], "cidA"); + assert!( + v.get("recipients").is_none(), + "Arweave manifest must not anchor recipient identities" + ); + } + #[test] fn test_sanitize_tag() { assert_eq!(sanitize_tag("alice/myrepo"), "alice/myrepo");