corewire · Breee · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -133,7 +133,7 @@ jobs:
           make controller-gen
           make sync-crds
           kubectl apply -f config/crd/bases/
-      - name: Deploy E2E infrastructure (Prometheus + Registry)
+      - name: Deploy E2E infrastructure (Prometheus, Loki, Registry)
         run: make e2e-infra
       - name: Deploy operator
         run: |

diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,10 @@ docs/.hugo_build.lock
 # Generated CRD chart templates (produced by make sync-crds in CI)
 charts/drop-crds/templates/drop.corewire.io_*.yaml
 charts/drop/templates/crds-drop.corewire.io_*.yaml
+
+# Imported research archives (always unpack; never commit zip bundles)
+research/**/*.zip
+
+# Python cache artifacts
+__pycache__/
+*.pyc
diff --git a/Makefile b/Makefile
@@ -103,7 +103,7 @@ uninstall: manifests kustomize ## Uninstall CRDs from cluster.
 	$(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found -f -
 
 .PHONY: e2e-infra
-e2e-infra: ## Deploy Prometheus + Registry for E2E/dev.
+e2e-infra: ## Deploy Prometheus, Loki, and Registry for E2E/dev.
 	@chmod +x hack/e2e-infra/setup.sh && hack/e2e-infra/setup.sh
 
 ##@ Docker
@@ -143,6 +143,83 @@ docs-gen-check: docs-gen ## Verify generated AI docs are up to date.
 	@git diff --exit-code knowledge.yaml llms.txt llms-full.txt docs/static/llms-full.txt .github/copilot-instructions.md .cursorrules AGENTS.md docs/content/docs/reference/_generated_*.md || \
 		(echo "ERROR: generated docs are out of date — run 'make docs-gen'" && exit 1)
 
+##@ Research
+
+# Paper build inputs: directory containing the TeX sources and the file to compile.
+RESEARCH_TEX_DIR ?= research/tex
+RESEARCH_TEX_FILE ?= paper.tex
+# Benchmark evaluator directory, its virtualenv, and benchmark result paths.
+RESEARCH_BENCH_DIR ?= research/benchmark/evaluator
+RESEARCH_BENCH_VENV ?= $(RESEARCH_BENCH_DIR)/.venv
+RESEARCH_BENCH_RESULTS_DIR ?= research/benchmark/results
+RESEARCH_BENCH_RESULTS_DISCOVERY_20RUNS ?= $(RESEARCH_BENCH_RESULTS_DIR)/discovery-strategy-20runs
+RESEARCH_BENCH_RESULTS_ORACLE_20RUNS ?= $(RESEARCH_BENCH_RESULTS_DIR)/oracle-gap-strategy-20runs
+RESEARCH_BENCH_RESULTS_CACHE_20RUNS ?= $(RESEARCH_BENCH_RESULTS_DIR)/ci-image-cache-20runs
+# Shell command that activates the benchmark venv. abspath keeps the path valid
+# after a recipe has cd'ed into RESEARCH_BENCH_DIR, so RESEARCH_BENCH_VENV overrides are honoured.
+RESEARCH_BENCH_ACTIVATE = . "$(abspath $(RESEARCH_BENCH_VENV))/bin/activate"
+
+# Uses latexmk when it is on PATH; otherwise runs pdflatex twice in a row.
+.PHONY: research-tex-build
+research-tex-build: ## Build research PDF from TeX source (override RESEARCH_TEX_FILE=<file.tex>).
+	@cd $(RESEARCH_TEX_DIR) && \
+	if command -v latexmk >/dev/null 2>&1; then \
+		latexmk -pdf -interaction=nonstopmode -halt-on-error $(RESEARCH_TEX_FILE); \
+	elif command -v pdflatex >/dev/null 2>&1; then \
+		pdflatex -interaction=nonstopmode -halt-on-error $(RESEARCH_TEX_FILE) && \
+		pdflatex -interaction=nonstopmode -halt-on-error $(RESEARCH_TEX_FILE); \
+	else \
+		echo "ERROR: latexmk/pdflatex not found"; exit 1; \
+	fi
+
+# Creates the venv at RESEARCH_BENCH_VENV, then installs requirements.txt into it.
+.PHONY: research-bench-setup
+research-bench-setup: ## Create benchmark venv and install Python dependencies.
+	@cd $(RESEARCH_BENCH_DIR) && \
+	python3 -m venv "$(abspath $(RESEARCH_BENCH_VENV))" && \
+	$(RESEARCH_BENCH_ACTIVATE) && \
+	pip install -r requirements.txt
+
+.PHONY: research-bench-generate
+research-bench-generate: ## Generate synthetic benchmark dataset.
+	@cd $(RESEARCH_BENCH_DIR) && \
+	$(RESEARCH_BENCH_ACTIVATE) && \
+	python generate_synthetic_day.py --out data --jobs 25000 --nodes 100 --images 30 --seed 20260621
+
+.PHONY: research-bench-replay
+research-bench-replay: ## Run replay policy evaluation from benchmark data.
+	@cd $(RESEARCH_BENCH_DIR) && \
+	$(RESEARCH_BENCH_ACTIVATE) && \
+	python evaluate_replay.py --data data --out outputs
+
+.PHONY: research-bench-discovery
+research-bench-discovery: ## Evaluate discovery strategies from benchmark data.
+	@cd $(RESEARCH_BENCH_DIR) && \
+	$(RESEARCH_BENCH_ACTIVATE) && \
+	python evaluate_discovery_strategies.py --data data --out outputs/strategy_eval
+
+.PHONY: research-bench-plot
+research-bench-plot: ## Render example pipeline Gantt figure.
+	@cd $(RESEARCH_BENCH_DIR) && \
+	$(RESEARCH_BENCH_ACTIVATE) && \
+	python plot_pipeline_gantt.py --modeled-jobs outputs/modeled_jobs_no_prewarming.csv --out figures/example_gantt.png
+
+.PHONY: research-bench-20runs
+research-bench-20runs: ## Run 20-run discovery strategy benchmark batch.
+	@cd $(RESEARCH_BENCH_DIR) && \
+	$(RESEARCH_BENCH_ACTIVATE) && \
+	python run_discovery_strategy_20runs.py
+
+# Stages run as sequential sub-makes instead of prerequisites: make gives no
+# ordering guarantee between prerequisites under -j, and replay/discovery read
+# the `data` directory that generate writes while plot reads from `outputs`.
+.PHONY: research-bench-all
+research-bench-all: ## Run full synthetic benchmark workflow.
+	@$(MAKE) research-bench-generate
+	@$(MAKE) research-bench-replay
+	@$(MAKE) research-bench-discovery
+	@$(MAKE) research-bench-plot
+
 .PHONY: tools
 tools: ## Install local tooling and check optional docs/chart binaries.
 	@$(MAKE) kustomize controller-gen setup-envtest golangci-lint chainsaw

diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 </p>
 
 
-A Kubernetes operator that pre-pulls container images onto nodes — safely, with pacing, and with automatic discovery. 
+A Kubernetes operator that pre-pulls container images onto nodes — safely, with pacing, and with automatic discovery.
 
 ## Why
 
@@ -115,18 +115,19 @@ spec:
   maxImages: 20
   # Only keep images from your internal registry (regex filter, optional)
   imageFilter: "registry.example.com/.*"
-  sources:
-    - type: prometheus
+  queries:
+    - name: runner-image-usage
+      type: prometheus
       prometheus:
         # Any Prometheus-compatible API (Prometheus, Thanos, Mimir, VictoriaMetrics)
         endpoint: https://mimir.example.com
         # Aggregate over the last 7 days using query_range; counts container
         # instances per image across the window to produce a usage score
+        queryType: range
         lookback: 168h
         # Resolution step for range queries (default: 5m)
         step: 5m
         # PromQL query — MUST return results with an "image" label.
-        # The result value becomes the ranking score (higher = cached first).
         query: |
           count(
             container_memory_working_set_bytes{
@@ -138,6 +139,15 @@ spec:
       # Supported keys: token, username, password, ca.crt, tls.crt, tls.key
       secretRef:
         name: prometheus-creds
+  signals:
+    - name: total-usage
+      query: runner-image-usage
+      type: aggregate
+      aggregate:
+        method: sum
+  ranking:
+    strategy: signal
+    signal: total-usage
 ---
 # --- 3. CachedImageSet: ties discovery + policy together, targets nodes ---
 apiVersion: drop.corewire.io/v1alpha1
@@ -304,18 +314,19 @@ spec:
   maxImages: 30
   # Only keep images matching this regex (optional)
   imageFilter: "registry.example.com/.*"
-  sources:
-    - type: prometheus
+  queries:
+    - name: runner-image-usage
+      type: prometheus
       prometheus:
         # Any Prometheus-compatible API (Prometheus, Thanos, Mimir, VictoriaMetrics)
         endpoint: https://mimir.example.com
         # Aggregate over the last 7 days (uses query_range, sums values per image)
         # Omit for a point-in-time instant query instead
+        queryType: range
         lookback: 168h
         # Resolution step for range queries (default: 5m)
         step: 5m
         # PromQL query — MUST return results with an "image" label.
-        # The result value becomes the ranking score (higher = cached first).
         query: |
           count(
             container_memory_working_set_bytes{
@@ -327,6 +338,15 @@ spec:
       # Supported keys: token, username, password, ca.crt, tls.crt, tls.key, headers.<name>
       secretRef:
         name: prometheus-creds
+  signals:
+    - name: total-usage
+      query: runner-image-usage
+      type: aggregate
+      aggregate:
+        method: sum
+  ranking:
+    strategy: signal
+    signal: total-usage
 ---
 apiVersion: drop.corewire.io/v1alpha1
 kind: CachedImageSet
@@ -342,7 +362,11 @@ spec:
       tag: "3.19"
 ```
 
-### Use case: discover and cache application tags from a registry
+### Use case: discover and cache GitLab runner helper images from a registry
+
+GitLab runner helper tags carry an arch/flavor prefix (e.g. `x86_64-v17.5.0`).
+Drop extracts the embedded version automatically; `versionPattern` is shown for
+clarity but is optional here.
 
 ```yaml
 apiVersion: v1
@@ -362,24 +386,30 @@ metadata:
 spec:
   syncInterval: 15m
   maxImages: 10
-  sources:
-    - type: registry
+  queries:
+    - name: registry-tags
+      type: registry
       registry:
         # Registry base URL
-        url: https://registry.example.com
+        url: https://registry.gitlab.com
         # Repositories to list tags from
         repositories:
-          - team/frontend
-          - team/backend
-          - team/worker
-        # Only discover semver tags (regex on tag name)
-        tagFilter: "^v[0-9]+\\."
-        # Keep only the last 3 matching tags returned by the registry
+          - gitlab-org/gitlab-runner/gitlab-runner-helper
+        # Only discover x86_64 semver tags (regex on tag name)
+        tagFilter: "^x86_64-v[0-9]+\\."
+        # Optional: pin where the version lives in the tag (capture group 1)
+        versionPattern: "x86_64-v(.+)"
+        # Optional: registry `last` cursor; "x86_64-u~" sorts just before the x86_64-v* tags, so listing skips straight to them
+        tagSeek: "x86_64-u~"
+        # Optional: cap tags fetched per repo before filtering (default 1000)
+        maxScan: 2000
+        # Keep only the 3 newest matching tags (newest first)
         topX: 3
       # Optional: Secret in the Drop pod namespace (default: drop-system)
       # Supported keys: token, username, password, ca.crt, tls.crt, tls.key, headers.<name>
       secretRef:
         name: registry-api-creds
+  # No signals/ranking needed: registry tags are returned newest-first.
 ---
 apiVersion: drop.corewire.io/v1alpha1
 kind: CachedImageSet
@@ -442,16 +472,16 @@ dev-set    AllReady    3/3     3         dev-registry   1h
 web-apps   Degraded    1/3     3                        10m
 
 $ kubectl get discoverypolicies
-NAME             STATUS              SOURCES   IMAGES   LASTSYNC   AGE
-dev-registry     Synced              1         3        30s        1h
-broken-prom      ConnectionRefused   1         0                   5m
-bad-auth         Unauthorized        1         0                   2m
+NAME             STATUS              IMAGES   LASTSYNC   AGE
+dev-registry     Synced              3        30s        1h
+broken-prom      ConnectionRefused   0                   5m
+bad-auth         Unauthorized        0                   2m
 ```
 
 ## Development
 
 ```bash
-# Prerequisites: Go 1.23+, Kind, Tilt, Helm
+# Prerequisites: Go 1.26+, Kind, Tilt, Helm
 make generate      # deepcopy
 make manifests     # CRDs + RBAC
 go build ./...     # compile

diff --git a/Tiltfile b/Tiltfile
@@ -82,9 +82,11 @@ local('kubectl create namespace e2e-infra --dry-run=client -o yaml | kubectl app
 k8s_yaml('hack/e2e-infra/prometheus-config.yaml')
 k8s_yaml('hack/e2e-infra/prometheus.yaml')
 k8s_yaml('hack/e2e-infra/registry.yaml')
+k8s_yaml('hack/e2e-infra/loki.yaml')
 
 k8s_resource('prometheus', objects=['prometheus-config:configmap', 'prometheus:serviceaccount', 'prometheus-metrics-reader:clusterrolebinding'], port_forwards=['9090:9090'], labels=['infra'])
 k8s_resource('registry', port_forwards=['5000:5000'], labels=['infra'])
+k8s_resource('loki', objects=['loki-config:configmap'], port_forwards=['3100:3100'], labels=['infra'])
 
 # Configure kind nodes to reach the in-cluster registry.
 # Kubelet/containerd can't resolve cluster DNS, so we point them at the registry's ClusterIP.
@@ -99,6 +101,14 @@ local_resource(
 k8s_yaml('hack/e2e-infra/seed-registry-job.yaml')
 k8s_resource('seed-registry', labels=['infra'], resource_deps=['registry-mirror'])
 
+# Seed Loki with image-pull events (Alloy-style JSON structure)
+k8s_yaml('hack/e2e-infra/seed-loki-job.yaml')
+k8s_resource('seed-loki', labels=['infra'], resource_deps=['loki'])
+
+# Alloy: tail real Kubernetes events into Loki (drop_e2e=true)
+k8s_yaml('hack/e2e-infra/alloy.yaml')
+k8s_resource('alloy', objects=['alloy:serviceaccount', 'alloy-events:clusterrole', 'alloy-events:clusterrolebinding', 'alloy-config:configmap'], labels=['infra'], resource_deps=['loki'])
+
 # --- Grafana with Drop dashboard ---
 # Create dashboard ConfigMap from the shipped JSON, then apply grafana manifests.
 dashboard_json = str(read_file('charts/drop/dashboards/drop-operator.json'))
@@ -150,7 +160,13 @@ k8s_resource(
         'dev-set:cachedimageset',
         'dev-set-discovered:cachedimageset',
         'dev-prometheus:discoverypolicy',
+        'dev-prometheus-instant:discoverypolicy',
+        'dev-hybrid:discoverypolicy',
+        'dev-timeweighted:discoverypolicy',
+        'dev-window:discoverypolicy',
+        'dev-loki:discoverypolicy',
         'dev-registry:discoverypolicy',
+        'dev-modelexposure:discoverypolicy',
         'test-broken-prom:discoverypolicy',
         'test-broken-registry:discoverypolicy',
         'test-notfound-repo:discoverypolicy',

diff --git a/ai-docs/07-feature-ui.md b/ai-docs/07-feature-ui.md
@@ -0,0 +1,47 @@
+# UI Feature Specs
+
+Design specs for a future DiscoveryPolicy UI. All previews use a dry-run API; preview results are never persisted in etcd.
+
+## 1. Query Editor (Stage 1)
+
+| Element | Purpose |
+|---------|---------|
+| PromQL/LogQL/registry query input with syntax highlighting | Fast query iteration |
+| Live preview table: image ref, raw sample values, sample count | Shows query output before saving the CR |
+| Query health badge: latency, series count, error message | Surface slow/broken endpoints |
+| Registry: collapsible tag list per repo with tagFilter preview | Highlight matching/excluded tags so regex is visible |
+
+## 2. Signal Inspector (Stage 2)
+
+| Element | Purpose |
+|---------|---------|
+| Bar chart per signal: images on Y-axis sorted by value | "Which images score highest on this signal?" |
+| Side-by-side signal comparison (pick 2+) | Reveals when signals disagree on ranking |
+| timeWeightedAggregate: heatmap (hour-of-day × image) | Shows whether the business-hours window config shifts rankings |
+| eventPullTime: histogram of pull durations with p50/p90/p95 lines | Debug why an image ranks high ("it takes 12s to pull") |
+
+## 3. Ranking Playground (Stage 3)
+
+| Element | Purpose |
+|---------|---------|
+| Ranked image list with stacked bar score breakdown | Shows *why* an image is ranked #1 vs #5 |
+| Weight sliders (weightedSum): drag to reorder in real time | Eliminates the apply-wait-check loop |
+| maxImages cutoff line: draggable line on ranked list | Simulate different maxImages values |
+| Diff view: images entering/leaving top-N, score deltas | "Did my config change improve things?" |
+| modelExposure: node exposure diagram with estimated pull cost | Makes the abstract formula concrete |
+
+## 4. Cross-cutting Views
+
+| Element | Purpose |
+|---------|---------|
+| Pipeline DAG: query → signal → ranking with health per node | Overview for complex multi-query setups |
+| etcd budget meter: current status size vs max | Ops visibility |
+| Sync timeline: imageCount sparkline with sync events | Detects flapping (oscillating image count) |
+| CachedImageSet propagation: discovered → CachedImage → node pull status | Closes the loop: discovery → caching → readiness |
+
+## Architecture
+
+- Previews (query editor, weight sliders) are computed via a `/dryrun` endpoint or CLI tool
+- Dry-run takes a `DiscoveryPolicySpec`, runs the pipeline once, returns full result without writing status
+- CR only stores the last committed sync result (slimmed status)
+- UI richness comes from dry-run responses, not from bloating the stored status