Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,12 @@ updates:
directory: /
schedule:
interval: weekly

# Keep Python dependencies (pyproject.toml / uv.lock) patched; one grouped PR per week.
- package-ecosystem: pip
directory: /
schedule:
interval: weekly
groups:
python:
patterns: ["*"]
10 changes: 5 additions & 5 deletions .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ jobs:
python-version: ["3.11", "3.12", "3.13"]

steps:
- uses: actions/checkout@v6
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v6
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
with:
python-version: ${{ matrix.python-version }}

- name: Set up uv (fast installs + cache)
uses: astral-sh/setup-uv@v7
uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7
with:
enable-cache: true

Expand Down Expand Up @@ -65,10 +65,10 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6

- name: Set up uv
uses: astral-sh/setup-uv@v7
uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7
with:
python-version: "3.12"

Expand Down
47 changes: 47 additions & 0 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
name: CodeQL

# Static analysis (semantic code scanning) for the Python codebase. Results are
# uploaded to the repository's Security > Code scanning alerts. Runs on PRs/pushes
# to the main branches and on a weekly schedule to catch newly published queries.
on:
push:
branches: [main, master, develop]
pull_request:
branches: [main, master, develop]
schedule:
# Weekly, Mondays at 04:00 (GitHub Actions evaluates cron schedules in UTC).
- cron: "0 4 * * 1"
workflow_dispatch:

# One active run per workflow + ref: a newer run on the same ref cancels the
# one still in progress instead of queueing behind it.
concurrency:
group: codeql-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
analyze:
name: Analyze (python)
runs-on: ubuntu-latest
timeout-minutes: 30

# Least privilege: read the code, write code-scanning results, nothing else.
permissions:
contents: read
security-events: write

# Single-language matrix; fail-fast is off so that, if more languages are added,
# one language failing does not cancel the others.
strategy:
fail-fast: false
matrix:
language: [python]

steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6

- name: Initialize CodeQL
uses: github/codeql-action/init@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4
with:
languages: ${{ matrix.language }}
# Built-in query suite that adds code-quality queries on top of the security ones.
queries: security-and-quality

- name: Perform CodeQL analysis
uses: github/codeql-action/analyze@8aad20d150bbac5944a9f9d289da16a4b0d87c1e # v4
with:
# Labels the uploaded results per language so analyses stay distinguishable.
category: "/language:${{ matrix.language }}"
37 changes: 31 additions & 6 deletions .github/workflows/docker-beta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,25 +36,25 @@ jobs:
- target: ui # Streamlit UI
suffix: "-ui"
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6

- name: Set up QEMU (arm64 emulation)
uses: docker/setup-qemu-action@v4
uses: docker/setup-qemu-action@06116385d9baf250c9f4dcb4858b16962ea869c3 # v4

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v4
uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4

- name: Log in to GHCR
if: github.event_name != 'pull_request'
uses: docker/login-action@v4
uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Derive image tags & labels
id: meta
uses: docker/metadata-action@v6
uses: docker/metadata-action@80c7e94dd9b9319bd5eb7a0e0fe9291e23a2a2e9 # v6
with:
images: ${{ env.IMAGE_NAME }}
flavor: |
Expand All @@ -66,13 +66,17 @@ jobs:
type=sha,prefix=sha-

- name: Build${{ github.event_name != 'pull_request' && ' and push' || ' (validate, no push)' }}
uses: docker/build-push-action@v7
id: build
uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7
with:
context: .
target: ${{ matrix.target }}
# Multi-arch on master; amd64-only on PRs to keep validation fast.
platforms: ${{ github.event_name == 'pull_request' && 'linux/amd64' || 'linux/amd64,linux/arm64' }}
push: ${{ github.event_name != 'pull_request' }}
# On PRs (no push) load the single-arch image into the local daemon so the
# Trivy step below can scan the exact artifact that was just built.
load: ${{ github.event_name == 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
annotations: ${{ steps.meta.outputs.annotations }}
Expand All @@ -84,3 +88,24 @@ jobs:
cache-to: type=gha,mode=max,scope=${{ matrix.target }},ignore-error=true
provenance: ${{ github.event_name != 'pull_request' }}
sbom: ${{ github.event_name != 'pull_request' }}

# Vulnerability-scan the exact image we just built. On PRs the image is loaded
# into the local daemon (load: true above) and scanned by its concrete `beta`
# tag; on master the image is pushed and scanned by digest. Fails the build on
# any fixable HIGH/CRITICAL OS or library CVE.
- name: Scan image with Trivy
uses: aquasecurity/trivy-action@ed142fd0673e97e23eac54620cfb913e5ce36c25 # v0.36.0
with:
image-ref: >-
${{ github.event_name == 'pull_request'
&& format('{0}:beta{1}', env.IMAGE_NAME, matrix.suffix)
|| format('{0}@{1}', env.IMAGE_NAME, steps.build.outputs.digest) }}
format: table
exit-code: "1"
ignore-unfixed: true
vuln-type: os,library
severity: HIGH,CRITICAL
env:
# Avoid GHCR rate limits when pulling the Trivy vulnerability DB.
TRIVY_USERNAME: ${{ github.actor }}
TRIVY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
4 changes: 2 additions & 2 deletions .github/workflows/helm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6

- name: Set up Helm
uses: azure/setup-helm@v4
uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4
with:
version: v3.16.4

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 30
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6

- name: Set up Python
uses: actions/setup-python@v6
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
with:
python-version: "3.12"

- name: Set up uv (fast installs + cache)
uses: astral-sh/setup-uv@v7
uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7
with:
enable-cache: true

Expand Down
58 changes: 58 additions & 0 deletions .github/workflows/security.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
name: Security

# Supply-chain and secret-hygiene checks that run independently of the test matrix
# so they gate every PR and push even when the unit tests are skipped. Two jobs:
# - pip-audit: scans resolved Python dependencies for known CVEs (PyPI advisories).
# - gitleaks: scans the repo / PR diff for committed secrets, even without a
# developer running the local pre-commit hook.
on:
push:
branches: [main, master, develop]
pull_request:
branches: [main, master, develop]
workflow_dispatch:

# Least privilege: both jobs only need to read the repository contents.
permissions:
contents: read

# One active run per workflow + ref: a newer run on the same ref cancels the
# one still in progress.
concurrency:
group: security-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
pip-audit:
name: pip-audit (dependency CVEs)
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6

- name: Set up uv (provides uvx for pip-audit)
uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7
with:
python-version: "3.12"

# Audit the project's full dependency closure (runtime + optional extras) by
# resolving it from pyproject.toml into a requirements file, then scanning that.
# Fails the job on any dependency with a known advisory.
- name: Resolve dependency closure
run: uv pip compile --all-extras --output-file requirements-audit.txt pyproject.toml

# --strict: fail the audit if any dependency cannot be collected, rather than
# silently skipping it.
- name: Audit dependencies
run: uvx pip-audit --strict --requirement requirements-audit.txt

gitleaks:
name: gitleaks (secret scan)
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
with:
# Full history so the scan can diff the whole range on push, not just HEAD.
fetch-depth: 0

- name: Run gitleaks
uses: gitleaks/gitleaks-action@e0c47f4f8be36e29cdc102c57e68cb5cbf0e8d1e # v3
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ repos:
- id: mypy
additional_dependencies: ["types-requests"]
args: ["--config-file=pyproject.toml"]
# Secret scanning: block accidental commits of API keys/tokens (mirrored in CI).
- repo: https://github.com/gitleaks/gitleaks
rev: v8.30.1
hooks:
- id: gitleaks
8 changes: 6 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
ARG PYTHON_VERSION=3.12

# ---------- shared base ----------
FROM python:${PYTHON_VERSION}-slim AS base
# Digest-pinned for reproducible, tamper-evident builds. The digest is the
# authoritative reference; the tag in front of it is a human-readable reminder only,
# so overriding PYTHON_VERSION has no effect unless the digest is updated as well.
# Re-resolve with: docker buildx imagetools inspect python:${PYTHON_VERSION}-slim
FROM python:${PYTHON_VERSION}-slim@sha256:d764629ce0ddd8c71fd371e9901efb324a95789d2315a47db7e4d27e78f1b0e9 AS base
ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
Expand All @@ -19,7 +22,8 @@ LABEL org.opencontainers.image.source="https://github.com/Neverdecel/CodeRAG" \
org.opencontainers.image.licenses="Apache-2.0"

# uv for fast, reproducible installs (copied from the official image).
COPY --from=ghcr.io/astral-sh/uv:0.8.17 /uv /uvx /usr/local/bin/
# Digest-pinned (re-resolve with: docker buildx imagetools inspect ghcr.io/astral-sh/uv:0.8.17).
COPY --from=ghcr.io/astral-sh/uv:0.8.17@sha256:e4644cb5bd56fdc2c5ea3ee0525d9d21eed1603bccd6a21f887a938be7e85be1 /uv /uvx /usr/local/bin/

WORKDIR /app
# pyproject reads README.md and LICENSE-2.0.txt, so both are needed to build the wheel.
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ curl "http://127.0.0.1:8000/file?path=coderag/api.py&start_line=1&end_line=40"

Self-host it once and point any number of custom apps or teammates at a big shared codebase.

> **Security.** The API is **unauthenticated by default** and can read indexed source and
> file contents. Keep it on `127.0.0.1` for local use, or set `CODERAG_API_KEY` (sent as
> `Authorization: Bearer <key>` or `X-API-Key`) and front it with TLS / an authenticating
> proxy before exposing it. CORS stays off unless you set `CODERAG_CORS_ORIGINS`. The
> `/file` endpoint only serves files that are actually indexed.

### Web UI (`coderag ui`)

Streamlit app: search box, retrieved chunks with `path:line` citations and similarity
Expand Down Expand Up @@ -131,7 +137,8 @@ docker run --rm -p 8501:8501 \
Tags: `:beta` (latest `master`), `:edge` (alias), `:sha-<commit>` (immutable); the UI image
adds a `-ui` suffix. The container indexes `/workspace` and stores its index in `/data`
(`CODERAG_WATCHED_DIR` / `CODERAG_STORE_DIR`). For OpenAI embeddings/answers, add
`-e OPENAI_API_KEY=…`.
`-e OPENAI_API_KEY=…`. The container binds `0.0.0.0`, so set `-e CODERAG_API_KEY=…` and
keep the port on a trusted network (or behind an authenticating proxy) when exposing it.

## ☸️ Kubernetes (Helm)

Expand Down Expand Up @@ -199,6 +206,8 @@ Everything is configurable via `CODERAG_*` environment variables or a `.env` fil
| `CODERAG_CHAT_MODEL` | `gpt-4o-mini` | OpenAI (or self-hosted) chat model for answers |
| `ANTHROPIC_API_KEY` | – | Anthropic (Claude) answers |
| `CODERAG_ANTHROPIC_MODEL` | `claude-opus-4-8` | Anthropic chat model for answers |
| `CODERAG_API_KEY` | – | If set, the HTTP API **requires** it (`Authorization: Bearer <key>` or `X-API-Key`). Set whenever the server is reachable beyond localhost. |
| `CODERAG_CORS_ORIGINS` | – | Comma-separated CORS allowlist for the HTTP API (never `*`). Empty ⇒ no cross-origin browser access. |

## 🧩 Supported languages

Expand Down
30 changes: 26 additions & 4 deletions coderag/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def __init__(self, config: Optional[Config] = None) -> None:
self._vectors: Optional["FaissVectorIndex"] = None
self._indexer: Optional["Indexer"] = None
self._searcher: Optional["HybridSearcher"] = None
# Set when the store's embedding model/dim changed and the FAISS cache must
# be rebuilt from scratch (consumed when the vector index is first opened).
self._rebuild_required: bool = False

# --- lazily constructed collaborators ---

Expand All @@ -52,17 +55,28 @@ def store(self) -> "SQLiteStore":

self.config.store_dir.mkdir(parents=True, exist_ok=True)
self._store = SQLiteStore(self.config.db_path)
self._store.bootstrap(self.provider.dim, self.provider.model_id)
# bootstrap() returns True when the embedding model/dim changed and the
# store was cleared — the vector cache must then be fully rebuilt.
self._rebuild_required = self._store.bootstrap(
self.provider.dim, self.provider.model_id
)
return self._store

@property
def vectors(self) -> "FaissVectorIndex":
if self._vectors is None:
from coderag.store.vector_index import FaissVectorIndex

# Access the store first so its bootstrap() runs and sets the rebuild flag.
store = self.store
self._vectors = FaissVectorIndex.open(self.config, self.provider.dim)
# FAISS is a rebuildable cache; reconcile with the source of truth on open.
self._vectors.ensure_consistent(self.store)
# An explicit rebuild signal (model/dim changed) forces a clean rebuild
# rather than relying on a chunk-count mismatch as a proxy.
if self._rebuild_required:
self._vectors.rebuild_from_store(store)
else:
self._vectors.ensure_consistent(store)
return self._vectors

@property
Expand Down Expand Up @@ -108,11 +122,19 @@ def get_file(
start_line: Optional[int] = None,
end_line: Optional[int] = None,
) -> str:
"""Return the contents of an indexed file, optionally a 1-based line range."""
full = (self.config.watched_dir / Path(path)).resolve()
"""Return the contents of an indexed file, optionally a 1-based line range.

Only files that are actually in the index can be read — this is a defense in
depth so the endpoint can't be used to read arbitrary files (e.g. ``.env`` or
``.git`` contents) that merely happen to sit under the watched root.
"""
root = self.config.watched_dir.resolve()
full = (root / Path(path)).resolve()
if root not in full.parents and full != root:
raise ValueError(f"Path escapes the indexed root: {path}")
rel = full.relative_to(root).as_posix()
if self.store.get_file(rel) is None:
raise FileNotFoundError(f"Not an indexed file: {path}")
text = full.read_text(encoding="utf-8", errors="replace")
if start_line is None and end_line is None:
return text
Expand Down
Loading
Loading