From 910f7c428a3f2974934c7196d2e04ec0bee3d525 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Tue, 6 Jan 2026 06:27:54 -0800
Subject: [PATCH 01/11] feat: add smoke tests for CLI integration testing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add smoke tests that verify end-to-end CLI functionality
- Test basic CLI operations (--version, --help, error handling)
- Test eval command with echo provider (no external dependencies)
- Test output formats (JSON, YAML, CSV)
- Test CLI flags (--repeat, --max-concurrency, --verbose, --no-cache)
- Test exit codes (0 for success, 100 for failures, 1 for errors)
- Test assertions (contains, icontains, failing assertions)
- Add pytest configuration with 'smoke' marker for selective testing
- Add comprehensive README documenting smoke test purpose and usage

Total: 20 smoke tests, all passing ✅

Smoke tests run against the installed promptfoo CLI via subprocess,
testing the Python wrapper integration with the Node.js CLI.

Run smoke tests:
  pytest tests/smoke/              # Run all smoke tests
  pytest tests/ -m smoke           # Run only smoke-marked tests
  pytest tests/ -m 'not smoke'     # Skip smoke tests (unit tests only)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 pyproject.toml                                |  13 +
 tests/smoke/README.md                         |  88 ++++
 tests/smoke/__init__.py                       |   1 +
 tests/smoke/fixtures/configs/assertions.yaml  |  22 +
 tests/smoke/fixtures/configs/basic.yaml       |  17 +
 .../fixtures/configs/failing-assertion.yaml   |  17 +
 tests/smoke/test_smoke.py                     | 398 ++++++++++++++++++
 7 files changed, 556 insertions(+)
 create mode 100644 tests/smoke/README.md
 create mode 100644 tests/smoke/__init__.py
 create mode 100644 tests/smoke/fixtures/configs/assertions.yaml
 create mode 100644 tests/smoke/fixtures/configs/basic.yaml
 create mode 100644 tests/smoke/fixtures/configs/failing-assertion.yaml
 create mode 100644 tests/smoke/test_smoke.py

diff --git a/pyproject.toml b/pyproject.toml
index 3d61faf..a092d39 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,3 +98,16 @@ show_error_codes = true
 pretty = true
 check_untyped_defs = true
 disallow_incomplete_defs = true
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "-v",
+    "--strict-markers",
+]
+markers = [
+    "smoke: smoke tests that run the full CLI (slow, requires Node.js)",
+]
diff --git a/tests/smoke/README.md b/tests/smoke/README.md
new file mode 100644
index 0000000..73d813b
--- /dev/null
+++ b/tests/smoke/README.md
@@ -0,0 +1,88 @@
+# Smoke Tests
+
+These smoke tests verify that the core promptfoo CLI functionality works correctly through the Python wrapper.
+
+## What are Smoke Tests?
+
+Smoke tests are high-level integration tests that verify the most critical functionality works end-to-end. They:
+
+- Run against the actual installed `promptfoo` command (which may fall back to `npx promptfoo`)
+- Test the Python wrapper integration with the Node.js CLI
+- Use the `echo` provider to avoid external API dependencies
+- Verify command-line arguments, file I/O, and output formats
+- Check exit codes and error handling
+
+## Running Smoke Tests
+
+```bash
+# Run all smoke tests
+pytest tests/smoke/
+
+# Run with verbose output
+pytest tests/smoke/ -v
+
+# Run a specific test class
+pytest tests/smoke/test_smoke.py::TestEvalCommand
+
+# Run a specific test
+pytest tests/smoke/test_smoke.py::TestEvalCommand::test_basic_eval
+```
+
+## Test Structure
+
+- `test_smoke.py` - Main smoke test suite
+- `fixtures/` - Test configuration files
+  - `configs/` - YAML configuration files for testing
+
+## Test Coverage
+
+### Basic CLI Operations
+- Version flag (`--version`)
+- Help output (`--help`, `eval --help`)
+- Unknown command handling
+- Missing file error handling
+
+### Eval Command
+- Basic evaluation with echo provider
+- Output formats (JSON, YAML, CSV)
+- Command-line flags (`--max-concurrency`, `--repeat`, `--verbose`)
+- Cache control (`--no-cache`)
+
+### Exit Codes
+- Exit code 0 for success
+- Exit code 100 for assertion failures
+- Exit code 1 for configuration errors
+
+### Echo Provider
+- Basic prompt echoing
+- Variable substitution
+- Multiple variable handling
+
+### Assertions
+- `contains` assertion
+- `icontains` assertion (case-insensitive)
+- Multiple assertions per test
+- Failing assertion behavior
+
+## Why Echo Provider?
+
+The `echo` provider is perfect for smoke tests because:
+
+1. **No external dependencies** - Doesn't require API keys or network calls
+2. **Deterministic** - Always returns the same output for the same input
+3. **Fast** - No network latency
+4. **Predictable** - Easy to write assertions against
+
+## Adding New Smoke Tests
+
+1. Create a new test config in `fixtures/configs/` if needed
+2. Add test methods to the appropriate test class in `test_smoke.py`
+3. Use the `run_promptfoo()` helper to execute CLI commands
+4. Make assertions on stdout, stderr, exit codes, and output files
+
+## Notes
+
+- Smoke tests run slower than unit tests (they spawn subprocesses)
+- They require Node.js and promptfoo to be installed
+- They test the integration between Python and Node.js
+- They should be kept focused on critical functionality
diff --git a/tests/smoke/__init__.py b/tests/smoke/__init__.py
new file mode 100644
index 0000000..a2573de
--- /dev/null
+++ b/tests/smoke/__init__.py
@@ -0,0 +1 @@
+"""Smoke tests for promptfoo CLI."""
diff --git a/tests/smoke/fixtures/configs/assertions.yaml b/tests/smoke/fixtures/configs/assertions.yaml
new file mode 100644
index 0000000..b03ee62
--- /dev/null
+++ b/tests/smoke/fixtures/configs/assertions.yaml
@@ -0,0 +1,22 @@
+# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
+description: 'Smoke test - multiple assertions'
+
+providers:
+  - echo
+
+prompts:
+  - 'Hello {{name}}, welcome to {{place}}'
+
+tests:
+  - vars:
+      name: Alice
+      place: Wonderland
+    assert:
+      - type: contains
+        value: Hello
+      - type: contains
+        value: Alice
+      - type: contains
+        value: Wonderland
+      - type: icontains
+        value: WELCOME
diff --git a/tests/smoke/fixtures/configs/basic.yaml b/tests/smoke/fixtures/configs/basic.yaml
new file mode 100644
index 0000000..936bb4a
--- /dev/null
+++ b/tests/smoke/fixtures/configs/basic.yaml
@@ -0,0 +1,17 @@
+# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
+description: 'Smoke test - basic config validation'
+
+providers:
+  - echo
+
+prompts:
+  - 'Hello {{name}}'
+
+tests:
+  - vars:
+      name: World
+    assert:
+      - type: contains
+        value: Hello
+      - type: contains
+        value: World
diff --git a/tests/smoke/fixtures/configs/failing-assertion.yaml b/tests/smoke/fixtures/configs/failing-assertion.yaml
new file mode 100644
index 0000000..ee8d327
--- /dev/null
+++ b/tests/smoke/fixtures/configs/failing-assertion.yaml
@@ -0,0 +1,17 @@
+# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
+description: 'Smoke test - config with failing assertion'
+
+providers:
+  - echo
+
+prompts:
+  - 'Hello {{name}}'
+
+tests:
+  - vars:
+      name: World
+    assert:
+      # This assertion will fail because echo returns "Hello World"
+      # but we're asserting it contains "IMPOSSIBLE_STRING_NOT_IN_OUTPUT_12345"
+      - type: contains
+        value: IMPOSSIBLE_STRING_NOT_IN_OUTPUT_12345
diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
new file mode 100644
index 0000000..143db70
--- /dev/null
+++ b/tests/smoke/test_smoke.py
@@ -0,0 +1,398 @@
+"""
+Smoke tests for the promptfoo CLI.
+
+These tests verify the core evaluation pipeline works correctly
+using the echo provider (no external API dependencies).
+
+These tests invoke the installed promptfoo command in a subprocess,
+testing the Python wrapper integration.
+"""
+
+import json
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+import pytest
+
+# Mark all tests in this module as smoke tests
+pytestmark = pytest.mark.smoke
+
+# Directories
+SMOKE_DIR = Path(__file__).parent
+FIXTURES_DIR = SMOKE_DIR / "fixtures"
+CONFIGS_DIR = FIXTURES_DIR / "configs"
+OUTPUT_DIR = SMOKE_DIR / ".temp-output"
+
+
+def run_promptfoo(
+    args: list[str],
+    cwd: Path | None = None,
+    expect_error: bool = False,
+    env: dict[str, str] | None = None,
+) -> tuple[str, str, int]:
+    """
+    Run promptfoo CLI and capture output.
+
+    Args:
+        args: CLI arguments to pass to promptfoo
+        cwd: Working directory for the command (defaults to Path.cwd())
+        expect_error: If True, skip the debug dump printed on a non-zero exit
+        env: Extra environment variables layered on top of os.environ
+
+    Returns:
+        Tuple of (stdout, stderr, exit_code)
+    """
+    cmd = ["promptfoo"] + args
+
+    full_env = os.environ.copy()
+    full_env["NO_COLOR"] = "1"  # Disable color output for easier parsing
+    if env:
+        full_env.update(env)
+
+    result = subprocess.run(
+        cmd,
+        cwd=cwd or Path.cwd(),
+        capture_output=True,
+        text=True,
+        env=full_env,
+        timeout=60,  # Eval can take longer
+    )
+
+    stdout = result.stdout
+    stderr = result.stderr
+    exit_code = result.returncode
+
+    if not expect_error and exit_code != 0:
+        # Unexpected failure: print the command and its output to aid debugging
+        print(f"Command failed: {' '.join(cmd)}")
+        print(f"Exit code: {exit_code}")
+        print(f"STDOUT:\n{stdout}")
+        print(f"STDERR:\n{stderr}")
+
+    return stdout, stderr, exit_code
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup_and_teardown():
+    """Create and cleanup output directory for smoke tests."""
+    OUTPUT_DIR.mkdir(exist_ok=True)
+    yield
+    if OUTPUT_DIR.exists():
+        shutil.rmtree(OUTPUT_DIR)
+
+
+class TestBasicCLI:
+    """Basic CLI operations smoke tests."""
+
+    def test_version_flag(self):
+        """Test --version flag outputs version."""
+        stdout, stderr, exit_code = run_promptfoo(["--version"])
+
+        assert exit_code == 0
+        # Only non-empty output is required; the version format is not validated
+        assert stdout.strip(), "Version output should not be empty"
+
+    def test_help_flag(self):
+        """Test --help flag outputs help."""
+        stdout, stderr, exit_code = run_promptfoo(["--help"])
+
+        assert exit_code == 0
+        assert "promptfoo" in stdout.lower()
+        assert "eval" in stdout.lower()
+
+    def test_eval_help(self):
+        """Test 'eval --help' outputs eval command help."""
+        stdout, stderr, exit_code = run_promptfoo(["eval", "--help"])
+
+        assert exit_code == 0
+        assert "--config" in stdout or "-c" in stdout
+        assert "--output" in stdout or "-o" in stdout
+
+    def test_unknown_command(self):
+        """Test unknown command returns error."""
+        stdout, stderr, exit_code = run_promptfoo(
+            ["unknowncommand123"],
+            expect_error=True,
+        )
+
+        assert exit_code != 0
+        output = stdout + stderr
+        assert "unknown" in output.lower() or "not found" in output.lower()
+
+    def test_missing_config_file(self):
+        """Test missing config file returns error."""
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", "nonexistent-config-file.yaml"],
+            expect_error=True,
+        )
+
+        assert exit_code != 0
+        output = stdout + stderr
+        # Should indicate the file wasn't found
+        assert any(
+            phrase in output.lower()
+            for phrase in [
+                "not found",
+                "no such file",
+                "does not exist",
+                "cannot find",
+                "no configuration file",
+            ]
+        )
+
+
+class TestEvalCommand:
+    """Eval command smoke tests."""
+
+    def test_basic_eval(self):
+        """Test basic eval with echo provider."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"]
+        )
+
+        assert exit_code == 0, f"Eval failed:\nSTDOUT: {stdout}\nSTDERR: {stderr}"
+        # Should show evaluation results
+        assert "pass" in stdout.lower() or "✓" in stdout or "success" in stdout.lower()
+
+    def test_json_output(self):
+        """Test eval outputs valid JSON."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "output.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0, f"Eval failed:\nSTDOUT: {stdout}\nSTDERR: {stderr}"
+        assert output_path.exists(), "Output file was not created"
+
+        # Verify it's valid JSON with expected structure
+        with open(output_path) as f:
+            data = json.load(f)
+
+        assert "results" in data
+        assert "results" in data["results"]
+        assert isinstance(data["results"]["results"], list)
+        assert len(data["results"]["results"]) > 0
+
+        # Verify echo provider returns the prompt
+        first_result = data["results"]["results"][0]
+        assert "response" in first_result
+        assert "output" in first_result["response"]
+        output_text = first_result["response"]["output"]
+        assert "Hello" in output_text
+        assert "World" in output_text
+
+    def test_yaml_output(self):
+        """Test eval outputs YAML format."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "output.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+        assert output_path.exists()
+
+        # Verify it contains YAML-like content
+        with open(output_path) as f:
+            content = f.read()
+
+        assert "results:" in content
+
+    def test_csv_output(self):
+        """Test eval outputs CSV format."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "output.csv"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+        assert output_path.exists()
+
+        # Verify it's CSV format (has header row with columns)
+        with open(output_path) as f:
+            content = f.read()
+
+        lines = content.strip().split("\n")
+        assert len(lines) > 0
+        # CSV should have comma-separated values
+        assert "," in lines[0]
+
+    def test_max_concurrency_flag(self):
+        """Test --max-concurrency flag."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--max-concurrency", "1", "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+    def test_repeat_flag(self):
+        """Test --repeat flag runs tests multiple times."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "repeat-output.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            [
+                "eval",
+                "-c",
+                str(config_path),
+                "--repeat",
+                "2",
+                "-o",
+                str(output_path),
+                "--no-cache",
+            ]
+        )
+
+        assert exit_code == 0
+
+        # Verify we got repeated results
+        with open(output_path) as f:
+            data = json.load(f)
+
+        # With repeat=2 and 1 test case, we should have 2 results
+        assert len(data["results"]["results"]) == 2
+
+    def test_verbose_flag(self):
+        """Test --verbose flag."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--verbose", "--no-cache"]
+        )
+
+        assert exit_code == 0
+        # Verbose mode should produce output
+        assert len(stdout) > 0 or len(stderr) > 0
+
+
+class TestExitCodes:
+    """Exit code smoke tests."""
+
+    def test_success_exit_code(self):
+        """Test exit code 0 when all assertions pass."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+    def test_failure_exit_code(self):
+        """Test exit code 100 when assertions fail."""
+        config_path = CONFIGS_DIR / "failing-assertion.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"],
+            expect_error=True,
+        )
+
+        # Exit code 100 indicates test failures
+        assert exit_code == 100, f"Expected exit code 100, got {exit_code}"
+
+    def test_config_error_exit_code(self):
+        """Test exit code 1 for config errors."""
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", "nonexistent-file.yaml", "--no-cache"],
+            expect_error=True,
+        )
+
+        assert exit_code == 1
+
+
+class TestEchoProvider:
+    """Echo provider smoke tests."""
+
+    def test_echo_provider_basic(self):
+        """Test echo provider returns the prompt."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "echo-test.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+        # Verify echo provider returns the prompt
+        with open(output_path) as f:
+            data = json.load(f)
+
+        first_result = data["results"]["results"][0]
+
+        # Echo provider should return the prompt in the response
+        output = first_result["response"]["output"]
+        assert "Hello" in output
+        assert "World" in output
+
+    def test_echo_provider_with_multiple_vars(self):
+        """Test echo provider with multiple variables."""
+        config_path = CONFIGS_DIR / "assertions.yaml"
+        output_path = OUTPUT_DIR / "echo-multi-var.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+        with open(output_path) as f:
+            data = json.load(f)
+
+        first_result = data["results"]["results"][0]
+        output = first_result["response"]["output"]
+
+        # Should contain all variable values
+        assert "Alice" in output
+        assert "Wonderland" in output
+
+
+class TestAssertions:
+    """Assertion smoke tests."""
+
+    def test_contains_assertion(self):
+        """Test contains assertion."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+        # All assertions should pass
+        assert "pass" in stdout.lower() or "✓" in stdout or "success" in stdout.lower()
+
+    def test_multiple_assertions(self):
+        """Test multiple assertions in single test."""
+        config_path = CONFIGS_DIR / "assertions.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+    def test_failing_assertion(self):
+        """Test failing assertion."""
+        config_path = CONFIGS_DIR / "failing-assertion.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"],
+            expect_error=True,
+        )
+
+        # Should fail with exit code 100
+        assert exit_code == 100
+        output = stdout + stderr
+        # Should indicate failure
+        assert "fail" in output.lower() or "✗" in output or "error" in output.lower()

From 60dff7d68466ba517d69d7c8ac3adefc9eedb782 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Tue, 6 Jan 2026 06:32:45 -0800
Subject: [PATCH 02/11] ci: run unit tests and smoke tests in CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the CI was only testing CLI invocation but not running pytest.

Changes:
- Install dev dependencies (pytest, mypy, ruff) in test jobs
- Run unit tests with: pytest tests/ -v -m 'not smoke'
- Run smoke tests with: pytest tests/smoke/ -v
- Both 'test' and 'test-npx-fallback' jobs now run full test suite

This ensures:
✅ Unit tests run on all platforms (ubuntu, windows) and Python versions (3.9, 3.13)
✅ Smoke tests verify end-to-end CLI functionality
✅ Both global install and npx fallback paths are tested
---
 .github/workflows/test.yml | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index bf7a19a..aae9d59 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -126,8 +126,14 @@ jobs:
       - name: Pin Python version
         run: uv python pin ${{ matrix.python-version }}
 
-      - name: Install package
-        run: uv sync
+      - name: Install package with dev dependencies
+        run: uv sync --extra dev
+
+      - name: Run unit tests
+        run: uv run pytest tests/ -v -m 'not smoke'
+
+      - name: Run smoke tests
+        run: uv run pytest tests/smoke/ -v
 
       - name: Test CLI can be invoked
         run: uv run promptfoo --version
@@ -192,8 +198,14 @@ jobs:
       - name: Pin Python version
         run: uv python pin ${{ matrix.python-version }}
 
-      - name: Install package
-        run: uv sync
+      - name: Install package with dev dependencies
+        run: uv sync --extra dev
+
+      - name: Run unit tests
+        run: uv run pytest tests/ -v -m 'not smoke'
+
+      - name: Run smoke tests (with npx fallback)
+        run: uv run pytest tests/smoke/ -v
 
       - name: Test CLI fallback to npx (no global install)
         run: uv run promptfoo --version

From 6193feb39f7ffa0a223ceb67d7653fbb22adbadd Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Tue, 6 Jan 2026 06:37:43 -0800
Subject: [PATCH 03/11] fix: use Optional for Python 3.9 compatibility in smoke
 tests

---
 tests/smoke/test_smoke.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
index 143db70..73377c1 100644
--- a/tests/smoke/test_smoke.py
+++ b/tests/smoke/test_smoke.py
@@ -13,6 +13,7 @@
 import shutil
 import subprocess
 from pathlib import Path
+from typing import Optional
 
 import pytest
 
@@ -28,9 +29,9 @@
 
 def run_promptfoo(
     args: list[str],
-    cwd: Path | None = None,
+    cwd: Optional[Path] = None,
     expect_error: bool = False,
-    env: dict[str, str] | None = None,
+    env: Optional[dict[str, str]] = None,
 ) -> tuple[str, str, int]:
     """
     Run promptfoo CLI and capture output.

From 3f4e9fd433d3c4b19cb8ed1ed6a380ce8e442527 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Tue, 6 Jan 2026 06:58:41 -0800
Subject: [PATCH 04/11] fix: make platform-specific tests work on both Unix and
 Windows

- Split test_split_path into platform-specific versions (Unix/Windows)
- Split test_find_external_promptfoo_prevents_recursion for platform paths
- Use platform-appropriate node path in test_main_exits_when_neither_external_nor_npx_available
- Tests now skip appropriately on incompatible platforms
---
 tests/test_cli.py | 54 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1acf1c7..e32f2e9 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -104,6 +104,7 @@ def test_strip_quotes(self, input_path: str, expected: str) -> None:
         """Quote stripping handles various quote patterns correctly."""
         assert _strip_quotes(input_path) == expected
 
+    @pytest.mark.skipif(sys.platform == "win32", reason="Unix-style PATH separator test")
     @pytest.mark.parametrize(
         "path_value,expected",
         [
@@ -115,8 +116,24 @@ def test_strip_quotes(self, input_path: str, expected: str) -> None:
             (":::", []),  # Only separators
         ],
     )
-    def test_split_path(self, path_value: str, expected: list[str]) -> None:
-        """PATH splitting handles quotes, empty entries, and whitespace."""
+    def test_split_path_unix(self, path_value: str, expected: list[str]) -> None:
+        """PATH splitting handles quotes, empty entries, and whitespace on Unix."""
+        assert _split_path(path_value) == expected
+
+    @pytest.mark.skipif(sys.platform != "win32", reason="Windows-style PATH separator test")
+    @pytest.mark.parametrize(
+        "path_value,expected",
+        [
+            ("C:\\bin;C:\\tools", ["C:\\bin", "C:\\tools"]),
+            ('"C:\\bin";C:\\tools', ["C:\\bin", "C:\\tools"]),
+            ("C:\\bin;;C:\\tools", ["C:\\bin", "C:\\tools"]),  # Empty entry removed
+            ("  C:\\bin  ;  C:\\tools  ", ["C:\\bin", "C:\\tools"]),  # Whitespace
+            ("", []),
+            (";;;", []),  # Only separators
+        ],
+    )
+    def test_split_path_windows(self, path_value: str, expected: list[str]) -> None:
+        """PATH splitting handles quotes, empty entries, and whitespace on Windows."""
         assert _split_path(path_value) == expected
 
 
@@ -224,8 +241,9 @@ def test_find_external_promptfoo_when_found(self, monkeypatch: pytest.MonkeyPatc
         result = _find_external_promptfoo()
         assert result == promptfoo_path
 
-    def test_find_external_promptfoo_prevents_recursion(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        """Filters out wrapper directory from PATH to prevent recursion."""
+    @pytest.mark.skipif(sys.platform == "win32", reason="Unix-specific recursion test")
+    def test_find_external_promptfoo_prevents_recursion_unix(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """Filters out wrapper directory from PATH to prevent recursion on Unix."""
         wrapper_path = "/home/user/.local/bin/promptfoo"
         real_promptfoo = "/usr/local/bin/promptfoo"
 
@@ -246,6 +264,29 @@ def mock_which(cmd: str, path: Optional[str] = None) -> Optional[str]:
         result = _find_external_promptfoo()
         assert result == real_promptfoo
 
+    @pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific recursion test")
+    def test_find_external_promptfoo_prevents_recursion_windows(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """Filters out wrapper directory from PATH to prevent recursion on Windows."""
+        wrapper_path = "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python312\\Scripts\\promptfoo.exe"
+        real_promptfoo = "C:\\npm\\prefix\\promptfoo.cmd"
+
+        monkeypatch.setattr(sys, "argv", [wrapper_path])
+        monkeypatch.setenv("PATH", "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python312\\Scripts;C:\\npm\\prefix")
+
+        def mock_which(cmd: str, path: Optional[str] = None) -> Optional[str]:
+            if cmd != "promptfoo":
+                return None
+            if path is None:
+                return wrapper_path
+            # When called with filtered PATH, return the real one
+            if "Python312\\Scripts" not in path:
+                return real_promptfoo
+            return None
+
+        monkeypatch.setattr("shutil.which", mock_which)
+        result = _find_external_promptfoo()
+        assert result == real_promptfoo
+
 
 class TestShellRequirement:
     """Test Windows shell requirement detection for .bat/.cmd files."""
@@ -427,9 +468,12 @@ def test_main_exits_when_neither_external_nor_npx_available(
         self, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture
     ) -> None:
         """Exits with error when neither external promptfoo nor npx found."""
+        # Use platform-appropriate path for node
+        node_path = "C:\\Program Files\\nodejs\\node.exe" if sys.platform == "win32" else "/usr/bin/node"
+
         monkeypatch.setattr(sys, "argv", ["promptfoo", "eval"])
         monkeypatch.setattr("shutil.which", lambda cmd, path=None: {
-            "node": "/usr/bin/node"
+            "node": node_path
         }.get(cmd))
 
         with pytest.raises(SystemExit) as exc_info:

From 9cd4d1177add344d3a78d2b83dde8a70056a02c7 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Tue, 6 Jan 2026 07:12:11 -0800
Subject: [PATCH 05/11] fix: increase smoke test timeout for npx fallback
 scenarios

The first npx call can be slow as it downloads promptfoo.
Increased timeout from 60s to 120s to accommodate this.
---
 tests/smoke/test_smoke.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
index 73377c1..ec862f0 100644
--- a/tests/smoke/test_smoke.py
+++ b/tests/smoke/test_smoke.py
@@ -58,7 +58,7 @@ def run_promptfoo(
         capture_output=True,
         text=True,
         env=full_env,
-        timeout=60,  # Eval can take longer
+        timeout=120,  # Increased timeout for npx fallback (first npx call downloads promptfoo)
     )
 
     stdout = result.stdout

From 13c1f1d9cfe52d76681ff66bc44a39dd42019256 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Tue, 6 Jan 2026 07:27:45 -0800
Subject: [PATCH 06/11] fix: handle None stdout/stderr in smoke tests

Add safety checks for None values from subprocess.run() output,
which can occur on Windows in certain error conditions.
---
 tests/smoke/test_smoke.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
index ec862f0..f7b889b 100644
--- a/tests/smoke/test_smoke.py
+++ b/tests/smoke/test_smoke.py
@@ -61,8 +61,8 @@ def run_promptfoo(
         timeout=120,  # Increased timeout for npx fallback (first npx call downloads promptfoo)
     )
 
-    stdout = result.stdout
-    stderr = result.stderr
+    stdout = result.stdout or ""
+    stderr = result.stderr or ""
     exit_code = result.returncode
 
     if not expect_error and exit_code != 0:

From 874955b522629dc37293e002fb6b0e87afa08ca3 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Sun, 11 Jan 2026 01:54:32 -0500
Subject: [PATCH 07/11] fix: address linting issues and add temp output to
 gitignore

- Fix line too long (123 > 120) in test_cli.py
- Run ruff format on test files
- Add tests/smoke/.temp-output/ to .gitignore

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .gitignore                |  1 +
 tests/smoke/test_smoke.py | 20 +++++---------------
 tests/test_cli.py         |  7 +++----
 3 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1056d39..bdc5ce5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,6 +42,7 @@ htmlcov/
 .tox/
 .mypy_cache/
 .ruff_cache/
+tests/smoke/.temp-output/
 
 # Distribution
 dist/
diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
index f7b889b..04f2d8d 100644
--- a/tests/smoke/test_smoke.py
+++ b/tests/smoke/test_smoke.py
@@ -150,9 +150,7 @@ class TestEvalCommand:
     def test_basic_eval(self):
         """Test basic eval with echo provider."""
         config_path = CONFIGS_DIR / "basic.yaml"
-        stdout, stderr, exit_code = run_promptfoo(
-            ["eval", "-c", str(config_path), "--no-cache"]
-        )
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
 
         assert exit_code == 0, f"Eval failed:\nSTDOUT: {stdout}\nSTDERR: {stderr}"
         # Should show evaluation results
@@ -267,9 +265,7 @@ def test_verbose_flag(self):
         """Test --verbose flag."""
         config_path = CONFIGS_DIR / "basic.yaml"
 
-        stdout, stderr, exit_code = run_promptfoo(
-            ["eval", "-c", str(config_path), "--verbose", "--no-cache"]
-        )
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--verbose", "--no-cache"])
 
         assert exit_code == 0
         # Verbose mode should produce output
@@ -283,9 +279,7 @@ def test_success_exit_code(self):
         """Test exit code 0 when all assertions pass."""
         config_path = CONFIGS_DIR / "basic.yaml"
 
-        stdout, stderr, exit_code = run_promptfoo(
-            ["eval", "-c", str(config_path), "--no-cache"]
-        )
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
 
         assert exit_code == 0
 
@@ -365,9 +359,7 @@ def test_contains_assertion(self):
         """Test contains assertion."""
         config_path = CONFIGS_DIR / "basic.yaml"
 
-        stdout, stderr, exit_code = run_promptfoo(
-            ["eval", "-c", str(config_path), "--no-cache"]
-        )
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
 
         assert exit_code == 0
         # All assertions should pass
@@ -377,9 +369,7 @@ def test_multiple_assertions(self):
         """Test multiple assertions in single test."""
         config_path = CONFIGS_DIR / "assertions.yaml"
 
-        stdout, stderr, exit_code = run_promptfoo(
-            ["eval", "-c", str(config_path), "--no-cache"]
-        )
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
 
         assert exit_code == 0
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 28218ad..82ee073 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -268,7 +268,8 @@ def test_find_external_promptfoo_prevents_recursion_windows(self, monkeypatch: p
         real_promptfoo = "C:\\npm\\prefix\\promptfoo.cmd"
 
         monkeypatch.setattr(sys, "argv", [wrapper_path])
-        monkeypatch.setenv("PATH", "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python312\\Scripts;C:\\npm\\prefix")
+        test_path = "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python312\\Scripts;C:\\npm\\prefix"
+        monkeypatch.setenv("PATH", test_path)
 
         def mock_which(cmd: str, path: Optional[str] = None) -> Optional[str]:
             if cmd != "promptfoo":
@@ -471,9 +472,7 @@ def test_main_exits_when_neither_external_nor_npx_available(
         node_path = "C:\\Program Files\\nodejs\\node.exe" if sys.platform == "win32" else "/usr/bin/node"
 
         monkeypatch.setattr(sys, "argv", ["promptfoo", "eval"])
-        monkeypatch.setattr("shutil.which", lambda cmd, path=None: {
-            "node": node_path
-        }.get(cmd))
+        monkeypatch.setattr("shutil.which", lambda cmd, path=None: {"node": node_path}.get(cmd))
 
         with pytest.raises(SystemExit) as exc_info:
             main()

From 055b21160b30c66d8ab8b81088130b00ee31d2e3 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Sun, 11 Jan 2026 01:57:18 -0500
Subject: [PATCH 08/11] docs: update AGENTS.md with smoke test documentation

- Add comprehensive testing strategy section with unit vs smoke tests
- Document test directory structure
- Add smoke test details and commands
- Update CI/CD section to mention both test types
- Update project structure to include tests directory

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 AGENTS.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 7 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 84e5a2d..35f7fc0 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -135,9 +135,12 @@ Runs on every PR and push to main:
 - **Lint**: Ruff linting (`uv run ruff check src/`)
 - **Format Check**: Ruff formatting (`uv run ruff format --check src/`)
 - **Type Check**: mypy static analysis (`uv run mypy src/promptfoo/`)
-- **Tests**: pytest on multiple Python versions (3.9, 3.13) and OSes (Ubuntu, Windows)
+- **Unit Tests**: Fast tests with mocked dependencies (`uv run pytest -m 'not smoke'`)
+- **Smoke Tests**: Integration tests against the real CLI (`uv run pytest tests/smoke/`)
 - **Build**: Package build validation
 
+Tests run on multiple Python versions (3.9, 3.13) and OSes (Ubuntu, Windows).
+
 ### Release Workflow (`.github/workflows/release-please.yml`)
 
 Triggered on push to main:
@@ -214,7 +217,38 @@ uv run pytest
 
 ### Test Structure
 
-Tests are located in the root directory (not yet created, but should be in `tests/` when added).
+Tests are organized in the `tests/` directory:
+
+```
+tests/
+├── __init__.py
+├── test_cli.py              # Unit tests for CLI wrapper logic
+├── test_environment.py      # Unit tests for environment detection
+├── test_instructions.py     # Unit tests for installation instructions
+└── smoke/
+    ├── __init__.py
+    ├── README.md            # Smoke test documentation
+    ├── test_smoke.py        # Integration tests against real CLI
+    └── fixtures/
+        └── configs/         # YAML configs for smoke tests
+            ├── basic.yaml
+            ├── assertions.yaml
+            └── failing-assertion.yaml
+```
+
+### Test Types
+
+**Unit Tests** (`tests/test_*.py`):
+- Fast, isolated tests for individual functions
+- Mock external dependencies
+- Run on every PR
+
+**Smoke Tests** (`tests/smoke/`):
+- Integration tests that run the actual CLI via subprocess
+- Use the `echo` provider (no external API dependencies)
+- Test the full Python → Node.js integration
+- Slower but verify end-to-end functionality
+- Marked with `@pytest.mark.smoke`
 
 ### Test Matrix
 
@@ -229,16 +263,36 @@ CI tests across:
 # Install dependencies with dev extras
 uv sync --extra dev
 
-# Run all tests
+# Run all tests (unit + smoke)
 uv run pytest
 
+# Run only unit tests (fast)
+uv run pytest -m 'not smoke'
+
+# Run only smoke tests (slow, requires Node.js)
+uv run pytest tests/smoke/
+
 # Run with coverage
 uv run pytest --cov=src/promptfoo
 
+# Run specific test class
+uv run pytest tests/test_cli.py::TestMainFunction
+
 # Run specific test
-uv run pytest tests/test_cli.py::test_wrapper_detection
+uv run pytest tests/smoke/test_smoke.py::TestEvalCommand::test_basic_eval
 ```
 
+### Smoke Test Details
+
+Smoke tests verify critical CLI functionality:
+- **Basic CLI**: `--version`, `--help`, unknown commands, missing files
+- **Eval Command**: Output formats (JSON, YAML, CSV), flags (`--repeat`, `--verbose`)
+- **Exit Codes**: 0 for success, 100 for assertion failures, 1 for errors
+- **Echo Provider**: Variable substitution, multiple variables
+- **Assertions**: `contains`, `icontains`, failing assertions
+
+The smoke tests use a 120-second timeout to accommodate the first `npx` call, which downloads promptfoo.
+
 ## Security Practices
 
 ### 1. No Credentials in Repository
@@ -365,14 +419,23 @@ promptfoo-python/
 ├── src/
 │   └── promptfoo/
 │       ├── __init__.py         # Package exports
-│       └── cli.py              # Main wrapper implementation
+│       ├── cli.py              # Main wrapper implementation
+│       ├── environment.py      # Environment detection
+│       └── instructions.py     # Node.js installation instructions
+├── tests/
+│   ├── test_cli.py             # Unit tests for CLI
+│   ├── test_environment.py     # Unit tests for environment detection
+│   ├── test_instructions.py    # Unit tests for instructions
+│   └── smoke/
+│       ├── test_smoke.py       # Integration smoke tests
+│       └── fixtures/configs/   # Test configuration files
 ├── AGENTS.md                   # This file (agent documentation)
 ├── CHANGELOG.md                # Auto-generated by release-please
 ├── CLAUDE.md                   # Points to AGENTS.md
 ├── LICENSE                     # MIT License
 ├── README.md                   # User-facing documentation
 ├── pyproject.toml              # Package configuration
-├── release-please-config.json # Release-please configuration
+├── release-please-config.json  # Release-please configuration
 └── .release-please-manifest.json # Release version tracking
 ```
 
@@ -443,5 +506,5 @@ git push --force
 
 ---
 
-**Last Updated**: 2026-01-05
+**Last Updated**: 2026-01-11
 **Maintained By**: @promptfoo/engineering

From 44cdf96b90ab0483354cb1d54eb2ce362fddcdfd Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Sun, 11 Jan 2026 02:06:18 -0500
Subject: [PATCH 09/11] style: add return type annotations and fix
 documentation wording

- Add `-> None` return type annotations to all smoke test methods
- Add Generator return type to setup_and_teardown fixture
- Update documentation to clarify tests run via Python wrapper
  (not just npx)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 tests/smoke/README.md     |  2 +-
 tests/smoke/test_smoke.py | 47 ++++++++++++++++++++-------------------
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/tests/smoke/README.md b/tests/smoke/README.md
index 73d813b..43f964e 100644
--- a/tests/smoke/README.md
+++ b/tests/smoke/README.md
@@ -6,7 +6,7 @@ These smoke tests verify that the core promptfoo CLI functionality works correct
 
 Smoke tests are high-level integration tests that verify the most critical functionality works end-to-end. They:
 
-- Run against the actual installed CLI (via `npx promptfoo`)
+- Run against the actual installed CLI via the Python wrapper (using either a globally installed promptfoo or npx)
 - Test the Python wrapper integration with the Node.js CLI
 - Use the `echo` provider to avoid external API dependencies
 - Verify command-line arguments, file I/O, and output formats
diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
index 04f2d8d..aa6bd4c 100644
--- a/tests/smoke/test_smoke.py
+++ b/tests/smoke/test_smoke.py
@@ -4,14 +4,15 @@
 These tests verify the core evaluation pipeline works correctly
 using the echo provider (no external API dependencies).
 
-These tests run against the installed promptfoo package via npx,
-testing the Python wrapper integration.
+These tests run against the installed promptfoo package via the Python wrapper
+(using either a globally installed promptfoo CLI or falling back to npx).
 """
 
 import json
 import os
 import shutil
 import subprocess
+from collections.abc import Generator
 from pathlib import Path
 from typing import Optional
 
@@ -76,7 +77,7 @@ def run_promptfoo(
 
 
 @pytest.fixture(scope="module", autouse=True)
-def setup_and_teardown():
+def setup_and_teardown() -> Generator[None, None, None]:
     """Create and cleanup output directory for smoke tests."""
     OUTPUT_DIR.mkdir(exist_ok=True)
     yield
@@ -87,7 +88,7 @@ def setup_and_teardown():
 class TestBasicCLI:
     """Basic CLI operations smoke tests."""
 
-    def test_version_flag(self):
+    def test_version_flag(self) -> None:
         """Test --version flag outputs version."""
         stdout, stderr, exit_code = run_promptfoo(["--version"])
 
@@ -95,7 +96,7 @@ def test_version_flag(self):
         # Should output a version number (semver format)
         assert stdout.strip(), "Version output should not be empty"
 
-    def test_help_flag(self):
+    def test_help_flag(self) -> None:
         """Test --help flag outputs help."""
         stdout, stderr, exit_code = run_promptfoo(["--help"])
 
@@ -103,7 +104,7 @@ def test_help_flag(self):
         assert "promptfoo" in stdout.lower()
         assert "eval" in stdout.lower()
 
-    def test_eval_help(self):
+    def test_eval_help(self) -> None:
         """Test 'eval --help' outputs eval command help."""
         stdout, stderr, exit_code = run_promptfoo(["eval", "--help"])
 
@@ -111,7 +112,7 @@ def test_eval_help(self):
         assert "--config" in stdout or "-c" in stdout
         assert "--output" in stdout or "-o" in stdout
 
-    def test_unknown_command(self):
+    def test_unknown_command(self) -> None:
         """Test unknown command returns error."""
         stdout, stderr, exit_code = run_promptfoo(
             ["unknowncommand123"],
@@ -122,7 +123,7 @@ def test_unknown_command(self):
         output = stdout + stderr
         assert "unknown" in output.lower() or "not found" in output.lower()
 
-    def test_missing_config_file(self):
+    def test_missing_config_file(self) -> None:
         """Test missing config file returns error."""
         stdout, stderr, exit_code = run_promptfoo(
             ["eval", "-c", "nonexistent-config-file.yaml"],
@@ -147,7 +148,7 @@ def test_missing_config_file(self):
 class TestEvalCommand:
     """Eval command smoke tests."""
 
-    def test_basic_eval(self):
+    def test_basic_eval(self) -> None:
         """Test basic eval with echo provider."""
         config_path = CONFIGS_DIR / "basic.yaml"
         stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
@@ -156,7 +157,7 @@ def test_basic_eval(self):
         # Should show evaluation results
         assert "pass" in stdout.lower() or "✓" in stdout or "success" in stdout.lower()
 
-    def test_json_output(self):
+    def test_json_output(self) -> None:
         """Test eval outputs valid JSON."""
         config_path = CONFIGS_DIR / "basic.yaml"
         output_path = OUTPUT_DIR / "output.json"
@@ -185,7 +186,7 @@ def test_json_output(self):
         assert "Hello" in output_text
         assert "World" in output_text
 
-    def test_yaml_output(self):
+    def test_yaml_output(self) -> None:
         """Test eval outputs YAML format."""
         config_path = CONFIGS_DIR / "basic.yaml"
         output_path = OUTPUT_DIR / "output.yaml"
@@ -203,7 +204,7 @@ def test_yaml_output(self):
 
         assert "results:" in content
 
-    def test_csv_output(self):
+    def test_csv_output(self) -> None:
         """Test eval outputs CSV format."""
         config_path = CONFIGS_DIR / "basic.yaml"
         output_path = OUTPUT_DIR / "output.csv"
@@ -224,7 +225,7 @@ def test_csv_output(self):
         # CSV should have comma-separated values
         assert "," in lines[0]
 
-    def test_max_concurrency_flag(self):
+    def test_max_concurrency_flag(self) -> None:
         """Test --max-concurrency flag."""
         config_path = CONFIGS_DIR / "basic.yaml"
 
@@ -234,7 +235,7 @@ def test_max_concurrency_flag(self):
 
         assert exit_code == 0
 
-    def test_repeat_flag(self):
+    def test_repeat_flag(self) -> None:
         """Test --repeat flag runs tests multiple times."""
         config_path = CONFIGS_DIR / "basic.yaml"
         output_path = OUTPUT_DIR / "repeat-output.json"
@@ -261,7 +262,7 @@ def test_repeat_flag(self):
         # With repeat=2 and 1 test case, we should have 2 results
         assert len(data["results"]["results"]) == 2
 
-    def test_verbose_flag(self):
+    def test_verbose_flag(self) -> None:
         """Test --verbose flag."""
         config_path = CONFIGS_DIR / "basic.yaml"
 
@@ -275,7 +276,7 @@ def test_verbose_flag(self):
 class TestExitCodes:
     """Exit code smoke tests."""
 
-    def test_success_exit_code(self):
+    def test_success_exit_code(self) -> None:
         """Test exit code 0 when all assertions pass."""
         config_path = CONFIGS_DIR / "basic.yaml"
 
@@ -283,7 +284,7 @@ def test_success_exit_code(self):
 
         assert exit_code == 0
 
-    def test_failure_exit_code(self):
+    def test_failure_exit_code(self) -> None:
         """Test exit code 100 when assertions fail."""
         config_path = CONFIGS_DIR / "failing-assertion.yaml"
 
@@ -295,7 +296,7 @@ def test_failure_exit_code(self):
         # Exit code 100 indicates test failures
         assert exit_code == 100, f"Expected exit code 100, got {exit_code}"
 
-    def test_config_error_exit_code(self):
+    def test_config_error_exit_code(self) -> None:
         """Test exit code 1 for config errors."""
         stdout, stderr, exit_code = run_promptfoo(
             ["eval", "-c", "nonexistent-file.yaml", "--no-cache"],
@@ -308,7 +309,7 @@ def test_config_error_exit_code(self):
 class TestEchoProvider:
     """Echo provider smoke tests."""
 
-    def test_echo_provider_basic(self):
+    def test_echo_provider_basic(self) -> None:
         """Test echo provider returns the prompt."""
         config_path = CONFIGS_DIR / "basic.yaml"
         output_path = OUTPUT_DIR / "echo-test.json"
@@ -330,7 +331,7 @@ def test_echo_provider_basic(self):
         assert "Hello" in output
         assert "World" in output
 
-    def test_echo_provider_with_multiple_vars(self):
+    def test_echo_provider_with_multiple_vars(self) -> None:
         """Test echo provider with multiple variables."""
         config_path = CONFIGS_DIR / "assertions.yaml"
         output_path = OUTPUT_DIR / "echo-multi-var.json"
@@ -355,7 +356,7 @@ def test_echo_provider_with_multiple_vars(self):
 class TestAssertions:
     """Assertion smoke tests."""
 
-    def test_contains_assertion(self):
+    def test_contains_assertion(self) -> None:
         """Test contains assertion."""
         config_path = CONFIGS_DIR / "basic.yaml"
 
@@ -365,7 +366,7 @@ def test_contains_assertion(self):
         # All assertions should pass
         assert "pass" in stdout.lower() or "✓" in stdout or "success" in stdout.lower()
 
-    def test_multiple_assertions(self):
+    def test_multiple_assertions(self) -> None:
         """Test multiple assertions in single test."""
         config_path = CONFIGS_DIR / "assertions.yaml"
 
@@ -373,7 +374,7 @@ def test_multiple_assertions(self):
 
         assert exit_code == 0
 
-    def test_failing_assertion(self):
+    def test_failing_assertion(self) -> None:
         """Test failing assertion."""
         config_path = CONFIGS_DIR / "failing-assertion.yaml"
 

From 79e74ac0815d18d4f89be4eb05a23dc5334536be Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Sun, 11 Jan 2026 02:33:02 -0500
Subject: [PATCH 10/11] fix: resolve Windows CI test failures

- Add os.path.isfile mock to unit test to prevent _find_windows_promptfoo()
  from finding real promptfoo installations on Windows CI runners
- Add UTF-8 encoding with error replacement to smoke tests to handle
  Windows cp1252 encoding issues with npx output
- Add warmup_npx fixture to pre-download promptfoo via npx before tests,
  preventing timeout on first test when npx needs to download package

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 tests/smoke/test_smoke.py | 35 ++++++++++++++++++++++++++++++++++-
 tests/test_cli.py         |  3 +++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
index aa6bd4c..7f0fe10 100644
--- a/tests/smoke/test_smoke.py
+++ b/tests/smoke/test_smoke.py
@@ -33,6 +33,7 @@ def run_promptfoo(
     cwd: Optional[Path] = None,
     expect_error: bool = False,
     env: Optional[dict[str, str]] = None,
+    timeout: int = 120,
 ) -> tuple[str, str, int]:
     """
     Run promptfoo CLI and capture output.
@@ -42,6 +43,7 @@ def run_promptfoo(
         cwd: Working directory for the command
         expect_error: If True, don't raise on non-zero exit
         env: Environment variables to set
+        timeout: Timeout in seconds (default 120)
 
     Returns:
         Tuple of (stdout, stderr, exit_code)
@@ -59,7 +61,11 @@ def run_promptfoo(
         capture_output=True,
         text=True,
         env=full_env,
-        timeout=120,  # Increased timeout for npx fallback (first npx call downloads promptfoo)
+        timeout=timeout,
+        # Use UTF-8 encoding with error replacement to handle Windows encoding issues
+        # Windows default cp1252 can't decode some bytes in npx/promptfoo output
+        encoding="utf-8",
+        errors="replace",
     )
 
     stdout = result.stdout or ""
@@ -85,6 +91,33 @@ def setup_and_teardown() -> Generator[None, None, None]:
         shutil.rmtree(OUTPUT_DIR)
 
 
+@pytest.fixture(scope="module", autouse=True)
+def warmup_npx() -> Generator[None, None, None]:
+    """
+    Warm up npx by running promptfoo --version before all tests.
+
+    On npx fallback (when promptfoo isn't globally installed), the first npx call
+    downloads and caches promptfoo, which can take several minutes on Windows.
+    Running this warmup prevents the first actual test from timing out.
+    """
+    # Run with a longer timeout (5 minutes) for the initial npx download
+    try:
+        subprocess.run(
+            ["promptfoo", "--version"],
+            capture_output=True,
+            timeout=300,  # 5 minutes for initial npx download
+            encoding="utf-8",
+            errors="replace",
+        )
+    except subprocess.TimeoutExpired:
+        # If warmup times out, tests will likely fail but let them run anyway
+        pass
+    except FileNotFoundError:
+        # promptfoo not installed, tests will fail but let them try
+        pass
+    yield
+
+
 class TestBasicCLI:
     """Basic CLI operations smoke tests."""
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 82ee073..0c311d8 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -473,6 +473,9 @@ def test_main_exits_when_neither_external_nor_npx_available(
 
         monkeypatch.setattr(sys, "argv", ["promptfoo", "eval"])
         monkeypatch.setattr("shutil.which", lambda cmd, path=None: {"node": node_path}.get(cmd))
+        # Also mock os.path.isfile to prevent _find_windows_promptfoo() from finding
+        # a real promptfoo installation on Windows CI runners
+        monkeypatch.setattr(os.path, "isfile", lambda p: False)
 
         with pytest.raises(SystemExit) as exc_info:
             main()

From 02acd12c40e11a786face89f296c55e20eed8e05 Mon Sep 17 00:00:00 2001
From: mldangelo <michael.l.dangelo@gmail.com>
Date: Sun, 11 Jan 2026 02:41:06 -0500
Subject: [PATCH 11/11] fix: mock telemetry in CLI unit tests

Add record_wrapper_used mock to tests that mock subprocess.run to prevent
PostHog telemetry calls from interfering with mock call counts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 tests/test_cli.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 0c311d8..0e4a1c0 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -386,6 +386,8 @@ def test_main_uses_external_promptfoo_when_available(self, monkeypatch: pytest.M
             "shutil.which",
             lambda cmd, path=None: {"node": "/usr/bin/node", "promptfoo": "/usr/local/bin/promptfoo"}.get(cmd),
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -421,6 +423,8 @@ def test_main_skips_external_when_wrapper_env_set(self, monkeypatch: pytest.Monk
                 "promptfoo": "/usr/local/bin/promptfoo",
             }.get(cmd),
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -444,6 +448,8 @@ def test_main_falls_back_to_npx(self, monkeypatch: pytest.MonkeyPatch) -> None:
         monkeypatch.setattr(
             "shutil.which", lambda cmd, path=None: {"node": "/usr/bin/node", "npx": "/usr/bin/npx"}.get(cmd)
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -490,6 +496,8 @@ def test_main_passes_arguments_correctly(self, monkeypatch: pytest.MonkeyPatch)
         monkeypatch.setattr(
             "shutil.which", lambda cmd, path=None: {"node": "/usr/bin/node", "npx": "/usr/bin/npx"}.get(cmd)
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -512,6 +520,8 @@ def test_main_returns_subprocess_exit_code(self, monkeypatch: pytest.MonkeyPatch
         monkeypatch.setattr(
             "shutil.which", lambda cmd, path=None: {"node": "/usr/bin/node", "npx": "/usr/bin/npx"}.get(cmd)
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         # Test non-zero exit code
         mock_result = subprocess.CompletedProcess([], 42)