diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index bf7a19a..aae9d59 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -126,8 +126,14 @@ jobs:
       - name: Pin Python version
         run: uv python pin ${{ matrix.python-version }}
 
-      - name: Install package
-        run: uv sync
+      - name: Install package with dev dependencies
+        run: uv sync --extra dev
+
+      - name: Run unit tests
+        run: uv run pytest tests/ -v -m 'not smoke'
+
+      - name: Run smoke tests
+        run: uv run pytest tests/smoke/ -v
 
       - name: Test CLI can be invoked
         run: uv run promptfoo --version
@@ -192,8 +198,14 @@ jobs:
       - name: Pin Python version
         run: uv python pin ${{ matrix.python-version }}
 
-      - name: Install package
-        run: uv sync
+      - name: Install package with dev dependencies
+        run: uv sync --extra dev
+
+      - name: Run unit tests
+        run: uv run pytest tests/ -v -m 'not smoke'
+
+      - name: Run smoke tests (with npx fallback)
+        run: uv run pytest tests/smoke/ -v
 
       - name: Test CLI fallback to npx (no global install)
         run: uv run promptfoo --version
diff --git a/.gitignore b/.gitignore
index 1056d39..bdc5ce5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,6 +42,7 @@ htmlcov/
 .tox/
 .mypy_cache/
 .ruff_cache/
+tests/smoke/.temp-output/
 
 # Distribution
 dist/
diff --git a/AGENTS.md b/AGENTS.md
index 84e5a2d..35f7fc0 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -135,9 +135,12 @@ Runs on every PR and push to main:
 - **Lint**: Ruff linting (`uv run ruff check src/`)
 - **Format Check**: Ruff formatting (`uv run ruff format --check src/`)
 - **Type Check**: mypy static analysis (`uv run mypy src/promptfoo/`)
-- **Tests**: pytest on multiple Python versions (3.9, 3.13) and OSes (Ubuntu, Windows)
+- **Unit Tests**: Fast tests with mocked dependencies (`uv run pytest -m 'not smoke'`)
+- **Smoke Tests**: Integration tests against real CLI (`uv run pytest tests/smoke/`)
 - **Build**: Package build validation
 
+Tests run on multiple Python versions (3.9, 3.13) and OSes (Ubuntu, Windows).
+
 ### Release Workflow (`.github/workflows/release-please.yml`)
 
 Triggered on push to main:
@@ -214,7 +217,38 @@ uv run pytest
 
 ### Test Structure
 
-Tests are located in the root directory (not yet created, but should be in `tests/` when added).
+Tests are organized in the `tests/` directory:
+
+```
+tests/
+├── __init__.py
+├── test_cli.py              # Unit tests for CLI wrapper logic
+├── test_environment.py      # Unit tests for environment detection
+├── test_instructions.py     # Unit tests for installation instructions
+└── smoke/
+    ├── __init__.py
+    ├── README.md            # Smoke test documentation
+    ├── test_smoke.py        # Integration tests against real CLI
+    └── fixtures/
+        └── configs/         # YAML configs for smoke tests
+            ├── basic.yaml
+            ├── assertions.yaml
+            └── failing-assertion.yaml
+```
+
+### Test Types
+
+**Unit Tests** (`tests/test_*.py`):
+- Fast, isolated tests for individual functions
+- Mock external dependencies
+- Run on every PR
+
+**Smoke Tests** (`tests/smoke/`):
+- Integration tests that run the actual CLI via subprocess
+- Use the `echo` provider (no external API dependencies)
+- Test the full Python → Node.js integration
+- Slower but verify end-to-end functionality
+- Marked with `@pytest.mark.smoke`
 
 ### Test Matrix
 
@@ -229,16 +263,36 @@ CI tests across:
 # Install dependencies with dev extras
 uv sync --extra dev
 
-# Run all tests
+# Run all tests (unit + smoke)
 uv run pytest
 
+# Run only unit tests (fast)
+uv run pytest -m 'not smoke'
+
+# Run only smoke tests (slow, requires Node.js)
+uv run pytest tests/smoke/
+
 # Run with coverage
 uv run pytest --cov=src/promptfoo
 
+# Run specific test class
+uv run pytest tests/test_cli.py::TestMainFunction
+
 # Run specific test
-uv run pytest tests/test_cli.py::test_wrapper_detection
+uv run pytest tests/smoke/test_smoke.py::TestEvalCommand::test_basic_eval
 ```
 
+### Smoke Test Details
+
+Smoke tests verify critical CLI functionality:
+- **Basic CLI**: `--version`, `--help`, unknown commands, missing files
+- **Eval Command**: Output formats (JSON, YAML, CSV), flags (`--repeat`, `--verbose`)
+- **Exit Codes**: 0 for success, 100 for assertion failures, 1 for errors
+- **Echo Provider**: Variable substitution, multiple variables
+- **Assertions**: `contains`, `icontains`, failing assertions
+
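+Each smoke test command runs with a 120-second timeout. A module-level warmup fixture first runs `promptfoo --version` with a 5-minute timeout so that the initial `npx` download of promptfoo does not count against an individual test.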
+
 ## Security Practices
 
 ### 1. No Credentials in Repository
@@ -365,14 +419,23 @@ promptfoo-python/
 ├── src/
 │   └── promptfoo/
 │       ├── __init__.py         # Package exports
-│       └── cli.py              # Main wrapper implementation
+│       ├── cli.py              # Main wrapper implementation
+│       ├── environment.py      # Environment detection
+│       └── instructions.py     # Node.js installation instructions
+├── tests/
+│   ├── test_cli.py             # Unit tests for CLI
+│   ├── test_environment.py     # Unit tests for environment detection
+│   ├── test_instructions.py    # Unit tests for instructions
+│   └── smoke/
+│       ├── test_smoke.py       # Integration smoke tests
+│       └── fixtures/configs/   # Test configuration files
 ├── AGENTS.md                   # This file (agent documentation)
 ├── CHANGELOG.md                # Auto-generated by release-please
 ├── CLAUDE.md                   # Points to AGENTS.md
 ├── LICENSE                     # MIT License
 ├── README.md                   # User-facing documentation
 ├── pyproject.toml              # Package configuration
-├── release-please-config.json # Release-please configuration
+├── release-please-config.json  # Release-please configuration
 └── .release-please-manifest.json # Release version tracking
 ```
 
@@ -443,5 +506,5 @@ git push --force
 
 ---
 
-**Last Updated**: 2026-01-05
+**Last Updated**: 2026-01-11
 **Maintained By**: @promptfoo/engineering
diff --git a/pyproject.toml b/pyproject.toml
index cf1e18a..10a41be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,3 +102,16 @@ show_error_codes = true
 pretty = true
 check_untyped_defs = true
 disallow_incomplete_defs = true
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "-v",
+    "--strict-markers",
+]
+markers = [
+    "smoke: smoke tests that run the full CLI (slow, requires Node.js)",
+]
diff --git a/tests/smoke/README.md b/tests/smoke/README.md
new file mode 100644
index 0000000..43f964e
--- /dev/null
+++ b/tests/smoke/README.md
@@ -0,0 +1,88 @@
+# Smoke Tests
+
+These smoke tests verify that the core promptfoo CLI functionality works correctly through the Python wrapper.
+
+## What are Smoke Tests?
+
+Smoke tests are high-level integration tests that verify the most critical functionality works end-to-end. They:
+
+- Run against the actual installed CLI via the Python wrapper (using either global promptfoo or npx)
+- Test the Python wrapper integration with the Node.js CLI
+- Use the `echo` provider to avoid external API dependencies
+- Verify command-line arguments, file I/O, and output formats
+- Check exit codes and error handling
+
+## Running Smoke Tests
+
+```bash
+# Run all smoke tests
+pytest tests/smoke/
+
+# Run with verbose output
+pytest tests/smoke/ -v
+
+# Run a specific test class
+pytest tests/smoke/test_smoke.py::TestEvalCommand
+
+# Run a specific test
+pytest tests/smoke/test_smoke.py::TestEvalCommand::test_basic_eval
+```
+
+## Test Structure
+
+- `test_smoke.py` - Main smoke test suite
+- `fixtures/` - Test configuration files
+  - `configs/` - YAML configuration files for testing
+
+## Test Coverage
+
+### Basic CLI Operations
+- Version flag (`--version`)
+- Help output (`--help`, `eval --help`)
+- Unknown command handling
+- Missing file error handling
+
+### Eval Command
+- Basic evaluation with echo provider
+- Output formats (JSON, YAML, CSV)
+- Command-line flags (`--max-concurrency`, `--repeat`, `--verbose`)
+- Cache control (`--no-cache`)
+
+### Exit Codes
+- Exit code 0 for success
+- Exit code 100 for assertion failures
+- Exit code 1 for configuration errors
+
+### Echo Provider
+- Basic prompt echoing
+- Variable substitution
+- Multiple variable handling
+
+### Assertions
+- `contains` assertion
+- `icontains` assertion (case-insensitive)
+- Multiple assertions per test
+- Failing assertion behavior
+
+## Why Echo Provider?
+
+The `echo` provider is perfect for smoke tests because:
+
+1. **No external dependencies** - Doesn't require API keys or network calls
+2. **Deterministic** - Always returns the same output for the same input
+3. **Fast** - No network latency
+4. **Predictable** - Easy to write assertions against
+
+## Adding New Smoke Tests
+
+1. Create a new test config in `fixtures/configs/` if needed
+2. Add test methods to the appropriate test class in `test_smoke.py`
+3. Use the `run_promptfoo()` helper to execute CLI commands
+4. Make assertions on stdout, stderr, exit codes, and output files
+
+## Notes
+
+- Smoke tests run slower than unit tests (they spawn subprocesses)
+- They require Node.js; promptfoo itself is taken from a global install or fetched on demand via `npx`
+- They test the integration between Python and Node.js
+- They should be kept focused on critical functionality
diff --git a/tests/smoke/__init__.py b/tests/smoke/__init__.py
new file mode 100644
index 0000000..a2573de
--- /dev/null
+++ b/tests/smoke/__init__.py
@@ -0,0 +1 @@
+"""Smoke tests for promptfoo CLI."""
diff --git a/tests/smoke/fixtures/configs/assertions.yaml b/tests/smoke/fixtures/configs/assertions.yaml
new file mode 100644
index 0000000..b03ee62
--- /dev/null
+++ b/tests/smoke/fixtures/configs/assertions.yaml
@@ -0,0 +1,22 @@
+# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
+description: 'Smoke test - multiple assertions'
+
+providers:
+  - echo
+
+prompts:
+  - 'Hello {{name}}, welcome to {{place}}'
+
+tests:
+  - vars:
+      name: Alice
+      place: Wonderland
+    assert:
+      - type: contains
+        value: Hello
+      - type: contains
+        value: Alice
+      - type: contains
+        value: Wonderland
+      - type: icontains
+        value: WELCOME
diff --git a/tests/smoke/fixtures/configs/basic.yaml b/tests/smoke/fixtures/configs/basic.yaml
new file mode 100644
index 0000000..936bb4a
--- /dev/null
+++ b/tests/smoke/fixtures/configs/basic.yaml
@@ -0,0 +1,17 @@
+# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
+description: 'Smoke test - basic config validation'
+
+providers:
+  - echo
+
+prompts:
+  - 'Hello {{name}}'
+
+tests:
+  - vars:
+      name: World
+    assert:
+      - type: contains
+        value: Hello
+      - type: contains
+        value: World
diff --git a/tests/smoke/fixtures/configs/failing-assertion.yaml b/tests/smoke/fixtures/configs/failing-assertion.yaml
new file mode 100644
index 0000000..ee8d327
--- /dev/null
+++ b/tests/smoke/fixtures/configs/failing-assertion.yaml
@@ -0,0 +1,17 @@
+# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json
+description: 'Smoke test - config with failing assertion'
+
+providers:
+  - echo
+
+prompts:
+  - 'Hello {{name}}'
+
+tests:
+  - vars:
+      name: World
+    assert:
+      # This assertion will fail because echo returns "Hello World"
+      # but we're asserting it contains "IMPOSSIBLE_STRING_NOT_IN_OUTPUT_12345"
+      - type: contains
+        value: IMPOSSIBLE_STRING_NOT_IN_OUTPUT_12345
diff --git a/tests/smoke/test_smoke.py b/tests/smoke/test_smoke.py
new file mode 100644
index 0000000..7f0fe10
--- /dev/null
+++ b/tests/smoke/test_smoke.py
@@ -0,0 +1,423 @@
+"""
+Smoke tests for the promptfoo CLI.
+
+These tests verify the core evaluation pipeline works correctly
+using the echo provider (no external API dependencies).
+
+These tests run against the installed promptfoo package via the Python wrapper
+(using either a globally installed promptfoo CLI or falling back to npx).
+"""
+
+import json
+import os
+import shutil
+import subprocess
+from collections.abc import Generator
+from pathlib import Path
+from typing import Optional
+
+import pytest
+
+# Mark all tests in this module as smoke tests
+pytestmark = pytest.mark.smoke
+
+# Directories
+SMOKE_DIR = Path(__file__).parent
+FIXTURES_DIR = SMOKE_DIR / "fixtures"
+CONFIGS_DIR = FIXTURES_DIR / "configs"
+OUTPUT_DIR = SMOKE_DIR / ".temp-output"
+
+
+def run_promptfoo(
+    args: list[str],
+    cwd: Optional[Path] = None,
+    expect_error: bool = False,
+    env: Optional[dict[str, str]] = None,
+    timeout: int = 120,
+) -> tuple[str, str, int]:
+    """
+    Run the `promptfoo` executable found on PATH and capture its output.
+
+    Args:
+        args: CLI arguments to pass to promptfoo
+        cwd: Working directory for the command (defaults to the current directory)
+        expect_error: If False, print the command and its output on a non-zero exit (the exit code is returned either way)
+        env: Extra variables layered over os.environ (NO_COLOR=1 is set first, so env can override it)
+        timeout: Timeout in seconds (default 120); subprocess.TimeoutExpired is not caught here
+
+    Returns:
+        Tuple of (stdout, stderr, exit_code)
+    """
+    cmd = ["promptfoo"] + args
+
+    full_env = os.environ.copy()
+    full_env["NO_COLOR"] = "1"  # Disable color output for easier parsing
+    if env:
+        full_env.update(env)
+
+    result = subprocess.run(
+        cmd,
+        cwd=cwd or Path.cwd(),
+        capture_output=True,
+        text=True,
+        env=full_env,
+        timeout=timeout,
+        # Use UTF-8 encoding with error replacement to handle Windows encoding issues
+        # Windows default cp1252 can't decode some bytes in npx/promptfoo output
+        encoding="utf-8",
+        errors="replace",
+    )
+
+    stdout = result.stdout or ""
+    stderr = result.stderr or ""
+    exit_code = result.returncode
+
+    if not expect_error and exit_code != 0:
+        # Unexpected failure: dump the command and both streams so pytest's captured output shows them
+        print(f"Command failed: {' '.join(cmd)}")
+        print(f"Exit code: {exit_code}")
+        print(f"STDOUT:\n{stdout}")
+        print(f"STDERR:\n{stderr}")
+
+    return stdout, stderr, exit_code
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup_and_teardown() -> Generator[None, None, None]:
+    """Give the module a fresh, empty output directory and remove it afterwards (best effort)."""
+    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)  # stale files from an aborted run must not satisfy exists() checks
+    OUTPUT_DIR.mkdir(exist_ok=True)
+    yield
+    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
+
+
+@pytest.fixture(scope="module", autouse=True)
+def warmup_npx() -> Generator[None, None, None]:
+    """
+    Warm up npx by running promptfoo --version once before the tests in this module.
+
+    On npx fallback (when promptfoo isn't globally installed), the first npx call
+    downloads and caches promptfoo, which can take several minutes on Windows.
+    Running this warmup prevents the first actual test from timing out.
+    """
+    # Best-effort call: the result (exit code and captured output) is deliberately ignored
+    try:
+        subprocess.run(
+            ["promptfoo", "--version"],
+            capture_output=True,
+            timeout=300,  # 5 minutes for initial npx download
+            encoding="utf-8",
+            errors="replace",
+        )
+    except subprocess.TimeoutExpired:
+        # If warmup times out, tests will likely fail but let them run anyway
+        pass
+    except FileNotFoundError:
+        # promptfoo not installed, tests will fail but let them try
+        pass
+    yield
+
+
+class TestBasicCLI:
+    """Basic CLI operations smoke tests."""
+
+    def test_version_flag(self) -> None:
+        """Test --version flag outputs version."""
+        stdout, stderr, exit_code = run_promptfoo(["--version"])
+
+        assert exit_code == 0
+        # Only non-empty output is required; the version format itself is not checked
+        assert stdout.strip(), "Version output should not be empty"
+
+    def test_help_flag(self) -> None:
+        """Test --help flag outputs help."""
+        stdout, stderr, exit_code = run_promptfoo(["--help"])
+
+        assert exit_code == 0
+        assert "promptfoo" in stdout.lower()
+        assert "eval" in stdout.lower()
+
+    def test_eval_help(self) -> None:
+        """Test 'eval --help' outputs eval command help."""
+        stdout, stderr, exit_code = run_promptfoo(["eval", "--help"])
+
+        assert exit_code == 0
+        assert "--config" in stdout or "-c" in stdout
+        assert "--output" in stdout or "-o" in stdout
+
+    def test_unknown_command(self) -> None:
+        """Test unknown command returns error."""
+        stdout, stderr, exit_code = run_promptfoo(
+            ["unknowncommand123"],
+            expect_error=True,
+        )
+
+        assert exit_code != 0
+        output = stdout + stderr
+        assert "unknown" in output.lower() or "not found" in output.lower()
+
+    def test_missing_config_file(self) -> None:
+        """Test missing config file returns error."""
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", "nonexistent-config-file.yaml"],
+            expect_error=True,
+        )
+
+        assert exit_code != 0
+        output = stdout + stderr
+        # Any one of these phrases (case-insensitive, in stdout or stderr) counts as "file not found"
+        assert any(
+            phrase in output.lower()
+            for phrase in [
+                "not found",
+                "no such file",
+                "does not exist",
+                "cannot find",
+                "no configuration file",
+            ]
+        )
+
+
+class TestEvalCommand:
+    """Eval command smoke tests."""
+
+    def test_basic_eval(self) -> None:
+        """Test basic eval with echo provider."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
+
+        assert exit_code == 0, f"Eval failed:\nSTDOUT: {stdout}\nSTDERR: {stderr}"
+        # Should show evaluation results
+        assert "pass" in stdout.lower() or "✓" in stdout or "success" in stdout.lower()
+
+    def test_json_output(self) -> None:
+        """Test eval outputs valid JSON."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "output.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0, f"Eval failed:\nSTDOUT: {stdout}\nSTDERR: {stderr}"
+        assert output_path.exists(), "Output file was not created"
+
+        # Verify it's valid JSON with expected structure (read as UTF-8, not the locale default)
+        with open(output_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        assert "results" in data
+        assert "results" in data["results"]
+        assert isinstance(data["results"]["results"], list)
+        assert len(data["results"]["results"]) > 0
+
+        # Verify echo provider returns the prompt
+        first_result = data["results"]["results"][0]
+        assert "response" in first_result
+        assert "output" in first_result["response"]
+        output_text = first_result["response"]["output"]
+        assert "Hello" in output_text
+        assert "World" in output_text
+
+    def test_yaml_output(self) -> None:
+        """Test eval outputs YAML format."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "output.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+        assert output_path.exists()
+
+        # Verify it contains YAML-like content
+        with open(output_path, encoding="utf-8") as f:
+            content = f.read()
+
+        assert "results:" in content
+
+    def test_csv_output(self) -> None:
+        """Test eval outputs CSV format."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "output.csv"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+        assert output_path.exists()
+
+        # Verify it's CSV format (has header row with columns)
+        with open(output_path, encoding="utf-8") as f:
+            content = f.read()
+
+        lines = content.strip().split("\n")
+        assert lines[0], "CSV output is empty"
+        # CSV should have comma-separated values
+        assert "," in lines[0]
+
+    def test_max_concurrency_flag(self) -> None:
+        """Test --max-concurrency flag."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--max-concurrency", "1", "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+    def test_repeat_flag(self) -> None:
+        """Test --repeat flag runs tests multiple times."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "repeat-output.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            [
+                "eval",
+                "-c",
+                str(config_path),
+                "--repeat",
+                "2",
+                "-o",
+                str(output_path),
+                "--no-cache",
+            ]
+        )
+
+        assert exit_code == 0
+
+        # Verify we got repeated results
+        with open(output_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        # With repeat=2 and 1 test case, we should have 2 results
+        assert len(data["results"]["results"]) == 2
+
+    def test_verbose_flag(self) -> None:
+        """Test --verbose flag."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--verbose", "--no-cache"])
+
+        assert exit_code == 0
+        # Verbose mode should produce output
+        assert len(stdout) > 0 or len(stderr) > 0
+
+
+class TestExitCodes:
+    """Exit code smoke tests (0 = success, 100 = failed assertions, 1 = config error)."""
+
+    def test_success_exit_code(self) -> None:
+        """Test exit code 0 when all assertions pass."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
+
+        assert exit_code == 0
+
+    def test_failure_exit_code(self) -> None:
+        """Test exit code 100 when assertions fail."""
+        config_path = CONFIGS_DIR / "failing-assertion.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"],
+            expect_error=True,
+        )
+
+        # Exit code 100 indicates test failures
+        assert exit_code == 100, f"Expected exit code 100, got {exit_code}"
+
+    def test_config_error_exit_code(self) -> None:
+        """Test exit code 1 for config errors."""
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", "nonexistent-file.yaml", "--no-cache"],
+            expect_error=True,
+        )
+
+        assert exit_code == 1
+
+
+class TestEchoProvider:
+    """Echo provider smoke tests."""
+
+    def test_echo_provider_basic(self) -> None:
+        """Test echo provider returns the prompt."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+        output_path = OUTPUT_DIR / "echo-test.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+        # Verify echo provider returns the prompt (read as UTF-8, not the locale default)
+        with open(output_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        first_result = data["results"]["results"][0]
+
+        # Echo provider should return the prompt in the response
+        output = first_result["response"]["output"]
+        assert "Hello" in output
+        assert "World" in output
+
+    def test_echo_provider_with_multiple_vars(self) -> None:
+        """Test echo provider with multiple variables."""
+        config_path = CONFIGS_DIR / "assertions.yaml"
+        output_path = OUTPUT_DIR / "echo-multi-var.json"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "-o", str(output_path), "--no-cache"]
+        )
+
+        assert exit_code == 0
+
+        with open(output_path, encoding="utf-8") as f:
+            data = json.load(f)
+
+        first_result = data["results"]["results"][0]
+        output = first_result["response"]["output"]
+
+        # Should contain all variable values
+        assert "Alice" in output
+        assert "Wonderland" in output
+
+
+class TestAssertions:
+    """Assertion smoke tests."""
+
+    def test_contains_assertion(self) -> None:
+        """Test contains assertion."""
+        config_path = CONFIGS_DIR / "basic.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
+
+        assert exit_code == 0
+        # A passing run is expected to report success somewhere in stdout
+        assert "pass" in stdout.lower() or "✓" in stdout or "success" in stdout.lower()
+
+    def test_multiple_assertions(self) -> None:
+        """Test multiple assertions in single test."""
+        config_path = CONFIGS_DIR / "assertions.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(["eval", "-c", str(config_path), "--no-cache"])
+
+        assert exit_code == 0
+
+    def test_failing_assertion(self) -> None:
+        """Test failing assertion."""
+        config_path = CONFIGS_DIR / "failing-assertion.yaml"
+
+        stdout, stderr, exit_code = run_promptfoo(
+            ["eval", "-c", str(config_path), "--no-cache"],
+            expect_error=True,
+        )
+
+        # Should fail with exit code 100
+        assert exit_code == 100
+        output = stdout + stderr
+        # The failure must be visible in the combined stdout/stderr text
+        assert "fail" in output.lower() or "✗" in output or "error" in output.lower()
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d5611b8..0e4a1c0 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -104,6 +104,7 @@ def test_strip_quotes(self, input_path: str, expected: str) -> None:
         """Quote stripping handles various quote patterns correctly."""
         assert _strip_quotes(input_path) == expected
 
+    @pytest.mark.skipif(sys.platform == "win32", reason="Unix-style PATH separator test")
     @pytest.mark.parametrize(
         "path_value,expected",
         [
@@ -115,8 +116,24 @@ def test_strip_quotes(self, input_path: str, expected: str) -> None:
             (":::", []),  # Only separators
         ],
     )
-    def test_split_path(self, path_value: str, expected: list[str]) -> None:
-        """PATH splitting handles quotes, empty entries, and whitespace."""
+    def test_split_path_unix(self, path_value: str, expected: list[str]) -> None:
+        """PATH splitting handles quotes, empty entries, and whitespace on Unix."""
+        assert _split_path(path_value) == expected
+
+    @pytest.mark.skipif(sys.platform != "win32", reason="Windows-style PATH separator test")
+    @pytest.mark.parametrize(
+        "path_value,expected",
+        [
+            ("C:\\bin;C:\\tools", ["C:\\bin", "C:\\tools"]),  # Plain entries
+            ('"C:\\bin";C:\\tools', ["C:\\bin", "C:\\tools"]),  # Quotes stripped
+            ("C:\\bin;;C:\\tools", ["C:\\bin", "C:\\tools"]),  # Empty entry removed
+            ("  C:\\bin  ;  C:\\tools  ", ["C:\\bin", "C:\\tools"]),  # Whitespace
+            ("", []),  # Empty PATH
+            (";;;", []),  # Only separators
+        ],
+    )
+    def test_split_path_windows(self, path_value: str, expected: list[str]) -> None:
+        """PATH splitting handles quotes, empty entries, and whitespace on Windows."""
         assert _split_path(path_value) == expected
 
 
@@ -221,8 +238,9 @@ def test_find_external_promptfoo_when_found(self, monkeypatch: pytest.MonkeyPatc
         result = _find_external_promptfoo()
         assert result == promptfoo_path
 
-    def test_find_external_promptfoo_prevents_recursion(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        """Filters out wrapper directory from PATH to prevent recursion."""
+    @pytest.mark.skipif(sys.platform == "win32", reason="Unix-specific recursion test")
+    def test_find_external_promptfoo_prevents_recursion_unix(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """Filters out wrapper directory from PATH to prevent recursion on Unix."""
         wrapper_path = "/home/user/.local/bin/promptfoo"
         real_promptfoo = "/usr/local/bin/promptfoo"
 
@@ -243,6 +261,30 @@ def mock_which(cmd: str, path: Optional[str] = None) -> Optional[str]:
         result = _find_external_promptfoo()
         assert result == real_promptfoo
 
+    @pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific recursion test")
+    def test_find_external_promptfoo_prevents_recursion_windows(self, monkeypatch: pytest.MonkeyPatch) -> None:
+        """Filters out wrapper directory from PATH to prevent recursion on Windows."""
+        wrapper_path = "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python312\\Scripts\\promptfoo.exe"
+        real_promptfoo = "C:\\npm\\prefix\\promptfoo.cmd"
+
+        monkeypatch.setattr(sys, "argv", [wrapper_path])
+        test_path = "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python312\\Scripts;C:\\npm\\prefix"
+        monkeypatch.setenv("PATH", test_path)
+
+        def mock_which(cmd: str, path: Optional[str] = None) -> Optional[str]:
+            if cmd != "promptfoo":
+                return None
+            if path is None:
+                return wrapper_path  # Lookup without an explicit PATH resolves to the wrapper itself
+            # When called with filtered PATH, return the real one
+            if "Python312\\Scripts" not in path:
+                return real_promptfoo
+            return None  # Explicit PATH that still contains the wrapper directory: nothing found
+
+        monkeypatch.setattr("shutil.which", mock_which)
+        result = _find_external_promptfoo()
+        assert result == real_promptfoo
+
 
 class TestShellRequirement:
     """Test Windows shell requirement detection for .bat/.cmd files."""
@@ -344,6 +386,8 @@ def test_main_uses_external_promptfoo_when_available(self, monkeypatch: pytest.M
             "shutil.which",
             lambda cmd, path=None: {"node": "/usr/bin/node", "promptfoo": "/usr/local/bin/promptfoo"}.get(cmd),
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -379,6 +423,8 @@ def test_main_skips_external_when_wrapper_env_set(self, monkeypatch: pytest.Monk
                 "promptfoo": "/usr/local/bin/promptfoo",
             }.get(cmd),
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -402,6 +448,8 @@ def test_main_falls_back_to_npx(self, monkeypatch: pytest.MonkeyPatch) -> None:
         monkeypatch.setattr(
             "shutil.which", lambda cmd, path=None: {"node": "/usr/bin/node", "npx": "/usr/bin/npx"}.get(cmd)
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -426,8 +474,14 @@ def test_main_exits_when_neither_external_nor_npx_available(
         self, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture
     ) -> None:
         """Exits with error when neither external promptfoo nor npx found."""
+        # Use platform-appropriate path for node
+        node_path = "C:\\Program Files\\nodejs\\node.exe" if sys.platform == "win32" else "/usr/bin/node"
+
         monkeypatch.setattr(sys, "argv", ["promptfoo", "eval"])
-        monkeypatch.setattr("shutil.which", lambda cmd, path=None: {"node": "/usr/bin/node"}.get(cmd))
+        monkeypatch.setattr("shutil.which", lambda cmd, path=None: {"node": node_path}.get(cmd))
+        # Also mock os.path.isfile to prevent _find_windows_promptfoo() from finding
+        # a real promptfoo installation on Windows CI runners
+        monkeypatch.setattr(os.path, "isfile", lambda p: False)
 
         with pytest.raises(SystemExit) as exc_info:
             main()
@@ -442,6 +496,8 @@ def test_main_passes_arguments_correctly(self, monkeypatch: pytest.MonkeyPatch)
         monkeypatch.setattr(
             "shutil.which", lambda cmd, path=None: {"node": "/usr/bin/node", "npx": "/usr/bin/npx"}.get(cmd)
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         mock_result = subprocess.CompletedProcess([], 0)
         mock_run = MagicMock(return_value=mock_result)
@@ -464,6 +520,8 @@ def test_main_returns_subprocess_exit_code(self, monkeypatch: pytest.MonkeyPatch
         monkeypatch.setattr(
             "shutil.which", lambda cmd, path=None: {"node": "/usr/bin/node", "npx": "/usr/bin/npx"}.get(cmd)
         )
+        # Mock telemetry to avoid PostHog calls during test
+        monkeypatch.setattr("promptfoo.cli.record_wrapper_used", lambda mode: None)
 
         # Test non-zero exit code
         mock_result = subprocess.CompletedProcess([], 42)