Skip to content

Commit c63ae59

Browse files
committed
Parse command middleware loading vs config
1 parent 39976c3 commit c63ae59

6 files changed

Lines changed: 374 additions & 89 deletions

File tree

src/parxy_cli/commands/parse.py

Lines changed: 38 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
"""Command line interface for Parxy document processing."""
22

33
import json
4-
import os
54
import tomllib
65
from datetime import timedelta
76
from pathlib import Path
8-
from typing import Optional, List, Annotated, Dict
7+
from typing import Optional, List, Annotated
98

109
import typer
1110

@@ -20,31 +19,21 @@
2019
console = Console()
2120

2221

23-
DEFAULT_MIDDLEWARE_PROFILES: Dict[str, List[str]] = {
24-
'default': [],
25-
}
26-
27-
28-
def _load_middleware_profiles(config_path: Optional[Path]) -> Dict[str, List[str]]:
29-
"""Load middleware profiles from optional config file.
22+
def _load_middleware_from_config(config_path: Path) -> List[str]:
23+
"""Load middleware class paths from a config file.
3024
3125
Supports JSON, TOML, YAML and YML. The expected structure is either:
32-
- ``{"middleware_profiles": {"profile": ["path.to.Middleware"]}}``
33-
- ``{"profiles": {"profile": ["path.to.Middleware"]}}``
26+
- A top-level list: ``["path.to.Middleware1", "path.to.Middleware2"]``
27+
- An object with a ``middleware`` key: ``{"middleware": ["path.to.Middleware"]}``
3428
"""
35-
profiles = dict(DEFAULT_MIDDLEWARE_PROFILES)
36-
if config_path is None:
37-
return profiles
38-
3929
if not config_path.exists():
4030
raise typer.BadParameter(f'Middleware config file not found: {config_path}')
4131

4232
suffix = config_path.suffix.lower()
43-
raw_data = None
4433

45-
if suffix in {'.json'}:
34+
if suffix == '.json':
4635
raw_data = json.loads(config_path.read_text(encoding='utf-8'))
47-
elif suffix in {'.toml'}:
36+
elif suffix == '.toml':
4837
raw_data = tomllib.loads(config_path.read_text(encoding='utf-8'))
4938
elif suffix in {'.yaml', '.yml'}:
5039
try:
@@ -59,60 +48,43 @@ def _load_middleware_profiles(config_path: Optional[Path]) -> Dict[str, List[str
5948
'Unsupported middleware config format. Use .json, .toml, .yaml or .yml'
6049
)
6150

62-
if not isinstance(raw_data, dict):
63-
raise typer.BadParameter(
64-
'Middleware config must be an object/map at top level.'
65-
)
66-
67-
custom_profiles = raw_data.get('middleware_profiles') or raw_data.get('profiles')
68-
if custom_profiles is None:
69-
return profiles
70-
71-
if not isinstance(custom_profiles, dict):
51+
if isinstance(raw_data, list):
52+
middleware_list = raw_data
53+
elif isinstance(raw_data, dict):
54+
middleware_list = raw_data.get('middleware', [])
55+
if not isinstance(middleware_list, list):
56+
raise typer.BadParameter(
57+
'middleware_config: "middleware" key must be a list of class paths.'
58+
)
59+
else:
7260
raise typer.BadParameter(
73-
'middleware_profiles must be a map of profile names to middleware paths.'
61+
'Middleware config must be a list or an object with a "middleware" key.'
7462
)
7563

76-
for name, middleware_list in custom_profiles.items():
77-
if not isinstance(name, str) or not isinstance(middleware_list, list):
78-
raise typer.BadParameter(
79-
'Each profile must map to a list of middleware class paths.'
80-
)
81-
if not all(isinstance(item, str) for item in middleware_list):
82-
raise typer.BadParameter('Middleware class paths must be strings.')
83-
profiles[name] = middleware_list
64+
if not all(isinstance(item, str) for item in middleware_list):
65+
raise typer.BadParameter('Middleware class paths must be strings.')
8466

85-
return profiles
67+
return middleware_list
8668

8769

88-
def configure_middleware_profile(
89-
profile: Optional[str],
70+
def configure_middleware(
71+
middleware: Optional[List[str]],
9072
config_path: Optional[Path],
9173
) -> None:
92-
"""Configure global middleware registry from a selected profile."""
93-
selected_profile = profile or os.getenv('PARXY_MIDDLEWARE_PROFILE')
94-
if not selected_profile:
95-
return
74+
"""Configure global middleware from inline class paths and/or a config file."""
75+
paths: List[str] = list(middleware or [])
9676

97-
env_config_path = os.getenv('PARXY_MIDDLEWARE_CONFIG')
98-
effective_config = config_path or (
99-
Path(env_config_path) if env_config_path else None
100-
)
101-
profiles = _load_middleware_profiles(effective_config)
77+
if config_path is not None:
78+
paths.extend(_load_middleware_from_config(config_path))
10279

103-
if selected_profile not in profiles:
104-
available = ', '.join(sorted(profiles.keys()))
105-
raise typer.BadParameter(
106-
f'Unknown middleware profile: {selected_profile}. Available profiles: {available}'
107-
)
80+
if not paths:
81+
return
10882

109-
middleware_paths = profiles[selected_profile]
11083
Parxy.clear_middleware()
111-
if middleware_paths:
112-
Parxy.with_middleware(middleware_paths)
84+
Parxy.with_middleware(paths)
11385

11486
console.info(
115-
f"Using middleware profile '{selected_profile}' ({len(middleware_paths)} middleware)."
87+
f'Using {len(paths)} middleware class{"es" if len(paths) != 1 else ""}.'
11688
)
11789

11890

@@ -360,20 +332,20 @@ def parse(
360332
min=1,
361333
),
362334
] = None,
363-
middleware_profile: Annotated[
364-
Optional[str],
335+
middleware: Annotated[
336+
Optional[List[str]],
365337
typer.Option(
366-
'--middleware-profile',
367-
envvar='PARXY_MIDDLEWARE_PROFILE',
368-
help='Middleware profile name to activate. Built-in: default. Additional profiles can be loaded via --middleware-config.',
338+
'--middleware',
339+
'-p',
340+
help='Middleware class path(s) to apply. Can be specified multiple times (e.g. --middleware my.pkg.MyMiddleware).',
369341
),
370342
] = None,
371343
middleware_config: Annotated[
372344
Optional[str],
373345
typer.Option(
374346
'--middleware-config',
375347
envvar='PARXY_MIDDLEWARE_CONFIG',
376-
help='Path to a .json/.toml/.yaml config file defining custom middleware profiles.',
348+
help='Path to a .json/.toml/.yaml file with a list of middleware class paths to apply. Appended after inline middleware with --middleware',
377349
),
378350
] = None,
379351
):
@@ -427,9 +399,8 @@ def parse(
427399
# Calculate total tasks
428400
total_tasks = len(files) * len(drivers)
429401

430-
431-
configure_middleware_profile(
432-
profile=middleware_profile,
402+
configure_middleware(
403+
middleware=middleware,
433404
config_path=Path(middleware_config) if middleware_config else None,
434405
)
435406

src/parxy_core/drivers/factory.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,11 @@ class DriverFactory:
4848
__custom_creators: Dict[str, Callable[[], Driver]] = {}
4949
"""The custom drivers"""
5050

51+
__config_middleware: List[Middleware] = []
52+
"""Middleware loaded from ParxyConfig — preserved across clear_middleware() calls."""
53+
5154
__middleware: List[Middleware] = []
52-
"""The global middleware registry"""
55+
"""Runtime middleware added programmatically via with_middleware()."""
5356

5457
_config: Optional[ParxyConfig] = None
5558

@@ -79,6 +82,7 @@ def reset(cls):
7982
Useful for testing and isolation between test cases.
8083
"""
8184
cls.__instance = None
85+
cls.__config_middleware = []
8286
cls.__middleware = []
8387
cls.__drivers = {}
8488
cls.__custom_creators = {}
@@ -108,18 +112,18 @@ def initialize(self, config: ParxyConfig) -> Self:
108112
return self
109113

110114
def _load_middleware_from_config(self) -> None:
111-
"""Load middleware from ParxyConfig.middleware.
115+
"""Load middleware from ParxyConfig.middleware into the config layer.
112116
113-
Middleware specified in config are automatically registered
114-
in the factory's global middleware registry.
117+
Config middleware is kept separate from runtime middleware so it
118+
survives clear_middleware() calls.
115119
"""
116120
if not self._config.middleware:
117121
return
118122

119123
for middleware_path in self._config.middleware:
120124
try:
121125
middleware = self._import_middleware(middleware_path)
122-
self.__middleware.append(middleware)
126+
self.__config_middleware.append(middleware)
123127
self._logger.info(f'Loaded middleware from config: {middleware_path}')
124128
except (ImportError, ValueError) as e:
125129
self._logger.warning(
@@ -342,7 +346,10 @@ def with_middleware(
342346
return self
343347

344348
def clear_middleware(self) -> 'DriverFactory':
345-
"""Clear all global middleware.
349+
"""Clear runtime middleware.
350+
351+
Middleware loaded from ``ParxyConfig.middleware`` is preserved.
352+
Only middleware added via :meth:`with_middleware` is removed.
346353
347354
Returns
348355
-------
@@ -354,14 +361,14 @@ def clear_middleware(self) -> 'DriverFactory':
354361
return self
355362

356363
def get_middleware(self) -> List[Middleware]:
357-
"""Get the list of global middleware.
364+
"""Get the combined middleware list (config layer + runtime layer).
358365
359366
Returns
360367
-------
361368
List[Middleware]
362-
Copy of the current global middleware list
369+
Copy of the current middleware list, config entries first.
363370
"""
364-
return list(self.__middleware)
371+
return list(self.__config_middleware) + list(self.__middleware)
365372

366373
def get_drivers(self) -> Dict[str, Driver]:
367374
"""Get all of the created "drivers".

src/parxy_core/facade/parxy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def with_middleware(cls, middleware: List[Union[str, Middleware]]) -> 'Parxy':
119119
Example
120120
-------
121121
>>> Parxy.with_middleware(
122-
... [LoggingMiddleware(), 'parxy_core.middleware.PIIScanner']
122+
... [MyCustomMiddleware(), 'parxy_core.middleware.SimpleMiddleware']
123123
... )
124124
>>> doc = Parxy.parse('document.pdf')
125125
"""

src/parxy_core/models/config.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
from typing import Literal, Optional, List
1+
from typing import Literal, Optional, List, Any
22

3+
import json
34
import logging
45

56
from pydantic_settings import BaseSettings, SettingsConfigDict
67

7-
from pydantic import Field, SecretStr, BaseModel
8+
from pydantic import Field, SecretStr, BaseModel, field_validator
89

910

1011
class BaseConfig(BaseSettings):
@@ -82,6 +83,16 @@ class ParxyConfig(BaseConfig):
8283
middleware: Optional[List[str]] = None
8384
"""List of middleware class paths to load automatically."""
8485

86+
@field_validator('middleware', mode='before')
87+
@classmethod
88+
def parse_middleware(cls, v: Any) -> Any:
89+
if not isinstance(v, str):
90+
return v
91+
stripped = v.strip()
92+
if stripped.startswith('['):
93+
return json.loads(stripped)
94+
return [item.strip() for item in stripped.split(',') if item.strip()]
95+
8596
model_config = SettingsConfigDict(
8697
env_prefix='parxy_',
8798
env_file='.env',

tests/commands/test_parse.py

Lines changed: 54 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -239,45 +239,89 @@ def test_parse_command_with_multiple_drivers(runner, mock_document, tmp_path):
239239
assert (output_dir / 'llamaparse-test.json').exists()
240240

241241

242-
def test_parse_command_with_default_middleware_profile(runner, mock_document, tmp_path):
243-
"""Test middleware profile selection with builtin default profile."""
242+
def test_parse_command_with_middleware(runner, mock_document, tmp_path):
243+
"""Test inline middleware class paths via --middleware."""
244244

245245
test_file = tmp_path / 'test.pdf'
246246
test_file.write_text('dummy pdf content')
247247

248248
with patch('parxy_cli.commands.parse.Parxy') as mock_parxy:
249-
mock_parxy.parse.return_value = mock_document
250249
mock_parxy.default_driver.return_value = 'pymupdf'
250+
mock_parxy.batch_iter.return_value = iter(
251+
[
252+
BatchResult(
253+
file=str(test_file),
254+
driver='pymupdf',
255+
document=mock_document,
256+
error=None,
257+
)
258+
]
259+
)
260+
261+
result = runner.invoke(
262+
app,
263+
[str(test_file), '--middleware', 'parxy_core.middleware.SimpleMiddleware'],
264+
)
265+
266+
assert result.exit_code == 0
267+
mock_parxy.clear_middleware.assert_called_once()
268+
mock_parxy.with_middleware.assert_called_once_with(
269+
['parxy_core.middleware.SimpleMiddleware']
270+
)
271+
272+
273+
def test_parse_command_with_middleware_config(runner, mock_document, tmp_path):
274+
"""Test middleware loading from a JSON config file via --middleware-config."""
275+
276+
test_file = tmp_path / 'test.pdf'
277+
test_file.write_text('dummy pdf content')
278+
279+
config_file = tmp_path / 'middleware.json'
280+
config_file.write_text('["parxy_core.middleware.SimpleMiddleware"]')
281+
282+
with patch('parxy_cli.commands.parse.Parxy') as mock_parxy:
283+
mock_parxy.default_driver.return_value = 'pymupdf'
284+
mock_parxy.batch_iter.return_value = iter(
285+
[
286+
BatchResult(
287+
file=str(test_file),
288+
driver='pymupdf',
289+
document=mock_document,
290+
error=None,
291+
)
292+
]
293+
)
251294

252295
result = runner.invoke(
253296
app,
254-
[str(test_file), '--middleware-profile', 'default'],
297+
[str(test_file), '--middleware-config', str(config_file)],
255298
)
256299

257300
assert result.exit_code == 0
258301
mock_parxy.clear_middleware.assert_called_once()
259-
mock_parxy.with_middleware.assert_not_called()
302+
mock_parxy.with_middleware.assert_called_once_with(
303+
['parxy_core.middleware.SimpleMiddleware']
304+
)
260305

261306

262-
def test_parse_command_with_unknown_middleware_profile_fails(
307+
def test_parse_command_with_middleware_config_missing_file_fails(
263308
runner, mock_document, tmp_path
264309
):
265-
"""Test unknown middleware profile returns a CLI validation error."""
310+
"""Test that a missing middleware config file returns a CLI error."""
266311

267312
test_file = tmp_path / 'test.pdf'
268313
test_file.write_text('dummy pdf content')
269314

270315
with patch('parxy_cli.commands.parse.Parxy') as mock_parxy:
271-
mock_parxy.parse.return_value = mock_document
272316
mock_parxy.default_driver.return_value = 'pymupdf'
273317

274318
result = runner.invoke(
275319
app,
276-
[str(test_file), '--middleware-profile', 'does_not_exist'],
320+
[str(test_file), '--middleware-config', str(tmp_path / 'nonexistent.json')],
277321
)
278322

279323
assert result.exit_code != 0
280-
mock_parxy.parse.assert_not_called()
324+
mock_parxy.batch_iter.assert_not_called()
281325

282326

283327
def test_collect_files_non_recursive(tmp_path):

0 commit comments

Comments
 (0)