From 796237ffd59a131fff21a5f9871e7663e259931e Mon Sep 17 00:00:00 2001 From: ablaszkiewicz Date: Fri, 19 Jun 2026 21:39:02 +0200 Subject: [PATCH 1/5] feat: mask sensitive data inside objects and URLs in code variables Code variable masking previously only inspected dicts/lists/tuples/strings, and fell back to a raw repr() on serialization failure. As a result, secrets held as attributes of custom objects (e.g. a PostgresSourceConfig with a `password` field) were emitted verbatim via the unmasked repr() path. This hardens masking to be fail-closed: - Traverse custom objects (dataclasses / objects with a populated __dict__) so sensitive fields are redacted by their real attribute name. This is both safer (a custom __repr__ can't relabel a field out of the mask) and higher-fidelity (only the sensitive field is redacted, surrounding context is kept). - Replace the leaky repr() fallback with a fail-closed _safe_repr() that redacts the whole value when any masking rule matches, redacts when the repr is too long to scan, and emits a type-name placeholder when __repr__ raises. json.dumps gets a default= net so no raw object can slip through. - Scrub credentials embedded in URLs/DSNs (postgresql://user:pass@host) from string values regardless of the surrounding key name. Add `connection_string` to the default mask patterns. Adds a `code_variables_mask_url_credentials` config option (default True), wired through the constructor, module-level global, and per-context override, mirroring code_variables_mask_patterns. Co-Authored-By: Claude Opus 4.8 --- .changeset/brave-otters-mask.md | 5 + posthog/__init__.py | 14 ++ posthog/client.py | 17 ++ posthog/contexts.py | 28 +++ posthog/exception_utils.py | 219 +++++++++++++--- posthog/test/test_exception_capture.py | 336 +++++++++++++++++++++++++ 6 files changed, 582 insertions(+), 37 deletions(-) create mode 100644 .changeset/brave-otters-mask.md diff --git a/.changeset/brave-otters-mask.md b/.changeset/brave-otters-mask.md new file mode 100644 index 00000000..35482955 --- /dev/null +++ b/.changeset/brave-otters-mask.md @@ -0,0 +1,5 @@ +--- +'pypi/posthog': minor +--- + +Mask sensitive data held inside objects and in URL/DSN credentials when capturing exception code variables. Custom objects are now traversed so fields like `password` are redacted by attribute name instead of leaking via `repr()`, and credentials embedded in connection strings are scrubbed. Adds the `code_variables_mask_url_credentials` option (default `True`). diff --git a/posthog/__init__.py b/posthog/__init__.py index 4f719c77..dc191fec 100644 --- a/posthog/__init__.py +++ b/posthog/__init__.py @@ -24,6 +24,9 @@ from posthog.contexts import ( set_code_variables_mask_patterns_context as inner_set_code_variables_mask_patterns_context, ) +from posthog.contexts import ( + set_code_variables_mask_url_credentials_context as inner_set_code_variables_mask_url_credentials_context, +) from posthog.contexts import ( set_context_device_id as inner_set_context_device_id, ) @@ -39,6 +42,7 @@ from posthog.exception_utils import ( DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS, DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS, ) from posthog.feature_flag_evaluations import ( FeatureFlagEvaluations as FeatureFlagEvaluations, @@ -226,6 +230,14 @@ def set_code_variables_ignore_patterns_context(ignore_patterns: list): return inner_set_code_variables_ignore_patterns_context(ignore_patterns) +def set_code_variables_mask_url_credentials_context(enabled: bool): + """ + Whether to scrub credentials embedded in URLs/DSNs (e.g. user:pass@host) from + captured code variables for the current context. + """ + return inner_set_code_variables_mask_url_credentials_context(enabled) + + def tag(name: str, value: Any): """ Add a tag to the current context. @@ -346,6 +358,7 @@ def get_tags() -> Dict[str, Any]: capture_exception_code_variables = False code_variables_mask_patterns = DEFAULT_CODE_VARIABLES_MASK_PATTERNS code_variables_ignore_patterns = DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS +code_variables_mask_url_credentials = DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS in_app_modules = None # type: Optional[list[str]] enable_exception_autocapture_rate_limiting = False # type: bool exception_autocapture_bucket_size = ExceptionCapture.DEFAULT_BUCKET_SIZE # type: int @@ -1124,6 +1137,7 @@ def setup() -> Client: capture_exception_code_variables=capture_exception_code_variables, code_variables_mask_patterns=code_variables_mask_patterns, code_variables_ignore_patterns=code_variables_ignore_patterns, + code_variables_mask_url_credentials=code_variables_mask_url_credentials, in_app_modules=in_app_modules, enable_exception_autocapture_rate_limiting=enable_exception_autocapture_rate_limiting, exception_autocapture_bucket_size=exception_autocapture_bucket_size, diff --git a/posthog/client.py b/posthog/client.py index 379200d0..53aaf328 100644 --- a/posthog/client.py +++ b/posthog/client.py @@ -21,6 +21,7 @@ get_capture_exception_code_variables_context, get_code_variables_ignore_patterns_context, get_code_variables_mask_patterns_context, + get_code_variables_mask_url_credentials_context, get_context_device_id, get_context_distinct_id, get_context_session_id, @@ -37,6 +38,7 @@ from posthog.exception_utils import ( DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS, DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS, exc_info_from_error, exception_is_already_captured, exceptions_from_error_tuple, @@ -239,6 +241,7 @@ def __init__( capture_exception_code_variables=False, code_variables_mask_patterns=None, code_variables_ignore_patterns=None, + code_variables_mask_url_credentials=None, in_app_modules: list[str] | None = None, enable_exception_autocapture_rate_limiting=False, exception_autocapture_bucket_size=ExceptionCapture.DEFAULT_BUCKET_SIZE, @@ -396,6 +399,11 @@ def __init__( if code_variables_ignore_patterns is not None else DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS ) + self.code_variables_mask_url_credentials = ( + code_variables_mask_url_credentials + if code_variables_mask_url_credentials is not None + else DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS + ) self.in_app_modules = in_app_modules if project_root is None: @@ -1327,6 +1335,9 @@ def capture_exception( context_enabled = get_capture_exception_code_variables_context() context_mask = get_code_variables_mask_patterns_context() context_ignore = get_code_variables_ignore_patterns_context() + context_mask_url_credentials = ( + get_code_variables_mask_url_credentials_context() + ) enabled = ( context_enabled @@ -1343,6 +1354,11 @@ def capture_exception( if context_ignore is not None else self.code_variables_ignore_patterns ) + mask_url_credentials = ( + context_mask_url_credentials + if context_mask_url_credentials is not None + else self.code_variables_mask_url_credentials + ) if enabled: try_attach_code_variables_to_frames( @@ -1350,6 +1366,7 @@ def capture_exception( exc_info, mask_patterns=mask_patterns, ignore_patterns=ignore_patterns, + mask_url_credentials=mask_url_credentials, ) if self.log_captured_exceptions: diff --git a/posthog/contexts.py b/posthog/contexts.py index 5a31f80a..79323ede 100644 --- a/posthog/contexts.py +++ b/posthog/contexts.py @@ -26,6 +26,7 @@ def __init__( self.capture_exception_code_variables: Optional[bool] = None self.code_variables_mask_patterns: Optional[list] = None self.code_variables_ignore_patterns: Optional[list] = None + self.code_variables_mask_url_credentials: Optional[bool] = None def set_session_id(self, session_id: str): self.session_id = session_id @@ -48,6 +49,9 @@ def set_code_variables_mask_patterns(self, mask_patterns: list): def set_code_variables_ignore_patterns(self, ignore_patterns: list): self.code_variables_ignore_patterns = ignore_patterns + def set_code_variables_mask_url_credentials(self, enabled: bool): + self.code_variables_mask_url_credentials = enabled + def get_parent(self): return self.parent @@ -102,6 +106,13 @@ def get_code_variables_ignore_patterns(self) -> Optional[list]: return self.parent.get_code_variables_ignore_patterns() return None + def get_code_variables_mask_url_credentials(self) -> Optional[bool]: + if self.code_variables_mask_url_credentials is not None: + return self.code_variables_mask_url_credentials + if self.parent is not None and not self.fresh: + return self.parent.get_code_variables_mask_url_credentials() + return None + _context_stack: contextvars.ContextVar[Optional[ContextScope]] = contextvars.ContextVar( "posthog_context_stack", default=None @@ -369,6 +380,16 @@ def set_code_variables_ignore_patterns_context(ignore_patterns: list) -> None: current_context.set_code_variables_ignore_patterns(ignore_patterns) +def set_code_variables_mask_url_credentials_context(enabled: bool) -> None: + """ + Whether to scrub credentials embedded in URLs/DSNs (e.g. user:pass@host) from + captured code variables for the current context. + """ + current_context = _get_current_context() + if current_context: + current_context.set_code_variables_mask_url_credentials(enabled) + + def get_capture_exception_code_variables_context() -> Optional[bool]: current_context = _get_current_context() if current_context: @@ -390,6 +411,13 @@ def get_code_variables_ignore_patterns_context() -> Optional[list]: return None +def get_code_variables_mask_url_credentials_context() -> Optional[bool]: + current_context = _get_current_context() + if current_context: + return current_context.get_code_variables_mask_url_credentials() + return None + + F = TypeVar("F", bound=Callable[..., Any]) diff --git a/posthog/exception_utils.py b/posthog/exception_utils.py index 0f24818c..39e90ac0 100644 --- a/posthog/exception_utils.py +++ b/posthog/exception_utils.py @@ -5,6 +5,7 @@ # 💖open source (under MIT License) # We want to keep payloads as similar to Sentry as possible for easy interoperability +import dataclasses import json import linecache import os @@ -58,10 +59,16 @@ r"(?i)_pass", r"(?i)sk_", r"(?i)jwt", + r"(?i)connection_string", + r"(?i)connectionstring", ] DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS = [r"^__.*"] +# Whether to scrub credentials embedded in URLs/DSNs (e.g. `postgres://user:pass@host`) +# from string values, regardless of the surrounding key/attribute name. +DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS = True + CODE_VARIABLES_REDACTED_VALUE = "$$_posthog_redacted_based_on_masking_rules_$$" CODE_VARIABLES_TOO_LONG_VALUE = "$$_posthog_value_too_long_$$" @@ -69,6 +76,26 @@ _MAX_COLLECTION_ITEMS_TO_SCAN = 100 _REGEX_METACHARACTERS = frozenset(r"\.^$*+?{}[]|()") +# Bounds how deep we recurse into nested structures/objects while masking. Cycles are +# already guarded separately; this caps work for very deep (but acyclic) graphs. +_MAX_MASK_DEPTH = 25 + +# Catches credentials embedded in URLs/DSNs (e.g. `postgresql://user:pass@host`) so they +# are scrubbed even when the surrounding key/attribute name isn't recognised as sensitive. +# Only the `user:pass` userinfo is removed; the rest of the connection string is kept. +_URL_CREDENTIALS_RE = re.compile( + r"([a-z][a-z0-9+.\-]*://)[^/@\s]*:[^/@\s]+@", re.IGNORECASE +) + + +def _redact_url_credentials(value): + if "://" not in value: + return value + return _URL_CREDENTIALS_RE.sub( + r"\g<1>" + CODE_VARIABLES_REDACTED_VALUE + "@", value + ) + + DEFAULT_TOTAL_VARIABLES_SIZE_LIMIT = 20 * 1024 @@ -979,49 +1006,145 @@ def _pattern_matches(name, patterns): return False -def _mask_sensitive_data(value, compiled_mask, _seen=None): +def _safe_type_name(value): + try: + return type(value).__qualname__ + except Exception: + try: + return type(value).__name__ + except Exception: + return "unknown" + + +def _safe_repr(value, compiled_mask, mask_url_credentials=True): + """Last-resort serialization for values we can't structurally decompose. + + Produces ``repr(value)`` but fails *closed*: if we detect any masking-rule match + inside the representation, or the representation is too long to fully scan, the + entire value is redacted rather than emitted. ``repr`` raising is also caught so a + broken ``__repr__`` can neither crash capture nor leak whatever it half-rendered. + """ + try: + rendered = repr(value) + except Exception: + return "<" + _safe_type_name(value) + ">" + + # Too long to scan within our perf budget -> we can't vouch for it, so redact it all. + if len(rendered) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + return CODE_VARIABLES_REDACTED_VALUE + if compiled_mask and _pattern_matches(rendered, compiled_mask): + return CODE_VARIABLES_REDACTED_VALUE + if mask_url_credentials: + return _redact_url_credentials(rendered) + return rendered + + +def _extract_object_attrs(value): + """Return a ``name -> value`` mapping of an object's attributes, or ``None``. + + Used so attribute-name masking can run on the real field names of dataclasses / + plain objects (which a custom ``__repr__`` could otherwise rename or hide). Returns + ``None`` for values that should be treated as opaque leaves and handled by + ``_safe_repr`` (built-ins, slotted objects, objects with an empty ``__dict__``, etc.). + """ + if isinstance(value, type): + # A class/type object itself, not an instance - nothing useful to traverse. + return None + try: + if dataclasses.is_dataclass(value): + return {f.name: getattr(value, f.name) for f in dataclasses.fields(value)} + except Exception: + return None + try: + instance_dict = getattr(value, "__dict__", None) + except Exception: + return None + if isinstance(instance_dict, dict) and instance_dict: + # Copy so we never mutate the live object; keys here are attribute names. + return dict(instance_dict) + return None + + +def _mask_mapping(items, compiled_mask, _seen, _depth, mask_url_credentials=True): + result = {} + for k, v in items: + key_str = k if isinstance(k, str) else str(k) + if len(key_str) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + result[k] = CODE_VARIABLES_TOO_LONG_VALUE + elif _pattern_matches(key_str, compiled_mask): + result[k] = CODE_VARIABLES_REDACTED_VALUE + else: + result[k] = _mask_sensitive_data( + v, compiled_mask, _seen, _depth + 1, mask_url_credentials + ) + return result + + +def _mask_sensitive_data( + value, compiled_mask, _seen=None, _depth=0, mask_url_credentials=True +): if not compiled_mask: return value - if isinstance(value, (dict, list, tuple)): - if _seen is None: - _seen = set() - obj_id = id(value) - if obj_id in _seen: - return "" - _seen.add(obj_id) + if isinstance(value, str): + if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + return CODE_VARIABLES_TOO_LONG_VALUE + if _pattern_matches(value, compiled_mask): + return CODE_VARIABLES_REDACTED_VALUE + if mask_url_credentials: + return _redact_url_credentials(value) + return value + + if value is None or isinstance(value, (bool, int, float)): + return value + + if _depth >= _MAX_MASK_DEPTH: + return _safe_repr(value, compiled_mask, mask_url_credentials) + + if _seen is None: + _seen = set() + obj_id = id(value) + if obj_id in _seen: + return "" + _seen.add(obj_id) if isinstance(value, dict): if len(value) > _MAX_COLLECTION_ITEMS_TO_SCAN: return CODE_VARIABLES_TOO_LONG_VALUE - result = {} - for k, v in value.items(): - key_str = str(k) if not isinstance(k, str) else k - if len(key_str) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: - result[k] = CODE_VARIABLES_TOO_LONG_VALUE - elif _pattern_matches(key_str, compiled_mask): - result[k] = CODE_VARIABLES_REDACTED_VALUE - else: - result[k] = _mask_sensitive_data(v, compiled_mask, _seen) - return result - elif isinstance(value, (list, tuple)): + return _mask_mapping( + value.items(), compiled_mask, _seen, _depth, mask_url_credentials + ) + + if isinstance(value, (list, tuple)): if len(value) > _MAX_COLLECTION_ITEMS_TO_SCAN: return CODE_VARIABLES_TOO_LONG_VALUE masked_items = [ - _mask_sensitive_data(item, compiled_mask, _seen) for item in value + _mask_sensitive_data( + item, compiled_mask, _seen, _depth + 1, mask_url_credentials + ) + for item in value ] return type(value)(masked_items) - elif isinstance(value, str): - if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: - return CODE_VARIABLES_TOO_LONG_VALUE - if _pattern_matches(value, compiled_mask): - return CODE_VARIABLES_REDACTED_VALUE - return value - else: - return value + + # Custom objects: traverse their real attributes so a field named e.g. `password` + # is caught by name. This is both safer (a custom __repr__ can't relabel a field + # out of the mask) and higher-fidelity (only the sensitive field is redacted) than + # repr-scanning the whole object. + attrs = _extract_object_attrs(value) + if attrs is not None: + masked = _mask_mapping( + attrs.items(), compiled_mask, _seen, _depth, mask_url_credentials + ) + masked["__class__"] = _safe_type_name(value) + return masked + + # Opaque leaf (built-in/slotted/etc.): fall back to a fail-closed repr. + return _safe_repr(value, compiled_mask, mask_url_credentials) -def _serialize_variable_value(value, limiter, max_length=1024, compiled_mask=None): +def _serialize_variable_value( + value, limiter, max_length=1024, compiled_mask=None, mask_url_credentials=True +): try: if value is None: result = "None" @@ -1038,11 +1161,21 @@ def _serialize_variable_value(value, limiter, max_length=1024, compiled_mask=Non result = CODE_VARIABLES_TOO_LONG_VALUE elif compiled_mask and _pattern_matches(value, compiled_mask): result = CODE_VARIABLES_REDACTED_VALUE + elif compiled_mask and mask_url_credentials: + result = _redact_url_credentials(value) else: result = value else: - masked_value = _mask_sensitive_data(value, compiled_mask) - result = json.dumps(masked_value) + masked_value = _mask_sensitive_data( + value, compiled_mask, mask_url_credentials=mask_url_credentials + ) + # `default` is a safety net: anything _mask_sensitive_data left + # non-serializable is rendered through the fail-closed repr too, so a + # raw object can never reach the output untouched. + result = json.dumps( + masked_value, + default=lambda o: _safe_repr(o, compiled_mask, mask_url_credentials), + ) if len(result) > max_length: result = result[: max_length - 3] + "..." @@ -1055,7 +1188,9 @@ def _serialize_variable_value(value, limiter, max_length=1024, compiled_mask=Non return result except Exception: try: - result = repr(value) + # Fail closed: never emit a raw repr here (e.g. json.dumps choking on + # non-string dict keys) - run it through the same masking-aware repr. + result = _safe_repr(value, compiled_mask, mask_url_credentials) if len(result) > max_length: result = result[: max_length - 3] + "..." @@ -1086,7 +1221,12 @@ def _is_simple_type(value): def serialize_code_variables( - frame, limiter, mask_patterns=None, ignore_patterns=None, max_length=1024 + frame, + limiter, + mask_patterns=None, + ignore_patterns=None, + max_length=1024, + mask_url_credentials=True, ): if mask_patterns is None: mask_patterns = [] @@ -1130,7 +1270,7 @@ def serialize_code_variables( result[name] = redacted_value else: serialized = _serialize_variable_value( - value, limiter, max_length, compiled_mask + value, limiter, max_length, compiled_mask, mask_url_credentials ) if serialized is None: break @@ -1140,18 +1280,22 @@ def serialize_code_variables( def try_attach_code_variables_to_frames( - all_exceptions, exc_info, mask_patterns, ignore_patterns + all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True ): try: attach_code_variables_to_frames( - all_exceptions, exc_info, mask_patterns, ignore_patterns + all_exceptions, + exc_info, + mask_patterns, + ignore_patterns, + mask_url_credentials, ) except Exception: pass def attach_code_variables_to_frames( - all_exceptions, exc_info, mask_patterns, ignore_patterns + all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True ): exc_type, exc_value, traceback = exc_info @@ -1182,6 +1326,7 @@ def attach_code_variables_to_frames( mask_patterns=mask_patterns, ignore_patterns=ignore_patterns, max_length=1024, + mask_url_credentials=mask_url_credentials, ) if variables: diff --git a/posthog/test/test_exception_capture.py b/posthog/test/test_exception_capture.py index 155a144c..c53591f0 100644 --- a/posthog/test/test_exception_capture.py +++ b/posthog/test/test_exception_capture.py @@ -809,3 +809,339 @@ def test_mask_sensitive_data_large_tuple_replaced(): result = _mask_sensitive_data(large_tuple, compiled_mask) assert result == CODE_VARIABLES_TOO_LONG_VALUE + + +def test_mask_sensitive_data_traverses_object_attributes(): + from dataclasses import dataclass + + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _compile_patterns, + _mask_sensitive_data, + ) + + @dataclass + class PostgresSourceConfig: + host: str + user: str + password: str + connection_string: str + + config = PostgresSourceConfig( + host="db.example.com", + user="warehouse", + password="uHjH9WJuEV0VT2NKoP7zpQ", + connection_string="postgresql://warehouse:uHjH9WJuEV0VT2NKoP7zpQ@db.example.com:26257/db", + ) + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + result = _mask_sensitive_data(config, compiled_mask) + + # Object is decomposed into its real attributes, tagged with its class. + assert "PostgresSourceConfig" in result["__class__"] + # Sensitive fields are redacted by attribute name... + assert result["password"] == CODE_VARIABLES_REDACTED_VALUE + assert result["connection_string"] == CODE_VARIABLES_REDACTED_VALUE + # ...while non-sensitive fields are preserved for debugging. + assert result["host"] == "db.example.com" + assert result["user"] == "warehouse" + # The literal secret survives nowhere in the output. + assert "uHjH9WJuEV0VT2NKoP7zpQ" not in str(result) + + +def test_mask_sensitive_data_traverses_object_nested_in_tuple(): + # Mirrors the real-world leak: a config object carrying a password, sitting + # positionally inside an `args` tuple alongside other useful context. + from dataclasses import dataclass + + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _compile_patterns, + _mask_sensitive_data, + ) + + @dataclass + class PostgresSourceConfig: + host: str + password: str + + @dataclass + class SourceInputs: + schema_name: str + team_id: int + + args = ( + PostgresSourceConfig(host="db.example.com", password="topsecret123"), + SourceInputs(schema_name="traffic_stats", team_id=101290), + ) + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + result = _mask_sensitive_data(args, compiled_mask) + + assert isinstance(result, tuple) + config, inputs = result + assert config["host"] == "db.example.com" + assert config["password"] == CODE_VARIABLES_REDACTED_VALUE + # The surrounding object is NOT nuked - only the sensitive field is. This is the + # key advantage of structural traversal over repr-then-redact-everything. + assert inputs["schema_name"] == "traffic_stats" + assert inputs["team_id"] == 101290 + assert "topsecret123" not in str(result) + + +def test_mask_sensitive_data_traverses_plain_object(): + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _compile_patterns, + _mask_sensitive_data, + ) + + class Credentials: + def __init__(self): + self.username = "alice" + self.api_key = "sk_live_abc123" + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + result = _mask_sensitive_data(Credentials(), compiled_mask) + + assert result["username"] == "alice" + assert result["api_key"] == CODE_VARIABLES_REDACTED_VALUE + assert "sk_live_abc123" not in str(result) + + +def test_safe_repr_redacts_when_secret_detected(): + # Opaque (slotted, no __dict__) objects can't be traversed, so they fall back to + # repr-then-redact-the-whole-thing if any masking rule matches the representation. + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _compile_patterns, + _mask_sensitive_data, + _safe_repr, + ) + + class OpaqueCreds: + __slots__ = ("password",) + + def __init__(self, password): + self.password = password + + def __repr__(self): + return "OpaqueCreds(password={})".format(self.password) + + class OpaqueSafe: + __slots__ = ("x", "y") + + def __init__(self): + self.x, self.y = 1, 2 + + def __repr__(self): + return "OpaqueSafe(x=1, y=2)" + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + + # repr mentions `password` -> the entire value is redacted. + assert ( + _safe_repr(OpaqueCreds("s3cr3t"), compiled_mask) + == CODE_VARIABLES_REDACTED_VALUE + ) + assert ( + _mask_sensitive_data(OpaqueCreds("s3cr3t"), compiled_mask) + == CODE_VARIABLES_REDACTED_VALUE + ) + # No secret indicator -> the repr is preserved (no over-redaction). + assert _safe_repr(OpaqueSafe(), compiled_mask) == "OpaqueSafe(x=1, y=2)" + + +def test_safe_repr_handles_broken_repr(): + from posthog.exception_utils import ( + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _compile_patterns, + _safe_repr, + ) + + class Boom: + __slots__ = ("secret",) + + def __init__(self): + self.secret = "leak123" + + def __repr__(self): + raise RuntimeError("boom") + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + result = _safe_repr(Boom(), compiled_mask) + + # A broken __repr__ must neither crash nor leak - we emit a type-name placeholder. + assert "leak123" not in result + assert "Boom" in result + assert result.startswith("<") and result.endswith(">") + + +def test_redact_url_credentials(): + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + _redact_url_credentials, + ) + + # Embedded credentials are stripped, the rest of the connection string is kept. + assert _redact_url_credentials( + "postgresql://warehouse:uHjH9WJuEV0VT2NKoP7zpQ@db.example.com:26257/db" + ) == ("postgresql://" + CODE_VARIABLES_REDACTED_VALUE + "@db.example.com:26257/db") + # Password-only userinfo is also caught. + assert "p4ss" not in _redact_url_credentials("redis://:p4ss@cache:6379") + # A URL with a port but no credentials is left untouched. + assert ( + _redact_url_credentials("https://api.example.com:8080/v1") + == "https://api.example.com:8080/v1" + ) + # Non-URL strings pass through unchanged. + assert _redact_url_credentials("just a string") == "just a string" + + +def test_code_variables_masks_object_attributes_end_to_end(tmpdir): + app = tmpdir.join("app.py") + app.write( + dedent( + """ + import os + from dataclasses import dataclass + from posthog import Posthog + + @dataclass + class PostgresSourceConfig: + host: str + user: str + password: str + connection_string: str + + @dataclass + class SourceInputs: + schema_name: str + team_id: int + + posthog = Posthog( + 'phc_x', + host='https://eu.i.posthog.com', + debug=True, + enable_exception_autocapture=True, + capture_exception_code_variables=True, + project_root=os.path.dirname(os.path.abspath(__file__)) + ) + + def trigger_error(): + # Read at runtime so the secret is never a source literal (source lines are + # captured too) - this mirrors a credential loaded from config/env in prod. + db_password = os.environ["TEST_DB_PASSWORD"] + args = ( + PostgresSourceConfig( + host="db.example.com", + user="warehouse", + password=db_password, + connection_string=( + "postgresql://warehouse:" + db_password + "@db.example.com:26257/db" + ), + ), + SourceInputs(schema_name="traffic_stats", team_id=101290), + ) + + 1/0 + + trigger_error() + """ + ) + ) + + import os as _os + + with pytest.raises(subprocess.CalledProcessError) as excinfo: + subprocess.check_output( + [sys.executable, str(app)], + stderr=subprocess.STDOUT, + env={**_os.environ, "TEST_DB_PASSWORD": "uHjH9WJuEV0VT2NKoP7zpQ"}, + ) + + output = excinfo.value.output.decode("utf-8") + + assert "ZeroDivisionError" in output + assert "code_variables" in output + + # The secret must never appear, in any form. + assert "uHjH9WJuEV0VT2NKoP7zpQ" not in output + assert "$$_posthog_redacted_based_on_masking_rules_$$" in output + + # Surrounding, non-sensitive context is preserved. + assert "PostgresSourceConfig" in output + assert "db.example.com" in output + assert "traffic_stats" in output + + +def test_mask_url_credentials_can_be_toggled(): + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _compile_patterns, + _mask_sensitive_data, + ) + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + # `db_uri` is a neutral name, and the value contains no masked keyword - so only + # the URL-credentials heuristic can catch the embedded password. + data = {"db_uri": "postgresql://user:p4ss@host:5432/db"} + + enabled = _mask_sensitive_data(data, compiled_mask) + assert "p4ss" not in enabled["db_uri"] + assert CODE_VARIABLES_REDACTED_VALUE in enabled["db_uri"] + assert enabled["db_uri"].endswith("@host:5432/db") + + disabled = _mask_sensitive_data(data, compiled_mask, mask_url_credentials=False) + assert disabled["db_uri"] == "postgresql://user:p4ss@host:5432/db" + + +def test_code_variables_mask_url_credentials_disabled_end_to_end(tmpdir): + import os + + app = tmpdir.join("app.py") + app.write( + dedent( + """ + import os + from posthog import Posthog + + posthog = Posthog( + 'phc_x', + host='https://eu.i.posthog.com', + debug=True, + enable_exception_autocapture=True, + capture_exception_code_variables=True, + code_variables_mask_url_credentials=False, + project_root=os.path.dirname(os.path.abspath(__file__)) + ) + + def trigger_error(): + # Neutral variable name + no masked keyword, so only the URL heuristic could + # scrub it - and it is disabled here. Secret sourced at runtime, not a literal. + db_uri = "postgresql://user:" + os.environ["TEST_DB_PASSWORD"] + "@host:5432/db" + + 1/0 + + trigger_error() + """ + ) + ) + + with pytest.raises(subprocess.CalledProcessError) as excinfo: + subprocess.check_output( + [sys.executable, str(app)], + stderr=subprocess.STDOUT, + env={**os.environ, "TEST_DB_PASSWORD": "p4ssRUNTIME"}, + ) + + output = excinfo.value.output.decode("utf-8") + + assert "code_variables" in output + # URL masking explicitly disabled -> credentials are retained. + assert "p4ssRUNTIME" in output From a3c0eeeadff5663289ab312bcc825643170006ce Mon Sep 17 00:00:00 2001 From: ablaszkiewicz Date: Fri, 19 Jun 2026 22:47:10 +0200 Subject: [PATCH 2/5] feat: adjust --- posthog/client.py | 3 + posthog/exception_utils.py | 18 ++- posthog/test/test_exception_capture.py | 169 +++++++++++++++++++++++++ references/public_api_snapshot.txt | 23 +++- 4 files changed, 201 insertions(+), 12 deletions(-) diff --git a/posthog/client.py b/posthog/client.py index 53aaf328..845b14e4 100644 --- a/posthog/client.py +++ b/posthog/client.py @@ -307,6 +307,9 @@ def __init__( capturing code variables. code_variables_ignore_patterns: Variable-name patterns to omit when capturing code variables. + code_variables_mask_url_credentials: Scrub credentials embedded in + URLs/DSNs (e.g. ``user:pass@host``) from captured code variables, + regardless of the surrounding variable name. Defaults to True. in_app_modules: Module/package prefixes treated as in-app frames in captured exceptions. enable_exception_autocapture_rate_limiting: Rate limit diff --git a/posthog/exception_utils.py b/posthog/exception_utils.py index 39e90ac0..2a6e97e9 100644 --- a/posthog/exception_utils.py +++ b/posthog/exception_utils.py @@ -61,6 +61,9 @@ r"(?i)jwt", r"(?i)connection_string", r"(?i)connectionstring", + r"(?i)conn_str", + r"(?i)connstr", + r"(?i)dsn", ] DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS = [r"^__.*"] @@ -80,20 +83,19 @@ # already guarded separately; this caps work for very deep (but acyclic) graphs. _MAX_MASK_DEPTH = 25 -# Catches credentials embedded in URLs/DSNs (e.g. `postgresql://user:pass@host`) so they -# are scrubbed even when the surrounding key/attribute name isn't recognised as sensitive. -# Only the `user:pass` userinfo is removed; the rest of the connection string is kept. +# Redact `user:pass` credentials embedded in URLs/DSNs (e.g. `postgresql://user:pass@host`). +# The lookahead requires a `:` in the userinfo (so a bare `user@host` is left alone), the +# body matches up to the *last* `@` so a password containing `@` is fully redacted, and the +# bounded scheme length keeps matching linear (no catastrophic backtracking). _URL_CREDENTIALS_RE = re.compile( - r"([a-z][a-z0-9+.\-]*://)[^/@\s]*:[^/@\s]+@", re.IGNORECASE + r"([a-z][a-z0-9+.\-]{0,30}://)(?=[^/@\s]*:)[^/\s]*@", re.IGNORECASE ) def _redact_url_credentials(value): if "://" not in value: return value - return _URL_CREDENTIALS_RE.sub( - r"\g<1>" + CODE_VARIABLES_REDACTED_VALUE + "@", value - ) + return _URL_CREDENTIALS_RE.sub(r"\g<1>" + CODE_VARIABLES_REDACTED_VALUE + "@", value) DEFAULT_TOTAL_VARIABLES_SIZE_LIMIT = 20 * 1024 @@ -1132,6 +1134,8 @@ def _mask_sensitive_data( # repr-scanning the whole object. attrs = _extract_object_attrs(value) if attrs is not None: + if len(attrs) > _MAX_COLLECTION_ITEMS_TO_SCAN: + return CODE_VARIABLES_TOO_LONG_VALUE masked = _mask_mapping( attrs.items(), compiled_mask, _seen, _depth, mask_url_credentials ) diff --git a/posthog/test/test_exception_capture.py b/posthog/test/test_exception_capture.py index c53591f0..6c0489a4 100644 --- a/posthog/test/test_exception_capture.py +++ b/posthog/test/test_exception_capture.py @@ -981,6 +981,90 @@ def __repr__(self): assert result.startswith("<") and result.endswith(">") +def test_safe_repr_redacts_when_too_long_to_scan(): + # A repr larger than the scan budget can't be vouched for, so it is redacted whole + # rather than emitted (even if no masking rule visibly matched). + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH, + _compile_patterns, + _safe_repr, + ) + + class Huge: + __slots__ = ("_payload",) + + def __init__(self, payload): + self._payload = payload + + def __repr__(self): + return self._payload + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + secret = "topsecret" + payload = secret + ("x" * (_MAX_VALUE_LENGTH_FOR_PATTERN_MATCH + 1)) + + result = _safe_repr(Huge(payload), compiled_mask) + assert result == CODE_VARIABLES_REDACTED_VALUE + assert secret not in result + + +def test_safe_repr_scrubs_url_credentials_in_repr(): + # An opaque object whose repr embeds a URL credential (but no sensitive keyword) has + # the credential scrubbed while the rest of the repr is preserved. + from posthog.exception_utils import ( + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _compile_patterns, + _safe_repr, + ) + + class Conn: + __slots__ = () + + def __repr__(self): + return "Conn(url=postgresql://user:leakpw@db.example.com/app)" + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + result = _safe_repr(Conn(), compiled_mask) + + assert "leakpw" not in result + assert "db.example.com" in result + + # With URL scrubbing disabled the credential is retained. + unscrubbed = _safe_repr(Conn(), compiled_mask, mask_url_credentials=False) + assert "leakpw" in unscrubbed + + +def test_mask_sensitive_data_caps_object_with_many_attributes(): + # The collection-size cap applies to traversed object attributes too, so a single + # object with thousands of fields can't be fully serialized onto the hot path. + from posthog.exception_utils import ( + CODE_VARIABLES_TOO_LONG_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _MAX_COLLECTION_ITEMS_TO_SCAN, + _compile_patterns, + _mask_sensitive_data, + ) + + class Wide: + def __init__(self, n): + for i in range(n): + setattr(self, "attr_{}".format(i), i) + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + + wide = Wide(_MAX_COLLECTION_ITEMS_TO_SCAN + 5) + assert ( + _mask_sensitive_data(wide, compiled_mask) == CODE_VARIABLES_TOO_LONG_VALUE + ) + + # A small object is still traversed normally. + narrow = _mask_sensitive_data(Wide(2), compiled_mask) + assert isinstance(narrow, dict) + assert narrow["attr_0"] == 0 + + def test_redact_url_credentials(): from posthog.exception_utils import ( CODE_VARIABLES_REDACTED_VALUE, @@ -993,6 +1077,41 @@ def test_redact_url_credentials(): ) == ("postgresql://" + CODE_VARIABLES_REDACTED_VALUE + "@db.example.com:26257/db") # Password-only userinfo is also caught. assert "p4ss" not in _redact_url_credentials("redis://:p4ss@cache:6379") + # A password containing "@" is fully redacted (matched up to the last "@"), not + # partially leaked. + redacted_at = _redact_url_credentials("postgresql://user:p@ssword@host:5432/db") + assert "ssword" not in redacted_at + assert redacted_at == ( + "postgresql://" + CODE_VARIABLES_REDACTED_VALUE + "@host:5432/db" + ) + # Multiple URLs in one string are each scrubbed. + multi = _redact_url_credentials( + "primary=postgres://u:p1@h1 replica=redis://u:p2@h2" + ) + assert "p1" not in multi and "p2" not in multi + assert multi == ( + "primary=postgres://" + + CODE_VARIABLES_REDACTED_VALUE + + "@h1 replica=redis://" + + CODE_VARIABLES_REDACTED_VALUE + + "@h2" + ) + # Other schemes (mongodb, http) are handled the same way. + assert "secret" not in _redact_url_credentials( + "mongodb://admin:secret@mongo:27017" + ) + assert "secret" not in _redact_url_credentials( + "https://admin:secret@api.example.com/v1" + ) + # IPv6 hosts are preserved while credentials are stripped. + assert _redact_url_credentials("redis://user:secret@[::1]:6379") == ( + "redis://" + CODE_VARIABLES_REDACTED_VALUE + "@[::1]:6379" + ) + # A bare username with no password slot is NOT a credential -> left untouched. + assert ( + _redact_url_credentials("ssh://gituser@github.com/repo") + == "ssh://gituser@github.com/repo" + ) # A URL with a port but no credentials is left untouched. assert ( _redact_url_credentials("https://api.example.com:8080/v1") @@ -1145,3 +1264,53 @@ def trigger_error(): assert "code_variables" in output # URL masking explicitly disabled -> credentials are retained. assert "p4ssRUNTIME" in output + + +def test_code_variables_mask_url_credentials_context_override(tmpdir): + # The per-context override must beat the client default: the client masks URL + # credentials by default, but the context turns it off for this block. + import os + + app = tmpdir.join("app.py") + app.write( + dedent( + """ + import os + import posthog + from posthog import Posthog + + posthog_client = Posthog( + 'phc_x', + host='https://eu.i.posthog.com', + debug=True, + enable_exception_autocapture=True, + capture_exception_code_variables=True, + code_variables_mask_url_credentials=True, + project_root=os.path.dirname(os.path.abspath(__file__)) + ) + + def trigger_error(): + # Neutral name + no masked keyword, so only the URL heuristic could scrub it. + db_uri = "postgresql://user:" + os.environ["TEST_DB_PASSWORD"] + "@host:5432/db" + + 1/0 + + with posthog.new_context(client=posthog_client): + posthog.set_code_variables_mask_url_credentials_context(False) + trigger_error() + """ + ) + ) + + with pytest.raises(subprocess.CalledProcessError) as excinfo: + subprocess.check_output( + [sys.executable, str(app)], + stderr=subprocess.STDOUT, + env={**os.environ, "TEST_DB_PASSWORD": "p4ssCONTEXT"}, + ) + + output = excinfo.value.output.decode("utf-8") + + assert "code_variables" in output + # Context override (False) wins over the client default (True) -> creds retained. + assert "p4ssCONTEXT" in output diff --git a/references/public_api_snapshot.txt b/references/public_api_snapshot.txt index f8f1a031..e8c7d815 100644 --- a/references/public_api_snapshot.txt +++ b/references/public_api_snapshot.txt @@ -7,6 +7,7 @@ alias posthog.BeforeSendCallback -> posthog.types.BeforeSendCallback alias posthog.Client -> posthog.client.Client alias posthog.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS alias posthog.DEFAULT_CODE_VARIABLES_MASK_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS +alias posthog.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS alias posthog.ExceptionArg -> posthog.args.ExceptionArg alias posthog.ExceptionCapture -> posthog.exception_capture.ExceptionCapture alias posthog.FeatureFlag -> posthog.types.FeatureFlag @@ -215,6 +216,7 @@ alias posthog.client.APIError -> posthog.request.APIError alias posthog.client.Consumer -> posthog.consumer.Consumer alias posthog.client.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS alias posthog.client.DEFAULT_CODE_VARIABLES_MASK_PATTERNS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS +alias posthog.client.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS -> posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS alias posthog.client.EVENTS_ENDPOINT -> posthog.request.EVENTS_ENDPOINT alias posthog.client.ExceptionArg -> posthog.args.ExceptionArg alias posthog.client.ExceptionCapture -> posthog.exception_capture.ExceptionCapture @@ -253,6 +255,7 @@ alias posthog.client.get -> posthog.request.get alias posthog.client.get_capture_exception_code_variables_context -> posthog.contexts.get_capture_exception_code_variables_context alias posthog.client.get_code_variables_ignore_patterns_context -> posthog.contexts.get_code_variables_ignore_patterns_context alias posthog.client.get_code_variables_mask_patterns_context -> posthog.contexts.get_code_variables_mask_patterns_context +alias posthog.client.get_code_variables_mask_url_credentials_context -> posthog.contexts.get_code_variables_mask_url_credentials_context alias posthog.client.get_context_device_id -> posthog.contexts.get_context_device_id alias posthog.client.get_context_distinct_id -> posthog.contexts.get_context_distinct_id alias posthog.client.get_context_session_id -> posthog.contexts.get_context_session_id @@ -298,6 +301,7 @@ alias posthog.inner_scoped -> posthog.contexts.scoped alias posthog.inner_set_capture_exception_code_variables_context -> posthog.contexts.set_capture_exception_code_variables_context alias posthog.inner_set_code_variables_ignore_patterns_context -> posthog.contexts.set_code_variables_ignore_patterns_context alias posthog.inner_set_code_variables_mask_patterns_context -> posthog.contexts.set_code_variables_mask_patterns_context +alias posthog.inner_set_code_variables_mask_url_credentials_context -> posthog.contexts.set_code_variables_mask_url_credentials_context alias posthog.inner_set_context_device_id -> posthog.contexts.set_context_device_id alias posthog.inner_set_context_session -> posthog.contexts.set_context_session alias posthog.inner_tag -> posthog.contexts.tag @@ -452,6 +456,7 @@ attribute posthog.client.Client.api_key = (project_api_key or '').strip() attribute posthog.client.Client.capture_exception_code_variables = capture_exception_code_variables attribute posthog.client.Client.code_variables_ignore_patterns = code_variables_ignore_patterns if code_variables_ignore_patterns is not None else DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS attribute posthog.client.Client.code_variables_mask_patterns = code_variables_mask_patterns if code_variables_mask_patterns is not None else DEFAULT_CODE_VARIABLES_MASK_PATTERNS +attribute posthog.client.Client.code_variables_mask_url_credentials = code_variables_mask_url_credentials if code_variables_mask_url_credentials is not None else DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS attribute posthog.client.Client.cohorts: Optional[dict[str, Any]] = None attribute posthog.client.Client.consumers = None attribute posthog.client.Client.debug = debug @@ -494,6 +499,7 @@ attribute posthog.client.Client.timeout = timeout attribute posthog.client.MAX_DICT_SIZE = 50000 attribute posthog.code_variables_ignore_patterns = DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS attribute posthog.code_variables_mask_patterns = DEFAULT_CODE_VARIABLES_MASK_PATTERNS +attribute posthog.code_variables_mask_url_credentials = DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS attribute posthog.consumer.AI_MAX_MSG_SIZE = 8 * 1024 * 1024 attribute posthog.consumer.BATCH_SIZE_LIMIT = 5 * 1024 * 1024 attribute posthog.consumer.Consumer.api_key = api_key @@ -516,6 +522,7 @@ attribute posthog.contexts.ContextScope.capture_exceptions = capture_exceptions attribute posthog.contexts.ContextScope.client: Optional[Client] = client attribute posthog.contexts.ContextScope.code_variables_ignore_patterns: Optional[list] = None attribute posthog.contexts.ContextScope.code_variables_mask_patterns: Optional[list] = None +attribute posthog.contexts.ContextScope.code_variables_mask_url_credentials: Optional[bool] = None attribute posthog.contexts.ContextScope.device_id: Optional[str] = None attribute posthog.contexts.ContextScope.distinct_id: Optional[str] = None attribute posthog.contexts.ContextScope.fresh = fresh @@ -546,7 +553,8 @@ attribute posthog.exception_utils.BASE64_ALPHABET = re.compile('^[a-zA-Z0-9/+=]* attribute posthog.exception_utils.CODE_VARIABLES_REDACTED_VALUE = '$$_posthog_redacted_based_on_masking_rules_$$' attribute posthog.exception_utils.CODE_VARIABLES_TOO_LONG_VALUE = '$$_posthog_value_too_long_$$' attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_IGNORE_PATTERNS = ['^__.*'] -attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS = ['(?i)password', '(?i)secret', '(?i)passwd', '(?i)pwd', '(?i)api_key', '(?i)apikey', '(?i)auth', '(?i)credentials', '(?i)privatekey', '(?i)private_key', '(?i)token', '(?i)aws_access_key_id', '(?i)_pass', '(?i)sk_', '(?i)jwt'] +attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_PATTERNS = ['(?i)password', '(?i)secret', '(?i)passwd', '(?i)pwd', '(?i)api_key', '(?i)apikey', '(?i)auth', '(?i)credentials', '(?i)privatekey', '(?i)private_key', '(?i)token', '(?i)aws_access_key_id', '(?i)_pass', '(?i)sk_', '(?i)jwt', '(?i)connection_string', '(?i)connectionstring', '(?i)conn_str', '(?i)connstr', '(?i)dsn'] +attribute posthog.exception_utils.DEFAULT_CODE_VARIABLES_MASK_URL_CREDENTIALS = True attribute posthog.exception_utils.DEFAULT_MAX_VALUE_LENGTH = 1024 attribute posthog.exception_utils.DEFAULT_TOTAL_VARIABLES_SIZE_LIMIT = 20 * 1024 attribute posthog.exception_utils.Event = TypedDict('Event', {'breadcrumbs': Dict[Literal['values'], List[Dict[str, Any]]], 'check_in_id': str, 'contexts': Dict[str, Dict[str, object]], 'dist': str, 'duration': Optional[float], 'environment': str, 'errors': List[Dict[str, Any]], 'event_id': str, 'exception': Dict[Literal['values'], List[Dict[str, Any]]], 'level': LogLevelStr, 'logger': str, 'message': str, 'modules': Dict[str, str], 'monitor_slug': Optional[str], 'platform': Literal['python'], 'profile': object, 'release': str, 'request': Dict[str, object], 'server_name': str, 'spans': List[Dict[str, object]], 'stacktrace': Dict[str, object], 'start_timestamp': datetime, 'status': Optional[str], 'threads': Dict[Literal['values'], List[Dict[str, Any]]], 'timestamp': Optional[datetime], 'transaction': str, 'type': Literal['check_in', 'transaction'], 'user': Dict[str, object], '_metrics_summary': Dict[str, object]}, total=False) @@ -744,7 +752,7 @@ class posthog.ai.types.ToolInProgress class posthog.args.OptionalCaptureArgs class posthog.args.OptionalSetArgs class posthog.bucketed_rate_limiter.BucketedRateLimiter(bucket_size: Number, refill_rate: Number, refill_interval_seconds: Number, on_bucket_rate_limited: Optional[Callable[[Hashable], None]] = None, clock: Callable[[], float] = time.monotonic) -class posthog.client.Client(project_api_key: str, host=None, debug=False, max_queue_size=10000, send=True, on_error=None, flush_at=100, flush_interval=5.0, gzip=False, max_retries=3, sync_mode=False, timeout=15, thread=1, poll_interval=30, personal_api_key=None, disabled=False, disable_geoip=True, is_server=True, historical_migration=False, feature_flags_request_timeout_seconds=3, super_properties=None, enable_exception_autocapture=False, log_captured_exceptions=False, project_root=None, privacy_mode=False, before_send=None, flag_fallback_cache_url=None, enable_local_evaluation=True, flag_definition_cache_provider: Optional[FlagDefinitionCacheProvider] = None, capture_exception_code_variables=False, code_variables_mask_patterns=None, code_variables_ignore_patterns=None, in_app_modules: list[str] | None = None, enable_exception_autocapture_rate_limiting=False, exception_autocapture_bucket_size=ExceptionCapture.DEFAULT_BUCKET_SIZE, exception_autocapture_refill_rate=ExceptionCapture.DEFAULT_REFILL_RATE, exception_autocapture_refill_interval_seconds=ExceptionCapture.DEFAULT_REFILL_INTERVAL_SECONDS, _dedicated_ai_endpoint=False) +class posthog.client.Client(project_api_key: str, host=None, debug=False, max_queue_size=10000, send=True, on_error=None, flush_at=100, flush_interval=5.0, gzip=False, max_retries=3, sync_mode=False, timeout=15, thread=1, poll_interval=30, personal_api_key=None, disabled=False, disable_geoip=True, is_server=True, historical_migration=False, feature_flags_request_timeout_seconds=3, super_properties=None, enable_exception_autocapture=False, log_captured_exceptions=False, project_root=None, privacy_mode=False, before_send=None, flag_fallback_cache_url=None, enable_local_evaluation=True, flag_definition_cache_provider: Optional[FlagDefinitionCacheProvider] = None, capture_exception_code_variables=False, code_variables_mask_patterns=None, code_variables_ignore_patterns=None, code_variables_mask_url_credentials=None, in_app_modules: list[str] | None = None, enable_exception_autocapture_rate_limiting=False, exception_autocapture_bucket_size=ExceptionCapture.DEFAULT_BUCKET_SIZE, exception_autocapture_refill_rate=ExceptionCapture.DEFAULT_REFILL_RATE, exception_autocapture_refill_interval_seconds=ExceptionCapture.DEFAULT_REFILL_INTERVAL_SECONDS, _dedicated_ai_endpoint=False) class posthog.consumer.Consumer(queue, api_key, flush_at=100, host=None, on_error=None, flush_interval=5.0, gzip=False, retries=10, timeout=15, historical_migration=False, dedicated_ai_endpoint=False) class posthog.contexts.ContextScope(parent=None, fresh: bool = False, capture_exceptions: bool = True, client: Optional[Client] = None) class posthog.exception_capture.ExceptionCapture(client: Client, rate_limiting_enabled=False, bucket_size=DEFAULT_BUCKET_SIZE, refill_rate=DEFAULT_REFILL_RATE, refill_interval_seconds=DEFAULT_REFILL_INTERVAL_SECONDS) @@ -865,6 +873,7 @@ function posthog.client.stringify_id(val) function posthog.contexts.get_capture_exception_code_variables_context() -> Optional[bool] function posthog.contexts.get_code_variables_ignore_patterns_context() -> Optional[list] function posthog.contexts.get_code_variables_mask_patterns_context() -> Optional[list] +function posthog.contexts.get_code_variables_mask_url_credentials_context() -> Optional[bool] function posthog.contexts.get_context_device_id() -> Optional[str] function posthog.contexts.get_context_distinct_id() -> Optional[str] function posthog.contexts.get_context_session_id() -> Optional[str] @@ -875,11 +884,12 @@ function posthog.contexts.scoped(fresh: bool = False, capture_exceptions: Option function posthog.contexts.set_capture_exception_code_variables_context(enabled: bool) -> None function posthog.contexts.set_code_variables_ignore_patterns_context(ignore_patterns: list) -> None function posthog.contexts.set_code_variables_mask_patterns_context(mask_patterns: list) -> None +function posthog.contexts.set_code_variables_mask_url_credentials_context(enabled: bool) -> None function posthog.contexts.set_context_device_id(device_id: str) -> None function posthog.contexts.set_context_session(session_id: str) -> None function posthog.contexts.tag(key: str, value: Any) -> None function posthog.evaluate_flags(distinct_id=None, groups=None, person_properties=None, group_properties=None, only_evaluate_locally=False, disable_geoip=None, flag_keys=None, device_id=None) -> FeatureFlagEvaluations -function posthog.exception_utils.attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns) +function posthog.exception_utils.attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True) function posthog.exception_utils.construct_artificial_traceback(e) function posthog.exception_utils.event_hint_with_exc_info(exc_info=None) function posthog.exception_utils.exc_info_from_error(error) @@ -901,7 +911,7 @@ function posthog.exception_utils.iter_stacks(tb) function posthog.exception_utils.mark_exception_as_captured(error, uuid) function posthog.exception_utils.safe_repr(value) function posthog.exception_utils.safe_str(value) -function posthog.exception_utils.serialize_code_variables(frame, limiter, mask_patterns=None, ignore_patterns=None, max_length=1024) +function posthog.exception_utils.serialize_code_variables(frame, limiter, mask_patterns=None, ignore_patterns=None, max_length=1024, mask_url_credentials=True) function posthog.exception_utils.serialize_frame(frame, tb_lineno=None, max_value_length=None) function posthog.exception_utils.set_in_app_in_frames(frames, in_app_exclude, in_app_include, project_root=None) function posthog.exception_utils.should_hide_frame(frame: FrameType) -> bool @@ -909,7 +919,7 @@ function posthog.exception_utils.single_exception_from_error_tuple(exc_type, exc function posthog.exception_utils.strip_string(value, max_length=None) function posthog.exception_utils.to_string(value) function posthog.exception_utils.to_timestamp(value) -function posthog.exception_utils.try_attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns) +function posthog.exception_utils.try_attach_code_variables_to_frames(all_exceptions, exc_info, mask_patterns, ignore_patterns, mask_url_credentials=True) function posthog.exception_utils.walk_exception_chain(exc_info) function posthog.feature_enabled(key, distinct_id, groups=None, person_properties=None, group_properties=None, only_evaluate_locally=False, send_feature_flag_events=True, disable_geoip=None, device_id=None) function posthog.feature_flag_definitions() @@ -957,6 +967,7 @@ function posthog.set(**kwargs: Unpack[OptionalSetArgs]) -> Optional[str] function posthog.set_capture_exception_code_variables_context(enabled: bool) function posthog.set_code_variables_ignore_patterns_context(ignore_patterns: list) function posthog.set_code_variables_mask_patterns_context(mask_patterns: list) +function posthog.set_code_variables_mask_url_credentials_context(enabled: bool) function posthog.set_context_device_id(device_id: str) function posthog.set_context_session(session_id: str) function posthog.set_once(**kwargs: Unpack[OptionalSetArgs]) -> Optional[str] @@ -1082,6 +1093,7 @@ method posthog.contexts.ContextScope.collect_tags() -> Dict[str, Any] method posthog.contexts.ContextScope.get_capture_exception_code_variables() -> Optional[bool] method posthog.contexts.ContextScope.get_code_variables_ignore_patterns() -> Optional[list] method posthog.contexts.ContextScope.get_code_variables_mask_patterns() -> Optional[list] +method posthog.contexts.ContextScope.get_code_variables_mask_url_credentials() -> Optional[bool] method posthog.contexts.ContextScope.get_device_id() -> Optional[str] method posthog.contexts.ContextScope.get_distinct_id() -> Optional[str] method posthog.contexts.ContextScope.get_parent() @@ -1089,6 +1101,7 @@ method posthog.contexts.ContextScope.get_session_id() -> Optional[str] method posthog.contexts.ContextScope.set_capture_exception_code_variables(enabled: bool) method posthog.contexts.ContextScope.set_code_variables_ignore_patterns(ignore_patterns: list) method posthog.contexts.ContextScope.set_code_variables_mask_patterns(mask_patterns: list) +method posthog.contexts.ContextScope.set_code_variables_mask_url_credentials(enabled: bool) method posthog.contexts.ContextScope.set_device_id(device_id: str) method posthog.contexts.ContextScope.set_distinct_id(distinct_id: str) method posthog.contexts.ContextScope.set_session_id(session_id: str) From bee7af4b02549193cc966e69834ec30261e16cbe Mon Sep 17 00:00:00 2001 From: ablaszkiewicz Date: Fri, 19 Jun 2026 23:00:29 +0200 Subject: [PATCH 3/5] fix: comments --- posthog/exception_utils.py | 11 +++-- posthog/test/test_exception_capture.py | 61 ++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/posthog/exception_utils.py b/posthog/exception_utils.py index 2a6e97e9..add5d897 100644 --- a/posthog/exception_utils.py +++ b/posthog/exception_utils.py @@ -95,7 +95,9 @@ def _redact_url_credentials(value): if "://" not in value: return value - return _URL_CREDENTIALS_RE.sub(r"\g<1>" + CODE_VARIABLES_REDACTED_VALUE + "@", value) + return _URL_CREDENTIALS_RE.sub( + r"\g<1>" + CODE_VARIABLES_REDACTED_VALUE + "@", value + ) DEFAULT_TOTAL_VARIABLES_SIZE_LIMIT = 20 * 1024 @@ -1085,7 +1087,10 @@ def _mask_mapping(items, compiled_mask, _seen, _depth, mask_url_credentials=True def _mask_sensitive_data( value, compiled_mask, _seen=None, _depth=0, mask_url_credentials=True ): - if not compiled_mask: + # Name-based masking and URL-credential scrubbing are independent toggles: only skip + # entirely when both are off. `_pattern_matches` safely treats an empty mask as "no + # match", so URL scrubbing still runs when `compiled_mask` is None. + if not compiled_mask and not mask_url_credentials: return value if isinstance(value, str): @@ -1165,7 +1170,7 @@ def _serialize_variable_value( result = CODE_VARIABLES_TOO_LONG_VALUE elif compiled_mask and _pattern_matches(value, compiled_mask): result = CODE_VARIABLES_REDACTED_VALUE - elif compiled_mask and mask_url_credentials: + elif mask_url_credentials: result = _redact_url_credentials(value) else: result = value diff --git a/posthog/test/test_exception_capture.py b/posthog/test/test_exception_capture.py index 6c0489a4..22acbbf8 100644 --- a/posthog/test/test_exception_capture.py +++ b/posthog/test/test_exception_capture.py @@ -1014,6 +1014,7 @@ def test_safe_repr_scrubs_url_credentials_in_repr(): # An opaque object whose repr embeds a URL credential (but no sensitive keyword) has # the credential scrubbed while the rest of the repr is preserved. from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, DEFAULT_CODE_VARIABLES_MASK_PATTERNS, _compile_patterns, _safe_repr, @@ -1028,8 +1029,13 @@ def __repr__(self): compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) result = _safe_repr(Conn(), compiled_mask) + # Only the credential is removed; the surrounding repr (scheme, host, path) is kept. assert "leakpw" not in result - assert "db.example.com" in result + assert result == ( + "Conn(url=postgresql://" + + CODE_VARIABLES_REDACTED_VALUE + + "@db.example.com/app)" + ) # With URL scrubbing disabled the credential is retained. unscrubbed = _safe_repr(Conn(), compiled_mask, mask_url_credentials=False) @@ -1055,9 +1061,7 @@ def __init__(self, n): compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) wide = Wide(_MAX_COLLECTION_ITEMS_TO_SCAN + 5) - assert ( - _mask_sensitive_data(wide, compiled_mask) == CODE_VARIABLES_TOO_LONG_VALUE - ) + assert _mask_sensitive_data(wide, compiled_mask) == CODE_VARIABLES_TOO_LONG_VALUE # A small object is still traversed normally. narrow = _mask_sensitive_data(Wide(2), compiled_mask) @@ -1097,9 +1101,7 @@ def test_redact_url_credentials(): + "@h2" ) # Other schemes (mongodb, http) are handled the same way. - assert "secret" not in _redact_url_credentials( - "mongodb://admin:secret@mongo:27017" - ) + assert "secret" not in _redact_url_credentials("mongodb://admin:secret@mongo:27017") assert "secret" not in _redact_url_credentials( "https://admin:secret@api.example.com/v1" ) @@ -1121,6 +1123,47 @@ def test_redact_url_credentials(): assert _redact_url_credentials("just a string") == "just a string" +def test_url_credentials_scrubbed_with_empty_mask_patterns(): + # URL-credential scrubbing and name-based masking are independent toggles: disabling + # all name patterns (mask_patterns=[]) must NOT silently disable URL scrubbing. + from posthog.exception_utils import ( + CODE_VARIABLES_REDACTED_VALUE, + VariableSizeLimiter, + _compile_patterns, + _mask_sensitive_data, + _serialize_variable_value, + ) + + compiled_mask = _compile_patterns([]) + assert compiled_mask is None # no name patterns at all + + # Structural path (dict -> str value). + masked = _mask_sensitive_data( + {"db_uri": "postgresql://user:p4ss@host:5432/db"}, + compiled_mask, + mask_url_credentials=True, + ) + assert "p4ss" not in masked["db_uri"] + assert CODE_VARIABLES_REDACTED_VALUE in masked["db_uri"] + + # String fast-path in _serialize_variable_value. + out = _serialize_variable_value( + "postgresql://user:p4ss@host/db", + VariableSizeLimiter(), + compiled_mask=None, + mask_url_credentials=True, + ) + assert "p4ss" not in out + + # With both toggles off, the value is returned untouched. + assert ( + _mask_sensitive_data( + "postgresql://user:p4ss@host/db", None, mask_url_credentials=False + ) + == "postgresql://user:p4ss@host/db" + ) + + def test_code_variables_masks_object_attributes_end_to_end(tmpdir): app = tmpdir.join("app.py") app.write( @@ -1192,9 +1235,9 @@ def trigger_error(): assert "uHjH9WJuEV0VT2NKoP7zpQ" not in output assert "$$_posthog_redacted_based_on_masking_rules_$$" in output - # Surrounding, non-sensitive context is preserved. + # Surrounding, non-sensitive context is preserved (sibling fields kept verbatim). assert "PostgresSourceConfig" in output - assert "db.example.com" in output + assert "warehouse" in output assert "traffic_stats" in output From deb3aeddf7d75a7a913db57c6d6e32c6ec27f4fb Mon Sep 17 00:00:00 2001 From: ablaszkiewicz Date: Sat, 20 Jun 2026 16:06:29 +0200 Subject: [PATCH 4/5] fix: deeply nested objects --- posthog/exception_utils.py | 28 +++++--- posthog/test/test_exception_capture.py | 88 +++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 10 deletions(-) diff --git a/posthog/exception_utils.py b/posthog/exception_utils.py index add5d897..e9582495 100644 --- a/posthog/exception_utils.py +++ b/posthog/exception_utils.py @@ -8,6 +8,7 @@ import dataclasses import json import linecache +import math import os import re import sys @@ -83,10 +84,9 @@ # already guarded separately; this caps work for very deep (but acyclic) graphs. _MAX_MASK_DEPTH = 25 -# Redact `user:pass` credentials embedded in URLs/DSNs (e.g. `postgresql://user:pass@host`). -# The lookahead requires a `:` in the userinfo (so a bare `user@host` is left alone), the -# body matches up to the *last* `@` so a password containing `@` is fully redacted, and the -# bounded scheme length keeps matching linear (no catastrophic backtracking). +# Matches `user:pass` credentials in URLs/DSNs (e.g. `postgresql://user:pass@host`). The +# lookahead requires a password, the body extends to the last `@`, and the bounded scheme +# length avoids catastrophic backtracking. _URL_CREDENTIALS_RE = re.compile( r"([a-z][a-z0-9+.\-]{0,30}://)(?=[^/@\s]*:)[^/\s]*@", re.IGNORECASE ) @@ -1087,9 +1087,8 @@ def _mask_mapping(items, compiled_mask, _seen, _depth, mask_url_credentials=True def _mask_sensitive_data( value, compiled_mask, _seen=None, _depth=0, mask_url_credentials=True ): - # Name-based masking and URL-credential scrubbing are independent toggles: only skip - # entirely when both are off. `_pattern_matches` safely treats an empty mask as "no - # match", so URL scrubbing still runs when `compiled_mask` is None. + # Name masking and URL scrubbing are independent toggles, so only skip when both are + # off (`_pattern_matches` treats an empty mask as a non-match). if not compiled_mask and not mask_url_credentials: return value @@ -1102,11 +1101,18 @@ def _mask_sensitive_data( return _redact_url_credentials(value) return value + if isinstance(value, float) and not math.isfinite(value): + # Non-finite floats serialize to invalid JSON (NaN/Infinity) once json.dumps + # runs, so render them as strings here too (see _serialize_variable_value). + return str(value) + if value is None or isinstance(value, (bool, int, float)): return value if _depth >= _MAX_MASK_DEPTH: - return _safe_repr(value, compiled_mask, mask_url_credentials) + # Too deep to keep traversing. A repr could expose a field that traversal would + # have masked by name, so fail closed with a placeholder instead of rendering it. + return CODE_VARIABLES_TOO_LONG_VALUE if _seen is None: _seen = set() @@ -1159,6 +1165,11 @@ def _serialize_variable_value( result = "None" elif isinstance(value, bool): result = str(value) + elif isinstance(value, float) and not math.isfinite(value): + # nan / inf / -inf serialize to the JSON tokens NaN / Infinity, which + # are invalid JSON: a strict parser on the ingestion side can reject the + # whole batch. Emit them as strings ("nan"/"inf"/"-inf") instead. + result = str(value) elif isinstance(value, (int, float)): result_size = len(str(value)) if not limiter.can_add(result_size): @@ -1184,6 +1195,7 @@ def _serialize_variable_value( result = json.dumps( masked_value, default=lambda o: _safe_repr(o, compiled_mask, mask_url_credentials), + allow_nan=False, ) if len(result) > max_length: diff --git a/posthog/test/test_exception_capture.py b/posthog/test/test_exception_capture.py index 22acbbf8..681e116f 100644 --- a/posthog/test/test_exception_capture.py +++ b/posthog/test/test_exception_capture.py @@ -884,8 +884,7 @@ class SourceInputs: config, inputs = result assert config["host"] == "db.example.com" assert config["password"] == CODE_VARIABLES_REDACTED_VALUE - # The surrounding object is NOT nuked - only the sensitive field is. This is the - # key advantage of structural traversal over repr-then-redact-everything. + # Only the sensitive field is redacted; surrounding fields are kept. assert inputs["schema_name"] == "traffic_stats" assert inputs["team_id"] == 101290 assert "topsecret123" not in str(result) @@ -1069,6 +1068,91 @@ def __init__(self, n): assert narrow["attr_0"] == 0 +def test_mask_sensitive_data_deeply_nested_object_does_not_leak(): + # Past the depth limit a custom __repr__ that hides a sensitive field must not leak. + from dataclasses import dataclass + + from posthog.exception_utils import ( + CODE_VARIABLES_TOO_LONG_VALUE, + DEFAULT_CODE_VARIABLES_MASK_PATTERNS, + _MAX_MASK_DEPTH, + _compile_patterns, + _mask_sensitive_data, + ) + + @dataclass + class Box: + password: str + + def __repr__(self): + # Omits the field name, so only name-based traversal can catch the secret. + return f"Box({self.password})" + + class Node: + def __init__(self, child): + self.child = child + + compiled_mask = _compile_patterns(DEFAULT_CODE_VARIABLES_MASK_PATTERNS) + + # Wrap the secret-bearing Box at and past the depth limit. + for depth in [_MAX_MASK_DEPTH - 1, _MAX_MASK_DEPTH, _MAX_MASK_DEPTH + 5]: + value = Box("hunter2") + for _ in range(depth): + value = Node(value) + result = _mask_sensitive_data(value, compiled_mask) + assert "hunter2" not in str(result) + + # Shallow objects are still traversed and redacted by name. + shallow = _mask_sensitive_data(Box("hunter2"), compiled_mask) + assert shallow["password"] != "hunter2" + assert "hunter2" not in str(shallow) + + # A structure that only exceeds the depth limit degrades to the placeholder. + deep = "leaf" + for _ in range(_MAX_MASK_DEPTH + 1): + deep = [deep] + result = _mask_sensitive_data(deep, compiled_mask) + assert CODE_VARIABLES_TOO_LONG_VALUE in str(result) + assert "leaf" not in str(result) + + +def test_non_finite_floats_serialized_as_json_safe_strings(): + # nan / inf / -inf would serialize to the invalid-JSON tokens NaN / Infinity, which + # a strict parser on the ingestion side rejects - potentially dropping the whole + # batch. They must be emitted as plain strings instead. + import json + import math + + from posthog.exception_utils import ( + VariableSizeLimiter, + _mask_sensitive_data, + _serialize_variable_value, + ) + + # Top-level scalars: emitted as strings, never as raw non-finite floats. + assert _serialize_variable_value(float("nan"), VariableSizeLimiter()) == "nan" + assert _serialize_variable_value(float("inf"), VariableSizeLimiter()) == "inf" + assert _serialize_variable_value(float("-inf"), VariableSizeLimiter()) == "-inf" + + # Finite floats are unchanged - still emitted as raw numbers. + assert _serialize_variable_value(3.14, VariableSizeLimiter()) == 3.14 + + # Nested non-finite floats are converted before json.dumps runs, so the serialized + # value contains no NaN/Infinity tokens and parses as strict-valid JSON. + serialized = _serialize_variable_value( + {"ratio": float("inf"), "delta": float("nan"), "ok": 1.5}, + VariableSizeLimiter(), + ) + assert "NaN" not in serialized and "Infinity" not in serialized + assert json.loads(serialized) == {"ratio": "inf", "delta": "nan", "ok": 1.5} + + # The masker also renders non-finite floats as strings directly. + assert _mask_sensitive_data(float("nan"), None) == "nan" + masked_list = _mask_sensitive_data([float("inf"), 2.0], None) + assert masked_list == ["inf", 2.0] + assert all(not (isinstance(v, float) and not math.isfinite(v)) for v in masked_list) + + def test_redact_url_credentials(): from posthog.exception_utils import ( CODE_VARIABLES_REDACTED_VALUE, From 4e3981ba63ceb56ada23bfa8e4f4cc5e63e0ec2d Mon Sep 17 00:00:00 2001 From: ablaszkiewicz Date: Sat, 20 Jun 2026 16:28:15 +0200 Subject: [PATCH 5/5] feat: simplify --- posthog/exception_utils.py | 90 ++++++++++++++------------------------ 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/posthog/exception_utils.py b/posthog/exception_utils.py index e9582495..4e15b20f 100644 --- a/posthog/exception_utils.py +++ b/posthog/exception_utils.py @@ -1010,14 +1010,24 @@ def _pattern_matches(name, patterns): return False +def _mask_string(value, compiled_mask, mask_url_credentials): + """Apply the string masking policy: over-length cap, then name/value patterns, then + embedded URL credentials. Shared by the structural and top-level serializers so the + order and placeholders stay identical.""" + if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: + return CODE_VARIABLES_TOO_LONG_VALUE + if _pattern_matches(value, compiled_mask): + return CODE_VARIABLES_REDACTED_VALUE + if mask_url_credentials: + return _redact_url_credentials(value) + return value + + def _safe_type_name(value): try: return type(value).__qualname__ except Exception: - try: - return type(value).__name__ - except Exception: - return "unknown" + return "unknown" def _safe_repr(value, compiled_mask, mask_url_credentials=True): @@ -1057,9 +1067,6 @@ def _extract_object_attrs(value): try: if dataclasses.is_dataclass(value): return {f.name: getattr(value, f.name) for f in dataclasses.fields(value)} - except Exception: - return None - try: instance_dict = getattr(value, "__dict__", None) except Exception: return None @@ -1093,13 +1100,7 @@ def _mask_sensitive_data( return value if isinstance(value, str): - if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: - return CODE_VARIABLES_TOO_LONG_VALUE - if _pattern_matches(value, compiled_mask): - return CODE_VARIABLES_REDACTED_VALUE - if mask_url_credentials: - return _redact_url_credentials(value) - return value + return _mask_string(value, compiled_mask, mask_url_credentials) if isinstance(value, float) and not math.isfinite(value): # Non-finite floats serialize to invalid JSON (NaN/Infinity) once json.dumps @@ -1157,6 +1158,17 @@ def _mask_sensitive_data( return _safe_repr(value, compiled_mask, mask_url_credentials) +def _finalize(result, limiter, max_length): + """Truncate a string result to ``max_length`` and charge it against the size budget. + Returns ``None`` when the budget is exhausted, which signals callers to stop.""" + if len(result) > max_length: + result = result[: max_length - 3] + "..." + if not limiter.can_add(len(result)): + return None + limiter.add(len(result)) + return result + + def _serialize_variable_value( value, limiter, max_length=1024, compiled_mask=None, mask_url_credentials=True ): @@ -1171,20 +1183,14 @@ def _serialize_variable_value( # whole batch. Emit them as strings ("nan"/"inf"/"-inf") instead. result = str(value) elif isinstance(value, (int, float)): + # Numbers are emitted as raw JSON numbers, so they skip string truncation. result_size = len(str(value)) if not limiter.can_add(result_size): return None limiter.add(result_size) return value elif isinstance(value, str): - if len(value) > _MAX_VALUE_LENGTH_FOR_PATTERN_MATCH: - result = CODE_VARIABLES_TOO_LONG_VALUE - elif compiled_mask and _pattern_matches(value, compiled_mask): - result = CODE_VARIABLES_REDACTED_VALUE - elif mask_url_credentials: - result = _redact_url_credentials(value) - else: - result = value + result = _mask_string(value, compiled_mask, mask_url_credentials) else: masked_value = _mask_sensitive_data( value, compiled_mask, mask_url_credentials=mask_url_credentials @@ -1198,43 +1204,15 @@ def _serialize_variable_value( allow_nan=False, ) - if len(result) > max_length: - result = result[: max_length - 3] + "..." - - result_size = len(result) - if not limiter.can_add(result_size): - return None - limiter.add(result_size) - - return result + return _finalize(result, limiter, max_length) except Exception: + # Fail closed: even if json.dumps chokes (e.g. on non-string dict keys), never + # emit a raw repr - re-render through the masking-aware repr, then a type name. try: - # Fail closed: never emit a raw repr here (e.g. json.dumps choking on - # non-string dict keys) - run it through the same masking-aware repr. - result = _safe_repr(value, compiled_mask, mask_url_credentials) - if len(result) > max_length: - result = result[: max_length - 3] + "..." - - result_size = len(result) - if not limiter.can_add(result_size): - return None - limiter.add(result_size) - return result + rendered = _safe_repr(value, compiled_mask, mask_url_credentials) except Exception: - try: - fallback = f"<{type(value).__name__}>" - fallback_size = len(fallback) - if not limiter.can_add(fallback_size): - return None - limiter.add(fallback_size) - return fallback - except Exception: - fallback = "" - fallback_size = len(fallback) - if not limiter.can_add(fallback_size): - return None - limiter.add(fallback_size) - return fallback + rendered = f"<{_safe_type_name(value)}>" + return _finalize(rendered, limiter, max_length) def _is_simple_type(value):