datajoint
diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py
Lines changed: 5 additions & 0 deletions
diff --git a/src/datajoint/adapters/postgres.py b/src/datajoint/adapters/postgres.py
Lines changed: 25 additions & 0 deletions
diff --git a/src/datajoint/condition.py b/src/datajoint/condition.py
Lines changed: 14 additions & 0 deletions
diff --git a/src/datajoint/deploy.py b/src/datajoint/deploy.py
Lines changed: 185 additions & 0 deletions
diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py
Lines changed: 17 additions & 0 deletions
diff --git a/src/datajoint/spark.py b/src/datajoint/spark.py
Lines changed: 92 additions & 0 deletions
@@ -51,12 +51,15 @@
     "get_codec",
     "ObjectRef",
     "NpyRef",
+    # SparkAdapter Codec Protocol
+    "SparkAdapter",
     # Storage Adapter API
     "StorageAdapter",
     "get_storage_adapter",
     # Other
     "errors",
     "migrate",
+    "deploy",
     "DataJointError",
     "ThreadSafetyError",
     "logger",
@@ -69,6 +72,7 @@
 # =============================================================================
 from . import errors
 from . import migrate
+from . import deploy
 from .codecs import (
     Codec,
     get_codec,
@@ -85,6 +89,7 @@
 from .instance import Instance, _ConfigProxy, _get_singleton_connection, _global_config, _check_thread_safe
 from .logging import logger
 from .objectref import ObjectRef
+from .spark import SparkAdapter
 from .storage_adapter import StorageAdapter, get_storage_adapter
 from .schemas import _Schema, VirtualModule, list_schemas, virtual_schema
 from .autopopulate import AutoPopulate
 
@@ -1280,6 +1280,31 @@ def enum_type_ddl(self, type_name: str, values: list[str]) -> str | None:
         quoted_values = ", ".join(f"'{v}'" for v in values)
         return f"CREATE TYPE {self.quote_identifier(type_name)} AS ENUM ({quoted_values})"
 
+    def replica_identity_ddl(self, full_table_name: str, mode: str) -> str:
+        """
+        Generate ALTER TABLE ... REPLICA IDENTITY statement.
+
+        Controls how much of the old row PostgreSQL writes to WAL on UPDATE/DELETE.
+        ``"default"`` logs only primary-key columns; ``"full"`` logs the entire row.
+        Required by some CDC tools (e.g. Databricks Lakehouse Sync) that need the
+        full pre-image to drive Slowly-Changing-Dimension history.
+
+        The ALTER is metadata-only, instant, and idempotent — re-applying the same
+        mode is a no-op at the storage layer.
+
+        Parameters
+        ----------
+        full_table_name : str
+            Table name as it should appear in the statement (already quoted and
+            schema-qualified, as in the examples below). It is interpolated
+            verbatim; this method does no quoting of its own.
+        mode : str
+            ``"default"`` or ``"full"``. Matched case-insensitively, so
+            ``"FULL"`` is accepted as well.
+
+        Returns
+        -------
+        str
+            The DDL statement, with the mode rendered in upper case.
+
+        Raises
+        ------
+        DataJointError
+            If ``mode`` is not ``"default"`` or ``"full"`` (in any letter case).
+
+        Examples
+        --------
+        >>> adapter.replica_identity_ddl('"schema"."table"', 'full')
+        'ALTER TABLE "schema"."table" REPLICA IDENTITY FULL'
+        >>> adapter.replica_identity_ddl('"schema"."table"', 'default')
+        'ALTER TABLE "schema"."table" REPLICA IDENTITY DEFAULT'
+        """
+        # Fold case before validating; non-string values pass through unchanged
+        # and are rejected by the membership test below.
+        mode_normalized = mode.lower() if isinstance(mode, str) else mode
+        if mode_normalized not in ("default", "full"):
+            # Imported only on the error path, at function scope.
+            from ..errors import DataJointError
+
+            # Report the caller's original value, not the normalized one.
+            raise DataJointError(f"Unsupported replica_identity mode: {mode!r}. Expected 'default' or 'full'.")
+        return f"ALTER TABLE {full_table_name} REPLICA IDENTITY {mode_normalized.upper()}"
+
     def get_pending_enum_ddl(self, schema_name: str) -> list[str]:
         """
         Get DDL statements for pending enum types and clear the pending list.
 
@@ -268,6 +268,20 @@ def assert_join_compatibility(
             lineage2 = expr2.heading[name].lineage
             # Semantic match requires both lineages to be non-None and equal
             if lineage1 is None or lineage2 is None or lineage1 != lineage2:
+                if lineage1 is None or lineage2 is None:
+                    # Missing lineage usually means stale ~lineage rows that survived
+                    # an upgrade or a partial declare. Decoration in 2.3+ refreshes
+                    # lineage automatically, so this typically indicates a schema
+                    # that has not been re-decorated since the upgrade.
+                    raise DataJointError(
+                        f"Cannot join on attribute `{name}`: lineage missing on "
+                        f"one side ({lineage1} vs {lineage2}). This usually "
+                        f"indicates a stale `~lineage` entry from an older "
+                        f"DataJoint version or an incomplete declare. Run "
+                        f"`schema.rebuild_lineage()` to recompute lineage from "
+                        f"current FK definitions. If the lineages are genuinely "
+                        f"different, use `.proj()` to rename one of the attributes."
+                    )
                 raise DataJointError(
                     f"Cannot join on attribute `{name}`: "
                     f"different lineages ({lineage1} vs {lineage2}). "
 
@@ -0,0 +1,185 @@
+"""
+Deployment-time operations for configuring an existing DataJoint pipeline.
+
+This module hosts idempotent operational helpers — things you run as part of a
+deploy hook to configure a schema for its environment, distinct from
+:mod:`datajoint.migrate` which handles one-shot schema/state evolution.
+
+The boundary between the two:
+
+- :mod:`datajoint.migrate` — fix legacy state, evolve a schema definition,
+  retroactive corrections. Cadence: one-shot. Examples: ``migrate_columns``,
+  ``add_job_metadata_columns``, ``rebuild_lineage``.
+- :mod:`datajoint.deploy` — configure an environment for a consumer's
+  requirements (CDC tools, replication, role grants, performance tuning).
+  Cadence: re-runnable, idempotent. Examples: :func:`set_replica_identity`.
+
+Functions in this module should be safe to call repeatedly from a deploy hook
+without accumulating side effects.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Literal, Union
+
+from .errors import DataJointError
+
+if TYPE_CHECKING:
+    from .schemas import _Schema
+    from .table import Table
+
+    TargetType = Union["_Schema", type["Table"], "Table"]
+
+
+def set_replica_identity(
+    target: "TargetType",
+    mode: Literal["default", "full"] = "full",
+    dry_run: bool = True,
+) -> dict:
+    """
+    Apply ``ALTER TABLE ... REPLICA IDENTITY <mode>`` to a schema or table on PostgreSQL.
+
+    ``REPLICA IDENTITY`` controls how much of the **old row** PostgreSQL writes to
+    the write-ahead log on UPDATE/DELETE. Under ``DEFAULT``, only primary-key
+    columns appear in WAL; under ``FULL``, the entire old row does.
+
+    Why this exists
+    ---------------
+    Some change-data-capture (CDC) consumers require the full row pre-image to
+    drive their downstream models. The canonical example is **Databricks
+    Lakehouse Sync**: tables without ``REPLICA IDENTITY FULL`` are silently
+    skipped by the sync — no error, just missing data downstream. Other CDC
+    tools (Debezium, ClickHouse ClickPipes, Azure CDC) work fine with
+    ``DEFAULT`` when tables have a primary key; only Databricks mandates
+    ``FULL``.
+
+    This helper is the **operational** way to apply the setting. It is not a
+    migration: there's no legacy state being fixed; the setting is simply a
+    property of the deployment environment, and a fresh declare in a new
+    environment may need it re-applied. It is idempotent — re-applying the
+    same mode is a no-op at the storage layer — so it is safe to call from a
+    deploy hook on every release.
+
+    Cost
+    ----
+    The ALTER itself is metadata-only and instant, but requires a brief
+    ``AccessExclusiveLock`` on each table — it will block behind in-flight
+    writes/reads on a busy table. Run during a quiet window on actively-
+    ingested tables.
+
+    The ongoing cost is in WAL volume after the change: UPDATE/DELETE on
+    tables with FULL log the entire old row, which can be sizable on tables
+    with TOASTed bytea columns. For DataJoint's typical insert-append
+    workload, this cost is negligible. The notable scenario is bulk
+    ``delete()`` on tables with ``<blob>`` columns — a transient WAL burst
+    proportional to the deleted-row payload size.
+
+    Partial-failure semantics
+    -------------------------
+    If ``connection.query(ddl)`` raises on table N of M, the first N-1
+    tables are already modified at the storage layer but the exception
+    propagates without returning the partial summary. The operation is
+    idempotent, so re-running brings the remaining tables into compliance.
+
+    Compliance considerations
+    -------------------------
+    Under ``DEFAULT``, only primary-key values appear in WAL. Under ``FULL``,
+    entire rows do — including any PHI/PII/sensitive columns. For self-hosted
+    PostgreSQL with unrestricted WAL access this is a real consideration; for
+    managed PostgreSQL with logical replication confined to a specific
+    subscriber (Lakebase, RDS), WAL stays inside the managed environment's
+    security boundary. Apply intentionally.
+
+    Parameters
+    ----------
+    target : Schema or Table
+        A :class:`datajoint.Schema` (all tables returned by its
+        ``list_tables()``) or a :class:`datajoint.Table` class/instance (just
+        that table). A table class is instantiated to read its connection and
+        full table name.
+    mode : str, default ``"full"``
+        ``"default"`` (PK only, minimal WAL) or ``"full"`` (entire row).
+        Matched case-insensitively.
+    dry_run : bool, default ``True``
+        If True, collect the DDL statements but do not execute. Set to False
+        to actually apply.
+
+    Returns
+    -------
+    dict
+        - ``tables_analyzed`` (int): number of tables considered.
+        - ``tables_modified`` (int): number of tables on which the ALTER ran.
+          Always 0 when ``dry_run=True``.
+        - ``ddl`` (list[str]): the DDL statements that were (or would be) executed.
+
+    Raises
+    ------
+    DataJointError
+        If ``mode`` is not one of ``"default"`` / ``"full"``; if ``target`` is
+        not a Schema or a Table class/instance; if the target has no active
+        connection; if a Schema target is not activated; or if the target's
+        adapter does not provide ``replica_identity_ddl`` (i.e. the backend is
+        not PostgreSQL).
+
+    Examples
+    --------
+    >>> from datajoint.deploy import set_replica_identity
+    >>> # Preview
+    >>> set_replica_identity(my_schema, mode="full", dry_run=True)
+    {'tables_analyzed': 12, 'tables_modified': 0, 'ddl': ['ALTER TABLE "ms"."t1" REPLICA IDENTITY FULL', ...]}
+    >>> # Apply
+    >>> set_replica_identity(my_schema, mode="full", dry_run=False)
+    {'tables_analyzed': 12, 'tables_modified': 12, 'ddl': [...]}
+    >>> # Single table
+    >>> set_replica_identity(MyTable, mode="full", dry_run=False)
+
+    See Also
+    --------
+    PostgreSQL: `Logical Replication — Replica Identity
+    <https://www.postgresql.org/docs/current/logical-replication-publication.html>`_.
+    Databricks: `Lakehouse Sync
+    <https://docs.databricks.com/aws/en/oltp/projects/lakehouse-sync>`_.
+    """
+    # Accept any letter case ("FULL", "Full"); non-string values pass through
+    # unchanged and are rejected by the membership test below.
+    mode_normalized = mode.lower() if isinstance(mode, str) else mode
+    if mode_normalized not in ("default", "full"):
+        raise DataJointError(f"mode must be 'default' or 'full'; got {mode!r}")
+    mode = mode_normalized  # type: ignore[assignment]
+
+    connection, adapter, tables = _resolve_target(target)
+
+    # Support is detected by capability: an adapter without
+    # replica_identity_ddl cannot build this statement.
+    if not hasattr(adapter, "replica_identity_ddl"):
+        # Fall back to the class name so a missing ``backend`` attribute cannot
+        # turn this into an AttributeError instead of the intended error.
+        backend = getattr(adapter, "backend", type(adapter).__name__)
+        raise DataJointError(
+            f"set_replica_identity is PostgreSQL-only; the {backend} adapter does not support REPLICA IDENTITY."
+        )
+
+    result: dict[str, Any] = {
+        "tables_analyzed": len(tables),
+        "tables_modified": 0,
+        "ddl": [],
+    }
+    for full_name in tables:
+        ddl = adapter.replica_identity_ddl(full_name, mode)  # type: ignore[attr-defined]
+        result["ddl"].append(ddl)
+        if not dry_run:
+            # Statements run one at a time; an exception here propagates and
+            # leaves earlier tables already altered (see the docstring).
+            connection.query(ddl)
+            result["tables_modified"] += 1
+    return result
+
+
+def _resolve_target(target: "TargetType") -> tuple[Any, Any, list[str]]:
+    """
+    Resolve a schema or table target to ``(connection, adapter, full_table_names)``.
+
+    Private helper for :func:`set_replica_identity`.
+
+    Raises
+    ------
+    DataJointError
+        If ``target`` is not a Schema or Table class/instance, has no active
+        connection, or is a Schema that has not been activated.
+    """
+    # Imported at call time; at module level these names are only imported
+    # under TYPE_CHECKING.
+    from .schemas import _Schema
+    from .table import Table
+
+    if isinstance(target, _Schema):
+        connection = target.connection
+        if connection is None:
+            raise DataJointError("Schema has no active connection.")
+        adapter = connection.adapter
+        if target.database is None:
+            raise DataJointError("Schema is not activated. Call schema.activate(...) before set_replica_identity().")
+        tables = [adapter.make_full_table_name(target.database, t) for t in target.list_tables()]
+        return connection, adapter, tables
+
+    if isinstance(target, type) and issubclass(target, Table):
+        # A table class is instantiated so its connection and name can be read.
+        table = target()
+    elif isinstance(target, Table):
+        table = target
+    else:
+        raise DataJointError(f"target must be a Schema or Table class/instance; got {type(target).__name__}")
+
+    connection = table.connection
+    if connection is None:
+        raise DataJointError(f"Table {type(table).__name__} has no active connection.")
+    return connection, connection.adapter, [table.full_table_name]
@@ -303,6 +303,23 @@ def _decorate_table(self, table_class: type, context: dict[str, Any], assert_dec
         if not is_declared and not assert_declared and create_tables:
             instance.declare(context)
             self.connection.dependencies.clear()
+        elif is_declared and create_tables:
+            # Table already exists — declare() didn't run, so _populate_lineage
+            # didn't either. Scan the already-loaded heading for the symptom
+            # of stale/missing lineage rows (#1454): any PK attribute with
+            # lineage=None indicates the ~lineage table is missing rows for
+            # this table. Only then trigger a refresh — no extra DB queries
+            # on healthy schemas, automatic repair when the bug is present.
+            #
+            # Note: stale-but-non-None rows (DJ version skew that wrote a
+            # different string format) are not auto-detected here; users hit
+            # the tailored "rebuild_lineage" error message on first join.
+            try:
+                pk_lineages = [instance.heading[attr].lineage for attr in instance.primary_key]
+            except Exception:
+                pk_lineages = []
+            if pk_lineages and any(lineage is None for lineage in pk_lineages):
+                instance._refresh_lineage(context)
         is_declared = is_declared or instance.is_declared
 
         # add table definition to the doc string
 
@@ -0,0 +1,92 @@
+"""
+SparkAdapter Codec Protocol.
+
+Opt-in contract for codecs that adapt their decoded values to Spark-native
+types — primitives, lists, dicts, and nested combinations.
+
+Codecs implement ``to_spark`` when they want their column eligible for
+downstream typed-query systems (Spark SQL, Delta Sharing, BI tools).
+Generic codecs like ``<blob@>`` and ``<hash@>`` deliberately do not
+implement it: their decoded values can be arbitrary Python objects with
+no fixed Spark-native shape.
+
+The contract is intentionally a Protocol rather than an abstract method
+on :class:`datajoint.Codec`:
+
+- Generic codecs need no acknowledgement (no ``NotImplementedError`` stubs).
+- Existing plugin codecs continue to work unchanged.
+- Codec authors opt in by adding the method on their own release cadence.
+- Consumers detect support structurally via ``isinstance(codec, SparkAdapter)``.
+
+See ``datajoint-docs/src/reference/specs/spark-adapter.md`` for the
+normative specification (signature, return-value shape constraints,
+worked codec examples).
+"""
+
+from __future__ import annotations
+
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class SparkAdapter(Protocol):
+    """
+    A codec that adapts its decoded values to Spark-native types.
+
+    Opt-in. Codecs implementing this method declare that their decoded
+    values can be expressed as primitives, lists, or dicts of the same —
+    i.e., shapes that map cleanly to Spark's ``StructType`` /
+    ``ArrayType`` / ``MapType``.
+
+    Consumers (e.g., a Databricks silver-layer publish pipeline) check
+    ``isinstance(codec, SparkAdapter)`` per column to determine eligibility.
+
+    Allowed return-value shapes:
+
+    - Primitives: ``bool``, ``int``, ``float``, ``str``, ``bytes``,
+      ``None``, ``datetime.date``, ``datetime.datetime``.
+    - ``list[T]`` where ``T`` is any allowed shape (→ Spark ``ArrayType``).
+    - ``dict[str, T]`` where ``T`` is any allowed shape (→ Spark
+      ``StructType`` or ``MapType``, consumer-decided).
+
+    NumPy arrays must be converted to lists; no tuples, sets, or custom
+    objects in the return value.
+
+    Notes
+    -----
+    The protocol is ``@runtime_checkable``, so the ``isinstance`` check only
+    verifies that the object has a ``to_spark`` attribute. It does not
+    validate the method's signature, and nothing in this class enforces the
+    return-value shapes listed above — honoring them is the implementer's
+    responsibility.
+
+    Examples
+    --------
+    A 1D float-array codec (shipped as a plugin, not in datajoint-python)::
+
+        class FloatArrayCodec(dj.Codec):
+            name = "float_array"
+
+            def encode(self, value, *, key=None, store_name=None): ...
+            def decode(self, stored, *, key=None) -> np.ndarray: ...
+
+            def to_spark(self, decoded: np.ndarray, *, key=None) -> list[float]:
+                return decoded.tolist()  # → Spark ARRAY<DOUBLE>
+
+    Eligibility check::
+
+        from datajoint import SparkAdapter
+        isinstance(FloatArrayCodec(), SparkAdapter)  # True
+    """
+
+    def to_spark(self, decoded: Any, *, key: dict | None = None) -> Any:
+        """
+        Adapt a decoded codec value to a Spark-native shape.
+
+        Parameters
+        ----------
+        decoded : Any
+            The Python value produced by the codec's ``decode()``.
+        key : dict, optional
+            Optional context dict — same shape as ``Codec.encode``'s
+            ``key`` parameter. Most codecs ignore it. Keyword-only.
+
+        Returns
+        -------
+        Any
+            A value composed entirely of allowed Spark-native shapes
+            (see class docstring).
+        """
+        # Protocol stub: there is no default behavior; implementing codecs
+        # supply the body.
+        ...