From e0ee0afcf879bed94a0c2869299456e9f2942312 Mon Sep 17 00:00:00 2001 From: Masaori Koshiba Date: Wed, 24 Jun 2026 08:28:44 +0900 Subject: [PATCH 1/3] cache: shared-memory-backed Dir for fast restart Cold-start cache initialization rebuilds each stripe's in-memory directory from disk on every restart -- multi-minute on large caches. Host the directory in POSIX shared memory so the next process start attaches the existing segment in milliseconds instead of rebuilding it. Recovery stays binary and fail-safe: when the segment cannot be trusted (crash, reboot, ABI/schema or storage mismatch, failed validation) the start drops it and rebuilds via the existing disk path, and reads still validate Doc magic + key so a stale entry is a miss, never corruption. Opt-in behind proxy.config.cache.shm.enabled (default 0), where it is a functional no-op. --- doc/admin-guide/files/records.yaml.en.rst | 83 ++ .../cache-architecture/index.en.rst | 1 + .../shm-fast-restart.en.rst | 656 +++++++++++++++ src/iocore/cache/AggregateWriteBuffer.cc | 1 - src/iocore/cache/AggregateWriteBuffer.h | 2 +- src/iocore/cache/CMakeLists.txt | 2 + src/iocore/cache/CacheDir.cc | 6 + src/iocore/cache/CacheProcessor.cc | 7 + src/iocore/cache/CacheShm.cc | 748 ++++++++++++++++++ src/iocore/cache/CacheShm.h | 98 +++ src/iocore/cache/CacheShmLayout.h | 105 +++ src/iocore/cache/CacheShmPurge.h | 241 ++++++ src/iocore/cache/Stripe.cc | 83 +- src/iocore/cache/Stripe.h | 7 +- src/iocore/cache/StripeSM.cc | 46 +- src/iocore/cache/unit_tests/test_CacheShm.cc | 219 +++++ src/records/RecordsConfig.cc | 9 + src/traffic_ctl/CMakeLists.txt | 3 + src/traffic_ctl/CacheShmCommand.cc | 261 ++++++ src/traffic_ctl/CacheShmCommand.h | 45 ++ src/traffic_ctl/traffic_ctl.cc | 13 + .../cache/cache_shm_bad_disk_dropped.test.py | 226 ++++++ .../cache/cache_shm_concurrent_attach.test.py | 185 +++++ .../cache/cache_shm_fast_restart.test.py | 247 ++++++ .../cache/cache_shm_purge_on_disable.test.py | 232 ++++++ 
.../cache/cache_shm_schema_mismatch.test.py | 207 +++++ .../cache/cache_shm_storage_mismatch.test.py | 210 +++++ .../cache/cache_shm_unclean_shutdown.test.py | 185 +++++ .../gold/cache_shm_state_after_shutdown.gold | 13 + .../replay/cache-shm-fast-restart.replay.yaml | 87 ++ tests/gold_tests/cache/shm_poke.py | 47 ++ .../gold_tests/logging/ts_process_handler.py | 5 +- 32 files changed, 4261 insertions(+), 19 deletions(-) create mode 100644 doc/developer-guide/cache-architecture/shm-fast-restart.en.rst create mode 100644 src/iocore/cache/CacheShm.cc create mode 100644 src/iocore/cache/CacheShm.h create mode 100644 src/iocore/cache/CacheShmLayout.h create mode 100644 src/iocore/cache/CacheShmPurge.h create mode 100644 src/iocore/cache/unit_tests/test_CacheShm.cc create mode 100644 src/traffic_ctl/CacheShmCommand.cc create mode 100644 src/traffic_ctl/CacheShmCommand.h create mode 100644 tests/gold_tests/cache/cache_shm_bad_disk_dropped.test.py create mode 100644 tests/gold_tests/cache/cache_shm_concurrent_attach.test.py create mode 100644 tests/gold_tests/cache/cache_shm_fast_restart.test.py create mode 100644 tests/gold_tests/cache/cache_shm_purge_on_disable.test.py create mode 100644 tests/gold_tests/cache/cache_shm_schema_mismatch.test.py create mode 100644 tests/gold_tests/cache/cache_shm_storage_mismatch.test.py create mode 100644 tests/gold_tests/cache/cache_shm_unclean_shutdown.test.py create mode 100644 tests/gold_tests/cache/gold/cache_shm_state_after_shutdown.gold create mode 100644 tests/gold_tests/cache/replay/cache-shm-fast-restart.replay.yaml create mode 100644 tests/gold_tests/cache/shm_poke.py diff --git a/doc/admin-guide/files/records.yaml.en.rst b/doc/admin-guide/files/records.yaml.en.rst index a554c554a2d..38fc75089c0 100644 --- a/doc/admin-guide/files/records.yaml.en.rst +++ b/doc/admin-guide/files/records.yaml.en.rst @@ -3003,6 +3003,89 @@ RAM Cache Compression runs on task threads. 
To use more cores for RAM cache compression, increase :ts:cv:`proxy.config.task_threads`. +.. _admin-cache-shm-fast-restart: + +Shared Memory Fast Restart +========================== + +|TS| can optionally keep the cache directory -- the in-memory index that maps +cached objects to their location on disk -- in POSIX shared memory so that it +survives a process restart. On a normal start the directory is read from disk +and, for a large cache, rebuilt in memory before the cache comes online. When +this feature is enabled and the previous instance shut down cleanly, the new +instance attaches the existing shared memory segments and skips that work, +bringing the cache online much faster. + +The shared memory directory is only an optimization for restart time; the +on-disk cache always remains the source of truth. A new instance discards the +segments and falls back to reading the directory from disk whenever they cannot +be trusted, including when: + +- the previous instance did not shut down cleanly (for example, it crashed), +- the on-disk storage layout described by :file:`storage.yaml` changed, +- the |TS| binary's directory structures changed (an ABI mismatch, such as + after an upgrade), or +- the shared memory schema version changed. + +Segments left over from a crash can be inspected or removed with +``traffic_ctl cache shm status`` and ``traffic_ctl cache shm clear``, which act +directly on the shared memory objects whether or not |TS| is running. + +.. note:: + + This is an experimental feature, disabled by default. All of its settings + take effect only on a restart of |TS|. + +.. ts:cv:: CONFIG proxy.config.cache.shm.enabled INT 0 + + Enables the shared memory cache directory described above. When ``0`` (the + default), the cache directory is always read from disk on start. + +.. ts:cv:: CONFIG proxy.config.cache.shm.name_prefix STRING ats + + The word used to name the POSIX shared memory objects, which on Linux appear + under ``/dev/shm``. 
Set only the middle word (default ``ats``); |TS| frames it + as ``/<word>-`` so the leading ``/`` that POSIX requires and the trailing + ``-`` separator cannot be mis-typed. With the default the control segment is + named ``/ats-control`` and each per-stripe directory segment ``/ats-s<N>`` + (for example ``/ats-s0``). Any stray framing characters are trimmed, so a + value carried over from an older release (such as ``/ats-``) still resolves to + the same names. Give each |TS| instance sharing a host a distinct word so + their segments do not collide. + + Renaming this value does not remove segments created under the old prefix: + |TS| only manages segments under the *current* prefix, so the old ``/dev/shm`` + objects linger until cleared manually with ``traffic_ctl cache shm clear + --prefix <old-word>`` (or a host reboot). + +.. ts:cv:: CONFIG proxy.config.cache.shm.use_hugepages INT 0 + + When enabled (``1``), |TS| attempts to back the shared memory directory with + huge pages to reduce TLB pressure. This requires the shared memory to be + eligible for huge pages (for example, ``/dev/shm`` mounted with huge page + support on Linux). When it is not, |TS| logs a debug message under the + ``cache_shm`` tag and transparently falls back to ordinary pages, so + enabling this is always safe. + +.. ts:cv:: CONFIG proxy.config.cache.shm.purge_stale_on_start INT 0 + + When enabled (``1``) and :ts:cv:`proxy.config.cache.shm.enabled` is ``0``, + |TS| removes any leftover shared memory segments for + :ts:cv:`proxy.config.cache.shm.name_prefix` at startup (the ``<prefix>control`` + segment and the per-stripe segments it lists). 
This guards against two + hazards of running with the feature disabled after it had been enabled: + + - the leftover segments keep consuming memory (for example ``/dev/shm`` on + Linux) even though the disabled instance never reads them, and + - a later run with the feature re-enabled would otherwise fast-attach a + directory that went stale while |TS| ran disabled and wrote only to disk. + + The purge is skipped if a live process still owns the segments (a concurrent + instance using the same prefix), and it never blocks startup. It has no + effect when the feature is enabled, when no ``control`` segment + exists, or when set to ``0`` (the default). ``traffic_ctl cache shm clear`` + performs the same cleanup on demand. + .. _admin-heuristic-expiration: Heuristic Expiration diff --git a/doc/developer-guide/cache-architecture/index.en.rst b/doc/developer-guide/cache-architecture/index.en.rst index 4e78f8febc3..1c9da957c70 100644 --- a/doc/developer-guide/cache-architecture/index.en.rst +++ b/doc/developer-guide/cache-architecture/index.en.rst @@ -41,5 +41,6 @@ understanding and modifying the source. api-functions.en consistency.en ram-cache.en + shm-fast-restart.en cache-tool.en tiered-storage.en diff --git a/doc/developer-guide/cache-architecture/shm-fast-restart.en.rst b/doc/developer-guide/cache-architecture/shm-fast-restart.en.rst new file mode 100644 index 00000000000..48254107cda --- /dev/null +++ b/doc/developer-guide/cache-architecture/shm-fast-restart.en.rst @@ -0,0 +1,656 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. include:: ../../common.defs +.. default-domain:: cpp + +.. _cache-shm-fast-restart: + +Shared-Memory Cache Directory (Fast Restart) +******************************************** + +.. note:: + + This is an experimental feature, disabled by default. It is controlled by + the ``proxy.config.cache.shm.*`` settings (see :ref:`configuration + <cache-shm-configuration>`). The administrator-facing description lives at + :ref:`admin-cache-shm-fast-restart`; this document covers the design. + +Motivation +========== + +The :ref:`cache directory ` is the memory-resident index that +maps cached objects to their location on disk. It is rebuilt every time |TS| +starts: each stripe reads its two on-disk directory copies, picks the newer +valid one, and then runs recovery (``StripeSM::recover_data``) to replay +the fragments written since the last directory sync. For a large cache this is +the dominant cost of a restart -- the cache is not online, and therefore not +serving from cache, until it finishes. + +The directory itself, however, is purely a function of state |TS| already had +in memory in the previous process. If that memory could *survive* the process +restart, the new process could attach it and come online immediately, skipping +both the disk read and recovery. + +The shared-memory fast-restart feature does exactly that. It hosts each +stripe's ``Directory::raw_dir`` buffer in a POSIX shared-memory segment +(:manpage:`shm_open(3)`, on Linux backed by ``tmpfs`` under ``/dev/shm``). 
+Because the segment is owned by the kernel and not by the process, it outlives +an orderly ``traffic_server`` exit. The next start re-maps the existing segment +in milliseconds instead of rebuilding from disk. + +Design principles +================= + +The feature is built around two non-negotiable invariants. + +**The on-disk cache is always the source of truth.** The shared-memory +directory is *only* an optimization of restart time. The data fragments +themselves are never kept in shared memory -- they are read from disk on demand +exactly as before. The shared segment holds the directory index and nothing +else. + +**Recovery is binary.** The shared segment is either trustworthy enough to +attach wholesale, or it is dropped and the stripe rebuilds from disk through +the existing cold-start path. There is no attempt to repair, partially trust, +checksum, or torn-write-detect the segment. Every gate described below is a +fail-closed test: if anything is wrong or even ambiguous, the answer is "drop +and rebuild," which is always correct because the disk is authoritative. + +This keeps the trusted code small. The fast path adds no new durability +mechanism; it borrows the one the cache already has. Whenever the shared +segment is unavailable for any reason, |TS| takes precisely the path it takes +today after an unclean shutdown. + +Object layout +============= + +The feature uses two kinds of shared-memory object, defined in +:ts:git:`src/iocore/cache/CacheShmLayout.h`. + +.. code-block:: text + + POSIX shared memory (e.g. /dev/shm on Linux) + + control one per traffic_server instance + +-------------------------------------------------------------+ + | magic "ATS-SHM\0" schema_version abi_hash | + | storage_signature clean_shutdown owner_pid | + | stripe_count | + | stripes[0 .. 
MAX_STRIPES-1]: | + | { shm_name, raw_dir_size, stripe_key_hash } | + +-------------------------------------------------------------+ + | | | + v v v + s0 s1 s2 per-stripe raw_dir + +-----------+ +-----------+ +-----------+ + | header | | header | | header | StripeHeaderFooter + | dir[] | | dir[] | | dir[] | directory entries + | footer | | footer | | footer | + +-----------+ +-----------+ +-----------+ + +The control segment +------------------- + +There is one control segment per instance, named ``control``. It is a +fixed-size ``cache_shm::CacheShmControl`` -- a header plus a table of +up to ``MAX_STRIPES`` (256) ``cache_shm::StripeEntry`` rows. A +``static_assert`` keeps the whole control segment under 32 KiB. Its fields: + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Field + - Purpose + * - ``magic`` + - ``"ATS-SHM\0"``. Identifies a |TS| control segment and is the first + thing checked on attach. + * - ``schema_version`` + - The on-shm wire-format version. Bumped whenever the meaning of the + layout changes; a mismatch drops the segment. + * - ``abi_hash`` + - A compile-time fingerprint of the binary's directory structures (see + ``CacheShm::abi_hash``). A mismatch -- e.g. after an upgrade that + changed ``Dir`` -- drops the segment. + * - ``storage_signature`` + - A fingerprint of the ``storage.config`` topology. **Not** a hard + gate; see `Storage changes and partial attach`_. + * - ``clean_shutdown`` + - ``1`` only between a clean shutdown and the next attach. ``0`` at all + other times, including throughout a running process, so a crash leaves + it ``0``. + * - ``owner_pid`` + - PID of the process currently mapping the segment read-write, or ``0`` + when none. Backs the concurrent-attach guard. + * - ``stripe_count`` + - High-water mark of used rows in ``stripes[]``. + * - ``stripes[]`` + - One row per stripe: its segment name, the segment's byte size, and the + 64-bit stripe identity hash used to match a stripe to its prior segment. 
+Per-stripe directory segments +----------------------------- + +Each stripe's directory lives in its own segment, ``<prefix>s<N>``. The mapped +region *is* the stripe's ``Directory::raw_dir``: the +:cpp:class:`StripeHeaderFooter` header, the array of :cpp:class:`Dir` entries, +and the footer, in exactly the same byte layout the cache writes to disk. A +stripe reads and writes its directory through this mapping for the entire run, +so the segment is continuously current -- there is no separate "flush to shared +memory" step. + +Naming +------ + +All names derive from :ts:cv:`proxy.config.cache.shm.name_prefix`, which is just +the middle word (default ``ats``). |TS| frames that word as ``/<word>-`` -- the +leading ``/`` that POSIX shared memory requires and the trailing ``-`` separator +are supplied by ``cache_shm::normalize_name_prefix``, not the operator, +so neither can be mis-typed; any stray framing carried over from an older config +(for example a literal ``/ats-``) is trimmed first, so it can never become an +invalid embedded-slash name like ``//ats--``. With the default word the framed +prefix is ``/ats-``: the control segment is ``/ats-control`` and stripe segments +are ``/ats-s<N>`` where ``N`` is a per-instance slot index. Names are kept under +``cache_shm::MAX_SHM_NAME_LEN`` (31) characters because macOS caps POSIX +shared-memory names (``PSHMNAMLEN``) at 31 including the leading ``/``; keeping +to that limit makes the same naming work on Linux and macOS. Instances sharing +a host **must** use distinct words so their segments do not collide. + +Note that the stripe segment name is just a slot label. A stripe is matched to +its prior segment by ``stripe_key_hash`` (a 64-bit FNV-1a of the stripe's +``hash_text``), **not** by name or index, so a span going offline can shift +slot numbers without breaking the identity match. 
+ +Startup +======= + +``CacheShm::initialize`` runs from +``CacheProcessor::start_internal``, after the :cpp:class:`Store` is +read but before any :cpp:class:`Stripe` is constructed. It loads the +configuration, then opens the control segment and selects one of three modes: + +.. list-table:: + :header-rows: 1 + :widths: 22 78 + + * - Mode + - Meaning + * - ``Disabled`` + - The feature is off (or a fatal precondition failed, such as a name that + is too long or losing the concurrent-attach race). Stripes use the + normal heap/hugepage directory; behavior is identical to stock |TS|. + * - ``AttachExisting`` + - A trustworthy prior control segment exists. Stripes attach their prior + segment by identity, or create a fresh one where there is no match. + * - ``CreateFresh`` + - No usable prior control segment. A new one is created and every stripe + segment is created empty (the cold path, but now shared-memory-backed + for *next* time). + +Trust gates +----------- + +When a prior control segment exists, ``initialize`` applies these gates in +order. The first failure drops the entire control segment (unlinking every +stripe segment it lists) and falls through to ``CreateFresh``: + +.. list-table:: + :header-rows: 1 + :widths: 26 74 + + * - Gate + - Drops the segment when... + * - concurrent-attach guard + - another live process is mapping the segment (see below). This actually + disables shared memory for the run rather than dropping -- the live + owner's segment must be left intact. + * - ``magic`` + - the magic bytes do not match (not our segment, or corrupt). + * - ``schema_version`` + - the on-shm format version differs from this binary's. + * - ``abi_hash`` + - the binary's directory structures differ from the writer's (e.g. an + upgrade changed ``Dir``, ``StripeHeaderFooter``, ``DIR_DEPTH``, ...). + * - ``clean_shutdown`` + - the previous run did not set it to ``1`` -- i.e. it crashed or was + killed. 
A crash may have left directory entries pointing at fragments + that were never flushed, so no stripe can safely skip recovery. + +If every gate passes, ``initialize`` adopts the segment: it records itself as +``owner_pid``, sets ``clean_shutdown = 0`` (so a crash *this* run drops the +segment next time), ``msync``\ s the header, and enters ``AttachExisting``. The +per-stripe work then happens lazily as each stripe initializes. + +Concurrent-attach guard +----------------------- + +Two ``traffic_server`` processes must never map the same directory read-write; +the second would corrupt the first's live index. ``clean_shutdown`` is no help +here -- it says nothing about a process that is *currently* running. The guard +is therefore based on ownership, with two layers: + +* **flock.** ``initialize`` takes a non-blocking exclusive ``flock`` on the + control-segment fd and holds it for the entire process lifetime + (``g_control_fd``). The kernel releases it automatically on exit *or crash*, + so it is self-healing. If the lock is already held + (``LockResult::HeldByOther``), a live owner exists and the new process + disables shared memory for its run. This is authoritative on Linux/``tmpfs``. + +* **owner_pid liveness.** macOS POSIX shared memory does not honor ``flock`` + (``LockResult::Unsupported``). There, the guard falls back to the recorded + ``owner_pid``: if it names a live process other than ourselves + (``CacheShm::process_is_alive``, via ``kill(pid, 0)``), the new + process disables shared memory. A clean shutdown clears ``owner_pid`` to + ``0``; a crash leaves a stale pid, but a crash also leaves + ``clean_shutdown = 0``, so the segment is dropped by that gate anyway. + +A symmetric check guards the ``CreateFresh`` path: after creating the fresh +control segment, ``initialize`` takes the lock, and if it lost a creation race +to another starting process it backs out and disables shared memory for the +run. 
+ +Per-stripe attach and the fast path +==================================== + +For each stripe, ``Stripe::_init_directory`` asks +``CacheShm::attach_or_create_stripe`` for its ``raw_dir`` *before* +falling back to the hugepage / aligned-heap allocation: + +.. code-block:: cpp + + this->directory.raw_dir = CacheShm::attach_or_create_stripe(hash_text.get(), directory_size); + if (this->directory.raw_dir == nullptr) { + // shm disabled or attach/create failed -> hugepage, then aligned heap + } + +``attach_or_create_stripe`` looks up the stripe by ``stripe_key_hash`` in the +control table: + +* **Match found** (and the recorded size matches): map the existing segment and + return it. This is the segment the previous run left behind. +* **No match**: reserve a fresh table slot and create a new, zero-filled + segment. + +A freshly created segment has a zero header magic, so the fast-attach gate +below rejects it and ``StripeSM::init`` falls through to the normal disk +read, which repopulates the directory in place. + +The fast-attach gate +-------------------- + +In ``AttachExisting`` mode, when ``raw_dir`` came from shared memory, +``StripeSM::init`` checks whether the in-segment directory can be trusted +without reading disk: + +#. ``header->magic`` and ``footer->magic`` are both ``STRIPE_MAGIC``; +#. the directory version is within + ``[CACHE_DB_MAJOR_VERSION_COMPATIBLE, CACHE_DB_MAJOR_VERSION]``; +#. ``Stripe::_shm_directory_is_valid`` passes (see below). + +When all three hold, the stripe skips both the disk read **and** +``StripeSM::recover_data`` -- which would otherwise rescan the tail and +discard the very entries the shared segment preserved -- and jumps straight to +the post-recovery state (``sector_size``, ``scan_pos``, +``periodic_scan``, then ``StripeSM::dir_init_done``), +mirroring the tail of ``handle_recover_write_dir()``. 
It logs:: + + attaching cached directory from shm for '<hash_text>' (fast restart, recovery skipped) + +If any check fails, it logs ``shm directory invalid ...; falling back to disk +read`` and proceeds exactly as a cold start would. + +Bounds-validating a trusted segment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The magic/version checks confirm the segment *looks like* a directory, but a +stale-yet-magic-valid segment could still present offsets that would turn into +out-of-bounds disk I/O. ``Stripe::_shm_directory_is_valid`` is a +defensive bounds check over the trusted header fields: + +* ``sector_size`` is non-zero and no larger than ``STORE_BLOCK_SIZE``; +* ``write_pos``, ``last_write_pos`` and ``agg_pos`` all lie within the stripe's + data region (``[start, skip + len]``); +* every per-segment free-list head indexes a ``Dir`` entry within its segment + (walking a free list from an out-of-range head would run off the end). + +A failure here is treated like any other attach miss: drop to the disk read and +recover. It is cheap insurance against a corrupted or version-skewed segment +that slipped past the coarse magic check. + +Storage changes and partial attach +=================================== + +A change to ``storage.yaml`` does **not** invalidate the whole control +segment. ``storage_signature`` is recorded and used only to phrase the startup +log line ("partial -- storage changed"); it is not a trust gate. The actual +reconciliation is per stripe, driven by identity: + +* A stripe whose ``stripe_key_hash`` still matches a table entry of the right + size attaches its prior segment as usual. +* A stripe that is new, relocated, or resized finds no match and creates a + fresh segment (then loads from disk). +* A table entry that *no* stripe claimed this run is an **orphan** -- its stripe + left the cache (a span was dropped, or a disk failed to open). 
+ +``CacheShm::finalize_attach``, called from +``CacheProcessor::cacheInitialized`` once every stripe has initialized, +reclaims the orphans: it unlinks each unclaimed segment, tombstones its slot for +reuse, and trims trailing tombstones so ``stripe_count`` tracks the live +high-water mark. + +One guard matters here: if **zero** stripes claimed a segment this run, +``finalize_attach`` leaves every entry intact. Zero claims cannot be +distinguished from an aborted init (for example a transient ``volume.config`` +error), and reclaiming a valid cache's segments would be far worse than leaking +them for one run. + +Shutdown +======== + +A clean shutdown is what makes the next start fast, so the directory must be +made final and the segment marked clean -- in that order. + +Wiring +------ + +On a clean exit, ``AutoStopCont::mainEvent`` calls +``sync_cache_dir_on_shutdown()`` whenever the cache is initialized. +``sync_cache_dir_on_shutdown`` stops every stripe (taking +each stripe mutex, so all writers are quiesced), and only then calls +``CacheShm::mark_clean_shutdown``, which sets ``clean_shutdown = 1``, +clears ``owner_pid`` to ``0``, and ``msync``\ s the header. When the feature is +disabled, ``mark_clean_shutdown`` is a no-op (there is no control segment), so +the shutdown path is unchanged for a stock |TS|. + +Skipping the on-disk directory write +------------------------------------ + +For a shared-memory-backed stripe, ``StripeSM::shutdown`` still flushes +the aggregation buffer (so pending *content* reaches disk) but then **skips the +on-disk directory write** entirely: + +.. code-block:: text + + Dir : shm-backed, skipping on-disk directory write + +The shared segment is already the durable copy of the directory and is attached +directly next start, so writing the A/B copies to disk would be pure waste. 
The +trade-off is deliberate and safe: if the segment is later dropped for any +reason, the on-disk A/B copies plus ``StripeSM::recover_data`` reconcile +the tail -- the same path an unclean restart already takes. + +Invalidating on flush failure +----------------------------- + +If the aggregation-buffer flush at shutdown fails (e.g. the disk went bad), the +on-disk content no longer matches the directory, so the shared segment must not +be trusted next start. ``StripeSM::shutdown`` calls +``CacheShm::invalidate_stripe_directory``, which zeroes the in-segment +header magic and ``msync``\ s it. Next start, the fast-attach gate rejects that +one stripe on the magic check and it reloads from disk and recovers, while the +other stripes still fast-attach. + +Crash and recovery summary +========================== + +The state machine reduces to: *the segment is attached only when it is provably +consistent, and dropped otherwise.* + +.. list-table:: + :header-rows: 1 + :widths: 34 66 + + * - Event between runs + - Next start + * - Clean shutdown, unchanged binary & storage + - Fast attach. Recovery skipped. Cache online in milliseconds. + * - Crash / ``SIGKILL`` + - ``clean_shutdown`` still ``0`` -> drop, rebuild from disk + recover. + * - Binary upgrade changing directory structures + - ``abi_hash`` mismatch -> drop, rebuild. + * - Schema bump + - ``schema_version`` mismatch -> drop, rebuild. + * - ``storage.config`` change + - Control segment kept; matching stripes fast-attach, changed stripes + rebuild, orphans reclaimed. + * - Per-stripe shutdown flush failed + - That stripe's segment was invalidated -> it rebuilds; others + fast-attach. + * - Another live owner using the prefix + - Refuse to attach; shared memory disabled for this run. + +In every "drop/rebuild" row, |TS| behaves exactly as it does today without the +feature -- the fast path is the only thing lost. 
+ +Huge pages +========== + +The large directory segments make page-table teardown at process exit +non-trivial: ``exit_mmap`` walks O(number of PTEs), which for multi-gigabyte +directories can cost seconds. Backing the mapping with huge pages cuts the PTE +count ~512x and the teardown cost with it. + +When :ts:cv:`proxy.config.cache.shm.use_hugepages` is set, |TS| advises +transparent huge pages on the mapping with ``madvise(MADV_HUGEPAGE)``. +``MAP_HUGETLB`` is deliberately **not** used: ``shm_open`` fds are ``tmpfs`` +backed, and ``MAP_HUGETLB`` requires a ``hugetlbfs`` fd, so it always fails with +``EINVAL``. The advice requires shmem THP to be enabled on the host (for +example ``/sys/kernel/mm/transparent_hugepage/shmem_enabled`` set to ``advise`` +or ``always``, or the ``tmpfs`` mounted with ``huge=advise``). When huge pages +are unavailable the ``madvise`` simply logs a debug line under the +``cache_shm`` tag and the kernel uses base pages, so enabling the setting is +always safe. + +Concurrency model +================= + +Stripes initialize concurrently across the AIO/disk threads, so the +control-table bookkeeping is locked, but the slow shared-memory syscalls are +kept out of the critical section: + +* ``g_table_mutex`` guards the control-segment stripe table and the per-run + claim bookkeeping. ``attach_or_create_stripe`` decides what to do (reuse a + table slot or reserve a fresh one) under the lock, then **drops it** before + ``shm_open`` / ``ftruncate`` / ``mmap``. Each stripe owns a distinct segment, + so the syscalls never touch another thread's segment. Holding the lock across + them would serialize every disk thread's init. +* ``g_pointers_mutex`` guards the set of pointers handed out, so + ``CacheShm::is_shm_pointer`` (used to tell a shm-backed directory from + a heap-allocated one, e.g. to skip the redundant on-disk directory write) is + thread-safe. 
+* Slot reservation tombstones a slot if the create later fails + (``release_reserved_slot``), so a failed create cannot strand a half-built + table entry. + +Disabling the feature: stale-segment purge +========================================== + +Running with the feature **disabled** after it had been enabled is hazardous in +two ways: the leftover segments keep consuming memory the disabled instance +never reads, and a later re-enabled run could fast-attach a directory that went +stale while |TS| ran disabled (writing only to disk). To address this, +:ts:cv:`proxy.config.cache.shm.purge_stale_on_start` (opt-in) makes a disabled +start best-effort remove any leftover segments for the configured prefix. + +The purge shares one primitive with the operator tooling (see below): +``cache_shm::purge_segments`` in :ts:git:`src/iocore/cache/CacheShmPurge.h`. It +enumerates the stripe table and unlinks every stripe segment plus the control +object, returning a structured ``PurgeReport`` that each caller renders in its +own format. It refuses to unlink anything owned by a live process (the same +flock + ``owner_pid`` guard used at attach), and it never blocks startup. An +already-gone segment (``ENOENT``) is the desired end state and is not counted as +a failure. + +Operator tooling: ``traffic_ctl cache shm`` +============================================ + +Because crash-leftover segments may need inspecting when no live process is +around to query, the tooling acts on the shared-memory objects **directly**, via +``shm_open``, rather than over JSON-RPC. For that reason ``traffic_ctl`` does +**not** link the cache library; the small amount of shared logic lives in +header-only form (:ts:git:`src/iocore/cache/CacheShmLayout.h` and +``CacheShmPurge.h``). 
+ +``traffic_ctl cache shm status [--prefix P]`` + Maps the control segment read-only and prints its header (magic, + schema/abi/storage fingerprints, ``clean_shutdown``, and whether + ``owner_pid`` names a live process) followed by the stripe table, flagging + each segment ``present`` / ``MISSING`` and each free slot as a tombstone. + +``traffic_ctl cache shm clear [--prefix P]`` + Removes the segments via the shared ``purge_segments`` primitive. It + **refuses** to clear segments owned by a live ``traffic_server`` (stop it + first), so it cannot orphan a running instance's fast restart. This is the + on-demand equivalent of ``purge_stale_on_start``. + +.. _cache-shm-configuration: + +Configuration +============= + +All settings are under ``proxy.config.cache.shm`` and take effect only on a +restart (``RECU_RESTART_TS``). See :ref:`admin-cache-shm-fast-restart` for the +full administrator-facing descriptions. + +.. list-table:: + :header-rows: 1 + :widths: 38 12 50 + + * - Setting + - Default + - Effect + * - :ts:cv:`proxy.config.cache.shm.enabled` + - ``0`` + - Master switch. ``0`` = always read the directory from disk (stock + behavior). + * - :ts:cv:`proxy.config.cache.shm.name_prefix` + - ``ats`` + - Middle word of the shared-memory object names; framed as ``/<word>-`` + (the ``/`` and ``-`` are added by |TS|). Give co-located instances + distinct words. + * - :ts:cv:`proxy.config.cache.shm.use_hugepages` + - ``0`` + - Advise transparent huge pages on the directory mappings. Safe when + unavailable; falls back to base pages. + * - :ts:cv:`proxy.config.cache.shm.purge_stale_on_start` + - ``0`` + - When the feature is disabled, best-effort remove leftover segments for + the prefix at startup. + +Platform considerations +======================= + +* **Linux** is the primary target: ``tmpfs`` (``/dev/shm``) backs the segments, + ``flock`` is authoritative for the concurrent-attach guard, and shmem THP + provides the huge-page teardown win.
+* **macOS** is supported for development and testing on a best-effort basis. + POSIX shared-memory names are limited to 31 characters (the reason for + ``MAX_SHM_NAME_LEN``). ``flock`` is not honored on shm fds, so the + concurrent-attach guard is best-effort there: it relies on the ``owner_pid`` + liveness backstop alone (the ``kill(pid, 0)`` check), which narrows the window + but cannot make the attach atomic the way ``flock`` does on Linux. The kernel + also rounds a segment up to a page boundary, so ``open_and_map_shm`` accepts + any size in ``[requested, page-up]``. +* The feature is inert at the default :ts:cv:`proxy.config.cache.shm.enabled` + ``0``: no segments are created or attached on any platform, and behavior is + identical to stock |TS|. +* Realistic multi-gigabyte directory sizes, the ``MADV_HUGEPAGE`` teardown win, + and the restart-time benchmarks are Linux-only -- the same platform boundary + |TS| already has for its hugepage directory allocation. (Recall ``MAP_HUGETLB`` + is never used here; see `Huge pages`_.) + +Testing +======= + +The pure trust-gate logic is unit-tested in +:ts:git:`src/iocore/cache/unit_tests/test_CacheShm.cc` (ABI-hash stability, the +storage-signature topology sensitivity, control-header round-trip, the macOS +name-length limit, and the process-liveness check). + +The end-to-end behavior is covered by autests in +:ts:git:`tests/gold_tests/cache/`, one scenario each: + +.. list-table:: + :header-rows: 1 + :widths: 42 58 + + * - Test + - Scenario + * - ``cache_shm_fast_restart`` + - Directory survives a clean shutdown and is fast-attached. + * - ``cache_shm_data_integrity`` + - Objects cached before shutdown are served byte-identical from cache + after the attach (including multi-fragment objects). + * - ``cache_shm_unclean_shutdown`` + - ``SIGKILL`` leaves the segment dirty; next start drops and rebuilds. + * - ``cache_shm_schema_mismatch`` + - A poked ``schema_version`` is dropped, never attached.
+ * - ``cache_shm_storage_mismatch`` + - A changed storage layout keeps the control segment, creates a fresh + relocated stripe, and reclaims the orphan. + * - ``cache_shm_bad_disk_dropped`` + - Dropping a disk fast-attaches the survivors and reclaims the removed + disk's segment. + * - ``cache_shm_concurrent_attach`` + - A second ``traffic_server`` refuses to attach over a live owner and runs + with shared memory disabled. + * - ``cache_shm_purge_on_disable`` + - ``purge_stale_on_start`` removes leftover segments on a disabled start. + +The schema/storage tests drive their gates by editing ``/dev/shm`` directly +(``shm_poke.py``), which is a Linux facility; they have no macOS condition. + +Limitations and non-goals +========================= + +* The feature accelerates restart only; it does not change steady-state cache + behavior, durability, or the on-disk format. +* Only the directory is shared, never cached content. +* There is no migration or repair of an untrusted segment -- the disk is + authoritative and rebuilding from it is always the fallback. +* A single host may run multiple instances only with distinct + ``name_prefix`` values. + +Source map +========== + +.. list-table:: + :header-rows: 1 + :widths: 42 58 + + * - File + - Role + * - :ts:git:`src/iocore/cache/CacheShm.h` / ``CacheShm.cc`` + - The ``CacheShm`` facade: initialize, attach/create, finalize, mark-clean, + invalidate, and the trust-gate fingerprints. + * - :ts:git:`src/iocore/cache/CacheShmLayout.h` + - The on-shm control-segment layout, shared with tooling. + * - :ts:git:`src/iocore/cache/CacheShmPurge.h` + - The header-only enumerate-and-unlink primitive and its owner guard, + shared by the disabled-start purge and ``traffic_ctl``. + * - :ts:git:`src/iocore/cache/Stripe.cc` + - The shared-memory ``raw_dir`` allocation and ``_shm_directory_is_valid``. 
+ * - :ts:git:`src/iocore/cache/StripeSM.cc` + - The fast-attach gate in ``StripeSM::init`` and the shutdown-write skip / + invalidate in ``StripeSM::shutdown``. + * - :ts:git:`src/iocore/cache/CacheProcessor.cc` + - ``initialize`` / ``finalize_attach`` call sites in ``CacheProcessor``. + * - :ts:git:`src/iocore/cache/CacheDir.cc` + - ``mark_clean_shutdown`` from ``sync_cache_dir_on_shutdown``. + * - :ts:git:`src/traffic_ctl/CacheShmCommand.cc` + - The ``traffic_ctl cache shm status`` / ``clear`` commands. diff --git a/src/iocore/cache/AggregateWriteBuffer.cc b/src/iocore/cache/AggregateWriteBuffer.cc index b761d656170..1407d5b2693 100644 --- a/src/iocore/cache/AggregateWriteBuffer.cc +++ b/src/iocore/cache/AggregateWriteBuffer.cc @@ -49,7 +49,6 @@ AggregateWriteBuffer::flush(int fd, off_t write_pos) const { int r = pwrite(fd, this->_buffer, this->_buffer_pos, write_pos); if (r != this->_buffer_pos) { - ink_assert(!"flushing agg buffer failed"); return false; } return true; diff --git a/src/iocore/cache/AggregateWriteBuffer.h b/src/iocore/cache/AggregateWriteBuffer.h index ad99b03ce04..22951fb797a 100644 --- a/src/iocore/cache/AggregateWriteBuffer.h +++ b/src/iocore/cache/AggregateWriteBuffer.h @@ -120,7 +120,7 @@ class AggregateWriteBuffer * @param write_pos The offset at which to write the buffer data. * @return Returns true if all bytes were flushed, otherwise false. */ - bool flush(int fd, off_t write_pos) const; + [[nodiscard]] bool flush(int fd, off_t write_pos) const; /** * Copy part of the buffer. 
diff --git a/src/iocore/cache/CMakeLists.txt b/src/iocore/cache/CMakeLists.txt index f8a252b430c..f4c28cd4223 100644 --- a/src/iocore/cache/CMakeLists.txt +++ b/src/iocore/cache/CMakeLists.txt @@ -27,6 +27,7 @@ add_library( CacheHttp.cc CacheProcessor.cc CacheRead.cc + CacheShm.cc CacheVC.cc CacheWrite.cc HttpTransactCache.cc @@ -92,6 +93,7 @@ if(BUILD_TESTING) add_cache_test(Update_Header unit_tests/test_Update_header.cc) add_cache_test(CacheStripe unit_tests/test_Stripe.cc) add_cache_test(CacheAggregateWriteBuffer unit_tests/test_AggregateWriteBuffer.cc) + add_cache_test(CacheShm unit_tests/test_CacheShm.cc) # Unit Tests without unit_tests/main.cc add_executable(test_ConfigVolumes unit_tests/test_ConfigVolumes.cc) diff --git a/src/iocore/cache/CacheDir.cc b/src/iocore/cache/CacheDir.cc index 99e9fba47b4..beec83c39a8 100644 --- a/src/iocore/cache/CacheDir.cc +++ b/src/iocore/cache/CacheDir.cc @@ -27,6 +27,7 @@ #include "P_CacheInternal.h" #include "PreservationTable.h" #include "Stripe.h" +#include "CacheShm.h" #include "tscore/hugepages.h" #include "tscore/Random.h" @@ -947,6 +948,11 @@ sync_cache_dir_on_shutdown() thr.join(); } + // All writers are now stopped (every stripe's mutex was held and released by the + // shutdown threads above), so the directory is final -- only now is it safe to + // mark the shm control segment clean. + CacheShm::mark_clean_shutdown(); + Dbg(dbg_ctl_cache_dir_sync, "shutdown sync done"); } diff --git a/src/iocore/cache/CacheProcessor.cc b/src/iocore/cache/CacheProcessor.cc index 1fa158d9889..c178aeb959d 100644 --- a/src/iocore/cache/CacheProcessor.cc +++ b/src/iocore/cache/CacheProcessor.cc @@ -28,6 +28,7 @@ #include "P_CacheInternal.h" #include "StripeSM.h" #include "Stripe.h" +#include "CacheShm.h" // Must be included after P_CacheInternal.h. 
#include "P_CacheHosting.h" @@ -187,6 +188,9 @@ CacheProcessor::start_internal(int flags) gndisks = theCacheStore.n_spans; gdisks.resize(gndisks); + // Must run before any Stripe is constructed so each can attach/create its segment. + CacheShm::initialize(theCacheStore); + // Temporaries to carry values between loops char **paths = static_cast(alloca(sizeof(char *) * gndisks)); memset(paths, 0, sizeof(char *) * gndisks); @@ -1495,6 +1499,9 @@ CacheProcessor::cacheInitialized() } } + // All stripes have claimed their segments; reclaim any orphan (e.g. a dropped disk). + CacheShm::finalize_attach(); + if (caches_ready) { Dbg(dbg_ctl_cache_init, "CacheProcessor::cacheInitialized - caches_ready=0x%0X, gnvol=%d", (unsigned int)caches_ready, gnstripes.load()); diff --git a/src/iocore/cache/CacheShm.cc b/src/iocore/cache/CacheShm.cc new file mode 100644 index 00000000000..5618fa6d28a --- /dev/null +++ b/src/iocore/cache/CacheShm.cc @@ -0,0 +1,748 @@ +/** @file + + Shared-memory-backed cache directory for fast restart. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +#include "CacheShm.h" +#include "CacheShmLayout.h" +#include "CacheShmPurge.h" + +#include "P_CacheDir.h" +#include "iocore/cache/Store.h" + +#include "records/RecCore.h" +#include "tscore/Diags.h" +#include "tscore/HashFNV.h" +#include "tscore/ink_align.h" +#include "tscore/ink_memory.h" +#include "tscore/ink_string.h" +#include "tsutil/DbgCtl.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace +{ + +DbgCtl dbg_ctl{"cache_shm"}; + +using cache_shm::CACHE_SHM_MAGIC; +using cache_shm::CACHE_SHM_SCHEMA_VERSION; +using cache_shm::CacheShmControl; +using cache_shm::control_segment_name; +using cache_shm::CONTROL_SIZE; +using cache_shm::LockResult; +using cache_shm::MAX_SHM_NAME_LEN; +using cache_shm::MAX_STRIPES; +using cache_shm::read_shm_name; +using cache_shm::StripeEntry; +using cache_shm::try_lock_control; + +// Sanity bound: the control struct (header + stripe table) must stay small. +constexpr std::size_t MAX_CONTROL_SEGMENT_BYTES = 32 * 1024; +static_assert(sizeof(CacheShmControl) <= MAX_CONTROL_SEGMENT_BYTES, "control segment unexpectedly large"); + +// Configuration loaded at initialize() time. +struct Config { + bool enabled = false; + bool use_hugepages = false; + bool purge_stale_on_start = false; + std::string name_prefix = "/ats-"; // normalized "/-" (see normalize_name_prefix); set in load_config. +}; + +Config g_config; + +// Live state for the open control segment. +CacheShmControl *g_control = nullptr; +std::string g_control_name; + +// Holds the control segment's exclusive flock for the process lifetime; the OS +// releases it on exit. Only set on the path that owns the segment. +ats_scoped_fd g_control_fd; + +// shm pointers we returned (mapped to their length), so the Stripe destructor can +// choose munmap vs ats_free and detach_stripe can munmap the right span. 
+std::mutex g_pointers_mutex; +std::unordered_map g_pointers; + +// Guards the control-segment stripe table and the per-run claim bookkeeping below +// (stripes initialize concurrently across disk threads). +std::mutex g_table_mutex; + +// Per-run partial-attach bookkeeping, indexed in lockstep with g_control->stripes[]. +// An entry still unclaimed once init completes is an orphan reclaimed by +// finalize_attach(). Process-local, reset each run. +bool g_entry_claimed[MAX_STRIPES] = {}; +uint32_t g_claims_this_run = 0; + +void +fnv_update(ATSHash64FNV1a &h, uint64_t v) +{ + h.update(&v, sizeof v); +} + +/// Full 64-bit stripe identity used to match a stripe to its prior shm segment. +uint64_t +compute_stripe_key_hash(const char *stripe_key) +{ + ATSHash64FNV1a hash; + hash.update(stripe_key, std::strlen(stripe_key)); + return hash.get(); +} + +/// Build a stripe shm name from its per-host index (unique, so names never +/// collide). Matching to a prior segment uses the key hash, not this name. +std::string +build_stripe_shm_name(const std::string &prefix, uint32_t stripe_index) +{ + std::string name = prefix + "s" + std::to_string(stripe_index); + if (name.size() >= MAX_SHM_NAME_LEN) { + name.resize(MAX_SHM_NAME_LEN - 1); + } + return name; +} + +// Named flags for open_and_map_shm so call sites read `ShmAccess::Create` / +// `HugePages::Off` and the two can't be transposed. +enum class ShmAccess { Open, Create }; +enum class HugePages { Off, On }; + +/// Open or create a shm segment of `size` bytes and mmap it. Returns nullptr +/// on failure. When `out_fd` is non-null, the open fd is handed back to the +/// caller (left open) so it can hold an flock on the segment; otherwise the fd +/// is closed once the mapping is established (the mmap survives the close). +/// When `out_errno` is non-null it receives the failing syscall's errno (0 on +/// success) so the caller can render a non-opaque diagnostic. 
+void * +open_and_map_shm(const std::string &name, std::size_t size, ShmAccess access, [[maybe_unused]] HugePages hugepages, + int *out_fd = nullptr, int *out_errno = nullptr) +{ + if (out_errno != nullptr) { + *out_errno = 0; + } + int oflags = O_RDWR; + if (access == ShmAccess::Create) { + oflags |= O_CREAT; + } + + ats_scoped_fd fd{shm_open(name.c_str(), oflags, 0600)}; + if (fd < 0) { + int e = errno; + Dbg(dbg_ctl, "shm_open(%s, %s) failed: %s", name.c_str(), access == ShmAccess::Create ? "create" : "open", strerror(e)); + if (out_errno != nullptr) { + *out_errno = e; + } + return nullptr; + } + + if (access == ShmAccess::Create) { + if (ftruncate(fd, size) < 0) { + int e = errno; + Warning("ftruncate(%s, %zu) failed: %s", name.c_str(), size, strerror(e)); + shm_unlink(name.c_str()); + if (out_errno != nullptr) { + *out_errno = e; + } + return nullptr; + } + } else { + // The kernel rounds shm size up to a page boundary (16 KiB on macOS / Apple + // Silicon), so accept any size in [requested, page-up]. + struct stat sb { + }; + std::size_t expected_max = INK_ALIGN(size, ats_pagesize()); + if (fstat(fd, &sb) < 0 || sb.st_size < 0 || static_cast(sb.st_size) < size || + static_cast(sb.st_size) > expected_max) { + Dbg(dbg_ctl, "shm %s size mismatch (have %lld, want %zu, max %zu)", name.c_str(), static_cast(sb.st_size), size, + expected_max); + return nullptr; + } + } + + int prot = PROT_READ | PROT_WRITE; + int flags = MAP_SHARED; + void *addr = mmap(nullptr, size, prot, flags, fd, 0); + if (addr == MAP_FAILED) { + int e = errno; + Warning("mmap(%s, %zu) failed: %s", name.c_str(), size, strerror(e)); + if (out_errno != nullptr) { + *out_errno = e; + } + return nullptr; + } + + // Advise shmem THP for the mapping (cuts page-table teardown at exit). MAP_HUGETLB + // is not usable here: shm_open fds are tmpfs-backed, so it always EINVALs. Requires + // shmem THP enabled on the host; see the design doc for details. 
+#if defined(MADV_HUGEPAGE) + if (hugepages == HugePages::On) { + if (madvise(addr, size, MADV_HUGEPAGE) != 0) { + Dbg(dbg_ctl, "madvise(MADV_HUGEPAGE) on %s failed: %s", name.c_str(), strerror(errno)); + } + } +#endif + + if (out_fd != nullptr) { + *out_fd = fd.release(); // caller owns the fd and keeps it open for flock + } + return addr; +} + +void +unlink_all_known_segments() +{ + if (g_control != nullptr) { + for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) { + std::string name = read_shm_name(g_control->stripes[i].shm_name); + if (!name.empty()) { + Dbg(dbg_ctl, "shm_unlink stripe %s", name.c_str()); + shm_unlink(name.c_str()); + } + } + munmap(g_control, CONTROL_SIZE); + g_control = nullptr; + } + if (!g_control_name.empty()) { + Dbg(dbg_ctl, "shm_unlink control %s", g_control_name.c_str()); + shm_unlink(g_control_name.c_str()); + } +} + +// Purge leftover shm segments when shm is disabled this run (opt-in via +// purge_stale_on_start). Best-effort: logs but never blocks startup. The +// enumerate-and-unlink work is shared with `traffic_ctl cache shm clear` +// (cache_shm::purge_segments); this just renders the result into diags. +void +purge_stale_segments(const std::string &prefix) +{ + const cache_shm::PurgeReport report = cache_shm::purge_segments(prefix); + + switch (report.outcome) { + case cache_shm::PurgeOutcome::BadPrefix: + // load_config() already warned about a bad prefix; stay quiet here. + case cache_shm::PurgeOutcome::NotPresent: + return; // ENOENT: shm never used with this prefix. 
+ case cache_shm::PurgeOutcome::OpenFailed: + Warning("cache shm: cannot open control segment %s to purge stale segments: %s", report.control_name.c_str(), + strerror(report.sys_errno)); + return; + case cache_shm::PurgeOutcome::MapFailed: + Warning("cache shm: mmap of control segment %s failed while purging: %s", report.control_name.c_str(), + strerror(report.sys_errno)); + return; + case cache_shm::PurgeOutcome::TooSmall: + Warning("cache shm: leftover control segment %s is too small to read (%lld bytes); unlinking it", report.control_name.c_str(), + report.segment_size); + break; // purge_segments() already unlinked the control object; render the result below. + case cache_shm::PurgeOutcome::OwnedByLive: + Warning("cache shm: control segment %s is owned by a live process; leaving stale segments in place", + report.control_name.c_str()); + return; + case cache_shm::PurgeOutcome::Purged: + break; + } + + for (const auto &u : report.unlinked) { + if (u.error == 0) { + Dbg(dbg_ctl, "purge: unlinked %s %s", u.is_control ? "control" : "stripe", u.name.c_str()); + } else if (u.error != ENOENT) { + Warning("cache shm: failed to unlink %s %s while purging: %s", u.is_control ? 
"control segment" : "stripe", u.name.c_str(), + strerror(u.error)); + } + } + + Note("cache shm: purged stale segments while disabled (removed %u, %u failure(s), prefix '%s')", report.removed(), + report.failures(), prefix.c_str()); +} + +bool +load_config() +{ + RecInt enabled = RecGetRecordInt("proxy.config.cache.shm.enabled").value_or(0); + g_config.enabled = enabled != 0; + + RecInt use_hugepages = RecGetRecordInt("proxy.config.cache.shm.use_hugepages").value_or(0); + g_config.use_hugepages = use_hugepages != 0; + + RecInt purge_stale_on_start = RecGetRecordInt("proxy.config.cache.shm.purge_stale_on_start").value_or(0); + g_config.purge_stale_on_start = purge_stale_on_start != 0; + + char prefix_buf[256] = {0}; + std::string configured = "ats"; // operator sets only the middle word; framing is added below. + if (RecGetRecordString("proxy.config.cache.shm.name_prefix", prefix_buf, sizeof(prefix_buf)).has_value() && + prefix_buf[0] != '\0') { + configured = prefix_buf; + } + // Frame the configured middle word as "/-" so the leading '/' that POSIX + // shm_open requires and the '-' separator can never be mis-typed (a carried-over + // "/ats-" normalizes back to "/ats-" rather than an invalid "//ats--"). + g_config.name_prefix = cache_shm::normalize_name_prefix(configured); + + return g_config.enabled; +} + +// Reserve a control-table slot for a stripe about to be created (reusing a +// tombstone if any, else appending). Marks the slot non-empty so a concurrent +// create cannot pick the same index; g_entry_claimed stays clear until the segment +// is mapped. Returns the slot index (and shm name via out_name), or MAX_STRIPES +// when the table is full. Caller must hold g_table_mutex. 
+uint32_t +reserve_stripe_slot(uint64_t key_hash, std::size_t directory_size, std::string &out_name) +{ + uint32_t idx = g_control->stripe_count; + bool reuse_slot = false; + for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) { + if (g_control->stripes[i].shm_name[0] == '\0') { + idx = i; + reuse_slot = true; + break; + } + } + if (!reuse_slot && g_control->stripe_count >= MAX_STRIPES) { + Warning("cache shm: stripe count exceeds MAX_STRIPES (%zu); falling back", MAX_STRIPES); + return MAX_STRIPES; + } + + out_name = build_stripe_shm_name(g_config.name_prefix, idx); + if (!reuse_slot) { + g_control->stripe_count++; + } + StripeEntry &e = g_control->stripes[idx]; + ink_strlcpy(e.shm_name, out_name.c_str(), sizeof(e.shm_name)); + e.raw_dir_size = directory_size; + e.stripe_key_hash = key_hash; + return idx; +} + +// Undo a reserve_stripe_slot() reservation when the segment could not be created. +// Tombstones the slot (empty shm_name) for reuse. Caller must hold g_table_mutex. +void +release_reserved_slot(uint32_t idx) +{ + StripeEntry &e = g_control->stripes[idx]; + e.shm_name[0] = '\0'; + e.raw_dir_size = 0; + e.stripe_key_hash = 0; +} + +// Record a freshly mapped stripe segment as claimed for this run and remember its +// pointer and length (for is_shm_pointer / invalidate_stripe_directory / +// detach_stripe). Takes the locks itself so the shm syscalls that produced `p` ran +// without g_table_mutex held. 
+char * +claim_mapped_stripe(uint32_t idx, void *p, std::size_t size) +{ + { + std::scoped_lock lk{g_table_mutex}; + g_entry_claimed[idx] = true; + ++g_claims_this_run; + } + { + std::scoped_lock plk{g_pointers_mutex}; + g_pointers.insert({static_cast(p), size}); + } + return static_cast(p); +} + +} // namespace + +CacheShm::Mode CacheShm::_mode = CacheShm::Mode::Disabled; + +uint64_t +CacheShm::abi_hash() +{ + ATSHash64FNV1a h; + h.update(tag.data(), tag.size()); + fnv_update(h, sizeof(Dir)); + fnv_update(h, sizeof(StripeHeaderFooter)); + fnv_update(h, sizeof(CacheShmControl)); + fnv_update(h, sizeof(StripeEntry)); + fnv_update(h, DIR_DEPTH); + fnv_update(h, SIZEOF_DIR); + fnv_update(h, MAX_STRIPES); + return h.get(); +} + +uint64_t +CacheShm::storage_signature(const Store &store) +{ + ATSHash64FNV1a h; + for (unsigned i = 0; i < store.n_spans; ++i) { + const Span *span = store.spans[i]; + if (span == nullptr) { + continue; + } + if (span->pathname) { + std::string_view path{span->pathname.get()}; + h.update(path.data(), path.size()); + } + fnv_update(h, static_cast(span->blocks)); + fnv_update(h, static_cast(span->offset)); + fnv_update(h, static_cast(span->hw_sector_size)); + } + return h.get(); +} + +void +CacheShm::initialize(const Store &store) +{ + if (!load_config()) { + _mode = Mode::Disabled; + // shm is off this run; reclaim any leftover segments from a prior run (rationale + // and guards documented on purge_stale_segments). Opt-in and best-effort. 
+ if (g_config.purge_stale_on_start) { + purge_stale_segments(g_config.name_prefix); + } + Dbg(dbg_ctl, "shm disabled"); + return; + } + + g_control_name = control_segment_name(g_config.name_prefix); + if (g_control_name.size() >= MAX_SHM_NAME_LEN) { + Warning("shm name_prefix too long (control segment name '%s' exceeds %zu chars); shm disabled", g_control_name.c_str(), + MAX_SHM_NAME_LEN); + _mode = Mode::Disabled; + return; + } + + const uint64_t expected_abi = abi_hash(); + const uint64_t expected_signature = storage_signature(store); + + // Try to attach an existing control segment first. + int existing_fd = -1; + void *existing = open_and_map_shm(g_control_name, CONTROL_SIZE, ShmAccess::Open, HugePages::Off, &existing_fd); + if (existing != nullptr) { + auto *ctrl = static_cast(existing); + + // Concurrent-attach guard: refuse shm (and rebuild from disk) if another live + // process still owns this segment. + int flock_errno = 0; + const LockResult lock = try_lock_control(existing_fd, &flock_errno); + bool live_owner = false; + switch (lock) { + case LockResult::Acquired: + break; // we hold the exclusive lock, so any prior owner is gone + case LockResult::HeldByOther: + live_owner = true; + break; + case LockResult::Unsupported: // macOS POSIX shm: flock is a no-op, fall back to owner_pid + Dbg(dbg_ctl, "flock unsupported for control segment %s (errno %d: %s); using owner-pid liveness guard", + g_control_name.c_str(), flock_errno, strerror(flock_errno)); + live_owner = ctrl->owner_pid != 0 && ctrl->owner_pid != static_cast(getpid()) && process_is_alive(ctrl->owner_pid); + break; + } + if (live_owner) { + Warning("cache shm: control segment %s has a live owner (pid %d); disabling shm this run to avoid concurrent attach", + g_control_name.c_str(), ctrl->owner_pid); + munmap(existing, CONTROL_SIZE); + close(existing_fd); + _mode = Mode::Disabled; + return; + } + + bool ok = std::memcmp(ctrl->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)) == 0; + if (ok && 
ctrl->schema_version != CACHE_SHM_SCHEMA_VERSION) { + Note("cache shm: schema mismatch (%u vs %u), dropping", ctrl->schema_version, CACHE_SHM_SCHEMA_VERSION); + ok = false; + } + if (ok && ctrl->abi_hash != expected_abi) { + Note("cache shm: ABI mismatch, dropping"); + ok = false; + } + + // storage_signature is NOT a hard gate (see storage_signature() doc): a + // storage.config change keeps the segment, each stripe attaches by its own + // identity. Refreshed in place below. + const bool storage_changed = ok && ctrl->storage_signature != expected_signature; + + if (ok && ctrl->clean_shutdown == 0) { + // A crash may have left dir entries pointing at content never flushed, so no + // stripe can safely skip recovery -- whole-segment drop. + Note("cache shm: previous run did not shutdown cleanly, dropping"); + ok = false; + } + + if (ok) { + Note("cache shm: attaching up to %u stripes (fast restart%s)", ctrl->stripe_count, + storage_changed ? ", partial -- storage changed" : ""); + g_control = ctrl; + g_control_fd = existing_fd; // hold the exclusive lock for the process lifetime + std::memset(g_entry_claimed, 0, sizeof(g_entry_claimed)); + g_claims_this_run = 0; + if (storage_changed) { + g_control->storage_signature = expected_signature; + } + // Become owner and clear clean_shutdown so a crash this run drops shm next time. + g_control->owner_pid = static_cast(getpid()); + g_control->clean_shutdown = 0; + msync(g_control, CONTROL_SIZE, MS_SYNC); + _mode = Mode::AttachExisting; + return; + } + + // Drop everything and fall through to fresh-create. We hold the exclusive lock, + // so unlinking cannot pull segments out from under a live owner. + g_control = ctrl; // so unlink_all_known_segments can iterate stripes + unlink_all_known_segments(); + close(existing_fd); // releases the lock on the now-unlinked object + } + + // Create fresh control segment. 
+ int fresh_fd = -1; + int create_errno = 0; + void *fresh = open_and_map_shm(g_control_name, CONTROL_SIZE, ShmAccess::Create, HugePages::Off, &fresh_fd, &create_errno); + if (fresh == nullptr) { + // Surface the errno + offending name: e.g. an embedded '/' in name_prefix yields EINVAL here. + Warning("cache shm: failed to create control segment %s: %s; shm disabled", g_control_name.c_str(), strerror(create_errno)); + _mode = Mode::Disabled; + return; + } + // Lock the freshly created segment. Another starting process could have created + // and locked it first in the window since the drop above; if so, refuse. + if (try_lock_control(fresh_fd) == LockResult::HeldByOther) { + Warning("cache shm: lost the create race for control segment %s; disabling shm this run", g_control_name.c_str()); + munmap(fresh, CONTROL_SIZE); + close(fresh_fd); + _mode = Mode::Disabled; + return; + } + g_control = static_cast(fresh); + g_control_fd = fresh_fd; // hold the exclusive lock for the process lifetime + std::memset(g_control, 0, CONTROL_SIZE); + std::memset(g_entry_claimed, 0, sizeof(g_entry_claimed)); + g_claims_this_run = 0; + std::memcpy(g_control->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)); + g_control->schema_version = CACHE_SHM_SCHEMA_VERSION; + g_control->abi_hash = expected_abi; + g_control->storage_signature = expected_signature; + g_control->clean_shutdown = 0; + g_control->owner_pid = static_cast(getpid()); + g_control->stripe_count = 0; + + _mode = Mode::CreateFresh; + Note("cache shm: creating fresh control segment %s (owner pid %d)", g_control_name.c_str(), static_cast(getpid())); + return; +} + +char * +CacheShm::attach_or_create_stripe(const char *stripe_key, std::size_t directory_size) +{ + if (_mode == Mode::Disabled || g_control == nullptr) { + return nullptr; + } + + const uint64_t key_hash = compute_stripe_key_hash(stripe_key); + const HugePages hugepages = g_config.use_hugepages ? 
HugePages::On : HugePages::Off; + + // Decide what to do under the table lock, but run the shm syscalls afterwards + // with the lock dropped (holding it across them would serialize every disk + // thread's init). Each stripe owns a distinct segment, so the syscalls never + // touch another thread's segment. + std::string attach_name; // non-empty => map this existing segment + std::string create_name; // set when a fresh slot was reserved (the create path) + uint32_t idx = MAX_STRIPES; + { + std::scoped_lock lk{g_table_mutex}; + + // 1. Try to attach this stripe's prior segment, matched by 64-bit identity (not + // name), so a span going offline shifts indices but not identities. + for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) { + StripeEntry &e = g_control->stripes[i]; + if (e.shm_name[0] == '\0' || e.stripe_key_hash != key_hash) { + continue; // tombstoned slot, or a different stripe + } + if (e.raw_dir_size != directory_size) { + // Same identity, different size: shouldn't happen (size derives from the + // keyed blocks). Treat as a miss and recreate; the stale entry is reaped by + // finalize_attach(). + Note("cache shm: stripe %s size mismatch (have %llu, want %zu); recreating", read_shm_name(e.shm_name).c_str(), + static_cast(e.raw_dir_size), directory_size); + break; + } + attach_name = read_shm_name(e.shm_name); + idx = i; + break; + } + + // 2. No usable prior segment -- reserve a slot for a fresh create under the lock. + if (attach_name.empty() && (idx = reserve_stripe_slot(key_hash, directory_size, create_name)) == MAX_STRIPES) { + return nullptr; // table full (already logged) + } + } + + // Attach path: map the existing segment outside the lock. 
+ if (!attach_name.empty()) { + void *p = open_and_map_shm(attach_name, directory_size, ShmAccess::Open, hugepages); + if (p != nullptr) { + Note("cache shm: attached stripe %s (%zu bytes) for key=%s", attach_name.c_str(), directory_size, stripe_key); + return claim_mapped_stripe(idx, p, directory_size); + } + // Attach failed (segment vanished/unmappable): reserve a fresh slot and fall + // through to create. The stale entry is reaped by finalize_attach(). + Note("cache shm: failed to attach stripe %s; recreating", attach_name.c_str()); + std::scoped_lock lk{g_table_mutex}; + if ((idx = reserve_stripe_slot(key_hash, directory_size, create_name)) == MAX_STRIPES) { + return nullptr; + } + } + + // Create path: slot already reserved; syscalls run outside the lock. A fresh + // ftruncate'd segment is zero-filled (magic 0), so Stripe::init falls back to the + // disk read and repopulates it. shm_unlink clears any leftover with this name. + shm_unlink(create_name.c_str()); + void *p = open_and_map_shm(create_name, directory_size, ShmAccess::Create, hugepages); + if (p == nullptr) { + std::scoped_lock lk{g_table_mutex}; + release_reserved_slot(idx); + return nullptr; + } + + Note("cache shm: created stripe %s (%zu bytes) for key=%s", create_name.c_str(), directory_size, stripe_key); + return claim_mapped_stripe(idx, p, directory_size); +} + +void +CacheShm::finalize_attach() +{ + if (g_control == nullptr) { + return; + } + + std::scoped_lock lk{g_table_mutex}; + + // With zero claims this run we cannot distinguish "genuinely empty cache" from + // "init aborted" (e.g. a transient volume.config error), so leave every segment + // intact rather than risk reclaiming a valid cache. 
+ if (g_claims_this_run == 0) { + Dbg(dbg_ctl, "finalize_attach: no stripes claimed this run; leaving %u segment(s) intact", g_control->stripe_count); + return; + } + + uint32_t reclaimed = 0; + for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) { + StripeEntry &e = g_control->stripes[i]; + if (e.shm_name[0] == '\0' || g_entry_claimed[i]) { + continue; // already empty, or claimed by a live stripe this run + } + // Unclaimed, non-empty entry: its stripe left the cache (span dropped, or disk + // failed to open). Unlink the orphan and tombstone the slot for reuse. + std::string name = read_shm_name(e.shm_name); + Note("cache shm: reclaiming orphaned stripe segment %s", name.c_str()); + shm_unlink(name.c_str()); + e.shm_name[0] = '\0'; + e.raw_dir_size = 0; + e.stripe_key_hash = 0; + ++reclaimed; + } + if (reclaimed > 0) { + Note("cache shm: reclaimed %u orphaned stripe segment(s) after storage change", reclaimed); + } + + // Trim trailing tombstones so stripe_count tracks the live high-water mark; + // interior tombstones stay (reused by attach_or_create_stripe). 
+ uint32_t live_count = 0; + for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) { + if (g_control->stripes[i].shm_name[0] != '\0') { + live_count = i + 1; + } + } + const bool count_changed = live_count != g_control->stripe_count; + if (count_changed) { + Note("cache shm: trimming stripe_count %u -> %u after reclaim", g_control->stripe_count, live_count); + g_control->stripe_count = live_count; + } + + if (reclaimed > 0 || count_changed) { + msync(g_control, CONTROL_SIZE, MS_SYNC); + } +} + +bool +CacheShm::is_shm_pointer(char *raw_dir) +{ + if (raw_dir == nullptr) { + return false; + } + std::scoped_lock lk{g_pointers_mutex}; + return g_pointers.find(raw_dir) != g_pointers.end(); +} + +void +CacheShm::mark_clean_shutdown() +{ + if (g_control == nullptr) { + return; + } + Note("cache shm: marking clean shutdown"); + g_control->clean_shutdown = 1; + // Clear owner_pid so the next start's liveness backstop does not mistake our + // (exiting) PID for a live owner. The flock is still held until exit, so a + // concurrent starter is still correctly refused during the shutdown window. + g_control->owner_pid = 0; + msync(g_control, CONTROL_SIZE, MS_SYNC); +} + +bool +CacheShm::process_is_alive(int pid) +{ + return cache_shm::process_is_alive(pid); +} + +void +CacheShm::invalidate_stripe_directory(char *raw_dir) +{ + if (!is_shm_pointer(raw_dir)) { + return; + } + // Zero the in-shm header magic so Stripe::init's attach gate rejects this segment + // next start and recovers the stripe from disk instead of fast-attaching a + // directory we could not finish flushing. 
+ auto *header = reinterpret_cast(raw_dir); + header->magic = 0; + msync(raw_dir, sizeof(StripeHeaderFooter), MS_SYNC); +} + +void +CacheShm::detach_stripe(char *raw_dir) +{ + if (raw_dir == nullptr) { + return; + } + std::scoped_lock lk{g_pointers_mutex}; + auto it = g_pointers.find(raw_dir); + if (it == g_pointers.end()) { + return; + } + // munmap the recorded span; never shm_unlink -- the segment must survive for the + // next start to attach. + munmap(it->first, it->second); + g_pointers.erase(it); +} diff --git a/src/iocore/cache/CacheShm.h b/src/iocore/cache/CacheShm.h new file mode 100644 index 00000000000..37f671ee393 --- /dev/null +++ b/src/iocore/cache/CacheShm.h @@ -0,0 +1,98 @@ +/** @file + + Shared-memory-backed cache directory for fast restart. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#pragma once + +#include +#include +#include + +struct Store; + +/// Hosts Stripe::Directory::raw_dir in POSIX shared memory so the next process +/// start can attach the existing directory in milliseconds rather than rebuilding +/// it from disk. Purely an optimization over the rebuild path: anything wrong → +/// drop shm, rebuild from disk. See the cache-shm fast-restart design doc. 
+class CacheShm +{ +public: + static constexpr std::string_view tag{"ATS-SHM-V1"}; + + enum class Mode { + Disabled, ///< shm.enabled=0; behave like today. + AttachExisting, ///< A valid prior control segment exists; stripes attach by identity or create fresh. + CreateFresh, ///< No/invalid prior control - create everything new (cold path). + }; + + /// Initialize the control segment and decide Mode. Must be called from + /// CacheProcessor::start after the store is read but before any Stripe is built. + static void initialize(const Store &store); + + static Mode + mode() + { + return _mode; + } + + /// Allocate raw_dir for one stripe, keyed by its identity (`stripe_key`). + /// Attaches the stripe's prior segment of matching size when one exists, else + /// creates fresh. Returns the mapped pointer, or nullptr to fall back to the + /// heap path (always in Disabled). + static char *attach_or_create_stripe(const char *stripe_key, std::size_t directory_size); + + /// Reclaim segments left by stripes no longer in the cache (e.g. a dropped disk). + /// Call once after all stripes init, from CacheProcessor::cacheInitialized. + /// No-ops when no stripe came up this run. Idempotent. + static void finalize_attach(); + + /// Whether a pointer was returned from attach_or_create_stripe (munmap vs ats_free). + static bool is_shm_pointer(char *raw_dir); + + /// Mark control->clean_shutdown = 1. Called after sync_cache_dir_on_shutdown. + static void mark_clean_shutdown(); + + /// Invalidate one stripe's shm directory (zero its header magic) so the next + /// start recovers it from disk instead of fast-attaching. Called when a stripe's + /// shutdown flush failed. No-op if raw_dir is not a shm segment. + static void invalidate_stripe_directory(char *raw_dir); + + /// Detach (munmap) one stripe's shm directory and forget the pointer; never + /// shm_unlink (the segment must survive for the next start). No-op if raw_dir + /// is not a shm segment. 
Called from ~Stripe so the dtor frees the right way. + static void detach_stripe(char *raw_dir); + + /// Compile-time ABI fingerprint of the shm-resident layout; a writer/reader + /// mismatch forces a drop + rebuild. Exposed for unit testing. + static uint64_t abi_hash(); + + /// Fingerprint of the storage topology. Not a trust gate (see initialize()): + /// informational, drives the "storage changed" log wording. + static uint64_t storage_signature(const Store &store); + + /// True if `pid` names a live process (pid <= 0 is not). Backs the + /// concurrent-attach owner-liveness backstop. Exposed for unit testing. + static bool process_is_alive(int pid); + +private: + static Mode _mode; +}; diff --git a/src/iocore/cache/CacheShmLayout.h b/src/iocore/cache/CacheShmLayout.h new file mode 100644 index 00000000000..d58ae9faaa7 --- /dev/null +++ b/src/iocore/cache/CacheShmLayout.h @@ -0,0 +1,105 @@ +/** @file + + Layout of the cache shared-memory control segment, shared between the cache + subsystem and tools (traffic_ctl) that inspect or clear the segment without + going through the running traffic_server. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+namespace cache_shm
+{
+
+inline constexpr char             CACHE_SHM_MAGIC[8]       = {'A', 'T', 'S', '-', 'S', 'H', 'M', '\0'}; // inline: one definition for odr-use
+inline constexpr uint32_t         CACHE_SHM_SCHEMA_VERSION = 1;
+inline constexpr std::string_view CACHE_SHM_CONTROL        = "control";
+
+// macOS PSHMNAMLEN is 31 chars including the leading '/'. Keep names under that
+// limit on Linux too, so the same naming works everywhere.
+inline constexpr std::size_t MAX_SHM_NAME_LEN = 31;
+
+// Maximum number of stripes in the control segment. Bumping it changes the ABI
+// hash, so old segments are dropped automatically.
+inline constexpr std::size_t MAX_STRIPES = 256;
+
+// Per-stripe entry in the control segment. A stripe is matched to its prior
+// segment on attach by stripe_key_hash, not by name (order-independent).
+struct StripeEntry {
+  char     shm_name[MAX_SHM_NAME_LEN + 1]; ///< full shm name, NUL-terminated.
+  uint64_t raw_dir_size;                   ///< size of the stripe's raw_dir segment, bytes.
+  uint64_t stripe_key_hash;                ///< full 64-bit FNV-1a of the stripe hash_text.
+};
+
+struct CacheShmControl {
+  char     magic[8];       ///< CACHE_SHM_MAGIC
+  uint32_t schema_version; ///< CACHE_SHM_SCHEMA_VERSION
+  uint32_t pad0;
+  uint64_t abi_hash;          ///< compile-time ABI fingerprint
+  uint64_t storage_signature; ///< storage.yaml fingerprint
+  uint8_t  clean_shutdown;    ///< 0 = dirty, 1 = clean
+  uint8_t  pad1[3];
+  int32_t  owner_pid; ///< PID holding the exclusive lock; 0 when none. Backs the
+                      ///< concurrent-attach guard. Cleared on clean shutdown.
+  uint32_t    stripe_count;
+  uint32_t    pad2;
+  StripeEntry stripes[MAX_STRIPES];
+};
+
+inline constexpr std::size_t CONTROL_SIZE = sizeof(CacheShmControl);
+
+// Normalize the operator-configured prefix into the full shm name prefix used
+// to build segment names. The operator sets only the middle word (e.g. "ats");
+// the framing is supplied here so the leading '/' that POSIX shm_open requires
+// and the '-' separating the prefix from the per-object suffix can never be
+// mis-typed. Any stray framing carried over from an older config (e.g. a
+// literal "/ats-") is trimmed first, so migration can never yield an invalid
+// embedded-slash name like "//ats--". An embedded '/' or '-' in the middle is
+// preserved; only the framing characters are trimmed. Both the running server
+// and traffic_ctl normalize through here so they agree on the same names.
+inline std::string
+normalize_name_prefix(std::string_view configured)
+{
+  std::size_t begin = configured.find_first_not_of('/');
+  if (begin == std::string_view::npos) {
+    begin = configured.size(); // all '/' (or empty): no middle.
+  }
+  std::size_t      last_kept = configured.find_last_not_of('-');
+  std::string_view middle    = (last_kept == std::string_view::npos || last_kept < begin) ?
+                                 std::string_view{} :
+                                 configured.substr(begin, last_kept - begin + 1);
+  return "/" + std::string(middle) + "-";
+}
+
+// Name of the "control" segment, derived in one place so the cache subsystem and
+// traffic_ctl agree on it. `prefix` is the normalized prefix (see normalize_name_prefix),
+// e.g. "/ats-". Built by length, so CACHE_SHM_CONTROL need not be NUL-terminated.
+inline std::string
+control_segment_name(std::string_view prefix)
+{
+  return std::string(prefix) + std::string(CACHE_SHM_CONTROL);
+}
+
+} // namespace cache_shm
diff --git a/src/iocore/cache/CacheShmPurge.h b/src/iocore/cache/CacheShmPurge.h
new file mode 100644
index 00000000000..1bc64253b68
--- /dev/null
+++ b/src/iocore/cache/CacheShmPurge.h
@@ -0,0 +1,241 @@
+/** @file
+
+  Shared "enumerate and unlink the shm segments for a prefix" primitive, used by
+  both the cache subsystem (purge-on-disabled-start) and `traffic_ctl cache shm
+  clear`. Header-only since traffic_ctl does not link the cache library.
+  purge_segments() does no logging; it returns a report each caller formats itself.
+ + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#pragma once + +#include "CacheShmLayout.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cache_shm +{ + +/// True if `pid` names a live process (pid <= 0 is not). EPERM counts as alive (it +/// exists, we just may not signal it). Backs the owner-liveness backstop used where +/// the control-segment flock is not honored. +inline bool +process_is_alive(int32_t pid) +{ + if (pid <= 0) { + return false; + } + return ::kill(static_cast(pid), 0) == 0 || errno == EPERM; +} + +/// Outcome of trying to take the control segment's exclusive lock. +enum class LockResult { + Acquired, ///< We hold the exclusive lock; no other process does. + HeldByOther, ///< Another live process holds it (flock returned EWOULDBLOCK). + Unsupported, ///< flock is not honored for this fd (e.g. macOS POSIX shm). +}; + +/// Take the exclusive, non-blocking advisory lock on the control fd. Authoritative +/// on Linux/tmpfs (auto-released on crash); macOS POSIX shm returns Unsupported, so +/// the owner_pid liveness check is used there instead. 
On Unsupported the flock errno
+/// is reported via `unexpected_errno` (when non-null) so a caller with logging can
+/// surface an otherwise-silent failure (EBADF/EINVAL/ENOLCK vs the expected macOS case).
+inline LockResult
+try_lock_control(int fd, int *unexpected_errno = nullptr)
+{
+  if (::flock(fd, LOCK_EX | LOCK_NB) == 0) {
+    return LockResult::Acquired;
+  }
+  // EWOULDBLOCK is the only errno meaning "another process holds it"; anything else
+  // means flock is unusable here -> fall back to the owner_pid backstop.
+  if (errno == EWOULDBLOCK) {
+    return LockResult::HeldByOther;
+  }
+  if (unexpected_errno != nullptr) {
+    *unexpected_errno = errno;
+  }
+  return LockResult::Unsupported;
+}
+
+/// Read a shm_name field (sized like StripeEntry::shm_name) bounded by the field size; the
+/// fixed char[] may be un-terminated in a tampered/stale segment. Empty for a tombstoned slot.
+inline std::string
+read_shm_name(const char (&field)[MAX_SHM_NAME_LEN + 1])
+{
+  return std::string(field, ::strnlen(field, sizeof(field)));
+}
+
+/// How far purge_segments() got. Everything but Purged/TooSmall means nothing was
+/// unlinked.
+enum class PurgeOutcome {
+  BadPrefix,   ///< Prefix is empty or does not start with '/'. Nothing attempted.
+  NotPresent,  ///< No control segment exists (shm_open ENOENT). Nothing to do.
+  OpenFailed,  ///< shm_open failed for a reason other than ENOENT; cannot read safely.
+  MapFailed,   ///< The control segment exists but could not be mmap'd.
+  TooSmall,    ///< Control segment is smaller than CacheShmControl; control unlinked, table not walked.
+  OwnedByLive, ///< A live process owns the segment; nothing was unlinked.
+  Purged,      ///< The stripe table was walked and its segments unlinked (possibly zero stripes).
+};
+
+/// One shm_unlink attempt, so callers can log each name in their own format.
+struct PurgeUnlink {
+  std::string name;
+  bool        is_control; ///< true for the control object, false for a stripe.
+  int         error;      ///< 0 on success; otherwise the errno from shm_unlink (ENOENT == already gone).
+}; + +/// Result of purge_segments(). `unlinked` lists every shm_unlink attempted, in +/// order (stripes first, then the control object). +struct PurgeReport { + PurgeOutcome outcome = PurgeOutcome::NotPresent; + std::string control_name; ///< the control name (set whenever the prefix was valid). + int sys_errno = 0; ///< errno behind OpenFailed / MapFailed. + long long segment_size = -1; ///< control segment size in bytes, for TooSmall. + int32_t owner_pid = 0; ///< the recorded owner pid, for OwnedByLive. + std::vector unlinked; + + /// Segments successfully removed (a shm_unlink that returned 0). + unsigned + removed() const + { + unsigned n = 0; + for (const auto &u : unlinked) { + if (u.error == 0) { + ++n; + } + } + return n; + } + + /// Real failures. ENOENT means the segment was already gone, which is the + /// desired end state, so it is not counted. + unsigned + failures() const + { + unsigned n = 0; + for (const auto &u : unlinked) { + if (u.error != 0 && u.error != ENOENT) { + ++n; + } + } + return n; + } +}; + +namespace detail +{ + /// Close an fd on scope exit (the mmap survives the close). + struct FdGuard { + int fd; + ~FdGuard() + { + if (fd >= 0) { + ::close(fd); + } + } + }; +} // namespace detail + +/// Open `control` read-only and, unless a live process still owns it, +/// unlink every stripe segment it lists plus the control object. No logging -- +/// callers format the returned report. The owner guard uses flock, falling back to +/// owner_pid liveness where flock is unsupported. The stripe table is trusted on +/// magic alone (the size check bounds the read; stale names just ENOENT on unlink). 
+inline PurgeReport
+purge_segments(const std::string &prefix)
+{
+  PurgeReport report;
+
+  // Nothing is attempted for an empty prefix or one without the leading '/'.
+  if (prefix.empty() || prefix[0] != '/') {
+    report.outcome = PurgeOutcome::BadPrefix;
+    return report;
+  }
+  report.control_name = control_segment_name(prefix);
+
+  int fd = ::shm_open(report.control_name.c_str(), O_RDONLY, 0);
+  if (fd < 0) {
+    report.sys_errno = errno;
+    report.outcome   = (errno == ENOENT) ? PurgeOutcome::NotPresent : PurgeOutcome::OpenFailed;
+    return report;
+  }
+  detail::FdGuard guard{fd};
+
+  // Consult the owner guard before ANY path that unlinks, including the
+  // undersized-segment path below, so a segment another process has locked is
+  // never removed from under it.
+  const LockResult lock = try_lock_control(fd);
+
+  // clang-format off
+  struct stat sb{};
+  // clang-format on
+  if (::fstat(fd, &sb) < 0 || static_cast<std::size_t>(sb.st_size) < CONTROL_SIZE) {
+    report.segment_size = static_cast<long long>(sb.st_size);
+    if (lock == LockResult::HeldByOther) {
+      // Undersized but locked by another process (for example one built with a
+      // smaller CacheShmControl, or one that has not finished sizing a new
+      // segment). Leave it alone; owner_pid stays 0 because the header cannot be
+      // read from a segment this small.
+      report.outcome = PurgeOutcome::OwnedByLive;
+      return report;
+    }
+    // Too small to hold a valid header/table: there is no table to walk, so just
+    // unlink the control object itself (it still occupies memory).
+    report.outcome = PurgeOutcome::TooSmall;
+    int e          = ::shm_unlink(report.control_name.c_str()) == 0 ? 0 : errno;
+    report.unlinked.push_back({report.control_name, true, e});
+    return report;
+  }
+
+  void *addr = ::mmap(nullptr, CONTROL_SIZE, PROT_READ, MAP_SHARED, fd, 0);
+  if (addr == MAP_FAILED) {
+    report.sys_errno = errno;
+    report.outcome   = PurgeOutcome::MapFailed;
+    return report;
+  }
+
+  const auto *ctrl     = static_cast<const CacheShmControl *>(addr);
+  const bool  magic_ok = std::memcmp(ctrl->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)) == 0;
+
+  // The flock result decides where it is honored; owner_pid liveness is only the
+  // fallback, and is only believed when the header carries our magic.
+  if (lock == LockResult::HeldByOther || (lock == LockResult::Unsupported && magic_ok && process_is_alive(ctrl->owner_pid))) {
+    report.owner_pid = magic_ok ? ctrl->owner_pid : 0;
+    report.outcome   = PurgeOutcome::OwnedByLive;
+    ::munmap(addr, CONTROL_SIZE);
+    return report;
+  }
+
+  // Clamp stripe_count so a stale or corrupt header cannot index past the table;
+  // without our magic the table is not walked at all.
+  const uint32_t stripe_count = magic_ok ? std::min<uint32_t>(ctrl->stripe_count, static_cast<uint32_t>(MAX_STRIPES)) : 0;
+  for (uint32_t i = 0; i < stripe_count; ++i) {
+    std::string name = read_shm_name(ctrl->stripes[i].shm_name);
+    if (name.empty()) {
+      continue; // tombstoned slot -- nothing to unlink
+    }
+    int e = ::shm_unlink(name.c_str()) == 0 ? 0 : errno;
+    report.unlinked.push_back({std::move(name), false, e});
+  }
+  ::munmap(addr, CONTROL_SIZE);
+
+  // The control object goes last, after every stripe segment it listed.
+  int e = ::shm_unlink(report.control_name.c_str()) == 0 ? 0 : errno;
+  report.unlinked.push_back({report.control_name, true, e});
+
+  report.outcome = PurgeOutcome::Purged;
+  return report;
+}
+
+} // namespace cache_shm
diff --git a/src/iocore/cache/Stripe.cc b/src/iocore/cache/Stripe.cc index 373d545ae1e..19b0864600b 100644 --- a/src/iocore/cache/Stripe.cc +++ b/src/iocore/cache/Stripe.cc @@ -24,6 +24,7 @@ #include "P_CacheDisk.h" #include "P_CacheInternal.h" #include "StripeSM.h" +#include "CacheShm.h" #include "tsutil/DbgCtl.h" @@ -153,15 +154,22 @@ Stripe::_init_directory(std::size_t directory_size, int header_size, int footer_ Dbg(dbg_ctl_cache_init, "Stripe %s: allocating %zu directory bytes for a %lld byte volume (%lf%%)", hash_text.get(), directory_size, (long long)this->len, percent(directory_size, this->len)); - if (ats_hugepage_enabled()) { - this->directory.raw_dir = static_cast(ats_alloc_hugepage(directory_size)); - if (this->directory.raw_dir != nullptr) { - this->directory.raw_dir_huge = true; - } - } - if (nullptr == this->directory.raw_dir) { - this->directory.raw_dir = static_cast(ats_memalign(ats_pagesize(), directory_size)); + // Try a shared-memory-backed directory first; fall back to the hugepage / + // aligned-heap path when shm is disabled or the attach/create fails.
+ this->directory.raw_dir = CacheShm::attach_or_create_stripe(hash_text.get(), directory_size); + if (this->directory.raw_dir != nullptr) { this->directory.raw_dir_huge = false; + } else { + if (ats_hugepage_enabled()) { + this->directory.raw_dir = static_cast(ats_alloc_hugepage(directory_size)); + if (this->directory.raw_dir != nullptr) { + this->directory.raw_dir_huge = true; + } + } + if (nullptr == this->directory.raw_dir) { + this->directory.raw_dir = static_cast(ats_memalign(ats_pagesize(), directory_size)); + this->directory.raw_dir_huge = false; + } } this->directory.raw_dir_size = directory_size; this->directory.dir = reinterpret_cast(this->directory.raw_dir + header_size); @@ -170,6 +178,52 @@ Stripe::_init_directory(std::size_t directory_size, int header_size, int footer_ this->directory.footer = reinterpret_cast(this->directory.raw_dir + footer_offset); } +// Bounds-check the trusted header/freelist fields of an in-shm directory before +// the fast-restart attach (magic/version are already checked by the caller). A +// stale-but-magic-valid segment could present out-of-range offsets that become OOB +// disk I/O. On failure the caller falls through to the disk read + recover_data(). +// +// Trust model: the shm segment is trusted to the same degree as the on-disk +// directory (same-uid, mode 0600). Stripe geometry (segments/buckets) is recomputed +// locally each run and raw_dir_size is exact-matched before attach, so this only +// validates the header cursor fields and per-segment freelist heads; it does not +// re-validate individual Dir entries -- the read path already checks Doc magic + key +// before serving, so a stale entry resolves to a miss, never served corruption. +bool +Stripe::_shm_directory_is_valid() const +{ + // sector_size must be sane geometry (mirrors the hw_sector_size range in Cache.cc). 
+ if (this->directory.header->sector_size == 0 || this->directory.header->sector_size > STORE_BLOCK_SIZE) { + return false; + } + + // phase is a single bit of write-cursor state; only 0/1 are valid. + if (this->directory.header->phase > 1) { + return false; + } + + // write_pos/last_write_pos/agg_pos must point into the data region. + const off_t data_lo = this->start; + const off_t data_hi = this->skip + this->len; + + if (this->directory.header->write_pos < data_lo || this->directory.header->write_pos > data_hi || + this->directory.header->last_write_pos < data_lo || this->directory.header->last_write_pos > data_hi || + this->directory.header->agg_pos < data_lo || this->directory.header->agg_pos > data_hi) { + return false; + } + + // Each per-segment freelist head must index a Dir within its segment (0 == empty); + // a head past the entry count would walk the free list out of bounds. + const int64_t segment_entries = static_cast(this->directory.buckets) * DIR_DEPTH; + for (int s = 0; s < this->directory.segments; s++) { + if (this->directory.header->freelist[s] >= segment_entries) { + return false; + } + } + + return true; +} + // coverity[exn_spec_violation] - ink_assert aborts (doesn't throw), Dbg is exception-safe Stripe::~Stripe() { @@ -182,12 +236,19 @@ Stripe::~Stripe() ink_assert(this->directory.raw_dir_size > 0); ink_assert(this->directory.raw_dir_size < MAX_STRIPE_SIZE); + // shm-backed directories must outlive the process; never ats_free or poison them. 
+ const bool is_shm = CacheShm::is_shm_pointer(this->directory.raw_dir); + #ifdef DEBUG - // Poison memory before freeing to help detect use-after-free - memset(this->directory.raw_dir, 0xDE, this->directory.raw_dir_size); + if (!is_shm) { + // Poison memory before freeing to help detect use-after-free + memset(this->directory.raw_dir, 0xDE, this->directory.raw_dir_size); + } #endif - if (this->directory.raw_dir_huge) { + if (is_shm) { + CacheShm::detach_stripe(this->directory.raw_dir); + } else if (this->directory.raw_dir_huge) { ats_free_hugepage(this->directory.raw_dir, this->directory.raw_dir_size); } else { ats_free(this->directory.raw_dir); diff --git a/src/iocore/cache/Stripe.h b/src/iocore/cache/Stripe.h index b99b4773fee..c4ee70857b0 100644 --- a/src/iocore/cache/Stripe.h +++ b/src/iocore/cache/Stripe.h @@ -148,9 +148,10 @@ class Stripe off_t data_blocks{}; AggregateWriteBuffer _write_buffer; - void _clear_init(std::uint32_t hw_sector_size); - void _init_dir(); - bool flush_aggregate_write_buffer(int fd); + void _clear_init(std::uint32_t hw_sector_size); + void _init_dir(); + bool _shm_directory_is_valid() const; + [[nodiscard]] bool flush_aggregate_write_buffer(int fd); private: void _init_hash_text(CacheDisk const *disk, off_t blocks, off_t dir_skip); diff --git a/src/iocore/cache/StripeSM.cc b/src/iocore/cache/StripeSM.cc index 4587963968f..15a934e5d1d 100644 --- a/src/iocore/cache/StripeSM.cc +++ b/src/iocore/cache/StripeSM.cc @@ -31,6 +31,7 @@ #include "CacheEvacuateDocVC.h" #include "PreservationTable.h" #include "Stripe.h" +#include "CacheShm.h" #include "iocore/cache/CacheDefs.h" #include "CacheVC.h" @@ -178,6 +179,26 @@ StripeSM::init(bool clear) return clear_dir_aio(); } + // shm fast restart: a clean shutdown left the in-shm directory authoritative, so + // skip both the disk read and recover_data() (which would re-scan the tail and + // discard the entries the shm copy preserved). 
After validating the in-shm + // header/footer, jump straight to dir_init_done() in the normal post-recovery + // state. Validation failure falls through to disk read + recover_data(). + if (CacheShm::mode() == CacheShm::Mode::AttachExisting && CacheShm::is_shm_pointer(this->directory.raw_dir)) { + if (this->directory.header->magic == STRIPE_MAGIC && this->directory.footer->magic == STRIPE_MAGIC && + CACHE_DB_MAJOR_VERSION_COMPATIBLE <= this->directory.header->version._major && + this->directory.header->version._major <= CACHE_DB_MAJOR_VERSION && this->_shm_directory_is_valid()) { + Note("attaching cached directory from shm for '%s' (fast restart, recovery skipped)", hash_text.get()); + this->sector_size = this->directory.header->sector_size; + this->scan_pos = this->directory.header->write_pos; + this->_preserved_dirs.periodic_scan(this); + this->set_io_not_in_progress(); + SET_HANDLER(&StripeSM::dir_init_done); + return this->dir_init_done(EVENT_IMMEDIATE, nullptr); + } + Note("shm directory invalid for '%s'; falling back to disk read", hash_text.get()); + } + init_info = new StripeInitInfo(); int footerlen = ROUND_TO_STORE_BLOCK(sizeof(StripeHeaderFooter)); off_t footer_offset = this->dirlen() - footerlen; @@ -1326,7 +1347,10 @@ StripeSM::shutdown(EThread *shutdown_thread) SCOPED_MUTEX_LOCK(lock, this->mutex, shutdown_thread); if (DISK_BAD(this->disk)) { - Dbg(dbg_ctl_cache_dir_sync, "Dir %s: ignoring -- bad disk", this->hash_text.get()); + // Bad disk: invalidate the shm copy so the next start recovers from disk + // (mirrors the flush-failure branch below). 
+ Dbg(dbg_ctl_cache_dir_sync, "Dir %s: bad disk -- invalidating shm copy for disk recovery", this->hash_text.get()); + CacheShm::invalidate_stripe_directory(this->directory.raw_dir); return; } size_t dirlen = this->dirlen(); @@ -1342,7 +1366,15 @@ StripeSM::shutdown(EThread *shutdown_thread) // directories have not been inserted for these writes if (!this->_write_buffer.is_empty()) { Dbg(dbg_ctl_cache_dir_sync, "Dir %s: flushing agg buffer first", this->hash_text.get()); - this->flush_aggregate_write_buffer(this->fd); + if (!this->flush_aggregate_write_buffer(this->fd)) { + // Flush failed (e.g. disk went bad): the pwrite below would abort on a short + // write, and the directory no longer matches disk, so invalidate the shm copy + // so this stripe falls back to disk read + recover_data() next start. + Error("Dir %s: aggregation buffer flush failed during shutdown; invalidating shm copy so this stripe reloads from disk", + this->hash_text.get()); + CacheShm::invalidate_stripe_directory(this->directory.raw_dir); + return; + } } // We already asserted that dirlen > 0. @@ -1354,6 +1386,16 @@ StripeSM::shutdown(EThread *shutdown_thread) this->directory.footer->sync_serial = this->directory.header->sync_serial; CHECK_DIR(d); + + // A shm-backed directory is kept current in the shared segment by every + // dir_insert and attached directly next start, so the on-disk write here is pure + // waste -- skip it. If the shm segment is later dropped, the on-disk A/B copies + + // recover_data() reconcile the tail, the same path an unclean restart takes. + if (CacheShm::is_shm_pointer(this->directory.raw_dir)) { + Note("Dir %s: shm-backed, skipping on-disk directory write", this->hash_text.get()); + return; + } + size_t B = this->directory.header->sync_serial & 1; off_t start = this->skip + (B ? 
dirlen : 0); B = pwrite(this->fd, this->directory.raw_dir, dirlen, start); diff --git a/src/iocore/cache/unit_tests/test_CacheShm.cc b/src/iocore/cache/unit_tests/test_CacheShm.cc new file mode 100644 index 00000000000..dbcc12fb221 --- /dev/null +++ b/src/iocore/cache/unit_tests/test_CacheShm.cc @@ -0,0 +1,219 @@ +/** @file + + Unit tests for the cache shared-memory trust gates and control-segment layout. + + These exercise the pure, side-effect-free pieces of the shm fast-restart + feature -- the ABI fingerprint, the storage signature, the control-header + layout, and the macOS shm-name-length constraint -- without standing up a + cache. They are the logic that decides whether a prior shm segment may be + attached (fast restart) or must be dropped and rebuilt from disk. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#include "main.h" + +#include "../CacheShm.h" +#include "../CacheShmLayout.h" + +#include "iocore/cache/Store.h" +#include "tscore/ink_memory.h" + +#include +#include + +#include + +// Required by the shared test harness (main.cc). +int cache_vols = 1; +bool reuse_existing_cache = false; + +namespace +{ + +// Build a single-span Store with the given path and size (in STORE_BLOCK_SIZE +// blocks). 
The returned Store owns the Span and frees it on destruction. +void +make_store(Store &store, const char *path, int64_t blocks, int64_t offset = 0) +{ + store.extend(1); + auto *span = new Span(); + span->pathname = ats_strdup(path); + span->blocks = blocks; + span->offset = offset; + span->file_pathname = true; + store.spans[0] = span; +} + +} // namespace + +TEST_CASE("CacheShm ABI hash is stable and non-zero", "[cache][shm]") +{ + const uint64_t a = CacheShm::abi_hash(); + const uint64_t b = CacheShm::abi_hash(); + + // Deterministic: the fingerprint is a pure function of compile-time layout. + CHECK(a == b); + // A zero hash would defeat the trust gate (every segment would look matching); + // the FNV-1a seed and the struct sizes guarantee it is non-zero. + CHECK(a != 0); +} + +TEST_CASE("CacheShm storage signature is sensitive to topology", "[cache][shm]") +{ + Store base; + make_store(base, "/cache/disk0", 1000); + + SECTION("identical topology -> identical signature") + { + Store same; + make_store(same, "/cache/disk0", 1000); + CHECK(CacheShm::storage_signature(base) == CacheShm::storage_signature(same)); + } + + SECTION("different path -> different signature") + { + Store other; + make_store(other, "/cache/disk1", 1000); + CHECK(CacheShm::storage_signature(base) != CacheShm::storage_signature(other)); + } + + SECTION("different size -> different signature") + { + Store resized; + make_store(resized, "/cache/disk0", 2000); + CHECK(CacheShm::storage_signature(base) != CacheShm::storage_signature(resized)); + } + + SECTION("different offset -> different signature") + { + Store moved; + make_store(moved, "/cache/disk0", 1000, /*offset=*/512); + CHECK(CacheShm::storage_signature(base) != CacheShm::storage_signature(moved)); + } + + SECTION("an empty store has a stable signature") + { + Store empty0; + Store empty1; + CHECK(CacheShm::storage_signature(empty0) == CacheShm::storage_signature(empty1)); + } +} + +TEST_CASE("CacheShm control header round-trips 
through a byte buffer", "[cache][shm]") +{ + using cache_shm::CACHE_SHM_MAGIC; + using cache_shm::CACHE_SHM_SCHEMA_VERSION; + using cache_shm::CacheShmControl; + using cache_shm::CONTROL_SIZE; + + // The on-shm size must equal the struct size; tooling (traffic_ctl) maps + // exactly CONTROL_SIZE bytes and reads the struct out of it. + CHECK(CONTROL_SIZE == sizeof(CacheShmControl)); + + CacheShmControl src; + std::memset(&src, 0, sizeof(src)); + std::memcpy(src.magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)); + src.schema_version = CACHE_SHM_SCHEMA_VERSION; + src.abi_hash = 0x0123456789abcdefULL; + src.storage_signature = 0xfedcba9876543210ULL; + src.clean_shutdown = 1; + src.owner_pid = 4242; + src.stripe_count = 2; + std::strncpy(src.stripes[0].shm_name, "/ats-s0", sizeof(src.stripes[0].shm_name) - 1); + src.stripes[0].raw_dir_size = 4096; + src.stripes[0].stripe_key_hash = 0xaaaabbbbccccddddULL; + std::strncpy(src.stripes[1].shm_name, "/ats-s1", sizeof(src.stripes[1].shm_name) - 1); + src.stripes[1].raw_dir_size = 8192; + src.stripes[1].stripe_key_hash = 0x1111222233334444ULL; + + // Serialize to a raw byte buffer and read it back, mimicking shm attach. 
+ unsigned char buf[CONTROL_SIZE]; + std::memcpy(buf, &src, CONTROL_SIZE); + const auto *dst = reinterpret_cast(buf); + + CHECK(std::memcmp(dst->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)) == 0); + CHECK(dst->schema_version == CACHE_SHM_SCHEMA_VERSION); + CHECK(dst->abi_hash == 0x0123456789abcdefULL); + CHECK(dst->storage_signature == 0xfedcba9876543210ULL); + CHECK(dst->clean_shutdown == 1); + CHECK(dst->owner_pid == 4242); + CHECK(dst->stripe_count == 2); + CHECK(std::string(dst->stripes[0].shm_name) == "/ats-s0"); + CHECK(dst->stripes[0].raw_dir_size == 4096); + CHECK(dst->stripes[0].stripe_key_hash == 0xaaaabbbbccccddddULL); + CHECK(std::string(dst->stripes[1].shm_name) == "/ats-s1"); + CHECK(dst->stripes[1].raw_dir_size == 8192); + CHECK(dst->stripes[1].stripe_key_hash == 0x1111222233334444ULL); +} + +TEST_CASE("CacheShm names respect the macOS PSHMNAMLEN limit", "[cache][shm]") +{ + using cache_shm::MAX_SHM_NAME_LEN; + using cache_shm::StripeEntry; + + // macOS caps POSIX shm names at 31 chars including the leading '/'. The shared + // limit must match so the same naming works on Linux and macOS alike. + CHECK(MAX_SHM_NAME_LEN == 31); + + // The per-stripe name field must hold a maximum-length name plus its NUL. + CHECK(sizeof(StripeEntry{}.shm_name) > MAX_SHM_NAME_LEN); + + // The default control segment name fits comfortably under the limit. + const std::string control_name = cache_shm::control_segment_name("/ats-"); + CHECK(control_name.size() < MAX_SHM_NAME_LEN); +} + +TEST_CASE("CacheShm normalizes the configured name prefix", "[cache][shm]") +{ + using cache_shm::normalize_name_prefix; + + // The operator configures only the middle word; the framing '/' and '-' are + // supplied by the code so a name like "/ats-" cannot be mis-typed. + CHECK(normalize_name_prefix("ats") == "/ats-"); + CHECK(normalize_name_prefix("foo") == "/foo-"); + + // Forgiving of stray framing an operator may carry over (e.g. 
a pre-existing + // "/ats-" config), so migration cannot produce "//ats--". + CHECK(normalize_name_prefix("/ats-") == "/ats-"); + CHECK(normalize_name_prefix("/ats") == "/ats-"); + CHECK(normalize_name_prefix("ats-") == "/ats-"); + CHECK(normalize_name_prefix("//ats--") == "/ats-"); + + // An embedded '/' or '-' in the middle is preserved -- only the framing is + // trimmed. + CHECK(normalize_name_prefix("ats-v2") == "/ats-v2-"); +} + +TEST_CASE("CacheShm process liveness check backs the concurrent-attach guard", "[cache][shm]") +{ + // Our own PID is, by definition, live -- this is the "a different live owner + // still holds the segment" case the guard refuses to attach over. + CHECK(CacheShm::process_is_alive(static_cast(getpid()))); + + // A zero / negative owner_pid means "no owner recorded" (e.g. after a clean + // shutdown); it must never read as live or the guard would wrongly refuse. + CHECK_FALSE(CacheShm::process_is_alive(0)); + CHECK_FALSE(CacheShm::process_is_alive(-1)); + + // A PID at the top of the range is overwhelmingly unlikely to name a live + // process; kill(pid, 0) returns ESRCH, so it reads as not-alive (a stale + // owner left by a crash, which the guard is free to reclaim). 
+ CHECK_FALSE(CacheShm::process_is_alive(std::numeric_limits::max())); +} diff --git a/src/records/RecordsConfig.cc b/src/records/RecordsConfig.cc index 7890f427ed1..d209716ebfc 100644 --- a/src/records/RecordsConfig.cc +++ b/src/records/RecordsConfig.cc @@ -82,6 +82,15 @@ static constexpr RecordElement RecordsConfig[] = , {RECT_CONFIG, "proxy.config.cache.persist_bad_disks", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", RECA_NULL} , + {RECT_CONFIG, "proxy.config.cache.shm.enabled", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", RECA_NULL} + , + {RECT_CONFIG, "proxy.config.cache.shm.name_prefix", RECD_STRING, "ats", RECU_RESTART_TS, RR_NULL, RECC_NULL, nullptr, RECA_NULL} + , + {RECT_CONFIG, "proxy.config.cache.shm.use_hugepages", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", RECA_NULL} + , + {RECT_CONFIG, "proxy.config.cache.shm.purge_stale_on_start", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", + RECA_NULL} + , {RECT_CONFIG, "proxy.config.cache.default_volumes", RECD_STRING, "", RECU_RESTART_TS, RR_NULL, RECC_NULL, nullptr, RECA_NULL} , {RECT_CONFIG, "proxy.config.output.logfile.name", RECD_STRING, "traffic.out", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, nullptr, diff --git a/src/traffic_ctl/CMakeLists.txt b/src/traffic_ctl/CMakeLists.txt index c967d42e9c9..165e6d17fa7 100644 --- a/src/traffic_ctl/CMakeLists.txt +++ b/src/traffic_ctl/CMakeLists.txt @@ -18,6 +18,7 @@ add_executable( traffic_ctl traffic_ctl.cc + CacheShmCommand.cc ConvertConfigCommand.cc CtrlCommands.cc CtrlPrinters.cc @@ -27,6 +28,8 @@ add_executable( ${CMAKE_SOURCE_DIR}/src/shared/rpc/IPCSocketClient.cc ) +target_include_directories(traffic_ctl PRIVATE ${CMAKE_SOURCE_DIR}/src/iocore/cache) + target_link_libraries(traffic_ctl ts::tscore ts::config libswoc::libswoc yaml-cpp::yaml-cpp ts::tsutil) install(TARGETS traffic_ctl) diff --git a/src/traffic_ctl/CacheShmCommand.cc b/src/traffic_ctl/CacheShmCommand.cc new file mode 100644 index 
00000000000..ec63b113c22 --- /dev/null +++ b/src/traffic_ctl/CacheShmCommand.cc @@ -0,0 +1,261 @@ +/** @file + + traffic_ctl command for inspecting and clearing the cache shared-memory + control segment and its associated stripe segments. + + The status and clear operations work by direct shm_open access rather than + JSONRPC, so they function whether traffic_server is running or not. This + is important for debugging crash-leftover segments when no live process + is available to query. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#include "CacheShmCommand.h" +#include "CacheShmLayout.h" +#include "CacheShmPurge.h" +#include "TrafficCtlStatus.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace +{ + +// The middle word of the shm name prefix when --prefix is not given. The +// framing "/<word>-" is supplied by cache_shm::normalize_name_prefix, matching +// the server's proxy.config.cache.shm.name_prefix default. 
+constexpr const char *DEFAULT_PREFIX = "ats"; + +bool +shm_segment_exists(const std::string &name) +{ + int fd = shm_open(name.c_str(), O_RDONLY, 0); + if (fd < 0) { + return false; + } + close(fd); + return true; +} + +std::string +format_size(uint64_t bytes) +{ + char buf[64]; + if (bytes >= (uint64_t{1} << 30)) { + std::snprintf(buf, sizeof(buf), "%.2f GiB", static_cast(bytes) / (uint64_t{1} << 30)); + } else if (bytes >= (uint64_t{1} << 20)) { + std::snprintf(buf, sizeof(buf), "%.2f MiB", static_cast(bytes) / (uint64_t{1} << 20)); + } else if (bytes >= (uint64_t{1} << 10)) { + std::snprintf(buf, sizeof(buf), "%.2f KiB", static_cast(bytes) / (uint64_t{1} << 10)); + } else { + std::snprintf(buf, sizeof(buf), "%llu B", static_cast(bytes)); + } + return buf; +} + +// Shared with the cache subsystem (CacheShmPurge.h): read_shm_name bounds a +// possibly-unterminated name field, process_is_alive backs the owner-liveness gate. +using cache_shm::process_is_alive; +using cache_shm::read_shm_name; + +} // namespace + +CacheShmCommand::CacheShmCommand(ts::Arguments *args) : CtrlCommand(args) +{ + if (get_parsed_arguments()->get(STATUS_STR)) { + _invoked_func = [this]() { status(); }; + } else if (get_parsed_arguments()->get(CLEAR_STR)) { + _invoked_func = [this]() { clear(); }; + } +} + +std::string +CacheShmCommand::get_prefix() +{ + // The operator gives only the middle word (e.g. --prefix ats); frame it the + // same way the server does so the two agree on segment names. 
+ std::string configured = DEFAULT_PREFIX; + if (auto arg = get_parsed_arguments()->get(PREFIX_STR); arg && !arg.empty()) { + configured = arg.value(); + } + return cache_shm::normalize_name_prefix(configured); +} + +void +CacheShmCommand::status() +{ + const std::string prefix = get_prefix(); + const std::string control_name = cache_shm::control_segment_name(prefix); + + int fd = shm_open(control_name.c_str(), O_RDONLY, 0); + if (fd < 0) { + std::cerr << "cache shm: control segment '" << control_name << "' not found: " << std::strerror(errno) << '\n'; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + struct stat sb { + }; + if (fstat(fd, &sb) < 0) { + std::cerr << "cache shm: fstat failed: " << std::strerror(errno) << '\n'; + close(fd); + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + if (static_cast(sb.st_size) < sizeof(cache_shm::CacheShmControl)) { + std::cerr << "cache shm: control segment too small (" << sb.st_size << " bytes, need at least " + << sizeof(cache_shm::CacheShmControl) << ")\n"; + close(fd); + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + void *addr = mmap(nullptr, sizeof(cache_shm::CacheShmControl), PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + std::cerr << "cache shm: mmap failed: " << std::strerror(errno) << '\n'; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + const auto *ctrl = static_cast(addr); + + const bool magic_ok = std::memcmp(ctrl->magic, cache_shm::CACHE_SHM_MAGIC, sizeof(cache_shm::CACHE_SHM_MAGIC)) == 0; + const bool schema_ok = ctrl->schema_version == cache_shm::CACHE_SHM_SCHEMA_VERSION; + + std::cout << "Control segment: " << control_name << '\n'; + std::cout << " segment size: " << sb.st_size << " bytes (" << format_size(sb.st_size) << ")\n"; + std::cout << " magic: "; + for (char c : ctrl->magic) { + if (c >= 0x20 && c < 0x7f) { + std::cout << c; + } + } + std::cout << (magic_ok ? 
" [valid]" : " [INVALID]") << '\n'; + std::cout << " schema_version: " << ctrl->schema_version << (schema_ok ? " [valid]" : " [INVALID]") << '\n'; + std::cout << " abi_hash: 0x" << std::hex << ctrl->abi_hash << std::dec << '\n'; + std::cout << " storage_sig: 0x" << std::hex << ctrl->storage_signature << std::dec << '\n'; + std::cout << " clean_shutdown: " << static_cast(ctrl->clean_shutdown) + << (ctrl->clean_shutdown ? " (clean)" : " (DIRTY -- next start will rebuild)") << '\n'; + std::cout << " owner_pid: " << ctrl->owner_pid; + if (ctrl->owner_pid == 0) { + std::cout << " (none -- not currently attached)"; + } else if (process_is_alive(ctrl->owner_pid)) { + std::cout << " (LIVE -- a running traffic_server owns this segment)"; + } else { + std::cout << " (stale -- owner no longer running)"; + } + std::cout << '\n'; + std::cout << " stripe_count: " << ctrl->stripe_count << '\n'; + + if (!magic_ok || !schema_ok) { + std::cout << "\nHeader is invalid; not interpreting stripe table.\n"; + munmap(addr, sizeof(cache_shm::CacheShmControl)); + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + const uint32_t shown = std::min(ctrl->stripe_count, cache_shm::MAX_STRIPES); + + if (shown > 0) { + std::cout << "\nStripes:\n"; + for (uint32_t i = 0; i < shown; ++i) { + const auto &entry = ctrl->stripes[i]; + std::string name = read_shm_name(entry.shm_name); + if (name.empty()) { + std::cout << " [" << i << "] (tombstone -- slot free for reuse)\n"; + continue; + } + const bool present = shm_segment_exists(name); + std::cout << " [" << i << "] " << name << " size=" << entry.raw_dir_size << " (" << format_size(entry.raw_dir_size) << ") " + << (present ? 
"present" : "MISSING") << '\n'; + } + } + + if (ctrl->stripe_count > cache_shm::MAX_STRIPES) { + std::cout << "\n(stripe_count " << ctrl->stripe_count << " exceeds MAX_STRIPES " << cache_shm::MAX_STRIPES << "; truncated.)\n"; + } + + munmap(addr, sizeof(cache_shm::CacheShmControl)); +} + +void +CacheShmCommand::clear() +{ + // Shared with the server's purge-on-disabled-start (cache_shm::purge_segments); + // this command just renders the result to the console and sets the exit code. + const cache_shm::PurgeReport report = cache_shm::purge_segments(get_prefix()); + + switch (report.outcome) { + case cache_shm::PurgeOutcome::BadPrefix: + std::cerr << "cache shm: invalid prefix (must be non-empty and begin with '/').\n"; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::NotPresent: + std::cerr << "cache shm: control segment '" << report.control_name << "' not found (" << std::strerror(report.sys_errno) + << "); nothing to clear.\n"; + std::cout << "Removed 0 segment(s).\n"; + return; + case cache_shm::PurgeOutcome::OpenFailed: + // Not ENOENT: the segment may well exist but we could not open it (e.g. EACCES on a + // segment owned by another user). Report the real errno and fail rather than claim success. + std::cerr << "cache shm: cannot open control segment '" << report.control_name << "' (" << std::strerror(report.sys_errno) + << "); cannot clear.\n"; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::MapFailed: + std::cerr << "cache shm: mmap failed while reading stripe table: " << std::strerror(report.sys_errno) << '\n'; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::OwnedByLive: + // Refuse: unlinking a live owner's segments would orphan its fast restart. + std::cerr << "cache shm: control segment '" << report.control_name << "' is owned by a live traffic_server (pid " + << report.owner_pid << "); refusing to clear. 
Stop traffic_server first.\n"; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::TooSmall: + case cache_shm::PurgeOutcome::Purged: + break; + } + + for (const auto &u : report.unlinked) { + if (u.error == 0) { + std::cout << "unlinked " << u.name << '\n'; + } else if (u.error != ENOENT) { + std::cerr << "failed to unlink " << u.name << ": " << std::strerror(u.error) << '\n'; + } + } + + const unsigned failures = report.failures(); + std::cout << "Removed " << report.removed() << " segment(s)"; + if (failures != 0) { + std::cout << ", " << failures << " failure(s)"; + App_Exit_Status_Code = CTRL_EX_ERROR; + } + std::cout << ".\n"; +} diff --git a/src/traffic_ctl/CacheShmCommand.h b/src/traffic_ctl/CacheShmCommand.h new file mode 100644 index 00000000000..1a1f2db879b --- /dev/null +++ b/src/traffic_ctl/CacheShmCommand.h @@ -0,0 +1,45 @@ +/** @file + + traffic_ctl command for inspecting and clearing the cache shared-memory + control segment and its associated stripe segments. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +#pragma once + +#include "CtrlCommands.h" + +#include + +class CacheShmCommand : public CtrlCommand +{ +public: + CacheShmCommand(ts::Arguments *args); + +private: + static inline const std::string STATUS_STR{"status"}; + static inline const std::string CLEAR_STR{"clear"}; + static inline const std::string PREFIX_STR{"prefix"}; + + void status(); + void clear(); + + std::string get_prefix(); +}; diff --git a/src/traffic_ctl/traffic_ctl.cc b/src/traffic_ctl/traffic_ctl.cc index 9697aaa05da..2824011c190 100644 --- a/src/traffic_ctl/traffic_ctl.cc +++ b/src/traffic_ctl/traffic_ctl.cc @@ -31,6 +31,7 @@ #include "tscore/signals.h" #include "CtrlCommands.h" +#include "CacheShmCommand.h" #include "ConvertConfigCommand.h" #include "FileConfigCommand.h" #include "SSLMultiCertCommand.h" @@ -101,6 +102,7 @@ main([[maybe_unused]] int argc, const char **argv) auto &host_command = parser.add_command("host", "Interact with host status").require_commands(); auto &hostdb_command = parser.add_command("hostdb", "Interact with HostDB status").require_commands(); auto &direct_rpc_command = parser.add_command("rpc", "Interact with the rpc api").require_commands(); + auto &cache_command = parser.add_command("cache", "Inspect and manage the cache").require_commands(); // config commands config_command.add_command("defaults", "Show default information configuration values", Command_Execute) @@ -315,6 +317,16 @@ main([[maybe_unused]] int argc, const char **argv) .add_option("--params", "-p", "Parameters to be passed in the request, YAML or JSON format", "", MORE_THAN_ONE_ARG_N, "", "") .add_example_usage("traffic_ctl rpc invoke foo_bar -p \"numbers: [1, 2, 3]\""); + // cache shm commands - operate directly on POSIX shared memory; no running server required. 
+ auto &shm_command = cache_command.add_command("shm", "Inspect and manage cache shared-memory segments").require_commands(); + shm_command.add_option("--prefix", "-p", "shm name prefix word, framed as /- (default 'ats')", "", 1, "ats"); + shm_command.add_command("status", "Show the cache shared-memory control segment and stripe table", [&]() { command->execute(); }) + .add_example_usage("traffic_ctl cache shm status") + .add_example_usage("traffic_ctl cache shm status --prefix ats-t"); + shm_command.add_command("clear", "Unlink the cache shared-memory control and stripe segments", [&]() { command->execute(); }) + .add_example_usage("traffic_ctl cache shm clear") + .add_example_usage("traffic_ctl cache shm clear --prefix ats-t"); + auto create_command = [](ts::Arguments &args) -> std::unique_ptr { if (args.get("config")) { if (args.get("convert")) { @@ -337,6 +349,7 @@ main([[maybe_unused]] int argc, const char **argv) {"host", [](ts::Arguments *a) { return std::make_unique(a); } }, {"hostdb", [](ts::Arguments *a) { return std::make_unique(a); } }, {"rpc", [](ts::Arguments *a) { return std::make_unique(a); }}, + {"cache", [](ts::Arguments *a) { return std::make_unique(a); } }, }; for (const auto &[key, factory] : factories) { diff --git a/tests/gold_tests/cache/cache_shm_bad_disk_dropped.test.py b/tests/gold_tests/cache/cache_shm_bad_disk_dropped.test.py new file mode 100644 index 00000000000..d31e0418ebf --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_bad_disk_dropped.test.py @@ -0,0 +1,226 @@ +''' +Verify per-stripe partial attach when a disk is dropped from storage.yaml +(the "bad disk" case). A storage change no longer cold-starts every stripe: +the stripes on healthy, unchanged disks fast-attach their prior shm segments +while the segment left behind by the removed disk is reclaimed. + +ts1 caches an object across two disks and clean-shuts-down (marking the shm +clean). 
ts2 starts against the *same* shm prefix but with the second disk +removed from storage.yaml -- simulating a bad disk dropped by the operator. +ts2 must: + - keep the existing control segment (partial attach, not a full recreate), + - fast-attach the surviving disk's stripe by its stable identity, + - reclaim the orphaned stripe segment of the removed disk, + - and still serve traffic. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import uuid + +Test.Summary = ''' +Dropping a disk from storage.yaml fast-attaches the surviving stripes from +shm and reclaims the orphaned stripe segment of the removed disk. +''' +Test.ContinueOnFail = True + + +class CacheShmBadDiskDroppedTest: + """ + A stripe's shm identity is its hash_text -- the disk seed (path or + hash_base_string) plus that disk's own dir_skip:blocks, read from the + disk's persisted header. None of those depend on the other disks, so when + one disk is removed from storage.yaml the surviving disks compute the + same hash_text as before and re-attach their prior shm segments. The + removed disk's stripe is no longer present, so its control entry is never + claimed and finalize_attach() reclaims the orphaned segment. 
+ + ts1 starts cold across disk_a + disk_b, populates the cache, and clean-shuts + down. ts2 starts against disk_a only (disk_b "fails"/dropped) sharing the + shm prefix, and asserts ts2: + - enters partial-attach mode (storage signature changed) keeping the + control segment rather than recreating it, + - fast-attaches the surviving disk_a stripe from shm, + - reclaims exactly the orphaned disk_b stripe segment, + - reports neither an unclean shutdown nor a schema/ABI mismatch, + - and serves a request (200). + """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB per disk + + def __init__(self): + self._setup_shared_state() + # ts1 sees both disks; ts2 sees only disk_a (disk_b dropped). + self.ts1 = self._configure_ts('shmbd_ts1', [self._storage_path_a, self._storage_path_b]) + self.ts2 = self._configure_ts('shmbd_ts2', [self._storage_path_a]) + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + # Absolute paths keep the spans independent of MakeATSProcess's + # per-instance STORAGEDIR so disk_a has identical geometry for ts1 and + # ts2 (hence identical stripe identity -> fast attach). + self._storage_path_a = os.path.join(shared_storage_dir, 'disk_a.img') + self._storage_path_b = os.path.join(shared_storage_dir, 'disk_b.img') + for path in (self._storage_path_a, self._storage_path_b): + with open(path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 'bd' = bad-disk-dropped variant. 
+ self._shm_prefix = f'/cshmbd-{os.getpid() % 100000}-' + + def _configure_ts(self, name, storage_paths): + ts = Test.MakeATSProcess(name) + storage_lines = ['cache:', ' spans:'] + for i, storage_path in enumerate(storage_paths): + storage_lines += [ + f' - name: disk.{i}', + f' path: {storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ] + storage_lines += [' volumes:', ' - id: 1', ' scheme: http', ' size: 100%'] + ts.Disk.storage_yaml.AddLines(storage_lines) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm|cache_init', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 cold start across both disks, clean shutdown. + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'created stripe \S+ \(\d+ bytes\) for key=', 'ts1 should create the shm-backed stripe segments') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit') + + # ts2 warm start with disk_b dropped: partial attach -- the surviving + # disk_a stripe attaches, the orphaned disk_b segment is reclaimed. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'attaching up to \d+ stripes \(fast restart, partial -- storage changed\)', + 'ts2 must enter partial-attach mode after the disk was dropped') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'attached stripe \S+ \(\d+ bytes\) for key=', 'ts2 must fast-attach the surviving disk_a stripe from shm') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: reclaiming orphaned stripe segment', 'ts2 must reclaim the dropped disk_b stripe segment') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'reclaimed \d+ orphaned stripe segment\(s\) after storage change', 'ts2 must report the reclaim summary') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: creating fresh control segment', 'ts2 must keep the control segment across the disk drop') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', + 'the partial attach must be due to the disk drop, not an unclean shutdown') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: (schema|ABI) mismatch', 'the partial attach must be due to the disk drop, not schema/ABI') + + def _populate_cache(self): + tr = Test.AddTestRun('Populate cache via ts1 (disk_a + disk_b)') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _clean_shutdown_ts1(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts1') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} 
./{self.TS_PID_SCRIPT} shmbd_ts1 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _dump_shm_state(self): + # Between ts1's clean shutdown and ts2's start the control segment is + # marked clean and records both stripes (nothing reclaimed yet). Capture + # it with `traffic_ctl cache shm status` and compare against a gold file. + # The gold masks the run-specific names, the ABI/storage hashes, and the + # page-rounded sizes with the `` wildcard, so what is asserted literally + # is the meaningful state: valid magic/schema, clean_shutdown=1, + # stripe_count=2, and both stripe segments present. + tr = Test.AddTestRun('Dump shm control state after ts1 clean shutdown') + # Use ts1's Env: it has been started, so the per-instance bin dir is on + # PATH (ts2's Env only gains it once ts2 starts, which is the next step). + # `cache shm status` reads the segment directly and needs no live server. + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm status --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = 'gold/cache_shm_state_after_shutdown.gold' + + def _verify_survivor_attach_and_reclaim(self): + tr = Test.AddTestRun('Start ts2 (disk_b dropped); verify survivor fast-attach + orphan reclaim') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve correctly after the partial attach') + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + # Stop ts2 before clearing the shm: `cache shm clear` refuses to unlink a + # segment a live traffic_server still owns, so the segments must be + # ownerless (clean_shutdown clears owner_pid) before cleanup runs. 
+ tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shmbd_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + # A clean shutdown deliberately keeps the control + live stripe segments + # for the next fast restart, so they outlive the test. Unlink them by + # prefix to avoid leaking POSIX shm across repeated local runs (macOS has + # no /dev/shm to clear out of band). + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._populate_cache() + self._clean_shutdown_ts1() + self._dump_shm_state() + self._verify_survivor_attach_and_reclaim() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmBadDiskDroppedTest().run() diff --git a/tests/gold_tests/cache/cache_shm_concurrent_attach.test.py b/tests/gold_tests/cache/cache_shm_concurrent_attach.test.py new file mode 100644 index 00000000000..90453312708 --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_concurrent_attach.test.py @@ -0,0 +1,185 @@ +''' +Verify the concurrent-attach guard: a second traffic_server must never map the +shm directory read-write underneath a live owner. ts1 cold-starts and becomes +the owner of the control segment (it sets owner_pid and, on Linux, holds an +exclusive flock for its lifetime). While ts1 is still running, ts2 starts +against the *same* shm prefix; it must refuse shm for this run, disable it, and +come up on its own disk cache without touching ts1's segment. ts1 keeps serving +throughout. 
+ +The two instances use *separate* on-disk cache files so the test isolates the +shm concurrent-attach guard from any contention over a shared cache file. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import uuid + +Test.Summary = ''' +A second traffic_server refuses to attach the shm directory while a live owner +holds it, disabling shm for its run instead of attaching concurrently. +''' +Test.ContinueOnFail = True + + +class CacheShmConcurrentAttachTest: + """ + The concurrent-attach guard (P0). A live owner is still mapping the Dir + read-write; a second writer would corrupt it, and clean_shutdown is no + protection against a concurrent *live* run. The guard fires from either of + two mechanisms, so this test asserts on the shared tail of both messages: + - Linux: ts1 holds an exclusive flock on the control segment for its + lifetime; ts2's lock attempt returns HeldByOther ("... is locked by a + live owner ..."). + - macOS (flock unsupported): the owner_pid liveness backstop fires + instead ("... claims a live owner ..."). + Both end in "disabling shm this run to avoid concurrent attach" and set the + run to shm-disabled, which is what this test pins -- so it runs on every + platform. 
+ + ts2 must: + - log the concurrent-attach refusal, + - NOT create or attach a control segment (it bails before either), + - still serve a request (200) from its own disk cache. + ts1 must keep running and serving the whole time. + """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # Same shm prefix, different storage files: the collision under test is + # purely on the shm control segment, not the on-disk cache. + self.ts1 = self._configure_ts('shmc_ts1', self._storage_path_a) + self.ts2 = self._configure_ts('shmc_ts2', self._storage_path_b) + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._storage_path_a = os.path.join(shared_storage_dir, 'disk_a.img') + self._storage_path_b = os.path.join(shared_storage_dir, 'disk_b.img') + for path in (self._storage_path_a, self._storage_path_b): + with open(path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 'c' = concurrent-attach variant. 
+ self._shm_prefix = f'/cshmc-{os.getpid() % 100000}-' + + def _configure_ts(self, name, storage_path): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 is the owner: it creates the fresh control segment. + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create and own the shm control segment') + + # ts2 starts while ts1 owns the segment: it must refuse and disable shm. + # The message head differs by platform (flock vs owner_pid backstop); the + # tail is common, so anchor on it. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'disabling shm this run to avoid concurrent attach', 'ts2 must refuse to attach while ts1 owns the segment') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: creating fresh control segment', 'ts2 must not create a control segment when it refuses shm') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: attaching up to \d+ stripes \(fast restart', "ts2 must not attach ts1's live control segment") + + def _start_owner(self): + tr = Test.AddTestRun('Cold-start ts1 (becomes the shm owner)') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _start_second_refused(self): + # ts1 is still running (kept alive by StillRunningAfter above), so ts2's + # start hits the concurrent-attach guard. 
+ tr = Test.AddTestRun('Start ts2 while ts1 is live; ts2 must refuse shm and serve from disk') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve from its own disk cache with shm disabled') + tr.StillRunningAfter = self.ts1 + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown(self, ts, name): + tr = Test.AddTestRun(f'Drain and clean-shutdown {name}') + tr.Processes.Default.Env = ts.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} {name} --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + # ts1 (the owner) is stopped before this so clean_shutdown clears + # owner_pid; otherwise `cache shm clear` refuses a live owner. + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._start_owner() + self._start_second_refused() + # Stop the non-owner first, then the owner (clears owner_pid), then clear. 
+ self._clean_shutdown(self.ts2, 'shmc_ts2') + self._clean_shutdown(self.ts1, 'shmc_ts1') + self._cleanup_shm() + + +CacheShmConcurrentAttachTest().run() diff --git a/tests/gold_tests/cache/cache_shm_fast_restart.test.py b/tests/gold_tests/cache/cache_shm_fast_restart.test.py new file mode 100644 index 00000000000..520b2b00986 --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_fast_restart.test.py @@ -0,0 +1,247 @@ +''' +Verify the cache directory survives a clean shutdown via shared memory and is +attached on the next start (fast restart). Two ATS instances share an on-disk +cache file and a POSIX shm name prefix; ts1 populates the cache and is shut +down via traffic_ctl drain + SIGTERM, then ts2 starts and serves the same URL +out of cache without re-fetching from the origin. + +Traffic is driven with Proxy Verifier: a single verifier-server acts as the +origin and the verifier-client replays cache-shm-fast-restart.replay.yaml -- +the "fill" transaction against ts1 and the "hit" transaction against ts2. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +Test.Summary = ''' +Cache directory survives clean shutdown via POSIX shared memory. 
+''' +Test.ContinueOnFail = True + + +class CacheShmFastRestartTest: + """ + Cover the cache shm fast-restart scenario end-to-end. + + Sequence: + 1. ts1 cold-start: creates a fresh shm control segment and per-stripe + segments; populates the cache via the "fill" transaction (cache miss, + fetched from the verifier-server origin). + 2. ts1 is drained and SIGTERM'd. The shutdown hook flushes the directory + and marks the shm clean. + 3. ts2 starts against the same on-disk file and shm prefix: attaches the + existing control segment, attaches per-stripe segments, and reuses the + cached directory without re-reading it from disk. + 4. ts2 serves the same URL out of cache via the "hit" transaction + (X-Cache: hit-fresh). The transaction's origin response is a 502 + sentinel, so any forward to the origin would fail the run. + + Each step is verified both at the response level (proxy-verifier) and via + diags-log assertions on the cache_shm / cache_dir_init code paths. + """ + + # Helper script for sending signals to a traffic_server process by command-line + # identifier match. Reused from gold_tests/logging. + TS_PID_SCRIPT = 'ts_process_handler.py' + + # The replay file driving both the populate ("fill") and verify ("hit") + # transactions. They share a cache key and differ only by uuid. + REPLAY_FILE = 'replay/cache-shm-fast-restart.replay.yaml' + + # Stripe size for the shared cache. Must be large enough that the directory + # contains real entries; small enough that the disk.img is cheap to create. + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # A single verifier-server is the origin for both ts1 and ts2. It is + # started before ts1 and kept running across the whole test. 
+ self.server = Test.MakeVerifierServerProcess('shm-origin', self.REPLAY_FILE) + self.ts1 = self._configure_ts('shm_ts1') + self.ts2 = self._configure_ts('shm_ts2') + self._add_diags_log_assertions() + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + # Shared storage file used by both ts1 and ts2. The absolute path makes + # storage.yaml independent of MakeATSProcess's per-instance STORAGEDIR. + # ATS opens regular-file spans with O_RDONLY first to stat them -- it + # does not auto-create the backing file -- so pre-create disk.img at the + # configured size before either ts starts. + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._shared_storage_path = os.path.join(shared_storage_dir, 'disk.img') + with open(self._shared_storage_path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # POSIX shm names: macOS PSHMNAMLEN limit is 31 chars including '/'. + # Keep the prefix short and unique per test run so concurrent autest + # runs do not collide. + self._shm_prefix = f'/cshm-{os.getpid() % 100000}-' + + def _configure_ts(self, name): + ts = Test.MakeATSProcess(name) + # Master configures cache storage via storage.yaml. An absolute span path + # keeps the span independent of MakeATSProcess's per-instance STORAGEDIR so + # ts1 and ts2 share the same on-disk cache, which yields identical stripe + # geometry (hence identical shm identity). 
+ ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {self._shared_storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + # The per-stripe 'created/attached stripe' lines are Dbg() calls; + # route debug output to diags.log (default is stderr) so the + # ContainsExpression assertions below can match them. + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine(f'map / http://127.0.0.1:{self.server.Variables.http_port}/') + return ts + + def _add_diags_log_assertions(self): + # These assertions match the *stable core* of each cache-shm log line and + # deliberately stop before the trailing parenthetical qualifier. The shm + # code appends optional context to several of these messages as it evolves + # -- "attaching N stripes" became "attaching up to N stripes", + # "(fast restart)" became "(fast restart, recovery skipped)" on the stripe + # path and "(fast restart, partial -- storage changed)" on the control + # path. Anchoring on the invariant prefix (not the closing paren) keeps the + # test from breaking every time such a qualifier is added. Likewise, the + # excludes name only log strings that actually exist in the source: an + # exclude on a non-existent string can never fire and gives false comfort. + + # ts1 (cold start): creates fresh shm, marks it clean on shutdown, and must + # NOT report any "drop" reason since there is nothing to drop. 
+ self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: created stripe \S+ \(\d+ bytes\) for key=', 'ts1 should create at least one shm-backed stripe segment') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: (schema|ABI) mismatch', 'ts1 should not detect any shm mismatch on cold start') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', 'ts1 should not see a dirty shm on cold start') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: stripe \S+ size mismatch', 'ts1 should not see a stripe size mismatch on cold start') + + # ts2 (warm start): attaches the existing control segment, fast-attaches the + # per-stripe segment, reuses the cached directory, and must NOT fall back to + # the disk-rebuild path. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: attaching up to \d+ stripes \(fast restart', 'ts2 should attach the existing shm (fast restart)') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: attached stripe \S+ \(\d+ bytes\) for key=', 'ts2 should attach at least one shm-backed stripe segment') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r"attaching cached directory from shm for '.+' \(fast restart", 'ts2 should reuse the per-stripe directory from shm') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: creating fresh control segment', 'ts2 should not create a fresh control segment') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: (schema|ABI) mismatch', 'ts2 should not detect any shm mismatch on warm start') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', 'ts2 should see the shm marked clean') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'shm directory invalid for', 'ts2 should not fall back from shm to disk read') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: stripe \S+ size mismatch', 'ts2 should fast-attach without a stripe size-mismatch recreate') + + def _start_ts1(self): + # Cold start ts1 against the verifier-server origin and replay the + # "fill" transaction: a cache miss that ATS fetches and stores. + tr = Test.AddTestRun('Start ts1, then cache contents (fill)') + tr.AddVerifierClientProcess( + 'shm-fill-client', self.REPLAY_FILE, http_ports=[self.ts1.Variables.port], keys='fill', other_args='--thread-limit 1') + tr.Processes.Default.StartBefore(self.server) + tr.Processes.Default.StartBefore(self.ts1) + tr.StillRunningAfter = self.server + tr.StillRunningAfter = self.ts1 + + def _clean_shutdown_ts1(self): + # Drain + SIGTERM ts1. 
SIGTERM goes through AutoStopCont which invokes + # TS_LIFECYCLE_SHUTDOWN_HOOK -> sync_cache_dir_on_shutdown -> + # CacheShm::mark_clean_shutdown. + tr = Test.AddTestRun('Drain and clean-shutdown ts1') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shm_ts1 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.server + + def _start_ts2(self): + # ts2 attaches the CacheDir from the shm created by ts1. Replay the + # "hit" transaction: ATS must serve it from cache (X-Cache: hit-fresh) + # without contacting the origin -- the replay's 502 sentinel response + # would otherwise surface as a proxy-response mismatch. + tr = Test.AddTestRun('Start ts2; verify shm fast-attach and cache HIT') + tr.AddVerifierClientProcess( + 'shm-hit-client', self.REPLAY_FILE, http_ports=[self.ts2.Variables.port], keys='hit', other_args='--thread-limit 1') + tr.Processes.Default.StartBefore(self.ts2) + tr.StillRunningAfter = self.server + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + # Stop ts2 before clearing the shm: `cache shm clear` refuses to unlink a + # segment a live traffic_server still owns, so the segments must be + # ownerless (clean_shutdown clears owner_pid) before cleanup runs. + tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shm_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.server + + def _cleanup_shm(self): + # A clean shutdown deliberately keeps the control + live stripe segments + # for the next fast restart, so they outlive the test. Unlink them by + # prefix to avoid leaking POSIX shm across repeated local runs (macOS has + # no /dev/shm to clear out of band). 
+ tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._start_ts1() + self._clean_shutdown_ts1() + self._start_ts2() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmFastRestartTest().run() diff --git a/tests/gold_tests/cache/cache_shm_purge_on_disable.test.py b/tests/gold_tests/cache/cache_shm_purge_on_disable.test.py new file mode 100644 index 00000000000..51fd57cdedf --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_purge_on_disable.test.py @@ -0,0 +1,232 @@ +''' +Purge-stale-on-start: when shm is disabled but a prior run left segments behind, +proxy.config.cache.shm.purge_stale_on_start=1 removes them at startup. + +This guards two hazards of running with the feature disabled after it had been +enabled (see records.yaml docs): (a) the leftover segments keep consuming tmpfs +the disabled instance never reads, and (b) a later re-enabled run would otherwise +fast-attach a directory that went stale while ATS ran disabled (writing only to +disk). + +Three scenarios, each on its own shm prefix + on-disk storage so they do not +interact: + + - PURGE (positive): a seed instance runs shm-enabled and clean-shuts-down, + leaving a clean control + stripe segment. A second instance runs disabled + with purge_stale_on_start=1 and must remove them. Confirmed three ways: the + seed's "clean" segment exists before (traffic_ctl cache shm status, exit 0), + the disabled instance logs the purge Note, and the segment is gone after + (status exits 2, "not found"). + + - KEEP (negative): same seed, but the disabled instance has + purge_stale_on_start=0. It must NOT log the purge and the segment must remain. 
+ + - NOOP (no leftover): a disabled instance with purge_stale_on_start=1 against a + never-used prefix must do nothing quietly -- no purge Note, no "cannot open" + warning. + +The segments are inspected with traffic_ctl (POSIX shm is not path-addressable on +macOS, so /dev/shm cannot be listed directly). +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import sys + +Test.Summary = ''' +shm.purge_stale_on_start removes leftover shm segments at startup when shm is +disabled, only when set, and only when a control segment exists. +''' +Test.ContinueOnFail = True + + +class CacheShmPurgeOnDisableTest: + + TS_PID_SCRIPT = 'ts_process_handler.py' + DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB; matches the other shm gold tests. + + # CtrlCommand sets this exit code when a shm control segment is absent/invalid + # (src/traffic_ctl/TrafficCtlStatus.h). + CTRL_EX_ERROR = 2 + + PURGE_NOTE = r"cache shm: purged stale segments while disabled \(removed [1-9]" + + def __init__(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + pid = os.getpid() % 100000 + # Each control name is "<prefix>control"; keep well under macOS PSHMNAMLEN (31).
+ self._prefix_purge = f'/cshmp-{pid}-' # positive: must be purged + self._prefix_keep = f'/cshmk-{pid}-' # negative: must remain + self._prefix_noop = f'/cshmz-{pid}-' # no leftover: nothing to do + + # Seed (shm enabled) instances that create the leftover segments. + self.seed_purge = self._make_ts('cshm_seed_p', self._prefix_purge, 'disk_p.img', enabled=True, purge=False) + self.seed_keep = self._make_ts('cshm_seed_k', self._prefix_keep, 'disk_k.img', enabled=True, purge=False) + + # Disabled instances under test. + self.run_purge = self._make_ts('cshm_run_p', self._prefix_purge, 'disk_p.img', enabled=False, purge=True) + self.run_keep = self._make_ts('cshm_run_k', self._prefix_keep, 'disk_k.img', enabled=False, purge=False) + self.run_noop = self._make_ts('cshm_run_z', self._prefix_noop, 'disk_z.img', enabled=False, purge=True) + + self._add_diags_assertions() + + def _make_ts(self, name, prefix, disk_name, enabled, purge): + disk_path = self._ensure_disk(disk_name) + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {disk_path}', + f' size: {self.DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.remap_config.AddLine('map / http://127.0.0.1:8080/') # never exercised; keeps remap.config non-empty + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1 if enabled else 0, + 'proxy.config.cache.shm.name_prefix': prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.cache.shm.purge_stale_on_start': 1 if purge else 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + return ts + + def _ensure_disk(self, disk_name): + storage_dir = os.path.join(Test.RunDirectory, 'storage') + os.makedirs(storage_dir, exist_ok=True) + path = os.path.join(storage_dir, disk_name) + if not 
os.path.exists(path): + with open(path, 'ab') as f: + f.truncate(self.DISK_SIZE_BYTES) + return path + + def _add_diags_assertions(self): + # Seeds create a fresh control segment and mark it clean on the way out -- + # that is the "fast-attachable but now stale" state the purge must clean up. + for seed in (self.seed_purge, self.seed_keep): + seed.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'seed should create a fresh shm control segment') + seed.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'seed should mark the shm clean before exit') + + # Positive: the disabled+purge instance logs the purge of at least one segment. + self.run_purge.Disk.diags_log.Content += Testers.ContainsExpression( + self.PURGE_NOTE, 'disabled instance with purge_stale_on_start=1 should purge the leftover segments') + + # Negative: purge_stale_on_start=0 must never purge. + self.run_keep.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: purged stale segments', 'purge_stale_on_start=0 must not purge') + + # No-op: nothing exists for this prefix, so neither a purge nor an error. 
+ self.run_noop.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: purged stale segments', 'no leftover means nothing is purged') + self.run_noop.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: cannot open control segment', 'a missing control segment is a quiet no-op, not a warning') + + def _shm_status(self, description, ts, prefix, expect_present): + """Run `traffic_ctl cache shm status` and assert the control segment is (not) there.""" + control_name = prefix + 'control' + tr = Test.AddTestRun(description) + tr.Processes.Default.Env = ts.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm status --prefix {prefix}' + if expect_present: + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + r'Control segment:\s+' + re.escape(control_name), 'control segment should be present') + else: + tr.Processes.Default.ReturnCode = self.CTRL_EX_ERROR + tr.Processes.Default.Streams.stderr = Testers.ContainsExpression( + r"control segment '" + re.escape(control_name) + r"' not found", 'control segment should be gone') + return tr + + def _start_seed(self, description, seed, prefix): + # Starting the seed (shm enabled) creates the control + stripe segments; the + # status probe also confirms they exist while the seed is the live owner. 
+ tr = self._shm_status(description, seed, prefix, expect_present=True) + tr.Processes.Default.StartBefore(seed) + tr.StillRunningAfter = seed + return tr + + def _clean_shutdown(self, description, seed, name): + tr = Test.AddTestRun(description) + tr.Processes.Default.Env = seed.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} {name} --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _start_disabled(self, description, ts, prefix, expect_present): + # Start the disabled instance under test; its purge (or no-op) runs during + # cache init, so by the time it is ready the status below reflects the result. + tr = self._shm_status(description, ts, prefix, expect_present=expect_present) + tr.Processes.Default.StartBefore(ts) + tr.StillRunningAfter = ts + return tr + + def _cleanup(self): + tr = Test.AddTestRun('Unlink any remaining test shm segments') + tr.Processes.Default.Env = self.run_keep.Env + tr.Processes.Default.Command = ( + f'traffic_ctl cache shm clear --prefix {self._prefix_purge} ; ' + f'traffic_ctl cache shm clear --prefix {self._prefix_keep} ; ' + f'traffic_ctl cache shm clear --prefix {self._prefix_noop}') + tr.Processes.Default.ReturnCode = 0 + + def run(self): + # PURGE (positive) + self._start_seed('PURGE: start shm-enabled seed; control segment is created', self.seed_purge, self._prefix_purge) + self._clean_shutdown('PURGE: clean-shutdown seed (leaves a clean segment)', self.seed_purge, 'cshm_seed_p') + # Probe with an Env whose bin/ autest has already populated (seed_purge was + # started above); run_purge has not started yet, so its bin/ does not exist.
+ self._shm_status( + 'PURGE: precondition -- clean leftover segment is present', self.seed_purge, self._prefix_purge, + expect_present=True).Processes.Default.Streams.stdout += Testers.ContainsExpression( + r'clean_shutdown:\s+1 \(clean\)', 'leftover segment should be marked clean (the stale-but-attachable case)') + self._start_disabled( + 'PURGE: start disabled+purge=1; leftover segments are removed', + self.run_purge, + self._prefix_purge, + expect_present=False) + + # KEEP (negative) + self._start_seed('KEEP: start shm-enabled seed; control segment is created', self.seed_keep, self._prefix_keep) + self._clean_shutdown('KEEP: clean-shutdown seed (leaves a clean segment)', self.seed_keep, 'cshm_seed_k') + self._start_disabled( + 'KEEP: start disabled+purge=0; leftover segments remain', self.run_keep, self._prefix_keep, expect_present=True) + + # NOOP (no leftover) + tr = Test.AddTestRun('NOOP: start disabled+purge=1 against an unused prefix; nothing to do') + tr.Processes.Default.Env = self.run_noop.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm status --prefix {self._prefix_noop}' + tr.Processes.Default.ReturnCode = self.CTRL_EX_ERROR # never existed + tr.Processes.Default.StartBefore(self.run_noop) + tr.StillRunningAfter = self.run_noop + + self._cleanup() + + +CacheShmPurgeOnDisableTest().run() diff --git a/tests/gold_tests/cache/cache_shm_schema_mismatch.test.py b/tests/gold_tests/cache/cache_shm_schema_mismatch.test.py new file mode 100644 index 00000000000..fac47ebde57 --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_schema_mismatch.test.py @@ -0,0 +1,207 @@ +''' +Verify the shm schema-version trust gate: a control segment whose schema_version +does not match the running build is dropped, never fast-attached. ts1 cold-starts, +caches an object, and clean-shuts-down (marking the segment clean). 
The segment +file under /dev/shm is then tampered -- schema_version is overwritten with a +bogus value -- before ts2 starts against the same shm prefix. ts2 must detect the +mismatch, drop the segment, recreate it fresh, and rebuild the directory from disk. + +Linux-only: it pokes raw bytes in the /dev/shm segment file, which exists only on +Linux (macOS POSIX shm segments are not path-addressable). +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import platform +import sys +import uuid + +Test.Summary = ''' +A control segment with a mismatched schema_version is dropped and rebuilt from +disk, never fast-attached. +''' +Test.ContinueOnFail = True + +# The byte-poke drives the gate by editing /dev/shm directly, which is a Linux +# facility; macOS POSIX shm is not exposed as a file. There is no Condition for +# the platform, so gate with a lambda (ports.py branches on platform the same way). +Test.SkipUnless(Condition(lambda: platform.system() == 'Linux', "shm byte-poke gates need Linux /dev/shm")) + + +class CacheShmSchemaMismatchTest: + """ + The schema-version gate. 
The control header records the build's + CACHE_SHM_SCHEMA_VERSION; on attach, a segment whose recorded version differs + is dropped ("schema mismatch (<N> vs <M>), dropping") rather than trusted + -- the on-disk struct layout it describes may no longer match this build. The + ABI-hash gate (abi_hash @16) works identically; this test exercises the + schema field (@8) as the representative case. + + Sequence: ts1 creates a clean segment, then schema_version is poked to a bogus + value, then ts2 starts and must: + - log the schema mismatch and drop, + - recreate a fresh control segment, + - NOT fast-attach, + - and still serve a request (200). + """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + POKE_SCRIPT = 'shm_poke.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + # CacheShmControl layout (CacheShmLayout.h): magic[8] @0, schema_version @8. + SCHEMA_VERSION_OFFSET = 8 + # Little-endian uint32 = 9; the build's CACHE_SHM_SCHEMA_VERSION is small, so + # any value it never uses works. 9 is comfortably out of range. + BOGUS_SCHEMA_LE_HEX = '09000000' + + def __init__(self): + self._setup_shared_state() + self.ts1 = self._configure_ts('shmx_ts1') + self.ts2 = self._configure_ts('shmx_ts2') + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + Test.Setup.Copy(os.path.join(Test.TestDirectory, self.POKE_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._shared_storage_path = os.path.join(shared_storage_dir, 'disk.img') + with open(self._shared_storage_path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 'x' = schema-mismatch variant. + # (This test is Linux-only, but keep the prefix short for consistency.) 
+ self._shm_prefix = f'/cshmx-{os.getpid() % 100000}-' + # The control segment is name_prefix + "control"; on Linux it is a file + # under /dev/shm by the same name (sans the leading '/'). + self._control_file = '/dev/shm/' + self._shm_prefix.lstrip('/') + 'control' + + def _configure_ts(self, name): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {self._shared_storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 cold start, clean shutdown -- a valid, clean segment to tamper with. + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit') + + # ts2 start against the poked segment: detect, drop, recreate, rebuild. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: schema mismatch \(\d+ vs \d+\), dropping', 'ts2 must detect the schema mismatch and drop the segment') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts2 must recreate the control segment after the drop') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'\(fast restart, recovery skipped\)', 'ts2 must rebuild from disk, never fast-attach the mismatched segment') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', + 'the drop must be due to the schema mismatch, not an unclean shutdown') + + def _populate_cache(self): + tr = Test.AddTestRun('Cold-start ts1 and cache an object') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _clean_shutdown_ts1(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts1') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shmx_ts1 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _poke_schema_version(self): + # ts1 is dead; the segment is just a file now. Overwrite schema_version. 
+ tr = Test.AddTestRun('Tamper schema_version in the shm control segment') + tr.Processes.Default.Command = ( + f'{sys.executable} ./{self.POKE_SCRIPT} {self._control_file} ' + f'{self.SCHEMA_VERSION_OFFSET} {self.BOGUS_SCHEMA_LE_HEX}') + tr.Processes.Default.ReturnCode = 0 + + def _verify_mismatch_drop(self): + tr = Test.AddTestRun('Start ts2; verify the schema mismatch is dropped and rebuilt from disk') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve correctly after dropping the mismatched segment') + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shmx_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._populate_cache() + self._clean_shutdown_ts1() + self._poke_schema_version() + self._verify_mismatch_drop() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmSchemaMismatchTest().run() diff --git a/tests/gold_tests/cache/cache_shm_storage_mismatch.test.py b/tests/gold_tests/cache/cache_shm_storage_mismatch.test.py new file mode 100644 index 00000000000..24aadcb04e7 --- /dev/null +++ 
b/tests/gold_tests/cache/cache_shm_storage_mismatch.test.py @@ -0,0 +1,210 @@ +''' +Verify that a changed storage layout never fast-attaches a stale directory. +A storage.yaml change no longer drops the whole shm control segment; instead +each stripe is matched to its prior segment by its own identity. ts1 caches an +object against one storage file and clean-shuts-down (marking the shm clean); +ts2 starts against a *different* storage file but the *same* shm name prefix. +ts2 finds ts1's control segment, keeps it (partial attach), but because its +stripe identity no longer matches any recorded entry it creates a fresh stripe +segment and reclaims ts1's orphaned one -- it must never fast-attach a segment +that describes a different on-disk layout. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import uuid + +Test.Summary = ''' +A changed storage layout never fast-attaches a stale directory: the control +segment is kept (partial attach), the relocated stripe creates a fresh segment, +and the orphaned prior segment is reclaimed. 
+''' +Test.ContinueOnFail = True + + +class CacheShmStorageMismatchTest: + """ + The storage signature is a fingerprint of every span's path and geometry, + stored in the shm control header. It is no longer a hard gate: a storage + change keeps the control segment and lets each stripe attach by its own + identity (its hash_text, which includes the disk path). This test points + ts1 and ts2 at different storage files (a repath) while sharing one shm + prefix, and asserts ts2: + - keeps the existing control segment (does NOT recreate it), + - enters partial-attach mode because the storage signature changed, + - never fast-attaches any stripe segment (its identity differs, so the + stale directory built for storage A is never reused), + - creates a fresh stripe segment for its own (storage B) layout, + - reclaims ts1's now-orphaned stripe segment, + - and still serves a request (200). + Because the storage change does not gate the clean-shutdown check, ts2 must + NOT report the prior run as unclean: the only reason the stripe segment is + recreated is the storage change. + """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # ts1 and ts2 share the shm prefix but use different storage files. + self.ts1 = self._configure_ts('shms_ts1', self._storage_path_a) + self.ts2 = self._configure_ts('shms_ts2', self._storage_path_b) + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + # Two distinct storage files -> distinct span paths -> distinct + # storage signatures, which is exactly the "repath" case under test. 
+ self._storage_path_a = os.path.join(shared_storage_dir, 'disk_a.img') + self._storage_path_b = os.path.join(shared_storage_dir, 'disk_b.img') + for path in (self._storage_path_a, self._storage_path_b): + with open(path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 's' = storage-mismatch variant. + self._shm_prefix = f'/cshms-{os.getpid() % 100000}-' + + def _configure_ts(self, name, storage_path): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 cold start against storage A, clean shutdown. 
+ self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'created stripe \S+ \(\d+ bytes\) for key=', 'ts1 should create at least one shm-backed stripe segment') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit') + + # ts2 start against storage B: the storage signature differs, so the + # control segment is kept (partial attach) but the relocated stripe + # creates a fresh segment rather than fast-attaching the stale one. + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'attaching up to \d+ stripes \(fast restart, partial -- storage changed\)', + 'ts2 must enter partial-attach mode after the storage change') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'created stripe \S+ \(\d+ bytes\) for key=', 'ts2 must create a fresh stripe segment for its own layout') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: reclaiming orphaned stripe segment', "ts2 must reclaim ts1's orphaned stripe segment") + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'reclaimed \d+ orphaned stripe segment\(s\) after storage change', 'ts2 must report the reclaim summary') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'attached stripe \S+ \(\d+ bytes\) for key=', + 'ts2 must never fast-attach a stripe segment built for a different layout') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: creating fresh control segment', 'ts2 must keep the control segment across the storage change') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: (schema|ABI) mismatch', 'the recreate must be due to the storage change, not schema/ABI') + 
self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', + 'the recreate must be due to the storage change, not an unclean shutdown') + + def _populate_cache(self): + tr = Test.AddTestRun('Populate cache via ts1 (storage A)') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _clean_shutdown_ts1(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts1') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shms_ts1 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _verify_partial_attach_and_reclaim(self): + tr = Test.AddTestRun('Start ts2 (storage B); verify partial attach: fresh stripe + orphan reclaim') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve correctly after the partial attach') + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + # Stop ts2 before clearing the shm: `cache shm clear` refuses to unlink a + # segment a live traffic_server still owns, so the segments must be + # ownerless (clean_shutdown clears owner_pid) before cleanup runs. 
+ tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shms_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + # A clean shutdown deliberately keeps the control + live stripe segments + # for the next fast restart, so they outlive the test. Unlink them by + # prefix to avoid leaking POSIX shm across repeated local runs (macOS has + # no /dev/shm to clear out of band). + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._populate_cache() + self._clean_shutdown_ts1() + self._verify_partial_attach_and_reclaim() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmStorageMismatchTest().run() diff --git a/tests/gold_tests/cache/cache_shm_unclean_shutdown.test.py b/tests/gold_tests/cache/cache_shm_unclean_shutdown.test.py new file mode 100644 index 00000000000..4711636c70b --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_unclean_shutdown.test.py @@ -0,0 +1,185 @@ +''' +Verify the shm fast-restart path refuses to trust a directory left by a crash. +A clean shutdown marks the control segment clean; a crash (SIGKILL) does not, +so clean_shutdown stays 0. ts1 cold-starts, caches an object, and is *killed* +(no drain, no SIGTERM) so the shutdown hook never runs. ts2 starts against the +same on-disk cache and shm prefix: it must find the dirty segment, drop the +whole thing, rebuild the directory from disk, and never take the fast-attach +"recovery skipped" path. 
+''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import uuid + +Test.Summary = ''' +An unclean shutdown (SIGKILL) leaves the shm control segment dirty, so the next +start drops it and rebuilds the directory from disk instead of fast-attaching. +''' +Test.ContinueOnFail = True + + +class CacheShmUncleanShutdownTest: + """ + The crash-safety gate. clean_shutdown is set to 1 only by the shutdown hook + (CacheShm::mark_clean_shutdown); a SIGKILL bypasses it, leaving the segment + with clean_shutdown == 0. On the next start the control segment is found but + rejected -- a crash may have left dir entries pointing at content that never + reached disk, so no stripe can safely skip recovery. ts2 must: + - log "previous run did not shutdown cleanly, dropping", + - recreate a fresh control segment, + - NOT take the stripe fast-attach "recovery skipped" path, + - and still serve a request (200). + + This gate is cross-platform: clean_shutdown lives in the control segment, so + it does not depend on the Linux-only flock path. 
+ """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # ts1 and ts2 share the same on-disk cache file and shm prefix so ts2 + # would fast-attach ts1's directory -- were it not left dirty by the kill. + self.ts1 = self._configure_ts('shmu_ts1') + # ts1 is SIGKILLed mid-test, so it exits on signal 9 (returncode -9, or + # 137 where the runner reports 128+signal). Declare that expected exit so + # the managed-process check does not flag the deliberate kill. ts1 still + # starts normally, so leave Ready at its default (port-open) condition. + self.ts1.ReturnCode = Any(-9, 137) + self.ts2 = self._configure_ts('shmu_ts2') + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._shared_storage_path = os.path.join(shared_storage_dir, 'disk.img') + with open(self._shared_storage_path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 'u' = unclean-shutdown variant. 
+ self._shm_prefix = f'/cshmu-{os.getpid() % 100000}-' + + def _configure_ts(self, name): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {self._shared_storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 cold start: creates a fresh segment but is killed before it can mark + # the shutdown clean. + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: marking clean shutdown', 'ts1 is SIGKILLed, so it must never mark the shm clean') + + # ts2 start: finds the dirty segment, drops it, recreates, and rebuilds + # from disk -- it must NOT fast-attach. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: previous run did not shutdown cleanly, dropping', 'ts2 must reject the dirty segment left by the crash') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts2 must recreate the control segment after dropping the dirty one') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'\(fast restart, recovery skipped\)', 'ts2 must rebuild from disk, never take the fast-attach path') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: attaching up to \d+ stripes \(fast restart', 'ts2 must not attach the dirty control segment') + + def _populate_cache(self): + tr = Test.AddTestRun('Cold-start ts1 and cache an object') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _kill_ts1(self): + # SIGKILL -- no drain, no SIGTERM -- so the shutdown hook never runs and + # the control segment is left with clean_shutdown == 0. 
+ tr = Test.AddTestRun('SIGKILL ts1 (unclean shutdown)') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = (f'{sys.executable} ./{self.TS_PID_SCRIPT} shmu_ts1 --signal KILL && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _verify_dirty_drop(self): + tr = Test.AddTestRun('Start ts2; verify the dirty segment is dropped and rebuilt from disk') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve correctly after dropping the dirty segment') + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shmu_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._populate_cache() + self._kill_ts1() + self._verify_dirty_drop() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmUncleanShutdownTest().run() diff --git a/tests/gold_tests/cache/gold/cache_shm_state_after_shutdown.gold b/tests/gold_tests/cache/gold/cache_shm_state_after_shutdown.gold new file mode 100644 index 00000000000..5a981414f4f --- /dev/null +++ b/tests/gold_tests/cache/gold/cache_shm_state_after_shutdown.gold @@ -0,0 
+1,13 @@ +Control segment: `` + segment size: `` + magic: `` [valid] + schema_version: `` [valid] + abi_hash: 0x`` + storage_sig: 0x`` + clean_shutdown: 1 (clean) + owner_pid: 0 (none -- not currently attached) + stripe_count: 2 + +Stripes: + [0] `` present + [1] `` present diff --git a/tests/gold_tests/cache/replay/cache-shm-fast-restart.replay.yaml b/tests/gold_tests/cache/replay/cache-shm-fast-restart.replay.yaml new file mode 100644 index 00000000000..a18a9951643 --- /dev/null +++ b/tests/gold_tests/cache/replay/cache-shm-fast-restart.replay.yaml @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Traffic for the cache shm fast-restart test. The two transactions share a +# cache key (same method + host + url); they differ only in the uuid so the +# verifier-client can drive them one at a time via --keys: +# +# * key "fill": replayed against ts1 (cold cache). The request misses, ATS +# forwards it to the origin (verifier-server) and caches the 200 response. +# +# * key "hit": replayed against ts2 after a clean shutdown + shm fast restart. +# ATS must serve it from the shm-attached directory WITHOUT contacting the +# origin. 
The 502 server-response is a sentinel: it is only ever returned if +# ATS wrongly forwards the request, in which case the proxy-response check +# (expecting the cached 200 / X-Cache: hit-fresh) fails. +# + +meta: + version: "1.0" + +sessions: +- transactions: + + - client-request: + method: "GET" + version: "1.1" + scheme: "http" + url: /cache-shm-fast-restart/object + headers: + fields: + - [ Host, example.com ] + - [ uuid, fill ] + - [ X-Debug, "x-cache,via" ] + + server-response: + status: 200 + reason: OK + headers: + fields: + - [ Content-Length, 16 ] + - [ Cache-Control, "max-age=300,public" ] + + proxy-response: + status: 200 + headers: + fields: + - [ X-Cache, { value: miss, as: equal } ] + + # Restart ATS + + - client-request: + method: "GET" + version: "1.1" + scheme: "http" + url: /cache-shm-fast-restart/object + headers: + fields: + - [ Host, example.com ] + - [ uuid, hit ] + - [ X-Debug, "x-cache,via" ] + + server-response: + status: 502 + reason: "Bad Gateway" + headers: + fields: + - [ Content-Length, 0 ] + + proxy-response: + status: 200 + headers: + fields: + - [ X-Cache, { value: hit-fresh, as: equal } ] diff --git a/tests/gold_tests/cache/shm_poke.py b/tests/gold_tests/cache/shm_poke.py new file mode 100644 index 00000000000..a4ed38c0485 --- /dev/null +++ b/tests/gold_tests/cache/shm_poke.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flip raw bytes in a cache shm segment file, for shm trust-gate autests. + +On Linux the POSIX shm segments are plain files under /dev/shm, so a segment +left behind by a clean shutdown can be tampered with between runs to drive the +control-segment trust gates (schema/ABI mismatch, an unterminated shm_name, +etc.). This is Linux-only: macOS POSIX shm segments are not path-addressable. + +Usage: + shm_poke.py <path> <offset> <hex-bytes> + +Example (set schema_version @8 to 9, little-endian uint32): + shm_poke.py /dev/shm/cshmx-12345-control 8 09000000 +""" + +import sys + + +def main() -> int: + if len(sys.argv) != 4: + sys.stderr.write(f'usage: {sys.argv[0]} <path> <offset> <hex-bytes>\n') + return 2 + path = sys.argv[1] + offset = int(sys.argv[2], 0) + data = bytes.fromhex(sys.argv[3]) + with open(path, 'r+b') as f: + f.seek(offset) + f.write(data) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/tests/gold_tests/logging/ts_process_handler.py b/tests/gold_tests/logging/ts_process_handler.py index 40640e3922e..fdd1eb586cb 100644 --- a/tests/gold_tests/logging/ts_process_handler.py +++ b/tests/gold_tests/logging/ts_process_handler.py @@ -36,10 +36,13 @@ def __init__(self, message): def get_ts_process_pid(ts_identifier): processes = [] for proc in psutil.process_iter(['cmdline']): + # psutil returns a None cmdline for processes whose command line is not + # readable (e.g. system processes on macOS); skip those rather than + # letting ' '.join(None) raise TypeError before the target is found. 
cmdline = proc.info.get('cmdline', []) if not cmdline: continue - commandline = ' '.join(cmdline) + commandline = ' '.join(cmdline or []) if '/traffic_server' in commandline and ts_identifier in commandline: return proc raise GetPidError("Could not find a traffic_server process") From 1557f03ae77342165c7bbe3ca46dffb47608e066 Mon Sep 17 00:00:00 2001 From: Masaori Koshiba Date: Thu, 25 Jun 2026 14:07:31 +0900 Subject: [PATCH 2/3] cache: strip embedded '/' from shm name prefix POSIX shm names permit only the leading '/', so a misconfigured name_prefix like "foo/bar" would build a name shm_open rejects with EINVAL. Strip embedded '/' during normalization instead of preserving it. --- src/iocore/cache/CacheShmLayout.h | 17 +++++++++++++---- src/iocore/cache/unit_tests/test_CacheShm.cc | 9 +++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/iocore/cache/CacheShmLayout.h b/src/iocore/cache/CacheShmLayout.h index d58ae9faaa7..12bf5d8c1b2 100644 --- a/src/iocore/cache/CacheShmLayout.h +++ b/src/iocore/cache/CacheShmLayout.h @@ -76,9 +76,11 @@ constexpr std::size_t CONTROL_SIZE = sizeof(CacheShmControl); // and the '-' separating the prefix from the per-object suffix can never be // mis-typed. Any stray framing carried over from an older config (e.g. a // literal "/ats-") is trimmed first, so migration can never yield an invalid -// embedded-slash name like "//ats--". An embedded '/' or '-' in the middle is -// preserved; only the framing characters are trimmed. Both the running server -// and traffic_ctl normalize through here so they agree on the same names. +// embedded-slash name like "//ats--". An embedded '-' in the middle is preserved; +// an embedded '/' is stripped, since POSIX shm names permit only the leading '/' +// (a mistyped "foo/bar" would otherwise build a name shm_open rejects with EINVAL). +// Both the running server and traffic_ctl normalize through here so they agree on +// the same names. 
inline std::string normalize_name_prefix(std::string_view configured) { @@ -90,7 +92,14 @@ normalize_name_prefix(std::string_view configured) std::string_view middle = (last_kept == std::string_view::npos || last_kept < begin) ? std::string_view{} : configured.substr(begin, last_kept - begin + 1); - return "/" + std::string(middle) + "-"; + std::string word{"/"}; + for (char c : middle) { + if (c != '/') { // POSIX shm names allow only the leading '/'. + word += c; + } + } + word += "-"; + return word; } // Name of the "control" segment. Derived in one place so the cache diff --git a/src/iocore/cache/unit_tests/test_CacheShm.cc b/src/iocore/cache/unit_tests/test_CacheShm.cc index dbcc12fb221..8858b79a5c7 100644 --- a/src/iocore/cache/unit_tests/test_CacheShm.cc +++ b/src/iocore/cache/unit_tests/test_CacheShm.cc @@ -196,9 +196,14 @@ TEST_CASE("CacheShm normalizes the configured name prefix", "[cache][shm]") CHECK(normalize_name_prefix("ats-") == "/ats-"); CHECK(normalize_name_prefix("//ats--") == "/ats-"); - // An embedded '/' or '-' in the middle is preserved -- only the framing is - // trimmed. + // An embedded '-' in the middle is preserved -- only the framing is trimmed. CHECK(normalize_name_prefix("ats-v2") == "/ats-v2-"); + + // An embedded '/' is stripped: POSIX shm names permit only the leading '/', so a + // mistyped middle word must not build a name shm_open would reject with EINVAL. + CHECK(normalize_name_prefix("foo/bar") == "/foobar-"); + CHECK(normalize_name_prefix("/ats/v2/") == "/atsv2-"); + CHECK(normalize_name_prefix("a/b/c") == "/abc-"); } TEST_CASE("CacheShm process liveness check backs the concurrent-attach guard", "[cache][shm]") From 12a2a515bb1aa238197286ad627dadb9df8b22c9 Mon Sep 17 00:00:00 2001 From: Masaori Koshiba Date: Thu, 25 Jun 2026 14:07:32 +0900 Subject: [PATCH 3/3] cache: link librt for shm_open/shm_unlink on older glibc On glibc < 2.34 (e.g. CentOS 7) shm_open/shm_unlink live in librt, so traffic_ctl and inkcache fail to link. 
Add an optional rt::rt target that is a no-op where the library is folded into libc (modern glibc, macOS). --- CMakeLists.txt | 3 +++ cmake/Findrt.cmake | 33 +++++++++++++++++++++++++++++++++ src/iocore/cache/CMakeLists.txt | 2 +- src/traffic_ctl/CMakeLists.txt | 10 +++++++++- 4 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 cmake/Findrt.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index d28c3bd289c..e968b0c776c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -676,6 +676,9 @@ check_struct_has_member("struct mptcp_info" mptcpi_subflows "linux/mptcp.h" HAVE # find resolv library if available find_package(resolv) +# find rt library if available (shm_open/shm_unlink live there on older glibc) +find_package(rt) + if(ENABLE_DOCS OR ENABLE_AUTEST) find_package(Python3 REQUIRED) find_program(UV uv REQUIRED) diff --git a/cmake/Findrt.cmake b/cmake/Findrt.cmake new file mode 100644 index 00000000000..3c4de19a1f6 --- /dev/null +++ b/cmake/Findrt.cmake @@ -0,0 +1,33 @@ +####################### +# +# Licensed to the Apache Software Foundation (ASF) under one or more contributor license +# agreements. See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# +####################### + +# librt provides shm_open()/shm_unlink() on older glibc (e.g. CentOS 7 / glibc 2.17). 
+# glibc >= 2.34 folds them into libc and macOS has them in libc, so the library is +# absent there; the imported target is then an empty no-op. + +find_library(rt_LIBRARY rt) + +mark_as_advanced(rt_FOUND rt_LIBRARY) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(rt REQUIRED_VARS rt_LIBRARY) + +# Always provide the target; only carry a link dependency when librt exists. +add_library(rt::rt INTERFACE IMPORTED) +if(rt_FOUND) + target_link_libraries(rt::rt INTERFACE ${rt_LIBRARY}) +endif() diff --git a/src/iocore/cache/CMakeLists.txt b/src/iocore/cache/CMakeLists.txt index f4c28cd4223..dd4dc9f6a24 100644 --- a/src/iocore/cache/CMakeLists.txt +++ b/src/iocore/cache/CMakeLists.txt @@ -51,7 +51,7 @@ target_include_directories(inkcache PRIVATE ${CMAKE_SOURCE_DIR}/lib) target_link_libraries( inkcache PUBLIC ts::aio ts::hdrs ts::inkevent ts::tscore - PRIVATE ts::config ts::tsapibackend fastlz ZLIB::ZLIB + PRIVATE ts::config ts::tsapibackend fastlz ZLIB::ZLIB rt::rt ) if(HAVE_LZMA_H) diff --git a/src/traffic_ctl/CMakeLists.txt b/src/traffic_ctl/CMakeLists.txt index 165e6d17fa7..c0ef76cd1cd 100644 --- a/src/traffic_ctl/CMakeLists.txt +++ b/src/traffic_ctl/CMakeLists.txt @@ -30,7 +30,15 @@ add_executable( target_include_directories(traffic_ctl PRIVATE ${CMAKE_SOURCE_DIR}/src/iocore/cache) -target_link_libraries(traffic_ctl ts::tscore ts::config libswoc::libswoc yaml-cpp::yaml-cpp ts::tsutil) +target_link_libraries( + traffic_ctl + ts::tscore + ts::config + libswoc::libswoc + yaml-cpp::yaml-cpp + ts::tsutil + rt::rt +) install(TARGETS traffic_ctl)