diff --git a/CMakeLists.txt b/CMakeLists.txt index d28c3bd289c..e968b0c776c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -676,6 +676,9 @@ check_struct_has_member("struct mptcp_info" mptcpi_subflows "linux/mptcp.h" HAVE # find resolv library if available find_package(resolv) +# find rt library if available (shm_open/shm_unlink live there on older glibc) +find_package(rt) + if(ENABLE_DOCS OR ENABLE_AUTEST) find_package(Python3 REQUIRED) find_program(UV uv REQUIRED) diff --git a/cmake/Findrt.cmake b/cmake/Findrt.cmake new file mode 100644 index 00000000000..3c4de19a1f6 --- /dev/null +++ b/cmake/Findrt.cmake @@ -0,0 +1,36 @@ +####################### +# +# Licensed to the Apache Software Foundation (ASF) under one or more contributor license +# agreements. See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# +####################### + +# librt provides shm_open()/shm_unlink() on older glibc (e.g. CentOS 7 / glibc 2.17). +# glibc >= 2.34 folds them into libc and macOS has them in libc, so the library is +# absent there; the imported target is then an empty no-op. + +find_library(rt_LIBRARY rt) + +mark_as_advanced(rt_FOUND rt_LIBRARY) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(rt REQUIRED_VARS rt_LIBRARY) + +# Always provide the target; only carry a link dependency when librt exists. 
+# Guard the definition: a repeated find_package(rt) would otherwise fail because rt::rt already exists. +if(NOT TARGET rt::rt) + add_library(rt::rt INTERFACE IMPORTED) + if(rt_FOUND) + target_link_libraries(rt::rt INTERFACE ${rt_LIBRARY}) + endif() +endif() diff --git a/doc/admin-guide/files/records.yaml.en.rst b/doc/admin-guide/files/records.yaml.en.rst index a554c554a2d..38fc75089c0 100644 --- a/doc/admin-guide/files/records.yaml.en.rst +++ b/doc/admin-guide/files/records.yaml.en.rst @@ -3003,6 +3003,89 @@ RAM Cache Compression runs on task threads. To use more cores for RAM cache compression, increase :ts:cv:`proxy.config.task_threads`. +.. _admin-cache-shm-fast-restart: + +Shared Memory Fast Restart +========================== + +|TS| can optionally keep the cache directory -- the in-memory index that maps +cached objects to their location on disk -- in POSIX shared memory so that it +survives a process restart. On a normal start the directory is read from disk +and, for a large cache, rebuilt in memory before the cache comes online. When +this feature is enabled and the previous instance shut down cleanly, the new +instance attaches the existing shared memory segments and skips that work, +bringing the cache online much faster. + +The shared memory directory is only an optimization for restart time; the +on-disk cache always remains the source of truth. A new instance discards the +segments and falls back to reading the directory from disk whenever they cannot +be trusted, including when: + +- the previous instance did not shut down cleanly (for example, it crashed), +- the on-disk storage layout described by :file:`storage.yaml` changed, +- the |TS| binary's directory structures changed (an ABI mismatch, such as + after an upgrade), or +- the shared memory schema version changed. + +Segments left over from a crash can be inspected or removed with +``traffic_ctl cache shm status`` and ``traffic_ctl cache shm clear``, which act +directly on the shared memory objects whether or not |TS| is running. + +.. note:: + + This is an experimental feature, disabled by default. 
All of its settings + take effect only on a restart of |TS|. + +.. ts:cv:: CONFIG proxy.config.cache.shm.enabled INT 0 + + Enables the shared memory cache directory described above. When ``0`` (the + default), the cache directory is always read from disk on start. + +.. ts:cv:: CONFIG proxy.config.cache.shm.name_prefix STRING ats + + The word used to name the POSIX shared memory objects, which on Linux appear + under ``/dev/shm``. Set only the middle word (default ``ats``); |TS| frames it + as ``/<word>-`` so the leading ``/`` that POSIX requires and the trailing + ``-`` separator cannot be mis-typed. With the default the control segment is + named ``/ats-control`` and each per-stripe directory segment ``/ats-s<N>`` + (for example ``/ats-s0``). Any stray framing characters are trimmed, so a + value carried over from an older release (such as ``/ats-``) still resolves to + the same names. Give each |TS| instance sharing a host a distinct word so + their segments do not collide. + + Renaming this value does not remove segments created under the old prefix: + |TS| only manages segments under the *current* prefix, so the old ``/dev/shm`` + objects linger until cleared manually with ``traffic_ctl cache shm clear + --prefix <old-prefix>`` (or a host reboot). + +.. ts:cv:: CONFIG proxy.config.cache.shm.use_hugepages INT 0 + + When enabled (``1``), |TS| attempts to back the shared memory directory with + huge pages to reduce TLB pressure. This requires the shared memory to be + eligible for huge pages (for example, ``/dev/shm`` mounted with huge page + support on Linux). When it is not, |TS| logs a debug message under the + ``cache_shm`` tag and transparently falls back to ordinary pages, so + enabling this is always safe. + +.. 
ts:cv:: CONFIG proxy.config.cache.shm.purge_stale_on_start INT 0 + + When enabled (``1``) and :ts:cv:`proxy.config.cache.shm.enabled` is ``0``, + |TS| removes any leftover shared memory segments for + :ts:cv:`proxy.config.cache.shm.name_prefix` at startup (the ``control`` + segment and the per-stripe segments it lists). This guards against two + hazards of running with the feature disabled after it had been enabled: + + - the leftover segments keep consuming memory (for example ``/dev/shm`` on + Linux) even though the disabled instance never reads them, and + - a later run with the feature re-enabled would otherwise fast-attach a + directory that went stale while |TS| ran disabled and wrote only to disk. + + The purge is skipped if a live process still owns the segments (a concurrent + instance using the same prefix), and it never blocks startup. It has no + effect when the feature is enabled, when no ``control`` segment + exists, or when set to ``0`` (the default). ``traffic_ctl cache shm clear`` + performs the same cleanup on demand. + .. _admin-heuristic-expiration: Heuristic Expiration diff --git a/doc/developer-guide/cache-architecture/index.en.rst b/doc/developer-guide/cache-architecture/index.en.rst index 4e78f8febc3..1c9da957c70 100644 --- a/doc/developer-guide/cache-architecture/index.en.rst +++ b/doc/developer-guide/cache-architecture/index.en.rst @@ -41,5 +41,6 @@ understanding and modifying the source. api-functions.en consistency.en ram-cache.en + shm-fast-restart.en cache-tool.en tiered-storage.en diff --git a/doc/developer-guide/cache-architecture/shm-fast-restart.en.rst b/doc/developer-guide/cache-architecture/shm-fast-restart.en.rst new file mode 100644 index 00000000000..48254107cda --- /dev/null +++ b/doc/developer-guide/cache-architecture/shm-fast-restart.en.rst @@ -0,0 +1,656 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. 
See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. include:: ../../common.defs +.. default-domain:: cpp + +.. _cache-shm-fast-restart: + +Shared-Memory Cache Directory (Fast Restart) +******************************************** + +.. note:: + + This is an experimental feature, disabled by default. It is controlled by + the ``proxy.config.cache.shm.*`` settings (see :ref:`configuration + <cache-shm-configuration>`). The administrator-facing description lives at + :ref:`admin-cache-shm-fast-restart`; this document covers the design. + +Motivation +========== + +The :ref:`cache directory <cache-directory>` is the memory-resident index that +maps cached objects to their location on disk. It is rebuilt every time |TS| +starts: each stripe reads its two on-disk directory copies, picks the newer +valid one, and then runs recovery (``StripeSM::recover_data``) to replay +the fragments written since the last directory sync. For a large cache this is +the dominant cost of a restart -- the cache is not online, and therefore not +serving from cache, until it finishes. + +The directory itself, however, is purely a function of state |TS| already had +in memory in the previous process. If that memory could *survive* the process +restart, the new process could attach it and come online immediately, skipping +both the disk read and recovery. + +The shared-memory fast-restart feature does exactly that. 
It hosts each +stripe's ``Directory::raw_dir`` buffer in a POSIX shared-memory segment +(:manpage:`shm_open(3)`, on Linux backed by ``tmpfs`` under ``/dev/shm``). +Because the segment is owned by the kernel and not by the process, it outlives +an orderly ``traffic_server`` exit. The next start re-maps the existing segment +in milliseconds instead of rebuilding from disk. + +Design principles +================= + +The feature is built around two non-negotiable invariants. + +**The on-disk cache is always the source of truth.** The shared-memory +directory is *only* an optimization of restart time. The data fragments +themselves are never kept in shared memory -- they are read from disk on demand +exactly as before. The shared segment holds the directory index and nothing +else. + +**Recovery is binary.** The shared segment is either trustworthy enough to +attach wholesale, or it is dropped and the stripe rebuilds from disk through +the existing cold-start path. There is no attempt to repair, partially trust, +checksum, or torn-write-detect the segment. Every gate described below is a +fail-closed test: if anything is wrong or even ambiguous, the answer is "drop +and rebuild," which is always correct because the disk is authoritative. + +This keeps the trusted code small. The fast path adds no new durability +mechanism; it borrows the one the cache already has. Whenever the shared +segment is unavailable for any reason, |TS| takes precisely the path it takes +today after an unclean shutdown. + +Object layout +============= + +The feature uses two kinds of shared-memory object, defined in +:ts:git:`src/iocore/cache/CacheShmLayout.h`. + +.. code-block:: text + + POSIX shared memory (e.g. /dev/shm on Linux) + + control one per traffic_server instance + +-------------------------------------------------------------+ + | magic "ATS-SHM\0" schema_version abi_hash | + | storage_signature clean_shutdown owner_pid | + | stripe_count | + | stripes[0 .. 
MAX_STRIPES-1]: | + | { shm_name, raw_dir_size, stripe_key_hash } | + +-------------------------------------------------------------+ + | | | + v v v + s0 s1 s2 per-stripe raw_dir + +-----------+ +-----------+ +-----------+ + | header | | header | | header | StripeHeaderFooter + | dir[] | | dir[] | | dir[] | directory entries + | footer | | footer | | footer | + +-----------+ +-----------+ +-----------+ + +The control segment +------------------- + +There is one control segment per instance, named ``<prefix>control``. It is a +fixed-size ``cache_shm::CacheShmControl`` -- a header plus a table of +up to ``MAX_STRIPES`` (256) ``cache_shm::StripeEntry`` rows. A +``static_assert`` keeps the whole control segment under 32 KiB. Its fields: + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Field + - Purpose + * - ``magic`` + - ``"ATS-SHM\0"``. Identifies a |TS| control segment and is the first + thing checked on attach. + * - ``schema_version`` + - The on-shm wire-format version. Bumped whenever the meaning of the + layout changes; a mismatch drops the segment. + * - ``abi_hash`` + - A compile-time fingerprint of the binary's directory structures (see + ``CacheShm::abi_hash``). A mismatch -- e.g. after an upgrade that + changed ``Dir`` -- drops the segment. + * - ``storage_signature`` + - A fingerprint of the ``storage.config`` topology. **Not** a hard + gate; see `Storage changes and partial attach`_. + * - ``clean_shutdown`` + - ``1`` only between a clean shutdown and the next attach. ``0`` at all + other times, including throughout a running process, so a crash leaves + it ``0``. + * - ``owner_pid`` + - PID of the process currently mapping the segment read-write, or ``0`` + when none. Backs the concurrent-attach guard. + * - ``stripe_count`` + - High-water mark of used rows in ``stripes[]``. + * - ``stripes[]`` + - One row per stripe: its segment name, the segment's byte size, and the + 64-bit stripe identity hash used to match a stripe to its prior segment. 
+ +Per-stripe directory segments +----------------------------- + +Each stripe's directory lives in its own segment, ``<prefix>s<N>``. The mapped +region *is* the stripe's ``Directory::raw_dir``: the +:cpp:class:`StripeHeaderFooter` header, the array of :cpp:class:`Dir` entries, +and the footer, in exactly the same byte layout the cache writes to disk. A +stripe reads and writes its directory through this mapping for the entire run, +so the segment is continuously current -- there is no separate "flush to shared +memory" step. + +Naming +------ + +All names derive from :ts:cv:`proxy.config.cache.shm.name_prefix`, which is just +the middle word (default ``ats``). |TS| frames that word as ``/<word>-`` -- the +leading ``/`` that POSIX shared memory requires and the trailing ``-`` separator +are supplied by ``cache_shm::normalize_name_prefix``, not the operator, +so neither can be mis-typed; any stray framing carried over from an older config +(for example a literal ``/ats-``) is trimmed first, so it can never become an +invalid embedded-slash name like ``//ats--``. With the default word the framed +prefix is ``/ats-``: the control segment is ``/ats-control`` and stripe segments +are ``/ats-s<N>`` where ``N`` is a per-instance slot index. Names are kept under +``cache_shm::MAX_SHM_NAME_LEN`` (31) characters because macOS caps POSIX +shared-memory names (``PSHMNAMLEN``) at 31 including the leading ``/``; keeping +to that limit makes the same naming work on Linux and macOS. Instances sharing +a host **must** use distinct words so their segments do not collide. + +Note that the stripe segment name is just a slot label. A stripe is matched to +its prior segment by ``stripe_key_hash`` (a 64-bit FNV-1a of the stripe's +``hash_text``), **not** by name or index, so a span going offline can shift +slot numbers without breaking the identity match. 
+ +Startup +======= + +``CacheShm::initialize`` runs from +``CacheProcessor::start_internal``, after the :cpp:class:`Store` is +read but before any :cpp:class:`Stripe` is constructed. It loads the +configuration, then opens the control segment and selects one of three modes: + +.. list-table:: + :header-rows: 1 + :widths: 22 78 + + * - Mode + - Meaning + * - ``Disabled`` + - The feature is off (or a fatal precondition failed, such as a name that + is too long or losing the concurrent-attach race). Stripes use the + normal heap/hugepage directory; behavior is identical to stock |TS|. + * - ``AttachExisting`` + - A trustworthy prior control segment exists. Stripes attach their prior + segment by identity, or create a fresh one where there is no match. + * - ``CreateFresh`` + - No usable prior control segment. A new one is created and every stripe + segment is created empty (the cold path, but now shared-memory-backed + for *next* time). + +Trust gates +----------- + +When a prior control segment exists, ``initialize`` applies these gates in +order. The first failure drops the entire control segment (unlinking every +stripe segment it lists) and falls through to ``CreateFresh``: + +.. list-table:: + :header-rows: 1 + :widths: 26 74 + + * - Gate + - Drops the segment when... + * - concurrent-attach guard + - another live process is mapping the segment (see below). This actually + disables shared memory for the run rather than dropping -- the live + owner's segment must be left intact. + * - ``magic`` + - the magic bytes do not match (not our segment, or corrupt). + * - ``schema_version`` + - the on-shm format version differs from this binary's. + * - ``abi_hash`` + - the binary's directory structures differ from the writer's (e.g. an + upgrade changed ``Dir``, ``StripeHeaderFooter``, ``DIR_DEPTH``, ...). + * - ``clean_shutdown`` + - the previous run did not set it to ``1`` -- i.e. it crashed or was + killed. 
A crash may have left directory entries pointing at fragments + that were never flushed, so no stripe can safely skip recovery. + +If every gate passes, ``initialize`` adopts the segment: it records itself as +``owner_pid``, sets ``clean_shutdown = 0`` (so a crash *this* run drops the +segment next time), ``msync``\ s the header, and enters ``AttachExisting``. The +per-stripe work then happens lazily as each stripe initializes. + +Concurrent-attach guard +----------------------- + +Two ``traffic_server`` processes must never map the same directory read-write; +the second would corrupt the first's live index. ``clean_shutdown`` is no help +here -- it says nothing about a process that is *currently* running. The guard +is therefore based on ownership, with two layers: + +* **flock.** ``initialize`` takes a non-blocking exclusive ``flock`` on the + control-segment fd and holds it for the entire process lifetime + (``g_control_fd``). The kernel releases it automatically on exit *or crash*, + so it is self-healing. If the lock is already held + (``LockResult::HeldByOther``), a live owner exists and the new process + disables shared memory for its run. This is authoritative on Linux/``tmpfs``. + +* **owner_pid liveness.** macOS POSIX shared memory does not honor ``flock`` + (``LockResult::Unsupported``). There, the guard falls back to the recorded + ``owner_pid``: if it names a live process other than ourselves + (``CacheShm::process_is_alive``, via ``kill(pid, 0)``), the new + process disables shared memory. A clean shutdown clears ``owner_pid`` to + ``0``; a crash leaves a stale pid, but a crash also leaves + ``clean_shutdown = 0``, so the segment is dropped by that gate anyway. + +A symmetric check guards the ``CreateFresh`` path: after creating the fresh +control segment, ``initialize`` takes the lock, and if it lost a creation race +to another starting process it backs out and disables shared memory for the +run. 
+ +Per-stripe attach and the fast path +==================================== + +For each stripe, ``Stripe::_init_directory`` asks +``CacheShm::attach_or_create_stripe`` for its ``raw_dir`` *before* +falling back to the hugepage / aligned-heap allocation: + +.. code-block:: cpp + + this->directory.raw_dir = CacheShm::attach_or_create_stripe(hash_text.get(), directory_size); + if (this->directory.raw_dir == nullptr) { + // shm disabled or attach/create failed -> hugepage, then aligned heap + } + +``attach_or_create_stripe`` looks up the stripe by ``stripe_key_hash`` in the +control table: + +* **Match found** (and the recorded size matches): map the existing segment and + return it. This is the segment the previous run left behind. +* **No match**: reserve a fresh table slot and create a new, zero-filled + segment. + +A freshly created segment has a zero header magic, so the fast-attach gate +below rejects it and ``StripeSM::init`` falls through to the normal disk +read, which repopulates the directory in place. + +The fast-attach gate +-------------------- + +In ``AttachExisting`` mode, when ``raw_dir`` came from shared memory, +``StripeSM::init`` checks whether the in-segment directory can be trusted +without reading disk: + +#. ``header->magic`` and ``footer->magic`` are both ``STRIPE_MAGIC``; +#. the directory version is within + ``[CACHE_DB_MAJOR_VERSION_COMPATIBLE, CACHE_DB_MAJOR_VERSION]``; +#. ``Stripe::_shm_directory_is_valid`` passes (see below). + +When all three hold, the stripe skips both the disk read **and** +``StripeSM::recover_data`` -- which would otherwise rescan the tail and +discard the very entries the shared segment preserved -- and jumps straight to +the post-recovery state (``sector_size``, ``scan_pos``, +``periodic_scan``, then ``StripeSM::dir_init_done``), +mirroring the tail of ``handle_recover_write_dir()``. 
It logs:: + + attaching cached directory from shm for '<hash_text>' (fast restart, recovery skipped) + +If any check fails, it logs ``shm directory invalid ...; falling back to disk +read`` and proceeds exactly as a cold start would. + +Bounds-validating a trusted segment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The magic/version checks confirm the segment *looks like* a directory, but a +stale-yet-magic-valid segment could still present offsets that would turn into +out-of-bounds disk I/O. ``Stripe::_shm_directory_is_valid`` is a +defensive bounds check over the trusted header fields: + +* ``sector_size`` is non-zero and no larger than ``STORE_BLOCK_SIZE``; +* ``write_pos``, ``last_write_pos`` and ``agg_pos`` all lie within the stripe's + data region (``[start, skip + len]``); +* every per-segment free-list head indexes a ``Dir`` entry within its segment + (walking a free list from an out-of-range head would run off the end). + +A failure here is treated like any other attach miss: drop to the disk read and +recover. It is cheap insurance against a corrupted or version-skewed segment +that slipped past the coarse magic check. + +Storage changes and partial attach +=================================== + +A change to ``storage.config`` does **not** invalidate the whole control +segment. ``storage_signature`` is recorded and used only to phrase the startup +log line ("partial -- storage changed"); it is not a trust gate. The actual +reconciliation is per stripe, driven by identity: + +* A stripe whose ``stripe_key_hash`` still matches a table entry of the right + size attaches its prior segment as usual. +* A stripe that is new, relocated, or resized finds no match and creates a + fresh segment (then loads from disk). +* A table entry that *no* stripe claimed this run is an **orphan** -- its stripe + left the cache (a span was dropped, or a disk failed to open). 
+ +``CacheShm::finalize_attach``, called from +``CacheProcessor::cacheInitialized`` once every stripe has initialized, +reclaims the orphans: it unlinks each unclaimed segment, tombstones its slot for +reuse, and trims trailing tombstones so ``stripe_count`` tracks the live +high-water mark. + +One guard matters here: if **zero** stripes claimed a segment this run, +``finalize_attach`` leaves every entry intact. Zero claims cannot be +distinguished from an aborted init (for example a transient ``volume.config`` +error), and reclaiming a valid cache's segments would be far worse than leaking +them for one run. + +Shutdown +======== + +A clean shutdown is what makes the next start fast, so the directory must be +made final and the segment marked clean -- in that order. + +Wiring +------ + +On a clean exit, ``AutoStopCont::mainEvent`` calls +``sync_cache_dir_on_shutdown()`` whenever the cache is initialized. +``sync_cache_dir_on_shutdown`` stops every stripe (taking +each stripe mutex, so all writers are quiesced), and only then calls +``CacheShm::mark_clean_shutdown``, which sets ``clean_shutdown = 1``, +clears ``owner_pid`` to ``0``, and ``msync``\ s the header. When the feature is +disabled, ``mark_clean_shutdown`` is a no-op (there is no control segment), so +the shutdown path is unchanged for a stock |TS|. + +Skipping the on-disk directory write +------------------------------------ + +For a shared-memory-backed stripe, ``StripeSM::shutdown`` still flushes +the aggregation buffer (so pending *content* reaches disk) but then **skips the +on-disk directory write** entirely: + +.. code-block:: text + + Dir : shm-backed, skipping on-disk directory write + +The shared segment is already the durable copy of the directory and is attached +directly next start, so writing the A/B copies to disk would be pure waste. 
The +trade-off is deliberate and safe: if the segment is later dropped for any +reason, the on-disk A/B copies plus ``StripeSM::recover_data`` reconcile +the tail -- the same path an unclean restart already takes. + +Invalidating on flush failure +----------------------------- + +If the aggregation-buffer flush at shutdown fails (e.g. the disk went bad), the +on-disk content no longer matches the directory, so the shared segment must not +be trusted next start. ``StripeSM::shutdown`` calls +``CacheShm::invalidate_stripe_directory``, which zeroes the in-segment +header magic and ``msync``\ s it. Next start, the fast-attach gate rejects that +one stripe on the magic check and it reloads from disk and recovers, while the +other stripes still fast-attach. + +Crash and recovery summary +========================== + +The state machine reduces to: *the segment is attached only when it is provably +consistent, and dropped otherwise.* + +.. list-table:: + :header-rows: 1 + :widths: 34 66 + + * - Event between runs + - Next start + * - Clean shutdown, unchanged binary & storage + - Fast attach. Recovery skipped. Cache online in milliseconds. + * - Crash / ``SIGKILL`` + - ``clean_shutdown`` still ``0`` -> drop, rebuild from disk + recover. + * - Binary upgrade changing directory structures + - ``abi_hash`` mismatch -> drop, rebuild. + * - Schema bump + - ``schema_version`` mismatch -> drop, rebuild. + * - ``storage.config`` change + - Control segment kept; matching stripes fast-attach, changed stripes + rebuild, orphans reclaimed. + * - Per-stripe shutdown flush failed + - That stripe's segment was invalidated -> it rebuilds; others + fast-attach. + * - Another live owner using the prefix + - Refuse to attach; shared memory disabled for this run. + +In every "drop/rebuild" row, |TS| behaves exactly as it does today without the +feature -- the fast path is the only thing lost. 
+ +Huge pages +========== + +The large directory segments make page-table teardown at process exit +non-trivial: ``exit_mmap`` walks O(number of PTEs), which for multi-gigabyte +directories can cost seconds. Backing the mapping with huge pages cuts the PTE +count ~512x and the teardown cost with it. + +When :ts:cv:`proxy.config.cache.shm.use_hugepages` is set, |TS| advises +transparent huge pages on the mapping with ``madvise(MADV_HUGEPAGE)``. +``MAP_HUGETLB`` is deliberately **not** used: ``shm_open`` fds are ``tmpfs`` +backed, and ``MAP_HUGETLB`` requires a ``hugetlbfs`` fd, so it always fails with +``EINVAL``. The advice requires shmem THP to be enabled on the host (for +example ``/sys/kernel/mm/transparent_hugepage/shmem_enabled`` set to ``advise`` +or ``always``, or the ``tmpfs`` mounted with ``huge=advise``). When huge pages +are unavailable the ``madvise`` simply logs a debug line under the +``cache_shm`` tag and the kernel uses base pages, so enabling the setting is +always safe. + +Concurrency model +================= + +Stripes initialize concurrently across the AIO/disk threads, so the +control-table bookkeeping is locked, but the slow shared-memory syscalls are +kept out of the critical section: + +* ``g_table_mutex`` guards the control-segment stripe table and the per-run + claim bookkeeping. ``attach_or_create_stripe`` decides what to do (reuse a + table slot or reserve a fresh one) under the lock, then **drops it** before + ``shm_open`` / ``ftruncate`` / ``mmap``. Each stripe owns a distinct segment, + so the syscalls never touch another thread's segment. Holding the lock across + them would serialize every disk thread's init. +* ``g_pointers_mutex`` guards the set of pointers handed out, so + ``CacheShm::is_shm_pointer`` (used to tell a shm-backed directory from + a heap-allocated one, e.g. to skip the redundant on-disk directory write) is + thread-safe. 
+* Slot reservation tombstones a slot if the create later fails + (``release_reserved_slot``), so a failed create cannot strand a half-built + table entry. + +Disabling the feature: stale-segment purge +========================================== + +Running with the feature **disabled** after it had been enabled is hazardous in +two ways: the leftover segments keep consuming memory the disabled instance +never reads, and a later re-enabled run could fast-attach a directory that went +stale while |TS| ran disabled (writing only to disk). To address this, +:ts:cv:`proxy.config.cache.shm.purge_stale_on_start` (opt-in) makes a disabled +start best-effort remove any leftover segments for the configured prefix. + +The purge shares one primitive with the operator tooling (see below): +``cache_shm::purge_segments`` in :ts:git:`src/iocore/cache/CacheShmPurge.h`. It +enumerates the stripe table and unlinks every stripe segment plus the control +object, returning a structured ``PurgeReport`` that each caller renders in its +own format. It refuses to unlink anything owned by a live process (the same +flock + ``owner_pid`` guard used at attach), and it never blocks startup. An +already-gone segment (``ENOENT``) is the desired end state and is not counted as +a failure. + +Operator tooling: ``traffic_ctl cache shm`` +============================================ + +Because crash-leftover segments may need inspecting when no live process is +around to query, the tooling acts on the shared-memory objects **directly**, via +``shm_open``, rather than over JSON-RPC. For that reason ``traffic_ctl`` does +**not** link the cache library; the small amount of shared logic lives in +header-only form (:ts:git:`src/iocore/cache/CacheShmLayout.h` and +``CacheShmPurge.h``). 
+ +``traffic_ctl cache shm status [--prefix P]`` + Maps the control segment read-only and prints its header (magic, + schema/abi/storage fingerprints, ``clean_shutdown``, and whether + ``owner_pid`` names a live process) followed by the stripe table, flagging + each segment ``present`` / ``MISSING`` and each free slot as a tombstone. + +``traffic_ctl cache shm clear [--prefix P]`` + Removes the segments via the shared ``purge_segments`` primitive. It + **refuses** to clear segments owned by a live ``traffic_server`` (stop it + first), so it cannot orphan a running instance's fast restart. This is the + on-demand equivalent of ``purge_stale_on_start``. + +.. _cache-shm-configuration: + +Configuration +============= + +All settings are under ``proxy.config.cache.shm`` and take effect only on a +restart (``RECU_RESTART_TS``). See :ref:`admin-cache-shm-fast-restart` for the +full administrator-facing descriptions. + +.. list-table:: + :header-rows: 1 + :widths: 38 12 50 + + * - Setting + - Default + - Effect + * - :ts:cv:`proxy.config.cache.shm.enabled` + - ``0`` + - Master switch. ``0`` = always read the directory from disk (stock + behavior). + * - :ts:cv:`proxy.config.cache.shm.name_prefix` + - ``ats`` + - Middle word of the shared-memory object names; framed as ``/<word>-`` + (the ``/`` and ``-`` are added by |TS|). Give co-located instances + distinct words. + * - :ts:cv:`proxy.config.cache.shm.use_hugepages` + - ``0`` + - Advise transparent huge pages on the directory mappings. Safe when + unavailable; falls back to base pages. + * - :ts:cv:`proxy.config.cache.shm.purge_stale_on_start` + - ``0`` + - When the feature is disabled, best-effort remove leftover segments for + the prefix at startup. + +Platform considerations +======================= + +* **Linux** is the primary target: ``tmpfs`` (``/dev/shm``) backs the segments, + ``flock`` is authoritative for the concurrent-attach guard, and shmem THP + provides the huge-page teardown win. 
+* **macOS** is supported for development and testing on a best-effort basis. + POSIX shared-memory names are limited to 31 characters (the reason for + ``MAX_SHM_NAME_LEN``). ``flock`` is not honored on shm fds, so the + concurrent-attach guard is best-effort there: it relies on the ``owner_pid`` + liveness backstop alone (the ``kill(pid, 0)`` check), which narrows the window + but cannot make the attach atomic the way ``flock`` does on Linux. The kernel + also rounds a segment up to a page boundary, so ``open_and_map_shm`` accepts + any size in ``[requested, page-up]``. +* The feature is inert at the default :ts:cv:`proxy.config.cache.shm.enabled` + ``0``: no segments are created or attached on any platform, and behavior is + identical to stock |TS|. +* Realistic multi-gigabyte directory sizes, the ``MADV_HUGEPAGE`` teardown win, + and the restart-time benchmarks are Linux-only -- the same platform boundary + |TS| already has for its hugepage directory allocation. (Recall ``MAP_HUGETLB`` + is never used here; see `Huge pages`_.) + +Testing +======= + +The pure trust-gate logic is unit-tested in +:ts:git:`src/iocore/cache/unit_tests/test_CacheShm.cc` (ABI-hash stability, the +storage-signature topology sensitivity, control-header round-trip, the macOS +name-length limit, and the process-liveness check). + +The end-to-end behavior is covered by autests in +:ts:git:`tests/gold_tests/cache/`, one scenario each: + +.. list-table:: + :header-rows: 1 + :widths: 42 58 + + * - Test + - Scenario + * - ``cache_shm_fast_restart`` + - Directory survives a clean shutdown and is fast-attached. + * - ``cache_shm_data_integrity`` + - Objects cached before shutdown are served byte-identical from cache + after the attach (including multi-fragment objects). + * - ``cache_shm_unclean_shutdown`` + - ``SIGKILL`` leaves the segment dirty; next start drops and rebuilds. + * - ``cache_shm_schema_mismatch`` + - A poked ``schema_version`` is dropped, never attached.
+ * - ``cache_shm_storage_mismatch`` + - A changed storage layout keeps the control segment, creates a fresh + relocated stripe, and reclaims the orphan. + * - ``cache_shm_bad_disk_dropped`` + - Dropping a disk fast-attaches the survivors and reclaims the removed + disk's segment. + * - ``cache_shm_concurrent_attach`` + - A second ``traffic_server`` refuses to attach over a live owner and runs + with shared memory disabled. + * - ``cache_shm_purge_on_disable`` + - ``purge_stale_on_start`` removes leftover segments on a disabled start. + +The schema/storage tests drive their gates by editing ``/dev/shm`` directly +(``shm_poke.py``), which is a Linux facility; they have no macOS condition. + +Limitations and non-goals +========================= + +* The feature accelerates restart only; it does not change steady-state cache + behavior, durability, or the on-disk format. +* Only the directory is shared, never cached content. +* There is no migration or repair of an untrusted segment -- the disk is + authoritative and rebuilding from it is always the fallback. +* A single host may run multiple instances only with distinct + ``name_prefix`` values. + +Source map +========== + +.. list-table:: + :header-rows: 1 + :widths: 42 58 + + * - File + - Role + * - :ts:git:`src/iocore/cache/CacheShm.h` / ``CacheShm.cc`` + - The ``CacheShm`` facade: initialize, attach/create, finalize, mark-clean, + invalidate, and the trust-gate fingerprints. + * - :ts:git:`src/iocore/cache/CacheShmLayout.h` + - The on-shm control-segment layout, shared with tooling. + * - :ts:git:`src/iocore/cache/CacheShmPurge.h` + - The header-only enumerate-and-unlink primitive and its owner guard, + shared by the disabled-start purge and ``traffic_ctl``. + * - :ts:git:`src/iocore/cache/Stripe.cc` + - The shared-memory ``raw_dir`` allocation and ``_shm_directory_is_valid``. 
+ * - :ts:git:`src/iocore/cache/StripeSM.cc` + - The fast-attach gate in ``StripeSM::init`` and the shutdown-write skip / + invalidate in ``StripeSM::shutdown``. + * - :ts:git:`src/iocore/cache/CacheProcessor.cc` + - ``initialize`` / ``finalize_attach`` call sites in ``CacheProcessor``. + * - :ts:git:`src/iocore/cache/CacheDir.cc` + - ``mark_clean_shutdown`` from ``sync_cache_dir_on_shutdown``. + * - :ts:git:`src/traffic_ctl/CacheShmCommand.cc` + - The ``traffic_ctl cache shm status`` / ``clear`` commands. diff --git a/src/iocore/cache/AggregateWriteBuffer.cc b/src/iocore/cache/AggregateWriteBuffer.cc index b761d656170..1407d5b2693 100644 --- a/src/iocore/cache/AggregateWriteBuffer.cc +++ b/src/iocore/cache/AggregateWriteBuffer.cc @@ -49,7 +49,6 @@ AggregateWriteBuffer::flush(int fd, off_t write_pos) const { int r = pwrite(fd, this->_buffer, this->_buffer_pos, write_pos); if (r != this->_buffer_pos) { - ink_assert(!"flushing agg buffer failed"); return false; } return true; diff --git a/src/iocore/cache/AggregateWriteBuffer.h b/src/iocore/cache/AggregateWriteBuffer.h index ad99b03ce04..22951fb797a 100644 --- a/src/iocore/cache/AggregateWriteBuffer.h +++ b/src/iocore/cache/AggregateWriteBuffer.h @@ -120,7 +120,7 @@ class AggregateWriteBuffer * @param write_pos The offset at which to write the buffer data. * @return Returns true if all bytes were flushed, otherwise false. */ - bool flush(int fd, off_t write_pos) const; + [[nodiscard]] bool flush(int fd, off_t write_pos) const; /** * Copy part of the buffer. 
diff --git a/src/iocore/cache/CMakeLists.txt b/src/iocore/cache/CMakeLists.txt index f8a252b430c..dd4dc9f6a24 100644 --- a/src/iocore/cache/CMakeLists.txt +++ b/src/iocore/cache/CMakeLists.txt @@ -27,6 +27,7 @@ add_library( CacheHttp.cc CacheProcessor.cc CacheRead.cc + CacheShm.cc CacheVC.cc CacheWrite.cc HttpTransactCache.cc @@ -50,7 +51,7 @@ target_include_directories(inkcache PRIVATE ${CMAKE_SOURCE_DIR}/lib) target_link_libraries( inkcache PUBLIC ts::aio ts::hdrs ts::inkevent ts::tscore - PRIVATE ts::config ts::tsapibackend fastlz ZLIB::ZLIB + PRIVATE ts::config ts::tsapibackend fastlz ZLIB::ZLIB rt::rt ) if(HAVE_LZMA_H) @@ -92,6 +93,7 @@ if(BUILD_TESTING) add_cache_test(Update_Header unit_tests/test_Update_header.cc) add_cache_test(CacheStripe unit_tests/test_Stripe.cc) add_cache_test(CacheAggregateWriteBuffer unit_tests/test_AggregateWriteBuffer.cc) + add_cache_test(CacheShm unit_tests/test_CacheShm.cc) # Unit Tests without unit_tests/main.cc add_executable(test_ConfigVolumes unit_tests/test_ConfigVolumes.cc) diff --git a/src/iocore/cache/CacheDir.cc b/src/iocore/cache/CacheDir.cc index 99e9fba47b4..beec83c39a8 100644 --- a/src/iocore/cache/CacheDir.cc +++ b/src/iocore/cache/CacheDir.cc @@ -27,6 +27,7 @@ #include "P_CacheInternal.h" #include "PreservationTable.h" #include "Stripe.h" +#include "CacheShm.h" #include "tscore/hugepages.h" #include "tscore/Random.h" @@ -947,6 +948,11 @@ sync_cache_dir_on_shutdown() thr.join(); } + // All writers are now stopped (every stripe's mutex was held and released by the + // shutdown threads above), so the directory is final -- only now is it safe to + // mark the shm control segment clean. 
+ CacheShm::mark_clean_shutdown(); + Dbg(dbg_ctl_cache_dir_sync, "shutdown sync done"); } diff --git a/src/iocore/cache/CacheProcessor.cc b/src/iocore/cache/CacheProcessor.cc index 1fa158d9889..c178aeb959d 100644 --- a/src/iocore/cache/CacheProcessor.cc +++ b/src/iocore/cache/CacheProcessor.cc @@ -28,6 +28,7 @@ #include "P_CacheInternal.h" #include "StripeSM.h" #include "Stripe.h" +#include "CacheShm.h" // Must be included after P_CacheInternal.h. #include "P_CacheHosting.h" @@ -187,6 +188,9 @@ CacheProcessor::start_internal(int flags) gndisks = theCacheStore.n_spans; gdisks.resize(gndisks); + // Must run before any Stripe is constructed so each can attach/create its segment. + CacheShm::initialize(theCacheStore); + // Temporaries to carry values between loops char **paths = static_cast(alloca(sizeof(char *) * gndisks)); memset(paths, 0, sizeof(char *) * gndisks); @@ -1495,6 +1499,9 @@ CacheProcessor::cacheInitialized() } } + // All stripes have claimed their segments; reclaim any orphan (e.g. a dropped disk). + CacheShm::finalize_attach(); + if (caches_ready) { Dbg(dbg_ctl_cache_init, "CacheProcessor::cacheInitialized - caches_ready=0x%0X, gnvol=%d", (unsigned int)caches_ready, gnstripes.load()); diff --git a/src/iocore/cache/CacheShm.cc b/src/iocore/cache/CacheShm.cc new file mode 100644 index 00000000000..5618fa6d28a --- /dev/null +++ b/src/iocore/cache/CacheShm.cc @@ -0,0 +1,748 @@ +/** @file + + Shared-memory-backed cache directory for fast restart. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include "CacheShm.h"
+#include "CacheShmLayout.h"
+#include "CacheShmPurge.h"
+
+#include "P_CacheDir.h"
+#include "iocore/cache/Store.h"
+
+#include "records/RecCore.h"
+#include "tscore/Diags.h"
+#include "tscore/HashFNV.h"
+#include "tscore/ink_align.h"
+#include "tscore/ink_memory.h"
+#include "tscore/ink_string.h"
+#include "tsutil/DbgCtl.h"
+
+// NOTE(review): the header names of the two include groups below are missing in
+// this copy of the patch -- restore them from the original change before building.
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace
+{
+
+// Debug tag for every Dbg() call in this file.
+DbgCtl dbg_ctl{"cache_shm"};
+
+// Names shared with the tooling-facing layout/purge headers, pulled in so the
+// code below can use them unqualified.
+using cache_shm::CACHE_SHM_MAGIC;
+using cache_shm::CACHE_SHM_SCHEMA_VERSION;
+using cache_shm::CacheShmControl;
+using cache_shm::control_segment_name;
+using cache_shm::CONTROL_SIZE;
+using cache_shm::LockResult;
+using cache_shm::MAX_SHM_NAME_LEN;
+using cache_shm::MAX_STRIPES;
+using cache_shm::read_shm_name;
+using cache_shm::StripeEntry;
+using cache_shm::try_lock_control;
+
+// Sanity bound: the control struct (header + stripe table) must stay small.
+constexpr std::size_t MAX_CONTROL_SEGMENT_BYTES = 32 * 1024;
+static_assert(sizeof(CacheShmControl) <= MAX_CONTROL_SEGMENT_BYTES, "control segment unexpectedly large");
+
+// Configuration loaded at initialize() time.
+struct Config {
+  bool enabled = false;              // proxy.config.cache.shm.enabled
+  bool use_hugepages = false;        // proxy.config.cache.shm.use_hugepages
+  bool purge_stale_on_start = false; // proxy.config.cache.shm.purge_stale_on_start
+  std::string name_prefix = "/ats-"; // normalized "/<word>-" (see normalize_name_prefix); set in load_config.
+};
+
+// Process-wide copy of the settings, filled in by load_config().
+Config g_config;
+
+// Live state for the open control segment.
+CacheShmControl *g_control = nullptr;
+std::string g_control_name;
+
+// Holds the control segment's exclusive flock for the process lifetime; the OS
+// releases it on exit. Only set on the path that owns the segment.
+ats_scoped_fd g_control_fd;
+
+// shm pointers we returned (mapped to their length), so the Stripe destructor can
+// choose munmap vs ats_free and detach_stripe can munmap the right span.
+std::mutex g_pointers_mutex;
+std::unordered_map<char *, std::size_t> g_pointers;
+
+// Guards the control-segment stripe table and the per-run claim bookkeeping below
+// (stripes initialize concurrently across disk threads).
+std::mutex g_table_mutex;
+
+// Per-run partial-attach bookkeeping, indexed in lockstep with g_control->stripes[].
+// An entry still unclaimed once init completes is an orphan reclaimed by
+// finalize_attach(). Process-local, reset each run.
+bool g_entry_claimed[MAX_STRIPES] = {};
+uint32_t g_claims_this_run = 0;
+
+/// Fold the raw bytes of a 64-bit value into an FNV-1a hash in progress.
+void
+fnv_update(ATSHash64FNV1a &h, uint64_t v)
+{
+  h.update(&v, sizeof v);
+}
+
+/// Full 64-bit stripe identity used to match a stripe to its prior shm segment.
+/// @param stripe_key NUL-terminated stripe key; hashed without the terminator.
+uint64_t
+compute_stripe_key_hash(const char *stripe_key)
+{
+  ATSHash64FNV1a hash;
+  hash.update(stripe_key, std::strlen(stripe_key));
+  return hash.get();
+}
+
+/// Build a stripe shm name from its per-host index (unique, so names never
+/// collide). Matching to a prior segment uses the key hash, not this name.
+///
+/// The result is `prefix + "s" + index`, at most MAX_SHM_NAME_LEN - 1 characters.
+/// When that limit would be exceeded the *prefix* is shortened and the index
+/// suffix is kept whole: cutting the tail instead would drop index digits and
+/// hand two different stripes the same segment name.
+std::string
+build_stripe_shm_name(const std::string &prefix, uint32_t stripe_index)
+{
+  const std::string suffix = "s" + std::to_string(stripe_index);
+  const std::size_t budget = MAX_SHM_NAME_LEN - 1;
+  // Characters left for the prefix once the suffix is accounted for.
+  const std::size_t prefix_room = budget > suffix.size() ? budget - suffix.size() : 0;
+
+  std::string name = prefix;
+  if (name.size() > prefix_room) {
+    name.resize(prefix_room);
+  }
+  name += suffix;
+  return name;
+}
+
+// Named flags for open_and_map_shm so call sites read `ShmAccess::Create` /
+// `HugePages::Off` and the two can't be transposed.
+enum class ShmAccess { Open, Create };
+enum class HugePages { Off, On };
+
+/// Open or create a shm segment of `size` bytes and mmap it. Returns nullptr
+/// on failure.
When `out_fd` is non-null, the open fd is handed back to the
+/// caller (left open) so it can hold an flock on the segment; otherwise the fd
+/// is closed once the mapping is established (the mmap survives the close).
+/// When `out_errno` is non-null it receives the failing syscall's errno (0 on
+/// success) so the caller can render a non-opaque diagnostic. A size mismatch on
+/// the Open path has no syscall errno and is reported as EINVAL.
+///
+/// A segment this call was asked to create is unlinked again if it cannot be
+/// sized or mapped, so a failed create does not leave an object behind.
+///
+/// @param name      shm object name passed to shm_open.
+/// @param size      number of bytes to map (and, for Create, to ftruncate to).
+/// @param access    Open an existing object, or Create (O_CREAT) one.
+/// @param hugepages when On and MADV_HUGEPAGE exists, advise THP on the mapping.
+void *
+open_and_map_shm(const std::string &name, std::size_t size, ShmAccess access, [[maybe_unused]] HugePages hugepages,
+                 int *out_fd = nullptr, int *out_errno = nullptr)
+{
+  // Every failure exit goes through here so `out_errno` is never left at 0 when
+  // nullptr is returned.
+  auto fail = [out_errno](int err) -> void * {
+    if (out_errno != nullptr) {
+      *out_errno = err;
+    }
+    return nullptr;
+  };
+
+  if (out_errno != nullptr) {
+    *out_errno = 0;
+  }
+  int oflags = O_RDWR;
+  if (access == ShmAccess::Create) {
+    oflags |= O_CREAT;
+  }
+
+  ats_scoped_fd fd{shm_open(name.c_str(), oflags, 0600)};
+  if (fd < 0) {
+    int e = errno;
+    Dbg(dbg_ctl, "shm_open(%s, %s) failed: %s", name.c_str(), access == ShmAccess::Create ? "create" : "open", strerror(e));
+    return fail(e);
+  }
+
+  if (access == ShmAccess::Create) {
+    if (ftruncate(fd, size) < 0) {
+      int e = errno;
+      Warning("ftruncate(%s, %zu) failed: %s", name.c_str(), size, strerror(e));
+      shm_unlink(name.c_str());
+      return fail(e);
+    }
+  } else {
+    // The kernel rounds shm size up to a page boundary (16 KiB on macOS / Apple
+    // Silicon), so accept any size in [requested, page-up].
+    struct stat sb {
+    };
+    std::size_t expected_max = INK_ALIGN(size, ats_pagesize());
+    if (fstat(fd, &sb) < 0) {
+      int e = errno;
+      Dbg(dbg_ctl, "fstat(%s) failed: %s", name.c_str(), strerror(e));
+      return fail(e);
+    }
+    if (sb.st_size < 0 || static_cast<std::size_t>(sb.st_size) < size || static_cast<std::size_t>(sb.st_size) > expected_max) {
+      Dbg(dbg_ctl, "shm %s size mismatch (have %lld, want %zu, max %zu)", name.c_str(), static_cast<long long>(sb.st_size), size,
+          expected_max);
+      return fail(EINVAL);
+    }
+  }
+
+  int prot = PROT_READ | PROT_WRITE;
+  int flags = MAP_SHARED;
+  void *addr = mmap(nullptr, size, prot, flags, fd, 0);
+  if (addr == MAP_FAILED) {
+    int e = errno;
+    Warning("mmap(%s, %zu) failed: %s", name.c_str(), size, strerror(e));
+    if (access == ShmAccess::Create) {
+      // Same cleanup as the ftruncate failure above: do not strand the object.
+      shm_unlink(name.c_str());
+    }
+    return fail(e);
+  }
+
+  // Advise shmem THP for the mapping (cuts page-table teardown at exit). MAP_HUGETLB
+  // is not usable here: shm_open fds are tmpfs-backed, so it always EINVALs. Requires
+  // shmem THP enabled on the host; see the design doc for details.
+#if defined(MADV_HUGEPAGE)
+  if (hugepages == HugePages::On) {
+    if (madvise(addr, size, MADV_HUGEPAGE) != 0) {
+      Dbg(dbg_ctl, "madvise(MADV_HUGEPAGE) on %s failed: %s", name.c_str(), strerror(errno));
+    }
+  }
+#endif
+
+  if (out_fd != nullptr) {
+    *out_fd = fd.release(); // caller owns the fd and keeps it open for flock
+  }
+  return addr;
+}
+
+/// Unlink every stripe segment named in the mapped control table, unmap the
+/// control segment, then unlink the control object itself (by g_control_name).
+/// Leaves g_control null. Return values of shm_unlink/munmap are not checked.
+void
+unlink_all_known_segments()
+{
+  if (g_control != nullptr) {
+    // Bound by MAX_STRIPES as well: stripe_count comes from the segment contents.
+    for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) {
+      std::string name = read_shm_name(g_control->stripes[i].shm_name);
+      if (!name.empty()) {
+        Dbg(dbg_ctl, "shm_unlink stripe %s", name.c_str());
+        shm_unlink(name.c_str());
+      }
+    }
+    munmap(g_control, CONTROL_SIZE);
+    g_control = nullptr;
+  }
+  if (!g_control_name.empty()) {
+    Dbg(dbg_ctl, "shm_unlink control %s", g_control_name.c_str());
+    shm_unlink(g_control_name.c_str());
+  }
+}
+
+// Purge leftover shm segments when shm is disabled this run (opt-in via
+// purge_stale_on_start). Best-effort: logs but never blocks startup.
The
+// enumerate-and-unlink work is shared with `traffic_ctl cache shm clear`
+// (cache_shm::purge_segments); this just renders the result into diags.
+void
+purge_stale_segments(const std::string &prefix)
+{
+  using cache_shm::PurgeOutcome;
+
+  const cache_shm::PurgeReport report = cache_shm::purge_segments(prefix);
+
+  // Outcomes that `break` have per-segment results worth rendering below; every
+  // other outcome is fully reported (or deliberately silent) right here.
+  switch (report.outcome) {
+  case PurgeOutcome::Purged:
+    break;
+  case PurgeOutcome::TooSmall:
+    Warning("cache shm: leftover control segment %s is too small to read (%lld bytes); unlinking it", report.control_name.c_str(),
+            report.segment_size);
+    break; // purge_segments() already unlinked the control object; render the result below.
+  case PurgeOutcome::OwnedByLive:
+    Warning("cache shm: control segment %s is owned by a live process; leaving stale segments in place",
+            report.control_name.c_str());
+    return;
+  case PurgeOutcome::MapFailed:
+    Warning("cache shm: mmap of control segment %s failed while purging: %s", report.control_name.c_str(),
+            strerror(report.sys_errno));
+    return;
+  case PurgeOutcome::OpenFailed:
+    Warning("cache shm: cannot open control segment %s to purge stale segments: %s", report.control_name.c_str(),
+            strerror(report.sys_errno));
+    return;
+  case PurgeOutcome::BadPrefix:
+    // load_config() already warned about a bad prefix; stay quiet here.
+  case PurgeOutcome::NotPresent:
+    return; // ENOENT: shm never used with this prefix.
+  }
+
+  for (const auto &entry : report.unlinked) {
+    if (entry.error == ENOENT) {
+      continue; // already gone: nothing to report
+    }
+    if (entry.error != 0) {
+      Warning("cache shm: failed to unlink %s %s while purging: %s", entry.is_control ? "control segment" : "stripe",
+              entry.name.c_str(), strerror(entry.error));
+    } else {
+      Dbg(dbg_ctl, "purge: unlinked %s %s", entry.is_control ? "control" : "stripe", entry.name.c_str());
+    }
+  }
+
+  Note("cache shm: purged stale segments while disabled (removed %u, %u failure(s), prefix '%s')", report.removed(),
+       report.failures(), prefix.c_str());
+}
+
+// Read the proxy.config.cache.shm.* records into g_config. Returns whether the
+// feature is enabled for this run.
+bool
+load_config()
+{
+  // A missing record reads as 0; any non-zero value turns the flag on.
+  auto flag_is_set = [](const char *record) { return RecGetRecordInt(record).value_or(0) != 0; };
+
+  g_config.enabled = flag_is_set("proxy.config.cache.shm.enabled");
+  g_config.use_hugepages = flag_is_set("proxy.config.cache.shm.use_hugepages");
+  g_config.purge_stale_on_start = flag_is_set("proxy.config.cache.shm.purge_stale_on_start");
+
+  // The operator sets only the middle word; an unset or empty record means "ats".
+  char prefix_buf[256] = {0};
+  const bool have_prefix =
+    RecGetRecordString("proxy.config.cache.shm.name_prefix", prefix_buf, sizeof(prefix_buf)).has_value() && prefix_buf[0] != '\0';
+  const std::string configured = have_prefix ? prefix_buf : "ats";
+
+  // Frame the configured middle word as "/<word>-" so the leading '/' that POSIX
+  // shm_open requires and the '-' separator can never be mis-typed (a carried-over
+  // "/ats-" normalizes back to "/ats-" rather than an invalid "//ats--").
+  g_config.name_prefix = cache_shm::normalize_name_prefix(configured);
+
+  return g_config.enabled;
+}
+
+// Reserve a control-table slot for a stripe about to be created (reusing a
+// tombstone if any, else appending). Marks the slot non-empty so a concurrent
+// create cannot pick the same index; g_entry_claimed stays clear until the segment
+// is mapped. Returns the slot index (and shm name via out_name), or MAX_STRIPES
+// when the table is full. Caller must hold g_table_mutex.
+uint32_t
+reserve_stripe_slot(uint64_t key_hash, std::size_t directory_size, std::string &out_name)
+{
+  // Default to appending after the last used entry; prefer the first tombstone
+  // (empty shm_name) when one exists.
+  uint32_t slot = g_control->stripe_count;
+  bool recycled = false;
+  for (uint32_t i = 0; !recycled && i < g_control->stripe_count && i < MAX_STRIPES; ++i) {
+    if (g_control->stripes[i].shm_name[0] == '\0') {
+      slot = i;
+      recycled = true;
+    }
+  }
+
+  if (!recycled) {
+    if (g_control->stripe_count >= MAX_STRIPES) {
+      Warning("cache shm: stripe count exceeds MAX_STRIPES (%zu); falling back", MAX_STRIPES);
+      return MAX_STRIPES;
+    }
+  }
+
+  out_name = build_stripe_shm_name(g_config.name_prefix, slot);
+  if (!recycled) {
+    g_control->stripe_count++; // appended: grow the table's high-water mark
+  }
+
+  // Fill the entry now so the slot reads as taken to any later reservation.
+  StripeEntry &entry = g_control->stripes[slot];
+  ink_strlcpy(entry.shm_name, out_name.c_str(), sizeof(entry.shm_name));
+  entry.raw_dir_size = directory_size;
+  entry.stripe_key_hash = key_hash;
+  return slot;
+}
+
+// Undo a reserve_stripe_slot() reservation when the segment could not be created.
+// Tombstones the slot (empty shm_name) for reuse. Caller must hold g_table_mutex.
+void
+release_reserved_slot(uint32_t idx)
+{
+  StripeEntry &entry = g_control->stripes[idx];
+  entry.stripe_key_hash = 0;
+  entry.raw_dir_size = 0;
+  entry.shm_name[0] = '\0'; // the empty name is what marks the slot free
+}
+
+// Record a freshly mapped stripe segment as claimed for this run and remember its
+// pointer and length (for is_shm_pointer / invalidate_stripe_directory /
+// detach_stripe). Takes the locks itself so the shm syscalls that produced `p` ran
+// without g_table_mutex held.
+char *
+claim_mapped_stripe(uint32_t idx, void *p, std::size_t size)
+{
+  {
+    // Table bookkeeping: this slot is in use by a live stripe this run.
+    std::scoped_lock lk{g_table_mutex};
+    g_entry_claimed[idx] = true;
+    ++g_claims_this_run;
+  }
+  {
+    // Pointer registry: lets is_shm_pointer()/detach_stripe() recognize `p`.
+    std::scoped_lock plk{g_pointers_mutex};
+    g_pointers.insert({static_cast<char *>(p), size});
+  }
+  return static_cast<char *>(p);
+}
+
+} // namespace
+
+CacheShm::Mode CacheShm::_mode = CacheShm::Mode::Disabled;
+
+/// Fingerprint of the structures stored in shared memory: hashes `tag` (declared
+/// outside this file's visible part), the sizes of Dir, StripeHeaderFooter,
+/// CacheShmControl and StripeEntry, and DIR_DEPTH, SIZEOF_DIR and MAX_STRIPES.
+/// initialize() drops a control segment whose recorded value differs.
+uint64_t
+CacheShm::abi_hash()
+{
+  ATSHash64FNV1a h;
+  h.update(tag.data(), tag.size());
+  fnv_update(h, sizeof(Dir));
+  fnv_update(h, sizeof(StripeHeaderFooter));
+  fnv_update(h, sizeof(CacheShmControl));
+  fnv_update(h, sizeof(StripeEntry));
+  fnv_update(h, DIR_DEPTH);
+  fnv_update(h, SIZEOF_DIR);
+  fnv_update(h, MAX_STRIPES);
+  return h.get();
+}
+
+/// Fingerprint of the configured storage: for every non-null span, hashes its
+/// pathname (when set), block count, offset and hardware sector size, in span
+/// order.
+uint64_t
+CacheShm::storage_signature(const Store &store)
+{
+  ATSHash64FNV1a h;
+  for (unsigned i = 0; i < store.n_spans; ++i) {
+    const Span *span = store.spans[i];
+    if (span == nullptr) {
+      continue;
+    }
+    if (span->pathname) {
+      std::string_view path{span->pathname.get()};
+      h.update(path.data(), path.size());
+    }
+    fnv_update(h, static_cast<uint64_t>(span->blocks));
+    fnv_update(h, static_cast<uint64_t>(span->offset));
+    fnv_update(h, static_cast<uint64_t>(span->hw_sector_size));
+  }
+  return h.get();
+}
+
+/// Load the configuration and settle this run's mode:
+///  - Disabled       feature off, name too long, a live owner holds the control
+///                   segment, or the control segment could not be created;
+///  - AttachExisting an existing control segment passed every trust gate;
+///  - CreateFresh    no usable control segment existed, a new one was created.
+/// On the two enabled paths the control segment is mapped (g_control), its fd and
+/// lock are kept in g_control_fd, owner_pid is set to this process and
+/// clean_shutdown is cleared.
+void
+CacheShm::initialize(const Store &store)
+{
+  if (!load_config()) {
+    _mode = Mode::Disabled;
+    // shm is off this run; reclaim any leftover segments from a prior run (rationale
+    // and guards documented on purge_stale_segments). Opt-in and best-effort.
+    if (g_config.purge_stale_on_start) {
+      purge_stale_segments(g_config.name_prefix);
+    }
+    Dbg(dbg_ctl, "shm disabled");
+    return;
+  }
+
+  g_control_name = control_segment_name(g_config.name_prefix);
+  if (g_control_name.size() >= MAX_SHM_NAME_LEN) {
+    Warning("shm name_prefix too long (control segment name '%s' exceeds %zu chars); shm disabled", g_control_name.c_str(),
+            MAX_SHM_NAME_LEN);
+    _mode = Mode::Disabled;
+    return;
+  }
+
+  const uint64_t expected_abi = abi_hash();
+  const uint64_t expected_signature = storage_signature(store);
+
+  // Try to attach an existing control segment first.
+  int existing_fd = -1;
+  void *existing = open_and_map_shm(g_control_name, CONTROL_SIZE, ShmAccess::Open, HugePages::Off, &existing_fd);
+  if (existing != nullptr) {
+    auto *ctrl = static_cast<CacheShmControl *>(existing);
+
+    // Concurrent-attach guard: refuse shm (and rebuild from disk) if another live
+    // process still owns this segment.
+    int flock_errno = 0;
+    const LockResult lock = try_lock_control(existing_fd, &flock_errno);
+    bool live_owner = false;
+    switch (lock) {
+    case LockResult::Acquired:
+      break; // we hold the exclusive lock, so any prior owner is gone
+    case LockResult::HeldByOther:
+      live_owner = true;
+      break;
+    case LockResult::Unsupported: // macOS POSIX shm: flock is a no-op, fall back to owner_pid
+      Dbg(dbg_ctl, "flock unsupported for control segment %s (errno %d: %s); using owner-pid liveness guard",
+          g_control_name.c_str(), flock_errno, strerror(flock_errno));
+      live_owner = ctrl->owner_pid != 0 && ctrl->owner_pid != static_cast<int>(getpid()) && process_is_alive(ctrl->owner_pid);
+      break;
+    }
+    if (live_owner) {
+      Warning("cache shm: control segment %s has a live owner (pid %d); disabling shm this run to avoid concurrent attach",
+              g_control_name.c_str(), ctrl->owner_pid);
+      munmap(existing, CONTROL_SIZE);
+      close(existing_fd);
+      _mode = Mode::Disabled;
+      return;
+    }
+
+    // Trust gates, checked in order; the first failure drops the whole segment.
+    bool ok = std::memcmp(ctrl->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)) == 0;
+    if (ok && ctrl->schema_version != CACHE_SHM_SCHEMA_VERSION) {
+      Note("cache shm: schema mismatch (%u vs %u), dropping", ctrl->schema_version, CACHE_SHM_SCHEMA_VERSION);
+      ok = false;
+    }
+    if (ok && ctrl->abi_hash != expected_abi) {
+      Note("cache shm: ABI mismatch, dropping");
+      ok = false;
+    }
+
+    // storage_signature is NOT a hard gate (see storage_signature() doc): a
+    // storage.yaml change keeps the segment, each stripe attaches by its own
+    // identity. Refreshed in place below.
+    const bool storage_changed = ok && ctrl->storage_signature != expected_signature;
+
+    if (ok && ctrl->clean_shutdown == 0) {
+      // A crash may have left dir entries pointing at content never flushed, so no
+      // stripe can safely skip recovery -- whole-segment drop.
+      Note("cache shm: previous run did not shutdown cleanly, dropping");
+      ok = false;
+    }
+
+    if (ok) {
+      Note("cache shm: attaching up to %u stripes (fast restart%s)", ctrl->stripe_count,
+           storage_changed ? ", partial -- storage changed" : "");
+      g_control = ctrl;
+      g_control_fd = existing_fd; // hold the exclusive lock for the process lifetime
+      std::memset(g_entry_claimed, 0, sizeof(g_entry_claimed));
+      g_claims_this_run = 0;
+      if (storage_changed) {
+        g_control->storage_signature = expected_signature;
+      }
+      // Become owner and clear clean_shutdown so a crash this run drops shm next time.
+      g_control->owner_pid = static_cast<int>(getpid());
+      g_control->clean_shutdown = 0;
+      msync(g_control, CONTROL_SIZE, MS_SYNC);
+      _mode = Mode::AttachExisting;
+      return;
+    }
+
+    // Drop everything and fall through to fresh-create. We hold the exclusive lock,
+    // so unlinking cannot pull segments out from under a live owner.
+    g_control = ctrl; // so unlink_all_known_segments can iterate stripes
+    unlink_all_known_segments();
+    close(existing_fd); // releases the lock on the now-unlinked object
+  }
+
+  // Create fresh control segment.
+  int fresh_fd = -1;
+  int create_errno = 0;
+  void *fresh = open_and_map_shm(g_control_name, CONTROL_SIZE, ShmAccess::Create, HugePages::Off, &fresh_fd, &create_errno);
+  if (fresh == nullptr) {
+    // Surface the errno + offending name: e.g. an embedded '/' in name_prefix yields EINVAL here.
+    Warning("cache shm: failed to create control segment %s: %s; shm disabled", g_control_name.c_str(), strerror(create_errno));
+    _mode = Mode::Disabled;
+    return;
+  }
+  // Lock the freshly created segment. Another starting process could have created
+  // and locked it first in the window since the drop above; if so, refuse.
+  if (try_lock_control(fresh_fd) == LockResult::HeldByOther) {
+    Warning("cache shm: lost the create race for control segment %s; disabling shm this run", g_control_name.c_str());
+    munmap(fresh, CONTROL_SIZE);
+    close(fresh_fd);
+    _mode = Mode::Disabled;
+    return;
+  }
+  g_control = static_cast<CacheShmControl *>(fresh);
+  g_control_fd = fresh_fd; // hold the exclusive lock for the process lifetime
+  std::memset(g_control, 0, CONTROL_SIZE);
+  std::memset(g_entry_claimed, 0, sizeof(g_entry_claimed));
+  g_claims_this_run = 0;
+  std::memcpy(g_control->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC));
+  g_control->schema_version = CACHE_SHM_SCHEMA_VERSION;
+  g_control->abi_hash = expected_abi;
+  g_control->storage_signature = expected_signature;
+  g_control->clean_shutdown = 0;
+  g_control->owner_pid = static_cast<int>(getpid());
+  g_control->stripe_count = 0;
+
+  _mode = Mode::CreateFresh;
+  Note("cache shm: creating fresh control segment %s (owner pid %d)", g_control_name.c_str(), static_cast<int>(getpid()));
+  return;
+}
+
+/// Return a shared-memory mapping of `directory_size` bytes for the stripe
+/// identified by `stripe_key`: the stripe's prior segment when the control table
+/// has a matching entry that can still be mapped, otherwise a newly created one.
+/// Returns nullptr when shm is disabled, the table is full, or the segment cannot
+/// be created.
+char *
+CacheShm::attach_or_create_stripe(const char *stripe_key, std::size_t directory_size)
+{
+  if (_mode == Mode::Disabled || g_control == nullptr) {
+    return nullptr;
+  }
+
+  const uint64_t key_hash = compute_stripe_key_hash(stripe_key);
+  const HugePages hugepages = g_config.use_hugepages ? HugePages::On : HugePages::Off;
+
+  // Decide what to do under the table lock, but run the shm syscalls afterwards
+  // with the lock dropped (holding it across them would serialize every disk
+  // thread's init). Each stripe owns a distinct segment, so the syscalls never
+  // touch another thread's segment.
+  std::string attach_name; // non-empty => map this existing segment
+  std::string create_name; // set when a fresh slot was reserved (the create path)
+  uint32_t idx = MAX_STRIPES;
+  {
+    std::scoped_lock lk{g_table_mutex};
+
+    // 1. Try to attach this stripe's prior segment, matched by 64-bit identity (not
+    //    name), so a span going offline shifts indices but not identities.
+    for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) {
+      StripeEntry &e = g_control->stripes[i];
+      if (e.shm_name[0] == '\0' || e.stripe_key_hash != key_hash) {
+        continue; // tombstoned slot, or a different stripe
+      }
+      if (e.raw_dir_size != directory_size) {
+        // Same identity, different size: shouldn't happen (size derives from the
+        // keyed blocks). Treat as a miss and recreate; the stale entry is reaped by
+        // finalize_attach().
+        Note("cache shm: stripe %s size mismatch (have %llu, want %zu); recreating", read_shm_name(e.shm_name).c_str(),
+             static_cast<unsigned long long>(e.raw_dir_size), directory_size);
+        break;
+      }
+      attach_name = read_shm_name(e.shm_name);
+      idx = i;
+      break;
+    }
+
+    // 2. No usable prior segment -- reserve a slot for a fresh create under the lock.
+    if (attach_name.empty() && (idx = reserve_stripe_slot(key_hash, directory_size, create_name)) == MAX_STRIPES) {
+      return nullptr; // table full (already logged)
+    }
+  }
+
+  // Attach path: map the existing segment outside the lock.
+  if (!attach_name.empty()) {
+    void *p = open_and_map_shm(attach_name, directory_size, ShmAccess::Open, hugepages);
+    if (p != nullptr) {
+      Note("cache shm: attached stripe %s (%zu bytes) for key=%s", attach_name.c_str(), directory_size, stripe_key);
+      return claim_mapped_stripe(idx, p, directory_size);
+    }
+    // Attach failed (segment vanished/unmappable): reserve a fresh slot and fall
+    // through to create. The stale entry is reaped by finalize_attach().
+    Note("cache shm: failed to attach stripe %s; recreating", attach_name.c_str());
+    std::scoped_lock lk{g_table_mutex};
+    if ((idx = reserve_stripe_slot(key_hash, directory_size, create_name)) == MAX_STRIPES) {
+      return nullptr;
+    }
+  }
+
+  // Create path: slot already reserved; syscalls run outside the lock. A fresh
+  // ftruncate'd segment is zero-filled (magic 0), so Stripe::init falls back to the
+  // disk read and repopulates it. shm_unlink clears any leftover with this name.
+  shm_unlink(create_name.c_str());
+  void *p = open_and_map_shm(create_name, directory_size, ShmAccess::Create, hugepages);
+  if (p == nullptr) {
+    std::scoped_lock lk{g_table_mutex};
+    release_reserved_slot(idx);
+    return nullptr;
+  }
+
+  Note("cache shm: created stripe %s (%zu bytes) for key=%s", create_name.c_str(), directory_size, stripe_key);
+  return claim_mapped_stripe(idx, p, directory_size);
+}
+
+/// Reclaim stripe segments that no stripe claimed this run: unlink each one,
+/// tombstone its table entry, then trim trailing tombstones from stripe_count.
+/// Does nothing when no control segment is mapped or when nothing was claimed.
+void
+CacheShm::finalize_attach()
+{
+  if (g_control == nullptr) {
+    return;
+  }
+
+  std::scoped_lock lk{g_table_mutex};
+
+  // With zero claims this run we cannot distinguish "genuinely empty cache" from
+  // "init aborted" (e.g. a transient volume.config error), so leave every segment
+  // intact rather than risk reclaiming a valid cache.
+  if (g_claims_this_run == 0) {
+    Dbg(dbg_ctl, "finalize_attach: no stripes claimed this run; leaving %u segment(s) intact", g_control->stripe_count);
+    return;
+  }
+
+  uint32_t reclaimed = 0;
+  for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) {
+    StripeEntry &e = g_control->stripes[i];
+    if (e.shm_name[0] == '\0' || g_entry_claimed[i]) {
+      continue; // already empty, or claimed by a live stripe this run
+    }
+    // Unclaimed, non-empty entry: its stripe left the cache (span dropped, or disk
+    // failed to open). Unlink the orphan and tombstone the slot for reuse.
+    std::string name = read_shm_name(e.shm_name);
+    Note("cache shm: reclaiming orphaned stripe segment %s", name.c_str());
+    shm_unlink(name.c_str());
+    e.shm_name[0] = '\0';
+    e.raw_dir_size = 0;
+    e.stripe_key_hash = 0;
+    ++reclaimed;
+  }
+  if (reclaimed > 0) {
+    Note("cache shm: reclaimed %u orphaned stripe segment(s) after storage change", reclaimed);
+  }
+
+  // Trim trailing tombstones so stripe_count tracks the live high-water mark;
+  // interior tombstones stay (reused by attach_or_create_stripe).
+  uint32_t live_count = 0;
+  for (uint32_t i = 0; i < g_control->stripe_count && i < MAX_STRIPES; ++i) {
+    if (g_control->stripes[i].shm_name[0] != '\0') {
+      live_count = i + 1;
+    }
+  }
+  const bool count_changed = live_count != g_control->stripe_count;
+  if (count_changed) {
+    Note("cache shm: trimming stripe_count %u -> %u after reclaim", g_control->stripe_count, live_count);
+    g_control->stripe_count = live_count;
+  }
+
+  if (reclaimed > 0 || count_changed) {
+    msync(g_control, CONTROL_SIZE, MS_SYNC);
+  }
+}
+
+/// True when `raw_dir` is a mapping handed out by attach_or_create_stripe() that
+/// has not been detached yet. A null pointer is never an shm pointer.
+bool
+CacheShm::is_shm_pointer(char *raw_dir)
+{
+  if (raw_dir == nullptr) {
+    return false;
+  }
+  std::scoped_lock lk{g_pointers_mutex};
+  return g_pointers.find(raw_dir) != g_pointers.end();
+}
+
+/// Set clean_shutdown and clear owner_pid in the control segment, then msync it.
+/// No-op when no control segment is mapped.
+void
+CacheShm::mark_clean_shutdown()
+{
+  if (g_control == nullptr) {
+    return;
+  }
+  Note("cache shm: marking clean shutdown");
+  g_control->clean_shutdown = 1;
+  // Clear owner_pid so the next start's liveness backstop does not mistake our
+  // (exiting) PID for a live owner. The flock is still held until exit, so a
+  // concurrent starter is still correctly refused during the shutdown window.
+  g_control->owner_pid = 0;
+  msync(g_control, CONTROL_SIZE, MS_SYNC);
+}
+
+/// Forwards to cache_shm::process_is_alive().
+bool
+CacheShm::process_is_alive(int pid)
+{
+  return cache_shm::process_is_alive(pid);
+}
+
+/// Mark a stripe's shared-memory directory as untrustworthy by zeroing the magic
+/// of the header at its start. Ignored for pointers that are not shm mappings.
+void
+CacheShm::invalidate_stripe_directory(char *raw_dir)
+{
+  if (!is_shm_pointer(raw_dir)) {
+    return;
+  }
+  // Zero the in-shm header magic so Stripe::init's attach gate rejects this segment
+  // next start and recovers the stripe from disk instead of fast-attaching a
+  // directory we could not finish flushing.
+  auto *header = reinterpret_cast<StripeHeaderFooter *>(raw_dir);
+  header->magic = 0;
+  msync(raw_dir, sizeof(StripeHeaderFooter), MS_SYNC);
+}
+
+/// Unmap a stripe mapping previously returned by attach_or_create_stripe() and
+/// forget it. Null or unknown pointers are ignored.
+void
+CacheShm::detach_stripe(char *raw_dir)
+{
+  if (raw_dir == nullptr) {
+    return;
+  }
+  std::scoped_lock lk{g_pointers_mutex};
+  auto it = g_pointers.find(raw_dir);
+  if (it == g_pointers.end()) {
+    return;
+  }
+  // munmap the recorded span; never shm_unlink -- the segment must survive for the
+  // next start to attach.
+  munmap(it->first, it->second);
+  g_pointers.erase(it);
+}
diff --git a/src/iocore/cache/CacheShm.h b/src/iocore/cache/CacheShm.h
new file mode 100644
index 00000000000..37f671ee393
--- /dev/null
+++ b/src/iocore/cache/CacheShm.h
@@ -0,0 +1,98 @@
+/** @file
+
+  Shared-memory-backed cache directory for fast restart.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements. See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership. The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License. You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+struct Store;
+
+/// Hosts Stripe::Directory::raw_dir in POSIX shared memory so the next process
+/// start can attach the existing directory in milliseconds rather than rebuilding
+/// it from disk. Purely an optimization over the rebuild path: anything wrong →
+/// drop shm, rebuild from disk. See the cache-shm fast-restart design doc.
+class CacheShm +{ +public: + static constexpr std::string_view tag{"ATS-SHM-V1"}; + + enum class Mode { + Disabled, ///< shm.enabled=0; behave like today. + AttachExisting, ///< A valid prior control segment exists; stripes attach by identity or create fresh. + CreateFresh, ///< No/invalid prior control - create everything new (cold path). + }; + + /// Initialize the control segment and decide Mode. Must be called from + /// CacheProcessor::start after the store is read but before any Stripe is built. + static void initialize(const Store &store); + + static Mode + mode() + { + return _mode; + } + + /// Allocate raw_dir for one stripe, keyed by its identity (`stripe_key`). + /// Attaches the stripe's prior segment of matching size when one exists, else + /// creates fresh. Returns the mapped pointer, or nullptr to fall back to the + /// heap path (always in Disabled). + static char *attach_or_create_stripe(const char *stripe_key, std::size_t directory_size); + + /// Reclaim segments left by stripes no longer in the cache (e.g. a dropped disk). + /// Call once after all stripes init, from CacheProcessor::cacheInitialized. + /// No-ops when no stripe came up this run. Idempotent. + static void finalize_attach(); + + /// Whether a pointer was returned from attach_or_create_stripe (munmap vs ats_free). + static bool is_shm_pointer(char *raw_dir); + + /// Mark control->clean_shutdown = 1. Called after sync_cache_dir_on_shutdown. + static void mark_clean_shutdown(); + + /// Invalidate one stripe's shm directory (zero its header magic) so the next + /// start recovers it from disk instead of fast-attaching. Called when a stripe's + /// shutdown flush failed. No-op if raw_dir is not a shm segment. + static void invalidate_stripe_directory(char *raw_dir); + + /// Detach (munmap) one stripe's shm directory and forget the pointer; never + /// shm_unlink (the segment must survive for the next start). No-op if raw_dir + /// is not a shm segment. 
Called from ~Stripe so the dtor frees the right way. + static void detach_stripe(char *raw_dir); + + /// Compile-time ABI fingerprint of the shm-resident layout; a writer/reader + /// mismatch forces a drop + rebuild. Exposed for unit testing. + static uint64_t abi_hash(); + + /// Fingerprint of the storage topology. Not a trust gate (see initialize()): + /// informational, drives the "storage changed" log wording. + static uint64_t storage_signature(const Store &store); + + /// True if `pid` names a live process (pid <= 0 is not). Backs the + /// concurrent-attach owner-liveness backstop. Exposed for unit testing. + static bool process_is_alive(int pid); + +private: + static Mode _mode; +}; diff --git a/src/iocore/cache/CacheShmLayout.h b/src/iocore/cache/CacheShmLayout.h new file mode 100644 index 00000000000..12bf5d8c1b2 --- /dev/null +++ b/src/iocore/cache/CacheShmLayout.h @@ -0,0 +1,114 @@ +/** @file + + Layout of the cache shared-memory control segment, shared between the cache + subsystem and tools (traffic_ctl) that inspect or clear the segment without + going through the running traffic_server. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace cache_shm +{ + +constexpr char CACHE_SHM_MAGIC[8] = {'A', 'T', 'S', '-', 'S', 'H', 'M', '\0'}; +constexpr uint32_t CACHE_SHM_SCHEMA_VERSION = 1; +constexpr std::string_view CACHE_SHM_CONTROL = "control"; + +// macOS PSHMNAMLEN is 31 chars including the leading '/'. Keep names under that +// limit on Linux too, so the same naming works everywhere. +constexpr std::size_t MAX_SHM_NAME_LEN = 31; + +// Maximum number of stripes in the control segment. Bumping it changes the ABI +// hash, so old segments are dropped automatically. +constexpr std::size_t MAX_STRIPES = 256; + +// Per-stripe entry in the control segment. A stripe is matched to its prior +// segment on attach by stripe_key_hash, not by name (order-independent). +struct StripeEntry { + char shm_name[MAX_SHM_NAME_LEN + 1]; ///< full shm name, NUL-terminated. + uint64_t raw_dir_size; ///< size of the stripe's raw_dir segment, bytes. + uint64_t stripe_key_hash; ///< full 64-bit FNV-1a of the stripe hash_text. +}; + +struct CacheShmControl { + char magic[8]; ///< CACHE_SHM_MAGIC + uint32_t schema_version; ///< CACHE_SHM_SCHEMA_VERSION + uint32_t pad0; + uint64_t abi_hash; ///< compile-time ABI fingerprint + uint64_t storage_signature; ///< storage.yaml fingerprint + uint8_t clean_shutdown; ///< 0 = dirty, 1 = clean + uint8_t pad1[3]; + int32_t owner_pid; ///< PID holding the exclusive lock; 0 when none. Backs the + ///< concurrent-attach guard. Cleared on clean shutdown. + uint32_t stripe_count; + uint32_t pad2; + StripeEntry stripes[MAX_STRIPES]; +}; + +constexpr std::size_t CONTROL_SIZE = sizeof(CacheShmControl); + +// Normalize the operator-configured prefix into the full shm name prefix used +// to build segment names. The operator sets only the middle word (e.g. 
"ats"); +// the framing is supplied here so the leading '/' that POSIX shm_open requires +// and the '-' separating the prefix from the per-object suffix can never be +// mis-typed. Any stray framing carried over from an older config (e.g. a +// literal "/ats-") is trimmed first, so migration can never yield an invalid +// embedded-slash name like "//ats--". An embedded '-' in the middle is preserved; +// an embedded '/' is stripped, since POSIX shm names permit only the leading '/' +// (a mistyped "foo/bar" would otherwise build a name shm_open rejects with EINVAL). +// Both the running server and traffic_ctl normalize through here so they agree on +// the same names. +inline std::string +normalize_name_prefix(std::string_view configured) +{ + std::size_t begin = configured.find_first_not_of('/'); + if (begin == std::string_view::npos) { + begin = configured.size(); // all '/' (or empty): no middle. + } + std::size_t last_kept = configured.find_last_not_of('-'); + std::string_view middle = (last_kept == std::string_view::npos || last_kept < begin) ? + std::string_view{} : + configured.substr(begin, last_kept - begin + 1); + std::string word{"/"}; + for (char c : middle) { + if (c != '/') { // POSIX shm names allow only the leading '/'. + word += c; + } + } + word += "-"; + return word; +} + +// Name of the "control" segment. Derived in one place so the cache +// subsystem and traffic_ctl agree on it. `prefix` is the normalized prefix +// (see normalize_name_prefix), e.g. "/ats-". 
+inline std::string
+control_segment_name(std::string_view prefix)
+{
+  // Append the view itself instead of CACHE_SHM_CONTROL.data(): std::string_view
+  // does not promise NUL termination, so the raw-pointer form is only correct while
+  // the constant happens to be built from a string literal.
+  std::string name{prefix};
+  name.append(CACHE_SHM_CONTROL);
+  return name;
+}
+
+} // namespace cache_shm
diff --git a/src/iocore/cache/CacheShmPurge.h b/src/iocore/cache/CacheShmPurge.h
new file mode 100644
index 00000000000..1bc64253b68
--- /dev/null
+++ b/src/iocore/cache/CacheShmPurge.h
@@ -0,0 +1,241 @@
+/** @file
+
+  Shared "enumerate and unlink the shm segments for a prefix" primitive, used by
+  both the cache subsystem (purge-on-disabled-start) and `traffic_ctl cache shm
+  clear`. Header-only since traffic_ctl does not link the cache library.
+  purge_segments() does no logging; it returns a report each caller formats itself.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements. See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership. The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License. You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#pragma once
+
+#include "CacheShmLayout.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cache_shm
+{
+
+/// True if `pid` names a live process (pid <= 0 is not). EPERM counts as alive (it
+/// exists, we just may not signal it). Backs the owner-liveness backstop used where
+/// the control-segment flock is not honored.
+inline bool +process_is_alive(int32_t pid) +{ + if (pid <= 0) { + return false; + } + return ::kill(static_cast(pid), 0) == 0 || errno == EPERM; +} + +/// Outcome of trying to take the control segment's exclusive lock. +enum class LockResult { + Acquired, ///< We hold the exclusive lock; no other process does. + HeldByOther, ///< Another live process holds it (flock returned EWOULDBLOCK). + Unsupported, ///< flock is not honored for this fd (e.g. macOS POSIX shm). +}; + +/// Take the exclusive, non-blocking advisory lock on the control fd. Authoritative +/// on Linux/tmpfs (auto-released on crash); macOS POSIX shm returns Unsupported, so +/// the owner_pid liveness check is used there instead. On Unsupported the flock errno +/// is reported via `unexpected_errno` (when non-null) so a caller with logging can +/// surface an otherwise-silent failure (EBADF/EINVAL/ENOLCK vs the expected macOS case). +inline LockResult +try_lock_control(int fd, int *unexpected_errno = nullptr) +{ + if (::flock(fd, LOCK_EX | LOCK_NB) == 0) { + return LockResult::Acquired; + } + // EWOULDBLOCK is the only errno meaning "another process holds it"; anything else + // means flock is unusable here -> fall back to the owner_pid backstop. + if (errno == EWOULDBLOCK) { + return LockResult::HeldByOther; + } + if (unexpected_errno != nullptr) { + *unexpected_errno = errno; + } + return LockResult::Unsupported; +} + +/// Read a shm_name field bounded by the field size (the fixed char[] may be +/// un-terminated in a tampered/stale segment). Empty for a tombstoned slot. +inline std::string +read_shm_name(const char (&field)[32]) +{ + return std::string(field, ::strnlen(field, sizeof(field))); +} + +/// How far purge_segments() got. Everything but Purged/TooSmall means nothing was +/// unlinked. +enum class PurgeOutcome { + BadPrefix, ///< Prefix is empty or does not start with '/'. Nothing attempted. + NotPresent, ///< No control segment exists (shm_open ENOENT). Nothing to do. 
+ OpenFailed, ///< shm_open failed for a reason other than ENOENT; cannot read safely. + MapFailed, ///< The control segment exists but could not be mmap'd. + TooSmall, ///< Control segment is smaller than CacheShmControl; control unlinked, table not walked. + OwnedByLive, ///< A live process owns the segment; nothing was unlinked. + Purged, ///< The stripe table was walked and its segments unlinked (possibly zero stripes). +}; + +/// One shm_unlink attempt, so callers can log each name in their own format. +struct PurgeUnlink { + std::string name; + bool is_control; ///< true for the control object, false for a stripe. + int error; ///< 0 on success; otherwise the errno from shm_unlink (ENOENT == already gone). +}; + +/// Result of purge_segments(). `unlinked` lists every shm_unlink attempted, in +/// order (stripes first, then the control object). +struct PurgeReport { + PurgeOutcome outcome = PurgeOutcome::NotPresent; + std::string control_name; ///< the control name (set whenever the prefix was valid). + int sys_errno = 0; ///< errno behind OpenFailed / MapFailed. + long long segment_size = -1; ///< control segment size in bytes, for TooSmall. + int32_t owner_pid = 0; ///< the recorded owner pid, for OwnedByLive. + std::vector unlinked; + + /// Segments successfully removed (a shm_unlink that returned 0). + unsigned + removed() const + { + unsigned n = 0; + for (const auto &u : unlinked) { + if (u.error == 0) { + ++n; + } + } + return n; + } + + /// Real failures. ENOENT means the segment was already gone, which is the + /// desired end state, so it is not counted. + unsigned + failures() const + { + unsigned n = 0; + for (const auto &u : unlinked) { + if (u.error != 0 && u.error != ENOENT) { + ++n; + } + } + return n; + } +}; + +namespace detail +{ + /// Close an fd on scope exit (the mmap survives the close). 
+  struct FdGuard {
+    int fd;
+    ~FdGuard()
+    {
+      if (fd >= 0) {
+        ::close(fd);
+      }
+    }
+  };
+} // namespace detail
+
+/// Open `control` read-only and, unless a live process still owns it,
+/// unlink every stripe segment it lists plus the control object. No logging --
+/// callers format the returned report. The owner guard uses flock, falling back to
+/// owner_pid liveness where flock is unsupported. The stripe table is trusted on
+/// magic alone (the size check bounds the read; stale names just ENOENT on unlink).
+/// An fstat failure is reported as OpenFailed (errno in sys_errno) and unlinks
+/// nothing: with the size unknown the segment cannot be classified as TooSmall.
+inline PurgeReport
+purge_segments(const std::string &prefix)
+{
+  PurgeReport report;
+
+  if (prefix.empty() || prefix[0] != '/') {
+    report.outcome = PurgeOutcome::BadPrefix;
+    return report;
+  }
+  report.control_name = control_segment_name(prefix);
+
+  int fd = ::shm_open(report.control_name.c_str(), O_RDONLY, 0);
+  if (fd < 0) {
+    report.sys_errno = errno;
+    report.outcome   = (errno == ENOENT) ? PurgeOutcome::NotPresent : PurgeOutcome::OpenFailed;
+    return report;
+  }
+  detail::FdGuard guard{fd};
+
+  // clang-format off
+  struct stat sb{};
+  // clang-format on
+  if (::fstat(fd, &sb) < 0) {
+    // The size is unknown, so this is not evidence of a truncated segment. Surface
+    // the real errno and leave the control object in place rather than unlinking it
+    // under a misleading "too small, 0 bytes" report.
+    report.sys_errno = errno;
+    report.outcome   = PurgeOutcome::OpenFailed;
+    return report;
+  }
+  if (static_cast<std::size_t>(sb.st_size) < CONTROL_SIZE) {
+    // Too small to hold a valid header/table: there is no table to walk, so just
+    // unlink the control object itself (it still occupies memory).
+    report.segment_size = static_cast<long long>(sb.st_size);
+    report.outcome      = PurgeOutcome::TooSmall;
+    int e               = ::shm_unlink(report.control_name.c_str()) == 0 ?
0 : errno; + report.unlinked.push_back({report.control_name, true, e}); + return report; + } + + void *addr = ::mmap(nullptr, CONTROL_SIZE, PROT_READ, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + report.sys_errno = errno; + report.outcome = PurgeOutcome::MapFailed; + return report; + } + + const auto *ctrl = static_cast(addr); + const bool magic_ok = std::memcmp(ctrl->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)) == 0; + + const LockResult lock = try_lock_control(fd); + if (lock == LockResult::HeldByOther || (lock == LockResult::Unsupported && magic_ok && process_is_alive(ctrl->owner_pid))) { + report.owner_pid = magic_ok ? ctrl->owner_pid : 0; + report.outcome = PurgeOutcome::OwnedByLive; + ::munmap(addr, CONTROL_SIZE); + return report; + } + + const uint32_t stripe_count = magic_ok ? std::min(ctrl->stripe_count, MAX_STRIPES) : 0; + for (uint32_t i = 0; i < stripe_count; ++i) { + std::string name = read_shm_name(ctrl->stripes[i].shm_name); + if (name.empty()) { + continue; // tombstoned slot -- nothing to unlink + } + int e = ::shm_unlink(name.c_str()) == 0 ? 0 : errno; + report.unlinked.push_back({std::move(name), false, e}); + } + ::munmap(addr, CONTROL_SIZE); + + int e = ::shm_unlink(report.control_name.c_str()) == 0 ? 
0 : errno; + report.unlinked.push_back({report.control_name, true, e}); + + report.outcome = PurgeOutcome::Purged; + return report; +} + +} // namespace cache_shm diff --git a/src/iocore/cache/Stripe.cc b/src/iocore/cache/Stripe.cc index 373d545ae1e..19b0864600b 100644 --- a/src/iocore/cache/Stripe.cc +++ b/src/iocore/cache/Stripe.cc @@ -24,6 +24,7 @@ #include "P_CacheDisk.h" #include "P_CacheInternal.h" #include "StripeSM.h" +#include "CacheShm.h" #include "tsutil/DbgCtl.h" @@ -153,15 +154,22 @@ Stripe::_init_directory(std::size_t directory_size, int header_size, int footer_ Dbg(dbg_ctl_cache_init, "Stripe %s: allocating %zu directory bytes for a %lld byte volume (%lf%%)", hash_text.get(), directory_size, (long long)this->len, percent(directory_size, this->len)); - if (ats_hugepage_enabled()) { - this->directory.raw_dir = static_cast(ats_alloc_hugepage(directory_size)); - if (this->directory.raw_dir != nullptr) { - this->directory.raw_dir_huge = true; - } - } - if (nullptr == this->directory.raw_dir) { - this->directory.raw_dir = static_cast(ats_memalign(ats_pagesize(), directory_size)); + // Try a shared-memory-backed directory first; fall back to the hugepage / + // aligned-heap path when shm is disabled or the attach/create fails. 
+ this->directory.raw_dir = CacheShm::attach_or_create_stripe(hash_text.get(), directory_size); + if (this->directory.raw_dir != nullptr) { this->directory.raw_dir_huge = false; + } else { + if (ats_hugepage_enabled()) { + this->directory.raw_dir = static_cast(ats_alloc_hugepage(directory_size)); + if (this->directory.raw_dir != nullptr) { + this->directory.raw_dir_huge = true; + } + } + if (nullptr == this->directory.raw_dir) { + this->directory.raw_dir = static_cast(ats_memalign(ats_pagesize(), directory_size)); + this->directory.raw_dir_huge = false; + } } this->directory.raw_dir_size = directory_size; this->directory.dir = reinterpret_cast(this->directory.raw_dir + header_size); @@ -170,6 +178,52 @@ Stripe::_init_directory(std::size_t directory_size, int header_size, int footer_ this->directory.footer = reinterpret_cast(this->directory.raw_dir + footer_offset); } +// Bounds-check the trusted header/freelist fields of an in-shm directory before +// the fast-restart attach (magic/version are already checked by the caller). A +// stale-but-magic-valid segment could present out-of-range offsets that become OOB +// disk I/O. On failure the caller falls through to the disk read + recover_data(). +// +// Trust model: the shm segment is trusted to the same degree as the on-disk +// directory (same-uid, mode 0600). Stripe geometry (segments/buckets) is recomputed +// locally each run and raw_dir_size is exact-matched before attach, so this only +// validates the header cursor fields and per-segment freelist heads; it does not +// re-validate individual Dir entries -- the read path already checks Doc magic + key +// before serving, so a stale entry resolves to a miss, never served corruption. +bool +Stripe::_shm_directory_is_valid() const +{ + // sector_size must be sane geometry (mirrors the hw_sector_size range in Cache.cc). 
+ if (this->directory.header->sector_size == 0 || this->directory.header->sector_size > STORE_BLOCK_SIZE) { + return false; + } + + // phase is a single bit of write-cursor state; only 0/1 are valid. + if (this->directory.header->phase > 1) { + return false; + } + + // write_pos/last_write_pos/agg_pos must point into the data region. + const off_t data_lo = this->start; + const off_t data_hi = this->skip + this->len; + + if (this->directory.header->write_pos < data_lo || this->directory.header->write_pos > data_hi || + this->directory.header->last_write_pos < data_lo || this->directory.header->last_write_pos > data_hi || + this->directory.header->agg_pos < data_lo || this->directory.header->agg_pos > data_hi) { + return false; + } + + // Each per-segment freelist head must index a Dir within its segment (0 == empty); + // a head past the entry count would walk the free list out of bounds. + const int64_t segment_entries = static_cast(this->directory.buckets) * DIR_DEPTH; + for (int s = 0; s < this->directory.segments; s++) { + if (this->directory.header->freelist[s] >= segment_entries) { + return false; + } + } + + return true; +} + // coverity[exn_spec_violation] - ink_assert aborts (doesn't throw), Dbg is exception-safe Stripe::~Stripe() { @@ -182,12 +236,19 @@ Stripe::~Stripe() ink_assert(this->directory.raw_dir_size > 0); ink_assert(this->directory.raw_dir_size < MAX_STRIPE_SIZE); + // shm-backed directories must outlive the process; never ats_free or poison them. 
+ const bool is_shm = CacheShm::is_shm_pointer(this->directory.raw_dir); + #ifdef DEBUG - // Poison memory before freeing to help detect use-after-free - memset(this->directory.raw_dir, 0xDE, this->directory.raw_dir_size); + if (!is_shm) { + // Poison memory before freeing to help detect use-after-free + memset(this->directory.raw_dir, 0xDE, this->directory.raw_dir_size); + } #endif - if (this->directory.raw_dir_huge) { + if (is_shm) { + CacheShm::detach_stripe(this->directory.raw_dir); + } else if (this->directory.raw_dir_huge) { ats_free_hugepage(this->directory.raw_dir, this->directory.raw_dir_size); } else { ats_free(this->directory.raw_dir); diff --git a/src/iocore/cache/Stripe.h b/src/iocore/cache/Stripe.h index b99b4773fee..c4ee70857b0 100644 --- a/src/iocore/cache/Stripe.h +++ b/src/iocore/cache/Stripe.h @@ -148,9 +148,10 @@ class Stripe off_t data_blocks{}; AggregateWriteBuffer _write_buffer; - void _clear_init(std::uint32_t hw_sector_size); - void _init_dir(); - bool flush_aggregate_write_buffer(int fd); + void _clear_init(std::uint32_t hw_sector_size); + void _init_dir(); + bool _shm_directory_is_valid() const; + [[nodiscard]] bool flush_aggregate_write_buffer(int fd); private: void _init_hash_text(CacheDisk const *disk, off_t blocks, off_t dir_skip); diff --git a/src/iocore/cache/StripeSM.cc b/src/iocore/cache/StripeSM.cc index 4587963968f..15a934e5d1d 100644 --- a/src/iocore/cache/StripeSM.cc +++ b/src/iocore/cache/StripeSM.cc @@ -31,6 +31,7 @@ #include "CacheEvacuateDocVC.h" #include "PreservationTable.h" #include "Stripe.h" +#include "CacheShm.h" #include "iocore/cache/CacheDefs.h" #include "CacheVC.h" @@ -178,6 +179,26 @@ StripeSM::init(bool clear) return clear_dir_aio(); } + // shm fast restart: a clean shutdown left the in-shm directory authoritative, so + // skip both the disk read and recover_data() (which would re-scan the tail and + // discard the entries the shm copy preserved). 
After validating the in-shm + // header/footer, jump straight to dir_init_done() in the normal post-recovery + // state. Validation failure falls through to disk read + recover_data(). + if (CacheShm::mode() == CacheShm::Mode::AttachExisting && CacheShm::is_shm_pointer(this->directory.raw_dir)) { + if (this->directory.header->magic == STRIPE_MAGIC && this->directory.footer->magic == STRIPE_MAGIC && + CACHE_DB_MAJOR_VERSION_COMPATIBLE <= this->directory.header->version._major && + this->directory.header->version._major <= CACHE_DB_MAJOR_VERSION && this->_shm_directory_is_valid()) { + Note("attaching cached directory from shm for '%s' (fast restart, recovery skipped)", hash_text.get()); + this->sector_size = this->directory.header->sector_size; + this->scan_pos = this->directory.header->write_pos; + this->_preserved_dirs.periodic_scan(this); + this->set_io_not_in_progress(); + SET_HANDLER(&StripeSM::dir_init_done); + return this->dir_init_done(EVENT_IMMEDIATE, nullptr); + } + Note("shm directory invalid for '%s'; falling back to disk read", hash_text.get()); + } + init_info = new StripeInitInfo(); int footerlen = ROUND_TO_STORE_BLOCK(sizeof(StripeHeaderFooter)); off_t footer_offset = this->dirlen() - footerlen; @@ -1326,7 +1347,10 @@ StripeSM::shutdown(EThread *shutdown_thread) SCOPED_MUTEX_LOCK(lock, this->mutex, shutdown_thread); if (DISK_BAD(this->disk)) { - Dbg(dbg_ctl_cache_dir_sync, "Dir %s: ignoring -- bad disk", this->hash_text.get()); + // Bad disk: invalidate the shm copy so the next start recovers from disk + // (mirrors the flush-failure branch below). 
+ Dbg(dbg_ctl_cache_dir_sync, "Dir %s: bad disk -- invalidating shm copy for disk recovery", this->hash_text.get()); + CacheShm::invalidate_stripe_directory(this->directory.raw_dir); return; } size_t dirlen = this->dirlen(); @@ -1342,7 +1366,15 @@ StripeSM::shutdown(EThread *shutdown_thread) // directories have not been inserted for these writes if (!this->_write_buffer.is_empty()) { Dbg(dbg_ctl_cache_dir_sync, "Dir %s: flushing agg buffer first", this->hash_text.get()); - this->flush_aggregate_write_buffer(this->fd); + if (!this->flush_aggregate_write_buffer(this->fd)) { + // Flush failed (e.g. disk went bad): the pwrite below would abort on a short + // write, and the directory no longer matches disk, so invalidate the shm copy + // so this stripe falls back to disk read + recover_data() next start. + Error("Dir %s: aggregation buffer flush failed during shutdown; invalidating shm copy so this stripe reloads from disk", + this->hash_text.get()); + CacheShm::invalidate_stripe_directory(this->directory.raw_dir); + return; + } } // We already asserted that dirlen > 0. @@ -1354,6 +1386,16 @@ StripeSM::shutdown(EThread *shutdown_thread) this->directory.footer->sync_serial = this->directory.header->sync_serial; CHECK_DIR(d); + + // A shm-backed directory is kept current in the shared segment by every + // dir_insert and attached directly next start, so the on-disk write here is pure + // waste -- skip it. If the shm segment is later dropped, the on-disk A/B copies + + // recover_data() reconcile the tail, the same path an unclean restart takes. + if (CacheShm::is_shm_pointer(this->directory.raw_dir)) { + Note("Dir %s: shm-backed, skipping on-disk directory write", this->hash_text.get()); + return; + } + size_t B = this->directory.header->sync_serial & 1; off_t start = this->skip + (B ? 
dirlen : 0); B = pwrite(this->fd, this->directory.raw_dir, dirlen, start); diff --git a/src/iocore/cache/unit_tests/test_CacheShm.cc b/src/iocore/cache/unit_tests/test_CacheShm.cc new file mode 100644 index 00000000000..8858b79a5c7 --- /dev/null +++ b/src/iocore/cache/unit_tests/test_CacheShm.cc @@ -0,0 +1,224 @@ +/** @file + + Unit tests for the cache shared-memory trust gates and control-segment layout. + + These exercise the pure, side-effect-free pieces of the shm fast-restart + feature -- the ABI fingerprint, the storage signature, the control-header + layout, and the macOS shm-name-length constraint -- without standing up a + cache. They are the logic that decides whether a prior shm segment may be + attached (fast restart) or must be dropped and rebuilt from disk. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#include "main.h" + +#include "../CacheShm.h" +#include "../CacheShmLayout.h" + +#include "iocore/cache/Store.h" +#include "tscore/ink_memory.h" + +#include +#include + +#include + +// Required by the shared test harness (main.cc). +int cache_vols = 1; +bool reuse_existing_cache = false; + +namespace +{ + +// Build a single-span Store with the given path and size (in STORE_BLOCK_SIZE +// blocks). 
The returned Store owns the Span and frees it on destruction.
+void
+make_store(Store &store, const char *path, int64_t blocks, int64_t offset = 0)
+{
+  store.extend(1);
+  auto *span = new Span();
+  span->pathname = ats_strdup(path);
+  span->blocks = blocks;
+  span->offset = offset;
+  span->file_pathname = true;
+  store.spans[0] = span;
+}
+
+} // namespace
+
+TEST_CASE("CacheShm ABI hash is stable and non-zero", "[cache][shm]")
+{
+  const uint64_t a = CacheShm::abi_hash();
+  const uint64_t b = CacheShm::abi_hash();
+
+  // Deterministic: the fingerprint is a pure function of compile-time layout.
+  CHECK(a == b);
+  // A zero hash would defeat the trust gate (every segment would look matching);
+  // the FNV-1a seed and the struct sizes guarantee it is non-zero.
+  CHECK(a != 0);
+}
+
+TEST_CASE("CacheShm storage signature is sensitive to topology", "[cache][shm]")
+{
+  Store base;
+  make_store(base, "/cache/disk0", 1000);
+
+  SECTION("identical topology -> identical signature")
+  {
+    Store same;
+    make_store(same, "/cache/disk0", 1000);
+    CHECK(CacheShm::storage_signature(base) == CacheShm::storage_signature(same));
+  }
+
+  SECTION("different path -> different signature")
+  {
+    Store other;
+    make_store(other, "/cache/disk1", 1000);
+    CHECK(CacheShm::storage_signature(base) != CacheShm::storage_signature(other));
+  }
+
+  SECTION("different size -> different signature")
+  {
+    Store resized;
+    make_store(resized, "/cache/disk0", 2000);
+    CHECK(CacheShm::storage_signature(base) != CacheShm::storage_signature(resized));
+  }
+
+  SECTION("different offset -> different signature")
+  {
+    Store moved;
+    make_store(moved, "/cache/disk0", 1000, /*offset=*/512);
+    CHECK(CacheShm::storage_signature(base) != CacheShm::storage_signature(moved));
+  }
+
+  SECTION("an empty store has a stable signature")
+  {
+    Store empty0;
+    Store empty1;
+    CHECK(CacheShm::storage_signature(empty0) == CacheShm::storage_signature(empty1));
+  }
+}
+
+TEST_CASE("CacheShm control header round-trips through a byte buffer", "[cache][shm]")
+{
+  using cache_shm::CACHE_SHM_MAGIC;
+  using cache_shm::CACHE_SHM_SCHEMA_VERSION;
+  using cache_shm::CacheShmControl;
+  using cache_shm::CONTROL_SIZE;
+
+  // The on-shm size must equal the struct size; tooling (traffic_ctl) maps
+  // exactly CONTROL_SIZE bytes and reads the struct out of it.
+  CHECK(CONTROL_SIZE == sizeof(CacheShmControl));
+
+  CacheShmControl src;
+  std::memset(&src, 0, sizeof(src));
+  std::memcpy(src.magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC));
+  src.schema_version = CACHE_SHM_SCHEMA_VERSION;
+  src.abi_hash = 0x0123456789abcdefULL;
+  src.storage_signature = 0xfedcba9876543210ULL;
+  src.clean_shutdown = 1;
+  src.owner_pid = 4242;
+  src.stripe_count = 2;
+  std::strncpy(src.stripes[0].shm_name, "/ats-s0", sizeof(src.stripes[0].shm_name) - 1);
+  src.stripes[0].raw_dir_size = 4096;
+  src.stripes[0].stripe_key_hash = 0xaaaabbbbccccddddULL;
+  std::strncpy(src.stripes[1].shm_name, "/ats-s1", sizeof(src.stripes[1].shm_name) - 1);
+  src.stripes[1].raw_dir_size = 8192;
+  src.stripes[1].stripe_key_hash = 0x1111222233334444ULL;
+
+  // Serialize to a raw byte buffer and read it back, mimicking shm attach.
+  // A real mapping is page-aligned; a plain unsigned char array only guarantees
+  // alignment 1, so align it for CacheShmControl before reading through the
+  // cast pointer (a misaligned read would be undefined behavior).
+  alignas(CacheShmControl) unsigned char buf[CONTROL_SIZE];
+  std::memcpy(buf, &src, CONTROL_SIZE);
+  const auto *dst = reinterpret_cast<const CacheShmControl *>(buf);
+
+  CHECK(std::memcmp(dst->magic, CACHE_SHM_MAGIC, sizeof(CACHE_SHM_MAGIC)) == 0);
+  CHECK(dst->schema_version == CACHE_SHM_SCHEMA_VERSION);
+  CHECK(dst->abi_hash == 0x0123456789abcdefULL);
+  CHECK(dst->storage_signature == 0xfedcba9876543210ULL);
+  CHECK(dst->clean_shutdown == 1);
+  CHECK(dst->owner_pid == 4242);
+  CHECK(dst->stripe_count == 2);
+  CHECK(std::string(dst->stripes[0].shm_name) == "/ats-s0");
+  CHECK(dst->stripes[0].raw_dir_size == 4096);
+  CHECK(dst->stripes[0].stripe_key_hash == 0xaaaabbbbccccddddULL);
+  CHECK(std::string(dst->stripes[1].shm_name) == "/ats-s1");
+  CHECK(dst->stripes[1].raw_dir_size == 8192);
+  CHECK(dst->stripes[1].stripe_key_hash == 0x1111222233334444ULL);
+}
+
+TEST_CASE("CacheShm names respect the macOS PSHMNAMLEN limit", "[cache][shm]")
+{
+  using cache_shm::MAX_SHM_NAME_LEN;
+  using cache_shm::StripeEntry;
+
+  // macOS caps POSIX shm names at 31 chars including the leading '/'. The shared
+  // limit must match so the same naming works on Linux and macOS alike.
+  CHECK(MAX_SHM_NAME_LEN == 31);
+
+  // The per-stripe name field must hold a maximum-length name plus its NUL.
+  CHECK(sizeof(StripeEntry{}.shm_name) > MAX_SHM_NAME_LEN);
+
+  // The default control segment name fits comfortably under the limit.
+  const std::string control_name = cache_shm::control_segment_name("/ats-");
+  CHECK(control_name.size() < MAX_SHM_NAME_LEN);
+}
+
+TEST_CASE("CacheShm normalizes the configured name prefix", "[cache][shm]")
+{
+  using cache_shm::normalize_name_prefix;
+
+  // The operator configures only the middle word; the framing '/' and '-' are
+  // supplied by the code so a name like "/ats-" cannot be mis-typed.
+  CHECK(normalize_name_prefix("ats") == "/ats-");
+  CHECK(normalize_name_prefix("foo") == "/foo-");
+
+  // Forgiving of stray framing an operator may carry over (e.g. a pre-existing
+  // "/ats-" config), so migration cannot produce "//ats--".
+  CHECK(normalize_name_prefix("/ats-") == "/ats-");
+  CHECK(normalize_name_prefix("/ats") == "/ats-");
+  CHECK(normalize_name_prefix("ats-") == "/ats-");
+  CHECK(normalize_name_prefix("//ats--") == "/ats-");
+
+  // An embedded '-' in the middle is preserved -- only the framing is trimmed.
+  CHECK(normalize_name_prefix("ats-v2") == "/ats-v2-");
+
+  // An embedded '/' is stripped: POSIX shm names permit only the leading '/', so a
+  // mistyped middle word must not build a name shm_open would reject with EINVAL.
+  CHECK(normalize_name_prefix("foo/bar") == "/foobar-");
+  CHECK(normalize_name_prefix("/ats/v2/") == "/atsv2-");
+  CHECK(normalize_name_prefix("a/b/c") == "/abc-");
+}
+
+TEST_CASE("CacheShm process liveness check backs the concurrent-attach guard", "[cache][shm]")
+{
+  // Our own PID is, by definition, live -- this is the "a different live owner
+  // still holds the segment" case the guard refuses to attach over.
+  // NOTE(review): the cast target was lost in extraction; pid_t is assumed here --
+  // confirm against the parameter type of CacheShm::process_is_alive.
+  CHECK(CacheShm::process_is_alive(static_cast<pid_t>(getpid())));
+
+  // A zero / negative owner_pid means "no owner recorded" (e.g. after a clean
+  // shutdown); it must never read as live or the guard would wrongly refuse.
+  CHECK_FALSE(CacheShm::process_is_alive(0));
+  CHECK_FALSE(CacheShm::process_is_alive(-1));
+
+  // A PID at the top of the range is overwhelmingly unlikely to name a live
+  // process; kill(pid, 0) returns ESRCH, so it reads as not-alive (a stale
+  // owner left by a crash, which the guard is free to reclaim).
+  // NOTE(review): pid_t assumed for the numeric_limits argument as well -- confirm.
+  CHECK_FALSE(CacheShm::process_is_alive(std::numeric_limits<pid_t>::max()));
+}
diff --git a/src/records/RecordsConfig.cc b/src/records/RecordsConfig.cc index 7890f427ed1..d209716ebfc 100644 --- a/src/records/RecordsConfig.cc +++ b/src/records/RecordsConfig.cc @@ -82,6 +82,15 @@ static constexpr RecordElement RecordsConfig[] = , {RECT_CONFIG, "proxy.config.cache.persist_bad_disks", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", RECA_NULL} , + {RECT_CONFIG, "proxy.config.cache.shm.enabled", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", RECA_NULL} + , + {RECT_CONFIG, "proxy.config.cache.shm.name_prefix", RECD_STRING, "ats", RECU_RESTART_TS, RR_NULL, RECC_NULL, nullptr, RECA_NULL} + , + {RECT_CONFIG, "proxy.config.cache.shm.use_hugepages", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", RECA_NULL} + , + {RECT_CONFIG, "proxy.config.cache.shm.purge_stale_on_start", RECD_INT, "0", RECU_RESTART_TS, RR_NULL, RECC_INT, "[0-1]", + RECA_NULL} + , {RECT_CONFIG, "proxy.config.cache.default_volumes", RECD_STRING, "", RECU_RESTART_TS, RR_NULL, RECC_NULL, nullptr, RECA_NULL} , {RECT_CONFIG, "proxy.config.output.logfile.name", RECD_STRING, "traffic.out", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, nullptr, diff --git a/src/traffic_ctl/CMakeLists.txt b/src/traffic_ctl/CMakeLists.txt index c967d42e9c9..c0ef76cd1cd 100644 --- a/src/traffic_ctl/CMakeLists.txt +++ b/src/traffic_ctl/CMakeLists.txt @@ -18,6 +18,7 @@ add_executable( traffic_ctl traffic_ctl.cc + CacheShmCommand.cc ConvertConfigCommand.cc CtrlCommands.cc CtrlPrinters.cc @@ -27,7 +28,17 @@ add_executable( ${CMAKE_SOURCE_DIR}/src/shared/rpc/IPCSocketClient.cc ) -target_link_libraries(traffic_ctl ts::tscore ts::config libswoc::libswoc yaml-cpp::yaml-cpp ts::tsutil) +target_include_directories(traffic_ctl PRIVATE ${CMAKE_SOURCE_DIR}/src/iocore/cache) + +target_link_libraries( + traffic_ctl + ts::tscore + ts::config + libswoc::libswoc + yaml-cpp::yaml-cpp + ts::tsutil + rt::rt +) install(TARGETS
traffic_ctl) diff --git a/src/traffic_ctl/CacheShmCommand.cc b/src/traffic_ctl/CacheShmCommand.cc new file mode 100644 index 00000000000..ec63b113c22 --- /dev/null +++ b/src/traffic_ctl/CacheShmCommand.cc @@ -0,0 +1,261 @@ +/** @file + + traffic_ctl command for inspecting and clearing the cache shared-memory + control segment and its associated stripe segments. + + The status and clear operations work by direct shm_open access rather than + JSONRPC, so they function whether traffic_server is running or not. This + is important for debugging crash-leftover segments when no live process + is available to query. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#include "CacheShmCommand.h" +#include "CacheShmLayout.h" +#include "CacheShmPurge.h" +#include "TrafficCtlStatus.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace +{ + +// The middle word of the shm name prefix when --prefix is not given. The +// framing "/-" is supplied by cache_shm::normalize_name_prefix, matching +// the server's proxy.config.cache.shm.name_prefix default. 
+constexpr const char *DEFAULT_PREFIX = "ats"; + +bool +shm_segment_exists(const std::string &name) +{ + int fd = shm_open(name.c_str(), O_RDONLY, 0); + if (fd < 0) { + return false; + } + close(fd); + return true; +} + +std::string +format_size(uint64_t bytes) +{ + char buf[64]; + if (bytes >= (uint64_t{1} << 30)) { + std::snprintf(buf, sizeof(buf), "%.2f GiB", static_cast(bytes) / (uint64_t{1} << 30)); + } else if (bytes >= (uint64_t{1} << 20)) { + std::snprintf(buf, sizeof(buf), "%.2f MiB", static_cast(bytes) / (uint64_t{1} << 20)); + } else if (bytes >= (uint64_t{1} << 10)) { + std::snprintf(buf, sizeof(buf), "%.2f KiB", static_cast(bytes) / (uint64_t{1} << 10)); + } else { + std::snprintf(buf, sizeof(buf), "%llu B", static_cast(bytes)); + } + return buf; +} + +// Shared with the cache subsystem (CacheShmPurge.h): read_shm_name bounds a +// possibly-unterminated name field, process_is_alive backs the owner-liveness gate. +using cache_shm::process_is_alive; +using cache_shm::read_shm_name; + +} // namespace + +CacheShmCommand::CacheShmCommand(ts::Arguments *args) : CtrlCommand(args) +{ + if (get_parsed_arguments()->get(STATUS_STR)) { + _invoked_func = [this]() { status(); }; + } else if (get_parsed_arguments()->get(CLEAR_STR)) { + _invoked_func = [this]() { clear(); }; + } +} + +std::string +CacheShmCommand::get_prefix() +{ + // The operator gives only the middle word (e.g. --prefix ats); frame it the + // same way the server does so the two agree on segment names. 
+ std::string configured = DEFAULT_PREFIX; + if (auto arg = get_parsed_arguments()->get(PREFIX_STR); arg && !arg.empty()) { + configured = arg.value(); + } + return cache_shm::normalize_name_prefix(configured); +} + +void +CacheShmCommand::status() +{ + const std::string prefix = get_prefix(); + const std::string control_name = cache_shm::control_segment_name(prefix); + + int fd = shm_open(control_name.c_str(), O_RDONLY, 0); + if (fd < 0) { + std::cerr << "cache shm: control segment '" << control_name << "' not found: " << std::strerror(errno) << '\n'; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + struct stat sb { + }; + if (fstat(fd, &sb) < 0) { + std::cerr << "cache shm: fstat failed: " << std::strerror(errno) << '\n'; + close(fd); + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + if (static_cast(sb.st_size) < sizeof(cache_shm::CacheShmControl)) { + std::cerr << "cache shm: control segment too small (" << sb.st_size << " bytes, need at least " + << sizeof(cache_shm::CacheShmControl) << ")\n"; + close(fd); + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + void *addr = mmap(nullptr, sizeof(cache_shm::CacheShmControl), PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + std::cerr << "cache shm: mmap failed: " << std::strerror(errno) << '\n'; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + const auto *ctrl = static_cast(addr); + + const bool magic_ok = std::memcmp(ctrl->magic, cache_shm::CACHE_SHM_MAGIC, sizeof(cache_shm::CACHE_SHM_MAGIC)) == 0; + const bool schema_ok = ctrl->schema_version == cache_shm::CACHE_SHM_SCHEMA_VERSION; + + std::cout << "Control segment: " << control_name << '\n'; + std::cout << " segment size: " << sb.st_size << " bytes (" << format_size(sb.st_size) << ")\n"; + std::cout << " magic: "; + for (char c : ctrl->magic) { + if (c >= 0x20 && c < 0x7f) { + std::cout << c; + } + } + std::cout << (magic_ok ? 
" [valid]" : " [INVALID]") << '\n'; + std::cout << " schema_version: " << ctrl->schema_version << (schema_ok ? " [valid]" : " [INVALID]") << '\n'; + std::cout << " abi_hash: 0x" << std::hex << ctrl->abi_hash << std::dec << '\n'; + std::cout << " storage_sig: 0x" << std::hex << ctrl->storage_signature << std::dec << '\n'; + std::cout << " clean_shutdown: " << static_cast(ctrl->clean_shutdown) + << (ctrl->clean_shutdown ? " (clean)" : " (DIRTY -- next start will rebuild)") << '\n'; + std::cout << " owner_pid: " << ctrl->owner_pid; + if (ctrl->owner_pid == 0) { + std::cout << " (none -- not currently attached)"; + } else if (process_is_alive(ctrl->owner_pid)) { + std::cout << " (LIVE -- a running traffic_server owns this segment)"; + } else { + std::cout << " (stale -- owner no longer running)"; + } + std::cout << '\n'; + std::cout << " stripe_count: " << ctrl->stripe_count << '\n'; + + if (!magic_ok || !schema_ok) { + std::cout << "\nHeader is invalid; not interpreting stripe table.\n"; + munmap(addr, sizeof(cache_shm::CacheShmControl)); + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + } + + const uint32_t shown = std::min(ctrl->stripe_count, cache_shm::MAX_STRIPES); + + if (shown > 0) { + std::cout << "\nStripes:\n"; + for (uint32_t i = 0; i < shown; ++i) { + const auto &entry = ctrl->stripes[i]; + std::string name = read_shm_name(entry.shm_name); + if (name.empty()) { + std::cout << " [" << i << "] (tombstone -- slot free for reuse)\n"; + continue; + } + const bool present = shm_segment_exists(name); + std::cout << " [" << i << "] " << name << " size=" << entry.raw_dir_size << " (" << format_size(entry.raw_dir_size) << ") " + << (present ? 
"present" : "MISSING") << '\n'; + } + } + + if (ctrl->stripe_count > cache_shm::MAX_STRIPES) { + std::cout << "\n(stripe_count " << ctrl->stripe_count << " exceeds MAX_STRIPES " << cache_shm::MAX_STRIPES << "; truncated.)\n"; + } + + munmap(addr, sizeof(cache_shm::CacheShmControl)); +} + +void +CacheShmCommand::clear() +{ + // Shared with the server's purge-on-disabled-start (cache_shm::purge_segments); + // this command just renders the result to the console and sets the exit code. + const cache_shm::PurgeReport report = cache_shm::purge_segments(get_prefix()); + + switch (report.outcome) { + case cache_shm::PurgeOutcome::BadPrefix: + std::cerr << "cache shm: invalid prefix (must be non-empty and begin with '/').\n"; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::NotPresent: + std::cerr << "cache shm: control segment '" << report.control_name << "' not found (" << std::strerror(report.sys_errno) + << "); nothing to clear.\n"; + std::cout << "Removed 0 segment(s).\n"; + return; + case cache_shm::PurgeOutcome::OpenFailed: + // Not ENOENT: the segment may well exist but we could not open it (e.g. EACCES on a + // segment owned by another user). Report the real errno and fail rather than claim success. + std::cerr << "cache shm: cannot open control segment '" << report.control_name << "' (" << std::strerror(report.sys_errno) + << "); cannot clear.\n"; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::MapFailed: + std::cerr << "cache shm: mmap failed while reading stripe table: " << std::strerror(report.sys_errno) << '\n'; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::OwnedByLive: + // Refuse: unlinking a live owner's segments would orphan its fast restart. + std::cerr << "cache shm: control segment '" << report.control_name << "' is owned by a live traffic_server (pid " + << report.owner_pid << "); refusing to clear. 
Stop traffic_server first.\n"; + App_Exit_Status_Code = CTRL_EX_ERROR; + return; + case cache_shm::PurgeOutcome::TooSmall: + case cache_shm::PurgeOutcome::Purged: + break; + } + + for (const auto &u : report.unlinked) { + if (u.error == 0) { + std::cout << "unlinked " << u.name << '\n'; + } else if (u.error != ENOENT) { + std::cerr << "failed to unlink " << u.name << ": " << std::strerror(u.error) << '\n'; + } + } + + const unsigned failures = report.failures(); + std::cout << "Removed " << report.removed() << " segment(s)"; + if (failures != 0) { + std::cout << ", " << failures << " failure(s)"; + App_Exit_Status_Code = CTRL_EX_ERROR; + } + std::cout << ".\n"; +} diff --git a/src/traffic_ctl/CacheShmCommand.h b/src/traffic_ctl/CacheShmCommand.h new file mode 100644 index 00000000000..1a1f2db879b --- /dev/null +++ b/src/traffic_ctl/CacheShmCommand.h @@ -0,0 +1,45 @@ +/** @file + + traffic_ctl command for inspecting and clearing the cache shared-memory + control segment and its associated stripe segments. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */
+
+#pragma once
+
+#include "CtrlCommands.h"
+
+#include <string>
+
+/// traffic_ctl "cache shm" command: inspects or unlinks the cache shared-memory
+/// control segment and its stripe segments.
+class CacheShmCommand : public CtrlCommand
+{
+public:
+  /// @param args Parsed command-line arguments, forwarded to CtrlCommand.
+  CacheShmCommand(ts::Arguments *args);
+
+private:
+  // Keys looked up in the parsed arguments.
+  static inline const std::string STATUS_STR{"status"};
+  static inline const std::string CLEAR_STR{"clear"};
+  static inline const std::string PREFIX_STR{"prefix"};
+
+  /// Print the control segment header and stripe table.
+  void status();
+  /// Unlink the control and stripe segments and report the result.
+  void clear();
+
+  /// Return the shm name prefix to operate on, taken from the "prefix"
+  /// argument when one is given and from the built-in default otherwise.
+  std::string get_prefix();
+};
diff --git a/src/traffic_ctl/traffic_ctl.cc b/src/traffic_ctl/traffic_ctl.cc index 9697aaa05da..2824011c190 100644 --- a/src/traffic_ctl/traffic_ctl.cc +++ b/src/traffic_ctl/traffic_ctl.cc @@ -31,6 +31,7 @@ #include "tscore/signals.h" #include "CtrlCommands.h" +#include "CacheShmCommand.h" #include "ConvertConfigCommand.h" #include "FileConfigCommand.h" #include "SSLMultiCertCommand.h" @@ -101,6 +102,7 @@ main([[maybe_unused]] int argc, const char **argv) auto &host_command = parser.add_command("host", "Interact with host status").require_commands(); auto &hostdb_command = parser.add_command("hostdb", "Interact with HostDB status").require_commands(); auto &direct_rpc_command = parser.add_command("rpc", "Interact with the rpc api").require_commands(); + auto &cache_command = parser.add_command("cache", "Inspect and manage the cache").require_commands(); // config commands config_command.add_command("defaults", "Show default information configuration values", Command_Execute) @@ -315,6 +317,16 @@ main([[maybe_unused]] int argc, const char **argv) .add_option("--params", "-p", "Parameters to be passed in the request, YAML or JSON format", "", MORE_THAN_ONE_ARG_N, "", "") .add_example_usage("traffic_ctl rpc invoke foo_bar -p \"numbers: [1, 2, 3]\""); + // cache shm commands - operate directly on POSIX shared memory; no running server required.
+ auto &shm_command = cache_command.add_command("shm", "Inspect and manage cache shared-memory segments").require_commands(); + shm_command.add_option("--prefix", "-p", "shm name prefix word, framed as /- (default 'ats')", "", 1, "ats"); + shm_command.add_command("status", "Show the cache shared-memory control segment and stripe table", [&]() { command->execute(); }) + .add_example_usage("traffic_ctl cache shm status") + .add_example_usage("traffic_ctl cache shm status --prefix ats-t"); + shm_command.add_command("clear", "Unlink the cache shared-memory control and stripe segments", [&]() { command->execute(); }) + .add_example_usage("traffic_ctl cache shm clear") + .add_example_usage("traffic_ctl cache shm clear --prefix ats-t"); + auto create_command = [](ts::Arguments &args) -> std::unique_ptr { if (args.get("config")) { if (args.get("convert")) { @@ -337,6 +349,7 @@ main([[maybe_unused]] int argc, const char **argv) {"host", [](ts::Arguments *a) { return std::make_unique(a); } }, {"hostdb", [](ts::Arguments *a) { return std::make_unique(a); } }, {"rpc", [](ts::Arguments *a) { return std::make_unique(a); }}, + {"cache", [](ts::Arguments *a) { return std::make_unique(a); } }, }; for (const auto &[key, factory] : factories) { diff --git a/tests/gold_tests/cache/cache_shm_bad_disk_dropped.test.py b/tests/gold_tests/cache/cache_shm_bad_disk_dropped.test.py new file mode 100644 index 00000000000..d31e0418ebf --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_bad_disk_dropped.test.py @@ -0,0 +1,226 @@ +''' +Verify per-stripe partial attach when a disk is dropped from storage.yaml +(the "bad disk" case). A storage change no longer cold-starts every stripe: +the stripes on healthy, unchanged disks fast-attach their prior shm segments +while the segment left behind by the removed disk is reclaimed. + +ts1 caches an object across two disks and clean-shuts-down (marking the shm +clean). 
ts2 starts against the *same* shm prefix but with the second disk
+removed from storage.yaml -- simulating a bad disk dropped by the operator.
+ts2 must:
+ - keep the existing control segment (partial attach, not a full recreate),
+ - fast-attach the surviving disk's stripe by its stable identity,
+ - reclaim the orphaned stripe segment of the removed disk,
+ - and still serve traffic.
+'''
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import uuid
+
+Test.Summary = '''
+Dropping a disk from storage.yaml fast-attaches the surviving stripes from
+shm and reclaims the orphaned stripe segment of the removed disk.
+'''
+Test.ContinueOnFail = True
+
+
+class CacheShmBadDiskDroppedTest:
+    """
+    A stripe's shm identity is its hash_text -- the disk seed (path or
+    hash_base_string) plus that disk's own dir_skip:blocks, read from the
+    disk's persisted header. None of those depend on the other disks, so when
+    one disk is removed from storage.yaml the surviving disks compute the
+    same hash_text as before and re-attach their prior shm segments. The
+    removed disk's stripe is no longer present, so its control entry is never
+    claimed and finalize_attach() reclaims the orphaned segment.
+
+    ts1 starts cold across disk_a + disk_b, populates the cache, and clean-shuts
+    down. ts2 starts against disk_a only (disk_b "fails"/dropped) sharing the
+    shm prefix, and asserts ts2:
+      - enters partial-attach mode (storage signature changed) keeping the
+        control segment rather than recreating it,
+      - fast-attaches the surviving disk_a stripe from shm,
+      - reclaims exactly the orphaned disk_b stripe segment,
+      - reports neither an unclean shutdown nor a schema/ABI mismatch,
+      - and serves a request (200).
+    """
+
+    TS_PID_SCRIPT = 'ts_process_handler.py'
+
+    SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB per disk
+
+    def __init__(self):
+        """Create the shared disk files, configure ts1 and ts2, and register the diags.log checks."""
+        self._setup_shared_state()
+        # ts1 sees both disks; ts2 sees only disk_a (disk_b dropped).
+        self.ts1 = self._configure_ts('shmbd_ts1', [self._storage_path_a, self._storage_path_b])
+        self.ts2 = self._configure_ts('shmbd_ts2', [self._storage_path_a])
+        self._add_diags_log_assertions()
+        self._url_path = f'/cache/40/{uuid.uuid4()}'
+
+    def _setup_shared_state(self):
+        """Copy the PID helper script, size the two disk image files, and choose the shm prefix."""
+        Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT))
+
+        shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage')
+        os.makedirs(shared_storage_dir, exist_ok=True)
+        # Absolute paths keep the spans independent of MakeATSProcess's
+        # per-instance STORAGEDIR so disk_a has identical geometry for ts1 and
+        # ts2 (hence identical stripe identity -> fast attach).
+        self._storage_path_a = os.path.join(shared_storage_dir, 'disk_a.img')
+        self._storage_path_b = os.path.join(shared_storage_dir, 'disk_b.img')
+        for path in (self._storage_path_a, self._storage_path_b):
+            with open(path, 'ab') as f:
+                f.truncate(self.SHARED_DISK_SIZE_BYTES)
+
+        # macOS PSHMNAMLEN is 31 chars incl. '/'; 'bd' = bad-disk-dropped variant.
+        self._shm_prefix = f'/cshmbd-{os.getpid() % 100000}-'
+
+    def _configure_ts(self, name, storage_paths):
+        """
+        Build one ATS process named `name` whose storage.yaml lists every path in
+        `storage_paths` as a span, with the shm feature enabled under the shared
+        prefix. Returns the configured process object.
+        """
+        ts = Test.MakeATSProcess(name)
+        storage_lines = ['cache:', ' spans:']
+        for i, storage_path in enumerate(storage_paths):
+            storage_lines += [
+                f' - name: disk.{i}',
+                f' path: {storage_path}',
+                f' size: {self.SHARED_DISK_SIZE_BYTES}',
+            ]
+        storage_lines += [' volumes:', ' - id: 1', ' scheme: http', ' size: 100%']
+        ts.Disk.storage_yaml.AddLines(storage_lines)
+        ts.Disk.records_config.update(
+            {
+                'proxy.config.cache.shm.enabled': 1,
+                'proxy.config.cache.shm.name_prefix': self._shm_prefix,
+                'proxy.config.cache.shm.use_hugepages': 0,
+                'proxy.config.diags.debug.enabled': 1,
+                'proxy.config.diags.debug.tags': 'cache_shm|cache_init',
+                'proxy.config.diags.output.diag': 'L',
+                'proxy.config.http.wait_for_cache': 1,
+            })
+        ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via')
+        ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so')
+        return ts
+
+    def _add_diags_log_assertions(self):
+        """Register the diags.log expressions that ts1 and ts2 must (or must not) emit."""
+        # ts1 cold start across both disks, clean shutdown.
+        self.ts1.Disk.diags_log.Content += Testers.ContainsExpression(
+            r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start')
+        self.ts1.Disk.diags_log.Content += Testers.ContainsExpression(
+            r'created stripe \S+ \(\d+ bytes\) for key=', 'ts1 should create the shm-backed stripe segments')
+        self.ts1.Disk.diags_log.Content += Testers.ContainsExpression(
+            r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit')
+
+        # ts2 warm start with disk_b dropped: partial attach -- the surviving
+        # disk_a stripe attaches, the orphaned disk_b segment is reclaimed.
+        self.ts2.Disk.diags_log.Content += Testers.ContainsExpression(
+            r'attaching up to \d+ stripes \(fast restart, partial -- storage changed\)',
+            'ts2 must enter partial-attach mode after the disk was dropped')
+        self.ts2.Disk.diags_log.Content += Testers.ContainsExpression(
+            r'attached stripe \S+ \(\d+ bytes\) for key=', 'ts2 must fast-attach the surviving disk_a stripe from shm')
+        self.ts2.Disk.diags_log.Content += Testers.ContainsExpression(
+            r'cache shm: reclaiming orphaned stripe segment', 'ts2 must reclaim the dropped disk_b stripe segment')
+        self.ts2.Disk.diags_log.Content += Testers.ContainsExpression(
+            r'reclaimed \d+ orphaned stripe segment\(s\) after storage change', 'ts2 must report the reclaim summary')
+        self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression(
+            r'cache shm: creating fresh control segment', 'ts2 must keep the control segment across the disk drop')
+        self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression(
+            r'cache shm: previous run did not shutdown cleanly',
+            'the partial attach must be due to the disk drop, not an unclean shutdown')
+        self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression(
+            r'cache shm: (schema|ABI) mismatch', 'the partial attach must be due to the disk drop, not schema/ABI')
+
+    def _populate_cache(self):
+        """Start ts1 and issue one GET through it, expecting a 200; ts1 stays running."""
+        tr = Test.AddTestRun('Populate cache via ts1 (disk_a + disk_b)')
+        tr.Processes.Default.StartBefore(self.ts1)
+        tr.MakeCurlCommand(
+            f'-s -o /dev/null -w "%{{http_code}}\\n" '
+            f'-H "x-debug: x-cache,via" '
+            f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}')
+        tr.Processes.Default.ReturnCode = 0
+        tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200')
+        tr.StillRunningAfter = self.ts1
+
+    def _clean_shutdown_ts1(self):
+        """Drain ts1, then send it SIGTERM through the PID helper script."""
+        tr = Test.AddTestRun('Drain and clean-shutdown ts1')
+        tr.Processes.Default.Env = self.ts1.Env
+        tr.Processes.Default.Command = (
+            f'traffic_ctl server drain && sleep 1 && '
+            f'{sys.executable} ./{self.TS_PID_SCRIPT} shmbd_ts1 --signal TERM && sleep 3')
+        tr.Processes.Default.ReturnCode = 0
+
+    def _dump_shm_state(self):
+        """Run `traffic_ctl cache shm status` and compare its stdout against the gold file."""
+        # Between ts1's clean shutdown and ts2's start the control segment is
+        # marked clean and records both stripes (nothing reclaimed yet). Capture
+        # it with `traffic_ctl cache shm status` and compare against a gold file.
+        # The gold masks the run-specific names, the ABI/storage hashes, and the
+        # page-rounded sizes with the `` wildcard, so what is asserted literally
+        # is the meaningful state: valid magic/schema, clean_shutdown=1,
+        # stripe_count=2, and both stripe segments present.
+        tr = Test.AddTestRun('Dump shm control state after ts1 clean shutdown')
+        # Use ts1's Env: it has been started, so the per-instance bin dir is on
+        # PATH (ts2's Env only gains it once ts2 starts, which is the next step).
+        # `cache shm status` reads the segment directly and needs no live server.
+        tr.Processes.Default.Env = self.ts1.Env
+        tr.Processes.Default.Command = f'traffic_ctl cache shm status --prefix {self._shm_prefix}'
+        tr.Processes.Default.ReturnCode = 0
+        tr.Processes.Default.Streams.stdout = 'gold/cache_shm_state_after_shutdown.gold'
+
+    def _verify_survivor_attach_and_reclaim(self):
+        """Start ts2 (disk_a only) and issue the same GET, expecting a 200; ts2 stays running."""
+        tr = Test.AddTestRun('Start ts2 (disk_b dropped); verify survivor fast-attach + orphan reclaim')
+        tr.Processes.Default.StartBefore(self.ts2)
+        tr.MakeCurlCommand(
+            f'-s -o /dev/null -w "%{{http_code}}\\n" '
+            f'-H "x-debug: x-cache,via" '
+            f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}')
+        tr.Processes.Default.ReturnCode = 0
+        tr.Processes.Default.Streams.stdout = Testers.ContainsExpression(
+            '200', 'ts2 should serve correctly after the partial attach')
+        tr.StillRunningAfter = self.ts2
+
+    def _clean_shutdown_ts2(self):
+        """Drain ts2, then send it SIGTERM through the PID helper script."""
+        # Stop ts2 before clearing the shm: `cache shm clear` refuses to unlink a
+        # segment a live traffic_server still owns, so the segments must be
+        # ownerless (clean_shutdown clears owner_pid) before cleanup runs.
+        tr = Test.AddTestRun('Drain and clean-shutdown ts2')
+        tr.Processes.Default.Env = self.ts2.Env
+        tr.Processes.Default.Command = (
+            f'traffic_ctl server drain && sleep 1 && '
+            f'{sys.executable} ./{self.TS_PID_SCRIPT} shmbd_ts2 --signal TERM && sleep 3')
+        tr.Processes.Default.ReturnCode = 0
+
+    def _cleanup_shm(self):
+        """Run `traffic_ctl cache shm clear` for the test prefix and check stderr."""
+        # A clean shutdown deliberately keeps the control + live stripe segments
+        # for the next fast restart, so they outlive the test. Unlink them by
+        # prefix to avoid leaking POSIX shm across repeated local runs (macOS has
+        # no /dev/shm to clear out of band).
+        tr = Test.AddTestRun('Unlink the test shm segments')
+        tr.Processes.Default.Env = self.ts2.Env
+        tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}'
+        tr.Processes.Default.ReturnCode = 0
+        tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression(
+            'Invalid argument', 'clear must skip tombstoned slots, not fail on them')
+
+    def run(self):
+        """Queue the test runs in the order the scenario requires."""
+        self._populate_cache()
+        self._clean_shutdown_ts1()
+        self._dump_shm_state()
+        self._verify_survivor_attach_and_reclaim()
+        self._clean_shutdown_ts2()
+        self._cleanup_shm()
+
+
+CacheShmBadDiskDroppedTest().run()
diff --git a/tests/gold_tests/cache/cache_shm_concurrent_attach.test.py b/tests/gold_tests/cache/cache_shm_concurrent_attach.test.py new file mode 100644 index 00000000000..90453312708 --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_concurrent_attach.test.py @@ -0,0 +1,185 @@ +''' +Verify the concurrent-attach guard: a second traffic_server must never map the +shm directory read-write underneath a live owner. ts1 cold-starts and becomes +the owner of the control segment (it sets owner_pid and, on Linux, holds an +exclusive flock for its lifetime). While ts1 is still running, ts2 starts +against the *same* shm prefix; it must refuse shm for this run, disable it, and +come up on its own disk cache without touching ts1's segment. ts1 keeps serving +throughout.
+ +The two instances use *separate* on-disk cache files so the test isolates the +shm concurrent-attach guard from any contention over a shared cache file. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import uuid + +Test.Summary = ''' +A second traffic_server refuses to attach the shm directory while a live owner +holds it, disabling shm for its run instead of attaching concurrently. +''' +Test.ContinueOnFail = True + + +class CacheShmConcurrentAttachTest: + """ + The concurrent-attach guard (P0). A live owner is still mapping the Dir + read-write; a second writer would corrupt it, and clean_shutdown is no + protection against a concurrent *live* run. The guard fires from either of + two mechanisms, so this test asserts on the shared tail of both messages: + - Linux: ts1 holds an exclusive flock on the control segment for its + lifetime; ts2's lock attempt returns HeldByOther ("... is locked by a + live owner ..."). + - macOS (flock unsupported): the owner_pid liveness backstop fires + instead ("... claims a live owner ..."). + Both end in "disabling shm this run to avoid concurrent attach" and set the + run to shm-disabled, which is what this test pins -- so it runs on every + platform. 
+ + ts2 must: + - log the concurrent-attach refusal, + - NOT create or attach a control segment (it bails before either), + - still serve a request (200) from its own disk cache. + ts1 must keep running and serving the whole time. + """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # Same shm prefix, different storage files: the collision under test is + # purely on the shm control segment, not the on-disk cache. + self.ts1 = self._configure_ts('shmc_ts1', self._storage_path_a) + self.ts2 = self._configure_ts('shmc_ts2', self._storage_path_b) + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._storage_path_a = os.path.join(shared_storage_dir, 'disk_a.img') + self._storage_path_b = os.path.join(shared_storage_dir, 'disk_b.img') + for path in (self._storage_path_a, self._storage_path_b): + with open(path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 'c' = concurrent-attach variant. 
+ self._shm_prefix = f'/cshmc-{os.getpid() % 100000}-' + + def _configure_ts(self, name, storage_path): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 is the owner: it creates the fresh control segment. + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create and own the shm control segment') + + # ts2 starts while ts1 owns the segment: it must refuse and disable shm. + # The message head differs by platform (flock vs owner_pid backstop); the + # tail is common, so anchor on it. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'disabling shm this run to avoid concurrent attach', 'ts2 must refuse to attach while ts1 owns the segment') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: creating fresh control segment', 'ts2 must not create a control segment when it refuses shm') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: attaching up to \d+ stripes \(fast restart', "ts2 must not attach ts1's live control segment") + + def _start_owner(self): + tr = Test.AddTestRun('Cold-start ts1 (becomes the shm owner)') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _start_second_refused(self): + # ts1 is still running (kept alive by StillRunningAfter above), so ts2's + # start hits the concurrent-attach guard. 
+ tr = Test.AddTestRun('Start ts2 while ts1 is live; ts2 must refuse shm and serve from disk') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve from its own disk cache with shm disabled') + tr.StillRunningAfter = self.ts1 + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown(self, ts, name): + tr = Test.AddTestRun(f'Drain and clean-shutdown {name}') + tr.Processes.Default.Env = ts.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} {name} --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + # ts1 (the owner) is stopped before this so clean_shutdown clears + # owner_pid; otherwise `cache shm clear` refuses a live owner. + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._start_owner() + self._start_second_refused() + # Stop the non-owner first, then the owner (clears owner_pid), then clear. 
+ self._clean_shutdown(self.ts2, 'shmc_ts2') + self._clean_shutdown(self.ts1, 'shmc_ts1') + self._cleanup_shm() + + +CacheShmConcurrentAttachTest().run() diff --git a/tests/gold_tests/cache/cache_shm_fast_restart.test.py b/tests/gold_tests/cache/cache_shm_fast_restart.test.py new file mode 100644 index 00000000000..520b2b00986 --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_fast_restart.test.py @@ -0,0 +1,247 @@ +''' +Verify the cache directory survives a clean shutdown via shared memory and is +attached on the next start (fast restart). Two ATS instances share an on-disk +cache file and a POSIX shm name prefix; ts1 populates the cache and is shut +down via traffic_ctl drain + SIGTERM, then ts2 starts and serves the same URL +out of cache without re-fetching from the origin. + +Traffic is driven with Proxy Verifier: a single verifier-server acts as the +origin and the verifier-client replays cache-shm-fast-restart.replay.yaml -- +the "fill" transaction against ts1 and the "hit" transaction against ts2. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +Test.Summary = ''' +Cache directory survives clean shutdown via POSIX shared memory. 
+''' +Test.ContinueOnFail = True + + +class CacheShmFastRestartTest: + """ + Cover the cache shm fast-restart scenario end-to-end. + + Sequence: + 1. ts1 cold-start: creates a fresh shm control segment and per-stripe + segments; populates the cache via the "fill" transaction (cache miss, + fetched from the verifier-server origin). + 2. ts1 is drained and SIGTERM'd. The shutdown hook flushes the directory + and marks the shm clean. + 3. ts2 starts against the same on-disk file and shm prefix: attaches the + existing control segment, attaches per-stripe segments, and reuses the + cached directory without re-reading it from disk. + 4. ts2 serves the same URL out of cache via the "hit" transaction + (X-Cache: hit-fresh). The transaction's origin response is a 502 + sentinel, so any forward to the origin would fail the run. + + Each step is verified both at the response level (proxy-verifier) and via + diags-log assertions on the cache_shm / cache_dir_init code paths. + """ + + # Helper script for sending signals to a traffic_server process by command-line + # identifier match. Reused from gold_tests/logging. + TS_PID_SCRIPT = 'ts_process_handler.py' + + # The replay file driving both the populate ("fill") and verify ("hit") + # transactions. They share a cache key and differ only by uuid. + REPLAY_FILE = 'replay/cache-shm-fast-restart.replay.yaml' + + # Stripe size for the shared cache. Must be large enough that the directory + # contains real entries; small enough that the disk.img is cheap to create. + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # A single verifier-server is the origin for both ts1 and ts2. It is + # started before ts1 and kept running across the whole test. 
+ self.server = Test.MakeVerifierServerProcess('shm-origin', self.REPLAY_FILE) + self.ts1 = self._configure_ts('shm_ts1') + self.ts2 = self._configure_ts('shm_ts2') + self._add_diags_log_assertions() + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + # Shared storage file used by both ts1 and ts2. The absolute path makes + # storage.yaml independent of MakeATSProcess's per-instance STORAGEDIR. + # ATS opens regular-file spans with O_RDONLY first to stat them -- it + # does not auto-create the backing file -- so pre-create disk.img at the + # configured size before either ts starts. + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._shared_storage_path = os.path.join(shared_storage_dir, 'disk.img') + with open(self._shared_storage_path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # POSIX shm names: macOS PSHMNAMLEN limit is 31 chars including '/'. + # Keep the prefix short and unique per test run so concurrent autest + # runs do not collide. + self._shm_prefix = f'/cshm-{os.getpid() % 100000}-' + + def _configure_ts(self, name): + ts = Test.MakeATSProcess(name) + # Master configures cache storage via storage.yaml. An absolute span path + # keeps the span independent of MakeATSProcess's per-instance STORAGEDIR so + # ts1 and ts2 share the same on-disk cache, which yields identical stripe + # geometry (hence identical shm identity). 
+ ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {self._shared_storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + # The per-stripe 'created/attached stripe' lines are Dbg() calls; + # route debug output to diags.log (default is stderr) so the + # ContainsExpression assertions below can match them. + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine(f'map / http://127.0.0.1:{self.server.Variables.http_port}/') + return ts + + def _add_diags_log_assertions(self): + # These assertions match the *stable core* of each cache-shm log line and + # deliberately stop before the trailing parenthetical qualifier. The shm + # code appends optional context to several of these messages as it evolves + # -- "attaching N stripes" became "attaching up to N stripes", + # "(fast restart)" became "(fast restart, recovery skipped)" on the stripe + # path and "(fast restart, partial -- storage changed)" on the control + # path. Anchoring on the invariant prefix (not the closing paren) keeps the + # test from breaking every time such a qualifier is added. Likewise, the + # excludes name only log strings that actually exist in the source: an + # exclude on a non-existent string can never fire and gives false comfort. + + # ts1 (cold start): creates fresh shm, marks it clean on shutdown, and must + # NOT report any "drop" reason since there is nothing to drop. 
+ self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: created stripe \S+ \(\d+ bytes\) for key=', 'ts1 should create at least one shm-backed stripe segment') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: (schema|ABI) mismatch', 'ts1 should not detect any shm mismatch on cold start') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', 'ts1 should not see a dirty shm on cold start') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: stripe \S+ size mismatch', 'ts1 should not see a stripe size mismatch on cold start') + + # ts2 (warm start): attaches the existing control segment, fast-attaches the + # per-stripe segment, reuses the cached directory, and must NOT fall back to + # the disk-rebuild path. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: attaching up to \d+ stripes \(fast restart', 'ts2 should attach the existing shm (fast restart)') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: attached stripe \S+ \(\d+ bytes\) for key=', 'ts2 should attach at least one shm-backed stripe segment') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r"attaching cached directory from shm for '.+' \(fast restart", 'ts2 should reuse the per-stripe directory from shm') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: creating fresh control segment', 'ts2 should not create a fresh control segment') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: (schema|ABI) mismatch', 'ts2 should not detect any shm mismatch on warm start') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', 'ts2 should see the shm marked clean') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'shm directory invalid for', 'ts2 should not fall back from shm to disk read') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: stripe \S+ size mismatch', 'ts2 should fast-attach without a stripe size-mismatch recreate') + + def _start_ts1(self): + # Cold start ts1 against the verifier-server origin and replay the + # "fill" transaction: a cache miss that ATS fetches and stores. + tr = Test.AddTestRun('Start ts1, then cache contents (fill)') + tr.AddVerifierClientProcess( + 'shm-fill-client', self.REPLAY_FILE, http_ports=[self.ts1.Variables.port], keys='fill', other_args='--thread-limit 1') + tr.Processes.Default.StartBefore(self.server) + tr.Processes.Default.StartBefore(self.ts1) + tr.StillRunningAfter = self.server + tr.StillRunningAfter = self.ts1 + + def _clean_shutdown_ts1(self): + # Drain + SIGTERM ts1. 
SIGTERM goes through AutoStopCont which invokes + # TS_LIFECYCLE_SHUTDOWN_HOOK -> sync_cache_dir_on_shutdown -> + # CacheShm::mark_clean_shutdown. + tr = Test.AddTestRun('Drain and clean-shutdown ts1') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shm_ts1 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.server + + def _start_ts2(self): + # ts2 attaches the CacheDir from the shm created by ts1. Replay the + # "hit" transaction: ATS must serve it from cache (X-Cache: hit-fresh) + # without contacting the origin -- the replay's 502 sentinel response + # would otherwise surface as a proxy-response mismatch. + tr = Test.AddTestRun('Start ts2; verify shm fast-attach and cache HIT') + tr.AddVerifierClientProcess( + 'shm-hit-client', self.REPLAY_FILE, http_ports=[self.ts2.Variables.port], keys='hit', other_args='--thread-limit 1') + tr.Processes.Default.StartBefore(self.ts2) + tr.StillRunningAfter = self.server + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + # Stop ts2 before clearing the shm: `cache shm clear` refuses to unlink a + # segment a live traffic_server still owns, so the segments must be + # ownerless (clean_shutdown clears owner_pid) before cleanup runs. + tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shm_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + tr.StillRunningAfter = self.server + + def _cleanup_shm(self): + # A clean shutdown deliberately keeps the control + live stripe segments + # for the next fast restart, so they outlive the test. Unlink them by + # prefix to avoid leaking POSIX shm across repeated local runs (macOS has + # no /dev/shm to clear out of band). 
+ tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._start_ts1() + self._clean_shutdown_ts1() + self._start_ts2() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmFastRestartTest().run() diff --git a/tests/gold_tests/cache/cache_shm_purge_on_disable.test.py b/tests/gold_tests/cache/cache_shm_purge_on_disable.test.py new file mode 100644 index 00000000000..51fd57cdedf --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_purge_on_disable.test.py @@ -0,0 +1,232 @@ +''' +Purge-stale-on-start: when shm is disabled but a prior run left segments behind, +proxy.config.cache.shm.purge_stale_on_start=1 removes them at startup. + +This guards two hazards of running with the feature disabled after it had been +enabled (see records.yaml docs): (a) the leftover segments keep consuming tmpfs +the disabled instance never reads, and (b) a later re-enabled run would otherwise +fast-attach a directory that went stale while ATS ran disabled (writing only to +disk). + +Three scenarios, each on its own shm prefix + on-disk storage so they do not +interact: + + - PURGE (positive): a seed instance runs shm-enabled and clean-shuts-down, + leaving a clean control + stripe segment. A second instance runs disabled + with purge_stale_on_start=1 and must remove them. Confirmed three ways: the + seed's "clean" segment exists before (traffic_ctl cache shm status, exit 0), + the disabled instance logs the purge Note, and the segment is gone after + (status exits 2, "not found"). + + - KEEP (negative): same seed, but the disabled instance has + purge_stale_on_start=0. It must NOT log the purge and the segment must remain. 
+ + - NOOP (no leftover): a disabled instance with purge_stale_on_start=1 against a + never-used prefix must do nothing quietly -- no purge Note, no "cannot open" + warning. + +The segments are inspected with traffic_ctl (POSIX shm is not path-addressable on +macOS, so /dev/shm cannot be listed directly). +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import sys + +Test.Summary = ''' +shm.purge_stale_on_start removes leftover shm segments at startup when shm is +disabled, only when set, and only when a control segment exists. +''' +Test.ContinueOnFail = True + + +class CacheShmPurgeOnDisableTest: + + TS_PID_SCRIPT = 'ts_process_handler.py' + DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB; matches the other shm gold tests. + + # CtrlCommand sets this exit code when a shm control segment is absent/invalid + # (src/traffic_ctl/TrafficCtlStatus.h). + CTRL_EX_ERROR = 2 + + PURGE_NOTE = r"cache shm: purged stale segments while disabled \(removed [1-9]" + + def __init__(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + pid = os.getpid() % 100000 + # Each control segment name is the prefix plus "control"; keep well under macOS PSHMNAMLEN (31). 
+ self._prefix_purge = f'/cshmp-{pid}-' # positive: must be purged + self._prefix_keep = f'/cshmk-{pid}-' # negative: must remain + self._prefix_noop = f'/cshmz-{pid}-' # no leftover: nothing to do + + # Seed (shm enabled) instances that create the leftover segments. + self.seed_purge = self._make_ts('cshm_seed_p', self._prefix_purge, 'disk_p.img', enabled=True, purge=False) + self.seed_keep = self._make_ts('cshm_seed_k', self._prefix_keep, 'disk_k.img', enabled=True, purge=False) + + # Disabled instances under test. + self.run_purge = self._make_ts('cshm_run_p', self._prefix_purge, 'disk_p.img', enabled=False, purge=True) + self.run_keep = self._make_ts('cshm_run_k', self._prefix_keep, 'disk_k.img', enabled=False, purge=False) + self.run_noop = self._make_ts('cshm_run_z', self._prefix_noop, 'disk_z.img', enabled=False, purge=True) + + self._add_diags_assertions() + + def _make_ts(self, name, prefix, disk_name, enabled, purge): + disk_path = self._ensure_disk(disk_name) + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {disk_path}', + f' size: {self.DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.remap_config.AddLine('map / http://127.0.0.1:8080/') # never exercised; keeps remap.config non-empty + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1 if enabled else 0, + 'proxy.config.cache.shm.name_prefix': prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.cache.shm.purge_stale_on_start': 1 if purge else 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + return ts + + def _ensure_disk(self, disk_name): + storage_dir = os.path.join(Test.RunDirectory, 'storage') + os.makedirs(storage_dir, exist_ok=True) + path = os.path.join(storage_dir, disk_name) + if not 
os.path.exists(path): + with open(path, 'ab') as f: + f.truncate(self.DISK_SIZE_BYTES) + return path + + def _add_diags_assertions(self): + # Seeds create a fresh control segment and mark it clean on the way out -- + # that is the "fast-attachable but now stale" state the purge must clean up. + for seed in (self.seed_purge, self.seed_keep): + seed.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'seed should create a fresh shm control segment') + seed.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'seed should mark the shm clean before exit') + + # Positive: the disabled+purge instance logs the purge of at least one segment. + self.run_purge.Disk.diags_log.Content += Testers.ContainsExpression( + self.PURGE_NOTE, 'disabled instance with purge_stale_on_start=1 should purge the leftover segments') + + # Negative: purge_stale_on_start=0 must never purge. + self.run_keep.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: purged stale segments', 'purge_stale_on_start=0 must not purge') + + # No-op: nothing exists for this prefix, so neither a purge nor an error. 
+ self.run_noop.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: purged stale segments', 'no leftover means nothing is purged') + self.run_noop.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: cannot open control segment', 'a missing control segment is a quiet no-op, not a warning') + + def _shm_status(self, description, ts, prefix, expect_present): + """Run `traffic_ctl cache shm status` and assert the control segment is (not) there.""" + control_name = prefix + 'control' + tr = Test.AddTestRun(description) + tr.Processes.Default.Env = ts.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm status --prefix {prefix}' + if expect_present: + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + r'Control segment:\s+' + re.escape(control_name), 'control segment should be present') + else: + tr.Processes.Default.ReturnCode = self.CTRL_EX_ERROR + tr.Processes.Default.Streams.stderr = Testers.ContainsExpression( + r"control segment '" + re.escape(control_name) + r"' not found", 'control segment should be gone') + return tr + + def _start_seed(self, description, seed, prefix): + # Starting the seed (shm enabled) creates the control + stripe segments; the + # status probe also confirms they exist while the seed is the live owner. 
+ tr = self._shm_status(description, seed, prefix, expect_present=True) + tr.Processes.Default.StartBefore(seed) + tr.StillRunningAfter = seed + return tr + + def _clean_shutdown(self, description, seed, name): + tr = Test.AddTestRun(description) + tr.Processes.Default.Env = seed.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} {name} --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _start_disabled(self, description, ts, prefix, expect_present): + # Start the disabled instance under test; its purge (or no-op) runs during + # cache init, so by the time it is ready the status below reflects the result. + tr = self._shm_status(description, ts, prefix, expect_present=expect_present) + tr.Processes.Default.StartBefore(ts) + tr.StillRunningAfter = ts + return tr + + def _cleanup(self): + tr = Test.AddTestRun('Unlink any remaining test shm segments') + tr.Processes.Default.Env = self.run_keep.Env + tr.Processes.Default.Command = ( + f'traffic_ctl cache shm clear --prefix {self._prefix_purge} ; ' + f'traffic_ctl cache shm clear --prefix {self._prefix_keep} ; ' + f'traffic_ctl cache shm clear --prefix {self._prefix_noop}') + tr.Processes.Default.ReturnCode = 0 + + def run(self): + # PURGE (positive) + self._start_seed('PURGE: start shm-enabled seed; control segment is created', self.seed_purge, self._prefix_purge) + self._clean_shutdown('PURGE: clean-shutdown seed (leaves a clean segment)', self.seed_purge, 'cshm_seed_p') + # Probe with an Env whose bin/ autest has already populated (seed_purge was + # started above); run_purge has not started yet, so its bin/ does not exist. 
+ self._shm_status( + 'PURGE: precondition -- clean leftover segment is present', self.seed_purge, self._prefix_purge, + expect_present=True).Processes.Default.Streams.stdout += Testers.ContainsExpression( + r'clean_shutdown:\s+1 \(clean\)', 'leftover segment should be marked clean (the stale-but-attachable case)') + self._start_disabled( + 'PURGE: start disabled+purge=1; leftover segments are removed', + self.run_purge, + self._prefix_purge, + expect_present=False) + + # KEEP (negative) + self._start_seed('KEEP: start shm-enabled seed; control segment is created', self.seed_keep, self._prefix_keep) + self._clean_shutdown('KEEP: clean-shutdown seed (leaves a clean segment)', self.seed_keep, 'cshm_seed_k') + self._start_disabled( + 'KEEP: start disabled+purge=0; leftover segments remain', self.run_keep, self._prefix_keep, expect_present=True) + + # NOOP (no leftover) + tr = Test.AddTestRun('NOOP: start disabled+purge=1 against an unused prefix; nothing to do') + tr.Processes.Default.Env = self.run_noop.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm status --prefix {self._prefix_noop}' + tr.Processes.Default.ReturnCode = self.CTRL_EX_ERROR # never existed + tr.Processes.Default.StartBefore(self.run_noop) + tr.StillRunningAfter = self.run_noop + + self._cleanup() + + +CacheShmPurgeOnDisableTest().run() diff --git a/tests/gold_tests/cache/cache_shm_schema_mismatch.test.py b/tests/gold_tests/cache/cache_shm_schema_mismatch.test.py new file mode 100644 index 00000000000..fac47ebde57 --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_schema_mismatch.test.py @@ -0,0 +1,207 @@ +''' +Verify the shm schema-version trust gate: a control segment whose schema_version +does not match the running build is dropped, never fast-attached. ts1 cold-starts, +caches an object, and clean-shuts-down (marking the segment clean). 
The segment +file under /dev/shm is then tampered with -- schema_version is overwritten with a +bogus value -- before ts2 starts against the same shm prefix. ts2 must detect the +mismatch, drop the segment, recreate it fresh, and rebuild the directory from disk. + +Linux-only: it pokes raw bytes in the /dev/shm segment file, which exists only on +Linux (macOS POSIX shm segments are not path-addressable). +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import platform +import sys +import uuid + +Test.Summary = ''' +A control segment with a mismatched schema_version is dropped and rebuilt from +disk, never fast-attached. +''' +Test.ContinueOnFail = True + +# The byte-poke drives the gate by editing /dev/shm directly, which is a Linux +# facility; macOS POSIX shm is not exposed as a file. The gate below is a lambda on +# platform.system() (ports.py branches on platform the same way). NOTE(review): AuTest documents Condition.IsPlatform(); confirm whether it can replace this lambda. +Test.SkipUnless(Condition(lambda: platform.system() == 'Linux', "shm byte-poke gates need Linux /dev/shm")) + + +class CacheShmSchemaMismatchTest: + """ + The schema-version gate. 
The control header records the build's + CACHE_SHM_SCHEMA_VERSION; on attach, a segment whose recorded version differs + is dropped ("schema mismatch (N vs M), dropping") rather than trusted + -- the on-disk struct layout it describes may no longer match this build. The + ABI-hash gate (abi_hash @16) works identically; this test exercises the + schema field (@8) as the representative case. + + Sequence: ts1 creates a clean segment, then schema_version is poked to a bogus + value, then ts2 starts and must: + - log the schema mismatch and drop, + - recreate a fresh control segment, + - NOT fast-attach, + - and still serve a request (200). + """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + POKE_SCRIPT = 'shm_poke.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + # CacheShmControl layout (CacheShmLayout.h): magic[8] @0, schema_version @8. + SCHEMA_VERSION_OFFSET = 8 + # Little-endian uint32 = 9; the build's CACHE_SHM_SCHEMA_VERSION is small, so + # any value it never uses works. 9 is comfortably out of range. + BOGUS_SCHEMA_LE_HEX = '09000000' + + def __init__(self): + self._setup_shared_state() + self.ts1 = self._configure_ts('shmx_ts1') + self.ts2 = self._configure_ts('shmx_ts2') + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + Test.Setup.Copy(os.path.join(Test.TestDirectory, self.POKE_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._shared_storage_path = os.path.join(shared_storage_dir, 'disk.img') + with open(self._shared_storage_path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 'x' = schema-mismatch variant. + # (This test is Linux-only, but keep the prefix short for consistency.)
+ self._shm_prefix = f'/cshmx-{os.getpid() % 100000}-' + # The control segment is name_prefix + "control"; on Linux it is a file + # under /dev/shm by the same name (sans the leading '/'). + self._control_file = '/dev/shm/' + self._shm_prefix.lstrip('/') + 'control' + + def _configure_ts(self, name): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {self._shared_storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 cold start, clean shutdown -- a valid, clean segment to tamper with. + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit') + + # ts2 start against the poked segment: detect, drop, recreate, rebuild. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: schema mismatch \(\d+ vs \d+\), dropping', 'ts2 must detect the schema mismatch and drop the segment') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts2 must recreate the control segment after the drop') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'\(fast restart, recovery skipped\)', 'ts2 must rebuild from disk, never fast-attach the mismatched segment') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', + 'the drop must be due to the schema mismatch, not an unclean shutdown') + + def _populate_cache(self): + tr = Test.AddTestRun('Cold-start ts1 and cache an object') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _clean_shutdown_ts1(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts1') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shmx_ts1 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _poke_schema_version(self): + # ts1 is dead; the segment is just a file now. Overwrite schema_version. 
+ tr = Test.AddTestRun('Tamper schema_version in the shm control segment') + tr.Processes.Default.Command = ( + f'{sys.executable} ./{self.POKE_SCRIPT} {self._control_file} ' + f'{self.SCHEMA_VERSION_OFFSET} {self.BOGUS_SCHEMA_LE_HEX}') + tr.Processes.Default.ReturnCode = 0 + + def _verify_mismatch_drop(self): + tr = Test.AddTestRun('Start ts2; verify the schema mismatch is dropped and rebuilt from disk') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve correctly after dropping the mismatched segment') + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shmx_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._populate_cache() + self._clean_shutdown_ts1() + self._poke_schema_version() + self._verify_mismatch_drop() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmSchemaMismatchTest().run() diff --git a/tests/gold_tests/cache/cache_shm_storage_mismatch.test.py b/tests/gold_tests/cache/cache_shm_storage_mismatch.test.py new file mode 100644 index 00000000000..24aadcb04e7 --- /dev/null +++ 
b/tests/gold_tests/cache/cache_shm_storage_mismatch.test.py @@ -0,0 +1,210 @@ +''' +Verify that a changed storage layout never fast-attaches a stale directory. +A storage.yaml change no longer drops the whole shm control segment; instead +each stripe is matched to its prior segment by its own identity. ts1 caches an +object against one storage file and clean-shuts-down (marking the shm clean); +ts2 starts against a *different* storage file but the *same* shm name prefix. +ts2 finds ts1's control segment, keeps it (partial attach), but because its +stripe identity no longer matches any recorded entry it creates a fresh stripe +segment and reclaims ts1's orphaned one -- it must never fast-attach a segment +that describes a different on-disk layout. +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import uuid + +Test.Summary = ''' +A changed storage layout never fast-attaches a stale directory: the control +segment is kept (partial attach), the relocated stripe creates a fresh segment, +and the orphaned prior segment is reclaimed. 
+''' +Test.ContinueOnFail = True + + +class CacheShmStorageMismatchTest: + """ + The storage signature is a fingerprint of every span's path and geometry, + stored in the shm control header. It is no longer a hard gate: a storage + change keeps the control segment and lets each stripe attach by its own + identity (its hash_text, which includes the disk path). This test points + ts1 and ts2 at different storage files (a repath) while sharing one shm + prefix, and asserts ts2: + - keeps the existing control segment (does NOT recreate it), + - enters partial-attach mode because the storage signature changed, + - never fast-attaches any stripe segment (its identity differs, so the + stale directory built for storage A is never reused), + - creates a fresh stripe segment for its own (storage B) layout, + - reclaims ts1's now-orphaned stripe segment, + - and still serves a request (200). + Because the storage change does not gate the clean-shutdown check, ts2 must + NOT report the prior run as unclean: the only reason a fresh stripe segment is created is the + storage change. + """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # ts1 and ts2 share the shm prefix but use different storage files. + self.ts1 = self._configure_ts('shms_ts1', self._storage_path_a) + self.ts2 = self._configure_ts('shms_ts2', self._storage_path_b) + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + # Two distinct storage files -> distinct span paths -> distinct + # storage signatures, which is exactly the "repath" case under test.
+ self._storage_path_a = os.path.join(shared_storage_dir, 'disk_a.img') + self._storage_path_b = os.path.join(shared_storage_dir, 'disk_b.img') + for path in (self._storage_path_a, self._storage_path_b): + with open(path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 's' = storage-mismatch variant. + self._shm_prefix = f'/cshms-{os.getpid() % 100000}-' + + def _configure_ts(self, name, storage_path): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 cold start against storage A, clean shutdown. 
+ self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'created stripe \S+ \(\d+ bytes\) for key=', 'ts1 should create at least one shm-backed stripe segment') + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: marking clean shutdown', 'ts1 should mark the shm clean before exit') + + # ts2 start against storage B: the storage signature differs, so the + # control segment is kept (partial attach) but the relocated stripe + # creates a fresh segment rather than fast-attaching the stale one. + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'attaching up to \d+ stripes \(fast restart, partial -- storage changed\)', + 'ts2 must enter partial-attach mode after the storage change') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'created stripe \S+ \(\d+ bytes\) for key=', 'ts2 must create a fresh stripe segment for its own layout') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: reclaiming orphaned stripe segment', "ts2 must reclaim ts1's orphaned stripe segment") + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'reclaimed \d+ orphaned stripe segment\(s\) after storage change', 'ts2 must report the reclaim summary') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'attached stripe \S+ \(\d+ bytes\) for key=', + 'ts2 must never fast-attach a stripe segment built for a different layout') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: creating fresh control segment', 'ts2 must keep the control segment across the storage change') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: (schema|ABI) mismatch', 'the recreate must be due to the storage change, not schema/ABI') + 
self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: previous run did not shutdown cleanly', + 'the recreate must be due to the storage change, not an unclean shutdown') + + def _populate_cache(self): + tr = Test.AddTestRun('Populate cache via ts1 (storage A)') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _clean_shutdown_ts1(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts1') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shms_ts1 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _verify_partial_attach_and_reclaim(self): + tr = Test.AddTestRun('Start ts2 (storage B); verify partial attach: fresh stripe + orphan reclaim') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve correctly after the partial attach') + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + # Stop ts2 before clearing the shm: `cache shm clear` refuses to unlink a + # segment a live traffic_server still owns, so the segments must be + # ownerless (clean_shutdown clears owner_pid) before cleanup runs. 
+ tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shms_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + # A clean shutdown deliberately keeps the control + live stripe segments + # for the next fast restart, so they outlive the test. Unlink them by + # prefix to avoid leaking POSIX shm across repeated local runs (macOS has + # no /dev/shm to clear out of band). + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._populate_cache() + self._clean_shutdown_ts1() + self._verify_partial_attach_and_reclaim() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmStorageMismatchTest().run() diff --git a/tests/gold_tests/cache/cache_shm_unclean_shutdown.test.py b/tests/gold_tests/cache/cache_shm_unclean_shutdown.test.py new file mode 100644 index 00000000000..4711636c70b --- /dev/null +++ b/tests/gold_tests/cache/cache_shm_unclean_shutdown.test.py @@ -0,0 +1,185 @@ +''' +Verify the shm fast-restart path refuses to trust a directory left by a crash. +A clean shutdown marks the control segment clean; a crash (SIGKILL) does not, +so clean_shutdown stays 0. ts1 cold-starts, caches an object, and is *killed* +(no drain, no SIGTERM) so the shutdown hook never runs. ts2 starts against the +same on-disk cache and shm prefix: it must find the dirty segment, drop the +whole thing, rebuild the directory from disk, and never take the fast-attach +"recovery skipped" path. 
+''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import uuid + +Test.Summary = ''' +An unclean shutdown (SIGKILL) leaves the shm control segment dirty, so the next +start drops it and rebuilds the directory from disk instead of fast-attaching. +''' +Test.ContinueOnFail = True + + +class CacheShmUncleanShutdownTest: + """ + The crash-safety gate. clean_shutdown is set to 1 only by the shutdown hook + (CacheShm::mark_clean_shutdown); a SIGKILL bypasses it, leaving the segment + with clean_shutdown == 0. On the next start the control segment is found but + rejected -- a crash may have left dir entries pointing at content that never + reached disk, so no stripe can safely skip recovery. ts2 must: + - log "previous run did not shutdown cleanly, dropping", + - recreate a fresh control segment, + - NOT take the stripe fast-attach "recovery skipped" path, + - and still serve a request (200). + + This gate is cross-platform: clean_shutdown lives in the control segment, so + it does not depend on the Linux-only flock path. 
+ """ + + TS_PID_SCRIPT = 'ts_process_handler.py' + + SHARED_DISK_SIZE_BYTES = 256 * 1024 * 1024 # 256 MiB + + def __init__(self): + self._setup_shared_state() + # ts1 and ts2 share the same on-disk cache file and shm prefix so ts2 + # would fast-attach ts1's directory -- were it not left dirty by the kill. + self.ts1 = self._configure_ts('shmu_ts1') + # ts1 is SIGKILLed mid-test, so it exits on signal 9 (returncode -9, or + # 137 where the runner reports 128+signal). Declare that expected exit so + # the managed-process check does not flag the deliberate kill. ts1 still + # starts normally, so leave Ready at its default (port-open) condition. + self.ts1.ReturnCode = Any(-9, 137) + self.ts2 = self._configure_ts('shmu_ts2') + self._add_diags_log_assertions() + self._url_path = f'/cache/40/{uuid.uuid4()}' + + def _setup_shared_state(self): + Test.Setup.Copy(os.path.join(Test.TestDirectory, '..', 'logging', self.TS_PID_SCRIPT)) + + shared_storage_dir = os.path.join(Test.RunDirectory, 'shared-storage') + os.makedirs(shared_storage_dir, exist_ok=True) + self._shared_storage_path = os.path.join(shared_storage_dir, 'disk.img') + with open(self._shared_storage_path, 'ab') as f: + f.truncate(self.SHARED_DISK_SIZE_BYTES) + + # macOS PSHMNAMLEN is 31 chars incl. '/'; 'u' = unclean-shutdown variant. 
+ self._shm_prefix = f'/cshmu-{os.getpid() % 100000}-' + + def _configure_ts(self, name): + ts = Test.MakeATSProcess(name) + ts.Disk.storage_yaml.AddLines( + [ + 'cache:', + ' spans:', + ' - name: disk.0', + f' path: {self._shared_storage_path}', + f' size: {self.SHARED_DISK_SIZE_BYTES}', + ' volumes:', + ' - id: 1', + ' scheme: http', + ' size: 100%', + ]) + ts.Disk.records_config.update( + { + 'proxy.config.cache.shm.enabled': 1, + 'proxy.config.cache.shm.name_prefix': self._shm_prefix, + 'proxy.config.cache.shm.use_hugepages': 0, + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'cache_shm', + 'proxy.config.diags.output.diag': 'L', + 'proxy.config.http.wait_for_cache': 1, + }) + ts.Disk.plugin_config.AddLine('xdebug.so --enable=x-cache,via') + ts.Disk.remap_config.AddLine('map / http://127.0.0.1/ @plugin=generator.so') + return ts + + def _add_diags_log_assertions(self): + # ts1 cold start: creates a fresh segment but is killed before it can mark + # the shutdown clean. + self.ts1.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts1 should create a fresh shm control segment on first start') + self.ts1.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: marking clean shutdown', 'ts1 is SIGKILLed, so it must never mark the shm clean') + + # ts2 start: finds the dirty segment, drops it, recreates, and rebuilds + # from disk -- it must NOT fast-attach. 
+ self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: previous run did not shutdown cleanly, dropping', 'ts2 must reject the dirty segment left by the crash') + self.ts2.Disk.diags_log.Content += Testers.ContainsExpression( + r'cache shm: creating fresh control segment', 'ts2 must recreate the control segment after dropping the dirty one') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'\(fast restart, recovery skipped\)', 'ts2 must rebuild from disk, never take the fast-attach path') + self.ts2.Disk.diags_log.Content += Testers.ExcludesExpression( + r'cache shm: attaching up to \d+ stripes \(fast restart', 'ts2 must not attach the dirty control segment') + + def _populate_cache(self): + tr = Test.AddTestRun('Cold-start ts1 and cache an object') + tr.Processes.Default.StartBefore(self.ts1) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts1.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression('200', 'ts1 first GET should return 200') + tr.StillRunningAfter = self.ts1 + + def _kill_ts1(self): + # SIGKILL -- no drain, no SIGTERM -- so the shutdown hook never runs and + # the control segment is left with clean_shutdown == 0. 
+ tr = Test.AddTestRun('SIGKILL ts1 (unclean shutdown)') + tr.Processes.Default.Env = self.ts1.Env + tr.Processes.Default.Command = (f'{sys.executable} ./{self.TS_PID_SCRIPT} shmu_ts1 --signal KILL && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _verify_dirty_drop(self): + tr = Test.AddTestRun('Start ts2; verify the dirty segment is dropped and rebuilt from disk') + tr.Processes.Default.StartBefore(self.ts2) + tr.MakeCurlCommand( + f'-s -o /dev/null -w "%{{http_code}}\\n" ' + f'-H "x-debug: x-cache,via" ' + f'http://127.0.0.1:{self.ts2.Variables.port}{self._url_path}') + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stdout = Testers.ContainsExpression( + '200', 'ts2 should serve correctly after dropping the dirty segment') + tr.StillRunningAfter = self.ts2 + + def _clean_shutdown_ts2(self): + tr = Test.AddTestRun('Drain and clean-shutdown ts2') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = ( + f'traffic_ctl server drain && sleep 1 && ' + f'{sys.executable} ./{self.TS_PID_SCRIPT} shmu_ts2 --signal TERM && sleep 3') + tr.Processes.Default.ReturnCode = 0 + + def _cleanup_shm(self): + tr = Test.AddTestRun('Unlink the test shm segments') + tr.Processes.Default.Env = self.ts2.Env + tr.Processes.Default.Command = f'traffic_ctl cache shm clear --prefix {self._shm_prefix}' + tr.Processes.Default.ReturnCode = 0 + tr.Processes.Default.Streams.stderr = Testers.ExcludesExpression( + 'Invalid argument', 'clear must skip tombstoned slots, not fail on them') + + def run(self): + self._populate_cache() + self._kill_ts1() + self._verify_dirty_drop() + self._clean_shutdown_ts2() + self._cleanup_shm() + + +CacheShmUncleanShutdownTest().run() diff --git a/tests/gold_tests/cache/gold/cache_shm_state_after_shutdown.gold b/tests/gold_tests/cache/gold/cache_shm_state_after_shutdown.gold new file mode 100644 index 00000000000..5a981414f4f --- /dev/null +++ b/tests/gold_tests/cache/gold/cache_shm_state_after_shutdown.gold @@ -0,0 
+1,13 @@ +Control segment: `` + segment size: `` + magic: `` [valid] + schema_version: `` [valid] + abi_hash: 0x`` + storage_sig: 0x`` + clean_shutdown: 1 (clean) + owner_pid: 0 (none -- not currently attached) + stripe_count: 2 + +Stripes: + [0] `` present + [1] `` present diff --git a/tests/gold_tests/cache/replay/cache-shm-fast-restart.replay.yaml b/tests/gold_tests/cache/replay/cache-shm-fast-restart.replay.yaml new file mode 100644 index 00000000000..a18a9951643 --- /dev/null +++ b/tests/gold_tests/cache/replay/cache-shm-fast-restart.replay.yaml @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Traffic for the cache shm fast-restart test. The two transactions share a +# cache key (same method + host + url); they differ only in the uuid so the +# verifier-client can drive them one at a time via --keys: +# +# * key "fill": replayed against ts1 (cold cache). The request misses, ATS +# forwards it to the origin (verifier-server) and caches the 200 response. +# +# * key "hit": replayed against ts2 after a clean shutdown + shm fast restart. +# ATS must serve it from the shm-attached directory WITHOUT contacting the +# origin. 
The 502 server-response is a sentinel: it is only ever returned if +# ATS wrongly forwards the request, in which case the proxy-response check +# (expecting the cached 200 / X-Cache: hit-fresh) fails. +# + +meta: + version: "1.0" + +sessions: +- transactions: + + - client-request: + method: "GET" + version: "1.1" + scheme: "http" + url: /cache-shm-fast-restart/object + headers: + fields: + - [ Host, example.com ] + - [ uuid, fill ] + - [ X-Debug, "x-cache,via" ] + + server-response: + status: 200 + reason: OK + headers: + fields: + - [ Content-Length, 16 ] + - [ Cache-Control, "max-age=300,public" ] + + proxy-response: + status: 200 + headers: + fields: + - [ X-Cache, { value: miss, as: equal } ] + + # Restart ATS + + - client-request: + method: "GET" + version: "1.1" + scheme: "http" + url: /cache-shm-fast-restart/object + headers: + fields: + - [ Host, example.com ] + - [ uuid, hit ] + - [ X-Debug, "x-cache,via" ] + + server-response: + status: 502 + reason: "Bad Gateway" + headers: + fields: + - [ Content-Length, 0 ] + + proxy-response: + status: 200 + headers: + fields: + - [ X-Cache, { value: hit-fresh, as: equal } ] diff --git a/tests/gold_tests/cache/shm_poke.py b/tests/gold_tests/cache/shm_poke.py new file mode 100644 index 00000000000..a4ed38c0485 --- /dev/null +++ b/tests/gold_tests/cache/shm_poke.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flip raw bytes in a cache shm segment file, for shm trust-gate autests. + +On Linux the POSIX shm segments are plain files under /dev/shm, so a segment +left behind by a clean shutdown can be tampered with between runs to drive the +control-segment trust gates (schema/ABI mismatch, an unterminated shm_name, +etc.). This is Linux-only: macOS POSIX shm segments are not path-addressable. + +Usage: + shm_poke.py PATH OFFSET HEXBYTES + +Example (set schema_version @8 to 9, little-endian uint32): + shm_poke.py /dev/shm/cshmx-12345-control 8 09000000 +""" + +import sys + + +def main() -> int: + if len(sys.argv) != 4: + sys.stderr.write(f'usage: {sys.argv[0]} PATH OFFSET HEXBYTES\n') + return 2 + path = sys.argv[1] + offset = int(sys.argv[2], 0) + data = bytes.fromhex(sys.argv[3]) + with open(path, 'r+b') as f: + f.seek(offset) + f.write(data) + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/tests/gold_tests/logging/ts_process_handler.py b/tests/gold_tests/logging/ts_process_handler.py index 40640e3922e..fdd1eb586cb 100644 --- a/tests/gold_tests/logging/ts_process_handler.py +++ b/tests/gold_tests/logging/ts_process_handler.py @@ -36,10 +36,13 @@ def __init__(self, message): def get_ts_process_pid(ts_identifier): processes = [] for proc in psutil.process_iter(['cmdline']): + # psutil returns a None cmdline for processes whose command line is not + # readable (e.g. system processes on macOS); skip those rather than + # letting ' '.join(None) raise TypeError before the target is found.
cmdline = proc.info.get('cmdline', []) if not cmdline: continue commandline = ' '.join(cmdline) if '/traffic_server' in commandline and ts_identifier in commandline: return proc raise GetPidError("Could not find a traffic_server process")