From ae090115c465637b799cdcfff1799e06a0bbed92 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 18:32:17 +0200 Subject: [PATCH 1/5] Add tamper-evident append-only fault audit log Append-only, hash-chained audit log of fault state transitions in the fault manager. Each transition appends one immutable row with record_hash = sha256(prev_hash + canonical(event)) (OpenSSL EVP SHA-256), a persisted chain head that resumes across restarts, a verify routine, a read API, and retention that seals a segment anchor before pruning so the surviving tail stays verifiable. Off by default. Refs #483 --- src/ros2_medkit_fault_manager/CHANGELOG.rst | 4 + src/ros2_medkit_fault_manager/CMakeLists.txt | 8 + src/ros2_medkit_fault_manager/README.md | 18 + .../config/fault_manager.yaml | 14 + .../fault_audit_log.hpp | 144 +++++ .../fault_manager_node.hpp | 17 + src/ros2_medkit_fault_manager/package.xml | 1 + .../src/fault_audit_log.cpp | 522 ++++++++++++++++++ .../src/fault_manager_node.cpp | 108 +++- .../test/test_fault_audit_log.cpp | 270 +++++++++ .../test/test_fault_manager.cpp | 119 ++++ 11 files changed, 1224 insertions(+), 1 deletion(-) create mode 100644 src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp create mode 100644 src/ros2_medkit_fault_manager/src/fault_audit_log.cpp create mode 100644 src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp diff --git a/src/ros2_medkit_fault_manager/CHANGELOG.rst b/src/ros2_medkit_fault_manager/CHANGELOG.rst index a6130582..902b8447 100644 --- a/src/ros2_medkit_fault_manager/CHANGELOG.rst +++ b/src/ros2_medkit_fault_manager/CHANGELOG.rst @@ -2,6 +2,10 @@ Changelog for package ros2_medkit_fault_manager ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Forthcoming +----------- +* Optional tamper-evident, append-only audit log of fault state transitions: each transition appends one immutable, hash-chained row (``record_hash = sha256(prev_hash + canonical(event))`` via OpenSSL EVP SHA-256) with a persisted chain head, a ``verify`` routine, a read API, and retention that seals a segment anchor before pruning. Off by default (`#483 `_) + 0.6.0 (2026-06-22) ------------------ * Bounded concurrent snapshot capture under fault storms with a ``CaptureThreadPool`` and configurable capture pool / queue / overflow-policy parameters. The rosbag leg is serialized and the cooldown map is bounded, so a burst of simultaneous faults can no longer exhaust capture threads or grow memory without limit (`#456 `_) diff --git a/src/ros2_medkit_fault_manager/CMakeLists.txt b/src/ros2_medkit_fault_manager/CMakeLists.txt index 23c4924c..9421ed7c 100644 --- a/src/ros2_medkit_fault_manager/CMakeLists.txt +++ b/src/ros2_medkit_fault_manager/CMakeLists.txt @@ -41,6 +41,8 @@ find_package(ros2_medkit_msgs REQUIRED) find_package(ros2_medkit_serialization REQUIRED) find_package(SQLite3 REQUIRED) find_package(nlohmann_json REQUIRED) +# OpenSSL EVP SHA-256 for the tamper-evident audit log hash chain +find_package(OpenSSL REQUIRED) # yaml-cpp is required as transitive dependency from ros2_medkit_serialization medkit_find_yaml_cpp() # rosbag2 for time-window snapshot recording @@ -55,6 +57,7 @@ add_library(fault_manager_lib STATIC src/fault_manager_node.cpp src/fault_storage.cpp src/sqlite_fault_storage.cpp + src/fault_audit_log.cpp src/snapshot_capture.cpp src/rosbag_capture.cpp src/correlation/types.cpp @@ -81,6 +84,7 @@ target_link_libraries(fault_manager_lib PUBLIC SQLite::SQLite3 nlohmann_json::nlohmann_json yaml-cpp::yaml-cpp + OpenSSL::Crypto ) medkit_apply_compat_defs(fault_manager_lib) @@ -143,6 +147,10 @@ if(BUILD_TESTING) medkit_target_dependencies(test_sqlite_storage rclcpp ros2_medkit_msgs) medkit_set_test_domain(test_sqlite_storage) + # Fault audit log tests (hash chain, verify, rotation, reopen) + ament_add_gtest(test_fault_audit_log test/test_fault_audit_log.cpp) + target_link_libraries(test_fault_audit_log fault_manager_lib) + # Snapshot capture tests ament_add_gtest(test_snapshot_capture test/test_snapshot_capture.cpp) target_link_libraries(test_snapshot_capture fault_manager_lib) diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index 8013acfe..cccaaa6a 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -53,6 +53,7 @@ ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \ - **Debounce filtering** (optional): AUTOSAR DEM-style counter-based fault confirmation with per-entity threshold overrides - **Snapshot capture**: Captures topic data when faults are confirmed for debugging (snapshots are deleted when fault is cleared) - **Fault correlation** (optional): Root cause analysis with symptom muting and auto-clear +- **Tamper-evident audit log** (optional): Append-only, hash-chained record of fault state transitions for verifiable history ## Parameters @@ -109,6 +110,23 @@ patterns: **Memory**: Faults are stored in memory only. Useful for testing or when persistence is not required. +## Advanced: Tamper-Evident Audit Log + +An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is verifiable and any later edit or deletion is detectable. It is **off by default** because it adds a write and storage cost per transition. + +Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only - the manager only ever inserts rows. + +`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row is caught by the persisted-head check. + +**Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `audit_log.enabled` | bool | `false` | Enable the tamper-evident audit log | +| `audit_log.transitions` | string | `"all"` | Which transitions to record: `"all"` or `"confirmed_only"` | +| `audit_log.database_path` | string | `""` | SQLite path. Empty => sibling `fault_audit.db` next to the fault DB (or `:memory:` for in-memory fault stores) | +| `audit_log.retention_max_records` | int | `0` | Seal + prune the oldest segment beyond this many retained records (0 = unlimited) | + ## Usage ### Launch diff --git a/src/ros2_medkit_fault_manager/config/fault_manager.yaml b/src/ros2_medkit_fault_manager/config/fault_manager.yaml index d421c0fd..3f94e71c 100644 --- a/src/ros2_medkit_fault_manager/config/fault_manager.yaml +++ b/src/ros2_medkit_fault_manager/config/fault_manager.yaml @@ -27,3 +27,17 @@ fault_manager: # snapshots.capture_pool_size: 2 # max concurrent capture threads (>= 1) # snapshots.capture_queue_depth: 16 # max pending captures (>= 1) # snapshots.capture_queue_full_policy: reject_newest # reject_newest | drop_oldest + + # Tamper-evident, append-only audit log of fault state transitions + # (occurred/confirmed/cleared). OFF by default: it adds a write + storage cost + # per transition. When enabled, each transition appends one immutable, + # hash-chained row so any later edit or deletion is detectable via verify. + audit_log.enabled: false + # Which transitions to record: "all" or "confirmed_only". + # audit_log.transitions: all + # SQLite path. Empty => sibling "fault_audit.db" next to the fault DB + # (or :memory: when the fault store is in-memory). + # audit_log.database_path: "" + # Seal + prune the oldest segment beyond this many retained records + # (0 = unlimited). A sealed anchor keeps the surviving tail verifiable. + # audit_log.retention_max_records: 0 diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp new file mode 100644 index 00000000..59cca09d --- /dev/null +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -0,0 +1,144 @@ +// Copyright 2026 mfaferek93, bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +namespace ros2_medkit_fault_manager { + +/// A single fault state-transition to record in the audit log. +/// +/// `transition` is one of the kTransition* constants below. The remaining +/// fields describe the fault at the moment of the transition; all of them feed +/// the canonical serialization that the hash chain is computed over, so any +/// later edit to a stored row is detectable. +struct AuditEvent { + std::string fault_code; + std::string transition; ///< occurred | confirmed | cleared | ack + uint8_t severity{0}; ///< severity at the time of the transition + std::string status; ///< resulting fault status (e.g. CONFIRMED) + std::string source_id; ///< reporting source that drove the transition + std::string description; ///< human-readable description + int64_t occurred_at_ns{0}; ///< wall-clock timestamp of the transition +}; + +/// Canonical transition kinds. Stored verbatim, so they are part of the hash. +constexpr const char * kTransitionOccurred = "occurred"; +constexpr const char * kTransitionConfirmed = "confirmed"; +constexpr const char * kTransitionCleared = "cleared"; +constexpr const char * kTransitionAck = "ack"; + +/// One immutable, hash-chained row read back from the audit log. +struct AuditRecord { + int64_t seq{0}; + AuditEvent event; + std::string prev_hash; + std::string record_hash; +}; + +/// Persisted head of the hash chain. +struct ChainHead { + int64_t seq{0}; ///< 0 when the chain is empty + std::string record_hash; ///< genesis hash when the chain is empty +}; + +/// Result of verifying the persisted chain. +struct AuditVerifyResult { + bool ok{true}; + int64_t checked{0}; ///< number of records walked + int64_t bad_seq{0}; ///< seq of the first offending record (0 if ok) + std::string error; ///< human-readable reason when !ok +}; + +/// Append-only, hash-chained audit log of fault state transitions. +/// +/// Each appended row stores `record_hash = sha256(prev_hash + canonical(event))` +/// using OpenSSL's EVP SHA-256 (the same primitive the gateway links). The hash +/// is computed once at insert and never recomputed. A persisted chain head lets +/// the chain resume across process restarts, and rotation seals a segment by +/// persisting an anchor (its final seq + hash) before pruning so the surviving +/// history stays verifiable. +/// +/// The table is treated as append-only: this class only ever INSERTs rows (and, +/// on rotation, deletes a sealed prefix). It never UPDATEs an existing record. +class FaultAuditLog { + public: + /// Open (or create) the audit log database. + /// @param db_path SQLite path. Use ":memory:" for an in-memory log. + /// @param retention_max_records Max records to retain before rotation seals + /// and prunes the oldest segment. 0 disables rotation (unlimited). + /// @throws std::runtime_error if the database cannot be opened or initialized. + explicit FaultAuditLog(const std::string & db_path, int64_t retention_max_records = 0); + + ~FaultAuditLog(); + + FaultAuditLog(const FaultAuditLog &) = delete; + FaultAuditLog & operator=(const FaultAuditLog &) = delete; + FaultAuditLog(FaultAuditLog &&) = delete; + FaultAuditLog & operator=(FaultAuditLog &&) = delete; + + /// Append one transition. Computes the chained hash, inserts the row, and + /// advances the persisted head, atomically. + /// @return the monotonic seq assigned to the new record. + int64_t append(const AuditEvent & event); + + /// Walk the persisted chain oldest-first and validate every link. + AuditVerifyResult verify() const; + + /// Read records oldest-first. + /// @param limit Max records to return (0 = all). + /// @param after_seq Only return records with seq > after_seq (0 = from start). + std::vector read(int64_t limit = 0, int64_t after_seq = 0) const; + + /// Current persisted chain head. + ChainHead head() const; + + /// Number of records currently retained (excludes pruned/sealed rows). + int64_t record_count() const; + + /// Deterministic canonical serialization of an event at a given seq. + /// Stable field order so verify is reproducible across processes. + static std::string canonicalize(int64_t seq, const AuditEvent & event); + + /// Genesis hash used as prev_hash for the very first record. + static std::string genesis_hash(); + + /// SHA-256 of `data` as a lowercase hex string (OpenSSL EVP). + static std::string sha256_hex(const std::string & data); + + const std::string & db_path() const { + return db_path_; + } + + private: + void initialize_schema(); + ChainHead load_head_locked() const; + void store_head_locked(const ChainHead & head_record); + /// Seal + prune the oldest segment if the retained count exceeds the limit. + void rotate_if_needed_locked(); + + std::string db_path_; + int64_t retention_max_records_{0}; + sqlite3 * db_{nullptr}; + mutable std::mutex mutex_; + ChainHead head_; ///< cached head, kept in sync with the head table +}; + +} // namespace ros2_medkit_fault_manager diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp index 9af908a7..7c603232 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp @@ -23,6 +23,7 @@ #include "ros2_medkit_fault_manager/capture_thread_pool.hpp" #include "ros2_medkit_fault_manager/correlation/correlation_engine.hpp" #include "ros2_medkit_fault_manager/entity_threshold_resolver.hpp" +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" #include "ros2_medkit_fault_manager/fault_storage.hpp" #include "ros2_medkit_fault_manager/rosbag_capture.hpp" #include "ros2_medkit_fault_manager/snapshot_capture.hpp" @@ -148,6 +149,18 @@ class FaultManagerNode : public rclcpp::Node { /// Falls back to global config if no entity-specific overrides match. DebounceConfig resolve_config(const std::string & source_id) const; + /// Create the tamper-evident audit log from parameters (nullptr if disabled). + std::unique_ptr create_audit_log(); + + /// Append a fault state-transition to the audit log when enabled. No-op when + /// the log is off, or when only confirmations are logged and this is not one. + /// @param transition One of the kTransition* constants. + /// @param fault The fault state at the time of the transition. + /// @param source_id The reporting source that drove the transition. + /// @param occurred_at_ns Wall-clock timestamp of the transition. + void audit_transition(const char * transition, const ros2_medkit_msgs::msg::Fault & fault, + const std::string & source_id, int64_t occurred_at_ns); + std::string storage_type_; std::string database_path_; int32_t confirmation_threshold_{-1}; @@ -162,6 +175,10 @@ class FaultManagerNode : public rclcpp::Node { std::unique_ptr storage_; std::unique_ptr threshold_resolver_; ///< Per-entity threshold overrides + /// Tamper-evident audit log of fault transitions (nullptr when disabled). + std::unique_ptr audit_log_; + bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged + rclcpp::Service::SharedPtr report_fault_srv_; rclcpp::Service::SharedPtr list_faults_srv_; rclcpp::Service::SharedPtr get_fault_srv_; diff --git a/src/ros2_medkit_fault_manager/package.xml b/src/ros2_medkit_fault_manager/package.xml index 1658c527..723c7fed 100644 --- a/src/ros2_medkit_fault_manager/package.xml +++ b/src/ros2_medkit_fault_manager/package.xml @@ -16,6 +16,7 @@ ros2_medkit_serialization libsqlite3-dev nlohmann-json-dev + libssl-dev rosbag2_cpp rosbag2_storage diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp new file mode 100644 index 00000000..8799eddb --- /dev/null +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -0,0 +1,522 @@ +// Copyright 2026 mfaferek93, bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace ros2_medkit_fault_manager { + +namespace { + +/// 64 hex zeros: prev_hash of the first record. A fixed, well-known anchor so a +/// verifier can confirm the chain starts where it claims to. +constexpr const char * kGenesisHash = "0000000000000000000000000000000000000000000000000000000000000000"; + +/// RAII wrapper for a prepared SQLite statement (audit-log local copy). +class Stmt { + public: + Stmt(sqlite3 * db, const char * sql) : db_(db) { + if (sqlite3_prepare_v2(db, sql, -1, &stmt_, nullptr) != SQLITE_OK) { + throw std::runtime_error(std::string("audit: failed to prepare: ") + sqlite3_errmsg(db)); + } + } + + ~Stmt() { + if (stmt_) { + sqlite3_finalize(stmt_); + } + } + + Stmt(const Stmt &) = delete; + Stmt & operator=(const Stmt &) = delete; + + void bind_text(int index, const std::string & value) { + if (value.size() > static_cast(std::numeric_limits::max())) { + throw std::runtime_error("audit: text too large to bind"); + } + if (sqlite3_bind_text(stmt_, index, value.c_str(), static_cast(value.size()), SQLITE_TRANSIENT) != SQLITE_OK) { + throw std::runtime_error(std::string("audit: failed to bind text: ") + sqlite3_errmsg(db_)); + } + } + + void bind_int64(int index, int64_t value) { + if (sqlite3_bind_int64(stmt_, index, value) != SQLITE_OK) { + throw std::runtime_error(std::string("audit: failed to bind int64: ") + sqlite3_errmsg(db_)); + } + } + + int step() { + return sqlite3_step(stmt_); + } + + std::string column_text(int index) { + const auto * text = reinterpret_cast(sqlite3_column_text(stmt_, index)); + return text ? std::string(text) : std::string(); + } + + int64_t column_int64(int index) { + return sqlite3_column_int64(stmt_, index); + } + + private: + sqlite3 * db_; + sqlite3_stmt * stmt_{nullptr}; +}; + +void exec_or_throw(sqlite3 * db, const char * sql, const char * what) { + char * err_msg = nullptr; + if (sqlite3_exec(db, sql, nullptr, nullptr, &err_msg) != SQLITE_OK) { + std::string error = err_msg ? err_msg : "unknown error"; + sqlite3_free(err_msg); + throw std::runtime_error(std::string("audit: ") + what + ": " + error); + } +} + +} // namespace + +FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_records) + : db_path_(db_path), retention_max_records_(retention_max_records < 0 ? 0 : retention_max_records) { + int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_FULLMUTEX; + if (sqlite3_open_v2(db_path.c_str(), &db_, flags, nullptr) != SQLITE_OK) { + std::string error = db_ ? sqlite3_errmsg(db_) : "unknown error"; + if (db_) { + sqlite3_close(db_); + db_ = nullptr; + } + throw std::runtime_error("audit: failed to open '" + db_path + "': " + error); + } + + exec_or_throw(db_, "PRAGMA journal_mode=WAL;", "enable WAL"); + sqlite3_busy_timeout(db_, 5000); + + initialize_schema(); + head_ = load_head_locked(); +} + +FaultAuditLog::~FaultAuditLog() { + if (db_) { + sqlite3_close(db_); + } +} + +std::string FaultAuditLog::genesis_hash() { + return kGenesisHash; +} + +std::string FaultAuditLog::sha256_hex(const std::string & data) { + std::array md{}; + unsigned int md_len = 0; + + EVP_MD_CTX * ctx = EVP_MD_CTX_new(); + if (ctx == nullptr) { + throw std::runtime_error("audit: EVP_MD_CTX_new failed"); + } + const bool ok = EVP_DigestInit_ex(ctx, EVP_sha256(), nullptr) == 1 && + EVP_DigestUpdate(ctx, data.data(), data.size()) == 1 && + EVP_DigestFinal_ex(ctx, md.data(), &md_len) == 1; + EVP_MD_CTX_free(ctx); + if (!ok) { + throw std::runtime_error("audit: SHA-256 digest failed"); + } + + static constexpr char kHex[] = "0123456789abcdef"; + std::string out; + out.reserve(static_cast(md_len) * 2); + for (unsigned int i = 0; i < md_len; ++i) { + out.push_back(kHex[md[i] >> 4]); + out.push_back(kHex[md[i] & 0x0F]); + } + return out; +} + +std::string FaultAuditLog::canonicalize(int64_t seq, const AuditEvent & event) { + // Deterministic field order with JSON-escaped string values. Numeric fields + // render in fixed decimal form. This is the exact byte sequence the hash is + // taken over, so it must be stable across processes and re-reads. + std::string out; + out.reserve(192); + out += "{\"seq\":"; + out += std::to_string(seq); + out += ",\"ts\":"; + out += std::to_string(event.occurred_at_ns); + out += ",\"code\":"; + out += nlohmann::json(event.fault_code).dump(); + out += ",\"transition\":"; + out += nlohmann::json(event.transition).dump(); + out += ",\"severity\":"; + out += std::to_string(static_cast(event.severity)); + out += ",\"status\":"; + out += nlohmann::json(event.status).dump(); + out += ",\"source\":"; + out += nlohmann::json(event.source_id).dump(); + out += ",\"description\":"; + out += nlohmann::json(event.description).dump(); + out += "}"; + return out; +} + +void FaultAuditLog::initialize_schema() { + // Immutable transition rows. seq is the monotonic chain index. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_log ( + seq INTEGER PRIMARY KEY, + occurred_at_ns INTEGER NOT NULL, + fault_code TEXT NOT NULL, + transition TEXT NOT NULL, + severity INTEGER NOT NULL, + status TEXT NOT NULL, + source_id TEXT NOT NULL, + description TEXT NOT NULL, + prev_hash TEXT NOT NULL, + record_hash TEXT NOT NULL + ); + CREATE INDEX IF NOT EXISTS idx_audit_log_fault_code ON audit_log(fault_code); + )", + "create audit_log table"); + + // Single-row persisted chain head, so the chain resumes across restarts. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_chain_head ( + id INTEGER PRIMARY KEY CHECK (id = 1), + seq INTEGER NOT NULL, + record_hash TEXT NOT NULL + ); + )", + "create audit_chain_head table"); + + // Sealed-segment anchors written before pruning. Each captures the final + // (seq, hash) of a pruned prefix so the surviving tail stays verifiable. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_anchors ( + last_seq INTEGER PRIMARY KEY, + sealed_at_ns INTEGER NOT NULL, + last_hash TEXT NOT NULL + ); + )", + "create audit_anchors table"); +} + +ChainHead FaultAuditLog::load_head_locked() const { + // Prefer the persisted head row. + { + Stmt stmt(db_, "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); + if (stmt.step() == SQLITE_ROW) { + ChainHead head_record; + head_record.seq = stmt.column_int64(0); + head_record.record_hash = stmt.column_text(1); + return head_record; + } + } + + // No head row. Recover from the last retained record if any exist (defensive: + // a crash between INSERT and head update would land here on reopen). + { + Stmt stmt(db_, "SELECT seq, record_hash FROM audit_log ORDER BY seq DESC LIMIT 1"); + if (stmt.step() == SQLITE_ROW) { + ChainHead head_record; + head_record.seq = stmt.column_int64(0); + head_record.record_hash = stmt.column_text(1); + return head_record; + } + } + + return ChainHead{0, genesis_hash()}; +} + +void FaultAuditLog::store_head_locked(const ChainHead & head_record) { + Stmt stmt(db_, + "INSERT INTO audit_chain_head (id, seq, record_hash) VALUES (1, ?, ?) " + "ON CONFLICT(id) DO UPDATE SET seq = excluded.seq, record_hash = excluded.record_hash"); + stmt.bind_int64(1, head_record.seq); + stmt.bind_text(2, head_record.record_hash); + if (stmt.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to store head: ") + sqlite3_errmsg(db_)); + } +} + +int64_t FaultAuditLog::append(const AuditEvent & event) { + std::lock_guard lock(mutex_); + + const int64_t new_seq = head_.seq + 1; + const std::string prev_hash = head_.record_hash; + const std::string canonical = canonicalize(new_seq, event); + const std::string record_hash = sha256_hex(prev_hash + canonical); + + exec_or_throw(db_, "BEGIN IMMEDIATE", "begin append"); + try { + Stmt insert(db_, + "INSERT INTO audit_log (seq, occurred_at_ns, fault_code, transition, severity, status, " + "source_id, description, prev_hash, record_hash) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); + insert.bind_int64(1, new_seq); + insert.bind_int64(2, event.occurred_at_ns); + insert.bind_text(3, event.fault_code); + insert.bind_text(4, event.transition); + insert.bind_int64(5, static_cast(event.severity)); + insert.bind_text(6, event.status); + insert.bind_text(7, event.source_id); + insert.bind_text(8, event.description); + insert.bind_text(9, prev_hash); + insert.bind_text(10, record_hash); + if (insert.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to insert record: ") + sqlite3_errmsg(db_)); + } + + store_head_locked(ChainHead{new_seq, record_hash}); + exec_or_throw(db_, "COMMIT", "commit append"); + } catch (...) { + exec_or_throw(db_, "ROLLBACK", "rollback append"); + throw; + } + + head_ = ChainHead{new_seq, record_hash}; + + rotate_if_needed_locked(); + return new_seq; +} + +void FaultAuditLog::rotate_if_needed_locked() { + if (retention_max_records_ <= 0) { + return; + } + + // Count retained rows and find the oldest surviving seq. + int64_t count = 0; + int64_t min_seq = 0; + { + Stmt stmt(db_, "SELECT COUNT(*), COALESCE(MIN(seq), 0) FROM audit_log"); + if (stmt.step() == SQLITE_ROW) { + count = stmt.column_int64(0); + min_seq = stmt.column_int64(1); + } + } + if (count <= retention_max_records_) { + return; + } + + // Prune the oldest (count - retention_max_records_) rows. The boundary row is + // the highest seq being pruned; its hash becomes the sealed anchor that the + // first surviving row's prev_hash links back to. + const int64_t prune_count = count - retention_max_records_; + const int64_t boundary_seq = min_seq + prune_count - 1; + + std::string boundary_hash; + int64_t sealed_at_ns = 0; + { + Stmt stmt(db_, "SELECT record_hash, occurred_at_ns FROM audit_log WHERE seq = ?"); + stmt.bind_int64(1, boundary_seq); + if (stmt.step() != SQLITE_ROW) { + // Should not happen given the count; leave the log intact rather than + // prune without a valid anchor. + return; + } + boundary_hash = stmt.column_text(0); + sealed_at_ns = stmt.column_int64(1); + } + + exec_or_throw(db_, "BEGIN IMMEDIATE", "begin rotate"); + try { + Stmt anchor(db_, + "INSERT INTO audit_anchors (last_seq, sealed_at_ns, last_hash) VALUES (?, ?, ?) " + "ON CONFLICT(last_seq) DO NOTHING"); + anchor.bind_int64(1, boundary_seq); + anchor.bind_int64(2, sealed_at_ns); + anchor.bind_text(3, boundary_hash); + if (anchor.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to write anchor: ") + sqlite3_errmsg(db_)); + } + + Stmt del(db_, "DELETE FROM audit_log WHERE seq <= ?"); + del.bind_int64(1, boundary_seq); + if (del.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to prune records: ") + sqlite3_errmsg(db_)); + } + exec_or_throw(db_, "COMMIT", "commit rotate"); + } catch (...) { + exec_or_throw(db_, "ROLLBACK", "rollback rotate"); + throw; + } +} + +std::vector FaultAuditLog::read(int64_t limit, int64_t after_seq) const { + std::lock_guard lock(mutex_); + + std::string sql = + "SELECT seq, occurred_at_ns, fault_code, transition, severity, status, source_id, description, " + "prev_hash, record_hash FROM audit_log WHERE seq > ? ORDER BY seq ASC"; + if (limit > 0) { + sql += " LIMIT ?"; + } + + Stmt stmt(db_, sql.c_str()); + stmt.bind_int64(1, after_seq); + if (limit > 0) { + stmt.bind_int64(2, limit); + } + + std::vector result; + while (stmt.step() == SQLITE_ROW) { + AuditRecord rec; + rec.seq = stmt.column_int64(0); + rec.event.occurred_at_ns = stmt.column_int64(1); + rec.event.fault_code = stmt.column_text(2); + rec.event.transition = stmt.column_text(3); + rec.event.severity = static_cast(stmt.column_int64(4)); + rec.event.status = stmt.column_text(5); + rec.event.source_id = stmt.column_text(6); + rec.event.description = stmt.column_text(7); + rec.prev_hash = stmt.column_text(8); + rec.record_hash = stmt.column_text(9); + result.push_back(std::move(rec)); + } + return result; +} + +ChainHead FaultAuditLog::head() const { + std::lock_guard lock(mutex_); + return head_; +} + +int64_t FaultAuditLog::record_count() const { + std::lock_guard lock(mutex_); + Stmt stmt(db_, "SELECT COUNT(*) FROM audit_log"); + if (stmt.step() != SQLITE_ROW) { + return 0; + } + return stmt.column_int64(0); +} + +AuditVerifyResult FaultAuditLog::verify() const { + std::lock_guard lock(mutex_); + + AuditVerifyResult result; + + // Walk every retained row oldest-first, recomputing each link. + Stmt stmt(db_, + "SELECT seq, occurred_at_ns, fault_code, transition, severity, status, source_id, description, " + "prev_hash, record_hash FROM audit_log ORDER BY seq ASC"); + + bool first = true; + int64_t expected_seq = 0; + std::string expected_prev; + + while (stmt.step() == SQLITE_ROW) { + AuditRecord rec; + rec.seq = stmt.column_int64(0); + rec.event.occurred_at_ns = stmt.column_int64(1); + rec.event.fault_code = stmt.column_text(2); + rec.event.transition = stmt.column_text(3); + rec.event.severity = static_cast(stmt.column_int64(4)); + rec.event.status = stmt.column_text(5); + rec.event.source_id = stmt.column_text(6); + rec.event.description = stmt.column_text(7); + rec.prev_hash = stmt.column_text(8); + rec.record_hash = stmt.column_text(9); + + if (first) { + first = false; + // The first retained row must link to genesis (seq 1) or to a sealed + // anchor whose last_seq == rec.seq - 1. + if (rec.seq == 1) { + if (rec.prev_hash != genesis_hash()) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "first record does not link to genesis"; + return result; + } + } else { + Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + anchor.bind_int64(1, rec.seq - 1); + if (anchor.step() != SQLITE_ROW) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "no sealed anchor for the oldest retained record (history truncated)"; + return result; + } + const std::string anchor_hash = anchor.column_text(0); + if (rec.prev_hash != anchor_hash) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "oldest retained record does not link to its sealed anchor"; + return result; + } + } + expected_seq = rec.seq; + expected_prev = rec.prev_hash; + } else { + // Subsequent rows must be contiguous and chain to the previous record. + if (rec.seq != expected_seq) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "non-contiguous seq (record deleted or reordered)"; + return result; + } + if (rec.prev_hash != expected_prev) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "prev_hash does not match previous record_hash (chain broken)"; + return result; + } + } + + const std::string recomputed = sha256_hex(rec.prev_hash + canonicalize(rec.seq, rec.event)); + if (recomputed != rec.record_hash) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "record_hash mismatch (record tampered)"; + return result; + } + + ++result.checked; + expected_prev = rec.record_hash; + expected_seq = rec.seq + 1; + } + + // The persisted head must match the last retained record (catches deletion of + // the newest row, which the row walk alone cannot see). + if (result.checked > 0) { + if (head_.seq != expected_seq - 1 || head_.record_hash != expected_prev) { + result.ok = false; + result.bad_seq = head_.seq; + result.error = "persisted head does not match the last retained record"; + return result; + } + } else { + // Empty retained log: head must be either genesis (never written) or point + // at a sealed anchor (everything pruned). + if (head_.seq != 0) { + Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + anchor.bind_int64(1, head_.seq); + if (anchor.step() != SQLITE_ROW || anchor.column_text(0) != head_.record_hash) { + result.ok = false; + result.bad_seq = head_.seq; + result.error = "head references a record that is neither retained nor sealed"; + return result; + } + } + } + + return result; +} + +} // namespace ros2_medkit_fault_manager diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index c704d989..5692693d 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -151,6 +151,9 @@ FaultManagerNode::FaultManagerNode(const rclcpp::NodeOptions & options) : Node(" // Create storage backend storage_ = create_storage(); + // Create the tamper-evident fault audit log (disabled by default) + audit_log_ = create_audit_log(); + // Apply snapshot limit to storage if (max_snapshots > 0) { storage_->set_max_snapshots_per_fault(static_cast(max_snapshots)); @@ -347,6 +350,91 @@ std::unique_ptr FaultManagerNode::create_storage() { return std::make_unique(); } +std::unique_ptr FaultManagerNode::create_audit_log() { + const bool enabled = declare_parameter("audit_log.enabled", false); + + // Which transitions to record: "all" (occurred/confirmed/cleared) or + // "confirmed_only". + const std::string transitions = declare_parameter("audit_log.transitions", "all"); + audit_confirmed_only_ = (transitions == "confirmed_only"); + if (transitions != "all" && transitions != "confirmed_only") { + RCLCPP_WARN(get_logger(), "audit_log.transitions '%s' invalid, using 'all'", transitions.c_str()); + audit_confirmed_only_ = false; + } + + // Retention: seal + prune the oldest segment beyond this many records (0 = off). + auto retention = declare_parameter("audit_log.retention_max_records", 0); + if (retention < 0) { + RCLCPP_WARN(get_logger(), "audit_log.retention_max_records < 0, disabling rotation"); + retention = 0; + } + + // Path: explicit override, else a sibling of the fault DB, else :memory:. + std::string audit_path = declare_parameter("audit_log.database_path", ""); + + if (!enabled) { + return nullptr; // No table, no file, no write overhead when off. + } + + if (audit_path.empty()) { + if (database_path_ == ":memory:" || storage_type_ != "sqlite") { + audit_path = ":memory:"; + } else { + std::filesystem::path base(database_path_); + audit_path = (base.parent_path() / "fault_audit.db").string(); + } + } + + if (audit_path != ":memory:") { + std::filesystem::path p(audit_path); + auto parent = p.parent_path(); + if (!parent.empty() && !std::filesystem::exists(parent)) { + try { + std::filesystem::create_directories(parent); + } catch (const std::filesystem::filesystem_error & e) { + RCLCPP_ERROR(get_logger(), "Failed to create audit log directory '%s': %s", parent.string().c_str(), e.what()); + throw; + } + } + } + + try { + auto log = std::make_unique(audit_path, retention); + RCLCPP_INFO(get_logger(), "Fault audit log enabled: %s (transitions=%s, retention=%ld, resume_seq=%ld)", + audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, log->head().seq); + return log; + } catch (const std::exception & e) { + RCLCPP_ERROR(get_logger(), "Failed to open fault audit log '%s': %s", audit_path.c_str(), e.what()); + throw; + } +} + +void FaultManagerNode::audit_transition(const char * transition, const ros2_medkit_msgs::msg::Fault & fault, + const std::string & source_id, int64_t occurred_at_ns) { + if (!audit_log_) { + return; + } + if (audit_confirmed_only_ && std::string(transition) != kTransitionConfirmed) { + return; + } + + AuditEvent event; + event.fault_code = fault.fault_code; + event.transition = transition; + event.severity = fault.severity; + event.status = fault.status; + event.source_id = source_id; + event.description = fault.description; + event.occurred_at_ns = occurred_at_ns; + + try { + audit_log_->append(event); + } catch (const std::exception & e) { + RCLCPP_ERROR(get_logger(), "Failed to append audit record for '%s' (%s): %s", fault.fault_code.c_str(), transition, + e.what()); + } +} + void FaultManagerNode::handle_report_fault( const std::shared_ptr & request, const std::shared_ptr & response) { @@ -390,9 +478,10 @@ void FaultManagerNode::handle_report_fault( auto resolved_config = resolve_config(request->source_id); // Report the fault event (use wall clock time, not sim time, for proper timestamps) + const rclcpp::Time event_time = get_wall_clock_time(); bool is_new = storage_->report_fault_event(request->fault_code, request->event_type, request->severity, request->description, - request->source_id, get_wall_clock_time(), resolved_config); + request->source_id, event_time, resolved_config); response->accepted = true; @@ -449,6 +538,16 @@ void FaultManagerNode::handle_report_fault( } // Note: PREFAILED/PREPASSED status changes don't emit events (debounce in progress) + // Append tamper-evident audit records for the transitions that just happened. + // Recorded regardless of correlation muting: muting affects display, not the + // fact that the state transition occurred. + if (is_new) { + audit_transition(kTransitionOccurred, *fault_after, request->source_id, event_time.nanoseconds()); + } + if (just_confirmed) { + audit_transition(kTransitionConfirmed, *fault_after, request->source_id, event_time.nanoseconds()); + } + // Capture snapshots/rosbag when a fault confirms via the bounded pool (issue #441). // handle_report_fault runs on the single-threaded executor, so confirmations are // already serialized; last_capture_mutex_ only guards last_capture_times_ itself @@ -657,6 +756,12 @@ void FaultManagerNode::handle_clear_fault( // Auto-clear correlated symptoms for (const auto & symptom_code : auto_cleared_codes) { storage_->clear_fault(symptom_code); + if (audit_log_) { + auto symptom = storage_->get_fault(symptom_code); + if (symptom) { + audit_transition(kTransitionCleared, *symptom, "clear_service", get_wall_clock_time().nanoseconds()); + } + } RCLCPP_DEBUG(get_logger(), "Auto-cleared symptom: %s (root cause: %s)", symptom_code.c_str(), request->fault_code.c_str()); // Also cleanup rosbag for auto-cleared faults @@ -684,6 +789,7 @@ void FaultManagerNode::handle_clear_fault( auto fault = storage_->get_fault(request->fault_code); if (fault) { publish_fault_event(ros2_medkit_msgs::msg::FaultEvent::EVENT_CLEARED, *fault, auto_cleared_codes); + audit_transition(kTransitionCleared, *fault, "clear_service", get_wall_clock_time().nanoseconds()); } } else { response->message = "Fault not found: " + request->fault_code; diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp new file mode 100644 index 00000000..21fd3448 --- /dev/null +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -0,0 +1,270 @@ +// Copyright 2026 mfaferek93, bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include + +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" + +using ros2_medkit_fault_manager::AuditEvent; +using ros2_medkit_fault_manager::FaultAuditLog; +using ros2_medkit_fault_manager::kTransitionCleared; +using ros2_medkit_fault_manager::kTransitionConfirmed; +using ros2_medkit_fault_manager::kTransitionOccurred; + +namespace { + +AuditEvent make_event(const std::string & code, const char * transition, int64_t ts) { + AuditEvent e; + e.fault_code = code; + e.transition = transition; + e.severity = 2; + e.status = "CONFIRMED"; + e.source_id = "/robot/source"; + e.description = "pump pressure low"; + e.occurred_at_ns = ts; + return e; +} + +/// Run a single SQL statement directly against the audit DB file (used to +/// simulate tampering an immutable row). +void raw_exec(const std::string & db_path, const std::string & sql) { + sqlite3 * db = nullptr; + ASSERT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + char * err = nullptr; + int rc = sqlite3_exec(db, sql.c_str(), nullptr, nullptr, &err); + std::string err_str = err ? err : ""; + sqlite3_free(err); + sqlite3_close(db); + ASSERT_EQ(rc, SQLITE_OK) << err_str; +} + +} // namespace + +class FaultAuditLogTest : public ::testing::Test { + protected: + void SetUp() override { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + path_ = (std::filesystem::temp_directory_path() / ("test_audit_" + std::to_string(dist(gen)) + ".db")).string(); + } + + void TearDown() override { + std::error_code ec; + std::filesystem::remove(path_, ec); + std::filesystem::remove(path_ + "-wal", ec); + std::filesystem::remove(path_ + "-shm", ec); + } + + std::string path_; +}; + +// Each transition appends a chained row with a monotonic seq and linked hashes. +TEST_F(FaultAuditLogTest, AppendsChainedRowPerTransition) { + FaultAuditLog log(path_); + + EXPECT_EQ(log.append(make_event("F1", kTransitionOccurred, 100)), 1); + EXPECT_EQ(log.append(make_event("F1", kTransitionConfirmed, 200)), 2); + EXPECT_EQ(log.append(make_event("F1", kTransitionCleared, 300)), 3); + + auto records = log.read(); + ASSERT_EQ(records.size(), 3u); + + // First row links to genesis; each subsequent prev_hash equals the prior hash. + EXPECT_EQ(records[0].seq, 1); + EXPECT_EQ(records[0].prev_hash, FaultAuditLog::genesis_hash()); + EXPECT_EQ(records[1].prev_hash, records[0].record_hash); + EXPECT_EQ(records[2].prev_hash, records[1].record_hash); + + // record_hash is the sha256 of prev_hash + canonical(event). + const std::string expected = + FaultAuditLog::sha256_hex(records[0].prev_hash + FaultAuditLog::canonicalize(1, records[0].event)); + EXPECT_EQ(records[0].record_hash, expected); + + EXPECT_EQ(log.record_count(), 3); + EXPECT_EQ(log.head().seq, 3); + EXPECT_EQ(log.head().record_hash, records[2].record_hash); +} + +// Verify confirms an untampered chain. +TEST_F(FaultAuditLogTest, VerifyUntamperedChain) { + FaultAuditLog log(path_); + for (int i = 0; i < 10; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + auto result = log.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 10); +} + +// Known SHA-256 vector proves the EVP wiring (sha256("") == e3b0c442...). +TEST_F(FaultAuditLogTest, Sha256KnownVector) { + EXPECT_EQ(FaultAuditLog::sha256_hex(""), + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); +} + +// Editing a past row makes verify fail. +TEST_F(FaultAuditLogTest, EditingPastRowFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + ASSERT_TRUE(log.verify().ok); + } + + // Tamper: change a stored field without recomputing the hash. + raw_exec(path_, "UPDATE audit_log SET description = 'forged' WHERE seq = 2"); + + FaultAuditLog reopened(path_); + auto result = reopened.verify(); + EXPECT_FALSE(result.ok); + EXPECT_EQ(result.bad_seq, 2); +} + +// Deleting a middle row makes verify fail (chain gap). +TEST_F(FaultAuditLogTest, DeletingMiddleRowFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + } + + raw_exec(path_, "DELETE FROM audit_log WHERE seq = 2"); + + FaultAuditLog reopened(path_); + EXPECT_FALSE(reopened.verify().ok); +} + +// Deleting the newest row is caught by the persisted head check. +TEST_F(FaultAuditLogTest, DeletingNewestRowFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + } + + // Drop the last row but leave the head pointing at seq 3. + raw_exec(path_, "DELETE FROM audit_log WHERE seq = 3"); + + FaultAuditLog reopened(path_); + EXPECT_FALSE(reopened.verify().ok); +} + +// The chain head persists across a reopen and the chain resumes from it. +TEST_F(FaultAuditLogTest, HeadPersistsAcrossReopen) { + std::string head_hash; + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionConfirmed, 200)); + head_hash = log.head().record_hash; + EXPECT_EQ(log.head().seq, 2); + } + + FaultAuditLog reopened(path_); + EXPECT_EQ(reopened.head().seq, 2); + EXPECT_EQ(reopened.head().record_hash, head_hash); + + // The next append continues the same chain. + EXPECT_EQ(reopened.append(make_event("F3", kTransitionCleared, 300)), 3); + auto records = reopened.read(); + ASSERT_EQ(records.size(), 3u); + EXPECT_EQ(records[2].prev_hash, head_hash); + EXPECT_TRUE(reopened.verify().ok); +} + +// Rotation seals a segment and prunes it, leaving the tail verifiable. +TEST_F(FaultAuditLogTest, RotationSealsAndPrunesButStaysVerifiable) { + FaultAuditLog log(path_, /*retention_max_records=*/5); + for (int i = 1; i <= 12; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + + // Only the most recent 5 rows are retained. + EXPECT_EQ(log.record_count(), 5); + auto records = log.read(); + ASSERT_EQ(records.size(), 5u); + EXPECT_EQ(records.front().seq, 8); + EXPECT_EQ(records.back().seq, 12); + EXPECT_EQ(log.head().seq, 12); + + // The surviving tail still verifies via the sealed anchor. + auto result = log.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 5); +} + +// Tampering a row inside a rotated tail is still detected. +TEST_F(FaultAuditLogTest, RotationThenTamperFails) { + { + FaultAuditLog log(path_, /*retention_max_records=*/5); + for (int i = 1; i <= 12; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + } + raw_exec(path_, "UPDATE audit_log SET source_id = 'forged' WHERE seq = 10"); + + FaultAuditLog reopened(path_, 5); + auto result = reopened.verify(); + EXPECT_FALSE(result.ok); + EXPECT_EQ(result.bad_seq, 10); +} + +// Removing the sealed anchor breaks the link for the oldest retained row. +TEST_F(FaultAuditLogTest, MissingAnchorFailsVerify) { + { + FaultAuditLog log(path_, /*retention_max_records=*/5); + for (int i = 1; i <= 12; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + } + raw_exec(path_, "DELETE FROM audit_anchors"); + + FaultAuditLog reopened(path_, 5); + EXPECT_FALSE(reopened.verify().ok); +} + +// canonicalize is deterministic and order-stable. +TEST_F(FaultAuditLogTest, CanonicalizeDeterministic) { + auto e = make_event("F1", kTransitionConfirmed, 12345); + EXPECT_EQ(FaultAuditLog::canonicalize(7, e), FaultAuditLog::canonicalize(7, e)); + // seq is part of the canonical form, so a different seq changes the bytes. + EXPECT_NE(FaultAuditLog::canonicalize(7, e), FaultAuditLog::canonicalize(8, e)); +} + +// read(after_seq) returns only newer records, oldest-first. +TEST_F(FaultAuditLogTest, ReadAfterSeq) { + FaultAuditLog log(path_); + for (int i = 1; i <= 5; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + auto tail = log.read(/*limit=*/0, /*after_seq=*/3); + ASSERT_EQ(tail.size(), 2u); + EXPECT_EQ(tail[0].seq, 4); + EXPECT_EQ(tail[1].seq, 5); +} + +int main(int argc, char ** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 90d84af7..623dfdf9 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -16,12 +16,15 @@ #include #include +#include #include #include +#include #include #include #include "rclcpp/rclcpp.hpp" +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" #include "ros2_medkit_fault_manager/fault_manager_node.hpp" #include "ros2_medkit_fault_manager/fault_storage.hpp" #include "ros2_medkit_msgs/msg/fault.hpp" @@ -1361,6 +1364,122 @@ TEST_F(SnapshotCooldownTest, CooldownPreventsRapidRecapture) { SUCCEED(); } +// End-to-end check that the node hooks the audit log on the fault write path. +// Reports a CRITICAL fault (immediate confirm => occurred + confirmed), then +// clears it (=> cleared), and inspects the persisted audit DB by reopening it. +class FaultAuditIntegrationTest : public ::testing::Test { + protected: + void SetUp() override { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + audit_path_ = (std::filesystem::temp_directory_path() / ("test_node_audit_" + std::to_string(dist(gen)) + ".db")) + .string(); + ns_ = "/test_audit_" + std::to_string(dist(gen)); + + rclcpp::NodeOptions fm_options; + fm_options.parameter_overrides({ + {"storage_type", "memory"}, + {"confirmation_threshold", -1}, + {"audit_log.enabled", true}, + {"audit_log.database_path", audit_path_}, + }); + fm_options.arguments({"--ros-args", "-r", "__ns:=" + ns_}); + fault_manager_ = std::make_shared(fm_options); + + rclcpp::NodeOptions test_options; + test_options.arguments({"--ros-args", "-r", "__ns:=" + ns_}); + test_node_ = std::make_shared("test_audit_client", test_options); + + report_client_ = test_node_->create_client(ns_ + "/fault_manager/report_fault"); + clear_client_ = test_node_->create_client(ns_ + "/fault_manager/clear_fault"); + ASSERT_TRUE(report_client_->wait_for_service(std::chrono::seconds(5))); + ASSERT_TRUE(clear_client_->wait_for_service(std::chrono::seconds(5))); + } + + void TearDown() override { + report_client_.reset(); + clear_client_.reset(); + test_node_.reset(); + fault_manager_.reset(); + std::error_code ec; + std::filesystem::remove(audit_path_, ec); + std::filesystem::remove(audit_path_ + "-wal", ec); + std::filesystem::remove(audit_path_ + "-shm", ec); + } + + template + bool spin_until_ready(FutureT & future) { + auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - start < std::chrono::seconds(2)) { + rclcpp::spin_some(fault_manager_); + rclcpp::spin_some(test_node_); + if (future.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return false; + } + + std::string audit_path_; + std::string ns_; + std::shared_ptr fault_manager_; + std::shared_ptr test_node_; + rclcpp::Client::SharedPtr report_client_; + rclcpp::Client::SharedPtr clear_client_; +}; + +TEST_F(FaultAuditIntegrationTest, TransitionsAppendVerifiableChain) { + auto report = std::make_shared(); + report->fault_code = "AUDIT_FAULT"; + report->event_type = ReportFault::Request::EVENT_FAILED; + report->severity = Fault::SEVERITY_CRITICAL; // immediate confirm + report->description = "overpressure"; + report->source_id = "/plc/pump"; + auto rf = report_client_->async_send_request(report); + ASSERT_TRUE(spin_until_ready(rf)); + ASSERT_TRUE(rf.get()->accepted); + + auto clear = std::make_shared(); + clear->fault_code = "AUDIT_FAULT"; + auto cf = clear_client_->async_send_request(clear); + ASSERT_TRUE(spin_until_ready(cf)); + ASSERT_TRUE(cf.get()->success); + + // Reopen the audit DB independently and inspect the persisted chain. + ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); + auto records = audit.read(); + ASSERT_EQ(records.size(), 3u); + EXPECT_EQ(records[0].event.transition, ros2_medkit_fault_manager::kTransitionOccurred); + EXPECT_EQ(records[1].event.transition, ros2_medkit_fault_manager::kTransitionConfirmed); + EXPECT_EQ(records[2].event.transition, ros2_medkit_fault_manager::kTransitionCleared); + EXPECT_EQ(records[0].event.fault_code, "AUDIT_FAULT"); + EXPECT_EQ(records[1].event.source_id, "/plc/pump"); + + auto result = audit.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 3); +} + +TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + const std::string audit_path = + (std::filesystem::temp_directory_path() / ("test_audit_off_" + std::to_string(dist(gen)) + ".db")).string(); + + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"audit_log.database_path", audit_path}, // default enabled=false + }); + auto node = std::make_shared(options); + + // With the feature off, no audit database file is created. + EXPECT_FALSE(std::filesystem::exists(audit_path)); +} + int main(int argc, char ** argv) { rclcpp::init(argc, argv); ::testing::InitGoogleTest(&argc, argv); From 63976785d1deb3fb0bcd45c5384db7f7667c2cdf Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 16:36:35 +0000 Subject: [PATCH 2/5] style: apply clang-format-18 --- .../fault_audit_log.hpp | 20 +++++++++---------- .../src/fault_audit_log.cpp | 2 +- .../src/fault_manager_node.cpp | 5 ++--- .../test/test_fault_audit_log.cpp | 3 +-- .../test/test_fault_manager.cpp | 7 +++---- 5 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp index 59cca09d..039a12e5 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -31,11 +31,11 @@ namespace ros2_medkit_fault_manager { /// later edit to a stored row is detectable. struct AuditEvent { std::string fault_code; - std::string transition; ///< occurred | confirmed | cleared | ack - uint8_t severity{0}; ///< severity at the time of the transition - std::string status; ///< resulting fault status (e.g. CONFIRMED) - std::string source_id; ///< reporting source that drove the transition - std::string description; ///< human-readable description + std::string transition; ///< occurred | confirmed | cleared | ack + uint8_t severity{0}; ///< severity at the time of the transition + std::string status; ///< resulting fault status (e.g. CONFIRMED) + std::string source_id; ///< reporting source that drove the transition + std::string description; ///< human-readable description int64_t occurred_at_ns{0}; ///< wall-clock timestamp of the transition }; @@ -55,16 +55,16 @@ struct AuditRecord { /// Persisted head of the hash chain. struct ChainHead { - int64_t seq{0}; ///< 0 when the chain is empty - std::string record_hash; ///< genesis hash when the chain is empty + int64_t seq{0}; ///< 0 when the chain is empty + std::string record_hash; ///< genesis hash when the chain is empty }; /// Result of verifying the persisted chain. struct AuditVerifyResult { bool ok{true}; - int64_t checked{0}; ///< number of records walked - int64_t bad_seq{0}; ///< seq of the first offending record (0 if ok) - std::string error; ///< human-readable reason when !ok + int64_t checked{0}; ///< number of records walked + int64_t bad_seq{0}; ///< seq of the first offending record (0 if ok) + std::string error; ///< human-readable reason when !ok }; /// Append-only, hash-chained audit log of fault state transitions. diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 8799eddb..8f146704 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -93,7 +93,7 @@ void exec_or_throw(sqlite3 * db, const char * sql, const char * what) { } // namespace FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_records) - : db_path_(db_path), retention_max_records_(retention_max_records < 0 ? 0 : retention_max_records) { + : db_path_(db_path), retention_max_records_(retention_max_records < 0 ? 0 : retention_max_records) { int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_FULLMUTEX; if (sqlite3_open_v2(db_path.c_str(), &db_, flags, nullptr) != SQLITE_OK) { std::string error = db_ ? sqlite3_errmsg(db_) : "unknown error"; diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 5692693d..177d9133 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -479,9 +479,8 @@ void FaultManagerNode::handle_report_fault( // Report the fault event (use wall clock time, not sim time, for proper timestamps) const rclcpp::Time event_time = get_wall_clock_time(); - bool is_new = - storage_->report_fault_event(request->fault_code, request->event_type, request->severity, request->description, - request->source_id, event_time, resolved_config); + bool is_new = storage_->report_fault_event(request->fault_code, request->event_type, request->severity, + request->description, request->source_id, event_time, resolved_config); response->accepted = true; diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index 21fd3448..b0ac54b9 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -116,8 +116,7 @@ TEST_F(FaultAuditLogTest, VerifyUntamperedChain) { // Known SHA-256 vector proves the EVP wiring (sha256("") == e3b0c442...). TEST_F(FaultAuditLogTest, Sha256KnownVector) { - EXPECT_EQ(FaultAuditLog::sha256_hex(""), - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); + EXPECT_EQ(FaultAuditLog::sha256_hex(""), "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); } // Editing a past row makes verify fail. diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 623dfdf9..f53219cc 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -1373,8 +1373,8 @@ class FaultAuditIntegrationTest : public ::testing::Test { std::random_device rd; std::mt19937_64 gen(rd()); std::uniform_int_distribution dist; - audit_path_ = (std::filesystem::temp_directory_path() / ("test_node_audit_" + std::to_string(dist(gen)) + ".db")) - .string(); + audit_path_ = + (std::filesystem::temp_directory_path() / ("test_node_audit_" + std::to_string(dist(gen)) + ".db")).string(); ns_ = "/test_audit_" + std::to_string(dist(gen)); rclcpp::NodeOptions fm_options; @@ -1471,8 +1471,7 @@ TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { rclcpp::NodeOptions options; options.parameter_overrides({ - {"storage_type", "memory"}, - {"audit_log.database_path", audit_path}, // default enabled=false + {"storage_type", "memory"}, {"audit_log.database_path", audit_path}, // default enabled=false }); auto node = std::make_shared(options); From 802413e3756be20e7d23f21048d0b7dd6325ab25 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 19:10:24 +0200 Subject: [PATCH 3/5] fix(fault-audit): close truncation bypass and audit timer confirmations verify() now reads audit_chain_head directly and fails when the head row is missing on a non-empty log, so deleting the newest row plus the head is caught. Timer-driven PREFAILED->CONFIRMED confirmations now call audit_transition. Adds append-only triggers (hardening, not a security boundary) and corrects the unkeyed/single-file threat model in the docs. Refs #483 --- src/ros2_medkit_fault_manager/CHANGELOG.rst | 2 +- src/ros2_medkit_fault_manager/README.md | 8 +- .../config/fault_manager.yaml | 8 +- .../fault_audit_log.hpp | 18 ++- .../fault_manager_node.hpp | 10 ++ .../fault_storage.hpp | 6 +- .../sqlite_fault_storage.hpp | 2 +- .../src/fault_audit_log.cpp | 114 ++++++++++++------ .../src/fault_manager_node.cpp | 16 ++- .../src/fault_storage.cpp | 10 +- .../src/sqlite_fault_storage.cpp | 24 +++- .../test/test_fault_audit_log.cpp | 79 +++++++++++- .../test/test_fault_manager.cpp | 63 +++++++++- .../test/test_sqlite_storage.cpp | 9 +- 14 files changed, 293 insertions(+), 76 deletions(-) diff --git a/src/ros2_medkit_fault_manager/CHANGELOG.rst b/src/ros2_medkit_fault_manager/CHANGELOG.rst index 902b8447..1e884c91 100644 --- a/src/ros2_medkit_fault_manager/CHANGELOG.rst +++ b/src/ros2_medkit_fault_manager/CHANGELOG.rst @@ -4,7 +4,7 @@ Changelog for package ros2_medkit_fault_manager Forthcoming ----------- -* Optional tamper-evident, append-only audit log of fault state transitions: each transition appends one immutable, hash-chained row (``record_hash = sha256(prev_hash + canonical(event))`` via OpenSSL EVP SHA-256) with a persisted chain head, a ``verify`` routine, a read API, and retention that seals a segment anchor before pruning. Off by default (`#483 `_) +* Optional append-only, hash-chained audit log of fault state transitions: each transition appends one immutable row (``record_hash = sha256(prev_hash + canonical(event))`` via OpenSSL EVP SHA-256) with a persisted chain head, a ``verify`` routine, a read API, and retention that seals a segment anchor before pruning. Time-based (PREFAILED->CONFIRMED) auto-confirmations are also audited. ``verify`` reads the chain head directly from the database, so deleting the newest row together with the head row is reported as tampering instead of silently recovering. ``BEFORE UPDATE`` / ``BEFORE DELETE`` triggers reject out-of-band edits as defense-in-depth. The chain is unkeyed and stored in a single writable file, so ``verify`` detects edits/deletions that did not recompute the chain (casual or accidental tampering); it is not a defence against an attacker who can rewrite the whole file. Off by default (`#483 `_) 0.6.0 (2026-06-22) ------------------ diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index cccaaa6a..4ca1cd98 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -112,11 +112,13 @@ patterns: ## Advanced: Tamper-Evident Audit Log -An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is verifiable and any later edit or deletion is detectable. It is **off by default** because it adds a write and storage cost per transition. +An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is independently verifiable. It is **off by default** because it adds a write and storage cost per transition. -Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only - the manager only ever inserts rows. +Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only: the manager only ever inserts rows, and `BEFORE UPDATE` / `BEFORE DELETE` triggers reject out-of-band edits (the guarded rotation prune excepted). -`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row is caught by the persisted-head check. +`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row (the head row is read straight from the DB) is caught by the persisted-head check. + +**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. It does **not** stop an attacker with write access to the file: such an attacker can drop the triggers and recompute the entire chain (and head and anchors) to forge a self-consistent history. The append-only triggers are defense-in-depth, **not** a security boundary. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. **Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. diff --git a/src/ros2_medkit_fault_manager/config/fault_manager.yaml b/src/ros2_medkit_fault_manager/config/fault_manager.yaml index 3f94e71c..13d1f57a 100644 --- a/src/ros2_medkit_fault_manager/config/fault_manager.yaml +++ b/src/ros2_medkit_fault_manager/config/fault_manager.yaml @@ -28,10 +28,14 @@ fault_manager: # snapshots.capture_queue_depth: 16 # max pending captures (>= 1) # snapshots.capture_queue_full_policy: reject_newest # reject_newest | drop_oldest - # Tamper-evident, append-only audit log of fault state transitions + # Append-only, hash-chained audit log of fault state transitions # (occurred/confirmed/cleared). OFF by default: it adds a write + storage cost # per transition. When enabled, each transition appends one immutable, - # hash-chained row so any later edit or deletion is detectable via verify. + # hash-chained row, so verify() detects edits or deletions that did NOT also + # recompute the chain (casual/accidental tampering). The chain is unkeyed and + # lives in a single writable file, so it is NOT proof against an attacker who + # can rewrite the whole file; true tamper-proofing needs a signed head or + # external anchoring (out of scope here). See README "Threat model". audit_log.enabled: false # Which transitions to record: "all" or "confirmed_only". # audit_log.transitions: all diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp index 039a12e5..457bca9e 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -27,8 +28,8 @@ namespace ros2_medkit_fault_manager { /// /// `transition` is one of the kTransition* constants below. The remaining /// fields describe the fault at the moment of the transition; all of them feed -/// the canonical serialization that the hash chain is computed over, so any -/// later edit to a stored row is detectable. +/// the canonical serialization that the hash chain is computed over, so an edit +/// to a stored row that does not also recompute the chain breaks verify(). struct AuditEvent { std::string fault_code; std::string transition; ///< occurred | confirmed | cleared | ack @@ -78,6 +79,15 @@ struct AuditVerifyResult { /// /// The table is treated as append-only: this class only ever INSERTs rows (and, /// on rotation, deletes a sealed prefix). It never UPDATEs an existing record. +/// BEFORE UPDATE / BEFORE DELETE triggers reject out-of-band edits (the guarded +/// rotation prune excepted) as defense-in-depth. +/// +/// Threat model: the hash chain is UNKEYED and the head/anchors live in the same +/// writable file. verify() catches edits or deletions that did not also recompute +/// the chain (casual or accidental tampering), but anyone with write access to the +/// file can recompute the whole chain (and drop the triggers) and forge a +/// consistent history. True tamper-proofing needs a key/signature over the head or +/// external anchoring; that is out of scope here. class FaultAuditLog { public: /// Open (or create) the audit log database. @@ -129,6 +139,10 @@ class FaultAuditLog { private: void initialize_schema(); + /// Read the persisted chain head row (audit_chain_head id=1) straight from the + /// DB. Returns nullopt when the row is absent. verify() relies on this so a + /// deleted head row is treated as tampering rather than silently recovered. + std::optional read_head_row_locked() const; ChainHead load_head_locked() const; void store_head_locked(const ChainHead & head_record); /// Seal + prune the oldest segment if the retained count exceeds the limit. diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp index 7c603232..61cf227c 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp @@ -62,6 +62,16 @@ class FaultManagerNode : public rclcpp::Node { return *storage_; } + /// Get mutable access to fault storage (for testing only). + FaultStorage & get_storage_for_test() { + return *storage_; + } + + /// Get the tamper-evident audit log (nullptr when disabled), for testing only. + const FaultAuditLog * get_audit_log_for_test() const { + return audit_log_.get(); + } + /// Get the storage type being used const std::string & get_storage_type() const { return storage_type_; diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp index 8c66733f..935d573a 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp @@ -143,8 +143,8 @@ class FaultStorage { /// Check and confirm PREFAILED faults that have been pending too long (time-based confirmation) /// @param current_time Current timestamp for age calculation - /// @return Number of faults that were confirmed - virtual size_t check_time_based_confirmation(const rclcpp::Time & current_time) = 0; + /// @return Fault codes that were confirmed by this call (so the caller can audit each). + virtual std::vector check_time_based_confirmation(const rclcpp::Time & current_time) = 0; /// Set maximum snapshots per fault code (0 = unlimited) virtual void set_max_snapshots_per_fault(size_t /*max_count*/) { @@ -223,7 +223,7 @@ class InMemoryFaultStorage : public FaultStorage { bool contains(const std::string & fault_code) const override; - size_t check_time_based_confirmation(const rclcpp::Time & current_time) override; + std::vector check_time_based_confirmation(const rclcpp::Time & current_time) override; void set_max_snapshots_per_fault(size_t max_count) override; diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp index 0de89805..29a54bee 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp @@ -59,7 +59,7 @@ class SqliteFaultStorage : public FaultStorage { bool contains(const std::string & fault_code) const override; - size_t check_time_based_confirmation(const rclcpp::Time & current_time) override; + std::vector check_time_based_confirmation(const rclcpp::Time & current_time) override; void set_max_snapshots_per_fault(size_t max_count) override; diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 8f146704..88710a53 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -215,32 +216,54 @@ void FaultAuditLog::initialize_schema() { ); )", "create audit_anchors table"); + + // Append-only enforcement at the DB layer (defense-in-depth, NOT a security + // boundary: anyone able to write the file can also DROP these triggers). The + // single-row guard lets the in-process rotation prune delete a sealed prefix + // while every other UPDATE/DELETE on audit_log is rejected. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_prune_guard ( + id INTEGER PRIMARY KEY CHECK (id = 1), + enabled INTEGER NOT NULL DEFAULT 0 + ); + INSERT INTO audit_prune_guard (id, enabled) VALUES (1, 0) + ON CONFLICT(id) DO NOTHING; + CREATE TRIGGER IF NOT EXISTS audit_log_no_update + BEFORE UPDATE ON audit_log + BEGIN + SELECT RAISE(ABORT, 'audit_log is append-only'); + END; + CREATE TRIGGER IF NOT EXISTS audit_log_no_delete + BEFORE DELETE ON audit_log + WHEN (SELECT enabled FROM audit_prune_guard WHERE id = 1) IS NOT 1 + BEGIN + SELECT RAISE(ABORT, 'audit_log is append-only'); + END; + )", + "create append-only triggers"); } -ChainHead FaultAuditLog::load_head_locked() const { - // Prefer the persisted head row. - { - Stmt stmt(db_, "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); - if (stmt.step() == SQLITE_ROW) { - ChainHead head_record; - head_record.seq = stmt.column_int64(0); - head_record.record_hash = stmt.column_text(1); - return head_record; - } +std::optional FaultAuditLog::read_head_row_locked() const { + Stmt stmt(db_, "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); + if (stmt.step() == SQLITE_ROW) { + ChainHead head_record; + head_record.seq = stmt.column_int64(0); + head_record.record_hash = stmt.column_text(1); + return head_record; } + return std::nullopt; +} - // No head row. Recover from the last retained record if any exist (defensive: - // a crash between INSERT and head update would land here on reopen). - { - Stmt stmt(db_, "SELECT seq, record_hash FROM audit_log ORDER BY seq DESC LIMIT 1"); - if (stmt.step() == SQLITE_ROW) { - ChainHead head_record; - head_record.seq = stmt.column_int64(0); - head_record.record_hash = stmt.column_text(1); - return head_record; - } +ChainHead FaultAuditLog::load_head_locked() const { + // Resume strictly from the persisted head row. There is deliberately no + // MAX(seq) fallback: append+head-update are written in one transaction, so a + // missing head row while rows exist means tampering, not a recoverable crash. + // verify() reports that case as a failure rather than silently fabricating a + // head from the surviving rows. + if (auto head_record = read_head_row_locked()) { + return *head_record; } - return ChainHead{0, genesis_hash()}; } @@ -346,11 +369,18 @@ void FaultAuditLog::rotate_if_needed_locked() { throw std::runtime_error(std::string("audit: failed to write anchor: ") + sqlite3_errmsg(db_)); } + // Open the prune guard so the BEFORE DELETE trigger allows this sealed-prefix + // delete. Both the guard flip and the delete are in this transaction, so a + // ROLLBACK restores the guard to its closed state. + exec_or_throw(db_, "UPDATE audit_prune_guard SET enabled = 1 WHERE id = 1", "open prune guard"); + Stmt del(db_, "DELETE FROM audit_log WHERE seq <= ?"); del.bind_int64(1, boundary_seq); if (del.step() != SQLITE_DONE) { throw std::runtime_error(std::string("audit: failed to prune records: ") + sqlite3_errmsg(db_)); } + + exec_or_throw(db_, "UPDATE audit_prune_guard SET enabled = 0 WHERE id = 1", "close prune guard"); exec_or_throw(db_, "COMMIT", "commit rotate"); } catch (...) { exec_or_throw(db_, "ROLLBACK", "rollback rotate"); @@ -492,27 +522,37 @@ AuditVerifyResult FaultAuditLog::verify() const { expected_seq = rec.seq + 1; } - // The persisted head must match the last retained record (catches deletion of - // the newest row, which the row walk alone cannot see). + // Read the persisted head row DIRECTLY from the DB (not the cached head_). A + // missing head row cannot be silently recovered from MAX(seq), so deleting the + // newest record together with the head row is reported as tampering instead of + // verifying clean. + const std::optional persisted = read_head_row_locked(); + if (result.checked > 0) { - if (head_.seq != expected_seq - 1 || head_.record_hash != expected_prev) { + // A non-empty log must carry its head row, and it must match the last record. + if (!persisted) { result.ok = false; - result.bad_seq = head_.seq; + result.bad_seq = expected_seq - 1; + result.error = "audit_chain_head row missing while audit_log is non-empty (head deleted / truncated)"; + return result; + } + if (persisted->seq != expected_seq - 1 || persisted->record_hash != expected_prev) { + result.ok = false; + result.bad_seq = persisted->seq; result.error = "persisted head does not match the last retained record"; return result; } - } else { - // Empty retained log: head must be either genesis (never written) or point - // at a sealed anchor (everything pruned). - if (head_.seq != 0) { - Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); - anchor.bind_int64(1, head_.seq); - if (anchor.step() != SQLITE_ROW || anchor.column_text(0) != head_.record_hash) { - result.ok = false; - result.bad_seq = head_.seq; - result.error = "head references a record that is neither retained nor sealed"; - return result; - } + } else if (persisted && persisted->seq != 0) { + // Empty retained log with a head past genesis: everything was pruned, so the + // head must point at a sealed anchor. (No head row, or a genesis head, on an + // empty log is the never-written case and is fine.) + Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + anchor.bind_int64(1, persisted->seq); + if (anchor.step() != SQLITE_ROW || anchor.column_text(0) != persisted->record_hash) { + result.ok = false; + result.bad_seq = persisted->seq; + result.error = "head references a record that is neither retained nor sealed"; + return result; } } diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 177d9133..912593a7 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -289,10 +289,20 @@ FaultManagerNode::FaultManagerNode(const rclcpp::NodeOptions & options) : Node(" // Create auto-confirmation timer if enabled if (auto_confirm_after_sec_ > 0.0) { auto_confirm_timer_ = create_wall_timer(std::chrono::seconds(1), [this]() { - size_t confirmed = storage_->check_time_based_confirmation(get_wall_clock_time()); - if (confirmed > 0) { - RCLCPP_INFO(get_logger(), "Auto-confirmed %zu PREFAILED fault(s) due to time threshold", confirmed); + const auto confirmed = storage_->check_time_based_confirmation(get_wall_clock_time()); + if (confirmed.empty()) { + return; } + // Audit every timer-driven PREFAILED->CONFIRMED transition. Without this the + // confirmations are invisible to the audit log's verify(). + const int64_t confirmed_at_ns = get_wall_clock_time().nanoseconds(); + for (const auto & fault_code : confirmed) { + auto fault = storage_->get_fault(fault_code); + if (fault) { + audit_transition(kTransitionConfirmed, *fault, "auto_confirm_timer", confirmed_at_ns); + } + } + RCLCPP_INFO(get_logger(), "Auto-confirmed %zu PREFAILED fault(s) due to time threshold", confirmed.size()); }); RCLCPP_INFO(get_logger(), "FaultManager node started (storage=%s, confirmation_threshold=%d, " diff --git a/src/ros2_medkit_fault_manager/src/fault_storage.cpp b/src/ros2_medkit_fault_manager/src/fault_storage.cpp index c8db7e51..f83fc3bc 100644 --- a/src/ros2_medkit_fault_manager/src/fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_storage.cpp @@ -267,14 +267,14 @@ bool InMemoryFaultStorage::contains(const std::string & fault_code) const { return faults_.find(fault_code) != faults_.end(); } -size_t InMemoryFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { +std::vector InMemoryFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { std::lock_guard lock(mutex_); + std::vector confirmed; if (config_.auto_confirm_after_sec <= 0.0) { - return 0; // Time-based confirmation disabled + return confirmed; // Time-based confirmation disabled } - size_t confirmed_count = 0; const double threshold_ns = config_.auto_confirm_after_sec * 1e9; for (auto & [code, state] : faults_) { @@ -282,12 +282,12 @@ size_t InMemoryFaultStorage::check_time_based_confirmation(const rclcpp::Time & const int64_t age_ns = (current_time - state.last_failed_time).nanoseconds(); if (static_cast(age_ns) >= threshold_ns) { state.status = ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED; - ++confirmed_count; + confirmed.push_back(code); } } } - return confirmed_count; + return confirmed; } void InMemoryFaultStorage::set_max_snapshots_per_fault(size_t max_count) { diff --git a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp index 5295f028..92de8e1f 100644 --- a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp @@ -666,17 +666,35 @@ bool SqliteFaultStorage::contains(const std::string & fault_code) const { return stmt.step() == SQLITE_ROW; } -size_t SqliteFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { +std::vector SqliteFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { std::lock_guard lock(mutex_); + std::vector confirmed; if (config_.auto_confirm_after_sec <= 0.0) { - return 0; // Time-based confirmation disabled + return confirmed; // Time-based confirmation disabled } int64_t current_ns = current_time.nanoseconds(); int64_t threshold_ns = static_cast(config_.auto_confirm_after_sec * 1e9); int64_t cutoff_ns = current_ns - threshold_ns; + // Collect the codes that will flip first so the caller can audit each one. The + // SELECT predicate mirrors the UPDATE exactly, and both run under the same lock, + // so the returned list matches the rows actually confirmed below. + { + SqliteStatement select_stmt( + db_, "SELECT fault_code FROM faults WHERE status = ? AND last_failed_ns <= ? AND last_failed_ns > 0"); + select_stmt.bind_text(1, ros2_medkit_msgs::msg::Fault::STATUS_PREFAILED); + select_stmt.bind_int64(2, cutoff_ns); + while (select_stmt.step() == SQLITE_ROW) { + confirmed.push_back(select_stmt.column_text(0)); + } + } + + if (confirmed.empty()) { + return confirmed; + } + SqliteStatement update_stmt( db_, "UPDATE faults SET status = ? WHERE status = ? AND last_failed_ns <= ? AND last_failed_ns > 0"); update_stmt.bind_text(1, ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED); @@ -687,7 +705,7 @@ size_t SqliteFaultStorage::check_time_based_confirmation(const rclcpp::Time & cu throw std::runtime_error(std::string("Failed to confirm faults: ") + sqlite3_errmsg(db_)); } - return static_cast(sqlite3_changes(db_)); + return confirmed; } void SqliteFaultStorage::set_max_snapshots_per_fault(size_t max_count) { diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index b0ac54b9..6a959b8e 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -42,8 +42,8 @@ AuditEvent make_event(const std::string & code, const char * transition, int64_t return e; } -/// Run a single SQL statement directly against the audit DB file (used to -/// simulate tampering an immutable row). +/// Run SQL directly against the audit DB file (used to simulate tampering an +/// immutable row). Asserts success. void raw_exec(const std::string & db_path, const std::string & sql) { sqlite3 * db = nullptr; ASSERT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); @@ -55,6 +55,22 @@ void raw_exec(const std::string & db_path, const std::string & sql) { ASSERT_EQ(rc, SQLITE_OK) << err_str; } +/// Run SQL directly and return the raw result code (does not assert), so a test +/// can confirm the append-only triggers reject a write. +int raw_exec_rc(const std::string & db_path, const std::string & sql) { + sqlite3 * db = nullptr; + EXPECT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + int rc = sqlite3_exec(db, sql.c_str(), nullptr, nullptr, nullptr); + sqlite3_close(db); + return rc; +} + +/// Drop the DB-level append-only triggers so a raw tamper write can land. This +/// mimics an attacker who bypassed the (defense-in-depth, not security boundary) +/// triggers; verify() must still detect the recompute-free edit afterwards. +const char * const kDropTriggers = + "DROP TRIGGER IF EXISTS audit_log_no_update; DROP TRIGGER IF EXISTS audit_log_no_delete; "; + } // namespace class FaultAuditLogTest : public ::testing::Test { @@ -130,7 +146,7 @@ TEST_F(FaultAuditLogTest, EditingPastRowFailsVerify) { } // Tamper: change a stored field without recomputing the hash. - raw_exec(path_, "UPDATE audit_log SET description = 'forged' WHERE seq = 2"); + raw_exec(path_, std::string(kDropTriggers) + "UPDATE audit_log SET description = 'forged' WHERE seq = 2"); FaultAuditLog reopened(path_); auto result = reopened.verify(); @@ -147,7 +163,7 @@ TEST_F(FaultAuditLogTest, DeletingMiddleRowFailsVerify) { log.append(make_event("F3", kTransitionOccurred, 300)); } - raw_exec(path_, "DELETE FROM audit_log WHERE seq = 2"); + raw_exec(path_, std::string(kDropTriggers) + "DELETE FROM audit_log WHERE seq = 2"); FaultAuditLog reopened(path_); EXPECT_FALSE(reopened.verify().ok); @@ -163,12 +179,34 @@ TEST_F(FaultAuditLogTest, DeletingNewestRowFailsVerify) { } // Drop the last row but leave the head pointing at seq 3. - raw_exec(path_, "DELETE FROM audit_log WHERE seq = 3"); + raw_exec(path_, std::string(kDropTriggers) + "DELETE FROM audit_log WHERE seq = 3"); FaultAuditLog reopened(path_); EXPECT_FALSE(reopened.verify().ok); } +// Truncation bypass: deleting the newest row AND the head row must still FAIL. +// The head row is read directly from the DB, so a missing head on a non-empty +// log is treated as tampering rather than silently rebuilt from MAX(seq). +TEST_F(FaultAuditLogTest, DeletingNewestRowAndHeadFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + ASSERT_TRUE(log.verify().ok); + } + + // Drop the newest row and the persisted head row together. + raw_exec(path_, std::string(kDropTriggers) + + "DELETE FROM audit_log WHERE seq = 3; DELETE FROM audit_chain_head WHERE id = 1;"); + + FaultAuditLog reopened(path_); + auto result = reopened.verify(); + EXPECT_FALSE(result.ok); + EXPECT_EQ(result.bad_seq, 2); // last surviving row +} + // The chain head persists across a reopen and the chain resumes from it. TEST_F(FaultAuditLogTest, HeadPersistsAcrossReopen) { std::string head_hash; @@ -221,7 +259,7 @@ TEST_F(FaultAuditLogTest, RotationThenTamperFails) { log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); } } - raw_exec(path_, "UPDATE audit_log SET source_id = 'forged' WHERE seq = 10"); + raw_exec(path_, std::string(kDropTriggers) + "UPDATE audit_log SET source_id = 'forged' WHERE seq = 10"); FaultAuditLog reopened(path_, 5); auto result = reopened.verify(); @@ -263,6 +301,35 @@ TEST_F(FaultAuditLogTest, ReadAfterSeq) { EXPECT_EQ(tail[1].seq, 5); } +// Defense-in-depth: the append-only triggers reject an out-of-band UPDATE or +// DELETE on audit_log while the in-process rotation prune still works. +TEST_F(FaultAuditLogTest, AppendOnlyTriggersRejectOutOfBandWrites) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + } + + // Both a raw UPDATE and a raw DELETE are aborted by the triggers. + EXPECT_NE(raw_exec_rc(path_, "UPDATE audit_log SET description = 'forged' WHERE seq = 1"), SQLITE_OK); + EXPECT_NE(raw_exec_rc(path_, "DELETE FROM audit_log WHERE seq = 1"), SQLITE_OK); + + // Rows are intact and the chain still verifies. + FaultAuditLog reopened(path_); + EXPECT_EQ(reopened.record_count(), 2); + EXPECT_TRUE(reopened.verify().ok); +} + +// The guarded rotation prune is exempt from the append-only delete trigger. +TEST_F(FaultAuditLogTest, RotationPrunePassesAppendOnlyTrigger) { + FaultAuditLog log(path_, /*retention_max_records=*/3); + for (int i = 1; i <= 8; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + EXPECT_EQ(log.record_count(), 3); + EXPECT_TRUE(log.verify().ok); +} + int main(int argc, char ** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index f53219cc..28411122 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -526,9 +526,9 @@ TEST_F(FaultStorageTest, TimeBasedConfirmationDisabledByDefault) { // Advance time and check - should not auto-confirm (auto_confirm_after_sec = 0) auto future_time = rclcpp::Time(clock.now().nanoseconds() + static_cast(20e9)); - size_t confirmed = storage_.check_time_based_confirmation(future_time); + auto confirmed = storage_.check_time_based_confirmation(future_time); - EXPECT_EQ(confirmed, 0u); + EXPECT_TRUE(confirmed.empty()); auto fault = storage_.get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); @@ -548,13 +548,14 @@ TEST_F(FaultStorageTest, TimeBasedConfirmationWhenEnabled) { // Check before timeout - should not confirm auto before_timeout = rclcpp::Time(now.nanoseconds() + static_cast(5e9)); - size_t confirmed_early = storage_.check_time_based_confirmation(before_timeout); - EXPECT_EQ(confirmed_early, 0u); + auto confirmed_early = storage_.check_time_based_confirmation(before_timeout); + EXPECT_TRUE(confirmed_early.empty()); // Check after timeout - should confirm auto after_timeout = rclcpp::Time(now.nanoseconds() + static_cast(15e9)); - size_t confirmed = storage_.check_time_based_confirmation(after_timeout); - EXPECT_EQ(confirmed, 1u); + auto confirmed = storage_.check_time_based_confirmation(after_timeout); + ASSERT_EQ(confirmed.size(), 1u); + EXPECT_EQ(confirmed[0], "FAULT_1"); auto fault = storage_.get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); @@ -1479,6 +1480,56 @@ TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { EXPECT_FALSE(std::filesystem::exists(audit_path)); } +// Timer-driven (PREFAILED->CONFIRMED) auto-confirmations must be audited, not +// silently applied. Sets auto_confirm_after_sec and asserts a "confirmed" audit +// row appears after the 1 Hz timer fires. +TEST(FaultAuditTimerTest, TimerConfirmationAppendsConfirmedAuditRow) { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"confirmation_threshold", -3}, // keep the fault PREFAILED so only the timer confirms it + {"auto_confirm_after_sec", 0.2}, + {"audit_log.enabled", true}, // in-memory audit DB (memory storage) + }); + auto node = std::make_shared(options); + + const auto * audit = node->get_audit_log_for_test(); + ASSERT_NE(audit, nullptr); + + // Land a fault in PREFAILED directly in storage; the node's auto-confirm timer + // must later flip it to CONFIRMED and append the audit row. + DebounceConfig config; + config.confirmation_threshold = -3; + config.auto_confirm_after_sec = 0.2; + rclcpp::Clock clock(RCL_SYSTEM_TIME); + node->get_storage_for_test().report_fault_event("AUTO_CONF_1", ReportFault::Request::EVENT_FAILED, + Fault::SEVERITY_ERROR, "stuck", "/robot/src", clock.now(), config); + ASSERT_EQ(node->get_storage().get_fault("AUTO_CONF_1")->status, Fault::STATUS_PREFAILED); + + // Spin until a confirmed audit row appears or the budget expires (the wall + // timer fires once per second). + bool saw_confirmed = false; + auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - start < std::chrono::seconds(5)) { + rclcpp::spin_some(node); + for (const auto & rec : audit->read()) { + if (rec.event.fault_code == "AUTO_CONF_1" && + rec.event.transition == ros2_medkit_fault_manager::kTransitionConfirmed) { + saw_confirmed = true; + break; + } + } + if (saw_confirmed) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + + EXPECT_TRUE(saw_confirmed) << "timer-driven confirmation was not audited"; + EXPECT_EQ(node->get_storage().get_fault("AUTO_CONF_1")->status, Fault::STATUS_CONFIRMED); + EXPECT_TRUE(audit->verify().ok); +} + int main(int argc, char ** argv) { rclcpp::init(argc, argv); ::testing::InitGoogleTest(&argc, argv); diff --git a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp index 1f414dff..ddb2f033 100644 --- a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp +++ b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp @@ -568,13 +568,14 @@ TEST_F(SqliteFaultStorageTest, TimeBasedConfirmationWhenEnabled) { // Check before timeout - should not confirm auto before_timeout = rclcpp::Time(now.nanoseconds() + static_cast(5e9)); - size_t confirmed_early = storage_->check_time_based_confirmation(before_timeout); - EXPECT_EQ(confirmed_early, 0u); + auto confirmed_early = storage_->check_time_based_confirmation(before_timeout); + EXPECT_TRUE(confirmed_early.empty()); // Check after timeout - should confirm auto after_timeout = rclcpp::Time(now.nanoseconds() + static_cast(15e9)); - size_t confirmed = storage_->check_time_based_confirmation(after_timeout); - EXPECT_EQ(confirmed, 1u); + auto confirmed = storage_->check_time_based_confirmation(after_timeout); + ASSERT_EQ(confirmed.size(), 1u); + EXPECT_EQ(confirmed[0], "FAULT_1"); auto fault = storage_->get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); From 835b8621a499c0647a52c6ae21f43fd06b7fd595 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 19:43:52 +0200 Subject: [PATCH 4/5] fix(fault-audit): guard null DB handle and use PRId64 for int64 logs raw_exec_rc no longer runs sqlite3_exec on a failed sqlite3_open; it records the failure, closes the handle and returns early. The audit-log enabled RCLCPP_INFO now formats int64_t retention/seq with PRId64 () to satisfy -Werror=format=2. Refs #483 --- src/ros2_medkit_fault_manager/src/fault_manager_node.cpp | 4 +++- src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 912593a7..5103884d 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -410,7 +411,8 @@ std::unique_ptr FaultManagerNode::create_audit_log() { try { auto log = std::make_unique(audit_path, retention); - RCLCPP_INFO(get_logger(), "Fault audit log enabled: %s (transitions=%s, retention=%ld, resume_seq=%ld)", + RCLCPP_INFO(get_logger(), + "Fault audit log enabled: %s (transitions=%s, retention=%" PRId64 ", resume_seq=%" PRId64 ")", audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, log->head().seq); return log; } catch (const std::exception & e) { diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index 6a959b8e..581ba006 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -59,7 +59,11 @@ void raw_exec(const std::string & db_path, const std::string & sql) { /// can confirm the append-only triggers reject a write. int raw_exec_rc(const std::string & db_path, const std::string & sql) { sqlite3 * db = nullptr; - EXPECT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + if (sqlite3_open(db_path.c_str(), &db) != SQLITE_OK) { + ADD_FAILURE() << "sqlite3_open failed: " << (db ? sqlite3_errmsg(db) : "out of memory"); + sqlite3_close(db); // sqlite3 allows close on a failed-open handle (incl. nullptr). + return SQLITE_ERROR; + } int rc = sqlite3_exec(db, sql.c_str(), nullptr, nullptr, nullptr); sqlite3_close(db); return rc; From 887280053e23befed35ba7bd8f6a409f80464ce7 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 22:33:57 +0200 Subject: [PATCH 5/5] fix(fault-audit): audit auto-heal and stop silent append-failure gaps Record auto-recovery as a distinct "healed" row (source auto_heal) so a fault's end is in the chain. Append failures now bump a dropped-writes health counter and clear an audit-healthy flag; audit_log.fail_closed makes them a hard error. Drop the never-written "ack" kind (clear == ack), protect audit_prune_guard with a trigger, add logging activate/deactivate markers, and tighten the threat-model README. Refs #483 --- src/ros2_medkit_fault_manager/README.md | 9 +- .../config/fault_manager.yaml | 19 +- .../fault_audit_log.hpp | 16 +- .../fault_manager_node.hpp | 27 ++- .../src/fault_audit_log.cpp | 36 ++++ .../src/fault_manager_node.cpp | 72 +++++++- .../test/test_fault_audit_log.cpp | 39 +++++ .../test/test_fault_manager.cpp | 165 +++++++++++++++++- 8 files changed, 358 insertions(+), 25 deletions(-) diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index 4ca1cd98..30d7fa99 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -112,22 +112,25 @@ patterns: ## Advanced: Tamper-Evident Audit Log -An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is independently verifiable. It is **off by default** because it adds a write and storage cost per transition. +An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `healed`, `cleared`) so the fault history is independently verifiable. Auto-recovery (a fault reaching the healing threshold via PASSED events) is recorded as a distinct `healed` row with source `auto_heal`, so the fault's END is in the timeline and is not confused with a manual `cleared`. The manager has no acknowledge action separate from clearing, so `~/clear_fault` is recorded as `cleared` (clear == ack); there is no `ack` kind. The log also records its own lifecycle with `logging_activated` / `logging_deactivated` markers (CIR (EU) 2024/2690 sec. 3.2) at start and stop. It is **off by default** because it adds a write and storage cost per transition. Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only: the manager only ever inserts rows, and `BEFORE UPDATE` / `BEFORE DELETE` triggers reject out-of-band edits (the guarded rotation prune excepted). +**Completeness is an integrity property.** `verify()` proves nothing was *deleted* from the chain, but it cannot prove a transition that was *never appended*. So a silently dropped append is a hole `verify()` can never see. Every transition on the write path is therefore audited (occurred, timer/threshold confirmations, auto-heal, and clears), and an append failure is never swallowed silently: it increments a dropped-writes health counter and clears an "audit healthy" flag. With `audit_log.fail_closed` set, an append failure is a **hard error** that aborts the operation, so a compliance-strict deployment learns the audit broke instead of losing records unnoticed. The default (`fail_closed=false`) keeps fault processing running but still surfaces the gap via the health counter. + `verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row (the head row is read straight from the DB) is caught by the persisted-head check. -**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. It does **not** stop an attacker with write access to the file: such an attacker can drop the triggers and recompute the entire chain (and head and anchors) to forge a self-consistent history. The append-only triggers are defense-in-depth, **not** a security boundary. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. +**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. The append-only triggers are defense-in-depth: `audit_log` rejects out-of-band UPDATE/DELETE, and the rotation-prune guard (`audit_prune_guard`) is itself protected by a trigger so an external writer cannot simply flip it open and then delete a prefix - that flip is only permitted from the in-process connection that holds a per-connection temp marker. The single-row chain head (`audit_chain_head`) is intentionally **not** trigger-protected (a trigger there would block the legitimate head update inside the append transaction); a casual edit or delete of the head is instead caught by `verify()` via the seq/hash/head-mismatch checks. None of this stops an attacker with write access to the file: such an attacker can create the same temp marker or drop the triggers, and recompute the entire chain (head and anchors included) to forge a self-consistent history. The triggers are **not** a security boundary - this is tamper-**evident**, not tamper-**proof**. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. **Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `audit_log.enabled` | bool | `false` | Enable the tamper-evident audit log | -| `audit_log.transitions` | string | `"all"` | Which transitions to record: `"all"` or `"confirmed_only"` | +| `audit_log.transitions` | string | `"all"` | Which transitions to record: `"all"` (occurred/confirmed/healed/cleared) or `"confirmed_only"`. Lifecycle markers are always recorded. | | `audit_log.database_path` | string | `""` | SQLite path. Empty => sibling `fault_audit.db` next to the fault DB (or `:memory:` for in-memory fault stores) | | `audit_log.retention_max_records` | int | `0` | Seal + prune the oldest segment beyond this many retained records (0 = unlimited) | +| `audit_log.fail_closed` | bool | `false` | When `true`, an audit append failure is a hard error that aborts the operation (compliance-strict). When `false`, the failure is logged and counted but fault processing continues. Either way the gap is visible via the dropped-writes health counter. | ## Usage diff --git a/src/ros2_medkit_fault_manager/config/fault_manager.yaml b/src/ros2_medkit_fault_manager/config/fault_manager.yaml index 13d1f57a..fde100d2 100644 --- a/src/ros2_medkit_fault_manager/config/fault_manager.yaml +++ b/src/ros2_medkit_fault_manager/config/fault_manager.yaml @@ -29,13 +29,13 @@ fault_manager: # snapshots.capture_queue_full_policy: reject_newest # reject_newest | drop_oldest # Append-only, hash-chained audit log of fault state transitions - # (occurred/confirmed/cleared). OFF by default: it adds a write + storage cost - # per transition. When enabled, each transition appends one immutable, - # hash-chained row, so verify() detects edits or deletions that did NOT also - # recompute the chain (casual/accidental tampering). The chain is unkeyed and - # lives in a single writable file, so it is NOT proof against an attacker who - # can rewrite the whole file; true tamper-proofing needs a signed head or - # external anchoring (out of scope here). See README "Threat model". + # (occurred/confirmed/healed/cleared). OFF by default: it adds a write + + # storage cost per transition. When enabled, each transition appends one + # immutable, hash-chained row, so verify() detects edits or deletions that did + # NOT also recompute the chain (casual/accidental tampering). The chain is + # unkeyed and lives in a single writable file, so it is NOT proof against an + # attacker who can rewrite the whole file; true tamper-proofing needs a signed + # head or external anchoring (out of scope here). See README "Threat model". audit_log.enabled: false # Which transitions to record: "all" or "confirmed_only". # audit_log.transitions: all @@ -45,3 +45,8 @@ fault_manager: # Seal + prune the oldest segment beyond this many retained records # (0 = unlimited). A sealed anchor keeps the surviving tail verifiable. # audit_log.retention_max_records: 0 + # Fail-closed (compliance-strict): when true, an audit append failure is a hard + # error that aborts the operation instead of being logged and dropped. Default + # false keeps fault processing running; either way a dropped-writes health + # counter makes the gap visible. + # audit_log.fail_closed: false diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp index 457bca9e..3a954159 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -32,7 +32,7 @@ namespace ros2_medkit_fault_manager { /// to a stored row that does not also recompute the chain breaks verify(). struct AuditEvent { std::string fault_code; - std::string transition; ///< occurred | confirmed | cleared | ack + std::string transition; ///< one of the kTransition* constants below uint8_t severity{0}; ///< severity at the time of the transition std::string status; ///< resulting fault status (e.g. CONFIRMED) std::string source_id; ///< reporting source that drove the transition @@ -44,7 +44,19 @@ struct AuditEvent { constexpr const char * kTransitionOccurred = "occurred"; constexpr const char * kTransitionConfirmed = "confirmed"; constexpr const char * kTransitionCleared = "cleared"; -constexpr const char * kTransitionAck = "ack"; +/// Auto-recovery: a fault reached the healing threshold via PASSED events. Kept +/// distinct from kTransitionCleared so an automatic recovery is not mistaken for +/// a manual clear in the timeline. +constexpr const char * kTransitionHealed = "healed"; +/// Audit-log lifecycle markers (CIR (EU) 2024/2690 sec. 3.2: activation / +/// deactivation of logging). Appended directly, independent of the per-fault +/// transition filter, so the log records its own start and stop. +constexpr const char * kTransitionLoggingActivated = "logging_activated"; +constexpr const char * kTransitionLoggingDeactivated = "logging_deactivated"; +// NOTE: there is deliberately no "ack" kind. The open fault_manager has no +// acknowledge action separate from clearing: ~/clear_fault IS the acknowledge, +// and it is recorded as kTransitionCleared (clear == ack). A separate "ack" kind +// would never be written, so defining it would only mislead readers of the log. /// One immutable, hash-chained row read back from the audit log. struct AuditRecord { diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp index 61cf227c..a2226537 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include #include @@ -72,6 +74,26 @@ class FaultManagerNode : public rclcpp::Node { return audit_log_.get(); } + /// Health signal: number of audit transitions that failed to append (and were + /// therefore lost from the chain). 0 when the audit is healthy or disabled. + uint64_t audit_dropped_writes() const { + return audit_dropped_writes_.load(std::memory_order_relaxed); + } + + /// Health signal: false once any audit append has failed. A compliance-strict + /// deployment should treat a false here (or a non-zero audit_dropped_writes()) + /// as the audit chain being incomplete. + bool audit_healthy() const { + return audit_healthy_.load(std::memory_order_relaxed); + } + + /// Test-only: drive one audit append through the same failure-handling path the + /// service handlers use. Throws when audit_log.fail_closed is set and the append + /// fails; otherwise observe audit_dropped_writes()/audit_healthy(). + void audit_transition_for_test(const char * transition, const ros2_medkit_msgs::msg::Fault & fault) { + audit_transition(transition, fault, "test", 0); + } + /// Get the storage type being used const std::string & get_storage_type() const { return storage_type_; @@ -187,7 +209,10 @@ class FaultManagerNode : public rclcpp::Node { /// Tamper-evident audit log of fault transitions (nullptr when disabled). std::unique_ptr audit_log_; - bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged + bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged + bool audit_fail_closed_{false}; ///< When true, an audit append failure aborts the operation + std::atomic audit_dropped_writes_{0}; ///< Count of audit appends that failed (lost rows) + std::atomic audit_healthy_{true}; ///< Cleared on the first failed audit append rclcpp::Service::SharedPtr report_fault_srv_; rclcpp::Service::SharedPtr list_faults_srv_; diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 88710a53..78b47c81 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -91,6 +91,15 @@ void exec_or_throw(sqlite3 * db, const char * sql, const char * what) { } } +/// SQL function registered ONLY on this in-process connection. The prune-guard +/// protection trigger calls it, so an out-of-band connection (which never +/// registered it) cannot flip the guard: preparing the UPDATE fails on the +/// unknown function. Connection-scoped, hence not a security boundary - a +/// write-capable adversary can register the same function or drop the trigger. +void audit_prune_authorized(sqlite3_context * ctx, int /*argc*/, sqlite3_value ** /*argv*/) { + sqlite3_result_int(ctx, 1); +} + } // namespace FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_records) @@ -108,6 +117,15 @@ FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_ exec_or_throw(db_, "PRAGMA journal_mode=WAL;", "enable WAL"); sqlite3_busy_timeout(db_, 5000); + // Authorize this connection for the prune-guard protection trigger (see below). + if (sqlite3_create_function_v2(db_, "audit_prune_authorized", 0, SQLITE_UTF8, nullptr, &audit_prune_authorized, + nullptr, nullptr, nullptr) != SQLITE_OK) { + std::string error = sqlite3_errmsg(db_); + sqlite3_close(db_); + db_ = nullptr; + throw std::runtime_error("audit: failed to register prune-authorization function: " + error); + } + initialize_schema(); head_ = load_head_locked(); } @@ -242,6 +260,24 @@ void FaultAuditLog::initialize_schema() { END; )", "create append-only triggers"); + + // Protect the prune guard itself. Without this, an external writer could simply + // `UPDATE audit_prune_guard SET enabled = 1` and then DELETE rows past the + // append-only delete trigger. The guard may only be flipped by a connection on + // which audit_prune_authorized() is registered (this in-process connection); an + // out-of-band UPDATE fails because that function is unknown to it. This is still + // defense-in-depth, not a security boundary: a write-capable adversary can + // register the same function or DROP this trigger. verify() remains the backstop. + exec_or_throw(db_, + R"( + CREATE TRIGGER IF NOT EXISTS audit_prune_guard_protect + BEFORE UPDATE ON audit_prune_guard + WHEN audit_prune_authorized() IS NOT 1 + BEGIN + SELECT RAISE(ABORT, 'audit_prune_guard is protected (in-process prune only)'); + END; + )", + "create prune-guard protection trigger"); } std::optional FaultAuditLog::read_head_row_locked() const { diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 5103884d..e96d583c 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -327,6 +327,24 @@ FaultManagerNode::~FaultManagerNode() { if (rosbag_capture_) { rosbag_capture_->stop(); } + + // Close the chain with a "logging deactivated" marker (CIR (EU) 2024/2690 + // sec. 3.2). Best-effort and appended directly (recorded even in confirmed_only + // mode); a failure here must not throw out of the destructor. + if (audit_log_) { + try { + AuditEvent marker; + marker.fault_code = "__audit__"; + marker.transition = kTransitionLoggingDeactivated; + marker.status = "INACTIVE"; + marker.source_id = "fault_manager"; + marker.description = "audit logging deactivated"; + marker.occurred_at_ns = get_wall_clock_time().nanoseconds(); + audit_log_->append(marker); + } catch (const std::exception & e) { + RCLCPP_WARN(get_logger(), "Failed to append audit 'logging_deactivated' marker: %s", e.what()); + } + } } std::unique_ptr FaultManagerNode::create_storage() { @@ -380,6 +398,11 @@ std::unique_ptr FaultManagerNode::create_audit_log() { retention = 0; } + // Fail-closed: when set, an append failure is a hard error (the operation + // aborts and the audit is marked unhealthy) instead of being logged and + // silently dropped. Off by default so the audit never blocks fault processing. + audit_fail_closed_ = declare_parameter("audit_log.fail_closed", false); + // Path: explicit override, else a sibling of the fault DB, else :memory:. std::string audit_path = declare_parameter("audit_log.database_path", ""); @@ -412,8 +435,26 @@ std::unique_ptr FaultManagerNode::create_audit_log() { try { auto log = std::make_unique(audit_path, retention); RCLCPP_INFO(get_logger(), - "Fault audit log enabled: %s (transitions=%s, retention=%" PRId64 ", resume_seq=%" PRId64 ")", - audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, log->head().seq); + "Fault audit log enabled: %s (transitions=%s, retention=%" PRId64 + ", fail_closed=%s, resume_seq=%" PRId64 ")", + audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, + audit_fail_closed_ ? "true" : "false", log->head().seq); + + // Record a "logging activated" marker so the chain documents its own start + // (CIR (EU) 2024/2690 sec. 3.2). Appended directly, so it is recorded even in + // confirmed_only mode. Best-effort: a failure here does not abort startup. + AuditEvent marker; + marker.fault_code = "__audit__"; + marker.transition = kTransitionLoggingActivated; + marker.status = "ACTIVE"; + marker.source_id = "fault_manager"; + marker.description = "audit logging activated"; + marker.occurred_at_ns = get_wall_clock_time().nanoseconds(); + try { + log->append(marker); + } catch (const std::exception & e) { + RCLCPP_WARN(get_logger(), "Failed to append audit 'logging_activated' marker: %s", e.what()); + } return log; } catch (const std::exception & e) { RCLCPP_ERROR(get_logger(), "Failed to open fault audit log '%s': %s", audit_path.c_str(), e.what()); @@ -442,8 +483,22 @@ void FaultManagerNode::audit_transition(const char * transition, const ros2_medk try { audit_log_->append(event); } catch (const std::exception & e) { - RCLCPP_ERROR(get_logger(), "Failed to append audit record for '%s' (%s): %s", fault.fault_code.c_str(), transition, - e.what()); + // A dropped append is a hole in the chain that verify() can never see (it + // proves nothing was deleted, not that everything was written). Always make + // it loud and visible via the health counter so the gap is observable. + audit_dropped_writes_.fetch_add(1, std::memory_order_relaxed); + audit_healthy_.store(false, std::memory_order_relaxed); + const uint64_t dropped = audit_dropped_writes_.load(std::memory_order_relaxed); + RCLCPP_ERROR(get_logger(), "Failed to append audit record for '%s' (%s): %s [audit dropped_writes=%" PRIu64 "]", + fault.fault_code.c_str(), transition, e.what(), dropped); + if (audit_fail_closed_) { + // Compliance-strict: refuse to proceed as if nothing happened. Abort the + // operation so the broken audit cannot be silently outlived. + RCLCPP_FATAL(get_logger(), + "audit_log.fail_closed is set: aborting operation because the audit append failed for '%s' (%s)", + fault.fault_code.c_str(), transition); + throw; + } } } @@ -549,6 +604,12 @@ void FaultManagerNode::handle_report_fault( } // Note: PREFAILED/PREPASSED status changes don't emit events (debounce in progress) + // A PASSED event can drive an existing fault past the healing threshold to + // HEALED. That is the fault's END and must be audited (with a distinct kind + // and source) so the timeline is complete, not just occurred+confirmed. + const bool just_healed = !is_new && status_before != ros2_medkit_msgs::msg::Fault::STATUS_HEALED && + fault_after->status == ros2_medkit_msgs::msg::Fault::STATUS_HEALED; + // Append tamper-evident audit records for the transitions that just happened. // Recorded regardless of correlation muting: muting affects display, not the // fact that the state transition occurred. @@ -558,6 +619,9 @@ void FaultManagerNode::handle_report_fault( if (just_confirmed) { audit_transition(kTransitionConfirmed, *fault_after, request->source_id, event_time.nanoseconds()); } + if (just_healed) { + audit_transition(kTransitionHealed, *fault_after, "auto_heal", event_time.nanoseconds()); + } // Capture snapshots/rosbag when a fault confirms via the bounded pool (issue #441). // handle_report_fault runs on the single-threaded executor, so confirmations are diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index 581ba006..d6e6ed06 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -26,6 +26,7 @@ using ros2_medkit_fault_manager::AuditEvent; using ros2_medkit_fault_manager::FaultAuditLog; using ros2_medkit_fault_manager::kTransitionCleared; using ros2_medkit_fault_manager::kTransitionConfirmed; +using ros2_medkit_fault_manager::kTransitionHealed; using ros2_medkit_fault_manager::kTransitionOccurred; namespace { @@ -334,6 +335,44 @@ TEST_F(FaultAuditLogTest, RotationPrunePassesAppendOnlyTrigger) { EXPECT_TRUE(log.verify().ok); } +// A "healed" auto-recovery row chains and verifies like any other transition, and +// stays distinct from a "cleared" so the timeline can tell them apart. +TEST_F(FaultAuditLogTest, HealedTransitionChainVerifies) { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F1", kTransitionConfirmed, 200)); + log.append(make_event("F1", kTransitionHealed, 300)); + + auto records = log.read(); + ASSERT_EQ(records.size(), 3u); + EXPECT_EQ(records[2].event.transition, kTransitionHealed); + EXPECT_NE(records[2].event.transition, std::string(kTransitionCleared)); + EXPECT_TRUE(log.verify().ok); +} + +// Defense-in-depth (item 4): the prune guard itself is protected. An out-of-band +// connection cannot flip audit_prune_guard open, so it cannot then delete a +// prefix past the append-only delete trigger. The in-process prune still works. +TEST_F(FaultAuditLogTest, GuardProtectTriggerBlocksExternalGuardFlip) { + { + FaultAuditLog log(path_, /*retention_max_records=*/3); + for (int i = 1; i <= 8; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + EXPECT_EQ(log.record_count(), 3); // in-process prune succeeded despite the protect trigger + EXPECT_TRUE(log.verify().ok); + } + + // An external writer (no in-process temp unlock marker) cannot open the guard. + EXPECT_NE(raw_exec_rc(path_, "UPDATE audit_prune_guard SET enabled = 1 WHERE id = 1"), SQLITE_OK); + // With the guard still closed, a raw DELETE remains blocked by the delete trigger. + EXPECT_NE(raw_exec_rc(path_, "DELETE FROM audit_log"), SQLITE_OK); + + FaultAuditLog reopened(path_, 3); + EXPECT_EQ(reopened.record_count(), 3); + EXPECT_TRUE(reopened.verify().ok); +} + int main(int argc, char ** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 28411122..9a9dbc37 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include #include @@ -1382,6 +1384,8 @@ class FaultAuditIntegrationTest : public ::testing::Test { fm_options.parameter_overrides({ {"storage_type", "memory"}, {"confirmation_threshold", -1}, + {"healing_enabled", true}, + {"healing_threshold", 1}, // counter >= 1 heals (two PASSED after one FAILED) {"audit_log.enabled", true}, {"audit_log.database_path", audit_path_}, }); @@ -1448,19 +1452,66 @@ TEST_F(FaultAuditIntegrationTest, TransitionsAppendVerifiableChain) { ASSERT_TRUE(spin_until_ready(cf)); ASSERT_TRUE(cf.get()->success); - // Reopen the audit DB independently and inspect the persisted chain. + // Reopen the audit DB independently and inspect the persisted chain. The chain + // opens with a "logging_activated" lifecycle marker (seq 1), then the three + // fault transitions. The "logging_deactivated" marker is only appended when the + // node is destroyed (TearDown), so it is not visible to this read. ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); auto records = audit.read(); - ASSERT_EQ(records.size(), 3u); - EXPECT_EQ(records[0].event.transition, ros2_medkit_fault_manager::kTransitionOccurred); - EXPECT_EQ(records[1].event.transition, ros2_medkit_fault_manager::kTransitionConfirmed); - EXPECT_EQ(records[2].event.transition, ros2_medkit_fault_manager::kTransitionCleared); - EXPECT_EQ(records[0].event.fault_code, "AUDIT_FAULT"); - EXPECT_EQ(records[1].event.source_id, "/plc/pump"); + ASSERT_EQ(records.size(), 4u); + EXPECT_EQ(records[0].event.transition, ros2_medkit_fault_manager::kTransitionLoggingActivated); + EXPECT_EQ(records[1].event.transition, ros2_medkit_fault_manager::kTransitionOccurred); + EXPECT_EQ(records[2].event.transition, ros2_medkit_fault_manager::kTransitionConfirmed); + EXPECT_EQ(records[3].event.transition, ros2_medkit_fault_manager::kTransitionCleared); + EXPECT_EQ(records[1].event.fault_code, "AUDIT_FAULT"); + EXPECT_EQ(records[2].event.source_id, "/plc/pump"); + + auto result = audit.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 4); +} + +// Completeness: an auto-healed fault must record its END. One FAILED confirms the +// fault (occurred + confirmed); two PASSED drive the debounce counter to the +// healing threshold, which must append a distinct "healed" row (source auto_heal), +// and the full occurred -> confirmed -> healed chain must verify. +TEST_F(FaultAuditIntegrationTest, AutoHealAppendsHealedRow) { + auto send_report = [&](uint8_t event_type) { + auto req = std::make_shared(); + req->fault_code = "HEAL_FAULT"; + req->event_type = event_type; + req->severity = Fault::SEVERITY_ERROR; // not CRITICAL: goes through debounce + req->description = "intermittent sensor"; + req->source_id = "/robot/sensor"; + auto fut = report_client_->async_send_request(req); + ASSERT_TRUE(spin_until_ready(fut)); + ASSERT_TRUE(fut.get()->accepted); + }; + + send_report(ReportFault::Request::EVENT_FAILED); // counter -1, threshold -1 => CONFIRMED + send_report(ReportFault::Request::EVENT_PASSED); // counter 0 (hysteresis): stays CONFIRMED + send_report(ReportFault::Request::EVENT_PASSED); // counter 1 >= healing_threshold => HEALED + + ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); + std::vector transitions; + std::string heal_source; + for (const auto & rec : audit.read()) { + if (rec.event.fault_code == "HEAL_FAULT") { + transitions.push_back(rec.event.transition); + if (rec.event.transition == ros2_medkit_fault_manager::kTransitionHealed) { + heal_source = rec.event.source_id; + } + } + } + + ASSERT_EQ(transitions.size(), 3u) << "expected occurred, confirmed, healed"; + EXPECT_EQ(transitions[0], ros2_medkit_fault_manager::kTransitionOccurred); + EXPECT_EQ(transitions[1], ros2_medkit_fault_manager::kTransitionConfirmed); + EXPECT_EQ(transitions[2], ros2_medkit_fault_manager::kTransitionHealed); + EXPECT_EQ(heal_source, "auto_heal"); auto result = audit.verify(); EXPECT_TRUE(result.ok) << result.error; - EXPECT_EQ(result.checked, 3); } TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { @@ -1530,6 +1581,104 @@ TEST(FaultAuditTimerTest, TimerConfirmationAppendsConfirmedAuditRow) { EXPECT_TRUE(audit->verify().ok); } +namespace { + +/// Force the node's next audit append to fail by inserting a row at the seq the +/// node will try next (MAX(seq)+1), so its INSERT collides on the seq PRIMARY KEY. +/// Done from a separate connection; the append-only triggers do not block INSERT. +void poison_next_audit_seq(const std::string & db_path) { + sqlite3 * db = nullptr; + ASSERT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + const char * sql = + "INSERT INTO audit_log (seq, occurred_at_ns, fault_code, transition, severity, status, source_id, " + "description, prev_hash, record_hash) " + "SELECT COALESCE(MAX(seq), 0) + 1, 0, 'x', 'x', 0, 'x', 'x', 'x', 'x', 'x' FROM audit_log;"; + char * err = nullptr; + int rc = sqlite3_exec(db, sql, nullptr, nullptr, &err); + std::string err_str = err ? err : ""; + sqlite3_free(err); + sqlite3_close(db); + ASSERT_EQ(rc, SQLITE_OK) << err_str; +} + +Fault make_failclosed_fault() { + Fault f; + f.fault_code = "FAILCLOSED"; + f.severity = Fault::SEVERITY_ERROR; + f.status = "CONFIRMED"; + f.description = "injected"; + return f; +} + +std::string make_temp_audit_path(const char * prefix) { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + return (std::filesystem::temp_directory_path() / (std::string(prefix) + std::to_string(dist(gen)) + ".db")).string(); +} + +void remove_audit_files(const std::string & path) { + std::error_code ec; + std::filesystem::remove(path, ec); + std::filesystem::remove(path + "-wal", ec); + std::filesystem::remove(path + "-shm", ec); +} + +} // namespace + +// Silent-gap guard, default behaviour: when fail_closed is false, an append +// failure must NOT abort the operation, but it must be VISIBLE - the dropped +// counter increments and the audit is flagged unhealthy (not silently lost). +TEST(FaultAuditFailClosedTest, FailOpenFlagsButDoesNotThrow) { + const std::string audit_path = make_temp_audit_path("test_audit_failopen_"); + { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"audit_log.enabled", true}, + {"audit_log.database_path", audit_path}, + {"audit_log.fail_closed", false}, + }); + auto node = std::make_shared(options); + ASSERT_TRUE(node->audit_healthy()); + EXPECT_EQ(node->audit_dropped_writes(), 0u); + + poison_next_audit_seq(audit_path); + + EXPECT_NO_THROW( + node->audit_transition_for_test(ros2_medkit_fault_manager::kTransitionConfirmed, make_failclosed_fault())); + EXPECT_EQ(node->audit_dropped_writes(), 1u); + EXPECT_FALSE(node->audit_healthy()); + } + remove_audit_files(audit_path); +} + +// Silent-gap guard, compliance-strict: with fail_closed true, an injected append +// failure must abort the operation (throw) rather than silently proceed, and the +// same health signals must fire. +TEST(FaultAuditFailClosedTest, FailClosedAbortsAndFlags) { + const std::string audit_path = make_temp_audit_path("test_audit_failclosed_"); + { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"audit_log.enabled", true}, + {"audit_log.database_path", audit_path}, + {"audit_log.fail_closed", true}, + }); + auto node = std::make_shared(options); + + poison_next_audit_seq(audit_path); + + EXPECT_THROW( + node->audit_transition_for_test(ros2_medkit_fault_manager::kTransitionConfirmed, make_failclosed_fault()), + std::exception); + EXPECT_EQ(node->audit_dropped_writes(), 1u); + EXPECT_FALSE(node->audit_healthy()); + } + remove_audit_files(audit_path); +} + int main(int argc, char ** argv) { rclcpp::init(argc, argv); ::testing::InitGoogleTest(&argc, argv);