microsoft
diff --git a/‎.github/workflows/ci-verification.yml‎
Lines changed: 22 additions & 0 deletions b/‎.github/workflows/ci-verification.yml‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 8 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 19 additions & 4 deletions b/‎CMakeLists.txt‎
Lines changed: 19 additions & 4 deletions
diff --git a/‎doc/audit/builtin_maps.rst‎
Lines changed: 55 additions & 0 deletions b/‎doc/audit/builtin_maps.rst‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎doc/host_config_schema/cchost_config.json‎
Lines changed: 45 additions & 0 deletions b/‎doc/host_config_schema/cchost_config.json‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎doc/operations/recovery.rst‎
Lines changed: 92 additions & 2 deletions b/‎doc/operations/recovery.rst‎
Lines changed: 92 additions & 2 deletions
diff --git a/‎include/ccf/node/startup_config.h‎
Lines changed: 11 additions & 0 deletions b/‎include/ccf/node/startup_config.h‎
Lines changed: 11 additions & 0 deletions
@@ -239,3 +239,25 @@ jobs:
           name: tlc-trace-validation-consensus
           path: |
             tla/traces/*
+
+  model-checking-self-healing-open:
+    name: Model Checking - Self-Healing Open
+    runs-on: [self-hosted, 1ES.Pool=gha-vmss-d16av5-ci]
+    container:
+      image: mcr.microsoft.com/azurelinux/base/core:3.0
+      options: --user root --publish-all --cap-add NET_ADMIN --cap-add NET_RAW --cap-add SYS_PTRACE
+
+    steps:
+      - name: "Checkout dependencies"
+        shell: bash
+        run: |
+          gpg --import /etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY
+          tdnf -y update
+          tdnf -y install ca-certificates git
+
+      - uses: actions/checkout@v5
+      - name: Install Stateright dependencies
+        run: |
+          tdnf install -y cargo
+
+      - run: cd tla/disaster-recovery && cargo run check
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [7.0.0-dev7]
+
+[7.0.0-dev7]: https://github.com/microsoft/CCF/releases/tag/ccf-7.0.0-dev7
+
+### Added
+
+- Experimental self-healing-open protocol for automatically transitioning-to-open during a disaster recovery without operator intervention. (#7189)
+
 ## [7.0.0-dev6]
 
 [7.0.0-dev6]: https://github.com/microsoft/CCF/releases/tag/ccf-7.0.0-dev6
 
@@ -366,6 +366,7 @@ endif()
 set(CCF_IMPL_SOURCE
     ${CCF_DIR}/src/enclave/main.cpp ${CCF_DIR}/src/enclave/thread_local.cpp
     ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp
+    ${CCF_DIR}/src/node/self_healing_open_impl.cpp
 )
 
 add_ccf_static_library(
@@ -736,7 +737,9 @@ if(BUILD_TESTS)
     add_unit_test(
       frontend_test
       ${CMAKE_CURRENT_SOURCE_DIR}/src/node/rpc/test/frontend_test.cpp
-      ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp
+      ${CCF_DIR}/src/node/quote.cpp
+      ${CCF_DIR}/src/node/uvm_endorsements.cpp
+      ${CCF_DIR}/src/node/self_healing_open_impl.cpp
     )
     target_link_libraries(
       frontend_test
@@ -747,6 +750,8 @@ if(BUILD_TESTS)
               ccfcrypto
               ccf_kv
               ccf_tasks
+              curl
+              uv
     )
 
     add_unit_test(
@@ -772,11 +777,21 @@ if(BUILD_TESTS)
     add_unit_test(
       node_frontend_test
       ${CMAKE_CURRENT_SOURCE_DIR}/src/node/rpc/test/node_frontend_test.cpp
-      ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp
+      ${CCF_DIR}/src/node/quote.cpp
+      ${CCF_DIR}/src/node/uvm_endorsements.cpp
+      ${CCF_DIR}/src/node/self_healing_open_impl.cpp
     )
     target_link_libraries(
-      node_frontend_test PRIVATE ${CMAKE_THREAD_LIBS_INIT} http_parser ccf_js
-                                 ccf_endpoints ccfcrypto ccf_kv
+      node_frontend_test
+      PRIVATE ${CMAKE_THREAD_LIBS_INIT}
+              http_parser
+              ccf_js
+              ccf_endpoints
+              ccfcrypto
+              ccf_kv
+              uv
+              curl
+              ccf_tasks
     )
 
     add_unit_test(
 
@@ -564,4 +564,59 @@ While the contents themselves are encrypted, the table is public so as to be acc
 **Value** The mechanism by which the ledger secret was recovered.
 
 .. doxygenenum:: ccf::RecoveryType
+   :project: CCF
+
+``self_healing_open.nodes``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Key** Intrinsic node ID: A string which is unique to a particular node role within a cluster.
+
+**Value** 
+
+.. doxygenstruct:: ccf::self_healing_open::NodeInfo
+   :project: CCF
+   :members:
+
+``self_healing_open.gossip``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Key** Intrinsic node ID of the source of the gossip message.
+
+**Value**
+
+.. doxygenstruct:: ccf::self_healing_open::GossipRequest
+   :project: CCF
+   :members:
+
+``self_healing_open.chosen_node``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Value** The intrinsic node ID of the chosen node. This will either be the node this node voted for, or the node that it has received an `IAmOpen` message from.
+
+``self_healing_open.votes``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Key** Intrinsic node ID of the node which has voted for this node to be opened.
+
+``self_healing_open.sm_state``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Value** State machine state of the self-healing open protocol.
+
+.. doxygenenum:: ccf::self_healing_open::StateMachine
+   :project: CCF
+
+``self_healing_open.timeout_sm_state``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Value** Timeout state machine state of the self-healing open protocol. Ticks based on `failover_timeout` and advances `self_healing_open.sm_state` if it falls behind.
+
+See :cpp:enum:`ccf::self_healing_open::StateMachine` above.
+
+``self_healing_open.open_kind``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**Value** The kind of recovery that was performed, either `Quorum`-based which guarantees that there is at most one recovered service using this path, or `Failover`-based which could allow multiple services to recover.
+
+.. doxygenenum:: ccf::self_healing_open::OpenKinds
    :project: CCF
@@ -360,6 +360,51 @@
                   "previous_sealed_ledger_secret_location": {
                     "type": ["string"],
                     "description": "Path to the sealed ledger secret folder, the ledger secrets for the recovered service will be unsealed from here instead of reconstructed from recovery shares."
+                  },
+                  "self_healing_open": {
+                    "type": "object",
+                    "properties": {
+                      "identity": {
+                        "type": "object",
+                        "properties": {
+                          "intrinsic_id": {
+                            "type": "string",
+                            "description": "Intrinsic identifier of this node, used to identify it in the self-healing-open protocol"
+                          },
+                          "published_address": {
+                            "type": "string",
+                            "description": "Published address (host:port) of this node, used to identify it in the self-healing-open protocol"
+                          }
+                        }
+                      },
+                      "cluster_identities": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "intrinsic_id": {
+                              "type": "string",
+                              "description": "Intrinsic identifier of the node, used to identify it in the self-healing-open protocol"
+                            },
+                            "published_address": {
+                              "type": "string",
+                              "description": "Published address (host:port) of the node, used for communication during the self-healing-open protocol"
+                            }
+                          }
+                        },
+                        "description": "List of identities for all nodes in the cluster"
+                      },
+                      "retry_timeout": {
+                        "type": "string",
+                        "default": "100ms",
+                        "description": "Interval (time string) at which the node re-sends self-healing-open messages. This should be significantly less than 'failover_timeout'"
+                      },
+                      "failover_timeout": {
+                        "type": "string",
+                        "default": "2000ms",
+                        "description": "Interval (time string) after which the node forcibly advances to the next phase of the self-healing-open protocol"
+                      }
+                    }
                   }
                 },
                 "required": ["previous_service_identity_file"],
 
@@ -113,8 +113,8 @@ Summary Diagram
 
 Once operators have established a recovered crash-fault tolerant public network, the existing members of the consortium :ref:`must vote to accept the recovery of the network and submit their recovery shares <governance/accept_recovery:Accepting Recovery and Submitting Shares>`.
 
-Local Sealing Recovery
-----------------------
+Local Sealing Recovery (Experimental)
+-------------------------------------
 
 SNP provides the `DERIVED_KEY` guest message which derives a key from the CPU's VCEK (or VLEK), TCB version and the guest's measurement and host_data (policy), thus any change to the CPU, measurement or policy, or a rolled-back TCB version, will prevent the key from being reconstructed.
 If configured, the node will unseal the secrets it previously sealed instead of waiting for recovery shares from members after `transition_to_open` is triggered.
@@ -145,6 +145,96 @@ Which of these two paths is taken is noted in the `public:ccf.internal.last_reco
       ...
     $ /opt/ccf/bin/js_generic --config /path/to/config/file
 
+Self-Healing-Open recovery (Experimental)
+-----------------------------------------
+
+In environments with limited orchestration or limited operator access, it is desirable to allow an automated disaster recovery without operator intervention.
+At a high level, Self-Healing-Open recovery allows recovering replicas to discover which node has the most up-to-date ledger and automatically recover the network using that ledger.
+The protocol completes with a node choosing to `transition-to-open`, and so requires another mechanism to recover the private ledger.
+If it is likely that the nodes will restart on the same hardware, local sealing recovery (see above) can be used to recover the private ledger automatically, and bring the service fully online.
+
+There are two paths, an election path, and a very-high-availability failover path.
+The election path ensures that if all nodes restart and have full network connectivity, a majority of nodes' on-disk ledgers contain every committed transaction, and no timeouts trigger, then there will be only one recovered network and all committed transactions will be persisted.
+However, the election path can become stuck, in which case the failover path is designed to ensure progress.
+
+In the election path, nodes first gossip with each other, learning of the ledgers of other nodes.
+Once they have heard from every node, they vote for the node with the best ledger.
+If a node receives votes from a majority of nodes, it invokes `transition-to-open` and notifies the other nodes to restart and join it.
+This path is illustrated below, and is guaranteed to succeed if all nodes can communicate and no timeouts trigger.
+
+.. mermaid::
+
+    sequenceDiagram
+      participant N1
+      participant N2
+      participant N3
+      
+      Note over N1, N3: Gossip
+
+      N1 ->> N2: Gossip(Tx=1)
+      N1 ->> N3: Gossip(Tx=1)
+      N2 ->> N3: Gossip(Tx=2)
+      N3 ->> N2: Gossip(Tx=3)
+
+      Note over N1, N3: Vote
+      N2 ->> N3: Vote
+      N3 ->> N3: Vote
+
+      Note over N1, N3: Open/Join
+      N3 ->> N1: IAmOpen
+      N3 ->> N2: IAmOpen
+
+      Note over N1, N2: Restart
+
+      Note over N3: Transition-to-open
+
+      Note over N3: Local unsealing
+
+      Note over N3: Open
+
+      N1 ->> N3: Join
+      N2 ->> N3: Join
+
+In the failover path, each phase has a timeout to skip to the next phase if a failure has occurred.
+For example, the election path requires all nodes to communicate to advance from the gossip phase to the vote phase.
+However, if any node fails to recover, the election path is stuck.
+In this case, after a timeout, nodes will advance to the vote phase regardless of whether they have heard from all nodes, and vote for the best ledger they have heard of at that point.
+
+Unfortunately, this can lead to multiple forks of the service if different nodes cannot communicate with each other and timeout.
+Hence, we recommend setting the timeout substantially higher than the highest expected recovery time, to minimise the chance of this happening.
+Whether timeouts were used to open the service can be audited via the `public:ccf.gov.selfhealingopen.failover_open` table.
+
+This failover path is illustrated below.
+
+.. mermaid::
+
+    sequenceDiagram
+      participant N1
+      participant N2
+      participant N3
+
+      Note over N1, N3: Gossip
+
+      N2 ->> N3: Gossip(Tx=2)
+      N3 ->> N2: Gossip(Tx=3)
+
+      Note over N1: Timeout
+      Note over N3: Timeout
+
+      Note over N1, N3: Vote
+
+      N1 ->> N1: Vote
+      N3 ->> N3: Vote
+      N2 ->> N3: Vote
+
+      Note over N1, N3: Open/Join
+      
+      Note over N1: Transition-to-open
+      Note over N3: Transition-to-open
+
+
+If the network fails during reconfiguration, each node will use its latest known configuration to recover. Since reconfiguration requires votes from a majority of nodes, the latest configuration should recover using the election path; however, nodes in the previous configuration may recover using the election path.
+
 Notes
 -----
 
 
@@ -11,6 +11,7 @@
 #include "ccf/service/service_config.h"
 #include "ccf/service/tables/host_data.h"
 #include "ccf/service/tables/members.h"
+#include "ccf/service/tables/self_healing_open.h"
 
 #include <optional>
 #include <string>
@@ -102,6 +103,15 @@ namespace ccf
     Snapshots snapshots = {};
   };
 
+  struct SelfHealingOpenConfig // Settings for the self-healing-open recovery protocol
+  {
+    self_healing_open::Identity identity; // Identity of this node
+    std::vector<self_healing_open::Identity> cluster_identities; // Identities of all nodes in the cluster
+    ccf::ds::TimeString retry_timeout = {"100ms"}; // Interval at which self-healing-open messages are re-sent; should be well below failover_timeout
+    ccf::ds::TimeString failover_timeout = {"2000ms"}; // Interval after which the node forcibly advances to the next protocol phase
+    bool operator==(const SelfHealingOpenConfig&) const = default; // Defaulted member-wise equality
+  };
+
   struct StartupConfig : CCFConfig
   {
     StartupConfig() = default;
@@ -146,6 +156,7 @@ namespace ccf
         std::nullopt;
       std::optional<std::string> previous_sealed_ledger_secret_location =
         std::nullopt;
+      std::optional<SelfHealingOpenConfig> self_healing_open = std::nullopt;
     };
     Recover recover = {};
   };