From 5b38622aa5f87fdd06681d0de6cdaa2839513fbf Mon Sep 17 00:00:00 2001 From: Tim Date: Wed, 15 Apr 2026 22:12:20 -0400 Subject: [PATCH 1/3] feat: generate operational runbooks in eject bundle (#218) Add day-to-day-ops and disaster-recovery runbook templates that render with workload-specific data (cloud, region, sites, cluster name). Wired into eject.Run() so `ptd eject --dry-run` produces runbooks/ in the output directory. --- lib/eject/eject.go | 48 +++ lib/eject/runbooks.go | 594 +++++++++++++++++++++++++++++++++++++ lib/eject/runbooks_test.go | 286 ++++++++++++++++++ 3 files changed, 928 insertions(+) create mode 100644 lib/eject/runbooks.go create mode 100644 lib/eject/runbooks_test.go diff --git a/lib/eject/eject.go b/lib/eject/eject.go index 9c54732..a0c414f 100644 --- a/lib/eject/eject.go +++ b/lib/eject/eject.go @@ -5,6 +5,7 @@ import ( "fmt" "log/slog" "os" + "path/filepath" "github.com/posit-dev/ptd/lib/helpers" "github.com/posit-dev/ptd/lib/types" @@ -55,6 +56,53 @@ func Run(ctx context.Context, t types.Target, opts Options) error { "connections", len(crDetails.Connections), ) + rbData, err := buildRunbookData(config, opts.TargetName) + if err != nil { + return fmt.Errorf("failed to build runbook data: %w", err) + } + + runbooks, err := GenerateRunbooks(rbData) + if err != nil { + return fmt.Errorf("failed to generate runbooks: %w", err) + } + + runbooksDir := filepath.Join(opts.OutputDir, "runbooks") + if err := os.MkdirAll(runbooksDir, 0755); err != nil { + return fmt.Errorf("failed to create runbooks directory: %w", err) + } + + for filename, content := range runbooks { + path := filepath.Join(runbooksDir, filename) + if err := os.WriteFile(path, []byte(content), 0644); err != nil { + return fmt.Errorf("failed to write runbook %s: %w", filename, err) + } + slog.Info("Generated runbook", "path", path) + } + slog.Info("Eject bundle generated", "path", opts.OutputDir) return nil } + +func buildRunbookData(config interface{}, targetName string) 
(*RunbookData, error) { + data := &RunbookData{WorkloadName: targetName} + + switch cfg := config.(type) { + case types.AWSWorkloadConfig: + data.Cloud = "aws" + data.Region = cfg.Region + for name, site := range cfg.Sites { + data.Sites = append(data.Sites, SiteData{Name: name, Domain: site.Spec.Domain}) + } + case types.AzureWorkloadConfig: + data.Cloud = "azure" + data.Region = cfg.Region + data.ResourceGroup = fmt.Sprintf("rsg-ptd-%s", targetName) + for name, site := range cfg.Sites { + data.Sites = append(data.Sites, SiteData{Name: name, Domain: site.Spec.Domain}) + } + default: + return nil, fmt.Errorf("unsupported config type for target %s", targetName) + } + + return data, nil +} diff --git a/lib/eject/runbooks.go b/lib/eject/runbooks.go new file mode 100644 index 0000000..a8a5da8 --- /dev/null +++ b/lib/eject/runbooks.go @@ -0,0 +1,594 @@ +package eject + +import ( + "fmt" + "io" + "strings" + "text/template" +) + +// SiteData holds per-site information for runbook templates. +type SiteData struct { + Name string + Domain string +} + +// RunbookData contains all data needed to render the operational runbooks. +type RunbookData struct { + WorkloadName string + Cloud string // "aws" or "azure" + Region string + ClusterName string + ResourceGroup string + Sites []SiteData +} + +var runbookFuncMap = template.FuncMap{ + "upper": strings.ToUpper, +} + +var dayToDayOpsTemplate = template.Must(template.New("day-to-day-ops").Funcs(runbookFuncMap).Parse( + `# Day-to-Day Operations — {{.WorkloadName}} + +**Workload:** {{.WorkloadName}} +**Cloud:** {{.Cloud | upper}} +**Region:** {{.Region}} +{{- range .Sites}} +**Site:** {{.Name}} ({{.Domain}}) +{{- end}} + +## Running PTD Ensure Steps + +Each infrastructure change is applied by running the relevant ` + "`ptd ensure`" + ` step. Always preview first with ` + "`--dry-run`" + `, then apply. 
+ +| Step | When to Re-Run | What It Changes | +|---|---|---| +{{- if eq .Cloud "aws"}} +| bootstrap | Initial setup only; rarely re-run | S3 state bucket, KMS key, IAM bootstrap roles | +| persistent | VPC, RDS, S3, FSx, IAM, DNS, or TLS changes | VPC, subnets, RDS instance, S3 buckets, FSx filesystem, IAM roles, Route53 zones, ACM certificates | +| postgres_config | Database user/grant changes | PostgreSQL users, databases, grants | +| eks | Cluster or node group changes | EKS cluster, managed node groups, OIDC provider, storage classes | +| clusters | Namespace, RBAC, operator, or ingress controller changes | K8s namespaces, network policies, IAM-to-K8s bindings, Team Operator, Traefik | +| helm | Monitoring, cert-manager, or CSI driver changes | Loki, Grafana, Mimir, Alloy, cert-manager, Secrets Store CSI | +| sites | Product deployment, ingress, or site config changes | TeamSite CRDs, ingress resources, site-specific configuration | +{{- else}} +| bootstrap | Initial setup only; rarely re-run | Blob state container, Key Vault encryption key | +| persistent | VNet, PostgreSQL, storage, Key Vault, or identity changes | VNet, Azure PostgreSQL, storage accounts, NetApp Files, Key Vault, managed identities, NSGs | +| postgres_config | Database user/grant changes | PostgreSQL users, databases, grants | +| aks | Cluster or node pool changes | AKS cluster, node pools, managed identity, storage classes | +| clusters | Namespace, RBAC, operator, or ingress controller changes | K8s namespaces, network policies, workload identity bindings, Team Operator, Traefik | +| helm | Monitoring, cert-manager, or CSI driver changes | Loki, Grafana, Mimir, Alloy, cert-manager, Secrets Store CSI | +| sites | Product deployment, ingress, or site config changes | TeamSite CRDs, ingress resources, site-specific configuration | +{{- end}} + +**Preview a step (dry-run):** + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps --dry-run +` + "```" + ` + +**Apply a step:** + +` + 
"```" + `bash +ptd ensure {{.WorkloadName}} --only-steps +` + "```" + ` + +## Scaling Product Replicas + +1. Edit the product replica count in the site's ` + "`site.yaml`" + `: + +` + "```" + `yaml +spec: + connect: + replicas: 3 +` + "```" + ` + +2. Preview and apply the sites step: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps sites --dry-run +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +3. Verify the new replica count: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get pods -n posit-team -l app.kubernetes.io/managed-by=team-operator +` + "```" + ` + +## Updating Product Versions + +1. Edit the product image tag in the site's ` + "`site.yaml`" + `: + +` + "```" + `yaml +spec: + connect: + image: ghcr.io/rstudio/rstudio-connect:2025.01.0 +` + "```" + ` + +2. Preview and apply the sites step: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps sites --dry-run +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +3. Verify pods roll to the new version: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout status deployment -n posit-team -l app.kubernetes.io/managed-by=team-operator +` + "```" + ` + +## Rotating TLS Certificates + +### ACM/Azure-Managed Certificates + +{{- if eq .Cloud "aws"}} + +ACM certificates auto-renew when DNS validation records are in place. To change the certificate: + +1. Update the certificate configuration in ` + "`ptd.yaml`" + `. +2. Re-run the persistent and sites steps: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run +ptd ensure {{.WorkloadName}} --only-steps persistent +ptd ensure {{.WorkloadName}} --only-steps sites --dry-run +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +{{- else}} + +Azure-managed certificates are handled by the platform. To change the certificate configuration: + +1. Update the certificate configuration in ` + "`ptd.yaml`" + `. +2. 
Re-run the persistent and sites steps: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run +ptd ensure {{.WorkloadName}} --only-steps persistent +ptd ensure {{.WorkloadName}} --only-steps sites --dry-run +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +{{- end}} + +### cert-manager Certificates + +cert-manager automatically renews certificates before expiry. To force renewal: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl delete certificate -n posit-team +` + "```" + ` + +cert-manager will detect the missing certificate and issue a new one. + +## Rotating Secrets + +### Database Passwords + +{{- if eq .Cloud "aws"}} + +Database passwords are stored in AWS Secrets Manager. To rotate: + +1. Update the password value in the relevant secret in Secrets Manager. +2. Update the password on the PostgreSQL server to match. +3. Re-run the persistent step to reconcile: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run +ptd ensure {{.WorkloadName}} --only-steps persistent +` + "```" + ` + +{{- else}} + +Database passwords are stored in Azure Key Vault. To rotate: + +1. Update the password value in the relevant Key Vault secret. +2. Update the password on the PostgreSQL server to match. +3. Re-run the persistent step to reconcile: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run +ptd ensure {{.WorkloadName}} --only-steps persistent +` + "```" + ` + +{{- end}} + +### Product Licenses + +Update the license key in the secret store ({{if eq .Cloud "aws"}}Secrets Manager{{else}}Key Vault{{end}}) and restart the affected product: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout restart deployment/-deployment -n posit-team +` + "```" + ` + +### RSA Keys + +**Warning:** Rotating RSA keys for Connect or Package Manager will invalidate all content signed with the previous key. Plan accordingly. 
+ +Update the key in the secret store and restart the affected product. + +## Checking Workload Health + +### Using ptd workon + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get pods -n posit-team +ptd workon {{.WorkloadName}} -- kubectl get pods -n posit-team-system +ptd workon {{.WorkloadName}} -- kubectl get ingressroute -n posit-team +` + "```" + ` + +### Using kubectl directly + +{{- if eq .Cloud "aws"}} + +` + "```" + `bash +aws eks update-kubeconfig --name --region {{.Region}} +kubectl get pods -n posit-team +kubectl get pods -n posit-team-system +kubectl get ingressroute -n posit-team +` + "```" + ` + +{{- else}} + +` + "```" + `bash +az aks get-credentials --resource-group {{.ResourceGroup}} --name +kubectl get pods -n posit-team +kubectl get pods -n posit-team-system +kubectl get ingressroute -n posit-team +` + "```" + ` + +{{- end}} + +## Restarting Products + +Restart a product deployment: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout restart deployment/-deployment -n posit-team +` + "```" + ` + +Monitor the rollout: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout status deployment/-deployment -n posit-team +` + "```" + ` + +Using kubectl directly: + +` + "```" + `bash +kubectl rollout restart deployment/-deployment -n posit-team +kubectl rollout status deployment/-deployment -n posit-team +` + "```" + ` +`)) + +var disasterRecoveryTemplate = template.Must(template.New("disaster-recovery").Funcs(runbookFuncMap).Parse( + `# Disaster Recovery — {{.WorkloadName}} + +**Workload:** {{.WorkloadName}} +**Cloud:** {{.Cloud | upper}} +**Region:** {{.Region}} +{{- range .Sites}} +**Site:** {{.Name}} ({{.Domain}}) +{{- end}} + +## Pulumi State Recovery + +{{- if eq .Cloud "aws"}} + +**State backend:** S3 bucket ` + "`ptd-{{.WorkloadName}}`" + ` in {{.Region}} + +S3 versioning is enabled on the state bucket. 
If state is corrupted or accidentally overwritten, restore a previous version: + +` + "```" + `bash +aws s3api list-object-versions --bucket ptd-{{.WorkloadName}} --prefix .pulumi/stacks/ +aws s3api get-object --bucket ptd-{{.WorkloadName}} --key --version-id restored-state.json +` + "```" + ` + +{{- else}} + +**State backend:** Azure Blob Storage container in storage account for {{.WorkloadName}} + +Blob versioning is enabled on the state container. If state is corrupted or accidentally overwritten, restore a previous version: + +` + "```" + `bash +az storage blob list --container-name --account-name --prefix .pulumi/stacks/ --include v +az storage blob download --container-name --account-name --name --version-id --file restored-state.json +` + "```" + ` + +{{- end}} + +The eject bundle contains a resource inventory that lists every managed resource and its physical ID. Use this inventory to verify state consistency. + +To reconcile infrastructure with state, re-run ` + "`ptd ensure`" + `: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps --dry-run +ptd ensure {{.WorkloadName}} --only-steps +` + "```" + ` + +## Database Recovery + +{{- if eq .Cloud "aws"}} + +### RDS Point-in-Time Restore + +RDS supports point-in-time recovery within the configured backup retention window. + +` + "```" + `bash +aws rds restore-db-instance-to-point-in-time \ + --source-db-instance-identifier \ + --target-db-instance-identifier \ + --restore-time \ + --region {{.Region}} +` + "```" + ` + +{{- else}} + +### Azure PostgreSQL Point-in-Time Restore + +Azure PostgreSQL Flexible Server supports point-in-time recovery within the configured backup retention window. + +` + "```" + `bash +az postgres flexible-server restore \ + --resource-group {{.ResourceGroup}} \ + --name \ + --source-server \ + --restore-time +` + "```" + ` + +{{- end}} + +### Post-Restore Steps + +1. 
Update the database endpoint in the secret store ({{if eq .Cloud "aws"}}Secrets Manager{{else}}Key Vault{{end}}) if the restored instance has a new hostname. +2. Re-run the persistent step to reconcile infrastructure with the new database: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run +ptd ensure {{.WorkloadName}} --only-steps persistent +` + "```" + ` + +## Storage Recovery + +{{- if eq .Cloud "aws"}} + +### FSx Backups + +FSx OpenZFS creates automatic daily backups. To restore from a backup: + +` + "```" + `bash +aws fsx describe-backups --filters Name=file-system-id,Values= --region {{.Region}} +aws fsx create-file-system-from-backup --backup-id --region {{.Region}} +` + "```" + ` + +### S3 Versioning + +S3 buckets have versioning enabled. Recover deleted or overwritten objects: + +` + "```" + `bash +aws s3api list-object-versions --bucket --prefix +aws s3api get-object --bucket --key --version-id restored-file +` + "```" + ` + +{{- else}} + +### Azure Files / Managed Disk Snapshots + +Restore from Azure file share or managed disk snapshots: + +` + "```" + `bash +az snapshot list --resource-group {{.ResourceGroup}} +az disk create --resource-group {{.ResourceGroup}} --name --source +` + "```" + ` + +### Blob Versioning + +Azure Blob Storage has versioning enabled. 
Recover previous versions: + +` + "```" + `bash +az storage blob list --container-name --account-name --include v +az storage blob download --container-name --account-name --name --version-id --file restored-file +` + "```" + ` + +{{- end}} + +## Kubernetes Cluster Recovery + +### Total Cluster Loss + +If the cluster is completely lost, rebuild from the eject bundle configuration: + +` + "```" + `bash +{{- if eq .Cloud "aws"}} +ptd ensure {{.WorkloadName}} --only-steps eks --dry-run +ptd ensure {{.WorkloadName}} --only-steps eks +{{- else}} +ptd ensure {{.WorkloadName}} --only-steps aks --dry-run +ptd ensure {{.WorkloadName}} --only-steps aks +{{- end}} +ptd ensure {{.WorkloadName}} --only-steps clusters --dry-run +ptd ensure {{.WorkloadName}} --only-steps clusters +ptd ensure {{.WorkloadName}} --only-steps helm --dry-run +ptd ensure {{.WorkloadName}} --only-steps helm +ptd ensure {{.WorkloadName}} --only-steps sites --dry-run +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +### Partial Failure (Node Groups) + +{{- if eq .Cloud "aws"}} + +If a node group is unhealthy, cordon and replace it: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl cordon +ptd workon {{.WorkloadName}} -- kubectl drain --ignore-daemonsets --delete-emptydir-data +` + "```" + ` + +Then re-run the eks step to reconcile the node group: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps eks --dry-run +ptd ensure {{.WorkloadName}} --only-steps eks +` + "```" + ` + +{{- else}} + +If a node pool is unhealthy, cordon and replace it: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl cordon +ptd workon {{.WorkloadName}} -- kubectl drain --ignore-daemonsets --delete-emptydir-data +` + "```" + ` + +Then re-run the aks step to reconcile the node pool: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps aks --dry-run +ptd ensure {{.WorkloadName}} --only-steps aks +` + "```" + ` + +{{- end}} + +### Stuck Pods + +Delete stuck pods to let the 
controller recreate them: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl delete pod -n posit-team +` + "```" + ` + +If a deployment is stuck, restart it: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout restart deployment/ -n posit-team +` + "```" + ` + +## DNS and Ingress Recovery + +1. Verify DNS resolution: + +` + "```" + `bash +{{- range .Sites}} +dig {{.Domain}} +{{- end}} +` + "```" + ` + +2. Check load balancer health: + +` + "```" + `bash +{{- if eq .Cloud "aws"}} +aws elbv2 describe-target-health --target-group-arn --region {{.Region}} +{{- else}} +az network lb show --resource-group {{.ResourceGroup}} --name +{{- end}} +` + "```" + ` + +3. Check Traefik IngressRoutes: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get ingressroute -n posit-team +ptd workon {{.WorkloadName}} -- kubectl describe ingressroute -n posit-team +` + "```" + ` + +4. Check TLS certificates: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get certificate -n posit-team +ptd workon {{.WorkloadName}} -- kubectl describe certificate -n posit-team +` + "```" + ` + +5. If DNS or ingress is misconfigured, re-run the sites step: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps sites --dry-run +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +## Full Environment Rebuild + +To rebuild the full environment from the eject bundle configuration: + +1. 
Re-run the infrastructure pipeline in order: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps bootstrap --dry-run +ptd ensure {{.WorkloadName}} --only-steps bootstrap +ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run +ptd ensure {{.WorkloadName}} --only-steps persistent +ptd ensure {{.WorkloadName}} --only-steps postgres_config --dry-run +ptd ensure {{.WorkloadName}} --only-steps postgres_config +{{- if eq .Cloud "aws"}} +ptd ensure {{.WorkloadName}} --only-steps eks --dry-run +ptd ensure {{.WorkloadName}} --only-steps eks +{{- else}} +ptd ensure {{.WorkloadName}} --only-steps aks --dry-run +ptd ensure {{.WorkloadName}} --only-steps aks +{{- end}} +ptd ensure {{.WorkloadName}} --only-steps clusters --dry-run +ptd ensure {{.WorkloadName}} --only-steps clusters +ptd ensure {{.WorkloadName}} --only-steps helm --dry-run +ptd ensure {{.WorkloadName}} --only-steps helm +ptd ensure {{.WorkloadName}} --only-steps sites --dry-run +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +2. Restore data from backups: + +{{- if eq .Cloud "aws"}} + - Restore RDS from snapshot or point-in-time recovery (see Database Recovery above). + - Restore FSx from backup (see Storage Recovery above). + - Restore S3 objects from versioned copies if needed. +{{- else}} + - Restore Azure PostgreSQL from point-in-time recovery (see Database Recovery above). + - Restore Azure Files or managed disks from snapshots (see Storage Recovery above). + - Restore blob objects from versioned copies if needed. +{{- end}} + +3. Re-populate manual secrets: + + The following secrets must be manually re-entered in {{if eq .Cloud "aws"}}Secrets Manager{{else}}Key Vault{{end}}: + - Product license keys (Connect, Workbench, Package Manager) + - OIDC client secrets + - Any other manually-managed secrets listed in the eject bundle's secrets inventory +`)) + +// GenerateRunbooks renders both operational runbooks and returns them as a map +// keyed by filename. 
+func GenerateRunbooks(data *RunbookData) (map[string]string, error) { + results := make(map[string]string, 2) + + ops, err := renderTemplate(dayToDayOpsTemplate, data) + if err != nil { + return nil, fmt.Errorf("failed to render day-to-day-ops runbook: %w", err) + } + results["day-to-day-ops.md"] = ops + + dr, err := renderTemplate(disasterRecoveryTemplate, data) + if err != nil { + return nil, fmt.Errorf("failed to render disaster-recovery runbook: %w", err) + } + results["disaster-recovery.md"] = dr + + return results, nil +} + +func renderTemplate(tmpl *template.Template, data *RunbookData) (string, error) { + var buf strings.Builder + if err := tmpl.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} + +// RenderDayToDayOps writes the day-to-day operations runbook to the given writer. +func RenderDayToDayOps(w io.Writer, data *RunbookData) error { + return dayToDayOpsTemplate.Execute(w, data) +} + +// RenderDisasterRecovery writes the disaster recovery runbook to the given writer. 
+func RenderDisasterRecovery(w io.Writer, data *RunbookData) error { + return disasterRecoveryTemplate.Execute(w, data) +} diff --git a/lib/eject/runbooks_test.go b/lib/eject/runbooks_test.go new file mode 100644 index 0000000..d2f564e --- /dev/null +++ b/lib/eject/runbooks_test.go @@ -0,0 +1,286 @@ +package eject + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func awsRunbookData() *RunbookData { + return &RunbookData{ + WorkloadName: "acme-prod", + Cloud: "aws", + Region: "us-east-1", + ClusterName: "default_acme-prod-control-plane", + Sites: []SiteData{ + {Name: "main", Domain: "connect.acme.com"}, + {Name: "secondary", Domain: "dev.acme.com"}, + }, + } +} + +func azureRunbookData() *RunbookData { + return &RunbookData{ + WorkloadName: "contoso-staging", + Cloud: "azure", + Region: "eastus", + ClusterName: "aks-ptd-contoso", + ResourceGroup: "rsg-ptd-contoso-staging", + Sites: []SiteData{ + {Name: "main", Domain: "connect.contoso.com"}, + }, + } +} + +func TestGenerateRunbooks_AWS_ReturnsExpectedFiles(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + + require.NoError(t, err) + assert.Len(t, results, 2) + assert.Contains(t, results, "day-to-day-ops.md") + assert.Contains(t, results, "disaster-recovery.md") +} + +func TestGenerateRunbooks_Azure_ReturnsExpectedFiles(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + + require.NoError(t, err) + assert.Len(t, results, 2) + assert.Contains(t, results, "day-to-day-ops.md") + assert.Contains(t, results, "disaster-recovery.md") +} + +func TestRunbook_DayToDayOps_AWS_ContainsSections(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "# Day-to-Day Operations — acme-prod") + assert.Contains(t, ops, "## Running PTD Ensure Steps") + assert.Contains(t, ops, "## Scaling Product Replicas") + 
assert.Contains(t, ops, "## Updating Product Versions") + assert.Contains(t, ops, "## Rotating TLS Certificates") + assert.Contains(t, ops, "## Rotating Secrets") + assert.Contains(t, ops, "## Checking Workload Health") + assert.Contains(t, ops, "## Restarting Products") +} + +func TestRunbook_DayToDayOps_AWS_Content(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "eks") + assert.Contains(t, ops, "aws eks update-kubeconfig") + assert.Contains(t, ops, "Secrets Manager") + assert.Contains(t, ops, "ACM") + assert.Contains(t, ops, "ptd ensure acme-prod") +} + +func TestRunbook_DayToDayOps_Azure_Content(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "aks") + assert.Contains(t, ops, "az aks get-credentials") + assert.Contains(t, ops, "Key Vault") + assert.Contains(t, ops, "rsg-ptd-contoso-staging") + assert.Contains(t, ops, "ptd ensure contoso-staging") +} + +func TestRunbook_DayToDayOps_AWS_SitesRendered(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "connect.acme.com") + assert.Contains(t, ops, "dev.acme.com") +} + +func TestRunbook_DisasterRecovery_AWS_ContainsSections(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "# Disaster Recovery — acme-prod") + assert.Contains(t, dr, "## Pulumi State Recovery") + assert.Contains(t, dr, "## Database Recovery") + assert.Contains(t, dr, "## Storage Recovery") + assert.Contains(t, dr, "## Kubernetes Cluster Recovery") + assert.Contains(t, dr, "## DNS and Ingress Recovery") + assert.Contains(t, dr, "## Full Environment Rebuild") +} + +func 
TestRunbook_DisasterRecovery_AWS_Content(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "ptd-acme-prod") + assert.Contains(t, dr, "aws rds restore-db-instance-to-point-in-time") + assert.Contains(t, dr, "aws fsx describe-backups") + assert.Contains(t, dr, "aws s3api list-object-versions") + assert.Contains(t, dr, "ptd ensure acme-prod --only-steps eks") +} + +func TestRunbook_DisasterRecovery_Azure_Content(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "Azure Blob Storage") + assert.Contains(t, dr, "az postgres flexible-server restore") + assert.Contains(t, dr, "az snapshot list") + assert.Contains(t, dr, "rsg-ptd-contoso-staging") + assert.Contains(t, dr, "ptd ensure contoso-staging --only-steps aks") +} + +func TestRunbook_DisasterRecovery_AWS_SitesRendered(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "dig connect.acme.com") + assert.Contains(t, dr, "dig dev.acme.com") +} + +func TestRunbooks_NoAutoApply(t *testing.T) { + for _, cloud := range []string{"aws", "azure"} { + t.Run(cloud, func(t *testing.T) { + data := &RunbookData{ + WorkloadName: "test-workload", + Cloud: cloud, + Region: "us-east-1", + ClusterName: "test-cluster", + ResourceGroup: "rsg-ptd-test", + Sites: []SiteData{ + {Name: "main", Domain: "test.example.com"}, + }, + } + results, err := GenerateRunbooks(data) + require.NoError(t, err) + + for filename, content := range results { + assert.NotContains(t, content, "--auto-apply", + "%s for %s should not contain --auto-apply", filename, cloud) + } + }) + } +} + +func TestRunbooks_NoPulumiCommands(t *testing.T) { + for _, cloud := range []string{"aws", "azure"} { + t.Run(cloud, func(t *testing.T) { + 
data := &RunbookData{ + WorkloadName: "test-workload", + Cloud: cloud, + Region: "us-east-1", + ClusterName: "test-cluster", + ResourceGroup: "rsg-ptd-test", + Sites: []SiteData{ + {Name: "main", Domain: "test.example.com"}, + }, + } + results, err := GenerateRunbooks(data) + require.NoError(t, err) + + for filename, content := range results { + assert.NotContains(t, content, "pulumi up", + "%s for %s should not contain 'pulumi up'", filename, cloud) + assert.NotContains(t, content, "pulumi preview", + "%s for %s should not contain 'pulumi preview'", filename, cloud) + assert.NotContains(t, content, "pulumi stack select", + "%s for %s should not contain 'pulumi stack select'", filename, cloud) + assert.NotContains(t, content, "pulumi import", + "%s for %s should not contain 'pulumi import'", filename, cloud) + } + }) + } +} + +func TestRunbook_DayToDayOps_AWS_StepTable(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + for _, step := range []string{"bootstrap", "persistent", "postgres_config", "eks", "clusters", "helm", "sites"} { + assert.Contains(t, ops, "| "+step+" |", "step table should contain %s", step) + } +} + +func TestRunbook_DayToDayOps_Azure_StepTable(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + for _, step := range []string{"bootstrap", "persistent", "postgres_config", "aks", "clusters", "helm", "sites"} { + assert.Contains(t, ops, "| "+step+" |", "step table should contain %s", step) + } + assert.NotContains(t, ops, "| eks |", "Azure runbook should not contain eks step") +} + +func TestRunbook_RenderDayToDayOps_WritesToWriter(t *testing.T) { + var buf strings.Builder + err := RenderDayToDayOps(&buf, awsRunbookData()) + + require.NoError(t, err) + assert.Contains(t, buf.String(), "Day-to-Day Operations") +} + +func TestRunbook_RenderDisasterRecovery_WritesToWriter(t 
*testing.T) { + var buf strings.Builder + err := RenderDisasterRecovery(&buf, azureRunbookData()) + + require.NoError(t, err) + assert.Contains(t, buf.String(), "Disaster Recovery") +} + +func TestRunbook_DayToDayOps_PtdWorkonCommands(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "ptd workon acme-prod --") +} + +func TestRunbook_DisasterRecovery_FullRebuildOrder(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + // Extract only the Full Environment Rebuild section to avoid matching + // commands that appear earlier in the document. + rebuildStart := strings.Index(dr, "## Full Environment Rebuild") + require.Greater(t, rebuildStart, 0, "should contain Full Environment Rebuild section") + rebuild := dr[rebuildStart:] + + bootstrapIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps bootstrap\n") + persistentIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps persistent\n") + postgresIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps postgres_config\n") + eksIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps eks\n") + clustersIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps clusters\n") + helmIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps helm\n") + sitesIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps sites\n") + + assert.Greater(t, persistentIdx, bootstrapIdx, "persistent should come after bootstrap") + assert.Greater(t, postgresIdx, persistentIdx, "postgres_config should come after persistent") + assert.Greater(t, eksIdx, postgresIdx, "eks should come after postgres_config") + assert.Greater(t, clustersIdx, eksIdx, "clusters should come after eks") + assert.Greater(t, helmIdx, clustersIdx, "helm should come after clusters") + assert.Greater(t, sitesIdx, 
helmIdx, "sites should come after helm") +} From b98318de7f4c62475120906852a75a17128c098c Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 Apr 2026 09:48:34 -0400 Subject: [PATCH 2/3] fix: wire up ClusterName, sort sites, sanitize Azure resource group Address code review feedback on #249: - Populate ClusterName from config (AWS: default_{name}-{release}-control-plane, Azure: {sanitized-name}-{release}) and render in kubeconfig commands - Sort sites deterministically by name to produce stable output - Apply Azure naming sanitization (lowercase, replace non-alphanumeric characters with hyphens) to resource group name, matching lib/azure/target.go convention --- lib/eject/eject.go | 49 ++++++++++++++++++---- lib/eject/runbooks.go | 4 +- lib/eject/runbooks_test.go | 84 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 9 deletions(-) diff --git a/lib/eject/eject.go b/lib/eject/eject.go index a0c414f..541b08c 100644 --- a/lib/eject/eject.go +++ b/lib/eject/eject.go @@ -4,8 +4,12 @@ import ( "context" "fmt" "log/slog" + "maps" "os" "path/filepath" + "regexp" + "slices" + "strings" "github.com/posit-dev/ptd/lib/helpers" "github.com/posit-dev/ptd/lib/types" @@ -90,19 +94,50 @@ func buildRunbookData(config interface{}, targetName string) (*RunbookData, erro case types.AWSWorkloadConfig: data.Cloud = "aws" data.Region = cfg.Region - for name, site := range cfg.Sites { - data.Sites = append(data.Sites, SiteData{Name: name, Domain: site.Spec.Domain}) - } + data.ClusterName = awsClusterName(targetName, cfg.Clusters) + data.Sites = sortedSites(cfg.Sites) case types.AzureWorkloadConfig: data.Cloud = "azure" data.Region = cfg.Region - data.ResourceGroup = fmt.Sprintf("rsg-ptd-%s", targetName) - for name, site := range cfg.Sites { - data.Sites = append(data.Sites, SiteData{Name: name, Domain: site.Spec.Domain}) - } + data.ResourceGroup = fmt.Sprintf("rsg-ptd-%s", sanitizeName(targetName)) + data.ClusterName = azureClusterName(targetName, cfg.Clusters) + data.Sites = 
sortedSites(cfg.Sites) default: return nil, fmt.Errorf("unsupported config type for target %s", targetName) } return data, nil } + +func sortedSites(sites map[string]types.SiteConfig) []SiteData { + names := slices.Sorted(maps.Keys(sites)) + out := make([]SiteData, 0, len(names)) + for _, name := range names { + out = append(out, SiteData{Name: name, Domain: sites[name].Spec.Domain}) + } + return out +} + +// sanitizeName mirrors the Azure naming convention: lowercase, non-alphanumeric +// characters replaced with hyphens. +func sanitizeName(name string) string { + s := strings.ToLower(name) + re := regexp.MustCompile(`[^a-z0-9-]`) + return re.ReplaceAllString(s, "-") +} + +func awsClusterName(targetName string, clusters map[string]types.AWSWorkloadClusterConfig) string { + releases := slices.Sorted(maps.Keys(clusters)) + if len(releases) == 0 { + return fmt.Sprintf("default_%s-control-plane", targetName) + } + return fmt.Sprintf("default_%s-%s-control-plane", targetName, releases[0]) +} + +func azureClusterName(targetName string, clusters map[string]types.AzureWorkloadClusterConfig) string { + releases := slices.Sorted(maps.Keys(clusters)) + if len(releases) == 0 { + return sanitizeName(targetName) + } + return fmt.Sprintf("%s-%s", sanitizeName(targetName), releases[0]) +} diff --git a/lib/eject/runbooks.go b/lib/eject/runbooks.go index a8a5da8..8abe935 100644 --- a/lib/eject/runbooks.go +++ b/lib/eject/runbooks.go @@ -224,7 +224,7 @@ ptd workon {{.WorkloadName}} -- kubectl get ingressroute -n posit-team {{- if eq .Cloud "aws"}} ` + "```" + `bash -aws eks update-kubeconfig --name --region {{.Region}} +aws eks update-kubeconfig --name {{.ClusterName}} --region {{.Region}} kubectl get pods -n posit-team kubectl get pods -n posit-team-system kubectl get ingressroute -n posit-team @@ -233,7 +233,7 @@ kubectl get ingressroute -n posit-team {{- else}} ` + "```" + `bash -az aks get-credentials --resource-group {{.ResourceGroup}} --name +az aks get-credentials 
--resource-group {{.ResourceGroup}} --name {{.ClusterName}} kubectl get pods -n posit-team kubectl get pods -n posit-team-system kubectl get ingressroute -n posit-team diff --git a/lib/eject/runbooks_test.go b/lib/eject/runbooks_test.go index d2f564e..7b4b716 100644 --- a/lib/eject/runbooks_test.go +++ b/lib/eject/runbooks_test.go @@ -4,6 +4,7 @@ import ( "strings" "testing" + "github.com/posit-dev/ptd/lib/types" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -284,3 +285,86 @@ func TestRunbook_DisasterRecovery_FullRebuildOrder(t *testing.T) { assert.Greater(t, helmIdx, clustersIdx, "helm should come after clusters") assert.Greater(t, sitesIdx, helmIdx, "sites should come after helm") } + +func TestRunbook_AWS_ClusterNameRendered(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + assert.Contains(t, ops, "aws eks update-kubeconfig --name default_acme-prod-control-plane --region us-east-1") +} + +func TestRunbook_Azure_ClusterNameRendered(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + assert.Contains(t, ops, "az aks get-credentials --resource-group rsg-ptd-contoso-staging --name aks-ptd-contoso") +} + +func TestBuildRunbookData_AWS_SortsSites(t *testing.T) { + config := types.AWSWorkloadConfig{ + Region: "us-east-1", + Sites: map[string]types.SiteConfig{ + "zebra": {Spec: types.SiteConfigSpec{Domain: "z.example.com"}}, + "alpha": {Spec: types.SiteConfigSpec{Domain: "a.example.com"}}, + "middle": {Spec: types.SiteConfigSpec{Domain: "m.example.com"}}, + }, + Clusters: map[string]types.AWSWorkloadClusterConfig{ + "20240101": {}, + }, + } + + data, err := buildRunbookData(config, "test-workload") + require.NoError(t, err) + + require.Len(t, data.Sites, 3) + assert.Equal(t, "alpha", data.Sites[0].Name) + assert.Equal(t, "middle", data.Sites[1].Name) + 
assert.Equal(t, "zebra", data.Sites[2].Name) +} + +func TestBuildRunbookData_Azure_SanitizesResourceGroup(t *testing.T) { + config := types.AzureWorkloadConfig{ + Region: "eastus", + Sites: map[string]types.SiteConfig{ + "main": {Spec: types.SiteConfigSpec{Domain: "test.example.com"}}, + }, + Clusters: map[string]types.AzureWorkloadClusterConfig{ + "20240101": {}, + }, + } + + data, err := buildRunbookData(config, "MyWorkload_Test") + require.NoError(t, err) + + assert.Equal(t, "rsg-ptd-myworkload-test", data.ResourceGroup) +} + +func TestBuildRunbookData_AWS_ClusterName(t *testing.T) { + config := types.AWSWorkloadConfig{ + Region: "us-west-2", + Clusters: map[string]types.AWSWorkloadClusterConfig{ + "20240601": {}, + }, + } + + data, err := buildRunbookData(config, "acme-prod") + require.NoError(t, err) + + assert.Equal(t, "default_acme-prod-20240601-control-plane", data.ClusterName) +} + +func TestBuildRunbookData_Azure_ClusterName(t *testing.T) { + config := types.AzureWorkloadConfig{ + Region: "eastus", + Clusters: map[string]types.AzureWorkloadClusterConfig{ + "20240601": {}, + }, + } + + data, err := buildRunbookData(config, "Contoso-Staging") + require.NoError(t, err) + + assert.Equal(t, "contoso-staging-20240601", data.ClusterName) +} From 86a723d1ed68c588ed820fb8576d9bccf4ea2c9b Mon Sep 17 00:00:00 2001 From: Tim Date: Thu, 16 Apr 2026 10:34:32 -0400 Subject: [PATCH 3/3] fix: correct DR runbook assumptions, remove redundant --dry-run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove false claims about S3/Azure blob versioning on state and data buckets — none of these have versioning enabled - Remove false claims about Azure storage snapshots/soft-delete - Keep accurate content: RDS 7-day backups, Azure PG default backups, FSx 30-day automatic backups - Add "Prevention" notes suggesting customers enable versioning post-eject - Remove redundant --dry-run + apply pattern from both runbooks — ptd ensure already 
shows a preview and prompts for confirmation - Full rebuild uses bare `ptd ensure` instead of listing every step, which also avoids skipping custom steps --- lib/eject/runbooks.go | 158 ++++++++++++------------------------- lib/eject/runbooks_test.go | 28 ++----- 2 files changed, 57 insertions(+), 129 deletions(-) diff --git a/lib/eject/runbooks.go b/lib/eject/runbooks.go index 8abe935..939e3e7 100644 --- a/lib/eject/runbooks.go +++ b/lib/eject/runbooks.go @@ -39,7 +39,11 @@ var dayToDayOpsTemplate = template.Must(template.New("day-to-day-ops").Funcs(run ## Running PTD Ensure Steps -Each infrastructure change is applied by running the relevant ` + "`ptd ensure`" + ` step. Always preview first with ` + "`--dry-run`" + `, then apply. +Each infrastructure change is applied by running the relevant ` + "`ptd ensure`" + ` step. Each step shows a preview of planned changes and prompts for confirmation before applying. + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps +` + "```" + ` | Step | When to Re-Run | What It Changes | |---|---|---| @@ -61,18 +65,6 @@ Each infrastructure change is applied by running the relevant ` + "`ptd ensure`" | sites | Product deployment, ingress, or site config changes | TeamSite CRDs, ingress resources, site-specific configuration | {{- end}} -**Preview a step (dry-run):** - -` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps --dry-run -` + "```" + ` - -**Apply a step:** - -` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps -` + "```" + ` - ## Scaling Product Replicas 1. Edit the product replica count in the site's ` + "`site.yaml`" + `: @@ -83,10 +75,9 @@ spec: replicas: 3 ` + "```" + ` -2. Preview and apply the sites step: +2. Run the sites step: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps sites --dry-run ptd ensure {{.WorkloadName}} --only-steps sites ` + "```" + ` @@ -106,10 +97,9 @@ spec: image: ghcr.io/rstudio/rstudio-connect:2025.01.0 ` + "```" + ` -2. 
Preview and apply the sites step: +2. Run the sites step: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps sites --dry-run ptd ensure {{.WorkloadName}} --only-steps sites ` + "```" + ` @@ -131,9 +121,7 @@ ACM certificates auto-renew when DNS validation records are in place. To change 2. Re-run the persistent and sites steps: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run ptd ensure {{.WorkloadName}} --only-steps persistent -ptd ensure {{.WorkloadName}} --only-steps sites --dry-run ptd ensure {{.WorkloadName}} --only-steps sites ` + "```" + ` @@ -145,9 +133,7 @@ Azure-managed certificates are handled by the platform. To change the certificat 2. Re-run the persistent and sites steps: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run ptd ensure {{.WorkloadName}} --only-steps persistent -ptd ensure {{.WorkloadName}} --only-steps sites --dry-run ptd ensure {{.WorkloadName}} --only-steps sites ` + "```" + ` @@ -176,7 +162,6 @@ Database passwords are stored in AWS Secrets Manager. To rotate: 3. Re-run the persistent step to reconcile: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run ptd ensure {{.WorkloadName}} --only-steps persistent ` + "```" + ` @@ -189,7 +174,6 @@ Database passwords are stored in Azure Key Vault. To rotate: 3. Re-run the persistent step to reconcile: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run ptd ensure {{.WorkloadName}} --only-steps persistent ` + "```" + ` @@ -279,42 +263,30 @@ var disasterRecoveryTemplate = template.Must(template.New("disaster-recovery").F **State backend:** S3 bucket ` + "`ptd-{{.WorkloadName}}`" + ` in {{.Region}} -S3 versioning is enabled on the state bucket. 
If state is corrupted or accidentally overwritten, restore a previous version: - -` + "```" + `bash -aws s3api list-object-versions --bucket ptd-{{.WorkloadName}} --prefix .pulumi/stacks/ -aws s3api get-object --bucket ptd-{{.WorkloadName}} --key --version-id restored-state.json -` + "```" + ` - {{- else}} **State backend:** Azure Blob Storage container in storage account for {{.WorkloadName}} -Blob versioning is enabled on the state container. If state is corrupted or accidentally overwritten, restore a previous version: - -` + "```" + `bash -az storage blob list --container-name --account-name --prefix .pulumi/stacks/ --include v -az storage blob download --container-name --account-name --name --version-id --file restored-state.json -` + "```" + ` - {{- end}} -The eject bundle contains a resource inventory that lists every managed resource and its physical ID. Use this inventory to verify state consistency. +The state bucket does not have object versioning enabled. If Pulumi state is corrupted or lost, recovery options are: -To reconcile infrastructure with state, re-run ` + "`ptd ensure`" + `: +1. **Re-run ` + "`ptd ensure`" + `** — Pulumi will detect drift between state and actual infrastructure and reconcile. This is the primary recovery path. +2. **Use the eject bundle resource inventory** — ` + "`state/resource-inventory.json`" + ` lists every managed resource with its physical ID. This can guide manual re-import if needed. ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps --dry-run ptd ensure {{.WorkloadName}} --only-steps ` + "```" + ` +**Prevention:** Consider enabling versioning on the state bucket post-eject so you can recover from accidental state overwrites. + ## Database Recovery {{- if eq .Cloud "aws"}} ### RDS Point-in-Time Restore -RDS supports point-in-time recovery within the configured backup retention window. +RDS automated backups are enabled with a 7-day retention window. 
Point-in-time restore creates a new DB instance from any point within that window. ` + "```" + `bash aws rds restore-db-instance-to-point-in-time \ @@ -324,11 +296,21 @@ aws rds restore-db-instance-to-point-in-time \ --region {{.Region}} ` + "```" + ` +To restore from a manual snapshot instead: + +` + "```" + `bash +aws rds describe-db-snapshots --db-instance-identifier --region {{.Region}} +aws rds restore-db-instance-from-db-snapshot \ + --db-snapshot-identifier \ + --db-instance-identifier \ + --region {{.Region}} +` + "```" + ` + {{- else}} ### Azure PostgreSQL Point-in-Time Restore -Azure PostgreSQL Flexible Server supports point-in-time recovery within the configured backup retention window. +Azure PostgreSQL Flexible Server has automated backups with the default 7-day retention window. Point-in-time restore creates a new server from any point within that window. ` + "```" + `bash az postgres flexible-server restore \ @@ -343,10 +325,9 @@ az postgres flexible-server restore \ ### Post-Restore Steps 1. Update the database endpoint in the secret store ({{if eq .Cloud "aws"}}Secrets Manager{{else}}Key Vault{{end}}) if the restored instance has a new hostname. -2. Re-run the persistent step to reconcile infrastructure with the new database: +2. Re-run the persistent step to reconcile Pulumi state with the new database: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run ptd ensure {{.WorkloadName}} --only-steps persistent ` + "```" + ` @@ -354,43 +335,37 @@ ptd ensure {{.WorkloadName}} --only-steps persistent {{- if eq .Cloud "aws"}} -### FSx Backups +### FSx OpenZFS + +FSx OpenZFS has automatic daily backups with a 30-day retention window. -FSx OpenZFS creates automatic daily backups. 
To restore from a backup: +List available backups: ` + "```" + `bash aws fsx describe-backups --filters Name=file-system-id,Values= --region {{.Region}} -aws fsx create-file-system-from-backup --backup-id --region {{.Region}} ` + "```" + ` -### S3 Versioning - -S3 buckets have versioning enabled. Recover deleted or overwritten objects: +Restore from a backup (creates a new filesystem): ` + "```" + `bash -aws s3api list-object-versions --bucket --prefix -aws s3api get-object --bucket --key --version-id restored-file +aws fsx create-file-system-from-backup --backup-id --region {{.Region}} ` + "```" + ` -{{- else}} +After restore, update the FSx DNS name in the workload secret and re-run the persistent step. -### Azure Files / Managed Disk Snapshots +### S3 Buckets -Restore from Azure file share or managed disk snapshots: +S3 data buckets (chronicle, packagemanager) do not have versioning enabled. Deleted or overwritten objects cannot be recovered from S3 alone. -` + "```" + `bash -az snapshot list --resource-group {{.ResourceGroup}} -az disk create --resource-group {{.ResourceGroup}} --name --source -` + "```" + ` +**Prevention:** Consider enabling versioning on critical data buckets post-eject. + +{{- else}} -### Blob Versioning +### Azure Storage -Azure Blob Storage has versioning enabled. Recover previous versions: +Azure storage accounts (file shares, blob containers) do not have soft delete or versioning enabled by default. Deleted or overwritten data cannot be recovered from Azure Storage alone. -` + "```" + `bash -az storage blob list --container-name --account-name --include v -az storage blob download --container-name --account-name --name --version-id --file restored-file -` + "```" + ` +**Prevention:** Consider enabling blob soft delete and versioning on critical storage accounts post-eject. 
{{- end}} @@ -398,21 +373,16 @@ az storage blob download --container-name --account-name ptd workon {{.WorkloadName}} -- kubectl drain --ignore-daemonsets --delete-emptydir-data -` + "```" + ` - -Then re-run the eks step to reconcile the node group: - -` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps eks --dry-run ptd ensure {{.WorkloadName}} --only-steps eks ` + "```" + ` {{- else}} -If a node pool is unhealthy, cordon and replace it: +If a node pool is unhealthy, cordon and drain the affected nodes, then re-run the aks step: ` + "```" + `bash ptd workon {{.WorkloadName}} -- kubectl cordon ptd workon {{.WorkloadName}} -- kubectl drain --ignore-daemonsets --delete-emptydir-data -` + "```" + ` - -Then re-run the aks step to reconcile the node pool: - -` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps aks --dry-run ptd ensure {{.WorkloadName}} --only-steps aks ` + "```" + ` @@ -503,7 +461,6 @@ ptd workon {{.WorkloadName}} -- kubectl describe certificate -n posit-team 5. If DNS or ingress is misconfigured, re-run the sites step: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps sites --dry-run ptd ensure {{.WorkloadName}} --only-steps sites ` + "```" + ` @@ -511,40 +468,23 @@ ptd ensure {{.WorkloadName}} --only-steps sites To rebuild the full environment from the eject bundle configuration: -1. Re-run the infrastructure pipeline in order: +1. 
Re-run the full infrastructure pipeline: ` + "```" + `bash -ptd ensure {{.WorkloadName}} --only-steps bootstrap --dry-run -ptd ensure {{.WorkloadName}} --only-steps bootstrap -ptd ensure {{.WorkloadName}} --only-steps persistent --dry-run -ptd ensure {{.WorkloadName}} --only-steps persistent -ptd ensure {{.WorkloadName}} --only-steps postgres_config --dry-run -ptd ensure {{.WorkloadName}} --only-steps postgres_config -{{- if eq .Cloud "aws"}} -ptd ensure {{.WorkloadName}} --only-steps eks --dry-run -ptd ensure {{.WorkloadName}} --only-steps eks -{{- else}} -ptd ensure {{.WorkloadName}} --only-steps aks --dry-run -ptd ensure {{.WorkloadName}} --only-steps aks -{{- end}} -ptd ensure {{.WorkloadName}} --only-steps clusters --dry-run -ptd ensure {{.WorkloadName}} --only-steps clusters -ptd ensure {{.WorkloadName}} --only-steps helm --dry-run -ptd ensure {{.WorkloadName}} --only-steps helm -ptd ensure {{.WorkloadName}} --only-steps sites --dry-run -ptd ensure {{.WorkloadName}} --only-steps sites +ptd ensure {{.WorkloadName}} ` + "```" + ` + This runs all steps in order (bootstrap through sites), including any custom steps. + 2. Restore data from backups: {{- if eq .Cloud "aws"}} - Restore RDS from snapshot or point-in-time recovery (see Database Recovery above). - Restore FSx from backup (see Storage Recovery above). - - Restore S3 objects from versioned copies if needed. + - S3 data buckets have no versioning — data loss is permanent unless you have external backups. {{- else}} - Restore Azure PostgreSQL from point-in-time recovery (see Database Recovery above). - - Restore Azure Files or managed disks from snapshots (see Storage Recovery above). - - Restore blob objects from versioned copies if needed. + - Azure storage has no versioning or soft delete — data loss is permanent unless you have external backups. {{- end}} 3. 
Re-populate manual secrets: diff --git a/lib/eject/runbooks_test.go b/lib/eject/runbooks_test.go index 7b4b716..7770671 100644 --- a/lib/eject/runbooks_test.go +++ b/lib/eject/runbooks_test.go @@ -129,7 +129,7 @@ func TestRunbook_DisasterRecovery_AWS_Content(t *testing.T) { assert.Contains(t, dr, "ptd-acme-prod") assert.Contains(t, dr, "aws rds restore-db-instance-to-point-in-time") assert.Contains(t, dr, "aws fsx describe-backups") - assert.Contains(t, dr, "aws s3api list-object-versions") + assert.Contains(t, dr, "S3 data buckets have no versioning") assert.Contains(t, dr, "ptd ensure acme-prod --only-steps eks") } @@ -141,7 +141,7 @@ func TestRunbook_DisasterRecovery_Azure_Content(t *testing.T) { assert.Contains(t, dr, "Azure Blob Storage") assert.Contains(t, dr, "az postgres flexible-server restore") - assert.Contains(t, dr, "az snapshot list") + assert.Contains(t, dr, "Azure storage has no versioning or soft delete") assert.Contains(t, dr, "rsg-ptd-contoso-staging") assert.Contains(t, dr, "ptd ensure contoso-staging --only-steps aks") } @@ -156,7 +156,7 @@ func TestRunbook_DisasterRecovery_AWS_SitesRendered(t *testing.T) { assert.Contains(t, dr, "dig dev.acme.com") } -func TestRunbooks_NoAutoApply(t *testing.T) { +func TestRunbooks_NoBannedFlags(t *testing.T) { for _, cloud := range []string{"aws", "azure"} { t.Run(cloud, func(t *testing.T) { data := &RunbookData{ @@ -175,6 +175,8 @@ func TestRunbooks_NoAutoApply(t *testing.T) { for filename, content := range results { assert.NotContains(t, content, "--auto-apply", "%s for %s should not contain --auto-apply", filename, cloud) + assert.NotContains(t, content, "--dry-run", + "%s for %s should not contain --dry-run", filename, cloud) } }) } @@ -258,32 +260,18 @@ func TestRunbook_DayToDayOps_PtdWorkonCommands(t *testing.T) { assert.Contains(t, ops, "ptd workon acme-prod --") } -func TestRunbook_DisasterRecovery_FullRebuildOrder(t *testing.T) { +func TestRunbook_DisasterRecovery_FullRebuild(t *testing.T) { results, 
err := GenerateRunbooks(awsRunbookData()) require.NoError(t, err) dr := results["disaster-recovery.md"] - // Extract only the Full Environment Rebuild section to avoid matching - // commands that appear earlier in the document. rebuildStart := strings.Index(dr, "## Full Environment Rebuild") require.Greater(t, rebuildStart, 0, "should contain Full Environment Rebuild section") rebuild := dr[rebuildStart:] - bootstrapIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps bootstrap\n") - persistentIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps persistent\n") - postgresIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps postgres_config\n") - eksIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps eks\n") - clustersIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps clusters\n") - helmIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps helm\n") - sitesIdx := strings.Index(rebuild, "ptd ensure acme-prod --only-steps sites\n") - - assert.Greater(t, persistentIdx, bootstrapIdx, "persistent should come after bootstrap") - assert.Greater(t, postgresIdx, persistentIdx, "postgres_config should come after persistent") - assert.Greater(t, eksIdx, postgresIdx, "eks should come after postgres_config") - assert.Greater(t, clustersIdx, eksIdx, "clusters should come after eks") - assert.Greater(t, helmIdx, clustersIdx, "helm should come after clusters") - assert.Greater(t, sitesIdx, helmIdx, "sites should come after helm") + assert.Contains(t, rebuild, "ptd ensure acme-prod\n") + assert.NotContains(t, rebuild, "--only-steps", "full rebuild should run all steps, not individual ones") } func TestRunbook_AWS_ClusterNameRendered(t *testing.T) {