diff --git a/lib/eject/eject.go b/lib/eject/eject.go index 9c54732..541b08c 100644 --- a/lib/eject/eject.go +++ b/lib/eject/eject.go @@ -4,7 +4,12 @@ import ( "context" "fmt" "log/slog" + "maps" "os" + "path/filepath" + "regexp" + "slices" + "strings" "github.com/posit-dev/ptd/lib/helpers" "github.com/posit-dev/ptd/lib/types" @@ -55,6 +60,84 @@ func Run(ctx context.Context, t types.Target, opts Options) error { "connections", len(crDetails.Connections), ) + rbData, err := buildRunbookData(config, opts.TargetName) + if err != nil { + return fmt.Errorf("failed to build runbook data: %w", err) + } + + runbooks, err := GenerateRunbooks(rbData) + if err != nil { + return fmt.Errorf("failed to generate runbooks: %w", err) + } + + runbooksDir := filepath.Join(opts.OutputDir, "runbooks") + if err := os.MkdirAll(runbooksDir, 0755); err != nil { + return fmt.Errorf("failed to create runbooks directory: %w", err) + } + + for filename, content := range runbooks { + path := filepath.Join(runbooksDir, filename) + if err := os.WriteFile(path, []byte(content), 0644); err != nil { + return fmt.Errorf("failed to write runbook %s: %w", filename, err) + } + slog.Info("Generated runbook", "path", path) + } + slog.Info("Eject bundle generated", "path", opts.OutputDir) return nil } + +func buildRunbookData(config interface{}, targetName string) (*RunbookData, error) { + data := &RunbookData{WorkloadName: targetName} + + switch cfg := config.(type) { + case types.AWSWorkloadConfig: + data.Cloud = "aws" + data.Region = cfg.Region + data.ClusterName = awsClusterName(targetName, cfg.Clusters) + data.Sites = sortedSites(cfg.Sites) + case types.AzureWorkloadConfig: + data.Cloud = "azure" + data.Region = cfg.Region + data.ResourceGroup = fmt.Sprintf("rsg-ptd-%s", sanitizeName(targetName)) + data.ClusterName = azureClusterName(targetName, cfg.Clusters) + data.Sites = sortedSites(cfg.Sites) + default: + return nil, fmt.Errorf("unsupported config type for target %s", targetName) + } + + return 
// invalidNameChars matches any single character that is not a lowercase ASCII
// letter, an ASCII digit, or a hyphen. It is compiled once at package
// initialization so sanitizeName does not rebuild the pattern on every call.
var invalidNameChars = regexp.MustCompile(`[^a-z0-9-]`)

// sanitizeName mirrors the Azure naming convention: the name is lowercased and
// every character other than a-z, 0-9, or '-' is replaced with a hyphen.
//
// Existing hyphens are kept. Each offending character produces exactly one
// hyphen; runs are not collapsed and leading/trailing hyphens are not trimmed.
// An empty name yields an empty string.
func sanitizeName(name string) string {
	// Lowercase first so uppercase letters survive as letters instead of
	// being treated as invalid characters by the pattern.
	return invalidNameChars.ReplaceAllString(strings.ToLower(name), "-")
}
type RunbookData struct {
	// WorkloadName is the workload (target) name. The templates substitute it
	// into the runbook titles and into every `ptd ensure` / `ptd workon` command.
	WorkloadName string
	// Cloud selects the provider-specific sections: "aws" or "azure". The
	// templates only compare against "aws"; any other value renders the
	// Azure variant.
	Cloud string
	// Region is the cloud region. It appears in the runbook header and as the
	// --region argument of the AWS CLI examples.
	Region string
	// ClusterName is the cluster name passed to `aws eks update-kubeconfig --name`
	// or `az aks get-credentials --name` in the rendered commands.
	ClusterName string
	// ResourceGroup is the Azure resource group used by the `az` command
	// examples. The AWS sections of the templates never reference it.
	ResourceGroup string
	// Sites lists the workload's sites. They are rendered in slice order.
	Sites []SiteData
}
PostgreSQL, storage accounts, NetApp Files, Key Vault, managed identities, NSGs | +| postgres_config | Database user/grant changes | PostgreSQL users, databases, grants | +| aks | Cluster or node pool changes | AKS cluster, node pools, managed identity, storage classes | +| clusters | Namespace, RBAC, operator, or ingress controller changes | K8s namespaces, network policies, workload identity bindings, Team Operator, Traefik | +| helm | Monitoring, cert-manager, or CSI driver changes | Loki, Grafana, Mimir, Alloy, cert-manager, Secrets Store CSI | +| sites | Product deployment, ingress, or site config changes | TeamSite CRDs, ingress resources, site-specific configuration | +{{- end}} + +## Scaling Product Replicas + +1. Edit the product replica count in the site's ` + "`site.yaml`" + `: + +` + "```" + `yaml +spec: + connect: + replicas: 3 +` + "```" + ` + +2. Run the sites step: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +3. Verify the new replica count: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get pods -n posit-team -l app.kubernetes.io/managed-by=team-operator +` + "```" + ` + +## Updating Product Versions + +1. Edit the product image tag in the site's ` + "`site.yaml`" + `: + +` + "```" + `yaml +spec: + connect: + image: ghcr.io/rstudio/rstudio-connect:2025.01.0 +` + "```" + ` + +2. Run the sites step: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +3. Verify pods roll to the new version: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout status deployment -n posit-team -l app.kubernetes.io/managed-by=team-operator +` + "```" + ` + +## Rotating TLS Certificates + +### ACM/Azure-Managed Certificates + +{{- if eq .Cloud "aws"}} + +ACM certificates auto-renew when DNS validation records are in place. To change the certificate: + +1. Update the certificate configuration in ` + "`ptd.yaml`" + `. +2. 
Re-run the persistent and sites steps: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +{{- else}} + +Azure-managed certificates are handled by the platform. To change the certificate configuration: + +1. Update the certificate configuration in ` + "`ptd.yaml`" + `. +2. Re-run the persistent and sites steps: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +{{- end}} + +### cert-manager Certificates + +cert-manager automatically renews certificates before expiry. To force renewal: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl delete certificate -n posit-team +` + "```" + ` + +cert-manager will detect the missing certificate and issue a new one. + +## Rotating Secrets + +### Database Passwords + +{{- if eq .Cloud "aws"}} + +Database passwords are stored in AWS Secrets Manager. To rotate: + +1. Update the password value in the relevant secret in Secrets Manager. +2. Update the password on the PostgreSQL server to match. +3. Re-run the persistent step to reconcile: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent +` + "```" + ` + +{{- else}} + +Database passwords are stored in Azure Key Vault. To rotate: + +1. Update the password value in the relevant Key Vault secret. +2. Update the password on the PostgreSQL server to match. +3. 
Re-run the persistent step to reconcile: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent +` + "```" + ` + +{{- end}} + +### Product Licenses + +Update the license key in the secret store ({{if eq .Cloud "aws"}}Secrets Manager{{else}}Key Vault{{end}}) and restart the affected product: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout restart deployment/-deployment -n posit-team +` + "```" + ` + +### RSA Keys + +**Warning:** Rotating RSA keys for Connect or Package Manager will invalidate all content signed with the previous key. Plan accordingly. + +Update the key in the secret store and restart the affected product. + +## Checking Workload Health + +### Using ptd workon + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get pods -n posit-team +ptd workon {{.WorkloadName}} -- kubectl get pods -n posit-team-system +ptd workon {{.WorkloadName}} -- kubectl get ingressroute -n posit-team +` + "```" + ` + +### Using kubectl directly + +{{- if eq .Cloud "aws"}} + +` + "```" + `bash +aws eks update-kubeconfig --name {{.ClusterName}} --region {{.Region}} +kubectl get pods -n posit-team +kubectl get pods -n posit-team-system +kubectl get ingressroute -n posit-team +` + "```" + ` + +{{- else}} + +` + "```" + `bash +az aks get-credentials --resource-group {{.ResourceGroup}} --name {{.ClusterName}} +kubectl get pods -n posit-team +kubectl get pods -n posit-team-system +kubectl get ingressroute -n posit-team +` + "```" + ` + +{{- end}} + +## Restarting Products + +Restart a product deployment: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout restart deployment/-deployment -n posit-team +` + "```" + ` + +Monitor the rollout: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout status deployment/-deployment -n posit-team +` + "```" + ` + +Using kubectl directly: + +` + "```" + `bash +kubectl rollout restart deployment/-deployment -n posit-team +kubectl rollout status 
deployment/-deployment -n posit-team +` + "```" + ` +`)) + +var disasterRecoveryTemplate = template.Must(template.New("disaster-recovery").Funcs(runbookFuncMap).Parse( + `# Disaster Recovery — {{.WorkloadName}} + +**Workload:** {{.WorkloadName}} +**Cloud:** {{.Cloud | upper}} +**Region:** {{.Region}} +{{- range .Sites}} +**Site:** {{.Name}} ({{.Domain}}) +{{- end}} + +## Pulumi State Recovery + +{{- if eq .Cloud "aws"}} + +**State backend:** S3 bucket ` + "`ptd-{{.WorkloadName}}`" + ` in {{.Region}} + +{{- else}} + +**State backend:** Azure Blob Storage container in storage account for {{.WorkloadName}} + +{{- end}} + +The state bucket does not have object versioning enabled. If Pulumi state is corrupted or lost, recovery options are: + +1. **Re-run ` + "`ptd ensure`" + `** — Pulumi will detect drift between state and actual infrastructure and reconcile. This is the primary recovery path. +2. **Use the eject bundle resource inventory** — ` + "`state/resource-inventory.json`" + ` lists every managed resource with its physical ID. This can guide manual re-import if needed. + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps +` + "```" + ` + +**Prevention:** Consider enabling versioning on the state bucket post-eject so you can recover from accidental state overwrites. + +## Database Recovery + +{{- if eq .Cloud "aws"}} + +### RDS Point-in-Time Restore + +RDS automated backups are enabled with a 7-day retention window. Point-in-time restore creates a new DB instance from any point within that window. 
+ +` + "```" + `bash +aws rds restore-db-instance-to-point-in-time \ + --source-db-instance-identifier \ + --target-db-instance-identifier \ + --restore-time \ + --region {{.Region}} +` + "```" + ` + +To restore from a manual snapshot instead: + +` + "```" + `bash +aws rds describe-db-snapshots --db-instance-identifier --region {{.Region}} +aws rds restore-db-instance-from-db-snapshot \ + --db-snapshot-identifier \ + --db-instance-identifier \ + --region {{.Region}} +` + "```" + ` + +{{- else}} + +### Azure PostgreSQL Point-in-Time Restore + +Azure PostgreSQL Flexible Server has automated backups with the default 7-day retention window. Point-in-time restore creates a new server from any point within that window. + +` + "```" + `bash +az postgres flexible-server restore \ + --resource-group {{.ResourceGroup}} \ + --name \ + --source-server \ + --restore-time +` + "```" + ` + +{{- end}} + +### Post-Restore Steps + +1. Update the database endpoint in the secret store ({{if eq .Cloud "aws"}}Secrets Manager{{else}}Key Vault{{end}}) if the restored instance has a new hostname. +2. Re-run the persistent step to reconcile Pulumi state with the new database: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps persistent +` + "```" + ` + +## Storage Recovery + +{{- if eq .Cloud "aws"}} + +### FSx OpenZFS + +FSx OpenZFS has automatic daily backups with a 30-day retention window. + +List available backups: + +` + "```" + `bash +aws fsx describe-backups --filters Name=file-system-id,Values= --region {{.Region}} +` + "```" + ` + +Restore from a backup (creates a new filesystem): + +` + "```" + `bash +aws fsx create-file-system-from-backup --backup-id --region {{.Region}} +` + "```" + ` + +After restore, update the FSx DNS name in the workload secret and re-run the persistent step. + +### S3 Buckets + +S3 data buckets (chronicle, packagemanager) do not have versioning enabled. Deleted or overwritten objects cannot be recovered from S3 alone. 
+ +**Prevention:** Consider enabling versioning on critical data buckets post-eject. + +{{- else}} + +### Azure Storage + +Azure storage accounts (file shares, blob containers) do not have soft delete or versioning enabled by default. Deleted or overwritten data cannot be recovered from Azure Storage alone. + +**Prevention:** Consider enabling blob soft delete and versioning on critical storage accounts post-eject. + +{{- end}} + +## Kubernetes Cluster Recovery + +### Total Cluster Loss + +Persistent data (database, storage) survives cluster loss. Rebuild the cluster and redeploy: + +` + "```" + `bash +{{- if eq .Cloud "aws"}} +ptd ensure {{.WorkloadName}} --only-steps eks +{{- else}} +ptd ensure {{.WorkloadName}} --only-steps aks +{{- end}} +ptd ensure {{.WorkloadName}} --only-steps clusters +ptd ensure {{.WorkloadName}} --only-steps helm +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +### Partial Failure (Node Groups) + +{{- if eq .Cloud "aws"}} + +If a node group is unhealthy, cordon and drain the affected nodes, then re-run the eks step: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl cordon +ptd workon {{.WorkloadName}} -- kubectl drain --ignore-daemonsets --delete-emptydir-data +ptd ensure {{.WorkloadName}} --only-steps eks +` + "```" + ` + +{{- else}} + +If a node pool is unhealthy, cordon and drain the affected nodes, then re-run the aks step: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl cordon +ptd workon {{.WorkloadName}} -- kubectl drain --ignore-daemonsets --delete-emptydir-data +ptd ensure {{.WorkloadName}} --only-steps aks +` + "```" + ` + +{{- end}} + +### Stuck Pods + +Delete stuck pods to let the controller recreate them: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl delete pod -n posit-team +` + "```" + ` + +If a deployment is stuck, restart it: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl rollout restart deployment/ -n posit-team +` + "```" + ` + +## DNS and 
Ingress Recovery + +1. Verify DNS resolution: + +` + "```" + `bash +{{- range .Sites}} +dig {{.Domain}} +{{- end}} +` + "```" + ` + +2. Check load balancer health: + +` + "```" + `bash +{{- if eq .Cloud "aws"}} +aws elbv2 describe-target-health --target-group-arn --region {{.Region}} +{{- else}} +az network lb show --resource-group {{.ResourceGroup}} --name +{{- end}} +` + "```" + ` + +3. Check Traefik IngressRoutes: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get ingressroute -n posit-team +ptd workon {{.WorkloadName}} -- kubectl describe ingressroute -n posit-team +` + "```" + ` + +4. Check TLS certificates: + +` + "```" + `bash +ptd workon {{.WorkloadName}} -- kubectl get certificate -n posit-team +ptd workon {{.WorkloadName}} -- kubectl describe certificate -n posit-team +` + "```" + ` + +5. If DNS or ingress is misconfigured, re-run the sites step: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} --only-steps sites +` + "```" + ` + +## Full Environment Rebuild + +To rebuild the full environment from the eject bundle configuration: + +1. Re-run the full infrastructure pipeline: + +` + "```" + `bash +ptd ensure {{.WorkloadName}} +` + "```" + ` + + This runs all steps in order (bootstrap through sites), including any custom steps. + +2. Restore data from backups: + +{{- if eq .Cloud "aws"}} + - Restore RDS from snapshot or point-in-time recovery (see Database Recovery above). + - Restore FSx from backup (see Storage Recovery above). + - S3 data buckets have no versioning — data loss is permanent unless you have external backups. +{{- else}} + - Restore Azure PostgreSQL from point-in-time recovery (see Database Recovery above). + - Azure storage has no versioning or soft delete — data loss is permanent unless you have external backups. +{{- end}} + +3. 
Re-populate manual secrets: + + The following secrets must be manually re-entered in {{if eq .Cloud "aws"}}Secrets Manager{{else}}Key Vault{{end}}: + - Product license keys (Connect, Workbench, Package Manager) + - OIDC client secrets + - Any other manually-managed secrets listed in the eject bundle's secrets inventory +`)) + +// GenerateRunbooks renders both operational runbooks and returns them as a map +// keyed by filename. +func GenerateRunbooks(data *RunbookData) (map[string]string, error) { + results := make(map[string]string, 2) + + ops, err := renderTemplate(dayToDayOpsTemplate, data) + if err != nil { + return nil, fmt.Errorf("failed to render day-to-day-ops runbook: %w", err) + } + results["day-to-day-ops.md"] = ops + + dr, err := renderTemplate(disasterRecoveryTemplate, data) + if err != nil { + return nil, fmt.Errorf("failed to render disaster-recovery runbook: %w", err) + } + results["disaster-recovery.md"] = dr + + return results, nil +} + +func renderTemplate(tmpl *template.Template, data *RunbookData) (string, error) { + var buf strings.Builder + if err := tmpl.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} + +// RenderDayToDayOps writes the day-to-day operations runbook to the given writer. +func RenderDayToDayOps(w io.Writer, data *RunbookData) error { + return dayToDayOpsTemplate.Execute(w, data) +} + +// RenderDisasterRecovery writes the disaster recovery runbook to the given writer. 
+func RenderDisasterRecovery(w io.Writer, data *RunbookData) error { + return disasterRecoveryTemplate.Execute(w, data) +} diff --git a/lib/eject/runbooks_test.go b/lib/eject/runbooks_test.go new file mode 100644 index 0000000..7770671 --- /dev/null +++ b/lib/eject/runbooks_test.go @@ -0,0 +1,358 @@ +package eject + +import ( + "strings" + "testing" + + "github.com/posit-dev/ptd/lib/types" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func awsRunbookData() *RunbookData { + return &RunbookData{ + WorkloadName: "acme-prod", + Cloud: "aws", + Region: "us-east-1", + ClusterName: "default_acme-prod-control-plane", + Sites: []SiteData{ + {Name: "main", Domain: "connect.acme.com"}, + {Name: "secondary", Domain: "dev.acme.com"}, + }, + } +} + +func azureRunbookData() *RunbookData { + return &RunbookData{ + WorkloadName: "contoso-staging", + Cloud: "azure", + Region: "eastus", + ClusterName: "aks-ptd-contoso", + ResourceGroup: "rsg-ptd-contoso-staging", + Sites: []SiteData{ + {Name: "main", Domain: "connect.contoso.com"}, + }, + } +} + +func TestGenerateRunbooks_AWS_ReturnsExpectedFiles(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + + require.NoError(t, err) + assert.Len(t, results, 2) + assert.Contains(t, results, "day-to-day-ops.md") + assert.Contains(t, results, "disaster-recovery.md") +} + +func TestGenerateRunbooks_Azure_ReturnsExpectedFiles(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + + require.NoError(t, err) + assert.Len(t, results, 2) + assert.Contains(t, results, "day-to-day-ops.md") + assert.Contains(t, results, "disaster-recovery.md") +} + +func TestRunbook_DayToDayOps_AWS_ContainsSections(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "# Day-to-Day Operations — acme-prod") + assert.Contains(t, ops, "## Running PTD Ensure Steps") + assert.Contains(t, ops, "## 
Scaling Product Replicas") + assert.Contains(t, ops, "## Updating Product Versions") + assert.Contains(t, ops, "## Rotating TLS Certificates") + assert.Contains(t, ops, "## Rotating Secrets") + assert.Contains(t, ops, "## Checking Workload Health") + assert.Contains(t, ops, "## Restarting Products") +} + +func TestRunbook_DayToDayOps_AWS_Content(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "eks") + assert.Contains(t, ops, "aws eks update-kubeconfig") + assert.Contains(t, ops, "Secrets Manager") + assert.Contains(t, ops, "ACM") + assert.Contains(t, ops, "ptd ensure acme-prod") +} + +func TestRunbook_DayToDayOps_Azure_Content(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "aks") + assert.Contains(t, ops, "az aks get-credentials") + assert.Contains(t, ops, "Key Vault") + assert.Contains(t, ops, "rsg-ptd-contoso-staging") + assert.Contains(t, ops, "ptd ensure contoso-staging") +} + +func TestRunbook_DayToDayOps_AWS_SitesRendered(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "connect.acme.com") + assert.Contains(t, ops, "dev.acme.com") +} + +func TestRunbook_DisasterRecovery_AWS_ContainsSections(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "# Disaster Recovery — acme-prod") + assert.Contains(t, dr, "## Pulumi State Recovery") + assert.Contains(t, dr, "## Database Recovery") + assert.Contains(t, dr, "## Storage Recovery") + assert.Contains(t, dr, "## Kubernetes Cluster Recovery") + assert.Contains(t, dr, "## DNS and Ingress Recovery") + assert.Contains(t, dr, "## Full Environment Rebuild") +} + +func 
TestRunbook_DisasterRecovery_AWS_Content(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "ptd-acme-prod") + assert.Contains(t, dr, "aws rds restore-db-instance-to-point-in-time") + assert.Contains(t, dr, "aws fsx describe-backups") + assert.Contains(t, dr, "S3 data buckets have no versioning") + assert.Contains(t, dr, "ptd ensure acme-prod --only-steps eks") +} + +func TestRunbook_DisasterRecovery_Azure_Content(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "Azure Blob Storage") + assert.Contains(t, dr, "az postgres flexible-server restore") + assert.Contains(t, dr, "Azure storage has no versioning or soft delete") + assert.Contains(t, dr, "rsg-ptd-contoso-staging") + assert.Contains(t, dr, "ptd ensure contoso-staging --only-steps aks") +} + +func TestRunbook_DisasterRecovery_AWS_SitesRendered(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + assert.Contains(t, dr, "dig connect.acme.com") + assert.Contains(t, dr, "dig dev.acme.com") +} + +func TestRunbooks_NoBannedFlags(t *testing.T) { + for _, cloud := range []string{"aws", "azure"} { + t.Run(cloud, func(t *testing.T) { + data := &RunbookData{ + WorkloadName: "test-workload", + Cloud: cloud, + Region: "us-east-1", + ClusterName: "test-cluster", + ResourceGroup: "rsg-ptd-test", + Sites: []SiteData{ + {Name: "main", Domain: "test.example.com"}, + }, + } + results, err := GenerateRunbooks(data) + require.NoError(t, err) + + for filename, content := range results { + assert.NotContains(t, content, "--auto-apply", + "%s for %s should not contain --auto-apply", filename, cloud) + assert.NotContains(t, content, "--dry-run", + "%s for %s should not contain --dry-run", filename, cloud) + } + }) + } +} + 
+func TestRunbooks_NoPulumiCommands(t *testing.T) { + for _, cloud := range []string{"aws", "azure"} { + t.Run(cloud, func(t *testing.T) { + data := &RunbookData{ + WorkloadName: "test-workload", + Cloud: cloud, + Region: "us-east-1", + ClusterName: "test-cluster", + ResourceGroup: "rsg-ptd-test", + Sites: []SiteData{ + {Name: "main", Domain: "test.example.com"}, + }, + } + results, err := GenerateRunbooks(data) + require.NoError(t, err) + + for filename, content := range results { + assert.NotContains(t, content, "pulumi up", + "%s for %s should not contain 'pulumi up'", filename, cloud) + assert.NotContains(t, content, "pulumi preview", + "%s for %s should not contain 'pulumi preview'", filename, cloud) + assert.NotContains(t, content, "pulumi stack select", + "%s for %s should not contain 'pulumi stack select'", filename, cloud) + assert.NotContains(t, content, "pulumi import", + "%s for %s should not contain 'pulumi import'", filename, cloud) + } + }) + } +} + +func TestRunbook_DayToDayOps_AWS_StepTable(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + for _, step := range []string{"bootstrap", "persistent", "postgres_config", "eks", "clusters", "helm", "sites"} { + assert.Contains(t, ops, "| "+step+" |", "step table should contain %s", step) + } +} + +func TestRunbook_DayToDayOps_Azure_StepTable(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + for _, step := range []string{"bootstrap", "persistent", "postgres_config", "aks", "clusters", "helm", "sites"} { + assert.Contains(t, ops, "| "+step+" |", "step table should contain %s", step) + } + assert.NotContains(t, ops, "| eks |", "Azure runbook should not contain eks step") +} + +func TestRunbook_RenderDayToDayOps_WritesToWriter(t *testing.T) { + var buf strings.Builder + err := RenderDayToDayOps(&buf, awsRunbookData()) + + 
require.NoError(t, err) + assert.Contains(t, buf.String(), "Day-to-Day Operations") +} + +func TestRunbook_RenderDisasterRecovery_WritesToWriter(t *testing.T) { + var buf strings.Builder + err := RenderDisasterRecovery(&buf, azureRunbookData()) + + require.NoError(t, err) + assert.Contains(t, buf.String(), "Disaster Recovery") +} + +func TestRunbook_DayToDayOps_PtdWorkonCommands(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + + assert.Contains(t, ops, "ptd workon acme-prod --") +} + +func TestRunbook_DisasterRecovery_FullRebuild(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + dr := results["disaster-recovery.md"] + + rebuildStart := strings.Index(dr, "## Full Environment Rebuild") + require.Greater(t, rebuildStart, 0, "should contain Full Environment Rebuild section") + rebuild := dr[rebuildStart:] + + assert.Contains(t, rebuild, "ptd ensure acme-prod\n") + assert.NotContains(t, rebuild, "--only-steps", "full rebuild should run all steps, not individual ones") +} + +func TestRunbook_AWS_ClusterNameRendered(t *testing.T) { + results, err := GenerateRunbooks(awsRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + assert.Contains(t, ops, "aws eks update-kubeconfig --name default_acme-prod-control-plane --region us-east-1") +} + +func TestRunbook_Azure_ClusterNameRendered(t *testing.T) { + results, err := GenerateRunbooks(azureRunbookData()) + require.NoError(t, err) + + ops := results["day-to-day-ops.md"] + assert.Contains(t, ops, "az aks get-credentials --resource-group rsg-ptd-contoso-staging --name aks-ptd-contoso") +} + +func TestBuildRunbookData_AWS_SortsSites(t *testing.T) { + config := types.AWSWorkloadConfig{ + Region: "us-east-1", + Sites: map[string]types.SiteConfig{ + "zebra": {Spec: types.SiteConfigSpec{Domain: "z.example.com"}}, + "alpha": {Spec: types.SiteConfigSpec{Domain: 
"a.example.com"}}, + "middle": {Spec: types.SiteConfigSpec{Domain: "m.example.com"}}, + }, + Clusters: map[string]types.AWSWorkloadClusterConfig{ + "20240101": {}, + }, + } + + data, err := buildRunbookData(config, "test-workload") + require.NoError(t, err) + + require.Len(t, data.Sites, 3) + assert.Equal(t, "alpha", data.Sites[0].Name) + assert.Equal(t, "middle", data.Sites[1].Name) + assert.Equal(t, "zebra", data.Sites[2].Name) +} + +func TestBuildRunbookData_Azure_SanitizesResourceGroup(t *testing.T) { + config := types.AzureWorkloadConfig{ + Region: "eastus", + Sites: map[string]types.SiteConfig{ + "main": {Spec: types.SiteConfigSpec{Domain: "test.example.com"}}, + }, + Clusters: map[string]types.AzureWorkloadClusterConfig{ + "20240101": {}, + }, + } + + data, err := buildRunbookData(config, "MyWorkload_Test") + require.NoError(t, err) + + assert.Equal(t, "rsg-ptd-myworkload-test", data.ResourceGroup) +} + +func TestBuildRunbookData_AWS_ClusterName(t *testing.T) { + config := types.AWSWorkloadConfig{ + Region: "us-west-2", + Clusters: map[string]types.AWSWorkloadClusterConfig{ + "20240601": {}, + }, + } + + data, err := buildRunbookData(config, "acme-prod") + require.NoError(t, err) + + assert.Equal(t, "default_acme-prod-20240601-control-plane", data.ClusterName) +} + +func TestBuildRunbookData_Azure_ClusterName(t *testing.T) { + config := types.AzureWorkloadConfig{ + Region: "eastus", + Clusters: map[string]types.AzureWorkloadClusterConfig{ + "20240601": {}, + }, + } + + data, err := buildRunbookData(config, "Contoso-Staging") + require.NoError(t, err) + + assert.Equal(t, "contoso-staging-20240601", data.ClusterName) +}