diff --git a/cli/cmd/bootstrap_gcp.go b/cli/cmd/bootstrap_gcp.go index 5a37b801..f60e2768 100644 --- a/cli/cmd/bootstrap_gcp.go +++ b/cli/cmd/bootstrap_gcp.go @@ -105,6 +105,7 @@ func AddBootstrapGcpCmd(parent *cobra.Command, opts *GlobalOptions) { parent.AddCommand(bootstrapGcpCmd.cmd) AddBootstrapGcpPostconfigCmd(bootstrapGcpCmd.cmd, opts) AddBootstrapGcpCleanupCmd(bootstrapGcpCmd.cmd, opts) + AddBootstrapGcpRestartVMsCmd(bootstrapGcpCmd.cmd, opts) } func (c *BootstrapGcpCmd) BootstrapGcp() error { diff --git a/cli/cmd/bootstrap_gcp_restart_vms.go b/cli/cmd/bootstrap_gcp_restart_vms.go new file mode 100644 index 00000000..470d2fd7 --- /dev/null +++ b/cli/cmd/bootstrap_gcp_restart_vms.go @@ -0,0 +1,135 @@ +// Copyright (c) Codesphere Inc. +// SPDX-License-Identifier: Apache-2.0 + +package cmd + +import ( + "fmt" + "log" + "os" + + csio "github.com/codesphere-cloud/cs-go/pkg/io" + "github.com/codesphere-cloud/oms/internal/bootstrap" + "github.com/codesphere-cloud/oms/internal/bootstrap/gcp" + "github.com/codesphere-cloud/oms/internal/util" + "github.com/spf13/cobra" +) + +type BootstrapGcpRestartVMsCmd struct { + cmd *cobra.Command + Opts *BootstrapGcpRestartVMsOpts +} + +type BootstrapGcpRestartVMsOpts struct { + *GlobalOptions + ProjectID string + Zone string + Name string +} + +func (c *BootstrapGcpRestartVMsCmd) RunE(_ *cobra.Command, args []string) error { + ctx := c.cmd.Context() + stlog := bootstrap.NewStepLogger(false) + fw := util.NewFilesystemWriter() + + projectID := c.Opts.ProjectID + zone := c.Opts.Zone + + // If only one of --project-id/--zone is provided, require both + if (projectID == "") != (zone == "") { + return fmt.Errorf("--project-id and --zone must be provided together") + } + + if projectID == "" && zone == "" { + infraFilePath := gcp.GetInfraFilePath() + infraEnv, exists, err := gcp.LoadInfraFile(fw, infraFilePath) + if err != nil { + return fmt.Errorf("failed to load infra file: %w", err) + } + if !exists { + return fmt.Errorf("infra file not found at %s; use --project-id and --zone flags", infraFilePath) + } + projectID = infraEnv.ProjectID + zone = infraEnv.Zone + } + + if projectID == "" { + return fmt.Errorf("project ID is required; set --project-id and --zone or ensure the infra file exists") + } + if zone == "" { + return fmt.Errorf("zone is required; set --project-id and --zone or ensure the infra file exists") + } + + gcpClient := gcp.NewGCPClient(ctx, stlog, os.Getenv("GOOGLE_APPLICATION_CREDENTIALS")) + + csEnv := &gcp.CodesphereEnvironment{ + ProjectID: projectID, + Zone: zone, + } + + bs, err := gcp.NewGCPBootstrapper( + ctx, + nil, + stlog, + csEnv, + nil, + gcpClient, + fw, + nil, + nil, + util.NewTime(), + nil, + ) + if err != nil { + return fmt.Errorf("failed to create bootstrapper: %w", err) + } + + if c.Opts.Name != "" { + log.Printf("Restarting VM %s in project %s (zone %s)...", c.Opts.Name, projectID, zone) + if err := bs.RestartVM(c.Opts.Name); err != nil { + return fmt.Errorf("failed to restart VM: %w", err) + } + log.Printf("VM %s restarted successfully.", c.Opts.Name) + } else { + log.Printf("Restarting all VMs in project %s (zone %s)...", projectID, zone) + if err := bs.RestartVMs(); err != nil { + return fmt.Errorf("failed to restart VMs: %w", err) + } + log.Printf("All VMs restarted successfully.") + } + + return nil +} + +func AddBootstrapGcpRestartVMsCmd(bootstrapGcp *cobra.Command, opts *GlobalOptions) { + restartVMs := BootstrapGcpRestartVMsCmd{ + cmd: &cobra.Command{ + Use: "restart-vms", + Short: "Restart stopped or terminated GCP VMs", + Long: csio.Long(`Restarts GCP compute instances that were stopped or terminated, + for example after spot VM preemption. + By default, restarts all VMs defined in the infrastructure. + Use --name to restart a single VM. + Project ID and zone are read from the local infra file if available, + or can be specified via flags.`), + Example: formatExamples("beta bootstrap-gcp restart-vms", []csio.Example{ + {Desc: "Restart all VMs using project info from the local infra file"}, + {Cmd: "--name jumpbox", Desc: "Restart only the jumpbox VM"}, + {Cmd: "--name k0s-1", Desc: "Restart a specific k0s node"}, + {Cmd: "--project-id my-project --zone us-central1-a", Desc: "Restart all VMs with explicit project and zone"}, + {Cmd: "--project-id my-project --zone us-central1-a --name ceph-1", Desc: "Restart a specific VM with explicit project and zone"}, + }), + }, + Opts: &BootstrapGcpRestartVMsOpts{ + GlobalOptions: opts, + }, + } + + flags := restartVMs.cmd.Flags() + flags.StringVar(&restartVMs.Opts.ProjectID, "project-id", "", "GCP Project ID (optional, will use infra file if not provided)") + flags.StringVar(&restartVMs.Opts.Zone, "zone", "", "GCP Zone (optional, will use infra file if not provided)") + flags.StringVar(&restartVMs.Opts.Name, "name", "", "Name of a specific VM to restart (e.g. jumpbox, postgres, ceph-1, k0s-1). Restarts all VMs if not specified.") + + restartVMs.cmd.RunE = restartVMs.RunE + bootstrapGcp.AddCommand(restartVMs.cmd) +} diff --git a/cli/cmd/bootstrap_gcp_restart_vms_test.go b/cli/cmd/bootstrap_gcp_restart_vms_test.go new file mode 100644 index 00000000..36b036c6 --- /dev/null +++ b/cli/cmd/bootstrap_gcp_restart_vms_test.go @@ -0,0 +1,108 @@ +// Copyright (c) Codesphere Inc. +// SPDX-License-Identifier: Apache-2.0 + +package cmd_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/spf13/cobra" + + "github.com/codesphere-cloud/oms/cli/cmd" +) + +var _ = Describe("BootstrapGcpRestartVMsCmd", func() { + var globalOpts *cmd.GlobalOptions + + BeforeEach(func() { + globalOpts = &cmd.GlobalOptions{} + }) + + Describe("BootstrapGcpRestartVMsOpts structure", func() { + Context("when initialized", func() { + It("should have correct default values", func() { + opts := &cmd.BootstrapGcpRestartVMsOpts{ + GlobalOptions: globalOpts, + } + Expect(opts.ProjectID).To(Equal("")) + Expect(opts.Zone).To(Equal("")) + Expect(opts.Name).To(Equal("")) + }) + + It("should store provided values", func() { + opts := &cmd.BootstrapGcpRestartVMsOpts{ + GlobalOptions: globalOpts, + ProjectID: "my-project", + Zone: "us-central1-a", + Name: "jumpbox", + } + Expect(opts.ProjectID).To(Equal("my-project")) + Expect(opts.Zone).To(Equal("us-central1-a")) + Expect(opts.Name).To(Equal("jumpbox")) + }) + }) + }) + + Describe("AddBootstrapGcpRestartVMsCmd", func() { + Context("when adding command", func() { + It("should not panic when adding to parent command", func() { + Expect(func() { + parentCmd := &cobra.Command{ + Use: "bootstrap-gcp", + } + cmd.AddBootstrapGcpRestartVMsCmd(parentCmd, globalOpts) + }).NotTo(Panic()) + }) + + It("should create command with correct flags", func() { + parentCmd := &cobra.Command{ + Use: "bootstrap-gcp", + } + cmd.AddBootstrapGcpRestartVMsCmd(parentCmd, globalOpts) + + restartCmd, _, err := parentCmd.Find([]string{"restart-vms"}) + Expect(err).NotTo(HaveOccurred()) + Expect(restartCmd).NotTo(BeNil()) + Expect(restartCmd.Use).To(Equal("restart-vms")) + + projectIDFlag := restartCmd.Flags().Lookup("project-id") + Expect(projectIDFlag).NotTo(BeNil()) + + zoneFlag := restartCmd.Flags().Lookup("zone") + Expect(zoneFlag).NotTo(BeNil()) + + nameFlag := restartCmd.Flags().Lookup("name") + Expect(nameFlag).NotTo(BeNil()) + }) + + It("should bind flag values to opts", func() { + parentCmd := &cobra.Command{ + Use: "bootstrap-gcp", + } + cmd.AddBootstrapGcpRestartVMsCmd(parentCmd, globalOpts) + + restartCmd, _, err := parentCmd.Find([]string{"restart-vms"}) + Expect(err).NotTo(HaveOccurred()) + Expect(restartCmd).NotTo(BeNil()) + + err = restartCmd.Flags().Set("project-id", "flag-project") + Expect(err).NotTo(HaveOccurred()) + projectIDVal, err := restartCmd.Flags().GetString("project-id") + Expect(err).NotTo(HaveOccurred()) + Expect(projectIDVal).To(Equal("flag-project")) + + err = restartCmd.Flags().Set("zone", "flag-zone") + Expect(err).NotTo(HaveOccurred()) + zoneVal, err := restartCmd.Flags().GetString("zone") + Expect(err).NotTo(HaveOccurred()) + Expect(zoneVal).To(Equal("flag-zone")) + + err = restartCmd.Flags().Set("name", "jumpbox") + Expect(err).NotTo(HaveOccurred()) + nameVal, err := restartCmd.Flags().GetString("name") + Expect(err).NotTo(HaveOccurred()) + Expect(nameVal).To(Equal("jumpbox")) + }) + }) + }) +}) diff --git a/docs/oms_beta_bootstrap-gcp.md b/docs/oms_beta_bootstrap-gcp.md index e7bc829b..9818e095 100644 --- a/docs/oms_beta_bootstrap-gcp.md +++ b/docs/oms_beta_bootstrap-gcp.md @@ -63,4 +63,5 @@ oms beta bootstrap-gcp [flags] * [oms beta](oms_beta.md) - Commands for early testing * [oms beta bootstrap-gcp cleanup](oms_beta_bootstrap-gcp_cleanup.md) - Clean up GCP infrastructure created by bootstrap-gcp * [oms beta bootstrap-gcp postconfig](oms_beta_bootstrap-gcp_postconfig.md) - Run post-configuration steps for GCP bootstrapping +* [oms beta bootstrap-gcp restart-vms](oms_beta_bootstrap-gcp_restart-vms.md) - Restart stopped or terminated GCP VMs diff --git a/docs/oms_beta_bootstrap-gcp_restart-vms.md b/docs/oms_beta_bootstrap-gcp_restart-vms.md new file mode 100644 index 00000000..1a84ef76 --- /dev/null +++ b/docs/oms_beta_bootstrap-gcp_restart-vms.md @@ -0,0 +1,50 @@ +## oms beta bootstrap-gcp restart-vms + +Restart stopped or terminated GCP VMs + +### Synopsis + +Restarts GCP compute instances that were stopped or terminated, +for example after spot VM preemption. +By default, restarts all VMs defined in the infrastructure. +Use --name to restart a single VM. +Project ID and zone are read from the local infra file if available, +or can be specified via flags. + +``` +oms beta bootstrap-gcp restart-vms [flags] +``` + +### Examples + +``` +# Restart all VMs using project info from the local infra file +$ oms beta bootstrap-gcp restart-vms + +# Restart only the jumpbox VM +$ oms beta bootstrap-gcp restart-vms --name jumpbox + +# Restart a specific k0s node +$ oms beta bootstrap-gcp restart-vms --name k0s-1 + +# Restart all VMs with explicit project and zone +$ oms beta bootstrap-gcp restart-vms --project-id my-project --zone us-central1-a + +# Restart a specific VM with explicit project and zone +$ oms beta bootstrap-gcp restart-vms --project-id my-project --zone us-central1-a --name ceph-1 + +``` + +### Options + +``` + -h, --help help for restart-vms + --name string Name of a specific VM to restart (e.g. jumpbox, postgres, ceph-1, k0s-1). Restarts all VMs if not specified. + --project-id string GCP Project ID (optional, will use infra file if not provided) + --zone string GCP Zone (optional, will use infra file if not provided) +``` + +### SEE ALSO + +* [oms beta bootstrap-gcp](oms_beta_bootstrap-gcp.md) - Bootstrap GCP infrastructure for Codesphere + diff --git a/internal/bootstrap/gcp/gce.go b/internal/bootstrap/gcp/gce.go index 31039168..eb96207b 100644 --- a/internal/bootstrap/gcp/gce.go +++ b/internal/bootstrap/gcp/gce.go @@ -366,6 +366,83 @@ func (b *GCPBootstrapper) waitForInstanceRunning(projectID, zone, name string, n name, pollInterval*time.Duration(maxAttempts)) } +// findVMDef looks up a VM definition by name. Returns the VMDef and true if found. +func findVMDef(name string) (VMDef, bool) { + for _, vm := range vmDefs { + if vm.Name == name { + return vm, true + } + } + return VMDef{}, false +} + +// validVMNames returns the list of known VM names from vmDefs. +func validVMNames() []string { + names := make([]string, len(vmDefs)) + for i, vm := range vmDefs { + names[i] = vm.Name + } + return names +} + +// RestartVM restarts a single stopped or terminated VM by a name that is defined in vmDefs. +func (b *GCPBootstrapper) RestartVM(name string) error { + vm, found := findVMDef(name) + if !found { + return fmt.Errorf("unknown VM name %q; valid names are: %s", name, strings.Join(validVMNames(), ", ")) + } + + projectID := b.Env.ProjectID + zone := b.Env.Zone + + inst, err := b.GCPClient.GetInstance(projectID, zone, name) + if err != nil { + if IsNotFoundError(err) { + return fmt.Errorf("instance %s does not exist in project %s / zone %s; did you run bootstrap first?", name, projectID, zone) + } + return fmt.Errorf("failed to get instance %s: %w", name, err) + } + + switch s := inst.GetStatus(); s { + case "RUNNING": + b.stlog.Logf("Instance %s is already running", name) + return nil + case "TERMINATED", "STOPPED": + b.stlog.Logf("Starting stopped instance %s...", name) + if err := b.GCPClient.StartInstance(projectID, zone, name); err != nil { + return fmt.Errorf("failed to start instance %s: %w", name, err) + } + case "SUSPENDED": + return fmt.Errorf("instance %s is SUSPENDED; manual resume is required", name) + default: + return fmt.Errorf("instance %s is in unexpected state %q", name, s) + } + + readyInstance, err := b.waitForInstanceRunning(projectID, zone, name, vm.ExternalIP) + if err != nil { + return fmt.Errorf("instance %s did not become ready: %w", name, err) + } + + internalIP, externalIP := ExtractInstanceIPs(readyInstance) + b.stlog.Logf("Instance %s is now running (internal=%s, external=%s)", name, internalIP, externalIP) + return nil +} + +// RestartVMs restarts all stopped or terminated VMs defined in vmDefs. +// VMs are restarted sequentially because StepLogger is not thread-safe. +func (b *GCPBootstrapper) RestartVMs() error { + var errs []error + for _, vm := range vmDefs { + if err := b.RestartVM(vm.Name); err != nil { + errs = append(errs, err) + } + } + if len(errs) > 0 { + return fmt.Errorf("errors restarting VMs: %w", errors.Join(errs...)) + } + return nil +} + // ReadSSHKey reads an SSH key file, expanding ~ in the path func (b *GCPBootstrapper) ReadSSHKey(path string) (string, error) { realPath := util.ExpandPath(path) diff --git a/internal/bootstrap/gcp/gce_test.go b/internal/bootstrap/gcp/gce_test.go index 34cea951..61436dfe 100644 --- a/internal/bootstrap/gcp/gce_test.go +++ b/internal/bootstrap/gcp/gce_test.go @@ -786,4 +786,160 @@ var _ = Describe("GCE", func() { }) }) }) + + Describe("RestartVM", func() { + var ( + gc *gcp.MockGCPClientManager + csEnv *gcp.CodesphereEnvironment + bs *gcp.GCPBootstrapper + ) + + BeforeEach(func() { + gc = gcp.NewMockGCPClientManager(GinkgoT()) + csEnv = &gcp.CodesphereEnvironment{ + ProjectID: "test-project", + Zone: "us-central1-a", + } + bs = newTestBootstrapper(csEnv, gc) + }) + + It("returns error for unknown VM name", func() { + err := bs.RestartVM("nonexistent") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("unknown VM name")) + Expect(err.Error()).To(ContainSubstring("jumpbox")) + }) + + It("is a no-op when instance is already running", func() { + runningInst := makeRunningInstance("10.0.0.1", "1.2.3.4") + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(runningInst, nil) + + err := bs.RestartVM("jumpbox") + Expect(err).NotTo(HaveOccurred()) + }) + + It("starts a TERMINATED instance and waits for it to be running", func() { + stoppedInst := makeStoppedInstance("10.0.0.1", "1.2.3.4") + runningInst := makeRunningInstance("10.0.0.1", "1.2.3.4") + + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(stoppedInst, nil).Once() + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(nil) + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(runningInst, nil).Once() + + err := bs.RestartVM("jumpbox") + Expect(err).NotTo(HaveOccurred()) + }) + + It("starts a STOPPED instance", func() { + stoppedInst := makeInstance("STOPPED", "10.0.0.1", "1.2.3.4") + runningInst := makeRunningInstance("10.0.0.1", "1.2.3.4") + + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "postgres").Return(stoppedInst, nil).Once() + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, "postgres").Return(nil) + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "postgres").Return(runningInst, nil).Once() + + err := bs.RestartVM("postgres") + Expect(err).NotTo(HaveOccurred()) + }) + + It("returns error for SUSPENDED instance", func() { + suspendedInst := makeInstance("SUSPENDED", "10.0.0.1", "") + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(suspendedInst, nil) + + err := bs.RestartVM("jumpbox") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("SUSPENDED")) + Expect(err.Error()).To(ContainSubstring("manual resume")) + }) + + It("returns error when GetInstance fails", func() { + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(nil, fmt.Errorf("permission denied")) + + err := bs.RestartVM("jumpbox") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get instance")) + }) + + It("returns actionable error when instance is not found", func() { + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(nil, grpcstatus.Errorf(codes.NotFound, "not found")) + + err := bs.RestartVM("jumpbox") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("does not exist")) + Expect(err.Error()).To(ContainSubstring("bootstrap first")) + }) + + It("returns error when StartInstance fails", func() { + stoppedInst := makeStoppedInstance("10.0.0.1", "1.2.3.4") + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(stoppedInst, nil) + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, "jumpbox").Return(fmt.Errorf("quota exceeded")) + + err := bs.RestartVM("jumpbox") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to start instance")) + }) + + It("handles VM without external IP (ceph node)", func() { + stoppedInst := makeStoppedInstance("10.0.0.5", "") + runningInst := makeRunningInstance("10.0.0.5", "") + + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "ceph-1").Return(stoppedInst, nil).Once() + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, "ceph-1").Return(nil) + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, "ceph-1").Return(runningInst, nil).Once() + + err := bs.RestartVM("ceph-1") + Expect(err).NotTo(HaveOccurred()) + }) + }) + + Describe("RestartVMs", func() { + var ( + gc *gcp.MockGCPClientManager + csEnv *gcp.CodesphereEnvironment + bs *gcp.GCPBootstrapper + ) + + BeforeEach(func() { + gc = gcp.NewMockGCPClientManager(GinkgoT()) + csEnv = &gcp.CodesphereEnvironment{ + ProjectID: "test-project", + Zone: "us-central1-a", + } + bs = newTestBootstrapper(csEnv, gc) + }) + + It("succeeds when all VMs are already running", func() { + runningInst := makeRunningInstance("10.0.0.1", "1.2.3.4") + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(runningInst, nil).Times(8) + + err := bs.RestartVMs() + Expect(err).NotTo(HaveOccurred()) + }) + + It("starts stopped VMs and succeeds", func() { + stoppedInst := makeStoppedInstance("10.0.0.1", "1.2.3.4") + runningInst := makeRunningInstance("10.0.0.1", "1.2.3.4") + + callCounts := map[string]int{} + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(_, _, name string) (*computepb.Instance, error) { + callCounts[name]++ + if callCounts[name] == 1 { + return stoppedInst, nil + } + return runningInst, nil + }).Times(16) + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(8) + + err := bs.RestartVMs() + Expect(err).NotTo(HaveOccurred()) + }) + + It("returns aggregated errors when some VMs fail", func() { + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("api error")).Times(8) + + err := bs.RestartVMs() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("errors restarting VMs")) + }) + }) })