Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions v1/providers/nebius/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (

v1 "github.com/brevdev/cloud/v1"
"github.com/nebius/gosdk/operations"
"github.com/nebius/gosdk/serviceerror"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
Expand Down Expand Up @@ -37,6 +38,17 @@ func handleErrToCloudErr(e error) error {
if e == nil {
return nil
}
var serviceErr *serviceerror.Error
if errors.As(e, &serviceErr) {
for _, detail := range serviceErr.Details {
switch detail.(type) {
case *serviceerror.NotEnoughResources:
return v1.ErrInsufficientResources
case *serviceerror.QuotaFailure:
return v1.ErrOutOfQuota
}
}
}
Comment on lines +41 to +51
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. I wonder if the below ever actually fire? Interesting that we are looking directly at grpc codes there, whereas here we are testing the err type (which seems more appropriate and expected).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ya I was poking through the code and was similarly confused by the casting. It seems there are a couple paths and the errors get rewrapped as serviceerror or operations. I figured no harm if this block is the capture now. Though I am wondering if the below block was just always wrong.

// Check for Nebius operations.Error for ResourceExhausted (returned by operation.Wait on async failures)
var opErr *operations.Error
if errors.As(e, &opErr) {
Expand Down
68 changes: 68 additions & 0 deletions v1/providers/nebius/errors_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package v1

import (
"errors"
"testing"

cloudv1 "github.com/brevdev/cloud/v1"
common "github.com/nebius/gosdk/proto/nebius/common/v1"
"github.com/nebius/gosdk/serviceerror"
"github.com/stretchr/testify/require"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

func TestHandleErrToCloudErrMapsNotEnoughResourcesToInsufficientResources(t *testing.T) {
t.Parallel()

err := &serviceerror.Error{
Wrapped: status.Error(codes.ResourceExhausted, "operation failed"),
Details: []serviceerror.Detail{
serviceerror.NewDetail(&common.ServiceError{
Service: "compute",
Code: "NotEnoughResources",
Details: &common.ServiceError_NotEnoughResources{
NotEnoughResources: &common.NotEnoughResources{
Violations: []*common.NotEnoughResources_Violation{
{
ResourceType: "virtualMachine",
Requested: "1gpu-16vcpu-64gb",
Message: "VM schedule timeout, most likely due to insufficient hardware resources",
},
},
},
},
}),
},
}

require.True(t, errors.Is(handleErrToCloudErr(err), cloudv1.ErrInsufficientResources))
}

func TestHandleErrToCloudErrMapsQuotaFailureToOutOfQuota(t *testing.T) {
t.Parallel()

err := &serviceerror.Error{
Wrapped: status.Error(codes.ResourceExhausted, "operation failed"),
Details: []serviceerror.Detail{
serviceerror.NewDetail(&common.ServiceError{
Service: "compute",
Code: "QuotaFailure",
Details: &common.ServiceError_QuotaFailure{
QuotaFailure: &common.QuotaFailure{
Violations: []*common.QuotaFailure_Violation{
{
Quota: "compute.instance.gpu.h100",
Limit: "0",
Requested: "1",
Message: "quota exceeded",
},
},
},
},
}),
},
}

require.True(t, errors.Is(handleErrToCloudErr(err), cloudv1.ErrOutOfQuota))
}
114 changes: 66 additions & 48 deletions v1/providers/nebius/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package v1

import (
"context"
_ "embed"
"encoding/base64"
"fmt"
"strings"
"time"
Expand All @@ -20,6 +22,12 @@ const (
nebiusCPUImageFamily = "ubuntu24.04-driverless"
)

//go:embed scripts/brev-apply-docker-firewall.sh
var dockerFirewallScript string

//go:embed scripts/10-brev-firewall.conf
var dockerFirewallDropIn string

//nolint:gocyclo,funlen // Complex instance creation with resource management
func (c *NebiusClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) {
// Track created resources for automatic cleanup on failure
Expand Down Expand Up @@ -868,7 +876,6 @@ func matchesTagFilters(instanceTags map[string]string, tagFilters map[string][]s
return true
}

//nolint:dupl // StopInstance and StartInstance have similar structure but different operations
func (c *NebiusClient) StopInstance(ctx context.Context, instanceID v1.CloudProviderInstanceID) error {
c.logger.Debug(ctx, "initiating instance stop operation",
v1.LogField("instanceID", instanceID))
Expand Down Expand Up @@ -906,7 +913,6 @@ func (c *NebiusClient) StopInstance(ctx context.Context, instanceID v1.CloudProv
return nil
}

//nolint:dupl // StartInstance and StopInstance have similar structure but different operations
func (c *NebiusClient) StartInstance(ctx context.Context, instanceID v1.CloudProviderInstanceID) error {
c.logger.Debug(ctx, "initiating instance start operation",
v1.LogField("instanceID", instanceID))
Expand All @@ -916,17 +922,18 @@ func (c *NebiusClient) StartInstance(ctx context.Context, instanceID v1.CloudPro
Id: string(instanceID),
})
if err != nil {
return fmt.Errorf("failed to initiate instance start: %w", err)
return fmt.Errorf("failed to initiate instance start: %w", handleErrToCloudErr(err))
}

// Wait for the start operation to complete
finalOp, err := operation.Wait(ctx)
if err != nil {
return fmt.Errorf("failed to wait for instance start: %w", err)
return fmt.Errorf("failed to wait for instance start: %w", handleErrToCloudErr(err))
}

if !finalOp.Successful() {
return fmt.Errorf("instance start failed: %v", finalOp.Status())
statusErr := fmt.Errorf("instance start failed: %v", finalOp.Status())
return handleErrToCloudErr(statusErr)
}

c.logger.Debug(ctx, "start operation completed, waiting for instance to reach RUNNING state",
Expand Down Expand Up @@ -1577,13 +1584,12 @@ func (c *NebiusClient) cleanupOrphanedBootDisks(ctx context.Context, testID stri
}

// generateCloudInitUserData generates a cloud-init user-data script for SSH key injection and firewall configuration
// This is inspired by Shadeform's LaunchConfiguration approach but uses cloud-init instead of base64 scripts
// This is inspired by Shadeform's LaunchConfiguration approach but uses cloud-init directly.
func generateCloudInitUserData(publicKey string, firewallRules v1.FirewallRules) string {
// Start with cloud-init header
script := `#cloud-config
packages:
- ufw
- iptables-persistent
`

// Add SSH key configuration if provided
Expand All @@ -1593,35 +1599,19 @@ packages:
`, publicKey)
}

script += generateDockerFirewallWriteFiles()

var commands []string

// Fix a systemd race condition: ufw.service and netfilter-persistent.service
// both start in parallel (both are Before=network-pre.target with no mutual
// ordering). Both call iptables-restore concurrently, and with the iptables-nft
// backend the competing nftables transactions cause UFW to fail with
// "iptables-restore: line 4 failed". This drop-in forces UFW to wait for
// netfilter-persistent to finish first.
commands = append(commands,
"sudo mkdir -p /etc/systemd/system/ufw.service.d",
`printf '[Unit]\nAfter=netfilter-persistent.service\n' | sudo tee /etc/systemd/system/ufw.service.d/after-netfilter.conf > /dev/null`,
"sudo systemctl daemon-reload",
)
commands = append(commands, "sudo systemctl daemon-reload")

// Generate UFW firewall commands (similar to Shadeform's approach)
// UFW (Uncomplicated Firewall) is available on Ubuntu/Debian instances
commands = append(commands, generateUFWCommands(firewallRules)...)

// Generate IPTables firewall commands to ensure docker ports are not made immediately
// accessible from the internet by default.
commands = append(commands, generateIPTablesCommands()...)

// Save the complete iptables state (UFW chains + DOCKER-USER rules) so it
// survives instance stop/start cycles. Cloud-init runcmd only executes on
// first boot; on subsequent boots netfilter-persistent restores this snapshot,
// then UFW starts after it (due to the drop-in above) and re-applies its rules.
// This provides defense-in-depth: even if UFW fails for any reason, the
// netfilter-persistent snapshot ensures port 22 and DOCKER-USER rules persist.
commands = append(commands, "sudo netfilter-persistent save")
// Apply immediately for images where Docker is already running. The
// docker.service ExecStartPost hook handles images where Docker starts later.
commands = append(commands, "sudo /usr/local/sbin/brev-apply-docker-firewall.sh || true")

if len(commands) > 0 {
// Use runcmd to execute firewall setup commands
Expand Down Expand Up @@ -1663,25 +1653,53 @@ func generateUFWCommands(firewallRules v1.FirewallRules) []string {
return commands
}

// generateIPTablesCommands generates IPTables firewall commands to ensure docker ports are not made immediately
// accessible from the internet by default.
func generateIPTablesCommands() []string {
commands := []string{
"iptables -F DOCKER-USER",
"iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT",
"iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT",
"iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT",
"iptables -A DOCKER-USER -i cni+ ! -o cni+ -j ACCEPT", // TODO: add these back in when we have a way to test it
"iptables -A DOCKER-USER -i cali+ ! -o cali+ -j ACCEPT",
"iptables -A DOCKER-USER -i docker0 -o docker0 -j ACCEPT",
"iptables -A DOCKER-USER -i br+ -o br+ -j ACCEPT",
"iptables -A DOCKER-USER -i cni+ -o cni+ -j ACCEPT",
"iptables -A DOCKER-USER -i cali+ -o cali+ -j ACCEPT",
"iptables -A DOCKER-USER -i lo -j ACCEPT",
"iptables -A DOCKER-USER -j DROP",
"iptables -A DOCKER-USER -j RETURN", // Expected by Docker
}
return commands
const (
// Keep these generated paths stable: cloud-init, systemd, and validation
// tests all depend on this Docker firewall handoff.
dockerFirewallScriptPath = "/usr/local/sbin/brev-apply-docker-firewall.sh"

// This is a docker.service drop-in because the firewall rules must be
// re-applied immediately after Docker initializes or resets DOCKER-USER. If
// we need a separately inspectable status surface later, this can move to a
// named oneshot unit such as brev-docker-firewall.service; for now the
// execution is visible through docker.service journal/status output.
dockerServiceDropInDir = "/etc/systemd/system/docker.service.d"
dockerFirewallDropInPath = dockerServiceDropInDir + "/10-brev-firewall.conf"
Comment on lines +1666 to +1667
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one actually might be worth a comment as we alternatively could make a oneshot unit.

The benefit of a standalone unit is that we can inspect it individually, so we could do something like systemctl status brev-docker-firewall. With this we'll need to just go straight to the docker service (e.g. journalctl docker. Not necessarily a bad thing, but worth considering if/when we need to answer the question "what happened to this VM's iptables?"

)

func generateDockerFirewallWriteFiles() string {
// This function emits the only write_files block in this cloud-config. If
// another generated file is added later, merge it into this block instead of
// adding a second top-level write_files key.
//
// Docker published ports are not governed by UFW's INPUT policy. Docker adds
// NAT/FORWARD rules that can make `docker run -p host:container` reachable
// from the public internet even when UFW says incoming traffic is denied.
//
// DOCKER-USER is Docker's documented filter hook for this traffic. The script
// ensures the chain exists before configuring it. If Docker already created
// the chain, the create command fails harmlessly and the script continues.
//
// The generated script exits successfully even if an iptables command fails
// because failing Docker startup would be worse operationally. Validation
// tests assert that the rule set is actually present and blocks published
// ports.
//
// UFW persists its own rules in /etc/ufw; Docker firewall rules are applied
// through cloud-init and the docker.service post-start hook.
return fmt.Sprintf(`
write_files:
- path: %s
owner: root:root
permissions: '0755'
encoding: b64
content: %s
- path: %s
owner: root:root
permissions: '0644'
encoding: b64
content: %s
`, dockerFirewallScriptPath, base64.StdEncoding.EncodeToString([]byte(dockerFirewallScript)), dockerFirewallDropInPath, base64.StdEncoding.EncodeToString([]byte(dockerFirewallDropIn)))
}

// convertIngressRuleToUFW converts an ingress firewall rule to UFW command(s)
Expand Down
30 changes: 30 additions & 0 deletions v1/providers/nebius/instance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,36 @@ func TestNebiusClient_MergeInstanceForUpdate(t *testing.T) {
assert.Equal(t, newInstance.Status, merged.Status)
}

func TestGenerateCloudInitUserDataInstallsDockerFirewallHook(t *testing.T) {
script := generateCloudInitUserData("ssh-rsa test", v1.FirewallRules{})

assert.NotContains(t, script, "iptables-persistent")
assert.NotContains(t, script, "netfilter-persistent")
assert.Contains(t, script, "write_files:")
assert.Contains(t, script, "encoding: b64")
assert.Contains(t, script, "/usr/local/sbin/brev-apply-docker-firewall.sh")
assert.Contains(t, script, "/etc/systemd/system/docker.service.d")
assert.Contains(t, script, "sudo /usr/local/sbin/brev-apply-docker-firewall.sh || true")
assert.NotContains(t, script, "content: |")
assert.NotContains(t, script, " #!/bin/sh")
assert.NotContains(t, script, "ExecStartPost=/usr/local/sbin/brev-apply-docker-firewall.sh")
assert.NotContains(t, script, "printf '%s\\n'")
assert.NotContains(t, script, "| sudo tee")
}

func TestDockerFirewallScriptCreatesDockerUserChainBeforeFlush(t *testing.T) {
createChainIndex := strings.Index(dockerFirewallScript, "iptables -N DOCKER-USER")
flushChainIndex := strings.Index(dockerFirewallScript, "iptables -F DOCKER-USER")

assert.Greater(t, createChainIndex, -1)
assert.Greater(t, flushChainIndex, createChainIndex)
assert.Contains(t, dockerFirewallScript, "iptables -A DOCKER-USER -j DROP")
}

func TestDockerFirewallDropInIgnoresExecStartPostFailure(t *testing.T) {
assert.Contains(t, dockerFirewallDropIn, "ExecStartPost=-/usr/local/sbin/brev-apply-docker-firewall.sh")
}

// BenchmarkCreateInstance benchmarks the CreateInstance method
func BenchmarkCreateInstance(b *testing.B) {
b.Skip("CreateInstance requires real SDK initialization - use integration tests instead")
Expand Down
2 changes: 2 additions & 0 deletions v1/providers/nebius/scripts/10-brev-firewall.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[Service]
ExecStartPost=-/usr/local/sbin/brev-apply-docker-firewall.sh
18 changes: 18 additions & 0 deletions v1/providers/nebius/scripts/brev-apply-docker-firewall.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/sh

iptables -N DOCKER-USER 2>/dev/null || true
iptables -F DOCKER-USER || true
iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT
iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT
iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT
iptables -A DOCKER-USER -i cni+ ! -o cni+ -j ACCEPT
iptables -A DOCKER-USER -i cali+ ! -o cali+ -j ACCEPT
iptables -A DOCKER-USER -i docker0 -o docker0 -j ACCEPT
iptables -A DOCKER-USER -i br+ -o br+ -j ACCEPT
iptables -A DOCKER-USER -i cni+ -o cni+ -j ACCEPT
iptables -A DOCKER-USER -i cali+ -o cali+ -j ACCEPT
iptables -A DOCKER-USER -i lo -j ACCEPT
iptables -A DOCKER-USER -j DROP
iptables -A DOCKER-USER -j RETURN

exit 0
5 changes: 5 additions & 0 deletions v1/providers/shadeform/firewall.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ const (
ufwDefaultAllowPort2222 = "ufw allow 2222/tcp"
ufwForceEnable = "ufw --force enable"

// Ensure DOCKER-USER exists before clearing it. Docker normally creates this
// chain, but firewall setup can run before Docker has initialized iptables.
ipTablesCreateDockerUserChain = "iptables -N DOCKER-USER || true"

// Clear DOCKER-USER policy.
ipTablesResetDockerUserChain = "iptables -F DOCKER-USER"

Expand Down Expand Up @@ -83,6 +87,7 @@ func (c *ShadeformClient) getUFWCommands(firewallRules v1.FirewallRules) []strin

func (c *ShadeformClient) getIPTablesCommands() []string {
commands := []string{
ipTablesCreateDockerUserChain,
ipTablesResetDockerUserChain,
ipTablesAllowDockerUserOutbound,
ipTablesAllowDockerUserOutboundInit0,
Expand Down
19 changes: 19 additions & 0 deletions v1/providers/shadeform/firewall_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package v1

import (
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

func TestShadeformIPTablesCommandsCreateDockerUserChainBeforeFlush(t *testing.T) {
client := &ShadeformClient{}
commands := strings.Join(client.getIPTablesCommands(), "\n")

createChainIndex := strings.Index(commands, "iptables -N DOCKER-USER")
flushChainIndex := strings.Index(commands, "iptables -F DOCKER-USER")

assert.Greater(t, createChainIndex, -1)
assert.Greater(t, flushChainIndex, createChainIndex)
}
Loading