diff --git a/server/cmd/api/api/chromium.go b/server/cmd/api/api/chromium.go index 0fbda144..79c991e5 100644 --- a/server/cmd/api/api/chromium.go +++ b/server/cmd/api/api/chromium.go @@ -333,6 +333,11 @@ func writeChromiumFlags(tokens []string) error { // restartChromiumAndWait restarts Chromium via supervisorctl and waits for DevTools to be ready. // Returns an error if the restart fails or times out. +// +// On success, emits a chromium_restart_complete log line decomposing the wall clock into +// supervisorctl_ms (time for supervisorctl restart to return) and post_supervisorctl_ms +// (gap between supervisorctl returning and a new DevTools URL appearing). Combined with +// launcher_total_ms from the chromium-launcher event, this localises slow restarts. func (s *ApiService) restartChromiumAndWait(ctx context.Context, operation string) error { log := logger.FromContext(ctx) start := time.Now() @@ -341,32 +346,61 @@ func (s *ApiService) restartChromiumAndWait(ctx context.Context, operation strin updates, cancelSub := s.upstreamMgr.Subscribe() defer cancelSub() + type restartResult struct { + elapsedMs int64 + err error + } + // Run supervisorctl restart with a new context to let it run beyond the lifetime of the http request. // This lets us return as soon as the DevTools URL is updated. - errCh := make(chan error, 1) + resultCh := make(chan restartResult, 1) log.Info("restarting chromium via supervisorctl", "operation", operation) go func() { cmdCtx, cancelCmd := context.WithTimeout(context.WithoutCancel(ctx), 1*time.Minute) defer cancelCmd() + cmdStart := time.Now() out, err := exec.CommandContext(cmdCtx, "supervisorctl", "-c", "/etc/supervisor/supervisord.conf", "restart", "chromium").CombinedOutput() if err != nil { log.Error("failed to restart chromium", "error", err, "out", string(out)) - errCh <- fmt.Errorf("supervisorctl restart failed: %w", err) } + resultCh <- restartResult{elapsedMs: time.Since(cmdStart).Milliseconds(), err: err} }() - // Wait for either a new upstream, a restart error, or timeout + var supervisorctlMs int64 = -1 + var supervisorctlDoneAt time.Time timeout := time.NewTimer(15 * time.Second) defer timeout.Stop() - select { - case <-updates: - log.Info("devtools ready", "operation", operation, "elapsed", time.Since(start).String()) - return nil - case err := <-errCh: - return err - case <-timeout.C: - log.Info("devtools not ready in time", "operation", operation, "elapsed", time.Since(start).String()) - return fmt.Errorf("devtools not ready in time") + for { + select { + case r := <-resultCh: + if r.err != nil { + return fmt.Errorf("supervisorctl restart failed: %w", r.err) + } + supervisorctlMs = r.elapsedMs + supervisorctlDoneAt = time.Now() + // Drop resultCh from select so we keep waiting for updates without re-firing. + resultCh = nil + case url := <-updates: + postSupervisorMs := int64(-1) + if !supervisorctlDoneAt.IsZero() { + postSupervisorMs = time.Since(supervisorctlDoneAt).Milliseconds() + } + log.Info("chromium_restart_complete", + "operation", operation, + "supervisorctl_ms", supervisorctlMs, + "post_supervisorctl_ms", postSupervisorMs, + "total_ms", time.Since(start).Milliseconds(), + "first_devtools_url_len", len(url), + ) + return nil + case <-timeout.C: + log.Info("chromium_restart_timeout", + "operation", operation, + "elapsed", time.Since(start).String(), + "supervisorctl_ms", supervisorctlMs, + ) + return fmt.Errorf("devtools not ready in time") + } } } diff --git a/server/cmd/chromium-launcher/main.go b/server/cmd/chromium-launcher/main.go index e765b4d8..6e41abae 100644 --- a/server/cmd/chromium-launcher/main.go +++ b/server/cmd/chromium-launcher/main.go @@ -1,13 +1,16 @@ package main import ( + "bufio" "context" "flag" "fmt" + "log/slog" "net" "os" "os/exec" "path/filepath" + "strconv" "strings" "syscall" "time" @@ -16,6 +19,8 @@ import ( ) func main() { + launcherStart := time.Now() + headless := flag.Bool("headless", false, "Run Chromium with headless flags") chromiumPath := flag.String("chromium", "chromium", "Chromium binary path (default: chromium)") runtimeFlagsPath := flag.String("runtime-flags", "/chromium/flags", "Path to runtime flags overlay file") @@ -31,7 +36,7 @@ func main() { // This is necessary because supervisord's stopwaitsecs=0 doesn't wait for // the old process to fully die before starting the new one, which can cause // the new process to fall back to IPv6 while the old one holds IPv4. - killExistingChromium() + killStats := killExistingChromium() // Inputs internalPort := strings.TrimSpace(os.Getenv("INTERNAL_PORT")) @@ -39,8 +44,14 @@ func main() { internalPort = "9223" } + // Parse devtools port for /proc/net/tcp lookups; default to 9223 on parse failure. + internalPortNum := uint16(9223) + if n, err := strconv.ParseUint(internalPort, 10, 16); err == nil { + internalPortNum = uint16(n) + } + // Wait for devtools port to be available (handles SIGKILL socket cleanup delay) - waitForPort(internalPort, 5*time.Second) + waitportStats := waitForPort(internalPort, 5*time.Second) baseFlags := os.Getenv("CHROMIUM_FLAGS") runtimeTokens, err := chromiumflags.ReadOptionalFlagFile(*runtimeFlagsPath) @@ -77,6 +88,8 @@ func main() { "DBUS_SESSION_BUS_ADDRESS=unix:path=/run/dbus/system_bus_socket", ) + emitPhaseCompleteEvent(launcherStart, *headless, internalPortNum, killStats, waitportStats) + if runAsRoot { // Replace current process with Chromium if p, err := execLookPath(*chromiumPath); err == nil { @@ -125,12 +138,23 @@ func execLookPath(file string) (string, error) { return exec.LookPath(file) } +// waitportStats summarizes a single waitForPort invocation for the +// chromium_launcher_phase_complete instrumentation event. +type waitportStats struct { + elapsedMs int64 + attempts int + timedOut bool + lastErr string +} + // waitForPort waits until the given port is available for binding on IPv4. // This handles the delay after SIGKILL before the kernel releases the socket. // We disable SO_REUSEADDR to get an accurate check matching chromium's bind behavior. // Only IPv4 is checked because IPv6 is disabled at the kernel level in the VM. -func waitForPort(port string, timeout time.Duration) { - deadline := time.Now().Add(timeout) +func waitForPort(port string, timeout time.Duration) waitportStats { + stats := waitportStats{} + start := time.Now() + deadline := start.Add(timeout) addrs := []string{"127.0.0.1:" + port} // ListenConfig with Control to disable SO_REUSEADDR for accurate port availability check @@ -150,42 +174,224 @@ func waitForPort(port string, timeout time.Duration) { ctx := context.Background() for time.Now().Before(deadline) { + stats.attempts++ allFree := true for _, addr := range addrs { ln, err := lc.Listen(ctx, "tcp", addr) if err != nil { + stats.lastErr = err.Error() allFree = false break } ln.Close() } if allFree { - return + stats.elapsedMs = time.Since(start).Milliseconds() + return stats } time.Sleep(50 * time.Millisecond) } // Timeout reached, proceed anyway and let chromium report the error + stats.elapsedMs = time.Since(start).Milliseconds() + stats.timedOut = true + return stats +} + +// killStats summarizes a single killExistingChromium invocation for the +// chromium_launcher_phase_complete instrumentation event. +type killStats struct { + pkillElapsedMs int64 + pgrepWaitElapsedMs int64 + pgrepAttempts int + timedOut bool } // killExistingChromium kills any existing chromium browser processes and waits for them to die. // This ensures a clean restart where the new process can bind to IPv4. // Note: We use -x for exact match to avoid killing chromium-launcher itself. -func killExistingChromium() { +func killExistingChromium() killStats { + stats := killStats{} + // Kill chromium processes by exact name match. // Using -x prevents matching "chromium-launcher" which would kill this process. + pkillStart := time.Now() _ = exec.Command("pkill", "-9", "-x", "chromium").Run() + stats.pkillElapsedMs = time.Since(pkillStart).Milliseconds() // Wait up to 2 seconds for processes to fully terminate - deadline := time.Now().Add(2 * time.Second) + pgrepStart := time.Now() + deadline := pgrepStart.Add(2 * time.Second) for time.Now().Before(deadline) { + stats.pgrepAttempts++ // Check if any chromium browser processes are still running (exact match) output, err := exec.Command("pgrep", "-x", "chromium").Output() if err != nil || len(strings.TrimSpace(string(output))) == 0 { // No processes found, we're done - return + stats.pgrepWaitElapsedMs = time.Since(pgrepStart).Milliseconds() + return stats } time.Sleep(100 * time.Millisecond) } // Timeout - processes may still exist but we continue anyway + stats.pgrepWaitElapsedMs = time.Since(pgrepStart).Milliseconds() + stats.timedOut = true fmt.Fprintf(os.Stderr, "warning: chromium processes may still be running after kill attempt\n") + return stats +} + +// readVMUptimeMs returns the VM uptime in milliseconds by parsing /proc/uptime. +// Returns 0 on any error. Read-only, fail-soft. +func readVMUptimeMs() int64 { + data, err := os.ReadFile("/proc/uptime") + if err != nil { + return 0 + } + fields := strings.Fields(string(data)) + if len(fields) < 1 { + return 0 + } + secs, err := strconv.ParseFloat(fields[0], 64) + if err != nil { + return 0 + } + return int64(secs * 1000) +} + +// readLoadAvg1m returns the 1-minute load average from /proc/loadavg. +// Returns 0 on any error. Read-only, fail-soft. +func readLoadAvg1m() float64 { + data, err := os.ReadFile("/proc/loadavg") + if err != nil { + return 0 + } + fields := strings.Fields(string(data)) + if len(fields) < 1 { + return 0 + } + v, err := strconv.ParseFloat(fields[0], 64) + if err != nil { + return 0 + } + return v +} + +// readMemInfo returns MemAvailable / Cached / Dirty in kB from /proc/meminfo. +// Any field that fails to parse returns 0. Read-only, fail-soft. +func readMemInfo() (availKb, cachedKb, dirtyKb int64) { + availKb, cachedKb, dirtyKb = parseMemInfo("/proc/meminfo") + return +} + +func parseMemInfo(path string) (availKb, cachedKb, dirtyKb int64) { + f, err := os.Open(path) + if err != nil { + return + } + defer f.Close() + sc := bufio.NewScanner(f) + for sc.Scan() { + line := sc.Text() + idx := strings.Index(line, ":") + if idx < 0 { + continue + } + key := line[:idx] + rest := strings.Fields(line[idx+1:]) + if len(rest) == 0 { + continue + } + val, err := strconv.ParseInt(rest[0], 10, 64) + if err != nil { + continue + } + switch key { + case "MemAvailable": + availKb = val + case "Cached": + cachedKb = val + case "Dirty": + dirtyKb = val + } + } + return +} + +// countSocketStates returns counts of ESTABLISHED / TIME_WAIT / LISTEN sockets +// on 127.0.0.1:port by parsing /proc/net/tcp. Returns zeros on any error. +// /proc/net/tcp local_address column encodes IP and port as hex; the port is +// big-endian uppercase hex (e.g. 9223 = 0x2407 = ":2407"). +func countSocketStates(port uint16) (estab, timewait, listen int) { + estab, timewait, listen = parseSocketStates("/proc/net/tcp", port) + return +} + +func parseSocketStates(path string, port uint16) (estab, timewait, listen int) { + f, err := os.Open(path) + if err != nil { + return + } + defer f.Close() + portSuffix := fmt.Sprintf(":%04X", port) + sc := bufio.NewScanner(f) + headerSkipped := false + for sc.Scan() { + if !headerSkipped { + headerSkipped = true + continue + } + fields := strings.Fields(sc.Text()) + if len(fields) < 4 { + continue + } + if !strings.HasSuffix(fields[1], portSuffix) { + continue + } + switch fields[3] { + case "01": + estab++ + case "06": + timewait++ + case "0A": + listen++ + } + } + return +} + +// emitPhaseCompleteEvent writes a single chromium_launcher_phase_complete JSON +// log line to stderr right before exec'ing chromium. This is the entry point +// for SigNoz aggregation against vm-logs / s2.stream. +func emitPhaseCompleteEvent( + launcherStart time.Time, + headless bool, + internalPortNum uint16, + ks killStats, + wp waitportStats, +) { + estab, timewait, listen := countSocketStates(internalPortNum) + availKb, cachedKb, dirtyKb := readMemInfo() + logger := slog.New(slog.NewJSONHandler(os.Stderr, nil)) + logger.Info("chromium_launcher_phase_complete", + "instance_name", strings.TrimSpace(os.Getenv("INSTANCE_NAME")), + "headless", headless, + "internal_port", internalPortNum, + "pkill_elapsed_ms", ks.pkillElapsedMs, + "pgrep_wait_elapsed_ms", ks.pgrepWaitElapsedMs, + "pgrep_attempts", ks.pgrepAttempts, + "pgrep_timed_out", ks.timedOut, + "waitport_elapsed_ms", wp.elapsedMs, + "waitport_attempts", wp.attempts, + "waitport_timed_out", wp.timedOut, + "waitport_last_err", wp.lastErr, + "ss_estab_devtools", estab, + "ss_timewait_devtools", timewait, + "ss_listen_devtools", listen, + "vm_uptime_ms", readVMUptimeMs(), + "load_avg_1m", readLoadAvg1m(), + "mem_available_kb", availKb, + "mem_cached_kb", cachedKb, + "mem_dirty_kb", dirtyKb, + "launcher_total_ms", time.Since(launcherStart).Milliseconds(), + "exec_started_unix_ms", time.Now().UnixMilli(), + ) } diff --git a/server/cmd/chromium-launcher/main_test.go b/server/cmd/chromium-launcher/main_test.go index e8f19d35..12ca7f75 100644 --- a/server/cmd/chromium-launcher/main_test.go +++ b/server/cmd/chromium-launcher/main_test.go @@ -5,6 +5,9 @@ import ( "os/exec" "path/filepath" "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestExecLookPath(t *testing.T) { @@ -35,3 +38,87 @@ func TestExecLookPath(t *testing.T) { } } +func writeFixture(t *testing.T, contents string) string { + t.Helper() + dir := t.TempDir() + p := filepath.Join(dir, "fixture") + require.NoError(t, os.WriteFile(p, []byte(contents), 0o644)) + return p +} + +func TestParseSocketStates(t *testing.T) { + // Real /proc/net/tcp format. Port 9223 = 0x2407, port 8888 = 0x22B8. + // state column (4th): 01=ESTABLISHED, 06=TIME_WAIT, 0A=LISTEN. + fixture := " sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n" + + " 0: 0100007F:2407 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 1 0 0\n" + + " 1: 0100007F:2407 0100007F:E1A4 01 00000000:00000000 00:00000000 00000000 0 0 2 0 0\n" + + " 2: 0100007F:2407 0100007F:E1A5 01 00000000:00000000 00:00000000 00000000 0 0 3 0 0\n" + + " 3: 0100007F:2407 0100007F:E1A6 06 00000000:00000000 00:00000000 00000000 0 0 4 0 0\n" + + " 4: 0100007F:22B8 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 5 0 0\n" + path := writeFixture(t, fixture) + + estab, timewait, listen := parseSocketStates(path, 9223) + assert.Equal(t, 2, estab, "ESTABLISHED count for port 9223") + assert.Equal(t, 1, timewait, "TIME_WAIT count for port 9223") + assert.Equal(t, 1, listen, "LISTEN count for port 9223") + + // Port 8888 only has one LISTEN socket. + estab, timewait, listen = parseSocketStates(path, 8888) + assert.Equal(t, 0, estab) + assert.Equal(t, 0, timewait) + assert.Equal(t, 1, listen) + + // Unknown port returns zeros. + estab, timewait, listen = parseSocketStates(path, 1234) + assert.Equal(t, 0, estab) + assert.Equal(t, 0, timewait) + assert.Equal(t, 0, listen) +} + +func TestParseSocketStatesMissingFile(t *testing.T) { + estab, timewait, listen := parseSocketStates("/does/not/exist", 9223) + assert.Equal(t, 0, estab) + assert.Equal(t, 0, timewait) + assert.Equal(t, 0, listen) +} + +func TestParseMemInfo(t *testing.T) { + fixture := "MemTotal: 2048000 kB\n" + + "MemFree: 512000 kB\n" + + "MemAvailable: 1024000 kB\n" + + "Buffers: 20000 kB\n" + + "Cached: 400000 kB\n" + + "Dirty: 5000 kB\n" + + "Writeback: 0 kB\n" + path := writeFixture(t, fixture) + + avail, cached, dirty := parseMemInfo(path) + assert.Equal(t, int64(1024000), avail) + assert.Equal(t, int64(400000), cached) + assert.Equal(t, int64(5000), dirty) +} + +func TestParseMemInfoMissingFile(t *testing.T) { + avail, cached, dirty := parseMemInfo("/does/not/exist") + assert.Equal(t, int64(0), avail) + assert.Equal(t, int64(0), cached) + assert.Equal(t, int64(0), dirty) +} + +func TestParseMemInfoMissingFields(t *testing.T) { + fixture := "MemTotal: 2048000 kB\n" + + "SomeOtherField: 12345 kB\n" + path := writeFixture(t, fixture) + + avail, cached, dirty := parseMemInfo(path) + assert.Equal(t, int64(0), avail) + assert.Equal(t, int64(0), cached) + assert.Equal(t, int64(0), dirty) +} + +func TestReadVMUptimeMsAndLoadAvg(t *testing.T) { + // These read real /proc on Linux test runners. On non-Linux they return 0. + // We just exercise the code path; values may be 0 in CI containers without /proc. + _ = readVMUptimeMs() + _ = readLoadAvg1m() +}