Add Basic Error Recovery #21

🛡️ Priority: MEDIUM

Labels: enhancement, reliability, error-handling, stability
Estimated Effort: 1 day
Assignee: @aaydin-tr

Problem Description

Currently, Divisor has minimal error recovery mechanisms. If a panic occurs during request processing, it can crash the entire service. There's also limited error handling for:

  • Backend connection failures
  • Malformed requests
  • Resource exhaustion
  • Unexpected runtime errors

This can lead to:

  • Service outages from single request failures
  • Poor error messages for debugging
  • Cascading failures
  • Reduced service reliability

Proposed Solution

Implement basic error recovery mechanisms including:

  1. Panic recovery middleware
  2. Better error handling for backend failures
  3. Circuit breaker pattern for unhealthy backends
  4. Request timeout handling
  5. Improved error logging and monitoring

Implementation Plan

1. Add Panic Recovery Middleware

File: core/types/middleware.go (new file)

package types

import (
    "runtime/debug"
    "time"

    "github.com/valyala/fasthttp"
    "go.uber.org/zap"
)

// PanicRecoveryMiddleware wraps handlers to recover from panics
func PanicRecoveryMiddleware(next fasthttp.RequestHandler) fasthttp.RequestHandler {
    return func(ctx *fasthttp.RequestCtx) {
        defer func() {
            if r := recover(); r != nil {
                stack := debug.Stack()
                
                // Log the panic with stack trace
                zap.S().Errorf("Panic recovered: %v\nStack trace:\n%s", r, string(stack))
                
                // Set error response
                ctx.SetStatusCode(fasthttp.StatusInternalServerError)
                ctx.SetContentType("application/json")
                ctx.SetBodyString(`{"error":"Internal server error","code":"PANIC_RECOVERED"}`)
                
                // Optional: Send to monitoring/alerting
                // metrics.IncrementPanicCounter()
            }
        }()
        
        // Call the next handler
        next(ctx)
    }
}

// TimeoutMiddleware adds request timeout handling
func TimeoutMiddleware(timeout time.Duration) func(fasthttp.RequestHandler) fasthttp.RequestHandler {
    return func(next fasthttp.RequestHandler) fasthttp.RequestHandler {
        return func(ctx *fasthttp.RequestCtx) {
            done := make(chan bool, 1)
            
            go func() {
                next(ctx)
                done <- true
            }()
            
            select {
            case <-done:
                // Request completed normally
                return
            case <-time.After(timeout):
                // Request timed out. Caveat: the handler goroutine is still
                // running and may keep writing to ctx, which can race with
                // the response below. This sketch keeps it simple; see the
                // note on fasthttp.TimeoutHandler after this listing.
                zap.S().Warnf("Request timeout after %v: %s %s", 
                    timeout, ctx.Method(), ctx.RequestURI())
                
                ctx.SetStatusCode(fasthttp.StatusRequestTimeout)
                ctx.SetContentType("application/json")
                ctx.SetBodyString(`{"error":"Request timeout","code":"TIMEOUT"}`)
                return
            }
        }
    }
}
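
Note: fasthttp ships its own timeout wrapper, fasthttp.TimeoutHandler, which avoids the race above by safely discarding late writes from the still-running handler. It responds with 408 and a plain-text message rather than JSON, trading the custom error body for correctness:

// Drop-in alternative to TimeoutMiddleware using fasthttp's built-in
// wrapper; late writes from the still-running handler are discarded.
handler := fasthttp.TimeoutHandler(coreHandler, 30*time.Second, "request timeout")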

2. Update Balancer to Use Middleware

File: core/round-robin/round-robin.go (example implementation)

import (
    "errors"
    "sync/atomic"
    "time"

    "github.com/aaydin-tr/divisor/core/types"
    "github.com/valyala/fasthttp"
    "go.uber.org/zap"
)

func (rr *RoundRobin) Serve() fasthttp.RequestHandler {
    // Create the core handler
    coreHandler := func(ctx *fasthttp.RequestCtx) {
        rr.mutex.RLock()
        defer rr.mutex.RUnlock()

        if len(rr.backends) == 0 {
            ctx.SetStatusCode(fasthttp.StatusServiceUnavailable)
            ctx.SetContentType("application/json")
            ctx.SetBodyString(`{"error":"No healthy backends available","code":"NO_BACKENDS"}`)
            return
        }

        // Get next backend with retry logic
        backend, err := rr.getNextHealthyBackend()
        if err != nil {
            zap.S().Errorf("Failed to get healthy backend: %v", err)
            ctx.SetStatusCode(fasthttp.StatusServiceUnavailable)
            ctx.SetContentType("application/json")
            ctx.SetBodyString(`{"error":"No healthy backends available","code":"NO_HEALTHY_BACKENDS"}`)
            return
        }

        // Attempt to proxy the request
        if err := backend.ReverseProxyHandler(ctx); err != nil {
            zap.S().Errorf("Backend request failed: %v", err)
            
            // Mark backend as potentially unhealthy
            rr.handleBackendError(backend, err)
            
            // Try to retry with another backend
            if retryBackend, retryErr := rr.getNextHealthyBackend(); retryErr == nil && retryBackend != backend {
                zap.S().Info("Retrying request with different backend")
                if retryErr := retryBackend.ReverseProxyHandler(ctx); retryErr == nil {
                    return // Success on retry
                }
            }
            
            // All backends failed
            ctx.SetStatusCode(fasthttp.StatusBadGateway)
            ctx.SetContentType("application/json")
            ctx.SetBodyString(`{"error":"Backend service unavailable","code":"BACKEND_ERROR"}`)
        }
    }

    // Wrap with middleware. Order matters: TimeoutMiddleware must be the
    // outer wrapper so that PanicRecoveryMiddleware runs inside the
    // goroutine it spawns -- recover() only catches panics raised on its
    // own goroutine.
    handler := types.PanicRecoveryMiddleware(coreHandler)
    handler = types.TimeoutMiddleware(30 * time.Second)(handler)
    
    return handler
}

func (rr *RoundRobin) getNextHealthyBackend() (types.IBackend, error) {
    attempts := 0
    maxAttempts := len(rr.backends)
    
    for attempts < maxAttempts {
        next := atomic.AddUint64(&rr.current, 1)
        backend := rr.backends[next%uint64(len(rr.backends))]
        
        if backend.IsAlive() {
            return backend, nil
        }
        
        attempts++
    }
    
    return nil, errors.New("no healthy backends available")
}

func (rr *RoundRobin) handleBackendError(backend types.IBackend, err error) {
    // Simple error counting - mark as unhealthy after multiple failures
    if errorCountable, ok := backend.(interface{ IncrementErrorCount() int }); ok {
        errorCount := errorCountable.IncrementErrorCount()
        if errorCount >= 3 {
            zap.S().Warnf("Marking backend as unhealthy due to %d consecutive errors", errorCount)
            if markable, ok := backend.(interface{ MarkUnhealthy() }); ok {
                markable.MarkUnhealthy()
            }
        }
    }
}
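
handleBackendError implements the simplest form of the circuit breaker listed in the proposal: it opens (marks the backend unhealthy) but never probes for recovery. A fuller breaker could look like the sketch below; all names here (circuitBreaker, Allow, Record) are hypothetical and do not exist in Divisor today:

import (
    "sync"
    "time"
)

// circuitBreaker sketches the pattern: after `threshold` consecutive
// failures the breaker opens for `cooldown`, during which Allow rejects
// requests. A production breaker would add a half-open state that lets a
// single probe request through before fully closing again.
type circuitBreaker struct {
    mu        sync.Mutex
    failures  int
    threshold int           // consecutive failures before opening
    cooldown  time.Duration // how long to stay open
    openUntil time.Time
}

// Allow reports whether a request may be sent to the backend.
func (cb *circuitBreaker) Allow() bool {
    cb.mu.Lock()
    defer cb.mu.Unlock()
    return time.Now().After(cb.openUntil)
}

// Record updates the breaker with the outcome of a request.
func (cb *circuitBreaker) Record(err error) {
    cb.mu.Lock()
    defer cb.mu.Unlock()
    if err == nil {
        cb.failures = 0 // success closes the breaker
        return
    }
    cb.failures++
    if cb.failures >= cb.threshold {
        cb.openUntil = time.Now().Add(cb.cooldown)
        cb.failures = 0
    }
}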

3. Enhance Backend Error Handling

File: internal/proxy/proxy.go

import (
    "errors"
    "sync/atomic"
    "time"
    // fasthttp, zap, and the helper package are already imported in this file.
)

type ProxyClient struct {
    proxy             *fasthttp.HostClient
    totalRequestCount *uint64
    totalResTime      *uint64
    errorCount        *uint64  // NEW
    lastError         *int64   // NEW (timestamp)
    customHeaders     map[string]string
    Addr              string
    addrB             []byte
    useHTTPS          bool
    isHealthy         *int32   // NEW (1 = healthy, 0 = unhealthy)
}
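
Since the new fields are pointers, they must be allocated wherever ProxyClient is constructed, or the atomic calls below will dereference nil. A sketch of the allocation, assuming a constructor along these lines exists (the name NewProxyClient and its signature are assumptions; existing fields are elided):

// Hypothetical constructor sketch: the new counters are allocated up
// front and the backend starts out healthy.
func NewProxyClient(addr string) *ProxyClient {
    var errorCount uint64
    var lastError int64
    isHealthy := int32(1) // 1 = healthy
    return &ProxyClient{
        Addr:       addr,
        errorCount: &errorCount,
        lastError:  &lastError,
        isHealthy:  &isHealthy,
    }
}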

func (h *ProxyClient) ReverseProxyHandler(ctx *fasthttp.RequestCtx) error {
    atomic.AddUint64(h.totalRequestCount, 1)
    start := time.Now()

    req := &ctx.Request
    res := &ctx.Response
    clientIP := helper.S2b(ctx.RemoteIP().String())

    h.preReq(req, clientIP)
    
    // Attempt the request with timeout and error handling
    err := h.doRequestWithRetry(req, res)
    if err != nil {
        h.handleError(err)
        h.serverError(res, err.Error())
        return err
    }
    
    // Reset error count on successful request
    atomic.StoreUint64(h.errorCount, 0)
    atomic.StoreInt32(h.isHealthy, 1)
    
    h.postRes(res)
    atomic.AddUint64(h.totalResTime, uint64(time.Since(start).Milliseconds()))
    
    return nil
}

func (h *ProxyClient) doRequestWithRetry(req *fasthttp.Request, res *fasthttp.Response) error {
    maxRetries := 2
    var lastErr error
    
    for attempt := 0; attempt <= maxRetries; attempt++ {
        err := h.proxy.Do(req, res)
        if err == nil {
            return nil
        }
        
        lastErr = err
        zap.S().Warnf("Backend request attempt %d failed: %v", attempt+1, err)
        
        // Don't retry on certain error types
        if isNonRetryableError(err) {
            break
        }
        
        // Exponential backoff between retries (100ms, 200ms, ...)
        if attempt < maxRetries {
            time.Sleep(time.Duration(100<<uint(attempt)) * time.Millisecond)
        }
    }
    
    return lastErr
}

func (h *ProxyClient) handleError(err error) {
    atomic.AddUint64(h.errorCount, 1)
    atomic.StoreInt64(h.lastError, time.Now().Unix())
    
    errorCount := atomic.LoadUint64(h.errorCount)
    if errorCount >= 3 {
        atomic.StoreInt32(h.isHealthy, 0)
        zap.S().Warnf("Backend %s marked unhealthy after %d errors", h.Addr, errorCount)
    }
}

func isNonRetryableError(err error) bool {
    // Retrying only helps with transient failures. fasthttp.ErrBodyTooLarge,
    // for example, means the response itself exceeds the configured limit,
    // so retrying the same request cannot succeed. Extend this check as
    // further non-retryable error types are identified.
    return errors.Is(err, fasthttp.ErrBodyTooLarge)
}
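
The balancer in section 2 calls backend.IsAlive(); with the new isHealthy flag, that method (assumed here to be the one required by the IBackend interface) reduces to an atomic load:

// IsAlive reports whether the backend is currently considered healthy.
func (h *ProxyClient) IsAlive() bool {
    return atomic.LoadInt32(h.isHealthy) == 1
}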

Testing Plan

Unit Tests

Create core/types/middleware_test.go (the snippets assume testify's assert package):

func TestPanicRecoveryMiddleware(t *testing.T) {
    // Test that panic is caught and returns 500
    handler := PanicRecoveryMiddleware(func(ctx *fasthttp.RequestCtx) {
        panic("test panic")
    })
    
    ctx := &fasthttp.RequestCtx{}
    handler(ctx)
    
    assert.Equal(t, fasthttp.StatusInternalServerError, ctx.Response.StatusCode())
    assert.Contains(t, string(ctx.Response.Body()), "Internal server error")
}

func TestTimeoutMiddleware(t *testing.T) {
    // Test request timeout
    handler := TimeoutMiddleware(100 * time.Millisecond)(func(ctx *fasthttp.RequestCtx) {
        time.Sleep(200 * time.Millisecond) // Longer than timeout
    })
    
    ctx := &fasthttp.RequestCtx{}
    start := time.Now()
    handler(ctx)
    duration := time.Since(start)
    
    assert.Less(t, duration, 150*time.Millisecond)
    assert.Equal(t, fasthttp.StatusRequestTimeout, ctx.Response.StatusCode())
}

Integration Tests

func TestErrorRecoveryIntegration(t *testing.T) {
    // Test full error recovery with real backends
    // - Start divisor with panic-prone backend
    // - Send requests that cause panics
    // - Verify service continues to operate
    // - Verify proper error responses
}
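
A runnable starting point for the first two bullets of the outline above, using fasthttp's in-memory listener so no real backend process is required; the panic-prone backend is simulated directly in the handler, and the test name is an assumption:

import (
    "net"
    "testing"
    "time"

    "github.com/stretchr/testify/assert"
    "github.com/valyala/fasthttp"
    "github.com/valyala/fasthttp/fasthttputil"
)

func TestServiceSurvivesPanic(t *testing.T) {
    handler := PanicRecoveryMiddleware(func(ctx *fasthttp.RequestCtx) {
        if string(ctx.Path()) == "/boom" {
            panic("simulated backend panic")
        }
        ctx.SetStatusCode(fasthttp.StatusOK)
    })

    ln := fasthttputil.NewInmemoryListener()
    defer ln.Close()
    go fasthttp.Serve(ln, handler) //nolint:errcheck

    client := &fasthttp.Client{
        Dial: func(addr string) (net.Conn, error) { return ln.Dial() },
    }

    // A panicking request must yield 500, not kill the service.
    status, _, err := client.GetTimeout(nil, "http://test/boom", time.Second)
    assert.NoError(t, err)
    assert.Equal(t, fasthttp.StatusInternalServerError, status)

    // The service must keep answering subsequent requests.
    status, _, err = client.GetTimeout(nil, "http://test/ok", time.Second)
    assert.NoError(t, err)
    assert.Equal(t, fasthttp.StatusOK, status)
}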

Acceptance Criteria

  • Panic recovery middleware catches and logs panics
  • Service continues operating after panics
  • Request timeout middleware prevents hanging requests
  • Backend retry logic with exponential backoff
  • Error counting and circuit breaker behavior
  • Proper error response formats (JSON)
  • Comprehensive error logging with stack traces
  • Unit tests for all error scenarios
  • Integration tests with real error conditions

Files to Modify

  1. core/types/middleware.go - New panic recovery and timeout middleware
  2. All balancer implementations - Add middleware wrapping
  3. internal/proxy/proxy.go - Enhanced error handling
  4. Tests - Comprehensive error scenario testing

Dependencies: None
Follows: Issue #3 (Graceful Shutdown)
Documentation: Update README with error handling capabilities
