# Add Basic Error Recovery

🛡️ **Priority:** MEDIUM
**Labels:** enhancement, reliability, error-handling, stability
**Estimated Effort:** 1 day
**Assignee:** @aaydin-tr

## Problem Description
Currently, Divisor has minimal error recovery mechanisms. If a panic occurs during request processing, it can crash the entire service. There's also limited error handling for:
- Backend connection failures
- Malformed requests
- Resource exhaustion
- Unexpected runtime errors
This can lead to:
- Service outages from single request failures
- Poor error messages for debugging
- Cascading failures
- Reduced service reliability
## Proposed Solution

Implement basic error recovery mechanisms including:
- Panic recovery middleware
- Better error handling for backend failures
- Circuit breaker pattern for unhealthy backends (a minimal sketch follows this list)
- Request timeout handling
- Improved error logging and monitoring
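The implementation plan below covers the simplest form of the circuit breaker item: consecutive-error counting that marks a backend unhealthy. A fuller circuit breaker also needs a cooldown after which an unhealthy backend is probed again. A minimal sketch of that state machine follows; all names here (`CircuitBreaker`, `Allow`, `RecordFailure`, `RecordSuccess`) are illustrative, not existing Divisor APIs:

```go
package types

import (
	"sync/atomic"
	"time"
)

// CircuitBreaker is an illustrative breaker: closed (healthy), open
// (failing, reject immediately), and half-open (probe after a cooldown).
type CircuitBreaker struct {
	failures  uint64 // consecutive failures
	openUntil int64  // unix nanos until which the circuit stays open
	threshold uint64 // failures needed to open the circuit
	cooldown  time.Duration
}

func NewCircuitBreaker(threshold uint64, cooldown time.Duration) *CircuitBreaker {
	return &CircuitBreaker{threshold: threshold, cooldown: cooldown}
}

// Allow reports whether a request may be sent to the backend. It returns
// true again once the cooldown has elapsed (the half-open probe).
func (cb *CircuitBreaker) Allow() bool {
	return time.Now().UnixNano() >= atomic.LoadInt64(&cb.openUntil)
}

// RecordFailure counts a failure and opens the circuit at the threshold.
func (cb *CircuitBreaker) RecordFailure() {
	if atomic.AddUint64(&cb.failures, 1) >= cb.threshold {
		atomic.StoreInt64(&cb.openUntil, time.Now().Add(cb.cooldown).UnixNano())
		atomic.StoreUint64(&cb.failures, 0)
	}
}

// RecordSuccess resets the failure count and closes the circuit.
func (cb *CircuitBreaker) RecordSuccess() {
	atomic.StoreUint64(&cb.failures, 0)
	atomic.StoreInt64(&cb.openUntil, 0)
}
```

A balancer would check `Allow` before handing a request to a backend, call `RecordFailure` when `ReverseProxyHandler` returns an error, and `RecordSuccess` when it does not.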
## Implementation Plan

### 1. Add Panic Recovery Middleware

**File:** `core/types/middleware.go` (new file)
```go
package types

import (
	"runtime/debug"
	"time"

	"github.com/valyala/fasthttp"
	"go.uber.org/zap"
)

// PanicRecoveryMiddleware wraps handlers to recover from panics
func PanicRecoveryMiddleware(next fasthttp.RequestHandler) fasthttp.RequestHandler {
	return func(ctx *fasthttp.RequestCtx) {
		defer func() {
			if r := recover(); r != nil {
				stack := debug.Stack()
				// Log the panic with stack trace
				zap.S().Errorf("Panic recovered: %v\nStack trace:\n%s", r, string(stack))

				// Set error response
				ctx.SetStatusCode(fasthttp.StatusInternalServerError)
				ctx.SetContentType("application/json")
				ctx.SetBodyString(`{"error":"Internal server error","code":"PANIC_RECOVERED"}`)

				// Optional: Send to monitoring/alerting
				// metrics.IncrementPanicCounter()
			}
		}()

		// Call the next handler
		next(ctx)
	}
}

// TimeoutMiddleware adds request timeout handling.
//
// Caveat: after the timeout fires, the spawned goroutine may still be writing
// to ctx, which races with the timeout response below. fasthttp's built-in
// fasthttp.TimeoutHandler handles this safely; this hand-rolled version is
// kept to make the mechanism explicit.
func TimeoutMiddleware(timeout time.Duration) func(fasthttp.RequestHandler) fasthttp.RequestHandler {
	return func(next fasthttp.RequestHandler) fasthttp.RequestHandler {
		return func(ctx *fasthttp.RequestCtx) {
			done := make(chan bool, 1)

			go func() {
				next(ctx)
				done <- true
			}()

			select {
			case <-done:
				// Request completed normally
				return
			case <-time.After(timeout):
				// Request timed out
				zap.S().Warnf("Request timeout after %v: %s %s",
					timeout, ctx.Method(), ctx.RequestURI())
				ctx.SetStatusCode(fasthttp.StatusRequestTimeout)
				ctx.SetContentType("application/json")
				ctx.SetBodyString(`{"error":"Request timeout","code":"TIMEOUT"}`)
				return
			}
		}
	}
}
```
### 2. Update Balancer to Use Middleware

**File:** `core/round-robin/round-robin.go` (example implementation)
```go
import (
	"errors"
	"sync/atomic"
	"time"

	"github.com/aaydin-tr/divisor/core/types"
	"github.com/valyala/fasthttp"
	"go.uber.org/zap"
)

func (rr *RoundRobin) Serve() fasthttp.RequestHandler {
	// Create the core handler
	coreHandler := func(ctx *fasthttp.RequestCtx) {
		rr.mutex.RLock()
		defer rr.mutex.RUnlock()

		if len(rr.backends) == 0 {
			ctx.SetStatusCode(fasthttp.StatusServiceUnavailable)
			ctx.SetContentType("application/json")
			ctx.SetBodyString(`{"error":"No healthy backends available","code":"NO_BACKENDS"}`)
			return
		}

		// Get next backend with retry logic
		backend, err := rr.getNextHealthyBackend()
		if err != nil {
			zap.S().Errorf("Failed to get healthy backend: %v", err)
			ctx.SetStatusCode(fasthttp.StatusServiceUnavailable)
			ctx.SetContentType("application/json")
			ctx.SetBodyString(`{"error":"No healthy backends available","code":"NO_HEALTHY_BACKENDS"}`)
			return
		}

		// Attempt to proxy the request
		if err := backend.ReverseProxyHandler(ctx); err != nil {
			zap.S().Errorf("Backend request failed: %v", err)

			// Mark backend as potentially unhealthy
			rr.handleBackendError(backend, err)

			// Try to retry with another backend
			if retryBackend, retryErr := rr.getNextHealthyBackend(); retryErr == nil && retryBackend != backend {
				zap.S().Info("Retrying request with different backend")
				if retryErr := retryBackend.ReverseProxyHandler(ctx); retryErr == nil {
					return // Success on retry
				}
			}

			// All backends failed
			ctx.SetStatusCode(fasthttp.StatusBadGateway)
			ctx.SetContentType("application/json")
			ctx.SetBodyString(`{"error":"Backend service unavailable","code":"BACKEND_ERROR"}`)
		}
	}

	// Wrap with middleware: timeout outermost, panic recovery inside it so
	// panics raised in the timeout goroutine are also caught
	handler := types.PanicRecoveryMiddleware(coreHandler)
	handler = types.TimeoutMiddleware(30 * time.Second)(handler)
	return handler
}

func (rr *RoundRobin) getNextHealthyBackend() (types.IBackend, error) {
	attempts := 0
	maxAttempts := len(rr.backends)

	for attempts < maxAttempts {
		next := atomic.AddUint64(&rr.current, 1)
		backend := rr.backends[next%uint64(len(rr.backends))]
		if backend.IsAlive() {
			return backend, nil
		}
		attempts++
	}
	return nil, errors.New("no healthy backends available")
}

func (rr *RoundRobin) handleBackendError(backend types.IBackend, err error) {
	// Simple error counting - mark as unhealthy after multiple failures.
	// The methods these assertions expect are sketched after this block.
	if errorCountable, ok := backend.(interface{ IncrementErrorCount() int }); ok {
		errorCount := errorCountable.IncrementErrorCount()
		if errorCount >= 3 {
			zap.S().Warnf("Marking backend as unhealthy due to %d consecutive errors", errorCount)
			if markable, ok := backend.(interface{ MarkUnhealthy() }); ok {
				markable.MarkUnhealthy()
			}
		}
	}
}
```
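Note that the type assertions in `handleBackendError` only fire if the backend actually implements `IncrementErrorCount` and `MarkUnhealthy`; neither exists in Divisor today. Using the NEW fields added to `ProxyClient` in step 3 below, they could look like this (a sketch, not the final API):

```go
// IncrementErrorCount atomically bumps the consecutive-error counter and
// returns the new value, matching the interface asserted in handleBackendError.
func (h *ProxyClient) IncrementErrorCount() int {
	return int(atomic.AddUint64(h.errorCount, 1))
}

// MarkUnhealthy flips the health flag so the balancer stops selecting this backend.
func (h *ProxyClient) MarkUnhealthy() {
	atomic.StoreInt32(h.isHealthy, 0)
}
```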
### 3. Enhance Backend Error Handling

**File:** `internal/proxy/proxy.go`
```go
import (
	"errors" // used once isNonRetryableError is fleshed out (see below)
	"sync/atomic"
	"time"

	"github.com/valyala/fasthttp"
	"go.uber.org/zap"
	// helper (for helper.S2b) keeps its existing project-internal import
)

type ProxyClient struct {
	proxy             *fasthttp.HostClient
	totalRequestCount *uint64
	totalResTime      *uint64
	errorCount        *uint64 // NEW
	lastError         *int64  // NEW (timestamp)
	customHeaders     map[string]string
	Addr              string
	addrB             []byte
	useHTTPS          bool
	isHealthy         *int32 // NEW (1 = healthy, 0 = unhealthy)
}

func (h *ProxyClient) ReverseProxyHandler(ctx *fasthttp.RequestCtx) error {
	atomic.AddUint64(h.totalRequestCount, 1)
	start := time.Now()

	req := &ctx.Request
	res := &ctx.Response
	clientIP := helper.S2b(ctx.RemoteIP().String())
	h.preReq(req, clientIP)

	// Attempt the request with timeout and error handling
	err := h.doRequestWithRetry(req, res)
	if err != nil {
		h.handleError(err)
		h.serverError(res, err.Error())
		return err
	}

	// Reset error count on successful request
	atomic.StoreUint64(h.errorCount, 0)
	atomic.StoreInt32(h.isHealthy, 1)

	h.postRes(res)
	atomic.AddUint64(h.totalResTime, uint64(time.Since(start).Milliseconds()))
	return nil
}

func (h *ProxyClient) doRequestWithRetry(req *fasthttp.Request, res *fasthttp.Response) error {
	maxRetries := 2
	var lastErr error

	for attempt := 0; attempt <= maxRetries; attempt++ {
		err := h.proxy.Do(req, res)
		if err == nil {
			return nil
		}

		lastErr = err
		zap.S().Warnf("Backend request attempt %d failed: %v", attempt+1, err)

		// Don't retry on certain error types
		if isNonRetryableError(err) {
			break
		}

		// Small delay between retries
		if attempt < maxRetries {
			time.Sleep(time.Duration(attempt+1) * 100 * time.Millisecond)
		}
	}
	return lastErr
}

func (h *ProxyClient) handleError(err error) {
	atomic.AddUint64(h.errorCount, 1)
	atomic.StoreInt64(h.lastError, time.Now().Unix())

	errorCount := atomic.LoadUint64(h.errorCount)
	if errorCount >= 3 {
		atomic.StoreInt32(h.isHealthy, 0)
		zap.S().Warnf("Backend %s marked unhealthy after %d errors", h.Addr, errorCount)
	}
}

func isNonRetryableError(err error) bool {
	// Don't retry on certain errors like invalid responses, etc.
	// Implement based on specific error types; one option is sketched below.
	return false
}
```
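The `isNonRetryableError` stub can be filled in with fasthttp's exported sentinel errors. Which failures deserve a retry is a policy decision, so the classification below is only one reasonable assumption:

```go
// isNonRetryableError: one possible classification using fasthttp's exported
// errors. Timeouts and connection-level failures are usually worth retrying;
// errors caused by the request itself are not.
func isNonRetryableError(err error) bool {
	switch {
	case errors.Is(err, fasthttp.ErrTimeout),
		errors.Is(err, fasthttp.ErrConnectionClosed),
		errors.Is(err, fasthttp.ErrNoFreeConns):
		return false // transient: a retry may succeed
	case errors.Is(err, fasthttp.ErrBodyTooLarge):
		return true // the request itself is the problem
	default:
		return false // unknown errors default to retryable here
	}
}
```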
## Testing Plan

### Unit Tests

Create `core/types/middleware_test.go`:
```go
package types

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/valyala/fasthttp"
)

func TestPanicRecoveryMiddleware(t *testing.T) {
	// Test that a panic is caught and turned into a 500 response
	handler := PanicRecoveryMiddleware(func(ctx *fasthttp.RequestCtx) {
		panic("test panic")
	})

	ctx := &fasthttp.RequestCtx{}
	handler(ctx)

	assert.Equal(t, fasthttp.StatusInternalServerError, ctx.Response.StatusCode())
	assert.Contains(t, string(ctx.Response.Body()), "Internal server error")
}

func TestTimeoutMiddleware(t *testing.T) {
	// Test that a slow handler triggers the timeout response
	handler := TimeoutMiddleware(100 * time.Millisecond)(func(ctx *fasthttp.RequestCtx) {
		time.Sleep(200 * time.Millisecond) // Longer than timeout
	})

	ctx := &fasthttp.RequestCtx{}
	start := time.Now()
	handler(ctx)
	duration := time.Since(start)

	assert.Less(t, duration, 150*time.Millisecond)
	assert.Equal(t, fasthttp.StatusRequestTimeout, ctx.Response.StatusCode())
}
```
### Integration Tests
```go
func TestErrorRecoveryIntegration(t *testing.T) {
	// Test full error recovery with real backends:
	// - Start divisor with a panic-prone backend
	// - Send requests that cause panics
	// - Verify the service continues to operate
	// - Verify proper error responses
}
```
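The panic-recovery half of this can be exercised without real sockets via fasthttp's `fasthttputil.InmemoryListener`. Below is a minimal sketch assuming only the middleware from step 1; the test name, handler, and `Addr` value are illustrative, not existing Divisor code:

```go
package types

import (
	"net"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/valyala/fasthttp"
	"github.com/valyala/fasthttp/fasthttputil"
)

// Sketch: the service keeps serving after a panic. The first request hits a
// handler that panics; the second must still succeed.
func TestServiceSurvivesPanic(t *testing.T) {
	calls := 0
	handler := PanicRecoveryMiddleware(func(ctx *fasthttp.RequestCtx) {
		calls++
		if calls == 1 {
			panic("boom")
		}
		ctx.SetStatusCode(fasthttp.StatusOK)
	})

	ln := fasthttputil.NewInmemoryListener()
	defer ln.Close()
	go fasthttp.Serve(ln, handler) //nolint:errcheck

	client := &fasthttp.HostClient{
		Addr: "test",
		Dial: func(addr string) (net.Conn, error) { return ln.Dial() },
	}

	req := fasthttp.AcquireRequest()
	res := fasthttp.AcquireResponse()
	defer fasthttp.ReleaseRequest(req)
	defer fasthttp.ReleaseResponse(res)
	req.SetRequestURI("http://test/")

	// First request panics inside the handler but comes back as a 500
	assert.NoError(t, client.Do(req, res))
	assert.Equal(t, fasthttp.StatusInternalServerError, res.StatusCode())

	// Second request proves the server is still alive
	res.Reset()
	assert.NoError(t, client.Do(req, res))
	assert.Equal(t, fasthttp.StatusOK, res.StatusCode())
}
```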
## Acceptance Criteria

- [ ] A panic during request processing returns a 500 response instead of crashing the service
- [ ] Requests exceeding the configured timeout return 408 with a JSON error body
- [ ] Failed backend requests are retried on a different backend where possible
- [ ] Backends are marked unhealthy after repeated consecutive errors
- [ ] Panics, timeouts, and backend failures are logged with enough context for debugging
## Files to Modify

- `core/types/middleware.go` - New panic recovery and timeout middleware
- All balancer implementations - Add middleware wrapping
- `internal/proxy/proxy.go` - Enhanced error handling
- Tests - Comprehensive error scenario testing
**Dependencies:** None
**Follows:** Issue #3 (Graceful Shutdown)
**Documentation:** Update README with error handling capabilities