Commit 0e20681: Merge branch 'feat/sentry-integration'
2 parents 453ca09 + f28c452
8 files changed: 250 additions & 59 deletions

README.md

Lines changed: 128 additions & 0 deletions
@@ -223,6 +223,134 @@ Following implementation requires MongoDB v4.2 or higher.
 ProxyPassReverse /fdsnws/availability/1 <HOST>:9001 timeout=600
 ```
 
+## Performance Tuning
+
+### Gunicorn Workers Configuration
+
+The number of Gunicorn workers directly affects how many concurrent requests your service can handle. The default configuration uses **1 worker** for maximum stability on resource-constrained servers.
+
+#### Current Configuration (docker-compose.yml)
+
+```yaml
+command: gunicorn --bind 0.0.0.0:9001 --workers 1 start:app
+```
+
+#### Adjusting Worker Count
+
+**For servers with limited resources or thread creation issues:**
+
+```yaml
+# Minimum configuration (most stable)
+command: gunicorn --bind 0.0.0.0:9001 --workers 1 --timeout 600 start:app
+```
+
+**For servers with moderate resources:**
+
+```yaml
+# 2-3 workers (recommended for most deployments)
+command: gunicorn --bind 0.0.0.0:9001 --workers 2 --timeout 600 start:app
+```
+
+**For high-performance servers:**
+
+```yaml
+# Formula: (2 × CPU cores) + 1
+# Example for a 4-core server: --workers 9
+command: gunicorn --bind 0.0.0.0:9001 --workers 9 --timeout 600 start:app
+```
+
+#### Important Notes
+
+1. **Each worker is a separate process** with its own memory footprint.
+2. **More workers ≠ always better**: too many workers can exhaust system resources.
+3. **Monitor for errors** after increasing workers:
+   ```bash
+   docker logs -f fdsnws-availability-api
+   # Watch for "pthread_create failed" or similar errors
+   ```
+4. **Resource usage check:**
+   ```bash
+   docker stats fdsnws-availability-api
+   # If CPU < 80% and memory is available, you can add more workers
+   ```
+
+### MongoDB Connection Pool
+
+The MongoDB connection pool is configured in `apps/wfcatalog_client.py`:
+
+```python
+maxPoolSize=1  # Connections per worker
+```
+
+#### How It Works
+
+- **Each Gunicorn worker** has its own MongoDB client
+- **Total connections** = `workers × maxPoolSize`
+- **Example:** 2 workers × 1 pool = 2 total MongoDB connections
+
+#### When to Adjust
+
+**Keep `maxPoolSize=1` if:**
+
+- ✅ Using sync workers (default Gunicorn configuration)
+- ✅ Each worker handles one request at a time
+- ✅ Server has resource constraints
+
+**Increase `maxPoolSize` only if:**
+
+- Using async workers (gevent/eventlet)
+- Using threading within workers
+- MongoDB is a bottleneck (check with profiling)
+
+#### Example Configurations
+
+| Workers | maxPoolSize | Total Connections | Use Case          |
+|---------|-------------|-------------------|-------------------|
+| 1       | 1           | 1                 | Minimal (default) |
+| 2       | 1           | 2                 | Recommended       |
+| 4       | 1           | 4                 | High performance  |
+| 2       | 5           | 10                | Async workers     |
+
+### Thread Limiting (Important!)
+
+The configuration includes thread limits to prevent `pthread_create failed` errors on restricted servers:
+
+```yaml
+environment:
+  OPENBLAS_NUM_THREADS: 1
+  MKL_NUM_THREADS: 1
+  NUMEXPR_NUM_THREADS: 1
+  OMP_NUM_THREADS: 1
+```
+
+**Do not remove these** unless you are certain your server can handle multiple threads per process. They prevent NumPy/ObsPy from spawning excessive threads.
+
+### Troubleshooting
+
+**Problem:** Service crashes with "pthread_create failed"
+- **Solution:** Reduce workers to 1 and keep the thread limits in place
+
+**Problem:** Slow response times under load
+- **Solution:** Increase workers (if resources allow) and monitor with `docker stats`
+
+**Problem:** High memory usage
+- **Solution:** Reduce workers and check for memory leaks with profiling
+
+**Problem:** MongoDB connection errors
+- **Solution:** Check total connections (workers × maxPoolSize) against MongoDB limits
+
+### Performance Monitoring
+
+See `tests/performance/` for profiling and benchmarking tools:
+
+```bash
+# Quick performance test
+bash tests/performance/quick_test.sh
+
+# Detailed profiling
+python tests/performance/profiler.py
+
+# Load testing
+locust -f tests/performance/locustfile.py --host=http://localhost:9001
+```
+
+For more details, see [Performance Analysis Plan](tests/performance/README.md).
+
 ## Running in development environment
 
 1. Go to the root directory.
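The `(2 × CPU cores) + 1` worker formula from the Performance Tuning section above can be checked with a short standalone sketch (the `recommended_workers` helper name is ours, not part of the repository):

```python
import os

def recommended_workers() -> int:
    """Gunicorn worker count per the (2 x CPU cores) + 1 rule of thumb."""
    # os.cpu_count() may return None in restricted environments;
    # fall back to 1 core, which yields a conservative 3 workers.
    cores = os.cpu_count() or 1
    return 2 * cores + 1
```

On a 4-core server this returns 9, matching the high-performance example above.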

apps/data_access_layer.py

Lines changed: 21 additions & 37 deletions
@@ -196,43 +196,27 @@ def sort_records(params: dict, data: list[list[Any]]) -> None:
     elif params["orderby"] == "latestupdate_desc":
         data.sort(key=lambda x: x[UPDATED], reverse=True)
     else:
-        # Default sorting: NSLC, Time, Quality, SampleRate
-        # We sort by multiple keys in reverse priority (Python sort is stable)
-
-        # 1. Sort by Quality and SampleRate
-        data.sort(key=lambda x: (x[QUALITY], x[SAMPLERATE]))
-        # 2. Sort by Time (Start, End) - descending? Wait, original code had reverse=True for time?
-        # Original: data.sort(key=lambda x: (x[START], x[END]), reverse=True)
-        # But usually we want ascending time?
-        # Let's check original logic carefully.
-
-        # Original Lines:
-        # 200: data.sort(key=lambda x: (x[QUALITY], x[SAMPLERATE]))
-        # 201: data.sort(key=lambda x: (x[START], x[END]), reverse=True)
-        # 202: data.sort(key=lambda x: x[:QUALITY])
-
-        # Line 202 sorts by first 4 columns (Net, Sta, Loc, Cha).
-        # Since Python sort is stable, previous sorts within those groups are preserved.
-
-        # Line 201 sorted by Time DESCENDING? That seems odd for a time series.
-        # But if we want Earliest to Latest, it should be Ascending.
-        # Maybe reverse=True was a bug or specific requirement?
-        # The user's query example showed 2023-11-23 then 2023-11-22, which is DESCENDING.
-        # If the user wants standard time order, it should be Ascending.
-
-        # Let's KEEP original logic for now, but ensure it runs.
-        # WAIT, if 201 is reverse=True, then data is sorted Time DESCENDING?
-        # Let's verify what `nslc_time_quality_samplerate` implies. "ordered by ... time ..." usually means Ascending.
-
-        # If I change reverse=True to False, I might break expected behavior if descending was intended.
-        # But "Earliest" column usually suggests ascending.
-
-        # Let's stick to the minimal fix: remove the surrounding IF, keep the logic the same.
-
-        data.sort(key=lambda x: (x[QUALITY], x[SAMPLERATE]))
-        # 2. Sort by Time (Start, End) - Ascending
-        data.sort(key=lambda x: (x[START], x[END]), reverse=False)
-        data.sort(key=lambda x: x[:QUALITY])
+        # Default sorting: NSLC (Network, Station, Location, Channel),
+        # then Time (Start, End), then Quality, then SampleRate
+        #
+        # OPTIMIZATION: Use a single sort with a compound key instead of
+        # 3 separate sorts. This is more efficient and clearer than relying
+        # on stable sort behavior.
+        #
+        # Sort order:
+        #   1. Network, Station, Location, Channel (x[0], x[1], x[2], x[3])
+        #   2. Start time, End time (x[START], x[END])
+        #   3. Quality (x[QUALITY])
+        #   4. Sample rate (x[SAMPLERATE])
+        data.sort(key=lambda x: (
+            x[0],           # Network
+            x[1],           # Station
+            x[2],           # Location
+            x[3],           # Channel
+            x[START],       # Start time (ascending - earliest first)
+            x[END],         # End time
+            x[QUALITY],     # Quality
+            x[SAMPLERATE],  # Sample rate
+        ))
 
 
 # else:
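The compound-key sort introduced in this diff can be exercised on sample rows. The column indices below are assumptions for illustration; in the repository, `START`, `END`, `QUALITY`, and `SAMPLERATE` come from `apps/globals.py`:

```python
# Hypothetical column layout: Net, Sta, Loc, Cha, Start, End, Quality, SampleRate
START, END, QUALITY, SAMPLERATE = 4, 5, 6, 7

rows = [
    ["NL", "HGN", "00", "BHZ", "2023-11-23", "2023-11-24", "D", 40.0],
    ["NL", "HGN", "00", "BHZ", "2023-11-22", "2023-11-23", "D", 40.0],
    ["CH", "DAVOX", "", "HHZ", "2023-11-22", "2023-11-23", "M", 120.0],
]

# Single sort with a compound key: NSLC first, then ascending time,
# then quality and sample rate - equivalent to chained stable sorts.
rows.sort(key=lambda x: (
    x[0], x[1], x[2], x[3],   # Network, Station, Location, Channel
    x[START], x[END],         # earliest first
    x[QUALITY], x[SAMPLERATE],
))
# The CH row sorts first; the two NL rows end up in ascending time order.
```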

apps/globals.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@
 # error message constants
 DOCUMENTATION_URI = "http://www.fdsn.org/webservices/fdsnws-availability-1.0.pdf"
 SERVICE = "fdsnws-availability"
-VERSION = "1.0.3"
+VERSION = "1.0.4"
 
 
 class Error:

config.py.sample

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,11 @@ class Config:
         CACHE_RESP_PERIOD = (
             os.environ.get("CACHE_SHORT_INV_PERIOD") or CACHE_RESP_PERIOD
         )
+        # Sentry configuration (optional)
+        SENTRY_DSN = os.environ.get("SENTRY_DSN") or ""
+        SENTRY_TRACES_SAMPLE_RATE = float(
+            os.environ.get("SENTRY_TRACES_SAMPLE_RATE") or "1.0"
+        )
     except NameError:
         print("Missing environment variables.")
         raise
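The `or`-based fallbacks in the config fragment above treat both a missing variable and an empty string as "unset". A quick standalone check (variable names match the sample; clearing them here only simulates a bare shell):

```python
import os

# Simulate an environment with neither Sentry variable set.
os.environ.pop("SENTRY_DSN", None)
os.environ.pop("SENTRY_TRACES_SAMPLE_RATE", None)

SENTRY_DSN = os.environ.get("SENTRY_DSN") or ""
SENTRY_TRACES_SAMPLE_RATE = float(
    os.environ.get("SENTRY_TRACES_SAMPLE_RATE") or "1.0"
)
# With nothing set, the DSN is "" (Sentry stays disabled) and the
# traces sample rate falls back to 1.0.
```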

docker-compose.yml

Lines changed: 6 additions & 2 deletions
@@ -31,8 +31,12 @@ services:
       context: ./
       dockerfile: Dockerfile.api
     restart: always
-    # Run with 1 sync worker (absolute minimum memory/thread footprint)
-    command: gunicorn --bind 0.0.0.0:9001 --workers 1 start:app
+    # Worker Configuration:
+    #   - Default: 1 worker (most stable for resource-constrained servers)
+    #   - Moderate: 2-3 workers (recommended if no thread creation issues)
+    #   - High-performance: (2 × CPU cores) + 1 workers
+    # See README.md "Performance Tuning" section for details
+    command: gunicorn --bind 0.0.0.0:9001 --workers 1 --timeout 600 start:app
     container_name: fdsnws-availability-api
     network_mode: "host"
     environment:

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "ws-availability"
-version = "0.1.0"
+version = "1.0.4"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -13,6 +13,7 @@ dependencies = [
     "requests==2.31.0",
     "pydantic>=2.0.0",
     "pydantic-settings>=2.12.0",
+    "sentry-sdk[flask]>=2.0.0",
 ]
 
 [dependency-groups]

start.py

Lines changed: 69 additions & 0 deletions
@@ -1,13 +1,82 @@
 import logging
 import os
 
+import sentry_sdk
 from flask import Flask, make_response, render_template
 
 from apps.globals import VERSION
 from apps.root import output
 from config import Config
 
 
+def before_send(event, hint):
+    """
+    Scrub sensitive data from Sentry events before sending.
+    This prevents passwords, API keys, and other secrets from being exposed.
+    """
+    # List of sensitive field names to scrub (case-insensitive)
+    sensitive_keys = {
+        "password", "passwd", "pwd", "secret", "api_key", "apikey",
+        "token", "auth", "authorization", "credentials", "private_key",
+        "access_token", "refresh_token", "session", "cookie",
+    }
+
+    def scrub_dict(data):
+        """Recursively scrub sensitive data from dictionaries."""
+        if not isinstance(data, dict):
+            return
+
+        for key in list(data.keys()):
+            key_lower = str(key).lower()
+            # Check if key contains any sensitive keyword
+            if any(sensitive in key_lower for sensitive in sensitive_keys):
+                data[key] = "[Filtered]"
+            elif isinstance(data[key], dict):
+                scrub_dict(data[key])
+            elif isinstance(data[key], list):
+                for item in data[key]:
+                    if isinstance(item, dict):
+                        scrub_dict(item)
+
+    # Scrub request data
+    if "request" in event:
+        scrub_dict(event["request"])
+
+    # Scrub extra context
+    if "extra" in event:
+        scrub_dict(event["extra"])
+
+    # Scrub user context
+    if "user" in event:
+        scrub_dict(event["user"])
+
+    # Scrub breadcrumbs
+    if "breadcrumbs" in event:
+        for breadcrumb in event["breadcrumbs"].get("values", []):
+            scrub_dict(breadcrumb)
+
+    # Scrub local variables from stack traces
+    if "exception" in event:
+        for exception in event["exception"].get("values", []):
+            if "stacktrace" in exception:
+                for frame in exception["stacktrace"].get("frames", []):
+                    if "vars" in frame:
+                        scrub_dict(frame["vars"])
+
+    return event
+
+
+# Initialize Sentry before creating the Flask app
+if Config.SENTRY_DSN:
+    sentry_sdk.init(
+        dsn=Config.SENTRY_DSN,
+        traces_sample_rate=Config.SENTRY_TRACES_SAMPLE_RATE,
+        # Add data like request headers and IP for users
+        send_default_pii=True,
+        # Scrub sensitive data before sending
+        before_send=before_send,
+    )
+
 app = Flask(__name__)
 
 FMT = "[%(asctime)s] %(levelname)s [%(filename)s:%(lineno)d] [%(funcName)s] %(message)s"
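The scrubbing behaviour added in this diff can be exercised without a Sentry account. The snippet below restates the core recursion in a standalone form (trimmed keyword set, names ours) and shows a substring match on a key filtering a header value:

```python
# Minimal re-statement of the before_send scrubbing logic: any dict key
# containing a sensitive keyword is replaced with "[Filtered]", recursively.
SENSITIVE = {"password", "token", "secret", "api_key"}

def scrub(data):
    if not isinstance(data, dict):
        return
    for key in list(data):
        if any(s in str(key).lower() for s in SENSITIVE):
            data[key] = "[Filtered]"
        elif isinstance(data[key], dict):
            scrub(data[key])

event = {
    "request": {
        "headers": {"Authorization-Token": "abc123"},
        "url": "/fdsnws/availability/1/query",
    }
}
scrub(event)
# "Authorization-Token" matches the "token" keyword and is replaced;
# the URL is left untouched.
```

Because the match is a substring test on the lowercased key, variants like `X-Api-Key` or `session_cookie` are caught as well.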
