From 27db5952202ab1d2a6445261323f08014388cac4 Mon Sep 17 00:00:00 2001
From: ckouder <bingjiguo@icloud.com>
Date: Tue, 12 May 2026 18:10:41 -0500
Subject: [PATCH] feat: add user guide, license, and Python examples to API
 docs

Mirrors the OpenEnzymeDB-api pattern (moleculemaker/oed-api) for the
Swagger UI at /api/v1/docs:

- Expand the FastAPI description with User Guide, Automatic Pagination
  technical note, CC BY 4.0 data license, and support contact sections.
- Switch the Swagger UI syntax-highlight theme to "obsidian" so Python
  code blocks render with a neutral dark palette instead of purple.
- Add URL examples and copy-pasteable Python (requests) snippets to the
  docstrings for /search, /typeahead, /curation-statuses, and /ec_lookup.

Refs: moleculemaker/CLEANDB-api#12
---
 app/main.py           |  51 ++++++++--
 app/routers/search.py | 221 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 258 insertions(+), 14 deletions(-)

diff --git a/app/main.py b/app/main.py
index 54ab19f..cf72447 100644
--- a/app/main.py
+++ b/app/main.py
@@ -25,14 +25,52 @@ async def lifespan(app: FastAPI):
     description=settings.DESCRIPTION
     + f"""
 
-## Automatic Pagination
+# User Guide
 
-When a query would return more than {settings.AUTO_PAGINATION_THRESHOLD} records and no explicit limit is
-provided, the API will automatically paginate results to return {settings.AUTO_PAGINATION_THRESHOLD} records
-at a time. The response will include pagination metadata with links to navigate
-to next and previous pages.
+The CLEAN Database API provides programmatic access to the same enzyme annotations and
+CLEAN-predicted EC numbers available through the
+[CLEAN Database user interface](https://cleandb.frontend.staging.mmli2.ncsa.illinois.edu).
+
+This page serves as the API documentation, detailing available endpoints, request parameters,
+response formats, and example usage. There are also interactive features to test endpoints
+directly from your browser.
+
+## Endpoint Documentation, Python Examples, and Interactive Testing
+
+Below you will find a list of available endpoints. Each endpoint appears as a colored box with a
+title and brief description. You can click on any endpoint to expand it and see more details,
+including:
+- **Description**: A brief overview of the endpoint's purpose.
+- **Example Usage**: Sample code snippets in Python demonstrating how to call the endpoint and
+  handle the response.
+- **Parameters**: Required and optional parameters for the endpoint.
+- **Response Format**: The structure of the data returned by the endpoint.
+- **Try it out**: An interactive feature that allows you to test the endpoint directly from this
+  page.
+
+### Technical Note: Automatic Pagination
+
+When a query would return more than {settings.AUTO_PAGINATION_THRESHOLD} records and no explicit
+limit is provided, the API will automatically paginate results to return
+{settings.AUTO_PAGINATION_THRESHOLD} records at a time. The response will include pagination
+metadata with links to navigate to next and previous pages.
+
+This threshold can be configured using the `AUTO_PAGINATION_THRESHOLD` environment variable.
+
+## Data License
+
+The CLEAN Database dataset is licensed under the Creative Commons Attribution 4.0 International
+License (CC BY 4.0). This means you are free to share and adapt the material for any purpose,
+even commercially, as long as you give appropriate credit, provide a link to the license, and
+indicate if changes were made.
+
+Full license text: [https://creativecommons.org/licenses/by/4.0/](https://creativecommons.org/licenses/by/4.0/)
+
+## Support
+
+If you have any questions, issues, or feedback regarding the API, please reach out to us via
+email at <cleandb-feedback@moleculemaker.org>.
 
-This threshold can be configured using the AUTO_PAGINATION_THRESHOLD environment variable.
 """,
     version=settings.VERSION,
     lifespan=lifespan,
@@ -40,6 +78,7 @@ async def lifespan(app: FastAPI):
     openapi_url="/api/v1/openapi.json",
     docs_url="/api/v1/docs",
     redoc_url="/api/v1/redoc",
+    swagger_ui_parameters={"syntaxHighlight.theme": "obsidian"},
 )
 
 # Add CORS middleware
diff --git a/app/routers/search.py b/app/routers/search.py
index 1426a61..a660662 100644
--- a/app/routers/search.py
+++ b/app/routers/search.py
@@ -105,8 +105,102 @@ async def get_data(
     db: Database = Depends(get_db),
     request: Request = None,
 ) -> CLEANSearchResponse:
-    """
-    Get enzyme kinetic data with filtering options.
+    r"""
+Get enzyme records and CLEAN-predicted EC numbers with filtering options.
+
+This endpoint allows querying the CLEAN Database with various filters across UniProt
+metadata and CLEAN prediction confidence ranges.
+
+Filters that accept multiple values on the same parameter (e.g. `organism`) are combined
+with OR logic, while filters on different parameters are combined with AND logic.
+
+The response format can be either JSON (default) or CSV. Results are automatically
+paginated when no explicit `limit` is provided.
+
+### URL examples
+
+- /api/v1/search?organism=Homo%20sapiens&organism=Mus%20musculus
+
+- /api/v1/search?ec_number=1.1.1.1&clean_ec_confidence_min=0.8
+
+- /api/v1/search?curation_status=reviewed&format=csv&limit=100
+
+### Python example: retrieving JSON data
+
+```python
+import requests
+
+# Query CLEAN records filtered by organism and CLEAN-prediction confidence
+params = {
+    "organism": ["Escherichia coli", "Homo sapiens"],   # OR within the same param
+    "clean_ec_confidence_min": 0.8,                       # CLEAN confidence floor
+    "curation_status": "reviewed",                        # Swiss-Prot only
+    "limit": 10,
+}
+
+response = requests.get(
+    "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1/search",
+    params=params,
+)
+
+if response.status_code == 200:
+    payload = response.json()
+    print(f"Total matching records: {payload['total']}")
+    print(f"Returned in this page: {len(payload['data'])}")
+
+    if payload["data"]:
+        first = payload["data"][0]
+        print("\nFirst record:")
+        print(f"  Accession: {first['accession']}")
+        print(f"  Protein:   {first['protein']}")
+        print(f"  Organism:  {first['organism']}")
+        # CLEAN-predicted EC numbers are returned as a list of {ec_number, score}
+        for prediction in first.get("predicted_ec") or []:
+            print(f"  CLEAN EC:  {prediction['ec_number']} (score={prediction['score']:.3f})")
+
+    # Follow pagination links if present
+    if payload.get("next"):
+        print(f"\nNext page: {payload['next']}")
+else:
+    print(f"Error: {response.status_code} - {response.text}")
+```
+
+### Python example: downloading filtered results as CSV
+
+```python
+import csv
+import requests
+from io import StringIO
+
+params = {
+    "ec_number": ["1.1.1.1", "2.7.1.1"],
+    "format": "csv",
+    "limit": 50,
+}
+
+response = requests.get(
+    "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1/search",
+    params=params,
+)
+
+if response.status_code == 200:
+    # Save the CSV response to disk
+    with open("clean_search_export.csv", "w", newline="") as fh:
+        fh.write(response.text)
+    print("Saved clean_search_export.csv")
+
+    # Preview the first few rows
+    reader = csv.DictReader(StringIO(response.text))
+    for i, row in enumerate(reader):
+        if i >= 3:
+            break
+        print(f"\nRecord {i + 1}:")
+        print(f"  Accession: {row.get('accession')}")
+        print(f"  Organism:  {row.get('organism')}")
+        print(f"  Amino acids: {row.get('amino_acids')}")
+else:
+    print(f"Error: {response.status_code} - {response.text}")
+```
     """
 
     try:
@@ -330,8 +424,67 @@ async def get_typeahead(
     db: Database = Depends(get_db),
     request: Request = None,
 ) -> CLEANTypeaheadResponse:
-    """
-    Get typeahead suggestions for searching the database of predicted EC numbers.
+    r"""
+Get autocomplete suggestions for a chosen field, optionally constrained by a search context.
+
+Use `field_name` to choose which column to search (`accession`, `organism`, `protein_name`,
+`gene_name`, `uniprot_id`, or `predicted_ec`). The `search` term must be at least 3 characters.
+
+Optionally pass any of the `/search` filter parameters (e.g. `organism`, `curation_status`,
+`clean_ec_confidence_min`) to scope the suggestions to records that already match those filters.
+
+### URL examples
+
+- /api/v1/typeahead?field_name=organism&search=esch
+
+- /api/v1/typeahead?field_name=protein_name&search=kin&organism=Homo%20sapiens
+
+- /api/v1/typeahead?field_name=predicted_ec&search=1.1.1&clean_ec_confidence_min=0.7
+
+### Python example: simple autocomplete
+
+```python
+import requests
+
+response = requests.get(
+    "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1/typeahead",
+    params={"field_name": "organism", "search": "esch", "limit": 10},
+)
+
+if response.status_code == 200:
+    payload = response.json()
+    print(f"Field: {payload['field_name']}, query: {payload['search']!r}")
+    print(f"Total matches: {payload['total']}")
+    for match in payload["matches"]:
+        print(f"  - {match}")
+else:
+    print(f"Error: {response.status_code} - {response.text}")
+```
+
+### Python example: autocomplete scoped by a search context
+
+```python
+import requests
+
+# Look up protein names that contain "kin" — but only within Homo sapiens records
+response = requests.get(
+    "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1/typeahead",
+    params={
+        "field_name": "protein_name",
+        "search": "kin",
+        "organism": "Homo sapiens",
+        "curation_status": "reviewed",
+    },
+)
+
+if response.status_code == 200:
+    payload = response.json()
+    print(f"Search context applied: {payload.get('search_context')}")
+    for match in payload["matches"][:5]:
+        print(f"  - {match}")
+else:
+    print(f"Error: {response.status_code} - {response.text}")
+```
     """
 
     try:
@@ -410,8 +563,32 @@ async def get_typeahead(
 
 @router.get("/curation-statuses", summary="Get available curation status options")
 async def get_curation_statuses() -> CLEANCurationStatusResponse:
-    """
-    Get the list of available curation status options for filtering.
+    r"""
+Get the list of available curation status options that can be used with the `curation_status`
+filter on `/search` and `/typeahead`.
+
+### URL example
+
+- /api/v1/curation-statuses
+
+### Python example
+
+```python
+import requests
+
+response = requests.get(
+    "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1/curation-statuses",
+)
+
+if response.status_code == 200:
+    for option in response.json()["statuses"]:
+        print(f"{option['value']:>11}  ->  {option['label']}")
+    # Expected output:
+    #    reviewed  ->  Reviewed (Swiss-Prot)
+    #  unreviewed  ->  Unreviewed (TrEMBL)
+else:
+    print(f"Error: {response.status_code} - {response.text}")
+```
     """
     return CLEANCurationStatusResponse(
         statuses=[
@@ -445,8 +622,36 @@ async def get_ec_lookup(
     db: Database = Depends(get_db),
     request: Request = None,
 ) -> CLEANECLookupResponse:
-    """
-    Look up EC numbers or classes based on a search term.
+    r"""
+Look up EC numbers or EC classes by partial number or descriptive name.
+
+Useful for resolving free-text user input into canonical EC numbers before calling
+`/search` with the `ec_number` filter.
+
+### URL examples
+
+- /api/v1/ec_lookup?search=1.1.1
+
+- /api/v1/ec_lookup?search=oxidoreductase
+
+### Python example
+
+```python
+import requests
+
+response = requests.get(
+    "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1/ec_lookup",
+    params={"search": "1.1.1"},
+)
+
+if response.status_code == 200:
+    payload = response.json()
+    print(f"Matches for {payload['search']!r}:")
+    for match in payload["matches"][:5]:
+        print(f"  {match['ec_number']}  -  {match['ec_name']}")
+else:
+    print(f"Error: {response.status_code} - {response.text}")
+```
     """
 
     try: