CLEANDB-api/app/main.py at d87ddb5d8ab8cf35c088ee09789b8a1feadfb912 · moleculemaker/CLEANDB-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware

from app.core.config import settings
from app.db.database import _db
from app.routers import search


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the database connection for the lifetime of the application.

    Connects ``_db`` when the lifespan context is entered (application
    startup) and disconnects it when the context is exited (application
    shutdown).

    Args:
        app: The FastAPI application this lifespan is attached to. It is not
            used by this function.

    Yields:
        None. Control returns to the framework while the application runs.
    """
    # Connect before entering the ``try`` so that a failed connect does not
    # trigger a disconnect on a connection that was never established.
    await _db.connect()
    try:
        yield
    finally:
        # Always disconnect, even when an exception or task cancellation is
        # raised into the generator at the ``yield``; without ``finally`` the
        # connection would be leaked in that case.
        await _db.disconnect()


# Initialize FastAPI application
app = FastAPI(
    title=settings.PROJECT_NAME,
    # The description is settings.DESCRIPTION followed by the Markdown text
    # below. The literal is an f-string so that
    # settings.AUTO_PAGINATION_THRESHOLD can be interpolated; every literal
    # brace in the embedded Python examples is therefore doubled ("{{" / "}}")
    # so that it is emitted as a single brace instead of being evaluated.
    # NOTE(review): the first search example reads record["uniprot"] while the
    # later ones read record["accession"] — confirm which key the /search
    # response actually returns.
    description=settings.DESCRIPTION
    + f"""

## Automatic Pagination

When a query would return more than {settings.AUTO_PAGINATION_THRESHOLD} records and no explicit limit is
provided, the API will automatically paginate results to return {settings.AUTO_PAGINATION_THRESHOLD} records
at a time. The response will include pagination metadata with links to navigate
to next and previous pages.

This threshold can be configured using the AUTO_PAGINATION_THRESHOLD environment variable.

## How to Use

The CLEAN Data API provides programmatic access to enzyme EC number predictions generated
by the [CLEAN tool](https://github.com/tttianhao/CLEAN). You can query the database by
organism, protein name, gene name, UniProt accession, EC number, curation status, EC
confidence score, and sequence length.

**Base URL:** `https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1`

**Available Endpoints:**
- `GET /search` — Search and filter enzyme records
- `GET /typeahead` — Retrieve typeahead suggestions for a given field and search term
- `GET /ec_lookup` — Look up EC numbers or enzyme class names
- `GET /curation-statuses` — List available curation status options

Use the interactive documentation below to explore query parameters and response schemas,
or refer to the Python examples in the next section to get started quickly.

## Python Examples

The following examples use the [requests](https://docs.python-requests.org/) library.
Install it with `pip install requests` if needed.

### Search by organism name

```python
import requests

BASE_URL = "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1"

response = requests.get(
    f"{{BASE_URL}}/search",
    params={{"organism": "Homo sapiens"}},
)
response.raise_for_status()
data = response.json()
print(f"Total results: {{data['total']}}")
for record in data["data"]:
    print(record["uniprot"], record["predicted_ec"])
```

### Search by EC number

```python
import requests

BASE_URL = "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1"

response = requests.get(
    f"{{BASE_URL}}/search",
    params={{"ec_number": "3.5.1.18"}},
)
response.raise_for_status()
data = response.json()
print(f"Total results: {{data['total']}}")
for record in data["data"]:
    print(record["accession"], record["organism"])
```

### Filter by EC confidence and curation status

```python
import requests

BASE_URL = "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1"

response = requests.get(
    f"{{BASE_URL}}/search",
    params={{
        "clean_ec_confidence_min": 0.9,
        "curation_status": "reviewed",
        "limit": 100,
        "offset": 0,
    }},
)
response.raise_for_status()
data = response.json()
print(f"Total results: {{data['total']}}")
for record in data["data"]:
    print(record["accession"], record["predicted_ec"])
```

### Download results as CSV

```python
import requests

BASE_URL = "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1"

response = requests.get(
    f"{{BASE_URL}}/search",
    params={{"organism": "Escherichia coli", "format": "csv"}},
)
response.raise_for_status()
with open("cleandb_results.csv", "wb") as f:
    f.write(response.content)
print("Results saved to cleandb_results.csv")
```

### Typeahead suggestions for organism field

```python
import requests

BASE_URL = "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1"

response = requests.get(
    f"{{BASE_URL}}/typeahead",
    params={{"field_name": "organism", "search": "Homo"}},
)
response.raise_for_status()
data = response.json()
print(data["matches"])
```

### Look up EC numbers by name or number

```python
import requests

BASE_URL = "https://fastapi.cleandb.mmli2.ncsa.illinois.edu/api/v1"

response = requests.get(
    f"{{BASE_URL}}/ec_lookup",
    params={{"search": "hydrolase"}},
)
response.raise_for_status()
data = response.json()
for match in data["matches"]:
    print(match["ec_number"], match["ec_name"])
```

## Data License

The CLEAN Data API provides access to enzyme EC number predictions produced by the CLEAN
machine-learning tool, combined with protein annotations sourced from
[UniProt](https://www.uniprot.org/).

**UniProt data** is made available under the
[Creative Commons Attribution 4.0 International (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/)
license. See the [UniProt license page](https://www.uniprot.org/help/license) for details.

**CLEAN predictions** are provided for research and educational use. If you use this data
in your research, please cite:

> Tianhao Yu, Haiyang Cui, Jianan Canal Li, Yunan Luo, Guangde Jiang, Huimin Zhao.
> *Enzyme function prediction using contrastive learning.*
> **Science**, 379(6639), 1358-1363 (2023).
> [https://doi.org/10.1126/science.adf2465](https://doi.org/10.1126/science.adf2465)

This API and its source code are released under the
[MIT License](https://opensource.org/licenses/MIT).
""",
    version=settings.VERSION,
    # Run the database connect/disconnect logic around the application's life.
    lifespan=lifespan,
    # Serve the OpenAPI schema and the docs/ReDoc pages under the /api/v1
    # path instead of at the site root.
    openapi_url="/api/v1/openapi.json",
    docs_url="/api/v1/docs",
    redoc_url="/api/v1/redoc",
)

# Mount the search routes beneath the versioned API prefix. Router
# registration does not depend on the middleware registered below.
app.include_router(search.router, prefix="/api/v1")

# Cross-origin policy: only the origins listed in settings.CORS_ORIGINS are
# allowed, with credentials, any HTTP method and any request header. The
# Content-Disposition and Content-Length response headers are exposed to
# browser code, and max_age is set to 600.
app.add_middleware(
    CORSMiddleware,
    max_age=600,
    expose_headers=["Content-Disposition", "Content-Length"],
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=settings.CORS_ORIGINS,
)

# Response compression, registered after CORS so the relative middleware
# order is preserved; minimum_size=1000 is the size threshold handed to
# GZipMiddleware.
app.add_middleware(GZipMiddleware, minimum_size=1000)