Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 238 additions & 0 deletions demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8dbbe089-484a-4539-acac-b4e324494f7c",
"metadata": {},
"source": [
"# Demo run"
]
},
{
"cell_type": "markdown",
"id": "844ab0fd-5e73-4629-b2fb-40ce50b755fe",
"metadata": {},
"source": [
"This notebook shows an example of how to use the FRIdata toolbox."
]
},
{
"cell_type": "markdown",
"id": "697ce575-1fb5-4f58-b881-bfd91220618c",
"metadata": {},
"source": [
"## Prerequisites"
]
},
{
"cell_type": "markdown",
"id": "91f9b031-94bf-4d42-83ce-d433c9677258",
"metadata": {},
"source": [
"- conda environment is set up (according to the README.md file)\n",
"- FRIdata CLI tool is created\n",
"- File with IDs"
]
},
{
"cell_type": "markdown",
"id": "4328af25-ac41-4061-be92-f4b195902bdc",
"metadata": {},
"source": [
"## 1. Create a dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9db9ff88-a90e-4e9a-a2c2-0ad243ad96f6",
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"# Replace the <...> placeholders below with real locations before running.\n",
"export PATH_TO_DATA=\"<path to data>\"\n",
"export IDS_FILE_PATH=\"<path to idx files>\"\n",
"export DASHBOARD_NAME=\"testset_inference_over\"\n",
"export AFDB_PATH=\"$PATH_TO_DATA/afdb/structures\"\n",
"\n",
"eval \"$(conda shell.bash hook)\"\n",
"conda activate tbe\n",
"\n",
"# Expansions are quoted so that a path containing spaces stays a single argument.\n",
"fridata generate_data \\\n",
" -t dataset \\\n",
" -d AFDB \\\n",
" -c subset \\\n",
" --overwrite \\\n",
" --version \"${DASHBOARD_NAME}\" \\\n",
" -i \"${IDS_FILE_PATH}\" \\\n",
" --input-path \"${AFDB_PATH}\" \\\n",
" -e \"esm2_t33_650M_UR50D\""
]
},
{
"cell_type": "markdown",
"id": "f867009b-d9d5-4038-8669-6bc9bf1c40b5",
"metadata": {},
"source": [
"## 2. Create dashboard"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4203f2e3-3f3d-4a80-8cb3-4ed3b6b9c9ba",
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"# Path to the dataset.json generated in step 1\n",
"export PATH_TO_DATA=\"<path to data>\"\n",
"export PATH_TO_DATASET_JSON=\"$PATH_TO_DATA/datasets/AFDB-subset--testset_inference_over/dataset.json\"\n",
"\n",
"eval \"$(conda shell.bash hook)\"\n",
"conda activate tbe\n",
"\n",
"# Quoted so that a data path containing spaces is passed as one argument.\n",
"fridata create_dashboard \\\n",
" --dataset \"${PATH_TO_DATASET_JSON}\""
]
},
{
"cell_type": "markdown",
"id": "2b91b83c-3ff5-41cd-8c4d-2971baadd2f0",
"metadata": {},
"source": [
"Generated dashboard is saved to \"./reports/AFDB-subset--testset_inference_over.html\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "50a15ebb-9f64-4846-9587-0b6984727891",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"700\"\n",
" height=\"600\"\n",
" src=\"reports/AFDB-subset--testset_inference_over.html\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" \n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.IFrame at 0x1090e6420>"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import IFrame\n",
"\n",
"# Embed the generated dashboard inline; the path is relative to this notebook.\n",
"IFrame(\n",
"    src='reports/AFDB-subset--testset_inference_over.html',\n",
"    width=700,\n",
"    height=600,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "76ae7b67-5565-41c2-97c3-91f9a63ba542",
"metadata": {},
"source": [
"## 3. Loading dataset for more details"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea69cf70-2c29-47d4-a6d8-5153c7a9b1b3",
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"# Root data directory; the dataset folder is resolved relative to it.\n",
"export PATH_TO_DATA=\"<path to data>\"\n",
"\n",
"eval \"$(conda shell.bash hook)\"\n",
"conda activate tbe\n",
"\n",
"fridata load \\\n",
" --file-path \"${PATH_TO_DATA}/datasets/AFDB-subset--testset_inference_over\""
]
},
{
"cell_type": "markdown",
"id": "15688d7f-e57f-4305-b8ee-5b9db79fefe3",
"metadata": {},
"source": [
"Results:\n",
"\n",
"```\n",
"db_type=<DatabaseType.AFDB: 'AFDB'> collection_type=<CollectionType.subset: 'subset'>\n",
"proteome=''\n",
"version='testset_inference_over'\n",
"ids_file=PosixPath('<Path>') \n",
"seqres_file=None \n",
"archive_path=None \n",
"overwrite=True \n",
"batch_size=1000 \n",
"binary_data_download=False \n",
"is_hpc_cluster=False \n",
"input_path=PosixPath('<Path to dataset>') \n",
"embedder_type=<EmbedderType.ESM2_T33_650M: 'esm2_t33_650M_UR50D'> \n",
"embedding_size=1280 \n",
"created_at='1778013094005832' \n",
"config=Config(debug_mode='warning', data_path='<Path to data>', disto_type='CA', disto_thr='inf', separator='-', batch_size=1000)\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5fe4827-1f06-4a27-874b-b26b158ad784",
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"# Example HDF5 locations for the dataset used throughout this notebook.\n",
"# NOTE(review): the embeddings/distograms/coordinates folders are assumed to follow\n",
"# the AFDB-subset--<version> naming used for the dataset folder; confirm on disk.\n",
"export PATH_TO_DATA=\"<path to data>\"\n",
"export PATH_TO_STRUCTURES=\"$PATH_TO_DATA/structures/AFDB/subset_/testset_inference_over/0/pdbs.h5\"\n",
"export PATH_TO_EMBEDDINGS=\"$PATH_TO_DATA/embeddings/AFDB-subset--testset_inference_over/batch_0.h5\"\n",
"export PATH_TO_DISTOGRAMS=\"$PATH_TO_DATA/distograms/AFDB-subset--testset_inference_over/batch_0.h5\"\n",
"export PATH_TO_COORDINATES=\"$PATH_TO_DATA/coordinates/AFDB-subset--testset_inference_over/batch_0_ca.h5\"\n",
"\n",
"eval \"$(conda shell.bash hook)\"\n",
"conda activate tbe\n",
"\n",
"# Only the usage text is printed here; the paths above are not passed to the command.\n",
"fridata inspect_h5 --help"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
124 changes: 124 additions & 0 deletions reports/AFDB-subset--testset_inference_over.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>AFDB-subset--testset_inference_over</title>
<style>
body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; margin: 24px; }
.badge { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 12px; margin-right: 6px; }
.ok { background: #e8f5e9; color: #1b5e20; }
.no { background: #ffebee; color: #b71c1c; }
table { border-collapse: collapse; width: 100%; margin: 12px 0; }
th, td { border: 1px solid #ddd; padding: 6px 8px; text-align: left; }
th { background: #fafafa; }
tfoot td { border-top: 2px solid #bbb; }
.panel { border: 1px solid #eee; padding: 12px; border-radius: 6px; margin: 12px 0; }
details > summary { cursor: pointer; }
.muted { color: #666; }
.pct { float: right; color: #999; font-size: 0.9em; }
.controls { margin: 8px 0; }
</style>
<script type="application/json" id="summary-data">{"dataset": {"identity": {"db": "AFDB", "collection": "subset", "slug": "testset_inference_over", "folder_name": "AFDB-subset--testset_inference_over"}, "path": "/datasets/AFDB-subset--testset_inference_over"}, "index_paths": {"dataset": {"forward": "/datasets/AFDB-subset--testset_inference_over/dataset.idx", "reversed": "/datasets/AFDB-subset--testset_inference_over/dataset_reversed.idx"}, "sequences": {"forward": null, "reversed": null}, "coordinates": {"forward": null, "reversed": null}, "embeddings": {"forward": null, "reversed": null}, "distograms": {"forward": null, "reversed": null}}, "per_index": {"dataset": {"index_type": "dataset", "forward_present": true, "reversed_present": true, "num_proteins_forward": 81, "num_files_referenced": 1, "num_edges_reversed": 81, "by_dataset": {"testset_inference_over": {"files_referenced": 1, "proteins_referencing": 81, "is_self": true, "files_per_batch": {"1": 1}}}}, "sequences": {"index_type": "sequences", "forward_present": false, "reversed_present": false, "num_proteins_forward": 0, "num_files_referenced": 0, "num_edges_reversed": 0, "by_dataset": {}}, "coordinates": {"index_type": "coordinates", "forward_present": false, "reversed_present": false, "num_proteins_forward": 0, "num_files_referenced": 0, "num_edges_reversed": 0, "by_dataset": {}}, "embeddings": {"index_type": "embeddings", "forward_present": false, "reversed_present": false, "num_proteins_forward": 0, "num_files_referenced": 0, "num_edges_reversed": 0, "by_dataset": {}}, "distograms": {"index_type": "distograms", "forward_present": false, "reversed_present": false, "num_proteins_forward": 0, "num_files_referenced": 0, "num_edges_reversed": 0, "by_dataset": {}}}, "global": {"aggregate": {"testset_inference_over": {"files_referenced": 1, "proteins_referencing": 81}}, "top": [{"slug": "testset_inference_over", "files_referenced": 1, "proteins_referencing": 81}]}}</script>
<script>
/**
 * Shorthand for looking up a DOM element by its id.
 *
 * @param {string} id - Value of the element's id attribute.
 * @returns {*} Whatever document.getElementById returns for that id.
 */
function $(id) {
  const found = document.getElementById(id);
  return found;
}

/**
 * Escape a value so it can be interpolated into an HTML string as plain text.
 *
 * @param {*} value - Any value; it is stringified first.
 * @returns {string} The text with &, <, >, " and ' replaced by entities.
 */
function _escapeHtml(value) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Format part/total as a percentage with one decimal place.
 *
 * @returns {string} '0.0' when total is zero/falsy, to avoid dividing by zero.
 */
function _formatPct(part, total) {
  return total ? (part / total * 100).toFixed(1) : '0.0';
}

/**
 * Build the "dataset slug / files referenced / proteins referencing" table,
 * including a per-row percentage and a totals footer.
 *
 * @param {Array<{slug: string, files: number, proteins: number}>} rows
 * @returns {HTMLTableElement} The populated table element.
 */
function _buildSummaryTable(rows) {
  const table = document.createElement('table');
  table.innerHTML = '<thead><tr><th>dataset slug</th><th>files referenced</th><th>proteins referencing</th></tr></thead>';

  // Totals are needed up front because every row shows its share of the total.
  let sumFiles = 0, sumProteins = 0;
  for (const row of rows) {
    sumFiles += row.files;
    sumProteins += row.proteins;
  }

  const tbody = document.createElement('tbody');
  for (const row of rows) {
    const tr = document.createElement('tr');
    tr.innerHTML = `<td>${_escapeHtml(row.slug)}</td><td>${_escapeHtml(row.files)} <span class="pct">${_formatPct(row.files, sumFiles)}%</span></td><td>${_escapeHtml(row.proteins)} <span class="pct">${_formatPct(row.proteins, sumProteins)}%</span></td>`;
    tbody.appendChild(tr);
  }
  table.appendChild(tbody);

  const tfoot = document.createElement('tfoot');
  tfoot.innerHTML = `<tr><td><strong>Total</strong></td><td><strong>${_escapeHtml(sumFiles)}</strong></td><td><strong>${_escapeHtml(sumProteins)}</strong></td></tr>`;
  table.appendChild(tfoot);
  return table;
}

/**
 * Build the "dataset slug / batch id / files" table for one dataset.
 *
 * @param {string} slug - Dataset slug repeated in every row.
 * @param {Object<string, number>} batches - Mapping of batch id to file count.
 * @returns {HTMLTableElement} The populated table element.
 */
function _buildBatchTable(slug, batches) {
  const table = document.createElement('table');
  table.innerHTML = '<thead><tr><th>dataset slug</th><th>batch id</th><th>files</th></tr></thead>';
  const tbody = document.createElement('tbody');
  for (const [batchId, fileCount] of Object.entries(batches)) {
    const tr = document.createElement('tr');
    tr.innerHTML = `<td>${_escapeHtml(slug)}</td><td>${_escapeHtml(batchId)}</td><td>${_escapeHtml(fileCount)}</td>`;
    tbody.appendChild(tr);
  }
  table.appendChild(tbody);
  return table;
}

/**
 * Render the dashboard into the #root element.
 *
 * Reads the JSON payload from the #summary-data script tag and produces:
 * a header (folder name and path), one panel per entry of `per_index`
 * (summary table plus collapsible per-batch tables when `by_dataset` is
 * non-empty), and a final "Global rollup" panel built from `global.top`.
 *
 * All payload-derived text is HTML-escaped before being assigned to
 * innerHTML, so dataset names containing markup are shown literally.
 * Missing `per_index`, `global.top` or per-row counts are treated as
 * empty / zero instead of raising or rendering NaN.
 */
function render() {
  const payload = JSON.parse($("summary-data").textContent);
  const root = $("root");
  root.innerHTML = '';

  const header = document.createElement('div');
  header.innerHTML = `<h2>${_escapeHtml(payload.dataset.identity.folder_name)}</h2><div class="muted">${_escapeHtml(payload.dataset.path)}</div>`;
  root.appendChild(header);

  // Per-index panels
  for (const [indexType, stats] of Object.entries(payload.per_index || {})) {
    const panel = document.createElement('div');
    panel.className = 'panel';
    panel.innerHTML = `<h3>${_escapeHtml(indexType)}</h3>
<div class="muted">proteins: ${_escapeHtml(stats.num_proteins_forward)}, files referenced: ${_escapeHtml(stats.num_files_referenced)}</div>`;

    const byDataset = stats.by_dataset || {};
    const slugs = Object.keys(byDataset).sort();
    if (slugs.length) {
      const rows = slugs.map((slug) => ({
        slug,
        files: byDataset[slug].files_referenced || 0,
        proteins: byDataset[slug].proteins_referencing || 0,
      }));
      panel.appendChild(_buildSummaryTable(rows));

      const details = document.createElement('details');
      details.innerHTML = '<summary>Show batches per dataset</summary>';
      for (const slug of slugs) {
        const sub = document.createElement('div');
        sub.appendChild(_buildBatchTable(slug, byDataset[slug].files_per_batch || {}));
        details.appendChild(sub);
      }
      panel.appendChild(details);
    }
    root.appendChild(panel);
  }

  // Global rollup
  const topRows = ((payload.global && payload.global.top) || []).map((row) => ({
    slug: row.slug,
    files: row.files_referenced || 0,
    proteins: row.proteins_referencing || 0,
  }));
  const roll = document.createElement('div');
  roll.className = 'panel';
  roll.innerHTML = '<h3>Global rollup</h3>';
  roll.appendChild(_buildSummaryTable(topRows));
  root.appendChild(roll);
}

window.addEventListener('DOMContentLoaded', render);

</script>
</head>
<body>
<div id="root"></div>
</body>
</html>
Loading