Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c13f819
ready for testing on server
magnujo Dec 18, 2025
2c7ab5a
added some AI suggestions
magnujo Dec 18, 2025
64b8c08
small changes
magnujo Dec 18, 2025
228d841
tested and works on dandy
Dec 18, 2025
e631352
added requirement
Jan 12, 2026
5171886
calling smdb upload script from wrapper_bclconvert.sh. todo before me…
Jan 12, 2026
194ee76
added .vscode/settings.json to .gitignore
Jan 12, 2026
c6a4828
Tweak log message
fgvieira Jan 13, 2026
f4485b0
Code clean-up
fgvieira Jan 13, 2026
12355af
Apply suggestion from @fgvieira
fgvieira Jan 13, 2026
ad63db3
Ensure abs path
fgvieira Jan 13, 2026
3750b40
Use abs path
fgvieira Jan 13, 2026
e379509
Fix checking of CAEG data
fgvieira Jan 13, 2026
1a66fda
Fix typo
fgvieira Jan 13, 2026
cf57cb7
added some required dependencies to pixi and dependency checks
Jan 13, 2026
265019e
Merge branch 'main' into smdb-auto-upload
fgvieira Jan 23, 2026
9d04a9f
Merge branch 'main' of https://github.com/GeoGenetics/seqcenter into …
magnujo Jan 28, 2026
ab3acab
hard coded db configs except pw
magnujo Jan 28, 2026
0e5a5cd
Merge pull request #1 from GeoGenetics/main
magnujo Jan 29, 2026
bf32912
Update SQL query table_id from 47 to 9
magnujo Jan 29, 2026
9e53f79
Change DB_TABLE from 'demultiplex_stats' to 'flowcell'
magnujo Jan 29, 2026
ce0d3c4
Merge pull request #2 from GeoGenetics/main
magnujo Feb 19, 2026
1206e0b
fixed merge conflict with changes from Filipe
magnujo Feb 19, 2026
0efd038
Merge branch 'smdb-auto-upload' of https://github.com/magnujo/seqcent…
magnujo Feb 19, 2026
16a75de
fixed realpath issue with DB_PASSWORD
magnujo Feb 19, 2026
ce2c260
changed date parsing in smdb_upload.py
magnujo Feb 19, 2026
6d89248
swapped code positions in wrapper script
magnujo Feb 19, 2026
aa22646
removed 4 lane assertion
magnujo Feb 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,5 @@ __marimo__/
# pixi environments
.pixi
*.egg-info
.vscode/launch.json
.vscode/settings.json
55 changes: 55 additions & 0 deletions demux/pixi.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions demux/pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ matplotlib = ">=3.10.0,<4"
samshee = ">=0.2.0,<0.3"
pandas = ">=2.3.0,<3"
numpy = ">=2.3.0,<3"
sqlalchemy = ">=2.0.45,<3"
psycopg2 = ">=2.9.11,<3"
8 changes: 8 additions & 0 deletions demux/smdb-upload/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: smdb_upload_env
channels:
- defaults
dependencies:
- python=3.11
- pandas
- sqlalchemy
- psycopg2
6 changes: 6 additions & 0 deletions demux/smdb-upload/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Requirements for smdb_upload.py

# Python packages required by smdb_upload.py:
pandas
sqlalchemy
psycopg2
133 changes: 133 additions & 0 deletions demux/smdb-upload/smdb_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import subprocess
import argparse
from sqlalchemy import create_engine
import pandas as pd
from datetime import datetime, timezone
from uuid import uuid4
from zoneinfo import ZoneInfo

# Command-line interface. Every option is mandatory; -o/--db_port is the
# only one parsed as an integer.
parser = argparse.ArgumentParser(
    description="Upload demultiplex stats, run info, and sample sheet metadata to SMDB."
)
for _short, _long, _extra in (
    ("-c", "--path_to_demultiplex_stats", {"help": "Path to demultiplex stats CSV"}),
    ("-r", "--path_to_run_info", {"help": "Path to run info XML"}),
    ("-x", "--path_to_sample_sheet", {"help": "Path to sample sheet CSV"}),
    ("-n", "--db_name", {"help": "Database name"}),
    ("-s", "--schema_name", {"help": "Target schema name"}),
    ("-u", "--db_user", {"help": "Database user"}),
    ("-p", "--db_password", {"help": "Database password"}),
    ("-d", "--db_host", {"help": "Database host"}),
    ("-o", "--db_port", {"help": "Database port", "type": int}),
    ("-t", "--table_name", {"help": "Target table name"}),
    ("-e", "--send_upload_receipts_to", {"help": "Space-separated emails for upload receipts"}),
):
    parser.add_argument(_short, _long, required=True, **_extra)

args = parser.parse_args()

def upload_demultiplex_stats(path_to_demultiplex_stats, path_to_run_info, path_to_sample_sheet, db_name, schema_name, db_user, db_password, db_host, db_port, table_name, send_upload_receipts_to, column_map_table_id=9):
    """
    Combine demultiplex stats, run info and sample-sheet metadata for one
    sequencing run, append the result to a PostgreSQL table, and send an
    upload-receipt email with the three source files attached.

    Args:
        path_to_demultiplex_stats (str): Demultiplex stats CSV (bcl-convert Reports/).
        path_to_run_info (str): RunInfo XML for the run.
        path_to_sample_sheet (str): Illumina sample sheet CSV.
        db_name (str): Database name.
        schema_name (str): Target schema name.
        db_user (str): Database user.
        db_password (str): Database password.
        db_host (str): Database host.
        db_port (int): Database port.
        table_name (str): Table the stats rows are appended to.
        send_upload_receipts_to (str): Space-separated email recipients.
        column_map_table_id (int): Row id in name_maps.column_names holding the
            sheet-column -> DB-column mapping for this table (default 9).
    """
    # Local import keeps the module's import surface unchanged for callers.
    from sqlalchemy.engine import URL

    dmux_stats = pd.read_csv(path_to_demultiplex_stats)
    run_info = pd.read_xml(path_to_run_info, parser='etree')
    sample_sheet = pd.read_csv(path_to_sample_sheet)

    # The [Header] section of the sample sheet is a two-column
    # attribute/value list in the first ~17 rows of the file.
    sample_sheet = (
        sample_sheet.iloc[0:17]
        .dropna(how="all", axis=1)
        .dropna(how="any", axis=0)
        .rename(columns={"[Header]": "Attribute", "Unnamed: 1": "Value"})
        .set_index("Attribute")
    )

    send_upload_receipts_to = send_upload_receipts_to.replace(" ", "; ")
    upload_sheets = str(path_to_demultiplex_stats) + "; " + str(path_to_run_info) + "; " + str(path_to_sample_sheet)
    # Column is named *_utc, so record UTC here (the previous version stored
    # Europe/Copenhagen wall time, contradicting the column name).
    timestamp = datetime.now(timezone.utc)
    seq_run_id = run_info.at[0, 'Id']
    seq_run_number = run_info.at[0, 'Number']
    seq_machine_id = run_info.at[0, 'Instrument']
    # Run id fields are underscore-separated; the first character of the
    # 4th field is the flowcell position (A/B).
    flowcell_position = run_info.at[0, 'Id'].split('_')[3][0]
    flowcell_id = run_info.at[0, 'Flowcell']

    # RunInfo date formats vary with instrument/software version: full
    # ISO-8601 (with or without a trailing 'Z'), date-only, or legacy YYMMDD.
    raw_date = str(run_info.at[0, 'Date'])
    try:
        run_dt = datetime.fromisoformat(raw_date)  # Python 3.11+ accepts 'Z'
    except ValueError:
        run_dt = datetime.strptime(raw_date, "%y%m%d")  # legacy 6-digit date
    if run_dt.tzinfo is None:
        run_dt = run_dt.replace(tzinfo=timezone.utc)
    formatted_date = run_dt.strftime("%Y-%m-%d")

    dmux_stats['database_insert_by'] = send_upload_receipts_to
    dmux_stats['upload_sheet'] = upload_sheets
    dmux_stats['database_insert_datetime_utc'] = timestamp
    # str() so the value round-trips through the DB driver without a UUID adapter.
    dmux_stats['upload_uuid'] = str(uuid4())
    dmux_stats['sequencing_run_id'] = seq_run_id
    dmux_stats['sequencing_run_number'] = seq_run_number
    dmux_stats['sequencing_machine_id'] = seq_machine_id
    dmux_stats['sequencing_run_date'] = str(formatted_date)
    dmux_stats['flowcell_position'] = flowcell_position
    dmux_stats['flowcell'] = flowcell_id

    # Map each stats row to its pool via its 1-based lane number.
    # NOTE(review): assumes PoolLane1..4 are always present in the sample-sheet
    # header even when the run used fewer lanes — confirm for partial runs.
    pool_lanes = [sample_sheet.loc[f'PoolLane{i}', 'Value'] for i in range(1, 5)]
    dmux_stats['sequencing_tube_tag'] = dmux_stats['Lane'].apply(lambda x: pool_lanes[x - 1])

    # URL.create escapes special characters (e.g. '@', ':', '/') in the
    # credentials; a hand-built f-string URL would be corrupted by them.
    engine = create_engine(URL.create(
        "postgresql",
        username=db_user,
        password=db_password,
        host=db_host,
        port=db_port,
        database=db_name,
    ))

    # name_maps.column_names translates sheet column headers to DB column
    # names; int() keeps the interpolated id injection-safe.
    q = f'select column_name_db, column_name_sheet from name_maps.column_names where table_id = {int(column_map_table_id)};'

    renamer = pd.read_sql(q, engine)
    rename_dict = dict(zip(renamer['column_name_sheet'], renamer['column_name_db']))
    dmux_stats = dmux_stats.rename(columns=rename_dict)

    dmux_stats.to_sql(table_name, engine, schema=schema_name, if_exists="append", index=False)

    # Best-effort receipt email with the three source files attached.
    # check=False is deliberate: a mail failure must not make a successful
    # DB upload look like an error.
    email_cmd = [
        "mail",
        "-s",
        "Sequencing stats successfully uploaded to SMDB",
        "-a", str(path_to_demultiplex_stats),
        "-a", str(path_to_run_info),
        "-a", str(path_to_sample_sheet),
    ]
    email_cmd.extend(send_upload_receipts_to.split("; "))
    subprocess.run(
        email_cmd,
        input="The appended sequencing stats has been successfully uploaded to SMDB.",
        text=True,
        check=False,
    )


if __name__ == "__main__":
    # Entry point: forward the parsed CLI arguments to the uploader. The
    # guard keeps the upload (and its side effects) from firing on import.
    upload_demultiplex_stats(
        path_to_demultiplex_stats=args.path_to_demultiplex_stats,
        path_to_run_info=args.path_to_run_info,
        path_to_sample_sheet=args.path_to_sample_sheet,
        db_name=args.db_name,
        schema_name=args.schema_name,
        db_user=args.db_user,
        db_password=args.db_password,
        db_port=args.db_port,
        table_name=args.table_name,
        db_host=args.db_host,
        send_upload_receipts_to=args.send_upload_receipts_to
    )



36 changes: 36 additions & 0 deletions demux/wrapper_bclconvert.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ python3 -c 'import pathlib'
python3 -c 'import collections'
python3 -c 'import plotly'
python3 -c 'import matplotlib'
python3 -c 'import psycopg2'
python3 -c 'import sqlalchemy'


## Functions
Expand Down Expand Up @@ -63,13 +65,16 @@ THREADS=5
IN_FOLDER=`realpath --canonicalize-existing --no-symlinks $1`; shift
SS=`realpath --canonicalize-existing --no-symlinks $1`; shift
OUT_FOLDER=`realpath --canonicalize-existing --no-symlinks $1`; shift
DB_PASSWORD=$1; shift
EXTRA=$@

RUN=`basename $IN_FOLDER`
if [[ `realpath $OUT_FOLDER` == "/maps/datasets/caeg_fastq" ]]; then
OUT_FOLDER=$OUT_FOLDER/${RUN:0:4}/$RUN
CAEG_DATA=true
else
OUT_FOLDER=$OUT_FOLDER/$RUN
CAEG_DATA=false
fi

# Check if output folder exists
Expand Down Expand Up @@ -156,6 +161,37 @@ mkdir -p $OUT_FOLDER
TIMESTAMP=`date "+%Y%m%d_%H%M%S"`
touch seqcenter.$TIMESTAMP.done
cd ../

# Post-demultiplexing hook: push run metadata to the sample-management DB
# (SMDB). Runs only for CAEG data (CAEG_DATA is set earlier, where
# OUT_FOLDER is resolved against /maps/datasets/caeg_fastq).
if [ $CAEG_DATA = true ]; then
echo `date`" [$RUN] uploading metadata to SMDB"
# Upload helper ships alongside this wrapper under smdb-upload/.
SMDB_UPLOAD_SCRIPT="$BASEDIR/smdb-upload/smdb_upload.py"

# Inputs produced by bcl-convert in the run's Reports/ folder; $SS is the
# sample sheet passed to this wrapper on the command line.
DEMUX_STATS_CSV="$OUT_FOLDER/Reports/Demultiplex_Stats.csv"
RUNINFO_XML="$OUT_FOLDER/Reports/RunInfo.xml"
UPLOAD_RECEIPTS_TO="julie.bitz-thorsen@sund.ku.dk"
# Fixed connection settings; only DB_PASSWORD comes from the command line.
DB_NAME="smdb"
DB_SCHEMA="uploaded_data"
DB_USER="upload_user"
DB_HOST="dandypdb01fl"
DB_PORT="5432"
DB_TABLE="flowcell"

# Hand everything to the Python uploader; its exit status is not checked,
# so a failed upload does not abort the wrapper.
python3 "$SMDB_UPLOAD_SCRIPT" \
--path_to_demultiplex_stats "$DEMUX_STATS_CSV" \
--path_to_run_info "$RUNINFO_XML" \
--path_to_sample_sheet "$SS" \
--db_name "$DB_NAME" \
--schema_name "$DB_SCHEMA" \
--db_user "$DB_USER" \
--db_password "$DB_PASSWORD" \
--db_host "$DB_HOST" \
--db_port "$DB_PORT" \
--table_name "$DB_TABLE" \
--send_upload_receipts_to "$UPLOAD_RECEIPTS_TO"
fi

} 2>&1 | tee $OUT_FOLDER/$RUN.demux.log



exit 0