Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
c13f819
ready for testing on server
magnujo Dec 18, 2025
2c7ab5a
added some AI suggestions
magnujo Dec 18, 2025
64b8c08
small changes
magnujo Dec 18, 2025
228d841
tested and works on dandy
Dec 18, 2025
e631352
addded requirement
Jan 12, 2026
5171886
calling smdb upload script from wrapper_bclconvert.sh. todo before me…
Jan 12, 2026
194ee76
added .vscode/settings.json to .gitignore
Jan 12, 2026
c6a4828
Tweak log message
fgvieira Jan 13, 2026
f4485b0
Code clean-up
fgvieira Jan 13, 2026
12355af
Apply suggestion from @fgvieira
fgvieira Jan 13, 2026
ad63db3
Ensure abs path
fgvieira Jan 13, 2026
3750b40
Use abs path
fgvieira Jan 13, 2026
e379509
Fix checking of CAEG data
fgvieira Jan 13, 2026
1a66fda
Fix typo
fgvieira Jan 13, 2026
cf57cb7
added some required dependencies to pixi and depency checks
Jan 13, 2026
265019e
Merge branch 'main' into smdb-auto-upload
fgvieira Jan 23, 2026
9d04a9f
Merge branch 'main' of https://github.com/GeoGenetics/seqcenter into …
magnujo Jan 28, 2026
ab3acab
hard coded db configs except pw
magnujo Jan 28, 2026
0e5a5cd
Merge pull request #1 from GeoGenetics/main
magnujo Jan 29, 2026
bf32912
Update SQL query table_id from 47 to 9
magnujo Jan 29, 2026
9e53f79
Change DB_TABLE from 'demultiplex_stats' to 'flowcell'
magnujo Jan 29, 2026
ce0d3c4
Merge pull request #2 from GeoGenetics/main
magnujo Feb 19, 2026
1206e0b
fixeed merge conflict from changes from filipe
magnujo Feb 19, 2026
0efd038
Merge branch 'smdb-auto-upload' of https://github.com/magnujo/seqcent…
magnujo Feb 19, 2026
16a75de
fixed realpath issue with DB_PASSWORD
magnujo Feb 19, 2026
ce2c260
changes date parsing in smdb_upload.py
magnujo Feb 19, 2026
6d89248
swapped code positions in wrapper script
magnujo Feb 19, 2026
aa22646
removed 4 lane assertion
magnujo Feb 19, 2026
e9fcf34
parsing now expects 8 lanes
magnujo Feb 23, 2026
6420ff2
adding unknown to index for all undetermined
magnujo Mar 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,5 @@ __marimo__/
# pixi environments
.pixi
*.egg-info
.vscode/launch.json
.vscode/settings.json
8 changes: 8 additions & 0 deletions demux/smdb-upload/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: smdb_upload_env
channels:
- defaults
dependencies:
- python=3.11
- pandas
- sqlalchemy
- psycopg2
6 changes: 6 additions & 0 deletions demux/smdb-upload/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Requirements for smdb_upload.py

# Python packages required by smdb_upload.py:
pandas
sqlalchemy
psycopg2
133 changes: 133 additions & 0 deletions demux/smdb-upload/smdb_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import subprocess
import argparse
from sqlalchemy import create_engine
import pandas as pd
from datetime import datetime
from uuid import uuid4
from zoneinfo import ZoneInfo

# Command-line interface: all input paths and connection details are required.
parser = argparse.ArgumentParser(
    description="Upload demultiplex stats, run info, and sample sheet metadata to SMDB."
)

# (short flag, long flag, extra kwargs, help text) — registered in a loop so
# the option table reads as a single spec.
for _short, _long, _extra, _help in (
    ("-c", "--path_to_demultiplex_stats", {}, "Path to demultiplex stats CSV"),
    ("-r", "--path_to_run_info", {}, "Path to run info XML"),
    ("-x", "--path_to_sample_sheet", {}, "Path to sample sheet CSV"),
    ("-n", "--db_name", {}, "Database name"),
    ("-s", "--schema_name", {}, "Target schema name"),
    ("-u", "--db_user", {}, "Database user"),
    ("-p", "--db_password", {}, "Database password"),
    ("-d", "--db_host", {}, "Database host"),
    ("-o", "--db_port", {"type": int}, "Database port"),
    ("-t", "--table_name", {}, "Target table name"),
    ("-e", "--send_upload_receipts_to", {}, "Space-separated emails for upload receipts"),
):
    parser.add_argument(_short, _long, required=True, help=_help, **_extra)

args = parser.parse_args()

def upload_demultiplex_stats(path_to_demultiplex_stats, path_to_run_info, path_to_sample_sheet, db_name, schema_name, db_user, db_password, db_host, db_port, table_name, send_upload_receipts_to):
    """
    Upload demultiplex stats enriched with run and sample-sheet metadata to SMDB.

    Reads the demultiplex stats CSV, the RunInfo XML and the sample sheet CSV,
    annotates the stats with run-level metadata (run id/number/date, machine,
    flowcell, per-lane pool tags, upload provenance), renames the columns via
    the ``name_maps.column_names`` mapping stored in the database, appends the
    rows to ``schema_name.table_name``, and finally sends a best-effort receipt
    e-mail with the three input files attached.

    Args:
        path_to_demultiplex_stats (str): Path to the demultiplex stats CSV file.
        path_to_run_info (str): Path to the RunInfo XML file.
        path_to_sample_sheet (str): Path to the sample sheet CSV file.
        db_name (str): Database name.
        schema_name (str): Target schema name.
        db_user (str): Database user.
        db_password (str): Database password (may contain special characters).
        db_host (str): Database host.
        db_port (int): Database port.
        table_name (str): Name of the PostgreSQL table to append to.
        send_upload_receipts_to (str): Space-separated e-mail addresses for
            the upload receipt.

    Raises:
        ValueError: If the demultiplex stats file does not contain exactly
            4 distinct lanes.
    """
    from urllib.parse import quote_plus  # local import: only needed for the DB URL

    dmux_stats = pd.read_csv(path_to_demultiplex_stats)
    run_info = pd.read_xml(path_to_run_info, parser='etree')
    sample_sheet = pd.read_csv(path_to_sample_sheet)

    # The first rows of the sample sheet hold the [Header] key/value section;
    # turn it into an Attribute -> Value lookup table.
    # NOTE(review): the 17-row cutoff assumes a fixed header length — confirm
    # against the sample sheets actually produced.
    sample_sheet = (
        sample_sheet.iloc[0:17]
        .dropna(how="all", axis=1)
        .dropna(how="any", axis=0)
        .rename(columns={"[Header]": "Attribute", "Unnamed: 1": "Value"})
        .set_index("Attribute")
    )

    send_upload_receipts_to = send_upload_receipts_to.replace(" ", "; ")
    upload_sheets = str(path_to_demultiplex_stats) + "; " + str(path_to_run_info) + "; " + str(path_to_sample_sheet)
    timestamp = datetime.now(ZoneInfo("Europe/Copenhagen"))
    seq_run_id = run_info.at[0, 'Id']
    seq_run_number = run_info.at[0, 'Number']
    seq_machine_id = run_info.at[0, 'Instrument']
    # Assumes the run id has at least 4 '_'-separated fields and that the
    # first character of the 4th field is the flowcell position — TODO confirm.
    flowcell_position = run_info.at[0, 'Id'].split('_')[3][0]
    flowcell_id = run_info.at[0, 'Flowcell']

    # RunInfo dates look like "06/30/2025 01:23:45 PM"; store as ISO date.
    unformatted_seq_date = run_info.at[0, 'Date']
    dt = datetime.strptime(unformatted_seq_date, "%m/%d/%Y %I:%M:%S %p")
    formatted_date = dt.strftime("%Y-%m-%d")

    # Upload provenance + run metadata, broadcast onto every stats row.
    dmux_stats['database_insert_by'] = send_upload_receipts_to
    dmux_stats['upload_sheet'] = upload_sheets
    dmux_stats['database_insert_datetime_utc'] = timestamp
    dmux_stats['upload_uuid'] = uuid4()  # one UUID shared by all rows of this upload
    dmux_stats['sequencing_run_id'] = seq_run_id
    dmux_stats['sequencing_run_number'] = seq_run_number
    dmux_stats['sequencing_machine_id'] = seq_machine_id
    dmux_stats['sequencing_run_date'] = str(formatted_date)
    dmux_stats['flowcell_position'] = flowcell_position
    dmux_stats['flowcell'] = flowcell_id

    # Map each lane to its sequencing pool tag from the sample-sheet header.
    pool_lanes = [sample_sheet.loc['PoolLane1', 'Value'],
                  sample_sheet.loc['PoolLane2', 'Value'],
                  sample_sheet.loc['PoolLane3', 'Value'],
                  sample_sheet.loc['PoolLane4', 'Value']]
    num_lanes = len(dmux_stats['Lane'].unique())
    # Raise instead of assert: asserts are stripped under `python -O`, which
    # would silently allow an out-of-range pool_lanes lookup below.
    if num_lanes != 4:
        raise ValueError('Expected 4 lanes in the demultiplex stats file. got ' + str(num_lanes))
    dmux_stats['sequencing_tube_tag'] = dmux_stats['Lane'].apply(lambda x: pool_lanes[x - 1])

    # quote_plus protects against credentials containing URL metacharacters
    # (e.g. '@', ':', '/') which would otherwise corrupt the connection URL.
    ENGINE = create_engine(
        f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
    )

    # Translate sheet column names to DB column names.
    # NOTE(review): table_id is hard-coded — confirm it matches the target table.
    q = 'select column_name_db, column_name_sheet from name_maps.column_names where table_id = 47;'

    renamer = pd.read_sql(q, ENGINE)
    rename_dict = dict(zip(renamer['column_name_sheet'], renamer['column_name_db']))
    dmux_stats = dmux_stats.rename(columns=rename_dict)

    dmux_stats.to_sql(table_name, ENGINE, schema=schema_name, if_exists="append", index=False)

    # Best-effort receipt e-mail with the three input files attached.
    # check=False: a mail failure must not fail an otherwise successful upload.
    email_cmd = [
        "mail",
        "-s",
        "Sequencing stats successfully uploaded to SMDB",
        "-a",
        str(path_to_demultiplex_stats),
        "-a",
        str(path_to_run_info),
        "-a",
        str(path_to_sample_sheet)
    ]
    email_cmd.extend(send_upload_receipts_to.split("; "))
    subprocess.run(
        email_cmd,
        input="The appended sequencing stats has been successfully uploaded to SMDB.",
        text=True,
        check=False,
    )


# Script entry point: every CLI option's dest name matches a parameter of
# upload_demultiplex_stats exactly, so the parsed namespace can be splatted
# straight into the call.
upload_demultiplex_stats(**vars(args))



36 changes: 35 additions & 1 deletion demux/wrapper_bclconvert.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ THREADS=5

IN_FOLDER=$1; shift
SS=$1; shift
OUT_FOLDER=$1; shift
OUT_BASE="$1"; shift # <-- keep the original "base output root"
OUT_FOLDER="$OUT_BASE"
Comment thread
fgvieira marked this conversation as resolved.
Outdated
EXTRA=$@

RUN=20`basename $IN_FOLDER`
Expand Down Expand Up @@ -143,9 +144,42 @@ mkdir -p $OUT_FOLDER
cd ../
done

if [[ $OUT_BASE == "/datasets/caeg_fastq" ]]; then
Comment thread
fgvieira marked this conversation as resolved.
Outdated
echo "$(date) OUT_BASE is /datasets/caeg_fastq – running SMDB upload script"
Comment thread
fgvieira marked this conversation as resolved.
Outdated
SMDB_UPLOAD_SCRIPT="$BASEDIR/smdb-upload/smdb_upload.py"

DEMUX_STATS_CSV="$OUT_FOLDER/Reports/Demultiplex_Stats.csv"
RUNINFO_XML="$OUT_FOLDER/Reports/RunInfo.xml"
UPLOAD_RECEIPTS_TO="julie.bitz-thorsen@sund.ku.dk"

# Required DB env vars (do not hardcode secrets in the script)
: "${DB_NAME:?Set DB_NAME in environment}"
: "${DB_SCHEMA:?Set DB_SCHEMA in environment}"
: "${DB_USER:?Set DB_USER in environment}"
: "${DB_PASSWORD:?Set DB_PASSWORD in environment}"
: "${DB_HOST:?Set DB_HOST in environment}"
: "${DB_PORT:?Set DB_PORT in environment (integer)}"
: "${DB_TABLE:?Set DB_TABLE in environment}"

python3 "$SMDB_UPLOAD_SCRIPT" \
--path_to_demultiplex_stats "$DEMUX_STATS_CSV" \
--path_to_run_info "$RUNINFO_XML" \
--path_to_sample_sheet "$SS" \
--db_name "$DB_NAME" \
--schema_name "$DB_SCHEMA" \
--db_user "$DB_USER" \
--db_password "$DB_PASSWORD" \
--db_host "$DB_HOST" \
--db_port "$DB_PORT" \
--table_name "$DB_TABLE" \
--send_upload_receipts_to "$UPLOAD_RECEIPTS_TO"
fi

TIMESTAMP=`date "+%Y%m%d_%H%M%S"`
touch seqcenter.$TIMESTAMP.done
cd ../
} 2>&1 | tee $OUT_FOLDER/$RUN.demux.log



exit 0