Skip to content
Open

Dev #41

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v1.0.1 - XX/XX/XXXX

### `Added`

- `populate_janno.py` -> `0.5.2`:
- Minor tweaks to avoid FutureWarnings from pandas.

### `Fixed`

### `Dependencies`

### `Deprecated`

## v1.0.0 - 02/09/2025

### `Added`
Expand Down
142 changes: 72 additions & 70 deletions poseidon-eager.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,99 +4,101 @@ channels:
- conda-forge
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- _openmp_mutex=4.5=20_gnu
- _r-mutex=1.0.1=anacondar_1
- alsa-lib=1.2.8=h166bdaf_0
- binutils_impl_linux-64=2.43=h4bf12b8_2
- bwidget=1.10.1=ha770c72_0
- bzip2=1.0.8=h4bc722e_7
- binutils_impl_linux-64=2.45.1=default_hfdba357_102
- bwidget=1.10.1=ha770c72_1
- bzip2=1.0.8=hda65f42_9
- c-ares=1.34.4=hb9d3cd8_0
- ca-certificates=2025.8.3=hbd8a1cb_0
- ca-certificates=2026.2.25=hbd8a1cb_0
- cairo=1.16.0=ha61ee94_1012
- coreutils=8.25=1
- curl=8.1.2=h409715c_0
- expat=2.6.4=h5888daf_0
- expat=2.7.4=hecca717_0
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0
- font-ttf-inconsolata=3.000=h77eed37_0
- font-ttf-source-code-pro=2.038=h77eed37_0
- font-ttf-ubuntu=0.83=h77eed37_3
- fontconfig=2.15.0=h7e30c49_1
- fontconfig=2.17.1=h27c8c51_0
- fonts-conda-ecosystem=1=0
- fonts-conda-forge=1=0
- freetype=2.12.1=h267a509_2
- fribidi=1.0.10=h36c2ea0_0
- gcc_impl_linux-64=14.2.0=h6b349bd_1
- gettext=0.23.1=h5888daf_0
- gettext-tools=0.23.1=h5888daf_0
- gfortran_impl_linux-64=14.2.0=hc73f493_1
- fonts-conda-forge=1=hc364b38_1
- freetype=2.14.2=ha770c72_0
- fribidi=1.0.16=hb03c661_0
- gcc_impl_linux-64=15.2.0=he420e7e_18
- gettext=0.25.1=h3f43e3d_1
- gettext-tools=0.25.1=h3f43e3d_1
- gfortran_impl_linux-64=15.2.0=h281d09f_18
- giflib=5.2.2=hd590300_0
- gmp=6.3.0=hac33072_2
- graphite2=1.3.13=h59595ed_1003
- graphite2=1.3.14=hecca717_2
- gsl=2.7=he838d99_0
- gxx_impl_linux-64=14.2.0=h2c03514_1
- gxx_impl_linux-64=15.2.0=hda75c37_18
- harfbuzz=6.0.0=h8e241bc_0
- icu=70.1=h27087fc_0
- jpeg=9e=h0b41bf4_3
- kernel-headers_linux-64=3.10.0=he073ed8_18
- keyutils=1.6.1=h166bdaf_0
- kernel-headers_linux-64=4.18.0=he073ed8_9
- keyutils=1.6.3=hb9d3cd8_0
- krb5=1.20.1=h81ceb04_0
- lcms2=2.15=hfd0df8a_0
- ld_impl_linux-64=2.43=h712a8e2_2
- lerc=4.0.0=h27087fc_0
- libasprintf=0.23.1=h8e693c7_0
- libasprintf-devel=0.23.1=h8e693c7_0
- libblas=3.9.0=28_h59b9bed_openblas
- libcblas=3.9.0=28_he106b2a_openblas
- ld_impl_linux-64=2.45.1=default_hbd61a6d_102
- lerc=4.1.0=hdb68285_0
- libasprintf=0.25.1=h3f43e3d_1
- libasprintf-devel=0.25.1=h3f43e3d_1
- libblas=3.11.0=5_h4a7cf45_openblas
- libcblas=3.11.0=5_h0358290_openblas
- libcups=2.3.3=h36d4200_3
- libcurl=8.1.2=h409715c_0
- libdeflate=1.17=h0b41bf4_0
- libedit=3.1.20250104=pl5321h7949ede_0
- libev=4.33=hd590300_2
- libexpat=2.6.4=h5888daf_0
- libffi=3.4.6=h2dba641_0
- libgcc=14.2.0=h77fa898_1
- libgcc-devel_linux-64=14.2.0=h41c2201_101
- libgcc-ng=14.2.0=h69a702a_1
- libgettextpo=0.23.1=h5888daf_0
- libgettextpo-devel=0.23.1=h5888daf_0
- libgfortran=14.2.0=h69a702a_1
- libgfortran-ng=14.2.0=h69a702a_1
- libgfortran5=14.2.0=hd5240d6_1
- libexpat=2.7.4=hecca717_0
- libffi=3.5.2=h3435931_0
- libfreetype=2.14.2=ha770c72_0
- libfreetype6=2.14.2=h73754d4_0
- libgcc=15.2.0=he0feb66_18
- libgcc-devel_linux-64=15.2.0=hcc6f6b0_118
- libgcc-ng=15.2.0=h69a702a_18
- libgettextpo=0.25.1=h3f43e3d_1
- libgettextpo-devel=0.25.1=h3f43e3d_1
- libgfortran=15.2.0=h69a702a_18
- libgfortran-ng=15.2.0=h69a702a_18
- libgfortran5=15.2.0=h68bc16d_18
- libglib=2.78.1=hebfc3b9_0
- libgomp=14.2.0=h77fa898_1
- libiconv=1.18=h4ce23a2_0
- liblapack=3.9.0=28_h7ac8fdf_openblas
- liblzma=5.6.4=hb9d3cd8_0
- liblzma-devel=5.6.4=hb9d3cd8_0
- libgomp=15.2.0=he0feb66_18
- libiconv=1.18=h3b78370_2
- liblapack=3.11.0=5_h47877c9_openblas
- liblzma=5.8.2=hb03c661_0
- liblzma-devel=5.8.2=hb03c661_0
- libnghttp2=1.64.0=h161d5f1_0
- libnsl=2.0.1=hd590300_0
- libopenblas=0.3.28=pthreads_h94d23a6_1
- libpng=1.6.47=h943b412_0
- libsanitizer=14.2.0=h2a3dede_1
- libsqlite=3.49.1=hee588c1_1
- libssh2=1.11.1=hf672d98_0
- libstdcxx=14.2.0=hc0a3c3a_1
- libstdcxx-devel_linux-64=14.2.0=h41c2201_101
- libstdcxx-ng=14.2.0=h4852527_1
- libnsl=2.0.1=hb9d3cd8_1
- libopenblas=0.3.30=pthreads_h94d23a6_4
- libpng=1.6.55=h421ea60_0
- libsanitizer=15.2.0=h90f66d4_18
- libsqlite=3.52.0=h0c1763c_0
- libssh2=1.11.1=hcf80075_0
- libstdcxx=15.2.0=h934c35e_18
- libstdcxx-devel_linux-64=15.2.0=hd446a21_118
- libstdcxx-ng=15.2.0=hdf11a46_18
- libtiff=4.5.0=h6adf6a1_2
- libuuid=2.38.1=h0b41bf4_0
- libwebp-base=1.5.0=h851e524_0
- libuuid=2.41.3=h5347b49_0
- libwebp-base=1.6.0=hd42ef1d_0
- libxcb=1.13=h7f98852_1004
- libxcrypt=4.4.36=hd590300_1
- libxml2=2.10.3=hca2bb57_4
- libzlib=1.3.1=hb9d3cd8_2
- libzlib=1.3.2=h25fd6f3_2
- make=4.4.1=hb9d3cd8_2
- ncurses=6.5=h2d0b736_3
- nextflow=24.10.4=hdfd78af_0
- nextflow=25.10.4=h2a3209d_0
- openjdk=17.0.3=h58dac75_5
- openssl=3.5.2=h26f9b46_0
- openssl=3.6.1=h35e630c_1
- pango=1.50.14=hd33c08f_0
- pcre2=10.40=hc3806b6_0
- pip=25.0.1=pyh8b19718_0
- pixman=0.44.2=h29eaf8c_0
- poseidon-trident=1.6.7.1=hebebf5b_0
- pip=25.2=pyh8b19718_0
- pixman=0.46.4=h54a6638_1
- poseidon-trident=1.7.0.0=h894548b_0
- pthread-stubs=0.4=hb9d3cd8_1002
- python=3.9.21=h9c0c6dc_1_cpython
- python=3.9.23=hc30ae73_0_cpython
- r-assertthat=0.2.1=r42hc72bb7e_4
- r-backports=1.5.0=r42hb1dbf0f_0
- r-base=4.2.3=ha7d60f8_0
Expand Down Expand Up @@ -149,21 +151,21 @@ dependencies:
- r-vctrs=0.6.5=r42ha503ecb_0
- r-waldo=0.5.2=r42hc72bb7e_0
- r-withr=3.0.0=r42hc72bb7e_0
- readline=8.2=h8228510_1
- sed=4.8=he412f7d_0
- setuptools=75.8.0=pyhff2d567_0
- sysroot_linux-64=2.17=h0157908_18
- tk=8.6.13=noxft_h4845f30_101
- tktable=2.10=h8bc8fbc_6
- readline=8.3=h853b02a_0
- sed=4.9=h6688a6e_0
- setuptools=80.9.0=pyhff2d567_0
- sysroot_linux-64=2.28=h4ee821c_9
- tk=8.6.13=noxft_hd72426e_102
- tktable=2.10=h8d826fa_7
- wheel=0.45.1=pyhd8ed1ab_1
- xorg-fixesproto=5.0=hb9d3cd8_1003
- xorg-inputproto=2.3.2=hb9d3cd8_1003
- xorg-kbproto=1.0.7=hb9d3cd8_1003
- xorg-libice=1.0.10=h7f98852_0
- xorg-libsm=1.2.3=hd9c2040_1000
- xorg-libx11=1.8.4=h0b41bf4_0
- xorg-libxau=1.0.12=hb9d3cd8_0
- xorg-libxdmcp=1.1.5=hb9d3cd8_0
- xorg-libxau=1.0.12=hb03c661_1
- xorg-libxdmcp=1.1.5=hb03c661_1
- xorg-libxext=1.3.4=h0b41bf4_2
- xorg-libxfixes=5.0.3=h7f98852_1004
- xorg-libxi=1.7.10=h7f98852_0
Expand All @@ -174,11 +176,11 @@ dependencies:
- xorg-renderproto=0.11.1=hb9d3cd8_1003
- xorg-xextproto=7.3.0=hb9d3cd8_1004
- xorg-xproto=7.0.31=hb9d3cd8_1008
- xz=5.6.4=hbcc6ac9_0
- xz-gpl-tools=5.6.4=hbcc6ac9_0
- xz-tools=5.6.4=hb9d3cd8_0
- zlib=1.3.1=hb9d3cd8_2
- zstd=1.5.6=ha6fb4c9_0
- xz=5.8.2=ha02ee65_0
- xz-gpl-tools=5.8.2=ha02ee65_0
- xz-tools=5.8.2=hb03c661_0
- zlib=1.3.2=h25fd6f3_2
- zstd=1.5.7=hb78ec9c_6
- pip:
- argparse==1.4.0
- biopython==1.81
Expand Down
20 changes: 19 additions & 1 deletion scripts/minotaur_packager.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
VERSION='0.5.0'
VERSION='0.5.2'
set -o pipefail ## Pipefail: a pipeline returns non-zero if any command in it fails. (Complaining on unassigned variables would additionally need 'set -u'.)
# set -x ## Debugging

Expand Down Expand Up @@ -128,6 +128,7 @@ function add_versions_file() {
local capture_type_version_string
local pipeline_report_fn
local populate_janno_version
local trident_version

## Read in function params
package_eager_result_dir=${1}
Expand All @@ -153,6 +154,7 @@ function add_versions_file() {
config_version=$(grep "config_template_version" ${pipeline_report_fn} | awk -F ' ' '{print $NF}')
package_config_version=$(grep "package_config_version" ${pipeline_report_fn} | awk -F ' ' '{print $NF}')
populate_janno_version=$(${repo_dir}/scripts/populate_janno.py -v)
trident_version=$(trident --version 2>/dev/null)

errecho -y "[${package_name}]: Writing version info to '${version_fn}'."
## Create the versions file. Flush any old file contents if the file exists.
Expand All @@ -168,6 +170,7 @@ function add_versions_file() {
echo " - Package config version: ${package_config_version}" >> ${version_fn}
echo " - Minotaur-packager version: ${VERSION}" >> ${version_fn}
echo " - populate_janno.py version: ${populate_janno_version}" >> ${version_fn}
echo " - trident version for packaging: ${trident_version}" >> ${version_fn}
}

## Function to add SSF file to minotaur package
Expand Down Expand Up @@ -347,6 +350,11 @@ genotype_fns=($(ls -1 ${root_results_dir}/genotyping/*geno)) ## List of genotype
snp_set="1240K"
errecho -y "[${package_name}]: SNP set inferred as '${snp_set}'."

## TODO-dev Currently hardcoded referenceAssembly info, but in the future maybe infer from genotype input names?
referenceAssembly="GRCh37"
referenceAssemblyURL="https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.14"
errecho -y "[${package_name}]: Genome assembly inferred as '${referenceAssembly}' with URL '${referenceAssemblyURL}'."

## Check that the inferred snp set is supported. Should trigger if the inference somehow breaks.
supported_snpsets=($(ls -1 ${repo_dir}/conf/CaptureType_profiles/ | cut -d "." -f 1))
if [[ ! -z $(all_x_in_y 1 ${snp_set} ${#supported_snpsets[@]} ${supported_snpsets[@]}) ]]; then
Expand All @@ -369,6 +377,16 @@ elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_p
trident init -p ${tmp_dir}/${package_name}.geno -o ${tmp_dir}/package/ -n ${package_name} --snpSet ${snp_set}
check_fail $? "[${package_name}]: Failed to initialise package. Aborting."

## To be replaced once trident init can do this natively.
## First infer the line number that includes the snpSet. Then add the reference assembly info underneath that line, then the rest of the yml file.
errecho -y "[${package_name}]: Adding reference genome assembly information."
insert_after=$(grep -n "snpSet" ${tmp_dir}/package/POSEIDON.yml | cut -d ':' -f 1)
head -n ${insert_after} ${tmp_dir}/package/POSEIDON.yml > ${tmp_dir}/package/tmp_POSEIDON.yml
echo " referenceGenomeAssembly: ${referenceAssembly}" >> ${tmp_dir}/package/tmp_POSEIDON.yml
echo " referenceGenomeAssemblyURL: ${referenceAssemblyURL}" >> ${tmp_dir}/package/tmp_POSEIDON.yml
tail -n +$((${insert_after}+1)) ${tmp_dir}/package/POSEIDON.yml >> ${tmp_dir}/package/tmp_POSEIDON.yml
mv ${tmp_dir}/package/tmp_POSEIDON.yml ${tmp_dir}/package/POSEIDON.yml

## Add Thiseas as contributor to poseidon package
## Trident 1.5.* does not include Josiah Carberry anymore, which breaks pyJanno if the field is empty.
errecho -y "[${package_name}]: Adding Thiseas as contributor to package."
Expand Down
20 changes: 11 additions & 9 deletions scripts/populate_janno.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np
from collections import namedtuple

VERSION = "0.5.1"
VERSION = "0.5.2"


def get_eager_version(eager_result_dir):
Expand Down Expand Up @@ -272,9 +272,9 @@ def infer_minotaur_library_id(df):
## Read poseidon yaml, infer path to janno file and read janno file.
poseidon_yaml_data = PoseidonYaml(args.poseidon_yml_path)
janno_table = pd.read_table(poseidon_yaml_data.janno_file, dtype=str)
## Add Main_ID to janno table. That is the Poseidon_ID after removing minotaur processing related suffixes.
## Add Individual_ID to janno table. That is the Poseidon_ID after removing minotaur processing related suffixes.
janno_table["Eager_ID"] = janno_table["Poseidon_ID"].str.replace(r"_MNT", "")
janno_table["Main_ID"] = janno_table["Eager_ID"].str.replace(r"_ss", "")
janno_table["Individual_ID"] = janno_table["Eager_ID"].str.replace(r"_ss", "")

## Prepare damage table for joining. Infer eager Library_ID from id column, by removing '_rmdup.bam' suffix
## The "_rmdup" is removed separately to also apply to mapdamage results (which lack the .bam suffix)
Expand Down Expand Up @@ -610,7 +610,7 @@ def unique_values_join(x, sep=";"):
summarised_stats = (
compound_eager_table.groupby("Sample_Name")["endogenous"]
.apply(
max,
np.maximum.reduce,
)
.reset_index("Sample_Name")
.rename(columns={"endogenous": "Endogenous"})
Expand Down Expand Up @@ -643,6 +643,7 @@ def unique_values_join(x, sep=";"):
)
## Replace columns in original janno with values in final_eager_table
## TODO-dev need to infer Genetic_Sex from 'RateX', 'RateY', 'RateErrX', 'RateErrY'
## TODO: turn this into a dict with each column having its type, so typing can be made explicit, then remove the pd.option_context('future.no_silent_downcasting', True) wrapper
for col in [
"Nr_SNPs",
"Damage",
Expand All @@ -658,9 +659,10 @@ def unique_values_join(x, sep=";"):
"UDG",
"Genetic_Source_Accession_IDs",
]:
filled_janno_table[col] = (
filled_janno_table[[col + "_x", col + "_y"]].bfill(axis=1).iloc[:, 0]
)
with pd.option_context("future.no_silent_downcasting", True):
filled_janno_table[col] = (
filled_janno_table[[col + "_x", col + "_y"]].bfill(axis=1).iloc[:, 0]
)

## Drop columns duplicated from merges, and columns that are not relevant anymore.
filled_janno_table = filled_janno_table.drop(
Expand All @@ -682,7 +684,7 @@ def unique_values_join(x, sep=";"):
"Genetic_Sex",
"Group_Name",
"Alternative_IDs",
"Main_ID", ## Added
"Individual_ID", ## Added
"Relation_To",
"Relation_Degree",
"Relation_Type",
Expand All @@ -704,7 +706,7 @@ def unique_values_join(x, sep=";"):
"Date_Note",
"MT_Haplogroup",
"Y_Haplogroup",
"Source_Tissue",
"Source_Material",
"Nr_Libraries",
"Library_Names",
"Capture_Type",
Expand Down