diff --git a/CHANGELOG.md b/CHANGELOG.md index 359a764..d9df202 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,19 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v1.0.1 - XX/XX/XXXX + +### `Added` + +- `populate_janno.py` -> `0.5.2`: + - Minor tweaks to avoid FutureWarnings from pandas. + +### `Fixed` + +### `Dependencies` + +### `Deprecated` + ## v1.0.0 - 02/09/2025 ### `Added` diff --git a/poseidon-eager.yml b/poseidon-eager.yml index b8d57cb..01d3f3a 100644 --- a/poseidon-eager.yml +++ b/poseidon-eager.yml @@ -4,99 +4,101 @@ channels: - conda-forge dependencies: - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu + - _openmp_mutex=4.5=20_gnu - _r-mutex=1.0.1=anacondar_1 - alsa-lib=1.2.8=h166bdaf_0 - - binutils_impl_linux-64=2.43=h4bf12b8_2 - - bwidget=1.10.1=ha770c72_0 - - bzip2=1.0.8=h4bc722e_7 + - binutils_impl_linux-64=2.45.1=default_hfdba357_102 + - bwidget=1.10.1=ha770c72_1 + - bzip2=1.0.8=hda65f42_9 - c-ares=1.34.4=hb9d3cd8_0 - - ca-certificates=2025.8.3=hbd8a1cb_0 + - ca-certificates=2026.2.25=hbd8a1cb_0 - cairo=1.16.0=ha61ee94_1012 - coreutils=8.25=1 - curl=8.1.2=h409715c_0 - - expat=2.6.4=h5888daf_0 + - expat=2.7.4=hecca717_0 - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - font-ttf-inconsolata=3.000=h77eed37_0 - font-ttf-source-code-pro=2.038=h77eed37_0 - font-ttf-ubuntu=0.83=h77eed37_3 - - fontconfig=2.15.0=h7e30c49_1 + - fontconfig=2.17.1=h27c8c51_0 - fonts-conda-ecosystem=1=0 - - fonts-conda-forge=1=0 - - freetype=2.12.1=h267a509_2 - - fribidi=1.0.10=h36c2ea0_0 - - gcc_impl_linux-64=14.2.0=h6b349bd_1 - - gettext=0.23.1=h5888daf_0 - - gettext-tools=0.23.1=h5888daf_0 - - gfortran_impl_linux-64=14.2.0=hc73f493_1 + - fonts-conda-forge=1=hc364b38_1 + - freetype=2.14.2=ha770c72_0 + - fribidi=1.0.16=hb03c661_0 + - gcc_impl_linux-64=15.2.0=he420e7e_18 + - gettext=0.25.1=h3f43e3d_1 + - gettext-tools=0.25.1=h3f43e3d_1 + 
- gfortran_impl_linux-64=15.2.0=h281d09f_18 - giflib=5.2.2=hd590300_0 - gmp=6.3.0=hac33072_2 - - graphite2=1.3.13=h59595ed_1003 + - graphite2=1.3.14=hecca717_2 - gsl=2.7=he838d99_0 - - gxx_impl_linux-64=14.2.0=h2c03514_1 + - gxx_impl_linux-64=15.2.0=hda75c37_18 - harfbuzz=6.0.0=h8e241bc_0 - icu=70.1=h27087fc_0 - jpeg=9e=h0b41bf4_3 - - kernel-headers_linux-64=3.10.0=he073ed8_18 - - keyutils=1.6.1=h166bdaf_0 + - kernel-headers_linux-64=4.18.0=he073ed8_9 + - keyutils=1.6.3=hb9d3cd8_0 - krb5=1.20.1=h81ceb04_0 - lcms2=2.15=hfd0df8a_0 - - ld_impl_linux-64=2.43=h712a8e2_2 - - lerc=4.0.0=h27087fc_0 - - libasprintf=0.23.1=h8e693c7_0 - - libasprintf-devel=0.23.1=h8e693c7_0 - - libblas=3.9.0=28_h59b9bed_openblas - - libcblas=3.9.0=28_he106b2a_openblas + - ld_impl_linux-64=2.45.1=default_hbd61a6d_102 + - lerc=4.1.0=hdb68285_0 + - libasprintf=0.25.1=h3f43e3d_1 + - libasprintf-devel=0.25.1=h3f43e3d_1 + - libblas=3.11.0=5_h4a7cf45_openblas + - libcblas=3.11.0=5_h0358290_openblas - libcups=2.3.3=h36d4200_3 - libcurl=8.1.2=h409715c_0 - libdeflate=1.17=h0b41bf4_0 - libedit=3.1.20250104=pl5321h7949ede_0 - libev=4.33=hd590300_2 - - libexpat=2.6.4=h5888daf_0 - - libffi=3.4.6=h2dba641_0 - - libgcc=14.2.0=h77fa898_1 - - libgcc-devel_linux-64=14.2.0=h41c2201_101 - - libgcc-ng=14.2.0=h69a702a_1 - - libgettextpo=0.23.1=h5888daf_0 - - libgettextpo-devel=0.23.1=h5888daf_0 - - libgfortran=14.2.0=h69a702a_1 - - libgfortran-ng=14.2.0=h69a702a_1 - - libgfortran5=14.2.0=hd5240d6_1 + - libexpat=2.7.4=hecca717_0 + - libffi=3.5.2=h3435931_0 + - libfreetype=2.14.2=ha770c72_0 + - libfreetype6=2.14.2=h73754d4_0 + - libgcc=15.2.0=he0feb66_18 + - libgcc-devel_linux-64=15.2.0=hcc6f6b0_118 + - libgcc-ng=15.2.0=h69a702a_18 + - libgettextpo=0.25.1=h3f43e3d_1 + - libgettextpo-devel=0.25.1=h3f43e3d_1 + - libgfortran=15.2.0=h69a702a_18 + - libgfortran-ng=15.2.0=h69a702a_18 + - libgfortran5=15.2.0=h68bc16d_18 - libglib=2.78.1=hebfc3b9_0 - - libgomp=14.2.0=h77fa898_1 - - libiconv=1.18=h4ce23a2_0 - - 
liblapack=3.9.0=28_h7ac8fdf_openblas - - liblzma=5.6.4=hb9d3cd8_0 - - liblzma-devel=5.6.4=hb9d3cd8_0 + - libgomp=15.2.0=he0feb66_18 + - libiconv=1.18=h3b78370_2 + - liblapack=3.11.0=5_h47877c9_openblas + - liblzma=5.8.2=hb03c661_0 + - liblzma-devel=5.8.2=hb03c661_0 - libnghttp2=1.64.0=h161d5f1_0 - - libnsl=2.0.1=hd590300_0 - - libopenblas=0.3.28=pthreads_h94d23a6_1 - - libpng=1.6.47=h943b412_0 - - libsanitizer=14.2.0=h2a3dede_1 - - libsqlite=3.49.1=hee588c1_1 - - libssh2=1.11.1=hf672d98_0 - - libstdcxx=14.2.0=hc0a3c3a_1 - - libstdcxx-devel_linux-64=14.2.0=h41c2201_101 - - libstdcxx-ng=14.2.0=h4852527_1 + - libnsl=2.0.1=hb9d3cd8_1 + - libopenblas=0.3.30=pthreads_h94d23a6_4 + - libpng=1.6.55=h421ea60_0 + - libsanitizer=15.2.0=h90f66d4_18 + - libsqlite=3.52.0=h0c1763c_0 + - libssh2=1.11.1=hcf80075_0 + - libstdcxx=15.2.0=h934c35e_18 + - libstdcxx-devel_linux-64=15.2.0=hd446a21_118 + - libstdcxx-ng=15.2.0=hdf11a46_18 - libtiff=4.5.0=h6adf6a1_2 - - libuuid=2.38.1=h0b41bf4_0 - - libwebp-base=1.5.0=h851e524_0 + - libuuid=2.41.3=h5347b49_0 + - libwebp-base=1.6.0=hd42ef1d_0 - libxcb=1.13=h7f98852_1004 - libxcrypt=4.4.36=hd590300_1 - libxml2=2.10.3=hca2bb57_4 - - libzlib=1.3.1=hb9d3cd8_2 + - libzlib=1.3.2=h25fd6f3_2 - make=4.4.1=hb9d3cd8_2 - ncurses=6.5=h2d0b736_3 - - nextflow=24.10.4=hdfd78af_0 + - nextflow=25.10.4=h2a3209d_0 - openjdk=17.0.3=h58dac75_5 - - openssl=3.5.2=h26f9b46_0 + - openssl=3.6.1=h35e630c_1 - pango=1.50.14=hd33c08f_0 - pcre2=10.40=hc3806b6_0 - - pip=25.0.1=pyh8b19718_0 - - pixman=0.44.2=h29eaf8c_0 - - poseidon-trident=1.6.7.1=hebebf5b_0 + - pip=25.2=pyh8b19718_0 + - pixman=0.46.4=h54a6638_1 + - poseidon-trident=1.7.0.0=h894548b_0 - pthread-stubs=0.4=hb9d3cd8_1002 - - python=3.9.21=h9c0c6dc_1_cpython + - python=3.9.23=hc30ae73_0_cpython - r-assertthat=0.2.1=r42hc72bb7e_4 - r-backports=1.5.0=r42hb1dbf0f_0 - r-base=4.2.3=ha7d60f8_0 @@ -149,12 +151,12 @@ dependencies: - r-vctrs=0.6.5=r42ha503ecb_0 - r-waldo=0.5.2=r42hc72bb7e_0 - r-withr=3.0.0=r42hc72bb7e_0 - 
- readline=8.2=h8228510_1 - - sed=4.8=he412f7d_0 - - setuptools=75.8.0=pyhff2d567_0 - - sysroot_linux-64=2.17=h0157908_18 - - tk=8.6.13=noxft_h4845f30_101 - - tktable=2.10=h8bc8fbc_6 + - readline=8.3=h853b02a_0 + - sed=4.9=h6688a6e_0 + - setuptools=80.9.0=pyhff2d567_0 + - sysroot_linux-64=2.28=h4ee821c_9 + - tk=8.6.13=noxft_hd72426e_102 + - tktable=2.10=h8d826fa_7 - wheel=0.45.1=pyhd8ed1ab_1 - xorg-fixesproto=5.0=hb9d3cd8_1003 - xorg-inputproto=2.3.2=hb9d3cd8_1003 @@ -162,8 +164,8 @@ dependencies: - xorg-libice=1.0.10=h7f98852_0 - xorg-libsm=1.2.3=hd9c2040_1000 - xorg-libx11=1.8.4=h0b41bf4_0 - - xorg-libxau=1.0.12=hb9d3cd8_0 - - xorg-libxdmcp=1.1.5=hb9d3cd8_0 + - xorg-libxau=1.0.12=hb03c661_1 + - xorg-libxdmcp=1.1.5=hb03c661_1 - xorg-libxext=1.3.4=h0b41bf4_2 - xorg-libxfixes=5.0.3=h7f98852_1004 - xorg-libxi=1.7.10=h7f98852_0 @@ -174,11 +176,11 @@ dependencies: - xorg-renderproto=0.11.1=hb9d3cd8_1003 - xorg-xextproto=7.3.0=hb9d3cd8_1004 - xorg-xproto=7.0.31=hb9d3cd8_1008 - - xz=5.6.4=hbcc6ac9_0 - - xz-gpl-tools=5.6.4=hbcc6ac9_0 - - xz-tools=5.6.4=hb9d3cd8_0 - - zlib=1.3.1=hb9d3cd8_2 - - zstd=1.5.6=ha6fb4c9_0 + - xz=5.8.2=ha02ee65_0 + - xz-gpl-tools=5.8.2=ha02ee65_0 + - xz-tools=5.8.2=hb03c661_0 + - zlib=1.3.2=h25fd6f3_2 + - zstd=1.5.7=hb78ec9c_6 - pip: - argparse==1.4.0 - biopython==1.81 diff --git a/scripts/minotaur_packager.sh b/scripts/minotaur_packager.sh index b0e6fd6..93977da 100755 --- a/scripts/minotaur_packager.sh +++ b/scripts/minotaur_packager.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -VERSION='0.5.0' +VERSION='0.5.2' set -o pipefail ## Pipefail, complain on new unassigned variables. 
# set -x ## Debugging @@ -128,6 +128,7 @@ function add_versions_file() { local capture_type_version_string local pipeline_report_fn local populate_janno_version + local trident_version ## Read in function params package_eager_result_dir=${1} @@ -153,6 +154,7 @@ function add_versions_file() { config_version=$(grep "config_template_version" ${pipeline_report_fn} | awk -F ' ' '{print $NF}') package_config_version=$(grep "package_config_version" ${pipeline_report_fn} | awk -F ' ' '{print $NF}') populate_janno_version=$(${repo_dir}/scripts/populate_janno.py -v) + trident_version=$(trident --version 2>/dev/null) errecho -y "[${package_name}]: Writing version info to '${version_fn}'." ## Create the versions file. Flush any old file contents if the file exists. @@ -168,6 +170,7 @@ function add_versions_file() { echo " - Package config version: ${package_config_version}" >> ${version_fn} echo " - Minotaur-packager version: ${VERSION}" >> ${version_fn} echo " - populate_janno.py version: ${populate_janno_version}" >> ${version_fn} + echo " - trident version for packaging: ${trident_version}" >> ${version_fn} } ## Function to add SSF file to minotaur package @@ -347,6 +350,11 @@ genotype_fns=($(ls -1 ${root_results_dir}/genotyping/*geno)) ## List of genotype snp_set="1240K" errecho -y "[${package_name}]: SNP set inferred as '${snp_set}'." +## TODO-dev Currently hardcoded referenceAssembly info, but in the future maybe infer from genotype input names? +referenceAssembly="GRCh37" +referenceAssemblyURL="https://www.ncbi.nlm.nih.gov/datasets/genome/GCA_000001405.14" +errecho -y "[${package_name}]: Genome assembly inferred as '${referenceAssembly}' with URL '${referenceAssemblyURL}'." + ## Check that the inferred snp set is supported. Should trigger if the inference somehow breaks. supported_snpsets=($(ls -1 ${repo_dir}/conf/CaptureType_profiles/ | cut -d "." -f 1)) if [[ ! 
-z $(all_x_in_y 1 ${snp_set} ${#supported_snpsets[@]} ${supported_snpsets[@]}) ]]; then @@ -369,6 +377,16 @@ elif [[ ! -d ${output_package_dir} ]] || [[ ${newest_genotype_fn} -nt ${output_p trident init -p ${tmp_dir}/${package_name}.geno -o ${tmp_dir}/package/ -n ${package_name} --snpSet ${snp_set} check_fail $? "[${package_name}]: Failed to initialise package. Aborting." + ## To be replaced once trident init can do this natively. + ## First infer the line number that includes the snpSet. Then add the reference assembly info underneath that line, then the rest of the yml file. + errecho -y "[${package_name}]: Adding reference genome assembly information." + insert_after=$(grep -n "snpSet" ${tmp_dir}/package/POSEIDON.yml | cut -d ':' -f 1) + head -n ${insert_after} ${tmp_dir}/package/POSEIDON.yml > ${tmp_dir}/package/tmp_POSEIDON.yml + echo " referenceGenomeAssembly: ${referenceAssembly}" >> ${tmp_dir}/package/tmp_POSEIDON.yml + echo " referenceGenomeAssemblyURL: ${referenceAssemblyURL}" >> ${tmp_dir}/package/tmp_POSEIDON.yml + tail -n +$((${insert_after}+1)) ${tmp_dir}/package/POSEIDON.yml >> ${tmp_dir}/package/tmp_POSEIDON.yml + mv ${tmp_dir}/package/tmp_POSEIDON.yml ${tmp_dir}/package/POSEIDON.yml + ## Add Thiseas as contributor to poseidon package ## Trident 1.5.* does not include Josiah Carberry anymore, which breaks pyJanno if the field is empty. errecho -y "[${package_name}]: Adding Thiseas as contributor to package." diff --git a/scripts/populate_janno.py b/scripts/populate_janno.py index 60567dc..1306d04 100755 --- a/scripts/populate_janno.py +++ b/scripts/populate_janno.py @@ -10,7 +10,7 @@ import numpy as np from collections import namedtuple -VERSION = "0.5.1" +VERSION = "0.5.2" def get_eager_version(eager_result_dir): @@ -272,9 +272,9 @@ def infer_minotaur_library_id(df): ## Read poseidon yaml, infer path to janno file and read janno file. 
poseidon_yaml_data = PoseidonYaml(args.poseidon_yml_path) janno_table = pd.read_table(poseidon_yaml_data.janno_file, dtype=str) -## Add Main_ID to janno table. That is the Poseidon_ID after removing minotaur processing related suffixes. +## Add Individual_ID to janno table. That is the Poseidon_ID after removing minotaur processing related suffixes. janno_table["Eager_ID"] = janno_table["Poseidon_ID"].str.replace(r"_MNT", "") -janno_table["Main_ID"] = janno_table["Eager_ID"].str.replace(r"_ss", "") +janno_table["Individual_ID"] = janno_table["Eager_ID"].str.replace(r"_ss", "") ## Prepare damage table for joining. Infer eager Library_ID from id column, by removing '_rmdup.bam' suffix ## The "_rmdup" is removed separately to also apply to mapdamage results (which lack the .bam suffix) @@ -610,7 +610,7 @@ def unique_values_join(x, sep=";"): summarised_stats = ( compound_eager_table.groupby("Sample_Name")["endogenous"] .apply( - max, + np.maximum.reduce, ) .reset_index("Sample_Name") .rename(columns={"endogenous": "Endogenous"}) @@ -643,6 +643,7 @@ def unique_values_join(x, sep=";"): ) ## Replace columns in original janno with values in final_eager_table ## TODO-dev need to infer Genetic_Sex from 'RateX', 'RateY', 'RateErrX', 'RateErrY' +## TODO: turn this into a dict with each column having its type, so typing can be made explicit, then remove the pd.option_context("future.no_silent_downcasting", True) wrapper for col in [ "Nr_SNPs", "Damage", @@ -658,9 +659,10 @@ def unique_values_join(x, sep=";"): "UDG", "Genetic_Source_Accession_IDs", ]: - filled_janno_table[col] = ( - filled_janno_table[[col + "_x", col + "_y"]].bfill(axis=1).iloc[:, 0] - ) + with pd.option_context("future.no_silent_downcasting", True): + filled_janno_table[col] = ( + filled_janno_table[[col + "_x", col + "_y"]].bfill(axis=1).iloc[:, 0] + ) ## Drop columns duplicated from merges, and columns that are not relevant anymore. 
filled_janno_table = filled_janno_table.drop( @@ -682,7 +684,7 @@ def unique_values_join(x, sep=";"): "Genetic_Sex", "Group_Name", "Alternative_IDs", - "Main_ID", ## Added + "Individual_ID", ## Added "Relation_To", "Relation_Degree", "Relation_Type", @@ -704,7 +706,7 @@ def unique_values_join(x, sep=";"): "Date_Note", "MT_Haplogroup", "Y_Haplogroup", - "Source_Tissue", + "Source_Material", "Nr_Libraries", "Library_Names", "Capture_Type",