diff --git a/.images/multiply-pipeline.png b/.images/multiply-pipeline.png index 39ef522..89e6f53 100644 Binary files a/.images/multiply-pipeline.png and b/.images/multiply-pipeline.png differ diff --git a/README.md b/README.md index 0c6a7e1..82d9578 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,16 @@ $1$ New organisms can be made available for download by adding them to the colle $2$ You can make your own primer3 settings by creating new or alterating existing JSON files in the `settings/primer3` folder. +## Extending previously-designed panels + +It's also possible to ask `multiply` to find designs that extend an existing multiplex primer panel (for example, one that has already been tested in the lab). To use this functionality: + +- specify the path to the `multiply` multiplex output for the previous designs - and a list of regions - in the `[Extend]` section of your design file. +- then run the pipeline as usual. + +`multiply` will search for primers in your regions of interest at the `generate` step, but will combine them with the previously-designed primers for all subsequent steps of the pipeline to build new multiplexes extending the original one. (An example of this process can be found in the `designs/pf-extend.ini` file.) + + ## Resources `multiply` uses the following external software and databases: - `primer3`. Individual primer pair design. 
https://primer3.org/ diff --git a/designs/pf-extend.ini b/designs/pf-extend.ini new file mode 100644 index 0000000..af8b726 --- /dev/null +++ b/designs/pf-extend.ini @@ -0,0 +1,25 @@ +[Sample] +genome = PlasmodiumFalciparum + +[Genes] +target_ids = PF3D7_0629500, PF3D7_0304600 +target_names = AAT1, CSP + +[Extend] +target_ids = PF3D7_0206800, PF3D7_0810800, PF3D7_1407900, PF3D7_1408100, PF3D7_0709000, PF3D7_1343700, PF3D7_0417200, MDR1part +target_names = MSP2, DHPS, PMI, PMIII, CRT1, K13, DHFR, MDR1part +design_file = results/pf-default/select/table.multiplexes_information.csv + +[Primers] +include_tails = False +F_tail = TTTCTGTTGGTGCTGATATTGC +R_tail = ACTTGCCTGTCGCTCTATCTTC + +[Amplicons] +min_size_bp = 3000 +max_size_bp = 5000 +primer3_settings = default, stringent, relaxed, variable + +[Output] +name = pf-extend +primer_code = v diff --git a/src/multiply/blast/annotator.py b/src/multiply/blast/annotator.py index 4024325..e3d746c 100644 --- a/src/multiply/blast/annotator.py +++ b/src/multiply/blast/annotator.py @@ -82,7 +82,7 @@ def summarise_by_primer(self, output_path=None): primer_pair_name=qseqid[:-2], target_name=qseqid.split("_")[0], total_alignments=qseqid_df.shape[0], - **qseqid_df[self.annotations].sum().to_dict(), + **qseqid_df[self.annotations.keys()].sum().to_dict(), ) for qseqid, qseqid_df in self.blast_df.groupby("qseqid") ] diff --git a/src/multiply/generate/main.py b/src/multiply/generate/main.py index 29e1e26..eedb267 100644 --- a/src/multiply/generate/main.py +++ b/src/multiply/generate/main.py @@ -140,7 +140,6 @@ def generate(design): print("Done.\n") # WRITE - print("Writing output table...") primer_df = pd.DataFrame( [ pair.get_primer_as_dict(direction) @@ -166,7 +165,16 @@ def generate(design): "pair_penalty", ] ] + + if params['from_extend']: + print( f"Adding {params['extend_primers'].shape[0]} previously-generated primers from [Extend] section..." 
)
+        primer_df = pd.concat([
+            primer_df,
+            params['extend_primers']
+        ])
+
     output_csv = f"{params['output_dir']}/table.candidate_primers.csv"
+    print( f"Writing output table to \"{output_csv}\"..." )
     primer_df.to_csv(output_csv, index=False)
     print(f" to: {output_csv}")
     print("Done.\n")
diff --git a/src/multiply/util/parsing.py b/src/multiply/util/parsing.py
index c70e565..eb5d54c 100644
--- a/src/multiply/util/parsing.py
+++ b/src/multiply/util/parsing.py
@@ -25,6 +25,7 @@ def parse_parameters(design_path):
     params = {}
     params = add_samples(config, params)
     params = add_genes(config, params)
+    params = add_extend(config, params)
     params = add_regions(config, params)
     params = add_primers(config, params)
     params = add_amplicons(config, params)
@@ -58,7 +59,7 @@ def check_design_exists(design_path):
 def check_valid_sections(
     config,
     must_include=["Sample", "Primers", "Amplicons", "Output"],
-    one_of=[["Genes", "Regions"]],
+    one_of=[["Genes", "Regions"]]
 ):
     """
     Check that all expected sections are found within the configuration object
@@ -91,7 +92,6 @@ def check_valid_sections(
                 f"Design must include at least one of these sections: {', '.join(section_set)}. Please add."
             )
 
-
 def add_samples(config, params):
     """
     Add [Sample] information to a parameter dictionary
@@ -174,6 +174,127 @@ def add_genes(config, params):
 
     return params
 
+def add_extend(config, params):
+    """
+    Add [Extend] information from a configparser object to a params dictionary
+    This is similar to the [Genes] section but lists previously-defined primers, via
+    a well-formed candidate primers table.
+    The input design_file can be, e.g. either the table.candidate_primers.csv or the
+    output table.multiplexes_information.csv from a previous run - it must have these columns:
+        'target_id',
+        'target_name',
+        'pair_name',
+        'primer_name',
+        'direction',
+        'seq',
+        'length',
+        'tm',
+        'gc',
+        'chrom',
+        'start',
+        'product_bp',
+        'pair_penalty'
+
+    params
+        config: ConfigParser
+            ConfigParser object holding design file information.
+        params: dict
+            Dictionary of MULTIPLY parameters.
+    returns
+        params: dict
+            Dictionary of MULTIPLY parameters, with [Extend]
+            parameters added.
+    raises
+        DesignFileError
+            If `target_ids` and `target_names` differ in length, or if a target
+            is missing from, or named inconsistently with, the design file.
+    """
+
+    # Nothing to extend unless an [Extend] section has been provided
+    if not config.has_section("Extend"):
+        params["from_extend"] = False
+        return params
+
+    # Parse gene IDs
+    target_ids = [g.strip() for g in config.get("Extend", "target_ids").split(",")]
+
+    # Parse gene names
+    has_names = config.has_option("Extend", "target_names")
+    if has_names:
+        target_names = [
+            g.strip() for g in config.get("Extend", "target_names").split(",")
+        ]
+    else:
+        # Copy rather than alias: names are filled in below, and that must not overwrite the IDs
+        target_names = list(target_ids)
+
+    # Sanity checks
+    n_ids = len(target_ids)
+    n_names = len(target_names)
+    if n_ids != n_names:
+        raise DesignFileError(
+            f"In [Extend], found {n_ids} `target_ids` and {n_names} `target_names`. Ensure equal."
+        )
+
+    # Load the previously-defined primers:
+    import pandas
+    design_file = config.get("Extend", "design_file" )
+    primer_df = pandas.read_csv( design_file )
+    primer_df = primer_df[
+        [
+            'target_id',
+            'target_name',
+            'pair_name',
+            'primer_name',
+            'direction',
+            'seq',
+            'length',
+            'tm',
+            'gc',
+            'chrom',
+            'start',
+            'product_bp',
+            'pair_penalty'
+        ]
+    ]
+    # Check all specified genes are represented in the file and have the correct names:
+    for index, target_id in enumerate( target_ids ):
+        this_target = primer_df[ primer_df.target_id == target_id ]
+        if this_target.shape[0] == 0:
+            raise DesignFileError(
+                f"In [Extend], target id \"{target_id}\" was not among the targets in the specified design file \"{design_file}\"."
+            )
+        target_name = list( set( this_target.target_name.to_list() ))
+        if len(target_name) > 1:
+            raise DesignFileError(
+                f"In [Extend], in the specified design file \"{design_file}\", target {target_id} seems to have more than one name."
+            )
+        target_name = target_name[0]
+        if has_names:
+            # If we're given names, make sure they are correct:
+            if target_name != target_names[index]:
+                raise DesignFileError(
+                    f"In [Extend], the name (\"{target_name}\") for target {target_id} in the design file \"{design_file}\", does not match the one specified (\"{target_names[index]}\")."
+                )
+        else:
+            # Otherwise get the name from the file:
+            target_names[index] = target_name
+
+    primer_df = primer_df[ primer_df.target_id.isin( target_ids )]
+
+    # Mapping between IDs and names
+    id_to_name = {i: n for i, n in zip(target_ids, target_names)}
+
+    # Add to dictionary
+    params["from_extend"] = True
+    params["extend_ids"] = target_ids
+    # Always True: names are either given or have been read from the design file above
+    params["extend_has_names"] = True
+    params["extend_names"] = target_names
+    params["extend_id_to_name"] = id_to_name
+    params["extend_primers"] = primer_df
+
+    return params
 
 def add_regions(config, params):
     """