From 0ce38ad134710b89bc1a27c6bd9ff883d83942d2 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Wed, 18 Feb 2026 21:20:26 +0000 Subject: [PATCH 1/3] first --- src/formats/ncbi.rs | 34 +++++++++++++++++++++++++++++----- src/python.rs | 17 ++++++++++++++--- taxonomy.pyi | 1 + 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs index c3b8522..62b19a0 100644 --- a/src/formats/ncbi.rs +++ b/src/formats/ncbi.rs @@ -14,7 +14,7 @@ const NAMES_FILENAME: &str = "names.dmp"; /// Loads a NCBI taxonomy from the given directory. /// The directory should contain at least two files: `nodes.dmp` and `names.dmp`. -pub fn load>(ncbi_directory: P) -> TaxonomyResult { +pub fn load>(ncbi_directory: P, genetic_code: Option) -> TaxonomyResult { let dir = ncbi_directory.as_ref(); let nodes_file = std::fs::File::open(dir.join(NODES_FILENAME))?; let names_file = std::fs::File::open(dir.join(NAMES_FILENAME))?; @@ -23,6 +23,7 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult = Vec::new(); let mut parents: Vec = Vec::new(); let mut ranks: Vec = Vec::new(); + let mut data: Vec> = Vec::new(); let mut tax_to_idx: HashMap = HashMap::new(); for (ix, line) in BufReader::new(nodes_file).lines().enumerate() { @@ -37,10 +38,24 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult 3 { + fields[3].trim().to_string() + } else { + String::new() + }; tax_ids.push(tax_id.clone()); parents.push(parent_tax_id.to_string()); ranks.push(TaxRank::from_str(&rank)?); + + let mut node_data = HashMap::new(); + if genetic_code.unwrap_or(false) && !genetic_code_id.is_empty() { + node_data.insert("genetic_code_id".to_string(), serde_json::Value::String(genetic_code_id)); + } + data.push(node_data); + tax_to_idx.insert(tax_id, ix); } @@ -78,7 +93,7 @@ pub fn load>(ncbi_directory: P) -> TaxonomyResult::name(&tax, "562").unwrap(), "Escherichia coli" @@ -223,7 +247,7 @@ mod tests { save::<&str, _, _>(&tax, &out).unwrap(); // now load again and validate a few taxids - let tax2 = load(&out).unwrap(); + let tax2 = load(&out, Some(true)).unwrap(); // Check E. coli (562) assert_eq!( diff --git a/src/python.rs b/src/python.rs index ba104fd..f5eb0e0 100644 --- a/src/python.rs +++ b/src/python.rs @@ -74,6 +74,8 @@ pub struct TaxonomyNode { parent: Option, #[pyo3(get)] rank: String, + #[pyo3(get)] + genetic_code: Option, // Ideally this would be private extra: HashMap, } @@ -86,6 +88,7 @@ impl TaxonomyNode { self.name.hash(&mut hasher); self.parent.hash(&mut hasher); self.rank.hash(&mut hasher); + self.genetic_code.hash(&mut hasher); for key in self.extra.keys() { key.hash(&mut hasher); } @@ -108,6 +111,7 @@ impl TaxonomyNode { "name" => Ok(self.name.to_object(py)), "parent" => Ok(self.parent.to_object(py)), "rank" => Ok(self.rank.to_object(py)), + "genetic_code" => Ok(self.genetic_code.to_object(py)), _ => { if self.extra.contains_key(key) { Ok(json_value_to_pyobject(self.extra.get(key).unwrap())) @@ -153,10 +157,17 @@ impl Taxonomy { let parent = py_try!(self.tax.parent(tax_id)).map(|(p, _)| p.to_string()); let extra = py_try!(self.tax.data(tax_id)); + // Extract genetic_code_id from extra data if present + let genetic_code = extra + .get("genetic_code_id") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + Ok(TaxonomyNode { id: tax_id.to_string(), name: name.to_string(), rank, + genetic_code, extra: (*extra).to_owned(), parent, }) @@ -201,14 +212,14 @@ impl Taxonomy { Ok(Taxonomy { tax }) } - /// from_ncbi(cls, dump_dir: str) + /// from_ncbi(cls, dump_dir: str, genetic_code: bool = None) /// -- /// /// Load a Taxonomy from a directory. /// The directory must contain the `nodes.dmp` and `names.dmp` files. #[classmethod] - fn from_ncbi(_cls: &PyType, dump_dir: &str) -> PyResult { - let tax = py_try!(ncbi::load(dump_dir)); + fn from_ncbi(_cls: &PyType, dump_dir: &str, genetic_code: Option) -> PyResult { + let tax = py_try!(ncbi::load(dump_dir, genetic_code)); Ok(Taxonomy { tax }) } diff --git a/taxonomy.pyi b/taxonomy.pyi index 91eb779..a5cad3b 100644 --- a/taxonomy.pyi +++ b/taxonomy.pyi @@ -12,6 +12,7 @@ class TaxonomyNode: name: str parent: Optional[str] rank: str + genetic_code: Optional[str] def __hash__(self) -> int: ... def __repr__(self) -> str: ... From f9cd3c226a95c635d0c344315e76ac9037fd3dc5 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Wed, 18 Feb 2026 21:37:45 +0000 Subject: [PATCH 2/3] comments --- src/formats/ncbi.rs | 22 ++++++++++++++++++++++ src/python.rs | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs index 62b19a0..2e891d7 100644 --- a/src/formats/ncbi.rs +++ b/src/formats/ncbi.rs @@ -243,6 +243,13 @@ mod tests { Some(("561", 1.)) ); + // Check that genetic code is included when genetic_code=Some(true) + let data_562 = Taxonomy::<&str>::data(&tax, "562").unwrap(); + assert_eq!( + data_562.get("genetic_code_id").and_then(|v| v.as_str()), + Some("11") + ); + let out = path.join("out"); save::<&str, _, _>(&tax, &out).unwrap(); @@ -282,5 +289,20 @@ mod tests { Taxonomy::<&str>::children(&tax2, "561").unwrap(), vec!["562"] ); + + // Test loading without genetic_code (None) + let tax_no_gc = load(path, None).unwrap(); + assert_eq!( + Taxonomy::<&str>::name(&tax_no_gc, "562").unwrap(), + "Escherichia coli" + ); + // Check that genetic code is NOT included when genetic_code=None + let data_no_gc = Taxonomy::<&str>::data(&tax_no_gc, "562").unwrap(); + assert_eq!(data_no_gc.get("genetic_code_id"), None); + + // Test loading with genetic_code=Some(false) + let tax_false_gc = load(path, Some(false)).unwrap(); + let data_false_gc = Taxonomy::<&str>::data(&tax_false_gc, "562").unwrap(); + assert_eq!(data_false_gc.get("genetic_code_id"), None); } } diff --git a/src/python.rs b/src/python.rs index f5eb0e0..6fe3e15 100644 --- a/src/python.rs +++ b/src/python.rs @@ -212,7 +212,7 @@ impl Taxonomy { Ok(Taxonomy { tax }) } - /// from_ncbi(cls, dump_dir: str, genetic_code: bool = None) + /// from_ncbi(cls, dump_dir: str, genetic_code: bool|None = None) /// -- /// /// Load a Taxonomy from a directory. From a07cb301e86077be37cbf677285b31327d3427d6 Mon Sep 17 00:00:00 2001 From: Robert Baldwin Date: Wed, 18 Feb 2026 21:39:01 +0000 Subject: [PATCH 3/3] lint --- src/formats/ncbi.rs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/formats/ncbi.rs b/src/formats/ncbi.rs index 2e891d7..7424628 100644 --- a/src/formats/ncbi.rs +++ b/src/formats/ncbi.rs @@ -14,7 +14,10 @@ const NAMES_FILENAME: &str = "names.dmp"; /// Loads a NCBI taxonomy from the given directory. /// The directory should contain at least two files: `nodes.dmp` and `names.dmp`. -pub fn load>(ncbi_directory: P, genetic_code: Option) -> TaxonomyResult { +pub fn load>( + ncbi_directory: P, + genetic_code: Option, +) -> TaxonomyResult { let dir = ncbi_directory.as_ref(); let nodes_file = std::fs::File::open(dir.join(NODES_FILENAME))?; let names_file = std::fs::File::open(dir.join(NAMES_FILENAME))?; @@ -52,7 +55,10 @@ pub fn load>(ncbi_directory: P, genetic_code: Option) -> Ta let mut node_data = HashMap::new(); if genetic_code.unwrap_or(false) && !genetic_code_id.is_empty() { - node_data.insert("genetic_code_id".to_string(), serde_json::Value::String(genetic_code_id)); + node_data.insert( + "genetic_code_id".to_string(), + serde_json::Value::String(genetic_code_id), + ); } data.push(node_data); @@ -92,8 +98,14 @@ pub fn load>(ncbi_directory: P, genetic_code: Option) -> Ta } } - let gt = - GeneralTaxonomy::from_arrays(tax_ids, parent_ids, Some(names), Some(ranks), None, Some(data))?; + let gt = GeneralTaxonomy::from_arrays( + tax_ids, + parent_ids, + Some(names), + Some(ranks), + None, + Some(data), + )?; gt.validate_uniqueness()?; Ok(gt) }