Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 64 additions & 6 deletions src/formats/ncbi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

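/// Loads an NCBI taxonomy from the given directory.
/// The directory should contain at least two files: `nodes.dmp` and `names.dmp`.
/// When `genetic_code` is `Some(true)`, each node's non-empty genetic code ID from
/// `nodes.dmp` is stored in that node's data under the `genetic_code_id` key.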
pub fn load<P: AsRef<Path>>(ncbi_directory: P) -> TaxonomyResult<GeneralTaxonomy> {
pub fn load<P: AsRef<Path>>(
ncbi_directory: P,
genetic_code: Option<bool>,
) -> TaxonomyResult<GeneralTaxonomy> {
let dir = ncbi_directory.as_ref();
let nodes_file = std::fs::File::open(dir.join(NODES_FILENAME))?;
let names_file = std::fs::File::open(dir.join(NAMES_FILENAME))?;
Expand All @@ -23,6 +26,7 @@
let mut tax_ids: Vec<String> = Vec::new();
let mut parents: Vec<String> = Vec::new();
let mut ranks: Vec<TaxRank> = Vec::new();
let mut data: Vec<HashMap<String, serde_json::Value>> = Vec::new();
let mut tax_to_idx: HashMap<String, usize> = HashMap::new();

for (ix, line) in BufReader::new(nodes_file).lines().enumerate() {
Expand All @@ -37,10 +41,27 @@
let tax_id = fields.remove(0).trim().to_string();
let parent_tax_id = fields.remove(0).trim().to_string();
let rank = fields.remove(0);
// After removing tax_id, parent_tax_id, and rank, fields[3] contains the genetic code ID
// (originally the 7th column, index 6)
let genetic_code_id = if fields.len() > 3 {
fields[3].trim().to_string()
} else {
String::new()
};

tax_ids.push(tax_id.clone());
parents.push(parent_tax_id.to_string());
ranks.push(TaxRank::from_str(&rank)?);

let mut node_data = HashMap::new();
if genetic_code.unwrap_or(false) && !genetic_code_id.is_empty() {
node_data.insert(
"genetic_code_id".to_string(),
serde_json::Value::String(genetic_code_id),
);
}
data.push(node_data);

tax_to_idx.insert(tax_id, ix);
}

Expand Down Expand Up @@ -77,13 +98,19 @@
}
}

let gt =
GeneralTaxonomy::from_arrays(tax_ids, parent_ids, Some(names), Some(ranks), None, None)?;
let gt = GeneralTaxonomy::from_arrays(
tax_ids,
parent_ids,
Some(names),
Some(ranks),
None,
Some(data),
)?;
gt.validate_uniqueness()?;
Ok(gt)
}

pub fn save<'t, T: 't, P: AsRef<Path>, X: Taxonomy<'t, T>>(

Check warning on line 113 in src/formats/ncbi.rs

View workflow job for this annotation

GitHub Actions / clippy

bound is defined in more than one place
tax: &'t X,
out_dir: P,
) -> TaxonomyResult<()>
Expand All @@ -106,14 +133,23 @@
.map(|(x, _)| format!("{}", x))
.unwrap_or_default()
};

// Extract genetic_code_id from data if present
let genetic_code_data = tax.data(key.clone())?;
let genetic_code = genetic_code_data
.get("genetic_code_id")
.and_then(|v| v.as_str())
.unwrap_or("1");

name_writer
.write_all(format!("{}\t|\t{}\t|\t\t|\tscientific name\t|\n", &key, name).as_bytes())?;
node_writer.write_all(
format!(
"{}\t|\t{}\t|\t{}\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n",
"{}\t|\t{}\t|\t{}\t|\t\t|\t\t|\t\t|\t{}\t|\t\t|\t\t|\t\t|\t\t|\t\t|\t\t|\n",
&key,
parent,
rank.to_ncbi_rank(),
genetic_code,
)
.as_bytes(),
)?;
Expand Down Expand Up @@ -201,7 +237,7 @@
let mut names_file = std::fs::File::create(path.join(NAMES_FILENAME)).unwrap();
writeln!(names_file, "{}", names).unwrap();

let tax = load(path).unwrap();
let tax = load(path, Some(true)).unwrap();
assert_eq!(
Taxonomy::<&str>::name(&tax, "562").unwrap(),
"Escherichia coli"
Expand All @@ -219,11 +255,18 @@
Some(("561", 1.))
);

// Check that genetic code is included when genetic_code=Some(true)
let data_562 = Taxonomy::<&str>::data(&tax, "562").unwrap();
assert_eq!(
data_562.get("genetic_code_id").and_then(|v| v.as_str()),
Some("11")
);

let out = path.join("out");
save::<&str, _, _>(&tax, &out).unwrap();

// now load again and validate a few taxids
let tax2 = load(&out).unwrap();
let tax2 = load(&out, Some(true)).unwrap();

// Check E. coli (562)
assert_eq!(
Expand Down Expand Up @@ -258,5 +301,20 @@
Taxonomy::<&str>::children(&tax2, "561").unwrap(),
vec!["562"]
);

// Test loading without genetic_code (None)
let tax_no_gc = load(path, None).unwrap();
assert_eq!(
Taxonomy::<&str>::name(&tax_no_gc, "562").unwrap(),
"Escherichia coli"
);
// Check that genetic code is NOT included when genetic_code=None
let data_no_gc = Taxonomy::<&str>::data(&tax_no_gc, "562").unwrap();
assert_eq!(data_no_gc.get("genetic_code_id"), None);

// Test loading with genetic_code=Some(false)
let tax_false_gc = load(path, Some(false)).unwrap();
let data_false_gc = Taxonomy::<&str>::data(&tax_false_gc, "562").unwrap();
assert_eq!(data_false_gc.get("genetic_code_id"), None);
}
}
17 changes: 14 additions & 3 deletions src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ pub struct TaxonomyNode {
parent: Option<String>,
#[pyo3(get)]
rank: String,
#[pyo3(get)]
genetic_code: Option<String>,
// Ideally this would be private
extra: HashMap<String, Value>,
}
Expand All @@ -86,6 +88,7 @@ impl TaxonomyNode {
self.name.hash(&mut hasher);
self.parent.hash(&mut hasher);
self.rank.hash(&mut hasher);
self.genetic_code.hash(&mut hasher);
for key in self.extra.keys() {
key.hash(&mut hasher);
}
Expand All @@ -108,6 +111,7 @@ impl TaxonomyNode {
"name" => Ok(self.name.to_object(py)),
"parent" => Ok(self.parent.to_object(py)),
"rank" => Ok(self.rank.to_object(py)),
"genetic_code" => Ok(self.genetic_code.to_object(py)),
_ => {
if self.extra.contains_key(key) {
Ok(json_value_to_pyobject(self.extra.get(key).unwrap()))
Expand Down Expand Up @@ -153,10 +157,17 @@ impl Taxonomy {
let parent = py_try!(self.tax.parent(tax_id)).map(|(p, _)| p.to_string());
let extra = py_try!(self.tax.data(tax_id));

// Extract genetic_code_id from extra data if present
let genetic_code = extra
.get("genetic_code_id")
.and_then(|v| v.as_str())
.map(|s| s.to_string());

Ok(TaxonomyNode {
id: tax_id.to_string(),
name: name.to_string(),
rank,
genetic_code,
extra: (*extra).to_owned(),
parent,
})
Expand Down Expand Up @@ -201,14 +212,14 @@ impl Taxonomy {
Ok(Taxonomy { tax })
}

/// from_ncbi(cls, dump_dir: str)
/// from_ncbi(cls, dump_dir: str, genetic_code: bool|None = None)
/// --
///
/// Load a Taxonomy from a directory.
/// The directory must contain the `nodes.dmp` and `names.dmp` files.
#[classmethod]
fn from_ncbi(_cls: &PyType, dump_dir: &str) -> PyResult<Taxonomy> {
let tax = py_try!(ncbi::load(dump_dir));
fn from_ncbi(_cls: &PyType, dump_dir: &str, genetic_code: Option<bool>) -> PyResult<Taxonomy> {
let tax = py_try!(ncbi::load(dump_dir, genetic_code));
Ok(Taxonomy { tax })
}

Expand Down
1 change: 1 addition & 0 deletions taxonomy.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class TaxonomyNode:
name: str
parent: Optional[str]
rank: str
genetic_code: Optional[str]

def __hash__(self) -> int: ...
def __repr__(self) -> str: ...
Expand Down
Loading