Skip to content

Commit b24f0d4

Browse files
Add PackedNSeq::from_fastx
1 parent 9bfe8d9 commit b24f0d4

1 file changed

Lines changed: 39 additions & 12 deletions

File tree

src/packed_n_seq.rs

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -135,35 +135,62 @@ impl PackedNSeqVec {
135135
}
136136
}
137137

138+
/// Read a fasta/fastq file at the given path into a `PackedNSeq`, separating records with `N`.
139+
#[cfg(feature = "file_io")]
140+
pub fn from_fastx(path: &std::path::Path) -> Self {
141+
let mut ascii = Vec::with_capacity(16000);
142+
143+
let mut seq = Self::default();
144+
145+
let mut reader = needletail::parse_fastx_file(&path).unwrap();
146+
while let Some(record) = reader.next() {
147+
let seqrec = record.expect("Invalid FASTA/Q record");
148+
ascii.extend_from_slice(&seqrec.seq());
149+
ascii.push(b'N');
150+
151+
// Convert from ascii dna to packed dna+mask in batches of 16kbp.
152+
if ascii.len() > 16000 {
153+
seq.push_ascii(&ascii);
154+
ascii.clear();
155+
}
156+
}
157+
158+
if ascii.len() > 0 {
159+
seq.push_ascii(&ascii);
160+
}
161+
162+
seq
163+
}
164+
165+
/// Read a fasta/fastq file at the given path into a `PackedNSeq`, separating records with `N`.
166+
/// Low-quality bases are masked out.
138167
#[cfg(feature = "file_io")]
139168
pub fn from_fastq_with_quality(path: &std::path::Path, min_qual: u8) -> Self {
140-
let mut dna = vec![];
169+
let mut ascii = vec![];
141170
let mut qual = vec![];
142171

143-
let mut seqs = Self::default();
172+
let mut seq = Self::default();
144173

145174
let mut reader = needletail::parse_fastx_file(&path).unwrap();
146175
while let Some(record) = reader.next() {
147176
let seqrec = record.expect("Invalid FASTA/Q record");
148-
dna.extend_from_slice(&seqrec.seq());
177+
ascii.extend_from_slice(&seqrec.seq());
149178
qual.extend_from_slice(&seqrec.qual().unwrap());
150-
dna.push(b'N');
179+
ascii.push(b'N');
151180
qual.push(0);
152181

153182
// Convert from ascii dna+qual to packed dna+mask in batches of 16kbp.
154-
if dna.len() > 16000 {
155-
seqs.push_from_ascii_and_quality(&dna, &qual, min_qual);
156-
dna.clear();
183+
if ascii.len() > 16000 {
184+
seq.push_from_ascii_and_quality(&ascii, &qual, min_qual);
185+
ascii.clear();
157186
qual.clear();
158187
}
159188
}
160189

161-
if dna.len() > 0 {
162-
seqs.push_from_ascii_and_quality(&dna, &qual, min_qual);
163-
dna.clear();
164-
qual.clear();
190+
if ascii.len() > 0 {
191+
seq.push_from_ascii_and_quality(&ascii, &qual, min_qual);
165192
}
166193

167-
seqs
194+
seq
168195
}
169196
}

0 commit comments

Comments
 (0)