@@ -135,35 +135,62 @@ impl PackedNSeqVec {
135135 }
136136 }
137137
138+ /// Read a fasta/fastq file at the given path into a `PackedNSeq`, separating records with `N`.
139+ #[ cfg( feature = "file_io" ) ]
140+ pub fn from_fastx ( path : & std:: path:: Path ) -> Self {
141+ let mut ascii = Vec :: with_capacity ( 16000 ) ;
142+
143+ let mut seq = Self :: default ( ) ;
144+
145+ let mut reader = needletail:: parse_fastx_file ( & path) . unwrap ( ) ;
146+ while let Some ( record) = reader. next ( ) {
147+ let seqrec = record. expect ( "Invalid FASTA/Q record" ) ;
148+ ascii. extend_from_slice ( & seqrec. seq ( ) ) ;
149+ ascii. push ( b'N' ) ;
150+
151+ // Convert from ascii dna to packed dna+mask in batches of 16kbp.
152+ if ascii. len ( ) > 16000 {
153+ seq. push_ascii ( & ascii) ;
154+ ascii. clear ( ) ;
155+ }
156+ }
157+
158+ if ascii. len ( ) > 0 {
159+ seq. push_ascii ( & ascii) ;
160+ }
161+
162+ seq
163+ }
164+
165+ /// Read a fasta/fastq file at the given path into a `PackedNSeq`, separating records with `N`.
166+ /// Low-quality bases are masked out.
138167 #[ cfg( feature = "file_io" ) ]
139168 pub fn from_fastq_with_quality ( path : & std:: path:: Path , min_qual : u8 ) -> Self {
140- let mut dna = vec ! [ ] ;
169+ let mut ascii = vec ! [ ] ;
141170 let mut qual = vec ! [ ] ;
142171
143- let mut seqs = Self :: default ( ) ;
172+ let mut seq = Self :: default ( ) ;
144173
145174 let mut reader = needletail:: parse_fastx_file ( & path) . unwrap ( ) ;
146175 while let Some ( record) = reader. next ( ) {
147176 let seqrec = record. expect ( "Invalid FASTA/Q record" ) ;
148- dna . extend_from_slice ( & seqrec. seq ( ) ) ;
177+ ascii . extend_from_slice ( & seqrec. seq ( ) ) ;
149178 qual. extend_from_slice ( & seqrec. qual ( ) . unwrap ( ) ) ;
150- dna . push ( b'N' ) ;
179+ ascii . push ( b'N' ) ;
151180 qual. push ( 0 ) ;
152181
153182 // Convert from ascii dna+qual to packed dna+mask in batches of 16kbp.
154- if dna . len ( ) > 16000 {
155- seqs . push_from_ascii_and_quality ( & dna , & qual, min_qual) ;
156- dna . clear ( ) ;
183+ if ascii . len ( ) > 16000 {
184+ seq . push_from_ascii_and_quality ( & ascii , & qual, min_qual) ;
185+ ascii . clear ( ) ;
157186 qual. clear ( ) ;
158187 }
159188 }
160189
161- if dna. len ( ) > 0 {
162- seqs. push_from_ascii_and_quality ( & dna, & qual, min_qual) ;
163- dna. clear ( ) ;
164- qual. clear ( ) ;
190+ if ascii. len ( ) > 0 {
191+ seq. push_from_ascii_and_quality ( & ascii, & qual, min_qual) ;
165192 }
166193
167- seqs
194+ seq
168195 }
169196}
0 commit comments