Skip to content

Commit 1350bc1

Browse files
committed
faster fastqs with oxbow library
1 parent 845cb51 commit 1350bc1

4 files changed

Lines changed: 30 additions & 46 deletions

File tree

countess/core/parameters.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import math
44
import os.path
55
import re
6+
import string
67
from decimal import Decimal
78
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Type, Union
89

@@ -282,6 +283,11 @@ def copy(self) -> "StringCharacterSetParam":
282283
return self.__class__(self.label, self.value, character_set=self.character_set)
283284

284285

class ColumnLabelParam(StringCharacterSetParam):
    """A StringCharacterSetParam restricted to characters that are safe to
    use in a column label: ASCII letters, digits and ``. - _ :``."""

    character_set: set[str] = set(string.ascii_letters + string.digits + ".-_:")
285291
class FileBaseParam(StringParam):
286292
"""A StringParam for holding a filename, either to be read or written."""
287293

countess/core/pipeline.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,10 @@ def run(self):
305305
# we can improve on this a little.
306306
# see https://github.com/duckdb/duckdb/issues/1848
307307

308-
logger.info("Starting")
309308
start_time = time.time()
309+
logger.info("Starting: %s", start_time)
310310
for node in self.traverse_nodes():
311+
logger.info("... starting %s", node.name)
311312
node.load_config()
312313
sources = {pn.name: pn.result for pn in node.parent_nodes}
313314
node.plugin.prepare_multi(self.ddbc, sources)
@@ -316,8 +317,10 @@ def run(self):
316317
node.result = duckdb_source_to_view(self.ddbc, result)
317318
else:
318319
node.result = None
320+
logger.info("... completed %s", node.name)
319321

320-
logger.info("Finished, elapsed time: %d", time.time() - start_time)
322+
finish_time = time.time()
323+
logger.info("Finished: %s, elapsed time: %d", finish_time, finish_time - start_time)
321324

322325
def reset(self):
323326
for node in self.nodes:

countess/plugins/fastq.py

Lines changed: 18 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
import itertools
21
import logging
32
from typing import Iterable, Optional
43

5-
import dnaio
64
import duckdb
7-
import pyarrow
5+
import oxbow
86

97
from countess import VERSION
108
from countess.core.parameters import BaseParam, BooleanParam, FloatParam
@@ -32,45 +30,31 @@ class LoadFastqPlugin(DuckdbLoadFileWithTheLotPlugin):
def load_file(
    self, cursor: duckdb.DuckDBPyConnection, filename: str, file_param: BaseParam, row_limit: Optional[int] = None
) -> duckdb.DuckDBPyRelation:
    """Load a FASTQ file into a DuckDB relation via oxbow.

    Only the columns actually needed are requested from oxbow: 'quality'
    when a minimum-average-quality filter is configured and 'name' when a
    header column is wanted.  Returns a relation with a 'sequence' column,
    optionally a 'header' column, optionally grouped to
    (sequence, count) rows, and truncated to `row_limit` rows if given.
    """
    logger.debug("Loading file %s row_limit %s", filename, row_limit)

    # Request only the fields we will actually use downstream.
    fields = ["sequence"]
    if self.min_avg_quality:
        fields.append("quality")
    if self.header_column:
        fields.append("name")

    rel = oxbow.from_fastq(filename, fields=fields).to_duckdb(cursor)

    if row_limit:
        rel = rel.limit(row_limit)

    if self.min_avg_quality:
        # Phred quality = ord(char) - 33, so "average quality >= threshold"
        # is equivalent to "average ord >= threshold + 33".
        # BUG FIX: min_avg_quality is a FloatParam; the previous "%d" format
        # truncated fractional thresholds (e.g. 30.5 became a cutoff of 63
        # instead of 63.5), silently loosening the filter.  "%s" preserves
        # the full float value in the SQL expression.
        # NOTE(review): assumes DuckDB split(s, '') yields one element per
        # character — confirm against the DuckDB string-function docs.
        filt = "list_avg(list_transform(split(quality,''), lambda x: ord(x))) >= %s" % (self.min_avg_quality + 33)
        rel = rel.filter(filt)

    # Keep the historical output schema: the record name is exposed as
    # a column called "header".
    if self.header_column:
        rel = rel.project("sequence, name as header")
    else:
        rel = rel.project("sequence")

    if self.group:
        rel = rel.aggregate("sequence, count(*) as count")

    return rel

7660
def combine(
@@ -96,14 +80,5 @@ class LoadFastaPlugin(DuckdbLoadFileWithTheLotPlugin):
def load_file(
    self, cursor: duckdb.DuckDBPyConnection, filename: str, file_param: BaseParam, row_limit: Optional[int] = None
) -> duckdb.DuckDBPyRelation:
    """Load a FASTA file into a DuckDB relation via oxbow.

    Returns a relation with 'sequence' and 'header' columns, truncated to
    `row_limit` rows when a limit is given.
    """
    rel = oxbow.from_fasta(filename).to_duckdb(cursor)
    # BUG FIX: the dnaio-based loader this replaces exposed the record name
    # as a "header" column, but oxbow calls that column "name" (the FASTQ
    # loader in this file already renames it with "name as header").  Without
    # this projection the output schema silently changed for downstream
    # consumers expecting "header".
    # NOTE(review): assumes oxbow.from_fasta yields "name" and "sequence"
    # columns, matching from_fastq — confirm against the oxbow docs.
    rel = rel.project("sequence, name as header")
    return rel.limit(row_limit) if row_limit else rel

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@ classifiers = [
1818
'Topic :: Scientific/Engineering :: Bio-Informatics',
1919
]
2020
dependencies = [
21-
'dnaio~=1.2.3',
2221
'duckdb~=1.3.1',
2322
'fqfa~=1.3.1',
2423
'more_itertools~=10.7.0',
2524
'numpy~=2.2.6',
25+
'oxbow~=0.5.1',
2626
'pandas~=2.3.0',
2727
'psutil~=7.0.0',
2828
'pyarrow~=20.0.0',

0 commit comments

Comments
 (0)