Skip to content

Commit 1350bc1

Browse files
committed
faster fastqs with oxbow library
1 parent 845cb51 commit 1350bc1

4 files changed

Lines changed: 30 additions & 46 deletions

File tree

countess/core/parameters.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import math
44
import os.path
55
import re
6+
import string
67
from decimal import Decimal
78
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Type, Union
89

@@ -282,6 +283,11 @@ def copy(self) -> "StringCharacterSetParam":
282283
return self.__class__(self.label, self.value, character_set=self.character_set)
283284

284285

class ColumnLabelParam(StringCharacterSetParam):
    """A StringCharacterSetParam restricted to characters that are safe to
    use in a column label: ASCII letters, digits and ``. - _ :``."""

    character_set: set[str] = set(string.ascii_letters + string.digits + ".-_:")
285291
class FileBaseParam(StringParam):
286292
"""A StringParam for holding a filename, either to be read or written."""
287293

countess/core/pipeline.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,10 @@ def run(self):
305305
# we can improve on this a little.
306306
# see https://github.com/duckdb/duckdb/issues/1848
307307

308-
logger.info("Starting")
309308
start_time = time.time()
309+
logger.info("Starting: %s", start_time)
310310
for node in self.traverse_nodes():
311+
logger.info("... starting %s", node.name)
311312
node.load_config()
312313
sources = {pn.name: pn.result for pn in node.parent_nodes}
313314
node.plugin.prepare_multi(self.ddbc, sources)
@@ -316,8 +317,10 @@ def run(self):
316317
node.result = duckdb_source_to_view(self.ddbc, result)
317318
else:
318319
node.result = None
320+
logger.info("... completed %s", node.name)
319321

320-
logger.info("Finished, elapsed time: %d", time.time() - start_time)
322+
finish_time = time.time()
323+
logger.info("Finished: %s, elapsed time: %d", finish_time, finish_time - start_time)
321324

322325
def reset(self):
323326
for node in self.nodes:

countess/plugins/fastq.py

Lines changed: 18 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
import itertools
21
import logging
32
from typing import Iterable, Optional
43

5-
import dnaio
64
import duckdb
7-
import pyarrow
5+
import oxbow
86

97
from countess import VERSION
108
from countess.core.parameters import BaseParam, BooleanParam, FloatParam
@@ -32,45 +30,31 @@ class LoadFastqPlugin(DuckdbLoadFileWithTheLotPlugin):
def load_file(
    self, cursor: duckdb.DuckDBPyConnection, filename: str, file_param: BaseParam, row_limit: Optional[int] = None
) -> duckdb.DuckDBPyRelation:
    """Load a FASTQ file into a DuckDB relation via oxbow.

    Only the columns actually needed are requested from oxbow: 'quality'
    when a minimum-average-quality filter is configured and 'name' when a
    header column is wanted.  Returns a relation with a 'sequence' column,
    optionally a 'header' column, optionally grouped to
    (sequence, count) rows, and truncated to `row_limit` rows if given.
    """
    logger.debug("Loading file %s row_limit %s", filename, row_limit)

    # Request only the fields we will actually use downstream.
    fields = ["sequence"]
    if self.min_avg_quality:
        fields.append("quality")
    if self.header_column:
        fields.append("name")

    rel = oxbow.from_fastq(filename, fields=fields).to_duckdb(cursor)

    if row_limit:
        rel = rel.limit(row_limit)

    if self.min_avg_quality:
        # Phred quality = ord(char) - 33, so "average quality >= threshold"
        # is equivalent to "average ord >= threshold + 33".
        # BUG FIX: min_avg_quality is a FloatParam; the previous "%d" format
        # truncated fractional thresholds (e.g. 30.5 became a cutoff of 63
        # instead of 63.5), silently loosening the filter.  "%s" preserves
        # the full float value in the SQL expression.
        # NOTE(review): assumes DuckDB split(s, '') yields one element per
        # character — confirm against the DuckDB string-function docs.
        filt = "list_avg(list_transform(split(quality,''), lambda x: ord(x))) >= %s" % (self.min_avg_quality + 33)
        rel = rel.filter(filt)

    # Keep the historical output schema: the record name is exposed as
    # a column called "header".
    if self.header_column:
        rel = rel.project("sequence, name as header")
    else:
        rel = rel.project("sequence")

    if self.group:
        rel = rel.aggregate("sequence, count(*) as count")

    return rel

7660
def combine(
@@ -96,14 +80,5 @@ class LoadFastaPlugin(DuckdbLoadFileWithTheLotPlugin):
def load_file(
    self, cursor: duckdb.DuckDBPyConnection, filename: str, file_param: BaseParam, row_limit: Optional[int] = None
) -> duckdb.DuckDBPyRelation:
    """Load a FASTA file into a DuckDB relation via oxbow.

    Returns a relation with 'sequence' and 'header' columns, truncated to
    `row_limit` rows when a limit is given.
    """
    rel = oxbow.from_fasta(filename).to_duckdb(cursor)
    # BUG FIX: the dnaio-based loader this replaces exposed the record name
    # as a "header" column, but oxbow calls that column "name" (the FASTQ
    # loader in this file already renames it with "name as header").  Without
    # this projection the output schema silently changed for downstream
    # consumers expecting "header".
    # NOTE(review): assumes oxbow.from_fasta yields "name" and "sequence"
    # columns, matching from_fastq — confirm against the oxbow docs.
    rel = rel.project("sequence, name as header")
    return rel.limit(row_limit) if row_limit else rel

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@ classifiers = [
1818
'Topic :: Scientific/Engineering :: Bio-Informatics',
1919
]
2020
dependencies = [
21-
'dnaio~=1.2.3',
2221
'duckdb~=1.3.1',
2322
'fqfa~=1.3.1',
2423
'more_itertools~=10.7.0',
2524
'numpy~=2.2.6',
25+
'oxbow~=0.5.1',
2626
'pandas~=2.3.0',
2727
'psutil~=7.0.0',
2828
'pyarrow~=20.0.0',

0 commit comments

Comments
 (0)