-
Notifications
You must be signed in to change notification settings - Fork 1
Test fastxpp implementations #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Trecek
wants to merge
16
commits into
main
Choose a base branch
from
BFXINT_906_Implement_fastxpp
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+714
−2
Open
Changes from 10 commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
d299d67
Add support for generating fastxpp format
TalonTrecek 250ef26
Add support for reading fastxpp format
TalonTrecek 0d210b4
Add file for benchmarking
TalonTrecek f229a57
Comments
TalonTrecek 69830a3
Add bpl
TalonTrecek f1a1df3
Add bpl generation
TalonTrecek e3c3c8b
Add swar decode
TalonTrecek a4614eb
remove hlen
TalonTrecek ae738bb
move swar decode for eventual vendor
TalonTrecek d20d1b0
clean up
TalonTrecek f82709d
Switch to gzip to open
TalonTrecek 9c4697c
Add memcpy free method
TalonTrecek 8d56122
add read once test
TalonTrecek b3b11d5
remove toying around code
TalonTrecek 6462073
cleanup
TalonTrecek b7a8cd4
Clean up
TalonTrecek File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| import sys | ||
| from time.time import perf_counter | ||
| from ishlib.vendor.kseq import FastxReader, BufferedReader, KRead | ||
| from ExtraMojo.utils.ir import dump_ir | ||
|
|
||
|
|
||
| # ── thin wrapper so FileHandle implements KRead ───────────────────────── | ||
struct FileReader(KRead):
    """Adapter that lets a plain `FileHandle` be used where a `KRead` is required."""

    var fh: FileHandle

    fn __init__(out self, owned fh: FileHandle):
        """Take ownership of the already-opened handle `fh`."""
        self.fh = fh^

    fn __moveinit__(out self, owned other: Self):
        """Move the wrapped handle out of `other` into the new instance."""
        self.fh = other.fh^

    fn unbuffered_read[
        o: MutableOrigin
    ](mut self, buffer: Span[UInt8, o]) raises -> Int:
        """Read up to `len(buffer)` bytes from the file directly into `buffer`.

        Returns:
            Whatever `FileHandle.read` reports, converted to `Int`.
        """
        var dest = buffer.unsafe_ptr()
        var capacity = len(buffer)
        return Int(self.fh.read(dest, capacity))
|
|
||
|
|
||
| # ──────────────────────────────────────────────────────────────────────── | ||
|
|
||
|
|
||
fn bench_original(path: String) raises -> (Int, Int, Float64):
    """Time one full pass over `path` using `FastxReader.read`.

    Returns:
        A tuple of (records read, sum of `len(rdr.seq)` over all records,
        elapsed `perf_counter` time). The clock starts after the reader has
        been constructed.
    """
    var handle = open(path, "r")
    var reader = FastxReader[read_comment=False](
        BufferedReader(FileReader(handle^))
    )
    var n_records = 0
    var n_bases = 0
    var started = perf_counter()
    # The loop runs for as long as `read()` returns a positive value.
    while reader.read() > 0:
        n_records += 1
        n_bases += len(reader.seq)
    return (n_records, n_bases, perf_counter() - started)
|
|
||
|
|
||
fn bench_fastxpp(path: String) raises -> (Int, Int, Float64):
    """Time one full pass over `path` using `FastxReader.read_fastxpp`.

    Returns:
        A tuple of (records read, sum of the per-record values returned by
        `read_fastxpp`, elapsed `perf_counter` time).
    """
    var handle = open(path, "r")
    var reader = FastxReader[read_comment=False](
        BufferedReader(FileReader(handle^))
    )
    var n_records = 0
    var n_bases = 0
    var started = perf_counter()
    # A negative return value ends the pass; every other value is counted.
    var got = reader.read_fastxpp()
    while got >= 0:
        n_records += 1
        n_bases += got
        got = reader.read_fastxpp()
    return (n_records, n_bases, perf_counter() - started)
|
|
||
|
|
||
fn bench_fastxpp_bpl(path: String) raises -> (Int, Int, Float64):
    """Time one full pass over `path` using `FastxReader.read_fastxpp_bpl`.

    Returns:
        A tuple of (records read, sum of the per-record values returned by
        `read_fastxpp_bpl`, elapsed `perf_counter` time).
    """
    var handle = open(path, "r")
    var reader = FastxReader[read_comment=False](
        BufferedReader(FileReader(handle^))
    )
    var n_records = 0
    var n_bases = 0
    var started = perf_counter()
    # A negative return value ends the pass; every other value is counted.
    var got = reader.read_fastxpp_bpl()
    while got >= 0:
        n_records += 1
        n_bases += got
        got = reader.read_fastxpp_bpl()
    return (n_records, n_bases, perf_counter() - started)
|
|
||
|
|
||
fn bench_fastxpp_swar(path: String) raises -> (Int, Int, Float64):
    """Time one full pass over `path` using `FastxReader.read_fastxpp_swar`.

    Returns:
        A tuple of (records read, sum of the per-record values returned by
        `read_fastxpp_swar`, elapsed `perf_counter` time).
    """
    var handle = open(path, "r")
    var reader = FastxReader[read_comment=False](
        BufferedReader(FileReader(handle^))
    )
    var n_records = 0
    var n_bases = 0
    var started = perf_counter()
    # A negative return value ends the pass; every other value is counted.
    var got = reader.read_fastxpp_swar()
    while got >= 0:
        n_records += 1
        n_bases += got
        got = reader.read_fastxpp_swar()
    return (n_records, n_bases, perf_counter() - started)
|
|
||
|
|
||
fn bench_fastxpp_bpl2(path: String) raises -> (Int, Int, Float64):
    """Time one full pass over `path` using `FastxReader.read_fastxpp_bpl`.

    This calls the same reader method as `bench_fastxpp_bpl`.
    NOTE(review): presumably a placeholder for a second variant - confirm.

    Returns:
        A tuple of (records read, sum of the per-record values returned by
        `read_fastxpp_bpl`, elapsed `perf_counter` time).
    """
    var handle = open(path, "r")
    var reader = FastxReader[read_comment=False](
        BufferedReader(FileReader(handle^))
    )
    var n_records = 0
    var n_bases = 0
    var started = perf_counter()
    # A negative return value ends the pass; every other value is counted.
    var got = reader.read_fastxpp_bpl()
    while got >= 0:
        n_records += 1
        n_bases += got
        got = reader.read_fastxpp_bpl()
    return (n_records, n_bases, perf_counter() - started)
|
|
||
|
|
||
fn main() raises:
    """Command-line entry point: run one benchmark over a file and print totals.

    Arguments (from `sys.argv()`):
        argv[1]: path of the file to read.
        argv[2]: optional mode - one of "orig", "fastxpp", "bpl", "swar" or
                 "filler". Defaults to "orig" when omitted.

    Prints a usage line when the argument count is wrong and
    "Unknown mode: <mode>" when the mode is not recognised.
    """
    var argv = sys.argv()
    if len(argv) < 2 or len(argv) > 3:
        # Advertise every mode that the dispatch below actually accepts.
        print(
            "Usage: mojo run fastxpp_bench.mojo <file>"
            " [orig|fastxpp|bpl|swar|filler]"
        )
        return

    var path = String(argv[1])
    var mode: String = "orig"  # default when no flag given
    if len(argv) == 3:
        mode = String(argv[2])

    if mode == "orig":
        r, s, t = bench_original(path)
        print("mode=orig records=", r, " bases=", s, " time=", t, "s")
    elif mode == "bpl":
        r, s, t = bench_fastxpp_bpl(path)
        print("mode=fastxpp_bpl records=", r, " bases=", s, " time=", t, "s")
    elif mode == "fastxpp":
        r, s, t = bench_fastxpp(path)
        print("mode=fastxpp records=", r, " bases=", s, " time=", t, "s")
    elif mode == "swar":
        r, s, t = bench_fastxpp_swar(path)
        print("mode=fastxpp_swar records=", r, " bases=", s, " time=", t, "s")
    elif mode == "filler":
        # NOTE(review): this prints the same label as the "bpl" mode -
        # confirm that is intended.
        r, s, t = bench_fastxpp_bpl2(path)
        print("mode=fastxpp_bpl records=", r, " bases=", s, " time=", t, "s")
    else:
        print("Unknown mode:", mode)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,183 @@ | ||
| import sys | ||
| from collections import Optional | ||
| from ExtraMojo.io.buffered import BufferedReader, BufferedWriter | ||
| from collections import List # dynamic grow-able buffer | ||
| from memory import Span # view into the List for zero-copy writes | ||
|
|
||
| # ---------- helpers ------------------------------------------------- | ||
|
|
||
|
|
||
fn string_count(s: String) -> Int:
    """Return how many codepoints `s.codepoints()` yields for `s`."""
    var total: Int = 0
    for _ in s.codepoints():
        total += 1
    return total
|
|
||
|
|
||
fn read_line(mut rdr: BufferedReader) raises -> String:
    """Read from `rdr` up to the next newline byte and return it as a `String`.

    Returns:
        The bytes that `read_until` placed in the buffer, or "" when
        `read_until` reports that it read nothing.
    """
    var raw = List[UInt8]()
    if rdr.read_until(raw, ord("\n")) == 0:
        return ""
    var line = String()
    line.write_bytes(Span(raw))
    return line
|
|
||
|
|
||
| # ---------- FASTX++ builder ----------------------------------------- | ||
|
|
||
|
|
||
fn generate_fastxpp(
    marker: String,
    header: String,
    seq_lines: List[String],
    qualities: Optional[List[String]] = None,
) -> String:
    """Build one record with a colon-separated metadata block.

    Layout of the returned text:
        marker, a backtick, "hlen:slen:nlin", a backtick, header, LF,
        then every sequence line followed by LF,
        then - only when `qualities` is present - "+" LF and every quality
        line followed by LF.

    hlen and slen are codepoint counts (see `string_count`); nlin is the
    number of sequence lines.

    Args:
        marker: Record marker placed before the metadata block.
        header: Header text without the marker.
        seq_lines: Sequence lines without line terminators. May be empty.
        qualities: Optional quality lines without line terminators.

    Returns:
        The assembled record.
    """
    # Total sequence length across all lines. The former `bpl` local was never
    # used and indexed seq_lines[0], which is out of bounds for a record that
    # has no sequence lines, so it has been dropped.
    var seq_len: Int = 0
    for i in range(len(seq_lines)):
        seq_len += string_count(seq_lines[i])

    var meta = (
        String(string_count(header))
        + ":"
        + String(seq_len)
        + ":"
        + String(len(seq_lines))
    )

    var rec = marker + "`" + meta + "`" + header + "\n"

    for i in range(len(seq_lines)):
        rec.write(seq_lines[i], "\n")

    if qualities:
        var q = qualities.value()
        rec += "+\n"
        for i in range(len(q)):
            rec.write(q[i], "\n")

    return rec
|
|
||
|
|
||
fn generate_fastxpp_bpl(
    marker: String,
    header: String,
    seq_lines: List[String],
    qualities: Optional[List[String]] = None,
) -> String:
    """Build one record whose metadata block also carries bytes-per-line.

    Layout of the returned text:
        marker, a backtick, "hlen:slen:nlin:bpl", a backtick, header, LF,
        then every sequence line followed by LF,
        then - only when `qualities` is present - "+" LF and every quality
        line followed by LF (the same separator `generate_fastxpp` writes).

    bpl is the codepoint count of the first sequence line plus one for the LF.
    slen is derived as (bpl - 1) * (nlin - 1) + length of the last line, i.e.
    it assumes every line except the last has the same length as the first.
    A record with no sequence lines gets slen = 0 and bpl = 0.

    Args:
        marker: Record marker placed before the metadata block.
        header: Header text without the marker.
        seq_lines: Sequence lines without line terminators. May be empty.
        qualities: Optional quality lines without line terminators.

    Returns:
        The assembled record.
    """
    var nlin = len(seq_lines)
    var bpl = 0
    var slen = 0
    # Only touch the first/last line when there is one; a header-only record
    # would otherwise index an empty list.
    if nlin > 0:
        bpl = string_count(seq_lines[0]) + 1  # bases + LF
        slen = (bpl - 1) * (nlin - 1) + string_count(seq_lines[nlin - 1])

    # Parenthesised so the multi-line concatenation is an explicit continuation.
    var meta = (
        String(string_count(header))
        + ":"
        + String(slen)
        + ":"
        + String(nlin)
        + ":"
        + String(bpl)
    )
    var rec = marker + "`" + meta + "`" + header + "\n"
    for i in range(nlin):
        rec.write(seq_lines[i], "\n")
    if qualities:
        var q = qualities.value()
        # Separator line between sequence and qualities.
        rec += "+\n"
        for i in range(len(q)):
            rec.write(q[i], "\n")
    return rec
|
|
||
# Helper: render a value (intended to be unsigned, at most 9 digits) as
# zero-padded decimal ASCII.
fn to_ascii_padded(value: Int, width: Int) -> String:
    """Left-pad the decimal text of `value` with "0" up to `width` characters.

    When the decimal text already has `width` or more characters, no zeros are
    added and the text is returned at its natural length.

    Args:
        value: Number to render.
        width: Target field width in characters.

    Returns:
        The zero-padded decimal text.
    """
    var text = String(value)  # e.g. "123"
    var missing = width - string_count(text)  # zeros still needed

    # Reserve the target width up front, then emit zeros followed by digits.
    var padded = String(capacity=width)
    var written = 0
    while written < missing:
        padded.write("0")
        written += 1
    padded.write(text)
    return padded
|
|
||
fn generate_fastxpp_bpl_fixed(
    marker: String,
    header: String,
    seq_lines: List[String],
    qualities: Optional[List[String]] = None,
) -> String:
    """Build one record with a fixed-width, zero-padded metadata block.

    Layout of the returned text:
        marker, a backtick, slen (9 digits), nlin (7 digits), bpl (3 digits),
        a backtick, header, LF,
        then every sequence line followed by LF,
        then - only when `qualities` is present - "+" LF and every quality
        line followed by LF (the same separator `generate_fastxpp` writes).

    bpl is the codepoint count of the first sequence line plus one for the LF.
    slen is derived as (bpl - 1) * (nlin - 1) + length of the last line, i.e.
    it assumes every line except the last has the same length as the first.
    A record with no sequence lines gets slen = 0 and bpl = 0.
    No header-length field is written.

    Args:
        marker: Record marker placed before the metadata block.
        header: Header text without the marker.
        seq_lines: Sequence lines without line terminators. May be empty.
        qualities: Optional quality lines without line terminators.

    Returns:
        The assembled record.
    """
    # --- numeric fields ------------------------------------------------
    var nlin = len(seq_lines)
    var bpl = 0
    var slen = 0
    # Only touch the first/last line when there is one; a header-only record
    # would otherwise index an empty list.
    if nlin > 0:
        bpl = string_count(seq_lines[0]) + 1  # incl. LF
        slen = (bpl - 1) * (nlin - 1) + string_count(seq_lines[nlin - 1])

    # --- fixed-width metadata block ------------------------------------
    # Parenthesised so the multi-line concatenation is an explicit continuation.
    var meta = (
        "`"
        + to_ascii_padded(slen, 9)  # slen
        + to_ascii_padded(nlin, 7)  # nlin
        + to_ascii_padded(bpl, 3)  # bpl
        + "`"
    )

    # --- assemble record -----------------------------------------------
    var rec = marker + meta + header + "\n"
    for i in range(nlin):
        rec.write(seq_lines[i], "\n")
    if qualities:
        var q = qualities.value()
        # Separator line between sequence and qualities.
        rec += "+\n"
        for i in range(len(q)):
            rec.write(q[i], "\n")
    return rec
|
|
||
| # ---------- main ---------------------------------------------------- | ||
|
|
||
|
|
||
fn main() raises:
    """Convert a FASTA/FASTQ file into the fixed-width FASTX++ layout.

    Arguments (from `sys.argv()`):
        argv[1]: input path.
        argv[2]: output path.

    Records are read line by line. A record starts at a header line; the
    following lines are collected as sequence until the next header (a line
    starting with ">" or "@"), a "+" separator (only inside an "@" record) or
    an empty read. After a "+" separator, as many quality lines are read as
    there were sequence lines. Each record is written with
    `generate_fastxpp_bpl_fixed`.
    """
    var argv = sys.argv()
    if len(argv) != 3:
        print(
            "Usage: mojo run generate_fastxpp.mojo <input.fastx>"
            " <output.fastxpp>"
        )
        return

    var reader = BufferedReader(
        open(String(argv[1]), "r"), buffer_capacity=128 * 1024
    )
    var writer = BufferedWriter(
        open(String(argv[2]), "w"), buffer_capacity=128 * 1024
    )

    var pending_header = String()  # carries a header we already read

    while True:
        var header_line = pending_header
        if header_line == "":
            header_line = read_line(reader)
        pending_header = String()

        if header_line == "":
            break

        var marker = String(header_line[0:1])
        var header = String(header_line[1:])

        var seq = List[String]()
        var has_quality = False

        while True:
            var line = read_line(reader)
            if line == "":
                break
            if marker == "@" and line.startswith("+"):
                # The "+" line only separates sequence from qualities. It must
                # NOT be carried over as the next header, otherwise the next
                # iteration would emit a bogus record with marker "+" and an
                # empty sequence list.
                has_quality = True
                break
            if line.startswith(">") or line.startswith("@"):
                pending_header = line  # save for the next record
                break
            seq.append(line)

        var qual: Optional[List[String]] = None
        if has_quality:
            # Quality lines are read by count, not by prefix.
            var qlines = List[String]()
            for _ in range(len(seq)):
                qlines.append(read_line(reader))
            qual = Optional[List[String]](qlines)

        writer.write(generate_fastxpp_bpl_fixed(marker, header, seq, qual))

    writer.flush()
    writer.close()
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.