diff --git a/biome.json b/biome.json index 353ba7be..9b0ea9f9 100644 --- a/biome.json +++ b/biome.json @@ -98,6 +98,9 @@ }, "complexity": { "useLiteralKeys": "off" + }, + "suspicious": { + "noMisplacedAssertion": "off" } } } diff --git a/playground/arrays.html b/playground/arrays.html new file mode 100644 index 00000000..e386a8a6 --- /dev/null +++ b/playground/arrays.html @@ -0,0 +1,336 @@ + + + + + + tsb β€” pd.arrays: Nullable Typed Extension Arrays + + + +
← tsb playground
+
+

🔢 pd.arrays — Nullable Typed Extension Arrays

+

Mirrors pandas.arrays: nullable integers, floats, booleans, strings, datetimes, timedeltas.

+ βœ… Complete + +

Overview

+

+ The pd.arrays namespace provides typed extension arrays with first-class support + for missing values (NA). Each array type stores values and a boolean mask separately — when + mask[i] = true the element at position i is NA (missing). +

+

+ These arrays mirror the pandas nullable array types introduced in pandas 1.0+. They differ from + plain JavaScript arrays in that null / undefined are never stored in + the data buffer — missing values are tracked by a separate mask, enabling efficient aggregate + operations that skip NA values. +

+ +

Quick Start

+
import {
+  arrays,
+  IntegerArray,
+  FloatingArray,
+  BooleanArray,
+  StringArray,
+  DatetimeArray,
+  TimedeltaArray,
+} from "tsb";
+
+// Nullable integer array
+const ints = arrays.IntegerArray.from([1, 2, null, 4, 5], "Int32");
+ints.dtype;              // "Int32"
+ints.toArray();          // [1, 2, null, 4, 5]
+ints.sum();              // 12
+ints.fillna(0).toArray(); // [1, 2, 0, 4, 5]
+
+// Nullable float array
+const floats = arrays.FloatingArray.from([1.5, NaN, 3.0]);
+floats.mean();           // 2.25  (NaN treated as NA)
+
+// Nullable boolean — three-valued logic
+const bools = arrays.BooleanArray.from([true, null, false]);
+bools.any();             // true
+bools.all();             // false
+
+// Nullable string array
+const strs = arrays.StringArray.from(["hello", null, "world"]);
+strs.upper().toArray();  // ["HELLO", null, "WORLD"]
+strs.len().toArray();    // [5, null, 5]
+ +

Array Types

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Class | pandas equivalent | Dtypes | NA behaviour
IntegerArray | pandas.arrays.IntegerArray | Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 | null / undefined → NA
FloatingArray | pandas.arrays.FloatingArray | Float32, Float64 | null, undefined, NaN → NA
BooleanArray | pandas.arrays.BooleanArray | "boolean" | Kleene 3-valued logic
StringArray | pandas.arrays.StringArray | "string" | null / undefined → NA
DatetimeArray | pandas.arrays.DatetimeArray | "datetime64[ns]" | NA preserved through all ops
TimedeltaArray | pandas.arrays.TimedeltaArray | "timedelta64[ns]" | NA preserved through all ops
+ +

IntegerArray

+
import { IntegerArray } from "tsb";
+
+// Construction
+const a = IntegerArray.from([1, 2, null, 4], "Int32");
+a.dtype;          // "Int32"
+a.size;           // 4
+a.at(2);          // null  (NA)
+a.isna();         // [false, false, true, false]
+
+// Arithmetic (NA propagates)
+a.add(10).toArray();    // [11, 12, null, 14]
+a.mul(2).toArray();     // [2, 4, null, 8]
+a.floordiv(2).toArray(); // [0, 1, null, 2]
+
+// Reductions
+a.sum();          // 7
+a.mean();         // 7/3 ≈ 2.33
+a.min();          // 1
+a.max();          // 4
+a.count();        // 3
+
+// Fill and drop NA
+a.fillna(0).toArray();  // [1, 2, 0, 4]
+a.dropna();             // [1, 2, 4]
+
+// Type conversion
+a.astype("Int64");
+ +

FloatingArray

+
import { FloatingArray } from "tsb";
+
+const f = FloatingArray.from([1.0, 2.5, NaN, 4.5]);
+// NaN is treated as NA
+f.toArray();      // [1.0, 2.5, null, 4.5]
+
+// Statistics
+f.sum();          // 8.0
+f.mean();         // 8.0 / 3 ≈ 2.67
+f.std();          // sample standard deviation (ddof=1)
+f.min();          // 1.0
+f.max();          // 4.5
+
+// Arithmetic
+f.add(f).toArray();  // [2.0, 5.0, null, 9.0]
+f.pow(2).toArray();  // [1.0, 6.25, null, 20.25]
+ +

BooleanArray — Three-Valued Logic

+
import { BooleanArray } from "tsb";
+
+const b = BooleanArray.from([true, null, false]);
+b.any();          // true
+b.all();          // false
+b.sum();          // 1  (count of true elements)
+
+// Kleene logic:  false AND NA → false,  true AND NA → NA
+const x = BooleanArray.from([true,  false, null, true ]);
+const y = BooleanArray.from([true,  null,  true, false]);
+x.and(y).toArray(); // [true, false, null, false]
+x.or(y).toArray();  // [true, null,  true, true]   — note: false OR NA = NA
+x.not().toArray();  // [false, true,  null, false]
+ +

StringArray

+
import { StringArray } from "tsb";
+
+const s = StringArray.from(["  Hello  ", null, "world"]);
+
+s.strip().toArray();       // ["Hello", null, "world"]
+s.upper().toArray();       // ["  HELLO  ", null, "WORLD"]
+s.lower().toArray();       // ["  hello  ", null, "world"]
+s.replace("o", "0").toArray(); // ["  Hell0  ", null, "w0rld"]
+
+// Pattern matching → BooleanArray
+s.strip().contains("Hello").toArray();   // [true, null, false]
+s.strip().startswith("H").toArray();     // [true, null, false]
+s.strip().endswith("d").toArray();       // [false, null, true]
+
+// Lengths → IntegerArray
+s.strip().len().toArray();   // [5, null, 5]
+
+// Concatenation
+const a = StringArray.from(["foo", "bar"]);
+const b = StringArray.from(["baz", "qux"]);
+a.cat("-", b).toArray(); // ["foo-baz", "bar-qux"]
+ +

DatetimeArray

+
import { DatetimeArray, Timestamp } from "tsb";
+
+const dts = DatetimeArray.from([
+  "2024-01-15T10:30:00Z",
+  null,
+  "2024-06-21T00:00:00Z",
+]);
+dts.dtype;   // "datetime64[ns]"
+dts.year;    // [2024, null, 2024]
+dts.month;   // [1, null, 6]
+dts.day;     // [15, null, 21]
+dts.hour;    // [10, null, 0]
+
+// Min / max
+dts.min(); // Timestamp("2024-01-15T10:30:00Z")
+dts.max(); // Timestamp("2024-06-21T00:00:00Z")
+
+// Fill NA
+const fill = new Timestamp("2000-01-01");
+dts.fillna(fill).toArray();  // no nulls
+
+// Millisecond timestamps
+dts.asMs();  // [number, null, number]
+ +

TimedeltaArray

+
import { TimedeltaArray, Timedelta } from "tsb";
+
+const tds = TimedeltaArray.from([
+  Timedelta.fromComponents({ days: 1 }),
+  null,
+  86_400_000 * 2,         // 2 days in ms
+  "P3DT6H",               // ISO 8601 duration
+]);
+tds.dtype;     // "timedelta64[ns]"
+tds.days;      // [1, null, 2, 3]
+tds.hours;     // [0, null, 0, 6]
+tds.totalSeconds; // [86400, null, 172800, 280800]
+
+// Arithmetic
+const extra = Timedelta.fromComponents({ hours: 12 });
+tds.add(extra).days; // [1, null, 2, 3] (hours += 12)
+tds.mul(2).totalDays; // [2, null, 4, 6.5]
+
+// Reductions
+tds.sum()?.totalDays;  // 6.25  (1 + 2 + 3.25)
+tds.min();             // Timedelta(1 day)
+tds.max();             // Timedelta(3 days 6 hours)
+ +

Shared API (all array types)

+
// Every array type exposes the same base interface:
+
+a.size;               // number of elements (including NA)
+a.dtype;              // dtype string
+a.at(i);              // element at index i, or null (supports negative)
+a.isna();             // boolean[] — true where NA
+a.notna();            // boolean[] — true where not NA
+a.hasNa();            // boolean — true if any NA
+a.toArray();          // (T | null)[] — plain JS array with nulls
+a.dropna();           // T[] — non-NA values only
+a.fillna(value);      // new array with NA replaced by value
+[...a];               // iterable over (T | null) elements
+ +
+ 💡 pandas.array() analogue
+ tsb also exports pdArray(values, dtype) — a universal factory that returns a + PandasArray. The typed arrays here provide more specific operations (arithmetic, + string methods, etc.) and should be preferred when the element type is known. +
+ +

Design Notes

+

+ All nullable arrays store a parallel _mask: boolean[] where true + means NA. The data buffer _data: T[] always has a sentinel value at masked + positions (typically 0, false, or "") — these values are never + exposed through the public API. +

+

+ Integer arithmetic truncates toward zero. Float32 values are rounded with + Math.fround. Integer arrays validate bounds on construction. All operations that + return a new array preserve the dtype of the input unless astype() is called. +

+ + diff --git a/playground/case_when.html b/playground/case_when.html new file mode 100644 index 00000000..46e4fe92 --- /dev/null +++ b/playground/case_when.html @@ -0,0 +1,434 @@ + + + + + + tsb β€” case_when + + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap +

case_when

+

Conditional value selection using CASE WHEN semantics — mirrors pandas.Series.case_when() (pandas 2.2+).

+ +
+

1 — Basic grade classification

+

caseWhen(series, caselist) applies an ordered list of [condition, replacement] pairs. The first matching condition determines the output; if no condition matches the original value is kept.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

2 — Using boolean Series as conditions

+

Conditions can be boolean Series objects (e.g. from comparison operations).

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

3 — Using predicate functions

+

Conditions can be predicate functions (value, index) => boolean.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

4 — Series as replacement values

+

Replacements can be Series objects — the matching positional value is used.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

5 — Unmatched rows keep original values

+

Any row not matched by any condition retains its original value — there is no implicit "else" replacement.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

6 — First matching condition wins

+

When multiple conditions match the same row, the first one in caselist takes effect — just like CASE WHEN … THEN … WHEN … THEN … END in SQL.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

7 — Positional index in predicate

+

Predicate functions receive both the value and its positional index as the second argument.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

8 — String Series classification

+

caseWhen works on any Series type — numbers, strings, booleans, or mixed.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ +
+

9 — Comparison with where / mask

+

caseWhen generalises whereSeries to multiple branches. Use whereSeries for a single condition; use caseWhen for multi-branch logic.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + + + + + diff --git a/playground/feather.html b/playground/feather.html new file mode 100644 index 00000000..5fa2aeb4 --- /dev/null +++ b/playground/feather.html @@ -0,0 +1,357 @@ + + + + + + tsb β€” readFeather & toFeather + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap + +

🪶 Apache Arrow Feather v2 I/O

+

+ readFeather(data, options?) and toFeather(df, options?) + implement a pure-TypeScript Apache Arrow IPC (Feather v2) reader and writer with no + native dependencies. The format uses FlatBuffers for metadata and stores column data + contiguously with 8-byte alignment. +

+ +
+ Supported column types (read & write): Int8/16/32/64, + UInt8/16/32/64, Float32/64, Bool, + Utf8. + Null / nullable columns fully supported via Arrow validity bitmaps. + Equivalent to pandas.read_feather() / DataFrame.to_feather(). +
+ + +
+

1 · Basic read & write

+

Serialize a DataFrame to an Arrow IPC buffer with + toFeather() and read it back with readFeather(). + The buffer starts and ends with the ARROW1 magic bytes.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

2 · Column types — int, float, boolean, string

+

All major column types round-trip correctly. Integers are stored as + Int64, floats as Float64, booleans are bit-packed, and strings use + the Arrow Utf8 layout (int32 offsets + UTF-8 byte data).

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

3 · Null values — Arrow validity bitmaps

+

Columns with nulls have a validity bitmap prepended (1 bit per row, LSB-first). + Columns with no nulls omit the bitmap (zero-length validity buffer) to save space.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

4 · usecols — selective column reads

+

Use usecols to read only a subset of columns. + Buffer tracking skips over the buffers for unselected columns, + so no extra allocation is needed.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

5 · indexCol — row index from a column

+

Promote any column to the DataFrame's row index via indexCol. + Use writeIndex: true in toFeather() to persist the + index as __index_level_0__.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

6 · Unicode strings

+

Utf8 columns store int32 offsets plus contiguous UTF-8 byte data. Any Unicode string — + including emoji, CJK characters, and accented letters — round-trips exactly.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + + + + + diff --git a/playground/flags.html b/playground/flags.html new file mode 100644 index 00000000..18c8cbf6 --- /dev/null +++ b/playground/flags.html @@ -0,0 +1,300 @@ + + + + + + tsb β€” Flags: metadata for DataFrame and Series + + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap +

Flags: metadata for DataFrame and Series

+

+ Mirrors + pandas.DataFrame.flags — controls duplicate-label behaviour. +

+ + +
+

1 · Default flags

+

+ Every DataFrame and Series exposes a + flags getter returning a Flags object. + By default, allowsDuplicateLabels is true. +

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

2 · Setting flags

+

+ Mutate allowsDuplicateLabels directly on the + Flags object. The change is shared across all + Flags wrappers for the same underlying object. +

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

3 · DuplicateLabelError

+

+ Setting allowsDuplicateLabels = false on an object with + duplicate index labels immediately throws a + DuplicateLabelError. +

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

4 · copy() and raiseOnDuplicates()

+

+ Flags.copy() returns a new wrapper sharing the same state. + raiseOnDuplicates() validates only when + allowsDuplicateLabels is false. +

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + + + + + diff --git a/playground/fwf.html b/playground/fwf.html new file mode 100644 index 00000000..8435429c --- /dev/null +++ b/playground/fwf.html @@ -0,0 +1,504 @@ + + + + + + tsb β€” readFwf + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

πŸ“ readFwf β€” Interactive Playground

+

+ Parse fixed-width formatted text into a + DataFrame with readFwf(). + Mirrors pandas + read_fwf() — column boundaries are inferred from whitespace patterns + automatically, or provided explicitly via colspecs / widths.
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Auto column-width inference

+

When colspecs is omitted (default "infer"), + readFwf() scans the data rows and identifies separator positions — + character columns where every row contains a space. This mirrors + pandas.read_fwf(colspecs='infer').

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

2 · Explicit colspecs

+

Provide colspecs — an array of [start, end) character + index pairs — for precise control over column boundaries. Useful when separator + positions vary between rows.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

3 · Column widths

+

Alternatively, pass widths — an array of integers — to define + consecutive column widths. This produces [0, w0), [w0, w0+w1), … + colspecs internally.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

4 · Missing values & dtype forcing

+

Standard NA strings (NA, NaN, null, …) are + recognised automatically. Add custom NA strings with naValues. + Force a column's dtype with the dtype option.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

5 · Index column, row limits & skip rows

+

Promote a column to the row index with indexCol. + Limit rows with nRows and skip leading data rows with + skipRows.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

6 · Real-world: Census-style population table

+

Fixed-width format is common in government datasets, legacy mainframe exports, + and statistical software output. Here is a Census-style table.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

API Reference

+

Parse a fixed-width formatted text string into a DataFrame. + Equivalent to pandas.read_fwf().

+
readFwf(text: string, options?: ReadFwfOptions): DataFrame
+
+type ColSpec = readonly [number, number];   // [start, end) character indices
+
+interface ReadFwfOptions {
+  colspecs?:   readonly ColSpec[] | "infer"; // column boundaries (default: "infer")
+  widths?:     readonly number[];            // column widths (alternative to colspecs)
+  inferNrows?: number;                       // rows to sample for inference (default: 100)
+  header?:     number | null;               // header row index (default: 0)
+  names?:      readonly string[];           // explicit column names
+  indexCol?:   string | number | null;      // column to use as row index
+  dtype?:      Record<string, DtypeName>; // force dtype for named columns
+  naValues?:   readonly string[];           // extra NA string values
+  skipRows?:   number;                      // data rows to skip after header
+  nRows?:      number;                      // maximum data rows to read
+}
+
+ + + + + diff --git a/playground/hdf.html b/playground/hdf.html new file mode 100644 index 00000000..e6a3df08 --- /dev/null +++ b/playground/hdf.html @@ -0,0 +1,400 @@ + + + + + + tsb β€” readHdf & toHdf + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap + +

🗂️ HDF5 I/O

+

+ readHdf(data, options?) and toHdf(df, options?) + implement a pure-TypeScript HDF5 v0 Superblock reader and writer with no + native dependencies. Each file encodes a single DataFrame under a + configurable HDF5 group key (default "df"). The format is compatible + with pandas.read_hdf() / DataFrame.to_hdf(). +

+ +
+ Supported column types: Float64/Float32, + Int8/16/32/64, UInt8/16/32/64, + Bool (stored as UInt8), + String (fixed-length null-padded UTF-8). + Max 120 columns per DataFrame. One DataFrame per file (single HDF5 group key). +
+ + +
+

1 · Basic read & write

+

Serialize a DataFrame to an HDF5 binary buffer with + toHdf() and read it back with readHdf(). + The buffer begins with the standard HDF5 magic bytes + 0x89 HDF\r\n\x1a\n.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

2 · Column types — int, float, boolean, string

+

HDF5 stores numeric types as contiguous fixed-width binary arrays. + Booleans are stored as UInt8 (0 or 1). + Strings are fixed-length null-padded UTF-8 — the element size is the + byte length of the longest string in the column.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

3 · Custom HDF5 group key

+

The HDF5 group key specifies where within the file the DataFrame is stored. + The default is "df". A leading / is stripped + automatically (both in write and read).

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

4 · usecols — selective column reads

+

Pass usecols to read only a subset of columns from the file. + Unselected columns are skipped during dataset parsing.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

5 · writeIndex & indexCol — persisting the row index

+

Use writeIndex: true to store the DataFrame's row index as an + extra column named __index__. When reading back, pass + indexCol: "__index__" to restore it as the row index.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

6 · Unicode strings

+

Strings are stored as fixed-length null-padded UTF-8 arrays. The element + size is the byte length of the longest encoded string. Any Unicode string — + including emoji, CJK, and accented characters — round-trips exactly.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

7 · Special float values — NaN, Infinity

+

IEEE 754 special values round-trip correctly since the data is stored + as raw binary float64 without any encoding layer.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + + + + + diff --git a/playground/holiday.html b/playground/holiday.html new file mode 100644 index 00000000..4d9e3561 --- /dev/null +++ b/playground/holiday.html @@ -0,0 +1,505 @@ + + + + + + tsb β€” Holiday Calendars (pandas.tseries.holiday) + + + +
+ ← Back to playground + +
+

🗓️ Holiday Calendars

+

+ New + pandas.tseries.holiday + Holiday calendars, observance rules, and US Federal holidays — all from scratch. +

+
+ + +

1. US Federal Holiday Calendar

+
+

Query year range:

+
+ + + +
+

+    
+ + +

2. Custom Holiday Calendar

+
+

+ Build a calendar from arbitrary holiday rules using the + Holiday class and observance functions. +

+
+
+

Code

+ +
+
+

Output

+
(click Run)
+
+
+ +
+ + +

3. Observance Functions

+
+

See how observance functions shift weekend holidays:

+
+
+ + +

4. Floating Holidays with Weekday Offsets

+
+

+ MO(n), TH(n) etc. find the n-th occurrence + of a weekday on/after the base date — powering "last Monday of May" rules. +

+
+
+

Code

+ +
+
+

Output

+
(click Run)
+
+
+ +
+ + +

5. Calendar Registry

+
+
+
+

Code

+ +
+
+

Output

+
(click Run)
+
+
+ +
+
+ + + + diff --git a/playground/index.html b/playground/index.html index e5b0bf00..4d9deb46 100644 --- a/playground/index.html +++ b/playground/index.html @@ -235,6 +235,11 @@

Wide-to-long reshape. Unpivot columns into variable/value pairs with id_vars, value_vars, var_name, value_name.

βœ… Complete
+
+

↕ lreshape

+

Wide-to-long reshape with named column groups. Stack multiple wide columns into long columns with explicit grouping, dropna support.

+
βœ… Complete
+

πŸ”„ pivot & pivotTable

Reshape with aggregation. pivot() for unique reshaping; pivotTable() for aggregation (mean/sum/count/min/max/first/last) with fill_value and dropna support.

@@ -330,6 +335,11 @@

Attach arbitrary key→value metadata to any Series or DataFrame via a WeakMap registry. Provides getAttrs, setAttrs, updateAttrs, copyAttrs, withAttrs, mergeAttrs, clearAttrs, getAttr, setAttr, deleteAttr, attrsCount, attrsKeys. Mirrors pandas.DataFrame.attrs / pandas.Series.attrs.

βœ… Complete

+
+

🚩 flags — Metadata Flags

+

Metadata flags for DataFrame and Series. The flags getter returns a Flags object with allowsDuplicateLabels property. Setting allowsDuplicateLabels = false on an object with duplicate index labels raises DuplicateLabelError. Mirrors pandas.DataFrame.flags / pandas.core.flags.Flags.

+
βœ… Complete
+

πŸ”€ string_ops β€” Standalone String Ops

Module-level string utilities: strNormalize (Unicode NFC/NFD/NFKC/NFKD), strGetDummies (one-hot DataFrame), strExtractAll (all regex matches), strRemovePrefix, strRemoveSuffix, strTranslate (char-level substitution), strCharWidth (CJK-aware display width), strByteLength. Works on Series, arrays, or scalars.

@@ -501,6 +511,61 @@

βœ… Complete

+
+

📄 readXml / toXml — pd.read_xml() / DataFrame.to_xml()

+

readXml(text, opts?) / toXml(df, opts?) — parse XML into DataFrames and serialize back. rowTag auto-detection, attributes, CDATA, entities, namespaces, usecols, nrows, indexCol. Mirrors pandas.read_xml() / DataFrame.to_xml().

+
βœ… Complete
+
+
+

📋 readTable — pd.read_table()

+

readTable(text, opts?) — parse delimiter-separated text into a DataFrame. Defaults to tab separator; all ReadCsvOptions forwarded. Mirrors pandas.read_table().

+
βœ… Complete
+
+
+

🗄️ SQL I/O — pd.read_sql() / DataFrame.to_sql()

+

readSql / readSqlQuery / readSqlTable / toSql — adapter-based SQL I/O. Bring your own DB driver; zero runtime dependencies. Mirrors pandas.read_sql(), read_sql_query(), read_sql_table(), DataFrame.to_sql().

+
βœ… Complete
+
+
+

📊 readStata & toStata — pd.read_stata() / DataFrame.to_stata()

+

readStata / toStata — Stata DTA binary file I/O. Supports reading v114/115 (old binary) and v117/118/119 (new XML-tagged) formats; writes v118. Missing values, string columns, value labels (convertCategoricals). Mirrors pandas.read_stata(), DataFrame.to_stata().

+
βœ… Complete
+
+
+

📦 readParquet & toParquet — pd.read_parquet() / DataFrame.to_parquet()

+

readParquet / toParquet — Apache Parquet binary file I/O. Pure-TypeScript Thrift compact protocol, PLAIN encoding, INT32/INT64/DOUBLE/BOOLEAN/BYTE_ARRAY types, optional columns, usecols/nRows/indexCol/writeIndex. Mirrors pandas.read_parquet(), DataFrame.to_parquet().

+
βœ… Complete
+
+
+

πŸ“ readFwf β€” pd.read_fwf()

+

readFwf(text, opts?) — read fixed-width formatted text into a DataFrame. Auto-infers column boundaries from whitespace patterns; supports explicit colspecs / widths, header, names, indexCol, NA handling, dtype forcing, skipRows, nRows. Mirrors pandas.read_fwf().

+
βœ… Complete
+
+
+

πŸ”€ case_when β€” pd.Series.case_when()

+

caseWhen(series, caselist) — conditional value selection using ordered CASE WHEN semantics. Mirrors pandas.Series.case_when() (pandas 2.2+).

+
βœ… Complete
+
+
+

🗂️ readHdf & toHdf — pd.read_hdf() / DataFrame.to_hdf()

+

readHdf / toHdf — HDF5 v0 Superblock binary file I/O. Pure-TypeScript, no native deps. Float64/32, Int/UInt 8–64, Bool, fixed-length UTF-8 strings. usecols, indexCol, writeIndex, custom key. Mirrors pandas.read_hdf(), DataFrame.to_hdf().

+
βœ… Complete
+
+
+

🔢 pd.arrays — Nullable Typed Extension Arrays

+

Nullable typed arrays: IntegerArray, FloatingArray, BooleanArray, StringArray, DatetimeArray, TimedeltaArray. Three-valued logic, NA masking, element-wise arithmetic, string ops. Mirrors pandas.arrays.

+
βœ… Complete
+
+
+

🗓️ Holiday Calendars — pd.tseries.holiday

+

Holiday calendar system: Holiday rules (fixed & floating), AbstractHolidayCalendar, USFederalHolidayCalendar (11 US federal holidays), observance helpers (nearestWorkday, sundayToMonday, …), and weekday offsets (MO, TH, …). Mirrors pandas.tseries.holiday.

+
βœ… Complete
+
+
+

🕳️ SparseArray & SparseDtype — pd.arrays.SparseArray

+

Memory-efficient sparse storage for arrays with many repeated (fill) values. SparseArray stores only non-fill values and their positions. Properties: sp_values, sp_index, density, npoints. Aggregations: sum, mean, max, min, std. Mirrors pandas.arrays.SparseArray and pandas.SparseDtype.

+
βœ… Complete
+
diff --git a/playground/lreshape.html b/playground/lreshape.html new file mode 100644 index 00000000..3f434a11 --- /dev/null +++ b/playground/lreshape.html @@ -0,0 +1,327 @@ + + + + + + tsb β€” lreshape + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

↕ lreshape — Interactive Playground

+

Reshape wide-format data to long format using named column groups — + mirrors pandas.lreshape().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic lreshape

+

Stack two wide columns (v1, v2) into a single long + column v, repeating the id column for each block.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

2 · Multiple groups

+

Reshape with multiple output columns simultaneously. Each output column is + fed from a separate list of input columns.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

3 · dropna option

+

By default rows where any value column is null/NaN + are dropped. Pass dropna: false to keep them.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

4 · Real-world: survey scores

+

Stack multiple rounds of survey scores into a long-format table.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

API Reference

+

Reshape wide-format data to long format by explicitly naming which input + columns map to each output column.

+
lreshape(
+  data: DataFrame,
+  groups: Record<string, string[]>,  // { outputCol: [inputCol1, inputCol2, ...] }
+  options?: {
+    dropna?: boolean,  // drop rows with null/NaN values (default: true)
+  }
+): DataFrame
+

All input columns not mentioned in groups + become identity (id) columns and are repeated for each block. All group lists must + have the same length k; the result has nRows × k rows + (before applying dropna).

+
+ + + + + diff --git a/playground/parquet.html b/playground/parquet.html new file mode 100644 index 00000000..31f1b09b --- /dev/null +++ b/playground/parquet.html @@ -0,0 +1,361 @@ + + + + + + tsb β€” readParquet & toParquet + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap + +

📦 Apache Parquet I/O

+

+ readParquet(data, options?) and toParquet(df, options?) + implement a pure-TypeScript Apache Parquet reader and writer with no native dependencies. + The implementation uses the Thrift compact protocol for metadata and PLAIN encoding for + column data pages. +

+ +
+ Supported physical types: INT32, INT64, + DOUBLE, BOOLEAN, BYTE_ARRAY (UTF-8 strings). + Compression: UNCOMPRESSED. Flat tables only (no nested or repeated fields). + Equivalent to pandas.read_parquet() / DataFrame.to_parquet(). +
+ + +
+

1 · Basic read & write

+

Serialize a DataFrame to a binary Parquet buffer with + toParquet() and read it back with readParquet(). + The buffer starts and ends with the PAR1 magic bytes.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

2 · Column types — int, float, boolean, string

+

All major column types round-trip correctly. Integers use INT32 or INT64, + floats use DOUBLE, booleans are bit-packed (1 byte per 8 values), + and strings are BYTE_ARRAY (UTF-8).

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

3 · usecols & nRows — selective reads

+

Use usecols to read a subset of columns and nRows + to limit the number of rows. Both options reduce memory usage and speed up parsing.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

4 · indexCol — row index from a column

+

Promote any column to the DataFrame's row index by passing indexCol + to readParquet(). Use writeIndex: true in toParquet() + to persist the index as __index_level_0__.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

5 · Unicode strings

+

BYTE_ARRAY columns are length-prefixed UTF-8. Any Unicode string — including + emoji, CJK characters, and accented letters — round-trips exactly.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

6 · Many columns — stress test

+

Each column is stored as a separate column chunk in the row group. + There is no limit on column count.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + + + + + + diff --git a/playground/read_table.html b/playground/read_table.html new file mode 100644 index 00000000..550913b8 --- /dev/null +++ b/playground/read_table.html @@ -0,0 +1,367 @@ + + + + + + tsb β€” readTable + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📋 readTable — Interactive Playground

+

+ Parse delimiter-separated text into a DataFrame + with readTable(). Mirrors + pandas + read_table() — identical to readCsv() but defaults + to a tab (\t) separator.
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic tab-separated file

+

By default readTable() splits on tabs, infers column dtypes, + and returns a DataFrame.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Custom separator

+

Pass sep to use any delimiter — pipe, semicolon, or + multi-character strings.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Handling missing values

+

readTable() recognises common NA strings (NA, + N/A, null, …) and converts them to + NaN. Extend the list with naValues.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Index column, row limits & skip rows

+

Use indexCol to promote a column to the row index. + nRows caps the number of data rows read; skipRows + skips rows after the header.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

Parse a delimiter-separated text string into a DataFrame. + Defaults to tab (\t) unlike readCsv which uses + a comma.

+
readTable(text: string, options?: ReadTableOptions): DataFrame
+
+interface ReadTableOptions {
+  sep?:      string;                     // separator (default: "\t")
+  header?:   number | null;              // header row index (default: 0)
+  indexCol?: string | number | null;     // column to use as row index
+  dtype?:    Record<string, DtypeName>; // force dtype for named columns
+  naValues?: readonly string[];          // extra NA string values
+  skipRows?: number;                     // data rows to skip after header
+  nRows?:    number;                     // maximum data rows to read
+}
+
+ + + + + diff --git a/playground/sas.html b/playground/sas.html new file mode 100644 index 00000000..760d3196 --- /dev/null +++ b/playground/sas.html @@ -0,0 +1,91 @@ + + + + + + tsb β€” readSas (SAS XPORT reader) + + + + + +

readSas — SAS XPORT reader

+

+ readSas(data) reads a SAS XPORT v5 (.xpt) file and returns a + DataFrame. SAS XPORT is a portable format widely used by the US FDA and CDC for + data submissions. +

+ +

Supported features

+
    +
  • SAS XPORT Version 5 (.xpt files)
  • +
  • Numeric variables (IBM 370 hex double-precision floating point)
  • +
  • Character variables (fixed-width ASCII strings)
  • +
  • Missing numeric values → null
  • +
  • Optional index column via options.index
  • +
+ +

Basic usage

+
import { readSas } from "tsb";
+import { readFileSync } from "node:fs";
+
+// Load from disk
+const buf = new Uint8Array(readFileSync("data.xpt").buffer);
+const df = readSas(buf);
+df.head();
+
+// With index column
+const df2 = readSas(buf, { index: "SUBJID" });
+
+ +

Options

+ + + + + + + + + + + + + + + + + +
Option | Type | Default | Description
`index` | `string | null` | `null` | Column to use as the DataFrame index. null = default integer index.
+ +

IBM 370 floating-point

+

+ SAS XPORT stores numeric values as IBM System/370 hexadecimal double-precision floating-point + numbers. This is different from IEEE 754 (which JavaScript and most modern systems + use). readSas automatically converts IBM 370 doubles to IEEE 754. +

+
// IBM 370 double format:
+// Byte 0: [sign (1 bit)][exponent (7 bits, excess-64, base-16)]
+// Bytes 1–7: [56-bit mantissa (hexadecimal fraction)]
+// value = (-1)^sign × 16^(exp−64) × mantissa / 2^56
+
+ +

Missing values

+

+ SAS encodes missing numeric values using a special first byte: 0x2e + ('.') for the standard missing value, and 0x41–0x5A + (A–Z) for special missings. readSas maps all of these to + null. +

+ +

Related

+ + + diff --git a/playground/sparse.html b/playground/sparse.html new file mode 100644 index 00000000..3de58b1b --- /dev/null +++ b/playground/sparse.html @@ -0,0 +1,448 @@ + + + + + + tsb β€” SparseArray & SparseDtype + + + +
← tsb playground
+
+

🕳️ SparseArray & SparseDtype

+

Memory-efficient storage for arrays where most values share a common fill value. Mirrors pandas.arrays.SparseArray and pandas.SparseDtype.

+ βœ… Complete + +

Overview

+

+ A SparseArray stores only the non-fill values and their positions. + When most elements share a common value — zeros in a sparse matrix, NaN in sensor data with + many gaps, or false in a boolean feature array — sparse storage dramatically reduces memory use. +

+

+ The fill_value is the implicit value for all positions not explicitly stored. + Common choices are 0 (numeric zero), NaN (missing values), or + false (boolean). By default tsb uses NaN (matching pandas behaviour). +

+ +
+ 💡 When to use SparseArray: when density < ~0.25 (fewer than 25% of values + are non-fill). Below that threshold, sparse storage saves memory and the bookkeeping overhead + is worth it. +
+ +

Quick Start

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+ +

Interactive Demo

+

Enter a comma-separated list of numbers and choose a fill value to see how SparseArray stores your data.

+ + + + + + + +
+ +

API Reference

+ +

SparseArray.fromDense(data, fill_value?, subtype?)

+

Create a SparseArray from a dense array. Values equal to fill_value are not stored.

+ +

SparseArray.fromSparse(length, indices, values, fill_value?, subtype?)

+

Create a SparseArray directly from COO (Coordinate) sparse components.

+ +

Properties

+ + + + + + + + + +
Property | Type | Description
length | number | Total logical length (including fill positions)
npoints | number | Number of explicitly stored (non-fill) values
density | number | Fraction stored: npoints / length (0–1)
fill_value | number | Implicit value for positions not stored
sp_values | number[] | Array of stored (non-fill) values
sp_index | number[] | Positions (0-based) of stored values
dtype | SparseDtype | Describes element type and fill value
+ +

Methods

+ + + + + + + + + + + + + + + +
Method | Description
at(i) | Value at index i (fill_value for fill positions)
toDense() | Convert to a regular number[] array
toCoo() | Return {indices, values} COO representation
fillna(value) | Replace NaN values; returns new SparseArray
withFillValue(v) | Change fill value; returns new SparseArray
slice(start, end?) | Slice to [start, end); returns new SparseArray
add(scalar) | Add a scalar to all values; returns new SparseArray
mul(scalar) | Multiply by a scalar; returns new SparseArray
sum() | Sum of all values (NaN-skipped)
mean() | Mean of all non-NaN values
max() | Maximum value (NaN-ignored)
min() | Minimum value (NaN-ignored)
std(ddof?) | Standard deviation (default ddof=1)
+ +

Use Cases

+ +

Sensor data with gaps

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+ +

Feature matrix (recommendation systems)

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+ +

Sparse boolean flags

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+ + + + + diff --git a/playground/sql.html b/playground/sql.html new file mode 100644 index 00000000..8c28d1f6 --- /dev/null +++ b/playground/sql.html @@ -0,0 +1,476 @@ + + + + + + tsb β€” SQL I/O + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

🗃️ SQL I/O — Interactive Playground

+

+ readSql, readSqlQuery, readSqlTable, and toSql + mirror pandas + read_sql() and + DataFrame.to_sql(). + Because tsb has zero runtime dependencies, you pass + a SqlConnection adapter for your database driver. + Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · readSqlQuery — run a SELECT statement

+

Pass a SQL string and a SqlConnection adapter. The result is a + DataFrame. An optional indexCol promotes a column to the row + index.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · readSqlTable — load an entire table

+

Pass a table name (not a SQL string). Use columns to select a subset, + or indexCol to set the row index.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · readSql — auto-detect query vs table name

+

readSql inspects the first argument: if it looks like a SQL statement + it calls readSqlQuery; otherwise it calls readSqlTable.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · toSql — write a DataFrame to a SQL table

+

Writes rows from a DataFrame into the database. Returns the number of + rows written. The ifExists option controls what happens when the table + already exists: "fail", "replace", or + "append".

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

All four functions accept a SqlConnection adapter — implement + query() plus optional listTables() and insert() + for your database driver.

+
interface SqlConnection {
+  query(sql: string, params?: readonly SqlValue[]): SqlResult;
+  listTables?(): string[];
+  insert?(table: string, rows: object[], columns: string[], ifExists: IfExistsOption): number;
+}
+
+readSqlQuery(sql: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+readSqlTable(table: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+readSql(sqlOrTable: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+toSql(df: DataFrame, name: string, con: SqlConnection, options?: ToSqlOptions): number
+
+interface ReadSqlOptions {
+  indexCol?: string | string[];
+  columns?:  string[];
+  params?:   readonly SqlValue[];
+  parseDates?: string[];
+}
+
+interface ToSqlOptions {
+  ifExists?: "fail" | "replace" | "append";  // default: "fail"
+  index?:    boolean;                          // include index column (default: true)
+  chunkSize?: number;
+}
+
+ + + + + diff --git a/playground/stata.html b/playground/stata.html new file mode 100644 index 00000000..18743f45 --- /dev/null +++ b/playground/stata.html @@ -0,0 +1,379 @@ + + + + + + tsb β€” readStata & toStata + + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📊 readStata & toStata — Interactive Playground

+

Read and write Stata DTA files from TypeScript. + toStata(df) serializes a DataFrame to a Stata DTA v118 binary buffer. + readStata(buf, options) parses the buffer back into a DataFrame. + Numeric missing values are represented as null. Mirrors + pandas.read_stata() and DataFrame.to_stata().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic round-trip — write and read back

+

Create a DataFrame, serialize it to a Stata DTA v118 binary buffer with + toStata(), then parse it back with readStata(). + All columns, values, and shape are preserved.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

2 · Missing values — null round-trip

+

Stata represents missing numeric values as special sentinel bit patterns. + readStata maps all missing sentinels to null. + toStata writes the standard Stata system-missing value for each type.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

3 · Options — dataLabel & variableLabels

+

Embed a dataset description with dataLabel and per-column annotations + with variableLabels. These metadata fields are stored in the DTA header + and are visible in Stata's describe command.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

4 · Options — usecols, nRows, indexCol

+

Restrict columns with usecols, limit rows with nRows, + and promote a column to the DataFrame index with indexCol.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

5 · Boolean columns

+

Boolean values are stored as Stata byte (int8) with + true → 1 and false → 0. Reading converts + them back to numbers; use .map() or comparison operators + to recover booleans if needed.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

6 · writeIndex — include the row index

+

Pass writeIndex: true to include the DataFrame's row index + as an extra _index column in the DTA file.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + + + diff --git a/playground/xml.html b/playground/xml.html new file mode 100644 index 00000000..23e2e96d --- /dev/null +++ b/playground/xml.html @@ -0,0 +1,462 @@ + + + + + + tsb β€” readXml & toXml + + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

📄 readXml & toXml — Interactive Playground

+

Parse XML text into a DataFrame with + auto-detection of row elements, attribute and child-element columns, entity decoding, + CDATA support, namespace stripping, and numeric coercion. Serialize any DataFrame + back to well-formed XML with full formatting control. Mirrors + pandas.read_xml() and pandas.DataFrame.to_xml().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Basic readXml — child-element rows

+

The most common XML layout: a root element containing repeating row elements, + each with child elements as columns. readXml auto-detects the row + tag and coerces numeric strings automatically.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

2 · Attribute rows

+

XML elements can carry data as attributes instead of (or in addition to) child + elements. Use attribs: true (the default) to include them.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

3 · usecols, nrows, indexCol

+

Restrict the columns returned with usecols, limit rows with + nrows, and promote a column to the index with indexCol.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

4 · naValues — custom NA strings

+

Built-in NA strings include "", "NA", "NaN", + "N/A", "null", "None", "nan". + Use naValues to add your own.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

5 · Entities & CDATA

+

Named entities (&amp;, &lt;, …), decimal/hex + character references (&#65;, &#x41;), and + CDATA sections (<![CDATA[…]]>) are all handled transparently.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

6 · toXml — child elements (default)

+

toXml(df) produces a well-formed XML document with an XML declaration, + a configurable root element, and one child element per row containing one sub-element + per column.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

7 · toXml — attribs mode

+

Set attribs: true to emit column values as XML attributes on each + row element instead of as child elements — produces more compact output.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

8 · toXml — namespaces & CDATA columns

+

Declare XML namespace prefixes on the root element with namespaces. + Wrap sensitive columns in CDATA sections with cdataCols to preserve + special characters literally.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

9 · Round-trip: toXml → readXml

+

Serializing a DataFrame to XML and reading it back should produce an identical + DataFrame (shape and values).

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + + + diff --git a/src/core/arrays/boolean_array.ts b/src/core/arrays/boolean_array.ts new file mode 100644 index 00000000..0ac8922a --- /dev/null +++ b/src/core/arrays/boolean_array.ts @@ -0,0 +1,233 @@ +/** + * BooleanArray β€” nullable boolean extension array. + * + * Mirrors `pandas.arrays.BooleanArray`. Stores boolean values with a separate + * mask for missing (NA) values, enabling three-valued logic (True / False / NA). + * + * @example + * ```ts + * import { arrays } from "tsb"; + * + * const a = arrays.BooleanArray.from([true, null, false]); + * a.dtype; // "boolean" + * a.at(1); // null + * a.any(); // true + * a.all(); // false + * a.fillna(false).toArray(); // [true, false, false] + * ``` + * + * @module + */ + +import { MaskedArray } from "./masked_array.ts"; + +// ─── BooleanArray ───────────────────────────────────────────────────────────── + +/** + * A nullable boolean array. + * + * Use {@link BooleanArray.from} to create instances. + */ +export class BooleanArray extends MaskedArray { + /** @internal */ + constructor(data: boolean[], mask: boolean[]) { + super(data, mask); + } + + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create a {@link BooleanArray} from a sequence of boolean (or null/undefined). 
+ * + * @example + * ```ts + * BooleanArray.from([true, false, null, true]); + * ``` + */ + static from(values: Iterable): BooleanArray { + const data: boolean[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(false); + mask.push(true); + } else { + data.push(Boolean(v)); + mask.push(false); + } + } + return new BooleanArray(data, mask); + } + + /** @internal */ + static _fromRaw(data: boolean[], mask: boolean[]): BooleanArray { + return new BooleanArray(data, mask); + } + + // ─── Dtype ──────────────────────────────────────────────────────────────── + + get dtype(): "boolean" { + return "boolean"; + } + + // ─── Reductions ─────────────────────────────────────────────────────────── + + /** + * Return `true` if any non-NA element is `true`. + * Returns `null` if all elements are NA and `skipna` is `false`. + */ + any(skipna = true): boolean | null { + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + if (this._data[i]) { + return true; + } + } + return false; + } + + /** + * Return `true` if all non-NA elements are `true`. + * Returns `null` if all elements are NA and `skipna` is `false`. + */ + all(skipna = true): boolean | null { + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + if (!this._data[i]) { + return false; + } + } + return true; + } + + /** Count of `true` (non-NA) elements. */ + sum(skipna = true): number | null { + let count = 0; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + if (this._data[i]) { + count++; + } + } + return count; + } + + // ─── Logical operations ─────────────────────────────────────────────────── + + /** + * Element-wise logical AND. 
+ * + * Follows Kleene three-valued logic: + * - `false AND NA` β†’ `false` + * - `true AND NA` β†’ `NA` + */ + and(other: BooleanArray): BooleanArray { + if (other.size !== this.size) { + throw new RangeError( + `BooleanArray: operand size mismatch (${this.size} vs ${other.size})`, + ); + } + const data: boolean[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + const am = this._mask[i] === true; + const bm = other._mask[i] === true; + const av = this._data[i] === true; + const bv = other._data[i] === true; + if (!(am || bm)) { + // Both known + data.push(av && bv); + mask.push(false); + } else if (!(am || av)) { + // a is false β†’ false AND anything = false + data.push(false); + mask.push(false); + } else if (!(bm || bv)) { + // b is false β†’ anything AND false = false + data.push(false); + mask.push(false); + } else { + // Result is NA + data.push(false); + mask.push(true); + } + } + return BooleanArray._fromRaw(data, mask); + } + + /** + * Element-wise logical OR. 
+ * + * Follows Kleene three-valued logic: + * - `true OR NA` β†’ `true` + * - `false OR NA` β†’ `NA` + */ + or(other: BooleanArray): BooleanArray { + if (other.size !== this.size) { + throw new RangeError( + `BooleanArray: operand size mismatch (${this.size} vs ${other.size})`, + ); + } + const data: boolean[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + const am = this._mask[i] === true; + const bm = other._mask[i] === true; + const av = this._data[i] === true; + const bv = other._data[i] === true; + if (!(am || bm)) { + // Both known + data.push(av || bv); + mask.push(false); + } else if (!am && av) { + // a is true β†’ true OR anything = true + data.push(true); + mask.push(false); + } else if (!bm && bv) { + // b is true β†’ anything OR true = true + data.push(true); + mask.push(false); + } else { + // Result is NA + data.push(false); + mask.push(true); + } + } + return BooleanArray._fromRaw(data, mask); + } + + /** + * Element-wise logical NOT. + * `NOT NA` β†’ `NA`; `NOT true` β†’ `false`; `NOT false` β†’ `true`. + */ + not(): BooleanArray { + const data = this._data.map((v, i) => (this._mask[i] ? false : !v)); + return BooleanArray._fromRaw(data, this._mask.slice()); + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** + * Return a new {@link BooleanArray} with NAs replaced by `value`. + */ + fillna(value: boolean): BooleanArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return BooleanArray._fromRaw(data, mask); + } +} diff --git a/src/core/arrays/datetime_array.ts b/src/core/arrays/datetime_array.ts new file mode 100644 index 00000000..15e29741 --- /dev/null +++ b/src/core/arrays/datetime_array.ts @@ -0,0 +1,280 @@ +/** + * DatetimeArray β€” extension array of nullable {@link Timestamp} values. + * + * Mirrors `pandas.arrays.DatetimeArray`. 
Stores an array of Timestamps (with + * optional timezone) with a separate boolean mask for missing (NA) values. + * + * @example + * ```ts + * import { arrays } from "tsb"; + * import { Timestamp } from "tsb"; + * + * const a = arrays.DatetimeArray.from([ + * new Timestamp("2024-01-01"), + * null, + * new Timestamp("2024-03-15"), + * ]); + * a.dtype; // "datetime64[ns]" + * a.at(1); // null + * a.year; // [2024, null, 2024] + * a.month; // [1, null, 3] + * ``` + * + * @module + */ + +import { Timestamp } from "../timestamp.ts"; +import type { TimestampOptions } from "../timestamp.ts"; + +// ─── DatetimeArray ──────────────────────────────────────────────────────────── + +/** + * A nullable array of {@link Timestamp} values. + * + * Use {@link DatetimeArray.from} to create instances. + */ +export class DatetimeArray { + private readonly _data: Timestamp[]; + private readonly _mask: boolean[]; + private readonly _tz: string | null; + + /** @internal */ + constructor(data: Timestamp[], mask: boolean[], tz: string | null = null) { + if (data.length !== mask.length) { + throw new RangeError( + `DatetimeArray: data length (${data.length}) !== mask length (${mask.length})`, + ); + } + this._data = data; + this._mask = mask; + this._tz = tz; + } + + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create a {@link DatetimeArray} from a sequence of Timestamps, strings, or numbers. + * + * @param values - Each element may be a {@link Timestamp}, an ISO string + * (e.g. `"2024-01-01"`), a millisecond-since-epoch number, a JS `Date`, + * `null`, or `undefined`. + * @param options - Options forwarded to the {@link Timestamp} constructor for + * non-Timestamp inputs (e.g. `{ unit: "s", tz: "UTC" }`). 
+ * + * @example + * ```ts + * DatetimeArray.from(["2024-01-01", null, "2024-03-15"]); + * DatetimeArray.from([1704067200000, null], { unit: "ms" }); + * ``` + */ + static from( + values: Iterable, + options?: Readonly, + ): DatetimeArray { + const data: Timestamp[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(new Timestamp(0)); + mask.push(true); + } else if (v instanceof Timestamp) { + data.push(v); + mask.push(false); + } else { + data.push(new Timestamp(v as string | number | Date, options)); + mask.push(false); + } + } + const tz = options?.tz ?? null; + return new DatetimeArray(data, mask, typeof tz === "string" ? tz : null); + } + + /** @internal */ + static _fromRaw( + data: Timestamp[], + mask: boolean[], + tz: string | null = null, + ): DatetimeArray { + return new DatetimeArray(data, mask, tz); + } + + // ─── Core accessors ──────────────────────────────────────────────────────── + + /** Number of elements (including NAs). */ + get size(): number { + return this._data.length; + } + + /** Dtype string β€” mirrors pandas `datetime64[ns]` or `datetime64[ns, tz]`. */ + get dtype(): string { + return this._tz ? `datetime64[ns, ${this._tz}]` : "datetime64[ns]"; + } + + /** IANA timezone, or `null` for timezone-naive arrays. */ + get tz(): string | null { + return this._tz; + } + + /** + * Return the element at index `i`, or `null` if masked. + * Supports negative indexing. + */ + at(i: number): Timestamp | null { + const idx = i < 0 ? this._data.length + i : i; + if (idx < 0 || idx >= this._data.length) { + return null; + } + if (this._mask[idx]) { + return null; + } + return this._data[idx] ?? null; + } + + // ─── NA ──────────────────────────────────────────────────────────────────── + + /** Boolean array where `true` = NA. */ + isna(): boolean[] { + return this._mask.slice(); + } + + /** Boolean array where `true` = not NA. 
*/ + notna(): boolean[] { + return this._mask.map((m) => !m); + } + + // ─── Component accessors ────────────────────────────────────────────────── + + /** Numeric year for each element (NA β†’ null). */ + get year(): (number | null)[] { + return this._extractComponent((ts) => ts.year); + } + + /** Month (1–12) for each element (NA β†’ null). */ + get month(): (number | null)[] { + return this._extractComponent((ts) => ts.month); + } + + /** Day (1–31) for each element (NA β†’ null). */ + get day(): (number | null)[] { + return this._extractComponent((ts) => ts.day); + } + + /** Hour (0–23) for each element (NA β†’ null). */ + get hour(): (number | null)[] { + return this._extractComponent((ts) => ts.hour); + } + + /** Minute (0–59) for each element (NA β†’ null). */ + get minute(): (number | null)[] { + return this._extractComponent((ts) => ts.minute); + } + + /** Second (0–59) for each element (NA β†’ null). */ + get second(): (number | null)[] { + return this._extractComponent((ts) => ts.second); + } + + /** Millisecond (0–999) for each element (NA β†’ null). */ + get millisecond(): (number | null)[] { + return this._extractComponent((ts) => ts.millisecond); + } + + /** Day of week (0=Monday … 6=Sunday) for each element (NA β†’ null). */ + get dayofweek(): (number | null)[] { + return this._extractComponent((ts) => ts.dayofweek); + } + + /** Day of year (1–366) for each element (NA β†’ null). */ + get dayofyear(): (number | null)[] { + return this._extractComponent((ts) => ts.dayofyear); + } + + /** Quarter (1–4) for each element (NA β†’ null). */ + get quarter(): (number | null)[] { + return this._extractComponent((ts) => ts.quarter); + } + + // ─── Conversion ──────────────────────────────────────────────────────────── + + /** Return an array of {@link Timestamp} or `null` for NA positions. */ + toArray(): (Timestamp | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : v)); + } + + /** Milliseconds since epoch for each element (NA β†’ null). 
*/ + asMs(): (number | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : v._utcMs)); + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** Return a new DatetimeArray with NAs replaced by `value`. */ + fillna(value: Timestamp): DatetimeArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return DatetimeArray._fromRaw(data, mask, this._tz); + } + + // ─── Min / Max ───────────────────────────────────────────────────────────── + + /** Earliest (minimum) non-NA Timestamp, or `null` if all are NA. */ + min(): Timestamp | null { + let result: Timestamp | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timestamp; + if (result === null || v._utcMs < result._utcMs) { + result = v; + } + } + return result; + } + + /** Latest (maximum) non-NA Timestamp, or `null` if all are NA. */ + max(): Timestamp | null { + let result: Timestamp | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timestamp; + if (result === null || v._utcMs > result._utcMs) { + result = v; + } + } + return result; + } + + // ─── Iteration ───────────────────────────────────────────────────────────── + + [Symbol.iterator](): Iterator { + let i = 0; + const data = this._data; + const mask = this._mask; + return { + next() { + if (i >= data.length) { + return { value: null, done: true }; + } + const value = mask[i] ? null : (data[i] ?? null); + i++; + return { value, done: false }; + }, + }; + } + + // ─── String representation ───────────────────────────────────────────────── + + toString(): string { + const items = this.toArray().map((v) => (v === null ? 
/**
 * Nullable float dtype names.
 */
export type FloatingDtypeName = "Float32" | "Float64";

// ─── FloatingArray ────────────────────────────────────────────────────────────

/**
 * A nullable floating-point array.
 *
 * Values are kept in a plain `number[]` next to a boolean mask in which `true`
 * marks an NA slot. For `"Float32"` every value this class stores is rounded
 * to single precision with `Math.fround`, so the dtype label matches the data.
 *
 * Use {@link FloatingArray.from} to create instances.
 */
export class FloatingArray extends MaskedArray<number> {
  private readonly _dtype: FloatingDtypeName;

  /** @internal */
  constructor(data: number[], mask: boolean[], dtype: FloatingDtypeName) {
    super(data, mask);
    this._dtype = dtype;
  }

  // ─── Factory ───────────────────────────────────────────────────────────────

  /**
   * Create a {@link FloatingArray} from a sequence of values.
   *
   * @param values - Source values. `null`, `undefined`, and `NaN` become NA.
   * @param dtype - Target dtype. Defaults to `"Float64"`.
   * @throws TypeError if `dtype` is not a known floating dtype name.
   *
   * @example
   * ```ts
   * FloatingArray.from([1.1, 2.2, null, 4.4]);        // Float64
   * FloatingArray.from([1.1, NaN, 3.3], "Float32");   // Float32
   * ```
   */
  static from(
    values: Iterable<number | null | undefined>,
    dtype: FloatingDtypeName = "Float64",
  ): FloatingArray {
    if (dtype !== "Float32" && dtype !== "Float64") {
      throw new TypeError(`FloatingArray: unknown dtype "${dtype}"`);
    }
    const data: number[] = [];
    const mask: boolean[] = [];
    for (const v of values) {
      const missing = v === null || v === undefined || Number.isNaN(v);
      // NA slots hold a 0 placeholder that is never read back.
      data.push(missing ? 0 : FloatingArray._coerce(v, dtype));
      mask.push(missing);
    }
    return new FloatingArray(data, mask, dtype);
  }

  /**
   * Wrap existing buffers without copying or validating them.
   *
   * @internal
   */
  static _fromRaw(
    data: number[],
    mask: boolean[],
    dtype: FloatingDtypeName,
  ): FloatingArray {
    return new FloatingArray(data, mask, dtype);
  }

  // ─── Dtype ────────────────────────────────────────────────────────────────

  get dtype(): FloatingDtypeName {
    return this._dtype;
  }

  // ─── Operations ───────────────────────────────────────────────────────────

  /**
   * Sum of the non-NA elements.
   *
   * @returns The total (0 for an empty or all-NA array), or `null` when
   * `skipna` is `false` and at least one element is NA.
   */
  sum(skipna = true): number | null {
    const values = this._observed(skipna);
    if (values === null) {
      return null;
    }
    let total = 0;
    for (const v of values) {
      total += v;
    }
    return total;
  }

  /**
   * Mean of the non-NA elements.
   *
   * @returns `null` when there is nothing to average, or when `skipna` is
   * `false` and at least one element is NA.
   */
  mean(skipna = true): number | null {
    const values = this._observed(skipna);
    if (values === null || values.length === 0) {
      return null;
    }
    let total = 0;
    for (const v of values) {
      total += v;
    }
    return total / values.length;
  }

  /** Minimum non-NA element, or `null` if none (or NA present with `skipna = false`). */
  min(skipna = true): number | null {
    return this._extreme(skipna, (candidate, best) => candidate < best);
  }

  /** Maximum non-NA element, or `null` if none (or NA present with `skipna = false`). */
  max(skipna = true): number | null {
    return this._extreme(skipna, (candidate, best) => candidate > best);
  }

  /** Number of non-NA elements. */
  count(): number {
    let present = 0;
    for (const isNa of this._mask) {
      if (!isNa) {
        present++;
      }
    }
    return present;
  }

  /**
   * Standard deviation of the non-NA elements.
   *
   * @param skipna - When `false`, any NA makes the result `null`.
   * @param ddof - Delta degrees of freedom; the divisor is `count - ddof`
   * (default 1, i.e. the sample standard deviation).
   * @returns `null` when fewer than `ddof + 1` values are available.
   */
  std(skipna = true, ddof = 1): number | null {
    const center = this.mean(skipna);
    if (center === null) {
      return null;
    }
    const values = this.dropna();
    let sumSq = 0;
    for (const v of values) {
      const delta = v - center;
      sumSq += delta * delta;
    }
    return values.length > ddof ? Math.sqrt(sumSq / (values.length - ddof)) : null;
  }

  // ─── Element-wise arithmetic ──────────────────────────────────────────────

  /** Element-wise addition. NA propagates. */
  add(other: FloatingArray | number): FloatingArray {
    return this._binop(other, (a, b) => a + b);
  }

  /** Element-wise subtraction. NA propagates. */
  sub(other: FloatingArray | number): FloatingArray {
    return this._binop(other, (a, b) => a - b);
  }

  /** Element-wise multiplication. NA propagates. */
  mul(other: FloatingArray | number): FloatingArray {
    return this._binop(other, (a, b) => a * b);
  }

  /**
   * Element-wise division. NA propagates.
   *
   * Division by zero follows IEEE 754 (`±Infinity`, or `NaN` for `0 / 0`);
   * those results are stored as ordinary values and are not marked NA.
   */
  truediv(other: FloatingArray | number): FloatingArray {
    return this._binop(other, (a, b) => a / b);
  }

  /** Element-wise exponentiation. NA propagates. */
  pow(other: FloatingArray | number): FloatingArray {
    return this._binop(other, (a, b) => a ** b);
  }

  // ─── fillna ───────────────────────────────────────────────────────────────

  /**
   * Return a new {@link FloatingArray} with NAs replaced by `value`
   * (rounded to this array's precision).
   */
  fillna(value: number): FloatingArray {
    const fill = FloatingArray._coerce(value, this._dtype);
    const data = this._data.map((v, i) => (this._mask[i] ? fill : v));
    const mask = new Array<boolean>(data.length).fill(false);
    return FloatingArray._fromRaw(data, mask, this._dtype);
  }

  // ─── Type conversion ──────────────────────────────────────────────────────

  /**
   * Convert to another floating dtype.
   *
   * @throws TypeError if `dtype` is not a known floating dtype name.
   */
  astype(dtype: FloatingDtypeName): FloatingArray {
    if (dtype !== "Float32" && dtype !== "Float64") {
      throw new TypeError(`FloatingArray.astype: unknown dtype "${dtype}"`);
    }
    const data = this._data.map((v, i) =>
      this._mask[i] ? 0 : FloatingArray._coerce(v, dtype),
    );
    return FloatingArray._fromRaw(data, this._mask.slice(), dtype);
  }

  // ─── Private helpers ──────────────────────────────────────────────────────

  /** Round `v` to the precision of `dtype` (single precision for `"Float32"`). */
  private static _coerce(v: number, dtype: FloatingDtypeName): number {
    return dtype === "Float32" ? Math.fround(v) : v;
  }

  /**
   * Non-NA values in order, or `null` when `skipna` is `false` and the array
   * contains an NA (the reduction is then undefined).
   */
  private _observed(skipna: boolean): number[] | null {
    if (!skipna && this.hasNa()) {
      return null;
    }
    return this.dropna();
  }

  /** Shared scan behind `min`/`max`; `better` decides whether a candidate wins. */
  private _extreme(
    skipna: boolean,
    better: (candidate: number, best: number) => boolean,
  ): number | null {
    const values = this._observed(skipna);
    if (values === null) {
      return null;
    }
    let best: number | null = null;
    for (const v of values) {
      if (best === null || better(v, best)) {
        best = v;
      }
    }
    return best;
  }

  /**
   * Apply `fn` pairwise against a scalar or an equally sized array.
   *
   * A slot is NA in the result when it is NA in either operand. Results are
   * rounded to this array's precision so a `Float32` array stays `Float32`.
   * The result keeps this array's dtype even if `other` has a different one.
   *
   * @throws RangeError if `other` is an array of a different size.
   * @internal
   */
  private _binop(
    other: FloatingArray | number,
    fn: (a: number, b: number) => number,
  ): FloatingArray {
    if (typeof other !== "number" && other.size !== this.size) {
      throw new RangeError(
        `FloatingArray: operand size mismatch (${this.size} vs ${other.size})`,
      );
    }
    const data: number[] = [];
    const mask: boolean[] = [];
    for (let i = 0; i < this._data.length; i++) {
      const rhsIsNa = typeof other === "number" ? false : other._mask[i] === true;
      if (this._mask[i] || rhsIsNa) {
        data.push(0);
        mask.push(true);
        continue;
      }
      const rhs = typeof other === "number" ? other : (other._data[i] as number);
      data.push(FloatingArray._coerce(fn(this._data[i] as number, rhs), this._dtype));
      mask.push(false);
    }
    return FloatingArray._fromRaw(data, mask, this._dtype);
  }
}
+ * + * @example + * ```ts + * import { arrays } from "tsb"; + * + * // Nullable integer array + * const ints = arrays.IntegerArray.from([1, 2, null, 4], "Int32"); + * ints.toArray(); // [1, 2, null, 4] + * ints.sum(); // 7 + * + * // Nullable float array + * const floats = arrays.FloatingArray.from([1.5, null, 3.0]); + * floats.mean(); // 2.25 + * + * // Nullable boolean array (three-valued logic) + * const bools = arrays.BooleanArray.from([true, false, null]); + * bools.any(); // true + * + * // Nullable string array + * const strs = arrays.StringArray.from(["hello", null, "world"]); + * strs.upper().toArray(); // ["HELLO", null, "WORLD"] + * + * // Datetime array + * const dts = arrays.DatetimeArray.from(["2024-01-01", null]); + * dts.year; // [2024, null] + * + * // Timedelta array + * const tds = arrays.TimedeltaArray.from([86400000, null]); + * tds.days; // [1, null] + * ``` + * + * @module + */ + +export { MaskedArray } from "./masked_array.ts"; +export type { FillValue } from "./masked_array.ts"; + +export { IntegerArray } from "./integer_array.ts"; +export type { IntegerDtypeName } from "./integer_array.ts"; + +export { FloatingArray } from "./floating_array.ts"; +export type { FloatingDtypeName } from "./floating_array.ts"; + +export { BooleanArray } from "./boolean_array.ts"; + +export { StringArray } from "./string_array.ts"; + +export { DatetimeArray } from "./datetime_array.ts"; + +export { TimedeltaArray } from "./timedelta_array.ts"; diff --git a/src/core/arrays/integer_array.ts b/src/core/arrays/integer_array.ts new file mode 100644 index 00000000..7e5275b8 --- /dev/null +++ b/src/core/arrays/integer_array.ts @@ -0,0 +1,338 @@ +/** + * IntegerArray β€” nullable integer extension array. + * + * Mirrors `pandas.arrays.IntegerArray`. Stores integer values with a separate + * boolean mask to represent missing (NA) values. 
// ─── Types ────────────────────────────────────────────────────────────────────

/**
 * Nullable integer dtype names (capital letter prefix = nullable in pandas).
 */
export type IntegerDtypeName =
  | "Int8"
  | "Int16"
  | "Int32"
  | "Int64"
  | "UInt8"
  | "UInt16"
  | "UInt32"
  | "UInt64";

// ─── Bounds checking ─────────────────────────────────────────────────────────

/**
 * Inclusive `[min, max]` range per dtype.
 *
 * The 64-bit entries are capped at the JS safe-integer range because values
 * are stored as `number`.
 */
const BOUNDS: Record<IntegerDtypeName, readonly [number, number]> = {
  Int8: [-128, 127],
  Int16: [-32768, 32767],
  Int32: [-2147483648, 2147483647],
  Int64: [Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER],
  UInt8: [0, 255],
  UInt16: [0, 65535],
  UInt32: [0, 4294967295],
  UInt64: [0, Number.MAX_SAFE_INTEGER],
};

/** Known dtype names, derived from {@link BOUNDS} so the two cannot drift apart. */
const INTEGER_DTYPES: ReadonlySet<string> = new Set(Object.keys(BOUNDS));

/**
 * Type guard: `true` when `s` is one of the nullable integer dtype names.
 *
 * @internal
 */
function isIntegerDtypeName(s: string): s is IntegerDtypeName {
  return INTEGER_DTYPES.has(s);
}

/**
 * Throw unless `value` lies inside the inclusive range of `dtype`.
 *
 * `NaN` is rejected explicitly: it compares false against both bounds and
 * would otherwise pass the range test.
 *
 * @throws RangeError if `value` is `NaN` or outside the dtype's range.
 * @internal
 */
function checkBounds(value: number, dtype: IntegerDtypeName): void {
  const [lo, hi] = BOUNDS[dtype];
  if (Number.isNaN(value) || value < lo || value > hi) {
    throw new RangeError(
      `IntegerArray(${dtype}): value ${value} out of bounds [${lo}, ${hi}]`,
    );
  }
}
/**
 * A nullable integer array.
 *
 * Values are kept in a plain `number[]` next to a boolean mask in which `true`
 * marks an NA slot.
 *
 * Use {@link IntegerArray.from} to create instances.
 */
export class IntegerArray extends MaskedArray<number> {
  private readonly _dtype: IntegerDtypeName;

  /** @internal */
  constructor(data: number[], mask: boolean[], dtype: IntegerDtypeName) {
    super(data, mask);
    this._dtype = dtype;
  }

  // ─── Factory ───────────────────────────────────────────────────────────────

  /**
   * Create an {@link IntegerArray} from a sequence of values and an optional
   * dtype.
   *
   * @param values - Source values. `null`, `undefined`, and `NaN` become NA;
   * other values are truncated toward zero.
   * @param dtype - Target dtype. Defaults to `"Int64"`.
   * @throws TypeError if `dtype` is not a known integer dtype name.
   * @throws RangeError if a value does not fit in `dtype`.
   *
   * @example
   * ```ts
   * IntegerArray.from([1, 2, null, 4]);        // Int64
   * IntegerArray.from([1, 2, null], "Int32");  // Int32
   * ```
   */
  static from(
    values: Iterable<number | null | undefined>,
    dtype: IntegerDtypeName = "Int64",
  ): IntegerArray {
    if (!isIntegerDtypeName(dtype)) {
      throw new TypeError(`IntegerArray: unknown dtype "${dtype}"`);
    }
    const data: number[] = [];
    const mask: boolean[] = [];
    for (const v of values) {
      // NaN is not an integer; treat it as missing rather than storing it.
      if (v === null || v === undefined || Number.isNaN(v)) {
        data.push(0);
        mask.push(true);
        continue;
      }
      const int = Math.trunc(v);
      checkBounds(int, dtype);
      data.push(int);
      mask.push(false);
    }
    return new IntegerArray(data, mask, dtype);
  }

  /**
   * Create an {@link IntegerArray} from a raw buffer (no copying, no validation).
   *
   * @internal
   */
  static _fromRaw(
    data: number[],
    mask: boolean[],
    dtype: IntegerDtypeName,
  ): IntegerArray {
    return new IntegerArray(data, mask, dtype);
  }

  // ─── Dtype ────────────────────────────────────────────────────────────────

  get dtype(): IntegerDtypeName {
    return this._dtype;
  }

  // ─── Operations ───────────────────────────────────────────────────────────

  /**
   * Sum of the non-NA elements.
   *
   * @returns The total (0 for an empty or all-NA array), or `null` when
   * `skipna` is `false` and at least one element is NA.
   */
  sum(skipna = true): number | null {
    const values = this._observed(skipna);
    if (values === null) {
      return null;
    }
    let total = 0;
    for (const v of values) {
      total += v;
    }
    return total;
  }

  /**
   * Mean of the non-NA elements.
   *
   * @returns `null` when there is nothing to average, or when `skipna` is
   * `false` and at least one element is NA.
   */
  mean(skipna = true): number | null {
    const values = this._observed(skipna);
    if (values === null || values.length === 0) {
      return null;
    }
    let total = 0;
    for (const v of values) {
      total += v;
    }
    return total / values.length;
  }

  /** Minimum non-NA element, or `null` if none (or NA present with `skipna = false`). */
  min(skipna = true): number | null {
    return this._extreme(skipna, (candidate, best) => candidate < best);
  }

  /** Maximum non-NA element, or `null` if none (or NA present with `skipna = false`). */
  max(skipna = true): number | null {
    return this._extreme(skipna, (candidate, best) => candidate > best);
  }

  /** Number of non-NA elements. */
  count(): number {
    let present = 0;
    for (const isNa of this._mask) {
      if (!isNa) {
        present++;
      }
    }
    return present;
  }

  // ─── Element-wise arithmetic ──────────────────────────────────────────────
  // NOTE: results are not re-checked against the dtype bounds, and a zero
  // divisor in `floordiv`/`mod` yields a non-finite value in that slot.

  /** Element-wise addition. NA propagates. */
  add(other: IntegerArray | number): IntegerArray {
    return this._binop(other, (a, b) => a + b);
  }

  /** Element-wise subtraction. NA propagates. */
  sub(other: IntegerArray | number): IntegerArray {
    return this._binop(other, (a, b) => a - b);
  }

  /** Element-wise multiplication. NA propagates. */
  mul(other: IntegerArray | number): IntegerArray {
    return this._binop(other, (a, b) => a * b);
  }

  /**
   * Element-wise floor division. NA propagates.
   *
   * Rounds toward negative infinity, so `-7 // 2` is `-4` (not `-3`).
   */
  floordiv(other: IntegerArray | number): IntegerArray {
    return this._binop(other, (a, b) => Math.floor(a / b));
  }

  /**
   * Element-wise modulo. NA propagates.
   *
   * The result takes the sign of the divisor, so `-7 mod 3` is `2`; this
   * keeps `a === floordiv(a, b) * b + mod(a, b)`.
   */
  mod(other: IntegerArray | number): IntegerArray {
    return this._binop(other, (a, b) => {
      const rem = a % b;
      // JS `%` follows the dividend's sign; shift by `b` when the signs differ.
      return rem !== 0 && rem < 0 !== b < 0 ? rem + b : rem;
    });
  }

  /** Element-wise exponentiation, truncated toward zero. NA propagates. */
  pow(other: IntegerArray | number): IntegerArray {
    return this._binop(other, (a, b) => Math.trunc(a ** b));
  }

  // ─── fillna ───────────────────────────────────────────────────────────────

  /**
   * Return a new {@link IntegerArray} with NAs replaced by `value`.
   *
   * The fill value is truncated toward zero and bounds-checked, exactly like
   * the values accepted by {@link IntegerArray.from}.
   *
   * @throws RangeError if `value` does not fit in this array's dtype.
   */
  fillna(value: number): IntegerArray {
    const fill = Math.trunc(value);
    checkBounds(fill, this._dtype);
    const data = this._data.map((v, i) => (this._mask[i] ? fill : v));
    const mask = new Array<boolean>(data.length).fill(false);
    return IntegerArray._fromRaw(data, mask, this._dtype);
  }

  // ─── Type conversion ──────────────────────────────────────────────────────

  /**
   * Convert to another integer dtype.
   *
   * @throws TypeError if `dtype` is not a known integer dtype name.
   * @throws RangeError if a non-NA value does not fit in `dtype`.
   */
  astype(dtype: IntegerDtypeName): IntegerArray {
    if (!isIntegerDtypeName(dtype)) {
      throw new TypeError(`IntegerArray.astype: unknown dtype "${dtype}"`);
    }
    const data = this._data.map((v, i) => {
      if (this._mask[i]) {
        return 0;
      }
      checkBounds(v, dtype);
      return v;
    });
    return IntegerArray._fromRaw(data, this._mask.slice(), dtype);
  }

  // ─── Private helpers ──────────────────────────────────────────────────────

  /**
   * Non-NA values in order, or `null` when `skipna` is `false` and the array
   * contains an NA (the reduction is then undefined).
   */
  private _observed(skipna: boolean): number[] | null {
    if (!skipna && this.hasNa()) {
      return null;
    }
    return this.dropna();
  }

  /** Shared scan behind `min`/`max`; `better` decides whether a candidate wins. */
  private _extreme(
    skipna: boolean,
    better: (candidate: number, best: number) => boolean,
  ): number | null {
    const values = this._observed(skipna);
    if (values === null) {
      return null;
    }
    let best: number | null = null;
    for (const v of values) {
      if (best === null || better(v, best)) {
        best = v;
      }
    }
    return best;
  }

  /**
   * Apply `fn` pairwise against a scalar or an equally sized array.
   *
   * A slot is NA in the result when it is NA in either operand. The result
   * keeps this array's dtype.
   *
   * @throws RangeError if `other` is an array of a different size.
   * @internal
   */
  private _binop(
    other: IntegerArray | number,
    fn: (a: number, b: number) => number,
  ): IntegerArray {
    if (typeof other !== "number" && other.size !== this.size) {
      throw new RangeError(
        `IntegerArray: operand size mismatch (${this.size} vs ${other.size})`,
      );
    }
    const data: number[] = [];
    const mask: boolean[] = [];
    for (let i = 0; i < this._data.length; i++) {
      const rhsIsNa = typeof other === "number" ? false : other._mask[i] === true;
      if (this._mask[i] || rhsIsNa) {
        data.push(0);
        mask.push(true);
        continue;
      }
      const rhs = typeof other === "number" ? other : (other._data[i] as number);
      data.push(fn(this._data[i] as number, rhs));
      mask.push(false);
    }
    return IntegerArray._fromRaw(data, mask, this._dtype);
  }
}
/**
 * Abstract base class for masked (nullable) arrays.
 *
 * Element values and the NA mask are two parallel arrays of equal length;
 * `mask[i] === true` means element `i` is missing and `data[i]` is an unused
 * placeholder.
 *
 * @typeParam T - The underlying element type (number, boolean, string, etc.)
 *
 * @example
 * ```ts
 * // Constructed via subclasses, e.g. IntegerArray.from([1, null, 3])
 * ```
 */
export abstract class MaskedArray<T> {
  /**
   * Stored element values. When `_mask[i]` is `true` this value is
   * undefined/unused, but we always maintain the same length for both arrays.
   */
  protected readonly _data: T[];
  /**
   * Boolean mask where `true` indicates a missing value (NA).
   */
  protected readonly _mask: boolean[];

  /**
   * Wrap the given buffers. They are stored as-is, not copied.
   *
   * @throws RangeError if `data` and `mask` differ in length.
   * @internal
   */
  constructor(data: T[], mask: boolean[]) {
    if (data.length !== mask.length) {
      throw new RangeError(
        `MaskedArray: data length (${data.length}) !== mask length (${mask.length})`,
      );
    }
    this._data = data;
    this._mask = mask;
  }

  // ─── Core accessors ────────────────────────────────────────────────────────

  /** Number of elements (including NAs). */
  get size(): number {
    return this._data.length;
  }

  /** The dtype name for this array (defined by subclasses). */
  abstract get dtype(): string;

  /**
   * Return the element at index `i`, or `null` if it is masked.
   * Supports negative indexing; an out-of-range index also yields `null`.
   */
  at(i: number): T | null {
    const idx = i < 0 ? this._data.length + i : i;
    const inRange = idx >= 0 && idx < this._data.length;
    if (!inRange || this._mask[idx]) {
      return null;
    }
    return this._data[idx] ?? null;
  }

  // ─── NA / notna ────────────────────────────────────────────────────────────

  /**
   * Return a boolean array where `true` indicates a missing element.
   * The result is a copy of the mask.
   *
   * @example
   * ```ts
   * IntegerArray.from([1, null, 3]).isna(); // [false, true, false]
   * ```
   */
  isna(): boolean[] {
    return this._mask.slice();
  }

  /**
   * Return a boolean array where `true` indicates a non-missing element.
   *
   * @example
   * ```ts
   * IntegerArray.from([1, null, 3]).notna(); // [true, false, true]
   * ```
   */
  notna(): boolean[] {
    return this._mask.map((isNa) => !isNa);
  }

  /** `true` if any element is NA. */
  hasNa(): boolean {
    return this._mask.some(Boolean);
  }

  // ─── Conversion ────────────────────────────────────────────────────────────

  /**
   * Return a plain JS array where masked elements are represented as `null`.
   *
   * @example
   * ```ts
   * IntegerArray.from([1, null, 3]).toArray(); // [1, null, 3]
   * ```
   */
  toArray(): (T | null)[] {
    return this._data.map((v, i) => (this._mask[i] ? null : v));
  }

  /**
   * Return a plain JS array, replacing each NA with `naValue`.
   *
   * @example
   * ```ts
   * IntegerArray.from([1, null, 3]).toArrayFilled(0); // [1, 0, 3]
   * ```
   */
  toArrayFilled(naValue: T): T[] {
    return this._data.map((v, i) => (this._mask[i] ? naValue : v));
  }

  // ─── fillna ────────────────────────────────────────────────────────────────

  /**
   * Return a new array with NAs replaced by `value`.
   *
   * @example
   * ```ts
   * IntegerArray.from([1, null, 3]).fillna(0).toArray(); // [1, 0, 3]
   * ```
   */
  abstract fillna(value: T): MaskedArray<T>;

  // ─── dropna ────────────────────────────────────────────────────────────────

  /**
   * Return the non-NA values as a plain JS array.
   *
   * @example
   * ```ts
   * IntegerArray.from([1, null, 3]).dropna(); // [1, 3]
   * ```
   */
  dropna(): T[] {
    const kept: T[] = [];
    this._data.forEach((v, i) => {
      if (!this._mask[i]) {
        kept.push(v);
      }
    });
    return kept;
  }

  // ─── Iteration ─────────────────────────────────────────────────────────────

  /** Iterate over the elements in order, yielding `null` for NA slots. */
  [Symbol.iterator](): Iterator<T | null> {
    const data = this._data;
    const mask = this._mask;
    let cursor = 0;
    return {
      next(): IteratorResult<T | null> {
        if (cursor >= data.length) {
          return { value: null, done: true };
        }
        const value = mask[cursor] ? null : (data[cursor] ?? null);
        cursor++;
        return { value, done: false };
      },
    };
  }

  // ─── String representation ─────────────────────────────────────────────────

  /**
   * Render as `dtype([v0, v1, …])`.
   *
   * NA slots are shown as `<NA>` so they stay visible and cannot be confused
   * with an empty string value.
   */
  toString(): string {
    const items = this.toArray().map((v) => (v === null ? "<NA>" : String(v)));
    return `${this.dtype}([${items.join(", ")}])`;
  }
}
/**
 * A nullable string array.
 *
 * Values are kept in a plain `string[]` next to a boolean mask in which `true`
 * marks an NA slot (the slot itself then holds `""` as a placeholder).
 *
 * Use {@link StringArray.from} to create instances.
 */
export class StringArray extends MaskedArray<string> {
  /** @internal */
  constructor(data: string[], mask: boolean[]) {
    super(data, mask);
  }

  // ─── Factory ───────────────────────────────────────────────────────────────

  /**
   * Create a {@link StringArray} from a sequence of string values (or null/undefined).
   *
   * `null` and `undefined` become NA; every other value is passed through
   * `String()`.
   *
   * @example
   * ```ts
   * StringArray.from(["a", "b", null, "d"]);
   * ```
   */
  static from(values: Iterable<string | null | undefined>): StringArray {
    const data: string[] = [];
    const mask: boolean[] = [];
    for (const v of values) {
      const missing = v === null || v === undefined;
      data.push(missing ? "" : String(v));
      mask.push(missing);
    }
    return new StringArray(data, mask);
  }

  /**
   * Wrap existing buffers without copying or validating them.
   *
   * @internal
   */
  static _fromRaw(data: string[], mask: boolean[]): StringArray {
    return new StringArray(data, mask);
  }

  // ─── Dtype ────────────────────────────────────────────────────────────────

  get dtype(): "string" {
    return "string";
  }

  // ─── String operations ────────────────────────────────────────────────────

  /** Return a new StringArray with all strings uppercased. NA is preserved. */
  upper(): StringArray {
    return this._mapStr((s) => s.toUpperCase());
  }

  /** Return a new StringArray with all strings lowercased. NA is preserved. */
  lower(): StringArray {
    return this._mapStr((s) => s.toLowerCase());
  }

  /** Return a new StringArray with leading/trailing whitespace stripped. */
  strip(): StringArray {
    return this._mapStr((s) => s.trim());
  }

  /** Return a new StringArray with leading whitespace stripped. */
  lstrip(): StringArray {
    return this._mapStr((s) => s.trimStart());
  }

  /** Return a new StringArray with trailing whitespace stripped. */
  rstrip(): StringArray {
    return this._mapStr((s) => s.trimEnd());
  }

  /**
   * Return a {@link BooleanArray} where `true` if the element contains `pattern`.
   * NA elements remain NA in the result.
   *
   * A string pattern is matched literally; a `RegExp` is tested against each
   * element independently.
   *
   * @example
   * ```ts
   * StringArray.from(["abc", null, "xyz"]).contains("a");
   * // BooleanArray [true, null, false]
   * ```
   */
  contains(pattern: string | RegExp): BooleanArray {
    if (typeof pattern === "string") {
      return this._mapBool((s) => s.includes(pattern));
    }
    return this._mapBool((s) => {
      // A global/sticky RegExp resumes from `lastIndex` on each `test()`;
      // reset it so one element's match cannot skew the next element.
      pattern.lastIndex = 0;
      return pattern.test(s);
    });
  }

  /**
   * Return a BooleanArray where `true` if the element starts with `prefix`.
   * NA elements remain NA in the result.
   */
  startswith(prefix: string): BooleanArray {
    return this._mapBool((s) => s.startsWith(prefix));
  }

  /**
   * Return a BooleanArray where `true` if the element ends with `suffix`.
   * NA elements remain NA in the result.
   */
  endswith(suffix: string): BooleanArray {
    return this._mapBool((s) => s.endsWith(suffix));
  }

  /**
   * Return a new StringArray with occurrences of `pat` replaced by `repl`.
   *
   * A string `pat` is matched literally and every occurrence is replaced.
   * A `RegExp` `pat` follows its own flags (add `g` to replace all matches).
   * `repl` is handed to `String.prototype.replace`, so `$`-patterns in it
   * keep their usual meaning.
   */
  replace(pat: string | RegExp, repl: string): StringArray {
    if (typeof pat !== "string") {
      return this._mapStr((s) => s.replace(pat, repl));
    }
    // Escape regex metacharacters so the literal text can be matched globally.
    const literal = pat.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const everywhere = new RegExp(literal, "g");
    return this._mapStr((s) => s.replace(everywhere, repl));
  }

  /** Return a StringArray with strings zero-padded on the left to `width`. */
  zfill(width: number): StringArray {
    return this._mapStr((s) => s.padStart(width, "0"));
  }

  /**
   * String length for each element as an {@link IntegerArray} (NA → NA).
   *
   * Lengths are JS `string.length` values, i.e. UTF-16 code units.
   *
   * @example
   * ```ts
   * StringArray.from(["hi", null, "world"]).len().toArray(); // [2, null, 5]
   * ```
   */
  len(): IntegerArray {
    const lengths = this._data.map((s, i) => (this._mask[i] ? 0 : s.length));
    const mask = this._mask.map((isNa) => isNa === true);
    return IntegerArray._fromRaw(lengths, mask, "Int64");
  }

  /**
   * Concatenate strings element-wise with a separator.
   * A slot is NA in the result when it is NA in either operand.
   *
   * @throws RangeError if `other` has a different size.
   *
   * @example
   * ```ts
   * StringArray.from(["a", "b"]).cat(" ", StringArray.from(["x", "y"]));
   * // StringArray ["a x", "b y"]
   * ```
   */
  cat(sep: string, other: StringArray): StringArray {
    if (other.size !== this.size) {
      throw new RangeError(
        `StringArray.cat: size mismatch (${this.size} vs ${other.size})`,
      );
    }
    const data: string[] = [];
    const mask: boolean[] = [];
    for (let i = 0; i < this._data.length; i++) {
      const missing = this._mask[i] === true || other._mask[i] === true;
      data.push(
        missing ? "" : (this._data[i] as string) + sep + (other._data[i] as string),
      );
      mask.push(missing);
    }
    return StringArray._fromRaw(data, mask);
  }

  /**
   * Return a new StringArray with NA elements replaced.
   *
   * @example
   * ```ts
   * StringArray.from(["a", null, "c"]).fillna("x").toArray();
   * // ["a", "x", "c"]
   * ```
   */
  fillna(value: string): StringArray {
    const data = this._data.map((v, i) => (this._mask[i] ? value : v));
    const mask = new Array<boolean>(data.length).fill(false);
    return StringArray._fromRaw(data, mask);
  }

  // ─── Reductions ───────────────────────────────────────────────────────────

  /** Count of non-NA elements. */
  count(): number {
    let present = 0;
    for (const isNa of this._mask) {
      if (!isNa) {
        present++;
      }
    }
    return present;
  }

  // ─── Internal helpers ─────────────────────────────────────────────────────

  /** Apply `fn` to every non-NA string; NA slots stay NA. */
  private _mapStr(fn: (s: string) => string): StringArray {
    const data = this._data.map((v, i) => (this._mask[i] ? "" : fn(v)));
    return StringArray._fromRaw(data, this._mask.slice());
  }

  /** Evaluate `pred` on every non-NA string; NA slots stay NA in the result. */
  private _mapBool(pred: (s: string) => boolean): BooleanArray {
    const data: boolean[] = [];
    const mask: boolean[] = [];
    for (let i = 0; i < this._data.length; i++) {
      const missing = this._mask[i] === true;
      data.push(missing ? false : pred(this._data[i] as string));
      mask.push(missing);
    }
    return BooleanArray._fromRaw(data, mask);
  }
}
+ * + * @example + * ```ts + * TimedeltaArray.from([ + * Timedelta.fromComponents({ days: 1 }), + * null, + * 86400000, // 1 day in ms + * "1 days 00:00:00", + * ]); + * ``` + */ + static from( + values: Iterable, + ): TimedeltaArray { + const data: Timedelta[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(Timedelta.fromMilliseconds(0)); + mask.push(true); + } else if (v instanceof Timedelta) { + data.push(v); + mask.push(false); + } else if (typeof v === "number") { + data.push(Timedelta.fromMilliseconds(v)); + mask.push(false); + } else { + data.push(Timedelta.parse(v)); + mask.push(false); + } + } + return new TimedeltaArray(data, mask); + } + + /** @internal */ + static _fromRaw(data: Timedelta[], mask: boolean[]): TimedeltaArray { + return new TimedeltaArray(data, mask); + } + + // ─── Core accessors ──────────────────────────────────────────────────────── + + /** Number of elements (including NAs). */ + get size(): number { + return this._data.length; + } + + /** Dtype string β€” `"timedelta64[ns]"`. */ + get dtype(): "timedelta64[ns]" { + return "timedelta64[ns]"; + } + + /** + * Return the element at index `i`, or `null` if masked. + * Supports negative indexing. + */ + at(i: number): Timedelta | null { + const idx = i < 0 ? this._data.length + i : i; + if (idx < 0 || idx >= this._data.length) { + return null; + } + if (this._mask[idx]) { + return null; + } + return this._data[idx] ?? null; + } + + // ─── NA ──────────────────────────────────────────────────────────────────── + + /** Boolean array where `true` = NA. */ + isna(): boolean[] { + return this._mask.slice(); + } + + /** Boolean array where `true` = not NA. */ + notna(): boolean[] { + return this._mask.map((m) => !m); + } + + // ─── Component accessors ────────────────────────────────────────────────── + + /** Integer days component for each element (NA β†’ null). 
*/ + get days(): (number | null)[] { + return this._extractComponent((td) => td.days); + } + + /** Integer hours component for each element (NA β†’ null). */ + get hours(): (number | null)[] { + return this._extractComponent((td) => td.hours); + } + + /** Integer minutes component for each element (NA β†’ null). */ + get minutes(): (number | null)[] { + return this._extractComponent((td) => td.minutes); + } + + /** Integer seconds component for each element (NA β†’ null). */ + get seconds(): (number | null)[] { + return this._extractComponent((td) => td.seconds); + } + + /** Integer milliseconds component for each element (NA β†’ null). */ + get milliseconds(): (number | null)[] { + return this._extractComponent((td) => td.milliseconds); + } + + /** Total number of milliseconds for each element (NA β†’ null). */ + get totalMilliseconds(): (number | null)[] { + return this._extractComponent((td) => td.totalMilliseconds); + } + + /** Total number of seconds (float) for each element (NA β†’ null). */ + get totalSeconds(): (number | null)[] { + return this._extractComponent((td) => td.totalSeconds); + } + + /** Total number of hours (float) for each element (NA β†’ null). */ + get totalHours(): (number | null)[] { + return this._extractComponent((td) => td.totalHours); + } + + /** Total number of days (float) for each element (NA β†’ null). */ + get totalDays(): (number | null)[] { + return this._extractComponent((td) => td.totalDays); + } + + // ─── Arithmetic ─────────────────────────────────────────────────────────── + + /** + * Add a scalar {@link Timedelta} to every element. NA propagates. + */ + add(other: TimedeltaArray | Timedelta): TimedeltaArray { + if (other instanceof Timedelta) { + const data = this._data.map((v, i) => (this._mask[i] ? 
v : v.add(other))); + return TimedeltaArray._fromRaw(data, this._mask.slice()); + } + if (other.size !== this.size) { + throw new RangeError( + `TimedeltaArray: operand size mismatch (${this.size} vs ${other.size})`, + ); + } + const data: Timedelta[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i] || other._mask[i]) { + data.push(Timedelta.fromMilliseconds(0)); + mask.push(true); + } else { + data.push((this._data[i] as Timedelta).add(other._data[i] as Timedelta)); + mask.push(false); + } + } + return TimedeltaArray._fromRaw(data, mask); + } + + /** + * Subtract a scalar {@link Timedelta} from every element. NA propagates. + */ + sub(other: TimedeltaArray | Timedelta): TimedeltaArray { + if (other instanceof Timedelta) { + const data = this._data.map((v, i) => + this._mask[i] ? v : v.sub(other), + ); + return TimedeltaArray._fromRaw(data, this._mask.slice()); + } + if (other.size !== this.size) { + throw new RangeError( + `TimedeltaArray: operand size mismatch (${this.size} vs ${other.size})`, + ); + } + const data: Timedelta[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i] || other._mask[i]) { + data.push(Timedelta.fromMilliseconds(0)); + mask.push(true); + } else { + data.push((this._data[i] as Timedelta).sub(other._data[i] as Timedelta)); + mask.push(false); + } + } + return TimedeltaArray._fromRaw(data, mask); + } + + /** Multiply every element by a scalar. NA propagates. */ + mul(factor: number): TimedeltaArray { + const data = this._data.map((v, i) => + this._mask[i] ? v : v.mul(factor), + ); + return TimedeltaArray._fromRaw(data, this._mask.slice()); + } + + // ─── Conversion ──────────────────────────────────────────────────────────── + + /** Return an array of {@link Timedelta} or `null` for NA positions. */ + toArray(): (Timedelta | null)[] { + return this._data.map((v, i) => (this._mask[i] ? 
null : v)); + } + + // ─── Reductions ─────────────────────────────────────────────────────────── + + /** Sum of non-NA elements (millisecond precision). */ + sum(skipna = true): Timedelta | null { + let total = 0; + let hasNonNa = false; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + total += (this._data[i] as Timedelta).totalMilliseconds; + hasNonNa = true; + } + return hasNonNa || skipna ? Timedelta.fromMilliseconds(total) : null; + } + + /** Minimum non-NA element. */ + min(): Timedelta | null { + let result: Timedelta | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timedelta; + if (result === null || v.totalMilliseconds < result.totalMilliseconds) { + result = v; + } + } + return result; + } + + /** Maximum non-NA element. */ + max(): Timedelta | null { + let result: Timedelta | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timedelta; + if (result === null || v.totalMilliseconds > result.totalMilliseconds) { + result = v; + } + } + return result; + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** Return a new TimedeltaArray with NAs replaced by `value`. */ + fillna(value: Timedelta): TimedeltaArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return TimedeltaArray._fromRaw(data, mask); + } + + // ─── Iteration ───────────────────────────────────────────────────────────── + + [Symbol.iterator](): Iterator { + let i = 0; + const data = this._data; + const mask = this._mask; + return { + next() { + if (i >= data.length) { + return { value: null, done: true }; + } + const value = mask[i] ? null : (data[i] ?? 
null); + i++; + return { value, done: false }; + }, + }; + } + + // ─── String representation ───────────────────────────────────────────────── + + toString(): string { + const items = this.toArray().map((v) => (v === null ? "" : v.toString())); + return `TimedeltaArray([${items.join(", ")}], dtype="${this.dtype}")`; + } + + // ─── Private helper ──────────────────────────────────────────────────────── + + private _extractComponent(fn: (td: Timedelta) => number): (number | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : fn(v))); + } +} diff --git a/src/core/flags.ts b/src/core/flags.ts new file mode 100644 index 00000000..546cb031 --- /dev/null +++ b/src/core/flags.ts @@ -0,0 +1,186 @@ +/** + * Flags β€” metadata flags for DataFrame and Series objects. + * + * Mirrors `pandas.core.flags.Flags`. Provides the `allowsDuplicateLabels` + * flag that controls whether duplicate row/column labels are permitted in the + * associated DataFrame or Series. + * + * @example + * ```ts + * import { DataFrame, DuplicateLabelError } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + * df.flags.allowsDuplicateLabels; // true (default) + * + * df.flags.allowsDuplicateLabels = false; + * // Setting false on a DataFrame with no duplicates is fine. + * + * const dfDup = new DataFrame( + * new Map([["a", df.col("a")]]), + * df.index.append(df.index), // duplicate index + * ); + * dfDup.flags.allowsDuplicateLabels = false; // throws DuplicateLabelError + * ``` + * + * @packageDocumentation + */ + +import { DuplicateLabelError } from "../errors.ts"; + +// --------------------------------------------------------------------------- +// Structural interfaces (no imports from frame.ts / series.ts) +// --------------------------------------------------------------------------- + +/** + * Minimal structural interface satisfied by any `Index` instance. 
/**
 * Minimal structural view of an `Index`: only the members this module reads.
 *
 * Declared locally (rather than imported from the index module) so that the
 * frame/series modules can import this file without creating an import cycle.
 */
interface IndexLike {
  readonly values: readonly unknown[];
  readonly size: number;
}

/**
 * Structural interface satisfied by both `DataFrame` and `Series`.
 * Instances are used as the WeakMap key, so this module never needs the
 * concrete classes.
 */
export interface FlaggedObject {
  /** Row index of the object. */
  readonly index: IndexLike;
}

/** Per-object flag values kept in the module-level registry. */
interface FlagsState {
  allowsDuplicateLabels: boolean;
}

/** Flag state keyed by object identity; entries vanish when the object is collected. */
const registry = new WeakMap<FlaggedObject, FlagsState>();

/** Return the state record for `obj`, creating the default (`true`) on first use. */
function getState(obj: FlaggedObject): FlagsState {
  let state = registry.get(obj);
  if (state === undefined) {
    state = { allowsDuplicateLabels: true };
    registry.set(obj, state);
  }
  return state;
}

/**
 * Metadata flags for a `DataFrame` or `Series`.
 *
 * Every `Flags` wrapper bound to the same object shares one state record, so
 * a change made through one wrapper is visible through all of them.
 *
 * Mirrors `pandas.core.flags.Flags`.
 */
export class Flags {
  private readonly _obj: FlaggedObject;

  /**
   * @param obj - The object these flags belong to.
   * @param opts.allowsDuplicateLabels - When given, overwrites the stored flag
   *   value as-is (no duplicate check is performed here). When omitted, the
   *   previously stored value, or the default `true`, is left untouched.
   */
  constructor(obj: FlaggedObject, opts: { allowsDuplicateLabels?: boolean } = {}) {
    this._obj = obj;
    if (opts.allowsDuplicateLabels !== undefined) {
      getState(obj).allowsDuplicateLabels = opts.allowsDuplicateLabels;
    }
  }

  /**
   * Whether duplicate row-index labels are allowed. Defaults to `true`.
   *
   * Assigning `false` first checks the bound object's index and throws
   * `DuplicateLabelError` if it already contains duplicates.
   *
   * @example
   * ```ts
   * df.flags.allowsDuplicateLabels;         // true
   * df.flags.allowsDuplicateLabels = false;
   * df.flags.allowsDuplicateLabels;         // false
   * ```
   */
  get allowsDuplicateLabels(): boolean {
    return getState(this._obj).allowsDuplicateLabels;
  }

  set allowsDuplicateLabels(value: boolean) {
    // Validate BEFORE committing: a rejected assignment must leave the flag
    // unchanged, otherwise the object ends up flagged "no duplicates" while
    // still holding duplicates.
    if (!value) {
      this._validateNoDuplicates();
    }
    getState(this._obj).allowsDuplicateLabels = value;
  }

  /**
   * Throw `DuplicateLabelError` if the bound object's row index currently
   * contains the same label twice. Only `index.values` is inspected.
   */
  private _validateNoDuplicates(): void {
    const seen = new Set<unknown>();
    for (const label of this._obj.index.values) {
      if (seen.has(label)) {
        throw new DuplicateLabelError(`Index has duplicate keys: [${String(label)}]`);
      }
      seen.add(label);
    }
  }

  /**
   * Throw `DuplicateLabelError` when duplicates are disallowed and the bound
   * object has duplicate row labels; otherwise do nothing.
   */
  raiseOnDuplicates(): void {
    if (!this.allowsDuplicateLabels) {
      this._validateNoDuplicates();
    }
  }

  /**
   * Return a new wrapper bound to the **same** object. The copy shares state
   * with the original because both read and write the same registry entry.
   */
  copy(): Flags {
    return new Flags(this._obj);
  }

  /** Human-readable form in the style of pandas' `repr(df.flags)`. */
  toString(): string {
    const shown = this.allowsDuplicateLabels ? "True" : "False";
    return `<Flags(allows_duplicate_labels=${shown})>`;
  }
}

/**
 * Return a `Flags` wrapper for `obj`.
 *
 * A fresh wrapper is created on every call; the flag values themselves live
 * in the shared registry, so all wrappers for one object agree.
 *
 * @param obj - The DataFrame or Series to get flags for.
 */
export function getFlags(obj: FlaggedObject): Flags {
  return new Flags(obj);
}
+ * + * @example + * ```ts + * df.flags.allowsDuplicateLabels; // true (default) + * df.flags.allowsDuplicateLabels = false; + * ``` + */ + get flags(): Flags { + return getFlags(this); + } + // ─── column access ──────────────────────────────────────────────────────── /** diff --git a/src/core/index.ts b/src/core/index.ts index 130c748e..01a0c60c 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -151,3 +151,23 @@ export type { ExtensionDtypeConstructor, ExtensionArrayConstructor, } from "./extensions.ts"; + +export { Flags, getFlags } from "./flags.ts"; +export type { FlaggedObject } from "./flags.ts"; + +// pd.arrays β€” nullable typed extension arrays +export { + MaskedArray, + IntegerArray, + FloatingArray, + BooleanArray, + StringArray, + DatetimeArray, + TimedeltaArray, +} from "./arrays/index.ts"; +export type { + FillValue, + IntegerDtypeName, + FloatingDtypeName, +} from "./arrays/index.ts"; +export { SparseArray, SparseDtype } from "./sparse.ts"; diff --git a/src/core/series.ts b/src/core/series.ts index 38b5fd64..a6e4900c 100644 --- a/src/core/series.ts +++ b/src/core/series.ts @@ -21,6 +21,8 @@ import type { CatSeriesLike } from "./cat_accessor.ts"; import { DatetimeAccessor } from "./datetime_accessor.ts"; import type { DatetimeSeriesLike } from "./datetime_accessor.ts"; import { Dtype } from "./dtype.ts"; +import { getFlags } from "./flags.ts"; +import type { Flags } from "./flags.ts"; import { RangeIndex } from "./range-index.ts"; import { StringAccessor } from "./string_accessor.ts"; import type { StringSeriesLike } from "./string_accessor.ts"; @@ -296,6 +298,21 @@ export class Series { return this._values.length === 0; } + /** + * Metadata flags for this Series. + * + * Controls behaviour such as whether duplicate labels are allowed. 
+ * + * @example + * ```ts + * s.flags.allowsDuplicateLabels; // true (default) + * s.flags.allowsDuplicateLabels = false; + * ``` + */ + get flags(): Flags { + return getFlags(this); + } + /** Snapshot of the underlying values as a plain array. */ get values(): readonly T[] { return this._values; diff --git a/src/core/sparse.ts b/src/core/sparse.ts new file mode 100644 index 00000000..5a1e2de3 --- /dev/null +++ b/src/core/sparse.ts @@ -0,0 +1,655 @@ +/** + * core/sparse β€” SparseArray and SparseDtype. + * + * Mirrors `pandas.arrays.SparseArray` and `pandas.SparseDtype`. + * + * A {@link SparseArray} stores data efficiently when most values equal a + * {@link SparseDtype.fill_value fill_value} (commonly `NaN` for floats or + * `0` for integers). Only the **non-fill** values and their indices are stored; + * the fill value is inferred for all other positions. + * + * @example + * ```ts + * import { SparseArray, SparseDtype } from "tsb"; + * + * // Create a sparse array where most elements are 0 + * const arr = SparseArray.fromDense([1, 0, 0, 0, 2, 0, 0, 3], 0); + * arr.length; // 8 + * arr.npoints; // 3 (only three non-zero values stored) + * arr.density; // 0.375 + * arr.sp_values; // [1, 2, 3] + * arr.sp_index; // [0, 4, 7] + * arr.toDense(); // [1, 0, 0, 0, 2, 0, 0, 3] + * + * // With NaN fill (the pandas default) + * const a2 = SparseArray.fromDense([1, NaN, NaN, 4]); + * a2.density; // 0.5 + * ``` + * + * @module + */ + +// ─── SparseDtype ────────────────────────────────────────────────────────────── + +/** + * Dtype representing a sparse array backed by {@link SparseArray}. + * + * Mirrors `pandas.SparseDtype`. The dtype is parameterised by: + * - `subtype` β€” the dtype of the stored values, e.g. `"float64"`, `"int64"`. + * - `fill_value` β€” the implicit value for positions not stored. Defaults to + * `NaN` for float subtypes and `0` for integer subtypes. 
/**
 * Dtype describing a sparse array backed by {@link SparseArray}.
 *
 * Mirrors `pandas.SparseDtype`. It is parameterised by the label of the stored
 * element type (`subtype`) and the implicit `fill_value`.
 *
 * @example
 * ```ts
 * new SparseDtype("float64").name;   // "Sparse[float64]"   (fill NaN is the default)
 * new SparseDtype("int64", 0).name;  // "Sparse[int64]"     (fill 0 is the default)
 * new SparseDtype("int64", 1).name;  // "Sparse[int64, 1]"
 * ```
 */
export class SparseDtype {
  /** Element dtype label, e.g. `"float64"` or `"int64"`. */
  readonly subtype: string;
  /** Value implied at every position that is not explicitly stored. */
  readonly fill_value: number;

  /**
   * @param subtype - Element dtype label. Defaults to `"float64"`.
   * @param fill_value - Implicit fill value. When omitted: `0` if `subtype`
   *   starts with `"int"` or `"uint"`, otherwise `NaN`.
   */
  constructor(subtype = "float64", fill_value?: number) {
    this.subtype = subtype;
    this.fill_value =
      fill_value === undefined ? SparseDtype._defaultFillValue(subtype) : fill_value;
  }

  /** Default fill for a subtype label: `0` for `int*` / `uint*`, `NaN` otherwise. */
  private static _defaultFillValue(subtype: string): number {
    return subtype.startsWith("int") || subtype.startsWith("uint") ? 0 : Number.NaN;
  }

  /**
   * String form: `Sparse[<subtype>]` when the fill value is the subtype's
   * default, `Sparse[<subtype>, <fill>]` otherwise.
   */
  get name(): string {
    const fill = this.fill_value;
    const fallback = SparseDtype._defaultFillValue(this.subtype);
    // `===` cannot match NaN with NaN, hence the explicit second test.
    const usesDefault = fill === fallback || (Number.isNaN(fill) && Number.isNaN(fallback));
    return usesDefault ? `Sparse[${this.subtype}]` : `Sparse[${this.subtype}, ${fill}]`;
  }

  /** @internal */
  toString(): string {
    return this.name;
  }
}

/**
 * Array that keeps only the values differing from a common fill value,
 * together with their positions. All other positions read as the fill value.
 *
 * Mirrors `pandas.arrays.SparseArray`.
 *
 * @example
 * ```ts
 * const arr = SparseArray.fromDense([0, 0, 5, 0, 0, 3], 0);
 * arr.sp_values; // [5, 3]
 * arr.sp_index;  // [2, 5]
 * arr.toDense(); // [0, 0, 5, 0, 0, 3]
 * arr.sum();     // 8
 * ```
 */
export class SparseArray {
  private readonly _length: number;
  /** 0-based positions of the stored values, ascending and unique. */
  private readonly _indices: Int32Array;
  /** Stored values; `_values[k]` lives at position `_indices[k]`. */
  private readonly _values: Float64Array;
  private readonly _fillValue: number;
  private readonly _dtype: SparseDtype;

  /** @internal Use {@link SparseArray.fromDense} or {@link SparseArray.fromSparse}. */
  private constructor(
    length: number,
    indices: Int32Array,
    values: Float64Array,
    fillValue: number,
    subtype: string,
  ) {
    this._length = length;
    this._indices = indices;
    this._values = values;
    this._fillValue = fillValue;
    this._dtype = new SparseDtype(subtype, fillValue);
  }

  // --- factories ------------------------------------------------------------

  /**
   * Build a sparse array from a dense one.
   *
   * `null` / `undefined` entries are read as `NaN`. An entry is stored only if
   * it differs from `fill_value` under `Object.is` (so `NaN` matches `NaN`).
   *
   * @param data - Dense input.
   * @param fill_value - Implicit fill value. Defaults to `NaN`.
   * @param subtype - Element dtype label. Defaults to `"float64"`.
   */
  static fromDense(
    data: readonly (number | null | undefined)[],
    fill_value = Number.NaN,
    subtype = "float64",
  ): SparseArray {
    const positions: number[] = [];
    const stored: number[] = [];
    for (let i = 0; i < data.length; i++) {
      const raw = data[i];
      const v = raw == null ? Number.NaN : raw;
      if (!SparseArray._isFill(v, fill_value)) {
        positions.push(i);
        stored.push(v);
      }
    }
    return new SparseArray(
      data.length,
      new Int32Array(positions),
      new Float64Array(stored),
      fill_value,
      subtype,
    );
  }

  /**
   * Build a sparse array directly from (position, value) pairs.
   *
   * Pairs may be given in any order; they are sorted by position so that
   * lookups (which binary-search the positions) stay correct.
   *
   * @param length - Total logical length of the array.
   * @param indices - 0-based positions of the stored values.
   * @param values - Stored values, one per entry of `indices`.
   * @param fill_value - Implicit fill value. Defaults to `NaN`.
   * @param subtype - Element dtype label. Defaults to `"float64"`.
   * @throws RangeError if the two lists differ in length, if a position is
   *   not an integer in `[0, length)`, or if a position occurs twice.
   */
  static fromSparse(
    length: number,
    indices: readonly number[],
    values: readonly number[],
    fill_value = Number.NaN,
    subtype = "float64",
  ): SparseArray {
    if (indices.length !== values.length) {
      throw new RangeError(
        `indices.length (${indices.length}) must equal values.length (${values.length})`,
      );
    }
    const pairs: [number, number][] = [];
    for (let k = 0; k < indices.length; k++) {
      const idx = indices[k];
      if (idx === undefined || !Number.isInteger(idx) || idx < 0 || idx >= length) {
        throw new RangeError(`Index ${String(idx)} out of bounds for length ${length}`);
      }
      pairs.push([idx, values[k] ?? Number.NaN]);
    }
    pairs.sort((a, b) => a[0] - b[0]);
    let previous = -1;
    for (const [idx] of pairs) {
      if (idx === previous) {
        throw new RangeError(`Duplicate index ${idx} in sparse indices`);
      }
      previous = idx;
    }
    return new SparseArray(
      length,
      Int32Array.from(pairs, (p) => p[0]),
      Float64Array.from(pairs, (p) => p[1]),
      fill_value,
      subtype,
    );
  }

  /** `true` when `v` is the fill value; `Object.is` makes `NaN` match `NaN`. */
  private static _isFill(v: number, fill: number): boolean {
    return Object.is(v, fill);
  }

  // --- properties -----------------------------------------------------------

  /** Total logical length, fill positions included. */
  get length(): number {
    return this._length;
  }

  /** Number of explicitly stored values. */
  get npoints(): number {
    return this._values.length;
  }

  /** Stored values divided by total length; `0` for an empty array. */
  get density(): number {
    return this._length === 0 ? 0 : this._values.length / this._length;
  }

  /** The implicit fill value. */
  get fill_value(): number {
    return this._fillValue;
  }

  /** Copy of the stored values in position order (pandas `sp_values`). */
  get sp_values(): number[] {
    return Array.from(this._values);
  }

  /** Copy of the positions of the stored values (pandas `sp_index`). */
  get sp_index(): number[] {
    return Array.from(this._indices);
  }

  /** The {@link SparseDtype} of this array. */
  get dtype(): SparseDtype {
    return this._dtype;
  }

  // --- element access -------------------------------------------------------

  /**
   * Value at position `i`; the fill value when nothing is stored there.
   *
   * @throws RangeError if `i` is outside `[0, length)`.
   */
  at(i: number): number {
    if (i < 0 || i >= this._length) {
      throw new RangeError(`Index ${i} out of bounds for length ${this._length}`);
    }
    const slot = this._bsearch(i);
    return slot >= 0 ? (this._values[slot] ?? this._fillValue) : this._fillValue;
  }

  /** Binary-search `_indices` for `idx`; returns its slot, or `-1` if absent. */
  private _bsearch(idx: number): number {
    let lo = 0;
    let hi = this._indices.length - 1;
    while (lo <= hi) {
      const mid = (lo + hi) >>> 1;
      const probe = this._indices[mid];
      if (probe === undefined) {
        return -1;
      }
      if (probe === idx) {
        return mid;
      }
      if (probe < idx) {
        lo = mid + 1;
      } else {
        hi = mid - 1;
      }
    }
    return -1;
  }

  // --- conversion -----------------------------------------------------------

  /** Expand to a plain `number[]` with the fill value written out. */
  toDense(): number[] {
    const out = new Array<number>(this._length).fill(this._fillValue);
    for (let k = 0; k < this._indices.length; k++) {
      const idx = this._indices[k];
      const val = this._values[k];
      if (idx !== undefined && val !== undefined) {
        out[idx] = val;
      }
    }
    return out;
  }

  /** Coordinate form: `indices` (positions) and `values` (stored values). */
  toCoo(): { indices: number[]; values: number[] } {
    return { indices: this.sp_index, values: this.sp_values };
  }

  // --- operations -----------------------------------------------------------

  /**
   * Replace every `NaN` of the dense view with `value`.
   *
   * If the fill value is `NaN` it becomes `value`. Stored `NaN`s are replaced
   * as well, whatever the fill value is; a replacement equal to the resulting
   * fill value is dropped from storage instead of being stored.
   *
   * @example
   * ```ts
   * SparseArray.fromDense([1, NaN, NaN, 4]).fillna(0).toDense(); // [1, 0, 0, 4]
   * ```
   */
  fillna(value: number): SparseArray {
    const newFill = Number.isNaN(this._fillValue) ? value : this._fillValue;
    const positions: number[] = [];
    const stored: number[] = [];
    for (let k = 0; k < this._indices.length; k++) {
      const idx = this._indices[k];
      const v = this._values[k];
      if (idx === undefined || v === undefined) {
        continue;
      }
      if (!Number.isNaN(v)) {
        positions.push(idx);
        stored.push(v);
      } else if (value !== newFill) {
        positions.push(idx);
        stored.push(value);
      }
    }
    return new SparseArray(
      this._length,
      new Int32Array(positions),
      new Float64Array(stored),
      newFill,
      this._dtype.subtype,
    );
  }

  /**
   * Same dense content, re-encoded against `newFill`: positions equal to
   * `newFill` are no longer stored, all others are.
   */
  withFillValue(newFill: number): SparseArray {
    return SparseArray.fromDense(this.toDense(), newFill, this._dtype.subtype);
  }

  /**
   * Add `scalar` to every element. The fill value is shifted too, so only the
   * stored points need touching; a shifted value equal to the new fill is
   * dropped from storage.
   *
   * @example
   * ```ts
   * SparseArray.fromDense([1, 0, 0, 4], 0).add(10).toDense(); // [11, 10, 10, 14]
   * ```
   */
  add(scalar: number): SparseArray {
    const newFill = this._fillValue + scalar;
    const positions: number[] = [];
    const stored: number[] = [];
    for (let k = 0; k < this._indices.length; k++) {
      const idx = this._indices[k];
      const v = this._values[k];
      if (idx === undefined || v === undefined) {
        continue;
      }
      const shifted = v + scalar;
      if (!SparseArray._isFill(shifted, newFill)) {
        positions.push(idx);
        stored.push(shifted);
      }
    }
    return new SparseArray(
      this._length,
      new Int32Array(positions),
      new Float64Array(stored),
      newFill,
      this._dtype.subtype,
    );
  }

  /**
   * Multiply every element by `scalar`. Stored positions are kept as they are;
   * stored values and the fill value are both scaled.
   *
   * @example
   * ```ts
   * SparseArray.fromDense([1, 0, 0, 4], 0).mul(2).toDense(); // [2, 0, 0, 8]
   * ```
   */
  mul(scalar: number): SparseArray {
    return new SparseArray(
      this._length,
      new Int32Array(this._indices),
      this._values.map((v) => v * scalar),
      this._fillValue * scalar,
      this._dtype.subtype,
    );
  }

  // --- aggregations ---------------------------------------------------------

  /** Sum of all non-NaN elements, fill positions included; `0` if there are none. */
  sum(): number {
    let total = 0;
    for (const v of this._values) {
      if (!Number.isNaN(v)) {
        total += v;
      }
    }
    if (!Number.isNaN(this._fillValue)) {
      total += this._fillValue * (this._length - this._values.length);
    }
    return total;
  }

  /** Mean of all non-NaN elements, fill positions included; `NaN` if there are none. */
  mean(): number {
    let total = 0;
    let count = 0;
    for (const v of this._values) {
      if (!Number.isNaN(v)) {
        total += v;
        count++;
      }
    }
    if (!Number.isNaN(this._fillValue)) {
      const fillCount = this._length - this._values.length;
      total += this._fillValue * fillCount;
      count += fillCount;
    }
    return count === 0 ? Number.NaN : total / count;
  }

  /** Largest non-NaN element; `NaN` if there is none. */
  max(): number {
    // A non-NaN fill value takes part only if at least one fill position exists.
    let best =
      !Number.isNaN(this._fillValue) && this._length > this._values.length
        ? this._fillValue
        : Number.NaN;
    for (const v of this._values) {
      if (!Number.isNaN(v) && (Number.isNaN(best) || v > best)) {
        best = v;
      }
    }
    return best;
  }

  /** Smallest non-NaN element; `NaN` if there is none. */
  min(): number {
    let best =
      !Number.isNaN(this._fillValue) && this._length > this._values.length
        ? this._fillValue
        : Number.NaN;
    for (const v of this._values) {
      if (!Number.isNaN(v) && (Number.isNaN(best) || v < best)) {
        best = v;
      }
    }
    return best;
  }

  /**
   * Standard deviation of all non-NaN elements.
   *
   * @param ddof - Delta degrees of freedom. Defaults to `1` (sample std).
   * @returns `NaN` when there are `ddof` or fewer non-NaN elements.
   */
  std(ddof = 1): number {
    const finite = this.toDense().filter((v) => !Number.isNaN(v));
    if (finite.length <= ddof) {
      return Number.NaN;
    }
    const centre = finite.reduce((acc, v) => acc + v, 0) / finite.length;
    const squares = finite.reduce((acc, v) => acc + (v - centre) ** 2, 0);
    return Math.sqrt(squares / (finite.length - ddof));
  }

  // --- slicing --------------------------------------------------------------

  /**
   * Sub-array for the half-open range `[start, end)`. Negative bounds count
   * from the end; bounds are clamped to the array.
   *
   * @example
   * ```ts
   * SparseArray.fromDense([1, 0, 0, 4, 0, 3], 0).slice(1, 5).toDense(); // [0, 0, 4, 0]
   * ```
   */
  slice(start: number, end: number = this._length): SparseArray {
    const from = Math.max(0, start < 0 ? this._length + start : start);
    const to = Math.min(this._length, end < 0 ? this._length + end : end);
    const positions: number[] = [];
    const stored: number[] = [];
    for (let k = 0; k < this._indices.length; k++) {
      const idx = this._indices[k];
      const v = this._values[k];
      if (idx === undefined || v === undefined) {
        continue;
      }
      if (idx >= from && idx < to) {
        positions.push(idx - from);
        stored.push(v);
      }
    }
    return new SparseArray(
      Math.max(0, to - from),
      new Int32Array(positions),
      new Float64Array(stored),
      this._fillValue,
      this._dtype.subtype,
    );
  }

  // --- iteration ------------------------------------------------------------

  /** Iterate the dense view in order, fill positions included. */
  [Symbol.iterator](): Iterator<number> {
    return this.toDense()[Symbol.iterator]();
  }

  // --- display --------------------------------------------------------------

  /** @internal Shows at most the first six dense elements. */
  toString(): string {
    const preview = this.toDense().slice(0, 6).join(", ");
    const ellipsis = this._length > 6 ? ", ..." : "";
    return `SparseArray([${preview}${ellipsis}], fill_value=${this._fillValue}, dtype=${this._dtype.name})`;
  }
}
*/ export class IntCastingNaNError extends Error { override readonly name = "IntCastingNaNError"; @@ -233,6 +246,7 @@ export const errors = { DatabaseError, DataError, DtypeWarning, + DuplicateLabelError, EmptyDataError, IntCastingNaNError, InvalidColumnName, diff --git a/src/index.ts b/src/index.ts index 2f49842f..c5892cf5 100644 --- a/src/index.ts +++ b/src/index.ts @@ -62,6 +62,36 @@ export { toJsonDenormalize, toJsonRecords, toJsonSplit, toJsonIndex } from "./io export type { JsonDenormalizeOptions, JsonSplitOptions, JsonSplitResult } from "./io/index.ts"; export { readHtml } from "./io/index.ts"; export type { ReadHtmlOptions } from "./io/index.ts"; +export { readXml, toXml } from "./io/index.ts"; +export type { ReadXmlOptions, ToXmlOptions } from "./io/index.ts"; +export { readTable } from "./io/index.ts"; +export type { ReadTableOptions } from "./io/index.ts"; +export { readSql, readSqlQuery, readSqlTable, toSql } from "./io/index.ts"; +export { TableExistsError, TableNotFoundError } from "./io/index.ts"; +export { readStata, toStata } from "./io/index.ts"; +export type { ReadStataOptions, ToStataOptions } from "./io/index.ts"; +export { readParquet, toParquet } from "./io/index.ts"; +export type { ReadParquetOptions, ToParquetOptions } from "./io/index.ts"; +export { readFeather, toFeather } from "./io/index.ts"; +export type { ReadFeatherOptions, ToFeatherOptions } from "./io/index.ts"; +export { readHdf, toHdf } from "./io/index.ts"; +export type { ReadHdfOptions, ToHdfOptions } from "./io/index.ts"; +export { readFwf } from "./io/index.ts"; +export type { ReadFwfOptions, ColSpec } from "./io/index.ts"; +export { toExcel } from "./io/index.ts"; +export type { ToExcelOptions } from "./io/index.ts"; +export type { + SqlValue, + SqlRow, + SqlResult, + SqlConnection, + IfExistsStrategy, + ReadSqlBaseOptions, + ReadSqlQueryOptions, + ReadSqlTableOptions, + ReadSqlOptions, + ToSqlOptions, +} from "./io/index.ts"; export { pearsonCorr, dataFrameCorr, 
dataFrameCov } from "./stats/index.ts"; export type { CorrMethod, CorrOptions, CovOptions } from "./stats/index.ts"; export { Rolling } from "./window/index.ts"; @@ -103,6 +133,8 @@ export { wideToLong } from "./reshape/index.ts"; export type { WideToLongOptions } from "./reshape/index.ts"; export { pivotTableFull } from "./reshape/index.ts"; export type { PivotTableFullOptions } from "./reshape/index.ts"; +export { lreshape } from "./reshape/index.ts"; +export type { LreshapeGroups, LreshapeOptions } from "./reshape/index.ts"; export { MultiIndex } from "./core/index.ts"; export type { MultiIndexOptions } from "./core/index.ts"; export { rankSeries, rankDataFrame } from "./stats/index.ts"; @@ -783,3 +815,116 @@ export { IndexError, } from "./errors.ts"; export type { PandasError } from "./errors.ts"; +export { DuplicateLabelError } from "./errors.ts"; +export { caseWhen } from "./stats/index.ts"; +export type { CaseWhenBranch, CaseWhenPredicate } from "./stats/index.ts"; +export { Flags, getFlags } from "./core/index.ts"; +export type { FlaggedObject } from "./core/index.ts"; + +// pd.arrays β€” nullable typed extension arrays (also exported individually) +export type { + FillValue, + IntegerDtypeName, + FloatingDtypeName, +} from "./core/index.ts"; + +import { + MaskedArray, + IntegerArray, + FloatingArray, + BooleanArray, + StringArray, + DatetimeArray, + TimedeltaArray, +} from "./core/index.ts"; +export { + MaskedArray, + IntegerArray, + FloatingArray, + BooleanArray, + StringArray, + DatetimeArray, + TimedeltaArray, +}; + +/** + * `pd.arrays` namespace β€” mirrors `pandas.arrays`. + * + * Provides nullable typed extension arrays for integers, floats, booleans, + * strings, datetimes, and timedeltas. 
+ * + * @example + * ```ts + * import { arrays } from "tsb"; + * const a = arrays.IntegerArray.from([1, null, 3], "Int32"); + * a.toArray(); // [1, null, 3] + * ``` + */ +export const arrays = { + IntegerArray, + FloatingArray, + BooleanArray, + StringArray, + DatetimeArray, + TimedeltaArray, +} as const; + +// pd.tseries β€” holiday calendars and observance helpers +export { + Holiday, + AbstractHolidayCalendar, + USFederalHolidayCalendar, + USNewYearsDay, + USMartinLutherKingJrDay, + USPresidentsDay, + USMemorialDay, + USJuneteenth, + USIndependenceDay, + USLaborDay, + USColumbusDay, + USVeteransDay, + USThanksgivingDay, + USChristmasDay, + get_calendar, + register_calendar, + nearestWorkday, + sundayToMonday, + nextMonday, + nextMondayOrTuesday, + previousFriday, + previousWorkday, + MO, + TU, + WE, + TH, + FR, + SA, + SU, +} from "./tseries/index.ts"; +export type { + WeekdayOffset, + ObservanceFn, + HolidayOptions, + HolidayCalendarOptions, +} from "./tseries/index.ts"; + +// pd.tseries.offsets β€” extended date offset classes +export { + QuarterEnd, + QuarterBegin, + BMonthEnd, + BMonthBegin, + BYearEnd, + BYearBegin, +} from "./tseries/offsets.ts"; + +// pd.tseries.frequencies β€” frequency string utilities +export { toOffset, inferFreq, FREQ_ALIASES } from "./tseries/frequencies.ts"; + +// io.read_sas β€” SAS XPORT reader +export { readSas } from "./io/read_sas.ts"; +export type { ReadSasOptions } from "./io/read_sas.ts"; + +// pd.arrays.SparseArray / pd.SparseDtype β€” sparse storage for arrays +// with many repeated (fill) values +export { SparseArray, SparseDtype } from "./core/sparse.ts"; diff --git a/src/io/csv.ts b/src/io/csv.ts index 687355f0..331ee944 100644 --- a/src/io/csv.ts +++ b/src/io/csv.ts @@ -144,6 +144,7 @@ function isNaRaw(raw: string, naSet: ReadonlySet): boolean { /** Infer the most specific dtype for a column from its raw string values. 
*/ function inferColumnDtype(raws: readonly string[], naSet: ReadonlySet): DtypeName { const nonNa = raws.filter((r) => !isNaRaw(r, naSet)); + const hasNa = nonNa.length < raws.length; if (nonNa.length === 0) { return "object"; } @@ -153,18 +154,23 @@ function inferColumnDtype(raws: readonly string[], naSet: ReadonlySet): } const allInt = nonNa.every((r) => RE_INT.test(r)); if (allInt) { - return "int64"; + // Upgrade to float64 when NAs are present so NaN can represent missing values. + return hasNa ? "float64" : "int64"; } const allFloat = nonNa.every((r) => RE_FLOAT.test(r)); if (allFloat) { return "float64"; } - return "string"; + return "object"; } /** Parse a raw string to a Scalar for an inferred dtype. */ function parseInferred(raw: string, dtype: DtypeName, naSet: ReadonlySet): Scalar { if (isNaRaw(raw, naSet)) { + // Numeric columns use NaN so callers can detect missing values via Number.isNaN(). + if (dtype === "float64" || dtype === "int64") { + return Number.NaN; + } return null; } if (dtype === "bool") { diff --git a/src/io/feather.ts b/src/io/feather.ts new file mode 100644 index 00000000..21160634 --- /dev/null +++ b/src/io/feather.ts @@ -0,0 +1,1084 @@ +/** + * readFeather / toFeather β€” Apache Arrow Feather v2 (IPC file) I/O for DataFrame. + * + * Mirrors `pandas.read_feather()` and `DataFrame.to_feather()`: + * - `readFeather(data, options?)` β€” parse an Arrow IPC binary buffer into a DataFrame + * - `toFeather(df, options?)` β€” serialize a DataFrame to an Arrow IPC binary buffer + * + * Supported column types: + * - Writing: int64 (all integer dtypes), float64, float32, bool, utf8 + * - Reading: Int8/16/32/64, UInt8/16/32/64, Float32/64, Bool, Utf8/LargeUtf8 + * + * Null values are fully supported via Arrow validity bitmaps. 
+ * + * @module + */ + +import { DataFrame } from "../core/frame.ts"; +import { Index } from "../core/index.ts"; +import type { Label, Scalar } from "../types.ts"; + +// ─── Public types ───────────────────────────────────────────────────────────── + +/** Options for {@link readFeather}. */ +export interface ReadFeatherOptions { + /** Column to use as the row index. Default: `null` (RangeIndex). */ + readonly indexCol?: string | null; + /** Subset of columns to read. Default: all. */ + readonly usecols?: readonly string[] | null; +} + +/** Options for {@link toFeather}. */ +export interface ToFeatherOptions { + /** + * Write the DataFrame's row index as an extra column. + * Default: `false`. + */ + readonly writeIndex?: boolean; +} + +// ─── Arrow constants ────────────────────────────────────────────────────────── + +const MAGIC = new Uint8Array([0x41, 0x52, 0x52, 0x4f, 0x57, 0x31, 0x00, 0x00]); // "ARROW1\0\0" +const CONTINUATION_I32 = -1; // 0xFFFFFFFF interpreted as int32 + +// MetadataVersion V5 +const META_V5 = 4; + +// MessageHeader union type discriminants +const MSG_SCHEMA = 1; +const MSG_RECORD_BATCH = 3; + +// Arrow type union discriminants (Field.type_type) +const TYPE_INT = 2; +const TYPE_FLOAT = 3; +const TYPE_UTF8 = 5; +const TYPE_BOOL = 6; +const TYPE_LARGE_UTF8 = 13; + +// FloatingPoint precision +const PREC_SINGLE = 1; +const PREC_DOUBLE = 2; + +// Endianness +const ENDIAN_LITTLE = 0; + +// ─── Column type descriptor ─────────────────────────────────────────────────── + +type ColType = + | { kind: "int"; bitWidth: number; isSigned: boolean } + | { kind: "float"; precision: number } + | { kind: "bool" } + | { kind: "utf8" }; + +// ─── FlatBuffer backward builder ────────────────────────────────────────────── + +/** + * Minimal backward FlatBuffer builder for Arrow IPC FlatBuffer structures. + * + * In a backward builder the head pointer decreases as data is written; + * the final slice is `buf[head:]`. 
/**
 * Backward FlatBuffer builder — handle semantics.
 *
 * Every number returned by a write/create/build method is an opaque *handle*:
 * the distance from the END of the internal buffer to the first byte of the
 * written value. Handles stay valid when the buffer grows (old data is copied
 * to the end of the new buffer), whereas absolute indices would not. The only
 * supported use of a handle is to pass it back into this builder.
 *
 * Layout rules follow the FlatBuffers binary format:
 * - `uoffset_t` is the positive distance from the offset field to its target;
 * - a table starts with an `soffset_t` that is SUBTRACTED from the table
 *   position to find its vtable;
 * - a string is `[u32 length][bytes][NUL][padding]`, with the length prefix
 *   immediately followed by the bytes.
 */
class FbBuilder {
  private buf: Uint8Array;
  private view: DataView;
  /** First written byte (decrements as data is prepended). */
  private head: number;
  /** Largest alignment requested so far; `finish` pads the buffer to it. */
  private minAlign = 4;

  constructor(initialSize = 1024) {
    this.buf = new Uint8Array(initialSize);
    this.view = new DataView(this.buf.buffer);
    this.head = initialSize;
  }

  // ── internal helpers ───────────────────────────────────────────────────────

  /** Number of bytes written so far; also the handle of the byte at `head`. */
  private used(): number {
    return this.buf.length - this.head;
  }

  /** Ensure at least `n` free bytes exist in front of `head`. */
  private grow(n: number): void {
    while (this.head < n) {
      const old = this.buf;
      // Math.max guards against a zero-length initial buffer never growing.
      const next = new Uint8Array(Math.max(old.length * 2, 16));
      const shift = next.length - old.length;
      next.set(old, shift); // old data at END of new buffer → handles are stable
      this.head += shift;
      this.buf = next;
      this.view = new DataView(next.buffer);
    }
  }

  /** Claim `n` zeroed bytes in front of `head`; returns their buffer index. */
  private reserve(n: number): number {
    this.grow(n);
    this.head -= n;
    return this.head;
  }

  /** Pad so that the number of written bytes is a multiple of `a`. */
  private align(a: number): void {
    if (a > this.minAlign) this.minAlign = a;
    const rem = this.used() % a;
    if (rem !== 0) this.reserve(a - rem);
  }

  // ── primitive writes (each returns the handle of the written value) ────────

  writeU8(v: number): number {
    this.buf[this.reserve(1)] = v & 0xff;
    return this.used();
  }

  writeU16(v: number): number {
    this.align(2);
    this.view.setUint16(this.reserve(2), v, true);
    return this.used();
  }

  writeI16(v: number): number {
    this.align(2);
    this.view.setInt16(this.reserve(2), v, true);
    return this.used();
  }

  writeI32(v: number): number {
    this.align(4);
    this.view.setInt32(this.reserve(4), v, true);
    return this.used();
  }

  writeI64(v: bigint): number {
    this.align(8);
    this.view.setBigInt64(this.reserve(8), v, true);
    return this.used();
  }

  /**
   * Write a `uoffset_t` pointing at a previously written value.
   *
   * @param targetAbsIdx Handle returned by an earlier write on this builder.
   */
  writeUOffset(targetAbsIdx: number): number {
    this.align(4);
    const pos = this.reserve(4);
    // distance(field → target) = handle(field) − handle(target)
    this.view.setUint32(pos, this.used() - targetAbsIdx, true);
    return this.used();
  }

  // ── composite writers ──────────────────────────────────────────────────────

  /** UTF-8 string: `[u32 byteLength][bytes][NUL][padding to 4]`. */
  createString(s: string): number {
    const bytes = new TextEncoder().encode(s);
    const payload = bytes.length + 1; // bytes + NUL terminator
    // Pad AFTER the terminator so the length prefix is 4-aligned without any
    // padding being inserted between the prefix and the string bytes.
    const pad = (4 - ((this.used() + payload) % 4)) % 4;
    const pos = this.reserve(payload + pad);
    this.buf.set(bytes, pos);
    this.buf.fill(0, pos + bytes.length, pos + payload + pad);
    return this.writeI32(bytes.length); // length prefix (already aligned)
  }

  /**
   * Offset vector (uoffset_t[] preceded by u32 count).
   *
   * @param absIdxs Handles of the elements, in forward order.
   */
  createOffsetVector(absIdxs: number[]): number {
    this.align(4);
    for (let i = absIdxs.length - 1; i >= 0; i--) this.writeUOffset(absIdxs[i]!);
    return this.writeI32(absIdxs.length);
  }

  /** Inline FieldNode vector ({length:i64, null_count:i64}×n preceded by u32 count). */
  createFieldNodeVector(nodes: ReadonlyArray<{ length: bigint; nullCount: bigint }>): number {
    this.align(8);
    for (let i = nodes.length - 1; i >= 0; i--) {
      const n = nodes[i]!;
      // reverse field order so the forward layout is [length][null_count]
      this.view.setBigInt64(this.reserve(8), n.nullCount, true);
      this.view.setBigInt64(this.reserve(8), n.length, true);
    }
    return this.writeI32(nodes.length);
  }

  /** Inline Buffer vector ({offset:i64, length:i64}×n preceded by u32 count). */
  createBufferVector(bufs: ReadonlyArray<{ offset: bigint; length: bigint }>): number {
    this.align(8);
    for (let i = bufs.length - 1; i >= 0; i--) {
      const b = bufs[i]!;
      // reverse field order so the forward layout is [offset][length]
      this.view.setBigInt64(this.reserve(8), b.length, true);
      this.view.setBigInt64(this.reserve(8), b.offset, true);
    }
    return this.writeI32(bufs.length);
  }

  /**
   * Inline Block vector (24-byte struct: {offset:i64, metaDataLength:i32, _pad:i32, bodyLength:i64}).
   */
  createBlockVector(
    blocks: ReadonlyArray<{ offset: bigint; metaDataLength: number; bodyLength: bigint }>,
  ): number {
    this.align(8);
    for (let i = blocks.length - 1; i >= 0; i--) {
      const b = blocks[i]!;
      // write in reverse field order so layout is [offset][metaDataLength][pad][bodyLength]
      this.view.setBigInt64(this.reserve(8), b.bodyLength, true);
      this.reserve(4); // 4-byte padding (stays zero)
      this.view.setInt32(this.reserve(4), b.metaDataLength, true);
      this.view.setBigInt64(this.reserve(8), b.offset, true);
    }
    return this.writeI32(blocks.length);
  }

  // ── table builder ──────────────────────────────────────────────────────────

  /**
   * Build a FlatBuffer table. `fields` maps field indices to typed values.
   * Fields are written from highest to lowest index (backward building ensures
   * lower-index fields end up at lower positions in the output).
   *
   * @returns Handle of the table (the position of its `soffset_t`).
   */
  buildTable(
    fields: ReadonlyArray<
      | { kind: "absent"; index: number }
      | { kind: "bool"; index: number; value: boolean }
      | { kind: "u8"; index: number; value: number }
      | { kind: "i16"; index: number; value: number }
      | { kind: "i32"; index: number; value: number }
      | { kind: "i64"; index: number; value: bigint }
      | { kind: "offset"; index: number; target: number }
    >,
  ): number {
    const present = fields.filter((f) => f.kind !== "absent");
    const maxIndex = present.length === 0 ? -1 : Math.max(...present.map((f) => f.index));
    const numFields = maxIndex + 1;

    const written: { index: number; handle: number; size: number }[] = [];

    for (let i = maxIndex; i >= 0; i--) {
      const field = present.find((f) => f.index === i);
      if (field === undefined) continue;
      let handle: number;
      let size: number;
      switch (field.kind) {
        case "bool":
          handle = this.writeU8(field.value ? 1 : 0);
          size = 1;
          break;
        case "u8":
          handle = this.writeU8(field.value);
          size = 1;
          break;
        case "i16":
          handle = this.writeI16(field.value);
          size = 2;
          break;
        case "i32":
          handle = this.writeI32(field.value);
          size = 4;
          break;
        case "i64":
          handle = this.writeI64(field.value);
          size = 8;
          break;
        case "offset":
          handle = this.writeUOffset(field.target);
          size = 4;
          break;
        default:
          continue;
      }
      written.push({ index: i, handle, size });
    }

    // Reserve the soffset_t (int32); this is the start of the table object.
    this.align(4);
    this.reserve(4);
    const tableHandle = this.used();

    // Field offsets relative to the table start; 0 means "absent".
    const fieldOffsets = new Array<number>(numFields).fill(0);
    let objectSize = 4;
    for (const w of written) {
      const rel = tableHandle - w.handle;
      fieldOffsets[w.index] = rel;
      objectSize = Math.max(objectSize, rel + w.size);
    }
    const vtableSize = (numFields + 2) * 2;

    // Write vtable (backward: field[numFields-1] … field[0], objectSize, vtableSize)
    for (let i = numFields - 1; i >= 0; i--) this.writeU16(fieldOffsets[i] ?? 0);
    this.writeU16(objectSize);
    this.writeU16(vtableSize);
    const vtableHandle = this.used();

    // Patch soffset_t. Readers compute `vtable = table - soffset`, and the
    // vtable sits before the table here, so the stored value is positive.
    this.view.setInt32(this.buf.length - tableHandle, vtableHandle - tableHandle, true);
    return tableHandle;
  }

  /**
   * Finish building: write the root uoffset_t and return the FlatBuffer slice.
   *
   * @param rootAbsIdx Handle of the root table.
   */
  finish(rootAbsIdx: number): Uint8Array {
    // Pad so the total size is a multiple of the largest alignment used; this
    // makes every aligned-from-the-end value aligned from the start as well.
    const a = this.minAlign;
    this.reserve((a - ((this.used() + 4) % a)) % a);
    const pos = this.reserve(4);
    this.view.setUint32(pos, this.used() - rootAbsIdx, true);
    return this.buf.slice(this.head);
  }
}

// ─── FlatBuffer reader ─────────────────────────────────────────────────────────

/**
 * Read-only view over one FlatBuffer table. Field accessors return `undefined`
 * (or 0 / empty for vector counts) when the field is absent from the vtable.
 */
class FbTable {
  private readonly view: DataView;
  private readonly tablePos: number;
  private readonly vtablePos: number;
  private readonly vtableBytes: number;

  constructor(view: DataView, tablePos: number) {
    this.view = view;
    this.tablePos = tablePos;
    const soffset = view.getInt32(tablePos, true);
    // FlatBuffers: the soffset is subtracted from the table position.
    this.vtablePos = tablePos - soffset;
    this.vtableBytes = view.getUint16(this.vtablePos, true);
  }

  /** Offset of field `idx` from the table start, or 0 when absent. */
  private fieldOff(idx: number): number {
    const vOff = 4 + idx * 2;
    if (vOff + 2 > this.vtableBytes) return 0;
    return this.view.getUint16(this.vtablePos + vOff, true);
  }

  /** Position of the vector stored in field `idx` (its u32 count), or -1. */
  private vectorStart(idx: number): number {
    const off = this.fieldOff(idx);
    if (off === 0) return -1;
    const fieldPos = this.tablePos + off;
    return fieldPos + this.view.getUint32(fieldPos, true);
  }

  /** Position of element `i` of a vector with `stride`-byte elements, or -1. */
  private elementStart(idx: number, i: number, stride: number): number {
    const vecPos = this.vectorStart(idx);
    if (vecPos < 0 || i < 0 || i >= this.view.getUint32(vecPos, true)) return -1;
    return vecPos + 4 + i * stride;
  }

  /** Decode the length-prefixed UTF-8 string located at `strPos`. */
  private stringAt(strPos: number): string {
    const len = this.view.getUint32(strPos, true);
    return new TextDecoder().decode(
      new Uint8Array(this.view.buffer, this.view.byteOffset + strPos + 4, len),
    );
  }

  readBool(idx: number): boolean | undefined {
    const off = this.fieldOff(idx);
    return off === 0 ? undefined : this.view.getUint8(this.tablePos + off) !== 0;
  }

  readU8(idx: number): number | undefined {
    const off = this.fieldOff(idx);
    return off === 0 ? undefined : this.view.getUint8(this.tablePos + off);
  }

  readI16(idx: number): number | undefined {
    const off = this.fieldOff(idx);
    return off === 0 ? undefined : this.view.getInt16(this.tablePos + off, true);
  }

  readI32(idx: number): number | undefined {
    const off = this.fieldOff(idx);
    return off === 0 ? undefined : this.view.getInt32(this.tablePos + off, true);
  }

  readI64(idx: number): bigint | undefined {
    const off = this.fieldOff(idx);
    return off === 0 ? undefined : this.view.getBigInt64(this.tablePos + off, true);
  }

  readString(idx: number): string | undefined {
    const off = this.fieldOff(idx);
    if (off === 0) return undefined;
    const fieldPos = this.tablePos + off;
    return this.stringAt(fieldPos + this.view.getUint32(fieldPos, true));
  }

  readSubTable(idx: number): FbTable | undefined {
    const off = this.fieldOff(idx);
    if (off === 0) return undefined;
    const fieldPos = this.tablePos + off;
    return new FbTable(this.view, fieldPos + this.view.getUint32(fieldPos, true));
  }

  readVectorCount(idx: number): number {
    const vecPos = this.vectorStart(idx);
    return vecPos < 0 ? 0 : this.view.getUint32(vecPos, true);
  }

  readVectorTable(idx: number, i: number): FbTable | undefined {
    const elemPos = this.elementStart(idx, i, 4);
    if (elemPos < 0) return undefined;
    return new FbTable(this.view, elemPos + this.view.getUint32(elemPos, true));
  }

  readVectorString(idx: number, i: number): string | undefined {
    const elemPos = this.elementStart(idx, i, 4);
    if (elemPos < 0) return undefined;
    return this.stringAt(elemPos + this.view.getUint32(elemPos, true));
  }

  /**
   * Read one element from an inline 16-byte struct vector
   * ({field_a: i64, field_b: i64}). Used for FieldNode and Buffer.
   */
  readStruct16(vecIdx: number, i: number): { a: bigint; b: bigint } | undefined {
    const elemPos = this.elementStart(vecIdx, i, 16);
    if (elemPos < 0) return undefined;
    return {
      a: this.view.getBigInt64(elemPos, true),
      b: this.view.getBigInt64(elemPos + 8, true),
    };
  }

  /**
   * Read one Block struct (24 bytes: {offset:i64, metaDataLength:i32, _pad:i32, bodyLength:i64}).
   */
  readBlock(
    vecIdx: number,
    i: number,
  ): { offset: bigint; metaDataLength: number; bodyLength: bigint } | undefined {
    const ep = this.elementStart(vecIdx, i, 24);
    if (ep < 0) return undefined;
    return {
      offset: this.view.getBigInt64(ep, true),
      metaDataLength: this.view.getInt32(ep + 8, true),
      bodyLength: this.view.getBigInt64(ep + 16, true),
    };
  }
}

/** Wrap a finished FlatBuffer and return its root table. */
function fbRoot(buf: Uint8Array): FbTable {
  const view = new DataView(buf.buffer, buf.byteOffset, buf.byteLength);
  return new FbTable(view, view.getUint32(0, true));
}

// ─── Arrow schema builders ─────────────────────────────────────────────────────

/** Write an Arrow `Schema` table for `cols` into `b`; returns its handle. */
function buildSchema(b: FbBuilder, cols: ReadonlyArray<{ name: string; type: ColType }>): number {
  const fieldAbsIdxs = cols.map(({ name, type }) => {
    const nameAbs = b.createString(name);
    let typeCode: number;
    let typeAbs: number;
    switch (type.kind) {
      case "int":
        typeCode = TYPE_INT;
        typeAbs = b.buildTable([
          { kind: "i32", index: 0, value: type.bitWidth },
          { kind: "bool", index: 1, value: type.isSigned },
        ]);
        break;
      case "float":
        typeCode = TYPE_FLOAT;
        typeAbs = b.buildTable([{ kind: "i16", index: 0, value: type.precision }]);
        break;
      case "bool":
        typeCode = TYPE_BOOL;
        typeAbs = b.buildTable([]);
        break;
      case "utf8":
        typeCode = TYPE_UTF8;
        typeAbs = b.buildTable([]);
        break;
    }
    // Field: 0=name, 1=nullable, 2=type_type, 3=type
    return b.buildTable([
      { kind: "offset", index: 0, target: nameAbs },
      { kind: "bool", index: 1, value: true },
      { kind: "u8", index: 2, value: typeCode },
      { kind: "offset", index: 3, target: typeAbs },
    ]);
  });
  const fieldsVec = b.createOffsetVector(fieldAbsIdxs);
  // Schema: 0=endianness, 1=fields
  return b.buildTable([
    { kind: "i16", index: 0, value: ENDIAN_LITTLE },
    { kind: "offset", index: 1, target: fieldsVec },
  ]);
}

/** Build the FlatBuffer metadata of a Schema message (it has no body). */
function buildSchemaMessage(cols: ReadonlyArray<{ name: string; type: ColType }>): Uint8Array {
  const b = new FbBuilder();
  const schemaAbs = buildSchema(b, cols);
  // Message: 0=version, 1=header_type, 2=header, 3=bodyLength
  const msgAbs = b.buildTable([
    { kind: "i16", index: 0, value: META_V5 },
    { kind: "u8", index: 1, value: MSG_SCHEMA },
    { kind: "offset", index: 2, target: schemaAbs },
    { kind: "i64", index: 3, value: 0n },
  ]);
  return b.finish(msgAbs);
}

/** Build the FlatBuffer metadata of a RecordBatch message. */
function buildRecordBatchMessage(
  numRows: number,
  nodes: ReadonlyArray<{ length: bigint; nullCount: bigint }>,
  buffers: ReadonlyArray<{ offset: bigint; length: bigint }>,
  bodyLength: bigint,
): Uint8Array {
  const b = new FbBuilder();
  const nodesVec = b.createFieldNodeVector(nodes);
  const bufsVec = b.createBufferVector(buffers);
  // RecordBatch: 0=length, 1=nodes, 2=buffers
  const rbAbs = b.buildTable([
    { kind: "i64", index: 0, value: BigInt(numRows) },
    { kind: "offset", index: 1, target: nodesVec },
    { kind: "offset", index: 2, target: bufsVec },
  ]);
  const msgAbs = b.buildTable([
    { kind: "i16", index: 0, value: META_V5 },
    { kind: "u8", index: 1, value: MSG_RECORD_BATCH },
    { kind: "offset", index: 2, target: rbAbs },
    { kind: "i64", index: 3, value: bodyLength },
  ]);
  return b.finish(msgAbs);
}

/** Build the file Footer (schema copy plus the record-batch block index). */
function buildFooter(
  cols: ReadonlyArray<{ name: string; type: ColType }>,
  blocks: ReadonlyArray<{ offset: bigint; metaDataLength: number; bodyLength: bigint }>,
): Uint8Array {
  const b = new FbBuilder();
  const schemaAbs = buildSchema(b, cols);
  const dictsVec = b.createOffsetVector([]);
  const blocksVec = b.createBlockVector(blocks);
  // Footer: 0=version, 1=schema, 2=dictionaries, 3=recordBatches
  const footerAbs = b.buildTable([
    { kind: "i16", index: 0, value: META_V5 },
    { kind: "offset", index: 1, target: schemaAbs },
    { kind: "offset", index: 2, target: dictsVec },
    { kind: "offset", index: 3, target: blocksVec },
  ]);
  return b.finish(footerAbs);
}

// ─── Column encoding helpers ───────────────────────────────────────────────────

/** Round `n` up to the next multiple of 8. */
function padTo8(n: number): number {
  return (n + 7) & ~7;
}

/** Returns a bitpacked validity bitmap, or `null` if all values are non-null. */
function encodeValidity(values: readonly (Scalar | null)[]): Uint8Array | null {
  let anyNull = false;
  for (const v of values) {
    if (v === null || v === undefined) {
      anyNull = true;
      break;
    }
  }
  if (!anyNull) return null;
  const bitmap = new Uint8Array(Math.ceil(values.length / 8));
  for (let i = 0; i < values.length; i++) {
    if (values[i] !== null && values[i] !== undefined) {
      bitmap[Math.floor(i / 8)]! |= 1 << (i % 8); // LSB-first bit order
    }
  }
  return bitmap;
}

/** Count nulls in a value array. */
function countNulls(values: readonly (Scalar | null)[]): number {
  let n = 0;
  for (const v of values) if (v === null || v === undefined) n++;
  return n;
}

/** Encode values as little-endian int64; null slots are written as 0. */
function encodeInt64s(values: readonly (Scalar | null)[]): Uint8Array {
  const buf = new Uint8Array(values.length * 8);
  const dv = new DataView(buf.buffer);
  for (let i = 0; i < values.length; i++) {
    const v = values[i];
    const n =
      v === null || v === undefined
        ? 0n
        : typeof v === "bigint"
          ? v
          : BigInt(Math.trunc(Number(v)));
    dv.setBigInt64(i * 8, n, true);
  }
  return buf;
}

/** Encode values as little-endian float64; null slots are written as NaN. */
function encodeFloat64s(values: readonly (Scalar | null)[]): Uint8Array {
  const buf = new Uint8Array(values.length * 8);
  const dv = new DataView(buf.buffer);
  for (let i = 0; i < values.length; i++) {
    const v = values[i];
    dv.setFloat64(i * 8, v === null || v === undefined ? NaN : Number(v), true);
  }
  return buf;
}

/** Encode values as little-endian float32; null slots are written as NaN. */
function encodeFloat32s(values: readonly (Scalar | null)[]): Uint8Array {
  const buf = new Uint8Array(values.length * 4);
  const dv = new DataView(buf.buffer);
  for (let i = 0; i < values.length; i++) {
    const v = values[i];
    dv.setFloat32(i * 4, v === null || v === undefined ? NaN : Number(v), true);
  }
  return buf;
}

/** Encode values as an LSB-first bitmap; null slots are written as 0. */
function encodeBools(values: readonly (Scalar | null)[]): Uint8Array {
  const buf = new Uint8Array(Math.ceil(values.length / 8));
  for (let i = 0; i < values.length; i++) {
    const v = values[i];
    if (v !== null && v !== undefined && Boolean(v)) {
      buf[Math.floor(i / 8)]! |= 1 << (i % 8);
    }
  }
  return buf;
}

/**
 * Encode values as an Arrow Utf8 pair: `count + 1` int32 offsets and the
 * concatenated UTF-8 bytes. Null slots contribute zero bytes.
 */
function encodeStrings(values: readonly (Scalar | null)[]): { offsets: Uint8Array; data: Uint8Array } {
  const enc = new TextEncoder();
  const encoded: Uint8Array[] = [];
  let totalBytes = 0;
  for (const v of values) {
    if (v !== null && v !== undefined) {
      const b = enc.encode(String(v));
      encoded.push(b);
      totalBytes += b.length;
    } else {
      encoded.push(new Uint8Array(0));
    }
  }
  const offsets = new Uint8Array((values.length + 1) * 4);
  const ov = new DataView(offsets.buffer);
  const data = new Uint8Array(totalBytes);
  let pos = 0;
  for (let i = 0; i < encoded.length; i++) {
    ov.setInt32(i * 4, pos, true);
    data.set(encoded[i]!, pos);
    pos += encoded[i]!.length;
  }
  ov.setInt32(values.length * 4, pos, true);
  return { offsets, data };
}

// ─── Column decoding helpers ───────────────────────────────────────────────────

/** Expand an LSB-first validity bitmap into one boolean per row. */
function decodeValidity(bitmap: Uint8Array, count: number): boolean[] {
  const valid = new Array<boolean>(count);
  for (let i = 0; i < count; i++) {
    valid[i] = ((bitmap[Math.floor(i / 8)]! >> (i % 8)) & 1) === 1;
  }
  return valid;
}

/**
 * Decode `count` little-endian integers of the given width starting at
 * `bodyOff`. 64-bit values are converted with `Number()`, so magnitudes above
 * 2^53 lose precision. Unsupported widths decode as 0.
 */
function decodeInt(
  body: Uint8Array,
  bodyOff: number,
  count: number,
  bitWidth: number,
  isSigned: boolean,
): Scalar[] {
  const dv = new DataView(body.buffer, body.byteOffset + bodyOff);
  const out: Scalar[] = new Array<Scalar>(count);
  for (let i = 0; i < count; i++) {
    switch (bitWidth) {
      case 8:
        out[i] = isSigned ? dv.getInt8(i) : dv.getUint8(i);
        break;
      case 16:
        out[i] = isSigned ? dv.getInt16(i * 2, true) : dv.getUint16(i * 2, true);
        break;
      case 32:
        out[i] = isSigned ? dv.getInt32(i * 4, true) : dv.getUint32(i * 4, true);
        break;
      case 64: {
        const v = isSigned ? dv.getBigInt64(i * 8, true) : dv.getBigUint64(i * 8, true);
        out[i] = Number(v);
        break;
      }
      default:
        out[i] = 0;
    }
  }
  return out;
}

/** Decode `count` little-endian float32 (PREC_SINGLE) or float64 values. */
function decodeFloat(
  body: Uint8Array,
  bodyOff: number,
  count: number,
  precision: number,
): Scalar[] {
  const dv = new DataView(body.buffer, body.byteOffset + bodyOff);
  const out: Scalar[] = new Array<Scalar>(count);
  for (let i = 0; i < count; i++) {
    out[i] =
      precision === PREC_SINGLE ? dv.getFloat32(i * 4, true) : dv.getFloat64(i * 8, true);
  }
  return out;
}

/** Decode `count` booleans from an LSB-first bitmap starting at `bodyOff`. */
function decodeBool(body: Uint8Array, bodyOff: number, count: number): Scalar[] {
  const out: Scalar[] = new Array<Scalar>(count);
  for (let i = 0; i < count; i++) {
    out[i] = ((body[bodyOff + Math.floor(i / 8)]! >> (i % 8)) & 1) === 1;
  }
  return out;
}

/** Decode `count` strings from an int32 offsets buffer and a UTF-8 data buffer. */
function decodeUtf8(
  body: Uint8Array,
  offsBodyOff: number,
  dataBodyOff: number,
  count: number,
): Scalar[] {
  const ov = new DataView(body.buffer, body.byteOffset + offsBodyOff);
  const dec = new TextDecoder();
  const out: Scalar[] = new Array<Scalar>(count);
  for (let i = 0; i < count; i++) {
    const start = ov.getInt32(i * 4, true);
    const end = ov.getInt32((i + 1) * 4, true);
    out[i] = dec.decode(body.subarray(dataBodyOff + start, dataBodyOff + end));
  }
  return out;
}

// ─── IPC message framing ──────────────────────────────────────────────────────

/**
 * Emit an Arrow IPC message frame into `out` (byte-array accumulator).
 * Returns the byte offset within `out` at which this message starts.
 */
/**
 * Append one Arrow IPC message frame to `out` (a plain byte-array accumulator).
 *
 * Frame layout: 4-byte continuation marker (0xFFFFFFFF), 4-byte little-endian
 * metadata size (padded to a multiple of 8), the FlatBuffer metadata plus zero
 * padding, then the optional body.
 *
 * @param out      Accumulator that receives the bytes.
 * @param metadata Finished FlatBuffer `Message`.
 * @param body     Message body, already padded by the caller, or `null`.
 * @returns The byte offset within `out` at which this message starts.
 */
function appendMessage(out: number[], metadata: Uint8Array, body: Uint8Array | null): number {
  const startPos = out.length;
  const paddedMetaLen = padTo8(metadata.length);

  // Continuation marker + padded metadata size
  const hdr = new Uint8Array(8);
  const hdrDv = new DataView(hdr.buffer);
  hdrDv.setInt32(0, CONTINUATION_I32, true);
  hdrDv.setInt32(4, paddedMetaLen, true);
  for (const b of hdr) out.push(b);

  // FlatBuffer bytes + zero padding
  for (const b of metadata) out.push(b);
  for (let i = metadata.length; i < paddedMetaLen; i++) out.push(0);

  // Optional body (already padded by caller)
  if (body) for (const b of body) out.push(b);

  return startPos;
}

// ─── toFeather ─────────────────────────────────────────────────────────────────

/**
 * Pick a column type by inspecting values when the dtype gives no answer.
 * The first string or boolean value decides; otherwise any non-integer number
 * makes the column float64, and everything else falls back to int64.
 */
function sniffFeatherColType(values: readonly (Scalar | null)[]): ColType {
  let sawFraction = false;
  for (const v of values) {
    if (v === null || v === undefined) continue;
    if (typeof v === "boolean") return { kind: "bool" };
    if (typeof v === "string") return { kind: "utf8" };
    if (typeof v === "number" && !Number.isInteger(v)) sawFraction = true;
  }
  return sawFraction
    ? { kind: "float", precision: PREC_DOUBLE }
    : { kind: "int", bitWidth: 64, isSigned: true };
}

/**
 * Serialize a DataFrame to an Apache Arrow IPC (Feather v2) binary buffer.
 * Mirrors `pandas.DataFrame.to_feather()`.
 *
 * Every integer column is written as 64-bit (see `encodeInt64s`), so the
 * schema always declares `bitWidth: 64`; only signedness follows the dtype.
 *
 * @param df      Frame to serialize.
 * @param options `writeIndex` adds the row index as a leading utf8 column
 *                named `__index_level_0__`.
 * @returns The complete file image.
 */
export function toFeather(df: DataFrame, options: ToFeatherOptions = {}): Uint8Array {
  const { writeIndex = false } = options;

  type ColData = { name: string; type: ColType; values: readonly (Scalar | null)[] };
  const cols: ColData[] = [];

  if (writeIndex) {
    const idxVals = [...df.index.values] as (Scalar | null)[];
    cols.push({
      name: "__index_level_0__",
      type: { kind: "utf8" },
      values: idxVals.map((v) => (v === null ? null : String(v))),
    });
  }

  for (const name of df.columns.values as string[]) {
    const s = df.col(name);
    const values = s.values as readonly (Scalar | null)[];
    const dtype = s.dtype;
    let type: ColType;
    if (dtype.kind === "float") {
      type = { kind: "float", precision: dtype.itemsize === 4 ? PREC_SINGLE : PREC_DOUBLE };
    } else if (dtype.kind === "bool") {
      type = { kind: "bool" };
    } else if (dtype.kind === "string") {
      type = { kind: "utf8" };
    } else if (dtype.kind === "int" || dtype.kind === "uint") {
      // The data buffer holds 8 bytes per value regardless of the source
      // dtype, so the declared width must be 64 or readers use the wrong stride.
      type = { kind: "int", bitWidth: 64, isSigned: dtype.kind === "int" };
    } else {
      // Unknown dtype: sniff from values
      type = sniffFeatherColType(values);
    }
    cols.push({ name, type, values });
  }

  const numRows = cols.length > 0 ? cols[0]!.values.length : df.index.size;
  const schemaCols = cols.map((c) => ({ name: c.name, type: c.type }));

  // Encode all column buffers into a single body array
  const bodyParts: Uint8Array[] = [];
  const nodes: { length: bigint; nullCount: bigint }[] = [];
  const bufferInfos: { offset: bigint; length: bigint }[] = [];
  let bodyOffset = 0n;

  /** Register one buffer and keep the body 8-byte aligned. */
  function pushBodyBuf(buf: Uint8Array): void {
    bufferInfos.push({ offset: bodyOffset, length: BigInt(buf.length) });
    bodyParts.push(buf);
    const padded = padTo8(buf.length);
    if (padded > buf.length) bodyParts.push(new Uint8Array(padded - buf.length));
    bodyOffset += BigInt(padded);
  }

  for (const col of cols) {
    const { type, values } = col;
    const validity = encodeValidity(values);
    const nullCount = validity ? countNulls(values) : 0;
    nodes.push({ length: BigInt(values.length), nullCount: BigInt(nullCount) });

    // Validity buffer (empty = no nulls)
    pushBodyBuf(validity ?? new Uint8Array(0));

    // Data buffer(s)
    switch (type.kind) {
      case "int":
        pushBodyBuf(encodeInt64s(values));
        break;
      case "float":
        pushBodyBuf(type.precision === PREC_SINGLE ? encodeFloat32s(values) : encodeFloat64s(values));
        break;
      case "bool":
        pushBodyBuf(encodeBools(values));
        break;
      case "utf8": {
        const { offsets, data } = encodeStrings(values);
        pushBodyBuf(offsets);
        pushBodyBuf(data);
        break;
      }
    }
  }

  // Assemble body
  let totalBodyLen = 0;
  for (const p of bodyParts) totalBodyLen += p.length;
  const body = new Uint8Array(totalBodyLen);
  let bpos = 0;
  for (const p of bodyParts) {
    body.set(p, bpos);
    bpos += p.length;
  }

  // Build messages and file
  const out: number[] = [];
  for (const b of MAGIC) out.push(b);

  // Schema message (no body)
  appendMessage(out, buildSchemaMessage(schemaCols), null);

  // RecordBatch message
  const rbMeta = buildRecordBatchMessage(numRows, nodes, bufferInfos, bodyOffset);
  const rbStart = out.length;
  appendMessage(out, rbMeta, body);

  const rbPaddedMeta = padTo8(rbMeta.length);
  const rbMetaLen = 8 + rbPaddedMeta; // 4-byte continuation + 4-byte size + padded FlatBuffer

  // Footer
  const blocks = [
    { offset: BigInt(rbStart), metaDataLength: rbMetaLen, bodyLength: bodyOffset },
  ];
  const footer = buildFooter(schemaCols, blocks);
  for (const b of footer) out.push(b);

  // Footer size (int32 LE) + trailing magic
  // NOTE(review): the Arrow file format ends with the 6-byte "ARROW1" magic
  // (no padding); this writer emits the padded 8-byte form and readFeather
  // looks for the footer size at length - 12. Changing it needs a coordinated
  // update of both functions — confirm before relying on third-party readers.
  const fsizeBuf = new Uint8Array(4);
  new DataView(fsizeBuf.buffer).setInt32(0, footer.length, true);
  for (const b of fsizeBuf) out.push(b);
  for (const b of MAGIC) out.push(b);

  return new Uint8Array(out);
}

// ─── readFeather ──────────────────────────────────────────────────────────────

/**
 * Parse an Apache Arrow IPC (Feather v2) binary buffer into a DataFrame.
 * Mirrors `pandas.read_feather()`.
 */
+ */ +export function readFeather(data: Uint8Array, options: ReadFeatherOptions = {}): DataFrame { + const { indexCol = null, usecols = null } = options; + + // Verify opening magic + if (new TextDecoder().decode(data.subarray(0, 6)) !== "ARROW1") { + throw new Error("readFeather: not an Arrow IPC file (bad magic bytes at start)"); + } + if (new TextDecoder().decode(data.subarray(data.length - 8, data.length - 2)) !== "ARROW1") { + throw new Error("readFeather: not an Arrow IPC file (bad magic bytes at end)"); + } + + const view = new DataView(data.buffer, data.byteOffset, data.byteLength); + + // Parse footer + const footerSize = view.getInt32(data.length - 12, true); + const footerStart = data.length - 12 - footerSize; + const footerFb = fbRoot(data.subarray(footerStart, footerStart + footerSize)); + + // Parse schema from footer + const schemaFb = footerFb.readSubTable(1); + if (!schemaFb) throw new Error("readFeather: missing schema in footer"); + + const numFields = schemaFb.readVectorCount(1); + type ParsedField = { name: string; typeCode: number; sub: FbTable | undefined }; + const parsedFields: ParsedField[] = []; + for (let i = 0; i < numFields; i++) { + const ft = schemaFb.readVectorTable(1, i); + if (!ft) continue; + parsedFields.push({ + name: ft.readString(0) ?? `col_${i}`, + typeCode: ft.readU8(2) ?? 
0, + sub: ft.readSubTable(3), + }); + } + + // Count record batch blocks + let blockCount = 0; + while (footerFb.readBlock(3, blockCount) !== undefined) blockCount++; + + if (blockCount === 0) { + // Empty file + const empty: Record = {}; + for (const f of parsedFields) { + if (usecols !== null && !usecols.includes(f.name)) continue; + empty[f.name] = []; + } + return DataFrame.fromColumns(empty); + } + + // Use the first record batch block + const block = footerFb.readBlock(3, 0)!; + const blockOffset = Number(block.offset); + + // Parse RecordBatch message + if (view.getInt32(blockOffset, true) !== CONTINUATION_I32) { + throw new Error("readFeather: invalid continuation marker"); + } + const paddedMetaLen = view.getInt32(blockOffset + 4, true); + const metaBuf = data.subarray(blockOffset + 8, blockOffset + 8 + paddedMetaLen); + const msgFb = fbRoot(metaBuf); + + if (msgFb.readU8(1) !== MSG_RECORD_BATCH) { + throw new Error("readFeather: expected RecordBatch message"); + } + const rbFb = msgFb.readSubTable(2); + if (!rbFb) throw new Error("readFeather: missing RecordBatch in message"); + + const numRows = Number(rbFb.readI64(0) ?? 0n); + const bodyStart = blockOffset + 8 + paddedMetaLen; + const body = data.subarray(bodyStart, bodyStart + Number(block.bodyLength)); + + // Decode each column + const resultData: Record = {}; + let bufIdx = 0; + let nodeIdx = 0; + + for (const field of parsedFields) { + const numBufs = + field.typeCode === TYPE_UTF8 || field.typeCode === TYPE_LARGE_UTF8 ? 
3 : 2; + + if (usecols !== null && !usecols.includes(field.name)) { + bufIdx += numBufs; + nodeIdx++; + continue; + } + + nodeIdx++; + + // Validity buffer + const validBufInfo = rbFb.readStruct16(2, bufIdx); + bufIdx++; + let validMask: boolean[] | null = null; + if (validBufInfo !== undefined && Number(validBufInfo.b) > 0) { + const vOff = Number(validBufInfo.a); + const vLen = Number(validBufInfo.b); + validMask = decodeValidity(body.subarray(vOff, vOff + vLen), numRows); + } + + let values: Scalar[]; + + switch (field.typeCode) { + case TYPE_INT: { + const bitWidth = field.sub?.readI32(0) ?? 64; + const isSigned = field.sub?.readBool(1) ?? true; + const dBuf = rbFb.readStruct16(2, bufIdx)!; + bufIdx++; + values = decodeInt(body, Number(dBuf.a), numRows, bitWidth, isSigned); + break; + } + case TYPE_FLOAT: { + const precision = field.sub?.readI16(0) ?? PREC_DOUBLE; + const dBuf = rbFb.readStruct16(2, bufIdx)!; + bufIdx++; + values = decodeFloat(body, Number(dBuf.a), numRows, precision); + break; + } + case TYPE_BOOL: { + const dBuf = rbFb.readStruct16(2, bufIdx)!; + bufIdx++; + values = decodeBool(body, Number(dBuf.a), numRows); + break; + } + case TYPE_UTF8: + case TYPE_LARGE_UTF8: { + const oBuf = rbFb.readStruct16(2, bufIdx)!; + bufIdx++; + const dBuf = rbFb.readStruct16(2, bufIdx)!; + bufIdx++; + values = decodeUtf8(body, Number(oBuf.a), Number(dBuf.a), numRows); + break; + } + default: + bufIdx++; + values = new Array(numRows).fill(null); + } + + // Apply validity mask (null = 0 bit in validity bitmap) + if (validMask !== null) { + for (let i = 0; i < numRows; i++) { + if (!validMask[i]) values[i] = null; + } + } + + resultData[field.name] = values; + } + + // Extract index column if requested + let index: Index