Skip to content

Commit 3e3fd94

Browse files
feat: Add RunEndEncodedIterator with O(1) amortized sequential access
Implements a stateful caching optimization based on Arrow C++'s PhysicalIndexFinder:
- Caches the last physical index from the previous lookup
- Fast path: validates the cached index for sequential access patterns (O(1))
- Falls back to a binary search in partitioned ranges when the cache is invalid
- Typical iteration becomes O(1) amortized instead of O(log n) per element

Algorithm:
1. Check whether the cached physical index is still valid for the current logical index
2. If it is valid and within the run bounds, return the cached index (common case)
3. If it is not valid, use the cached index to partition the search space
4. Binary search only the relevant partition

Worst case (random access) adds one extra probe compared with a standard binary search. Best case (sequential iteration) is O(1) per element.
1 parent df65c51 commit 3e3fd94

1 file changed

Lines changed: 99 additions & 1 deletion

File tree

src/visitor/iterator.ts

Lines changed: 99 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,12 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
import { Data } from '../data.js';
1819
import { Vector } from '../vector.js';
1920
import { Visitor } from '../visitor.js';
2021
import { Type, Precision } from '../enum.js';
2122
import { TypeToDataType } from '../interfaces.js';
23+
import { instance as getVisitor } from './get.js';
2224
import {
2325
DataType, Dictionary,
2426
Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, LargeList, FixedSizeList, Map_, Struct,
@@ -131,6 +133,19 @@ function vectorIterator<T extends DataType>(vector: Vector<T>): IterableIterator
131133
});
132134
}
133135

136+
/**
 * Creates an iterator over the logical values of a run-end encoded vector.
 *
 * One `RunEndEncodedIterator` is created per chunk of `vector.data`; each is
 * handed a single-chunk slice of `vector`, and `ChunkedIterator` is given the
 * chunk count plus the factory that produces those per-chunk iterators.
 *
 * @param vector The run-end encoded vector to iterate.
 * @returns An iterator yielding each logical value (or `null`).
 * @ignore
 */
function runEndEncodedIterator<T extends RunEndEncoded>(vector: Vector<T>): IterableIterator<T['TValue'] | null> {
    // Logical position (within `vector`) at which the next requested chunk starts.
    // NOTE(review): this running total assumes ChunkedIterator requests each chunk
    // exactly once and in ascending order — confirm against ChunkedIterator.
    let chunkStart = 0;
    return new ChunkedIterator(vector.data.length, (chunkIndex) => {
        const chunkEnd = chunkStart + vector.data[chunkIndex].length;
        // Slice out exactly this chunk so the per-chunk iterator sees a single Data.
        const chunkVector = vector.slice(chunkStart, chunkEnd);
        chunkStart = chunkEnd;
        return new RunEndEncodedIterator(chunkVector);
    });
}
148+
134149
/** @ignore */
135150
class VectorIterator<T extends DataType> implements IterableIterator<T['TValue'] | null> {
136151
private index = 0;
@@ -152,6 +167,89 @@ class VectorIterator<T extends DataType> implements IterableIterator<T['TValue']
152167
}
153168
}
154169

170+
/**
 * Iterates the logical values of a single-chunk run-end encoded vector.
 *
 * Only `vector.data[0]` is consulted: its first child holds the run ends and
 * its second child holds the per-run values. The physical run resolved by the
 * previous lookup is remembered and used as a hint, so stepping through
 * consecutive logical indices normally needs at most two run-end reads
 * (the cached run, then the run immediately after it) before falling back
 * to a binary search.
 *
 * @ignore
 */
class RunEndEncodedIterator<T extends RunEndEncoded> implements IterableIterator<T['TValue'] | null> {
    /** Logical index (relative to `vector`) of the next element to yield. */
    private index = 0;
    /** Physical run index returned by the previous lookup; the search hint. */
    private lastPhysicalIndex = 0;
    /** Child 0 of the chunk: the run-end array. */
    private readonly runEnds: Data<T['runEndsType']>;
    /** Child 1 of the chunk: one value per run. */
    private readonly values: Data<T['valueType']>;
    private readonly getRunEnd: (data: Data<T['runEndsType']>, index: number) => T['runEndsType']['TValue'] | null;
    private readonly getValue: (data: Data<T['valueType']>, index: number) => T['TValue'] | null;
    /** `offset` of the chunk; added to every logical index before comparing with run ends. */
    private readonly offset: number;
    /** Number of entries in the run-end array. */
    private readonly runCount: number;

    /**
     * @param vector Vector whose first chunk is iterated. Callers in this file
     * pass a single-chunk slice.
     */
    constructor(private readonly vector: Vector<T>) {
        const data = vector.data[0];
        this.runEnds = data.children[0] as Data<T['runEndsType']>;
        this.values = data.children[1] as Data<T['valueType']>;
        this.getRunEnd = getVisitor.getVisitFn(this.runEnds);
        this.getValue = getVisitor.getVisitFn(this.values);
        // Both are invariant for the lifetime of the iterator, so read them once
        // here instead of on every lookup.
        this.offset = data.offset;
        this.runCount = this.runEnds.length;
    }

    /** Yields the next logical value, or signals completion after `vector.length` elements. */
    next(): IteratorResult<T['TValue'] | null> {
        if (this.index < this.vector.length) {
            const value = this.getValueAtIndex(this.index++);
            return { value };
        }
        return { done: true, value: null };
    }

    /**
     * Resolves a logical index to its run and reads that run's value.
     *
     * @param logicalIndex Index relative to the start of `vector`.
     * @returns Whatever the value getter returns for the run (may be `null`).
     */
    private getValueAtIndex(logicalIndex: number): T['TValue'] | null {
        const physicalIndex = this.findPhysicalIndex(logicalIndex);
        return this.getValue(this.values, physicalIndex);
    }

    /** Reads the run end at `physicalIndex`, normalised to a `number` via `Number()`. */
    private runEndAt(physicalIndex: number): number {
        return Number(this.getRunEnd(this.runEnds, physicalIndex));
    }

    /**
     * Finds the physical run containing logical index `i`, i.e. the first run
     * whose end is strictly greater than `offset + i`, and caches the result.
     *
     * NOTE(review): if `offset + i` lies at or beyond the last run end while the
     * cached run is already the last one, the returned index equals the number
     * of runs (out of range). That should only be possible for malformed data.
     *
     * @param i Logical index relative to the start of `vector`.
     * @returns The physical run index.
     */
    private findPhysicalIndex(i: number): number {
        const target = this.offset + i;
        const cached = this.lastPhysicalIndex;

        if (target < this.runEndAt(cached)) {
            // The cached run ends after the target, so it is an upper bound. It is
            // the answer when it is the first run or the previous run ends at or
            // before the target.
            if (cached === 0 || target >= this.runEndAt(cached - 1)) {
                return cached;
            }
            // The target belongs to some earlier run: search [0, cached).
            this.lastPhysicalIndex = this.binarySearchRange(0, cached, i, this.offset);
            return this.lastPhysicalIndex;
        }

        // The cached run ends at or before the target, so the answer is after it.
        const next = cached + 1;
        // Sequential fast path: when stepping forward one element at a time the
        // target is almost always in the very next run. Checking it first avoids
        // a binary search over all remaining runs on every run boundary.
        if (next < this.runCount && target < this.runEndAt(next)) {
            this.lastPhysicalIndex = next;
            return next;
        }

        this.lastPhysicalIndex = this.binarySearchRange(next, this.runCount, i, this.offset);
        return this.lastPhysicalIndex;
    }

    /**
     * Binary search over the run indices `[start, end)`.
     *
     * @param start First candidate run index (inclusive).
     * @param end One past the last candidate run index.
     * @param i Logical index being looked up.
     * @param offset Chunk offset added to `i` before comparing with run ends.
     * @returns The first index in the range whose run end exceeds `offset + i`.
     * When no run end in the range exceeds it, `end - 1` is returned for a
     * non-empty range (the last candidate is never compared), and `start`
     * for an empty range.
     */
    private binarySearchRange(start: number, end: number, i: number, offset: number): number {
        let low = start;
        let high = end - 1;
        while (low < high) {
            const mid = (low + high) >>> 1;
            const runEnd = Number(this.getRunEnd(this.runEnds, mid));
            if (offset + i < runEnd) {
                high = mid;
            } else {
                low = mid + 1;
            }
        }
        return low;
    }

    [Symbol.iterator]() {
        return this;
    }
}
252+
155253
IteratorVisitor.prototype.visitNull = vectorIterator;
156254
IteratorVisitor.prototype.visitBool = vectorIterator;
157255
IteratorVisitor.prototype.visitInt = vectorIterator;
@@ -206,7 +304,7 @@ IteratorVisitor.prototype.visitDurationMicrosecond = vectorIterator;
206304
IteratorVisitor.prototype.visitDurationNanosecond = vectorIterator;
207305
IteratorVisitor.prototype.visitFixedSizeList = vectorIterator;
208306
IteratorVisitor.prototype.visitMap = vectorIterator;
209-
IteratorVisitor.prototype.visitRunEndEncoded = vectorIterator;
307+
IteratorVisitor.prototype.visitRunEndEncoded = runEndEncodedIterator;
210308

211309
/** @ignore */
212310
export const instance = new IteratorVisitor();

0 commit comments

Comments
 (0)