-
-
Notifications
You must be signed in to change notification settings - Fork 38
Expand file tree
/
Copy pathextend.py
More file actions
117 lines (95 loc) · 4.04 KB
/
extend.py
File metadata and controls
117 lines (95 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################
# Benchmark for measuring CTable creation time from three different sources:
# 1. Python list of lists (1M rows)
# 2. NumPy structured array (1M rows) built from a list of tuples with named fields
# 3. An existing CTable (previously created from Python lists, 1M rows)
from time import perf_counter, time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2
class NumpyDtype:
    """Annotation marker that carries the NumPy dtype chosen for a field.

    Instances are placed in the ``typing.Annotated`` metadata of the row
    model's fields in this script.
    """

    def __init__(self, dtype) -> None:
        # Stored exactly as given; no validation or conversion happens here.
        self.dtype = dtype
# ---------------------------------------------------------------------------
# Row model
# ---------------------------------------------------------------------------
class RowModel(BaseModel):
    """Row schema for the benchmark tables.

    Each field pairs a Python type with a ``NumpyDtype`` marker naming the
    NumPy dtype for that column.
    """

    # No default given; constrained to be >= 0.
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    # Defaults to 0j.
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    # No default given; constrained to the range 0..100 (inclusive).
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    # Defaults to True.
    active: Annotated[bool, NumpyDtype(np.bool_)] = True
# Row count shared by every data source built and benchmarked below.
N = 10**6
print(f"CTable creation benchmark with {N:,} rows\n")
# ---------------------------------------------------------------------------
# Base data generation (not part of the benchmark timing)
# ---------------------------------------------------------------------------
print("Generating base data...")

# Rows as plain Python lists, in field order: id, c_val, score, active.
# perf_counter() is used for all intervals: unlike time(), it is monotonic
# and uses the highest-resolution clock available.
t0 = perf_counter()
data_list = [
    [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0]
    for i in range(N)
]
t_gen_list = perf_counter() - t0
print(f" Python list generated in: {t_gen_list:.4f} s")

# Same values as a NumPy structured array; the field names mirror RowModel.
# The timed region includes building the dtype, as well as the array itself.
t0 = perf_counter()
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
data_np = np.array(
    [
        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
        for i in range(N)
    ],
    dtype=np_dtype,
)
t_gen_np = perf_counter() - t0
print(f" NumPy structured array generated: {t_gen_np:.4f} s\n")
# ---------------------------------------------------------------------------
# 1. Creation from a Python list of lists
# ---------------------------------------------------------------------------
print("CTable from Python list of lists")
# The timed region covers both the CTable construction and the bulk extend().
# perf_counter() is monotonic and high-resolution, unlike wall-clock time().
t0 = perf_counter()
ct_from_list = blosc2.CTable(RowModel, expected_size=N)
ct_from_list.extend(data_list)
t_from_list = perf_counter() - t0
print(f" extend() time (Python list): {t_from_list:.4f} s")
print(f" Rows: {len(ct_from_list):,}")
# ---------------------------------------------------------------------------
# 2. Creation from a NumPy structured array (built from a list of tuples)
# ---------------------------------------------------------------------------
print("CTable from NumPy structured array")
# The timed region covers both the CTable construction and the bulk extend().
# perf_counter() is monotonic and high-resolution, unlike wall-clock time().
t0 = perf_counter()
ct_from_np = blosc2.CTable(RowModel, expected_size=N)
ct_from_np.extend(data_np)
t_from_np = perf_counter() - t0
print(f" extend() time (NumPy struct): {t_from_np:.4f} s")
print(f" Rows: {len(ct_from_np):,}")
# ---------------------------------------------------------------------------
# 3. Creation from an existing CTable (ct_from_list, already built above)
# ---------------------------------------------------------------------------
print("CTable from an existing CTable")
# The source is ct_from_list, the table built from the Python list earlier.
# The timed region covers both the CTable construction and the bulk extend().
# perf_counter() is monotonic and high-resolution, unlike wall-clock time().
t0 = perf_counter()
ct_from_ctable = blosc2.CTable(RowModel, expected_size=N)
ct_from_ctable.extend(ct_from_list)
t_from_ctable = perf_counter() - t0
print(f" extend() time (CTable): {t_from_ctable:.4f} s")
print(f" Rows: {len(ct_from_ctable):,}")
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
def _speedup(baseline, elapsed):
    """Format ``baseline / elapsed`` like ``'2.50x'``, or ``'n/a'`` if elapsed <= 0."""
    # A coarse or adjusted clock can yield a zero (or negative) interval;
    # report that instead of raising ZeroDivisionError in the summary.
    if elapsed <= 0:
        return "n/a"
    return f"{baseline / elapsed:.2f}x"


print("\n")
print("=" * 60)
print(f"{'SOURCE':<30} {'TIME (s)':>12} {'SPEEDUP vs list':>18}")
print("-" * 60)
# The Python-list run is the baseline, so its speedup is 1.00x by definition.
print(f"{'Python list of lists':<30} {t_from_list:>12.4f} {'1.00x':>18}")
print(f"{'NumPy structured array':<30} {t_from_np:>12.4f} {_speedup(t_from_list, t_from_np):>18}")
print(f"{'Existing CTable':<30} {t_from_ctable:>12.4f} {_speedup(t_from_list, t_from_ctable):>18}")