Skip to content

Commit e13ccaf

Browse files
Merge pull request #599 from Blosc/mini-matmul
Mini matmul. This allows to perform matmul operations with prefilters at block level. This provides automatic multi-threading, and opens the door towards using accelerated BLAS libraries more easily in the future. Some benchmarks for the supported cases show significant speedups over the chunked implementation: - aligned 400x400 float32: about 3.7x faster over chunked - aligned 400x400 float64: about 3.0x - aligned 800x800 float32: about 1.5x - misaligned case: auto correctly stays on chunked
2 parents 470fe8a + f2678e8 commit e13ccaf

11 files changed

Lines changed: 850 additions & 51 deletions

File tree

ANNOUNCE.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
Announcing Python-Blosc2 4.1.2
1+
Announcing Python-Blosc2 4.0.0
22
===============================
33

4-
This is patch release which updates the ``c-blosc2`` version to fix some memory leaks.
4+
This is patch release which updates the ``miniexpr`` version to fix a bug for ubuntu ARM64 failure.
55

66
You can think of Python-Blosc2 4.x as an extension of NumPy/numexpr that:
77

RELEASE_NOTES.md

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
11
# Release notes
22

3-
## Changes from 4.1.2 to 4.1.3
4-
5-
XXX version-specific blurb XXX
6-
73
## Changes from 4.1.1 to 4.1.2
84

9-
- Update `c-blosc2` version
5+
- A new fast path for src/blosc2/linalg.py that uses the matmul prefilter machinery in src/blosc2/blosc2_ext.pyx.
6+
- The fast path is only used for supported cases:
7+
- blosc2.NDArray inputs
8+
- 2-D only
9+
- floating-point only
10+
- matching dtypes
11+
- aligned chunk/block layouts that satisfy the current kernel assumptions
12+
- All other valid cases fall back to the existing chunk-by-chunk implementation in src/blosc2/linalg.py.
13+
- Some benchmarks for the supported cases show significant speedups over the chunked implementation:
14+
- aligned 400x400 float32: about 3.7x faster over chunked
15+
- aligned 400x400 float64: about 3.0x
16+
- aligned 800x800 float32: about 1.5x
17+
- misaligned case: auto correctly stays on chunked
1018

1119
## Changes from 4.1.0 to 4.1.1
1220

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
import argparse
2+
import json
3+
import statistics
4+
import time
5+
import warnings
6+
7+
import numpy as np
8+
9+
import blosc2
10+
import blosc2.linalg as linalg
11+
12+
13+
def parse_int_tuple(value: str) -> tuple[int, ...]:
14+
return tuple(int(item.strip()) for item in value.split(",") if item.strip())
15+
16+
17+
def build_arrays(
18+
shape_a: tuple[int, ...],
19+
shape_b: tuple[int, ...],
20+
dtype: np.dtype,
21+
chunks_a: tuple[int, ...] | None,
22+
chunks_b: tuple[int, ...] | None,
23+
blocks_a: tuple[int, ...] | None,
24+
blocks_b: tuple[int, ...] | None,
25+
):
26+
a_np = np.ones(shape_a, dtype=dtype)
27+
b_np = np.full(shape_b, 2, dtype=dtype)
28+
a = blosc2.asarray(a_np, chunks=chunks_a, blocks=blocks_a)
29+
b = blosc2.asarray(b_np, chunks=chunks_b, blocks=blocks_b)
30+
return a, b, a_np, b_np
31+
32+
33+
def expected_gflops(shape_a: tuple[int, ...], shape_b: tuple[int, ...], elapsed: float) -> float | None:
34+
if elapsed <= 0 or len(shape_a) < 2 or len(shape_b) < 2:
35+
return None
36+
m = shape_a[-2]
37+
k = shape_a[-1]
38+
n = shape_b[-1]
39+
batch = int(np.prod(np.broadcast_shapes(shape_a[:-2], shape_b[:-2]))) if len(shape_a) > 2 or len(shape_b) > 2 else 1
40+
flops = 2 * batch * m * n * k
41+
return flops / elapsed / 1e9
42+
43+
44+
def set_path_mode(mode: str) -> bool:
45+
original = linalg.try_miniexpr
46+
if mode == "chunked":
47+
linalg.try_miniexpr = False
48+
elif mode == "fast":
49+
linalg.try_miniexpr = True
50+
elif mode == "auto":
51+
linalg.try_miniexpr = original
52+
else:
53+
raise ValueError(f"unknown mode: {mode}")
54+
return original
55+
56+
57+
def run_case(
58+
mode: str,
59+
repeats: int,
60+
shape_a: tuple[int, ...],
61+
shape_b: tuple[int, ...],
62+
dtype: np.dtype,
63+
chunks_a: tuple[int, ...] | None,
64+
chunks_b: tuple[int, ...] | None,
65+
blocks_a: tuple[int, ...] | None,
66+
blocks_b: tuple[int, ...] | None,
67+
chunks_out: tuple[int, ...] | None,
68+
blocks_out: tuple[int, ...] | None,
69+
):
70+
a, b, a_np, b_np = build_arrays(shape_a, shape_b, dtype, chunks_a, chunks_b, blocks_a, blocks_b)
71+
with warnings.catch_warnings():
72+
# NumPy + Accelerate can emit spurious matmul RuntimeWarnings on macOS arm64.
73+
warnings.simplefilter("ignore", RuntimeWarning)
74+
expected = np.matmul(a_np, b_np)
75+
original_flag = set_path_mode(mode)
76+
original_set_pref_matmul = blosc2.NDArray._set_pref_matmul
77+
selected_paths = []
78+
times = []
79+
result = None
80+
81+
def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
82+
selected_paths.append("fast")
83+
return original_set_pref_matmul(self, inputs, fp_accuracy)
84+
85+
blosc2.NDArray._set_pref_matmul = wrapped_set_pref_matmul
86+
try:
87+
for _ in range(repeats):
88+
before = len(selected_paths)
89+
t0 = time.perf_counter()
90+
with warnings.catch_warnings():
91+
# NumPy + Accelerate can emit spurious matmul RuntimeWarnings on macOS arm64.
92+
warnings.simplefilter("ignore", RuntimeWarning)
93+
result = blosc2.matmul(a, b, chunks=chunks_out, blocks=blocks_out)
94+
times.append(time.perf_counter() - t0)
95+
if len(selected_paths) == before:
96+
selected_paths.append("chunked")
97+
finally:
98+
blosc2.NDArray._set_pref_matmul = original_set_pref_matmul
99+
linalg.try_miniexpr = original_flag
100+
101+
if result is None:
102+
raise RuntimeError("matmul did not produce a result")
103+
104+
actual = result[:]
105+
np.testing.assert_allclose(actual, expected, rtol=1e-6, atol=1e-6)
106+
107+
best = min(times)
108+
median = statistics.median(times)
109+
return {
110+
"mode": mode,
111+
"times_s": times,
112+
"best_s": best,
113+
"median_s": median,
114+
"gflops_best": expected_gflops(shape_a, shape_b, best),
115+
"gflops_median": expected_gflops(shape_a, shape_b, median),
116+
"correct": True,
117+
"selected_paths": selected_paths,
118+
"selected_path": selected_paths[0] if selected_paths and len(set(selected_paths)) == 1 else "mixed",
119+
}
120+
121+
122+
def main() -> None:
123+
parser = argparse.ArgumentParser(description="Compare chunked and fast blosc2.matmul paths.")
124+
parser.add_argument("--shape-a", default="400,400", help="Comma-separated shape for A.")
125+
parser.add_argument("--shape-b", default="400,400", help="Comma-separated shape for B.")
126+
parser.add_argument("--dtype", default="float32", choices=["float32", "float64", "int32", "int64"])
127+
parser.add_argument("--chunks-a", default="200,200", help="Comma-separated chunk shape for A.")
128+
parser.add_argument("--chunks-b", default="200,200", help="Comma-separated chunk shape for B.")
129+
parser.add_argument("--blocks-a", default="100,100", help="Comma-separated block shape for A.")
130+
parser.add_argument("--blocks-b", default="100,100", help="Comma-separated block shape for B.")
131+
parser.add_argument("--chunks-out", default="200,200", help="Comma-separated chunk shape for output.")
132+
parser.add_argument("--blocks-out", default="100,100", help="Comma-separated block shape for output.")
133+
parser.add_argument("--repeats", type=int, default=250)
134+
parser.add_argument("--modes", nargs="+", default=["chunked", "fast", "auto"], choices=["chunked", "fast", "auto"])
135+
parser.add_argument("--json", action="store_true", help="Emit full JSON instead of a compact text summary.")
136+
args = parser.parse_args()
137+
138+
shape_a = parse_int_tuple(args.shape_a)
139+
shape_b = parse_int_tuple(args.shape_b)
140+
chunks_a = parse_int_tuple(args.chunks_a) if args.chunks_a else None
141+
chunks_b = parse_int_tuple(args.chunks_b) if args.chunks_b else None
142+
blocks_a = parse_int_tuple(args.blocks_a) if args.blocks_a else None
143+
blocks_b = parse_int_tuple(args.blocks_b) if args.blocks_b else None
144+
chunks_out = parse_int_tuple(args.chunks_out) if args.chunks_out else None
145+
blocks_out = parse_int_tuple(args.blocks_out) if args.blocks_out else None
146+
dtype = np.dtype(args.dtype)
147+
148+
results = []
149+
for mode in args.modes:
150+
results.append(
151+
run_case(
152+
mode,
153+
args.repeats,
154+
shape_a,
155+
shape_b,
156+
dtype,
157+
chunks_a,
158+
chunks_b,
159+
blocks_a,
160+
blocks_b,
161+
chunks_out,
162+
blocks_out,
163+
)
164+
)
165+
166+
summary = {
167+
"shape_a": shape_a,
168+
"shape_b": shape_b,
169+
"dtype": str(dtype),
170+
"chunks_a": chunks_a,
171+
"chunks_b": chunks_b,
172+
"blocks_a": blocks_a,
173+
"blocks_b": blocks_b,
174+
"chunks_out": chunks_out,
175+
"blocks_out": blocks_out,
176+
"results": results,
177+
}
178+
179+
best_by_mode = {item["mode"]: item["best_s"] for item in results}
180+
if "chunked" in best_by_mode and "fast" in best_by_mode:
181+
summary["speedup_fast_vs_chunked"] = best_by_mode["chunked"] / best_by_mode["fast"]
182+
183+
if args.json:
184+
print(json.dumps(summary, indent=2, sort_keys=True))
185+
return
186+
187+
print(
188+
"case",
189+
json.dumps(
190+
{
191+
"shape_a": shape_a,
192+
"shape_b": shape_b,
193+
"dtype": str(dtype),
194+
"chunks_out": chunks_out,
195+
"blocks_out": blocks_out,
196+
},
197+
sort_keys=True,
198+
),
199+
)
200+
for item in results:
201+
print(
202+
"result",
203+
json.dumps(
204+
{
205+
"mode": item["mode"],
206+
"best_s": round(item["best_s"], 6),
207+
"median_s": round(item["median_s"], 6),
208+
"gflops_best": None if item["gflops_best"] is None else round(item["gflops_best"], 3),
209+
"correct": item["correct"],
210+
"selected_path": item["selected_path"],
211+
},
212+
sort_keys=True,
213+
),
214+
)
215+
if "speedup_fast_vs_chunked" in summary:
216+
print("speedup", json.dumps({"fast_vs_chunked": round(summary["speedup_fast_vs_chunked"], 3)}, sort_keys=True))
217+
218+
219+
if __name__ == "__main__":
220+
main()
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import blosc2
2+
import numpy as np
3+
import time
4+
5+
N = 10000
6+
ndim = 2
7+
ashape = (N,) * ndim
8+
bshape = ashape
9+
dtype = np.float64
10+
11+
achunks = (1000, 1000)
12+
bchunks = (achunks[1], achunks[0])
13+
ablocks = (200, 200)
14+
bblocks = (ablocks[1], ablocks[0])
15+
outblocks = (ablocks[0], bblocks[1])
16+
outchunks = (achunks[0], bchunks[1])
17+
# a = blosc2.linspace(0, 1, dtype=dtype, shape=ashape, chunks=achunks, blocks=ablocks)
18+
# b = blosc2.linspace(0, 1, dtype=dtype, shape=bshape, chunks=bchunks, blocks=bblocks)
19+
a = blosc2.ones(dtype=dtype, shape=ashape, chunks=achunks, blocks=ablocks)
20+
b = blosc2.full(fill_value=2, dtype=dtype, shape=bshape, chunks=bchunks, blocks=bblocks)
21+
22+
a_np = a[:]
23+
b_np = b[:]
24+
tic = time.time()
25+
np_res = np.matmul(a_np, b_np)
26+
print(f'numpy finished in {time.time()-tic} s')
27+
28+
tic = time.time()
29+
b2_res = blosc2.matmul(a, b, blocks=outblocks, chunks=outchunks)
30+
print(f'blosc2 multithreaded finished in {time.time()-tic} s')
31+
32+
tic = time.time()
33+
b2_res = blosc2.matmul(a, b)
34+
print(f'blosc2 normal finished in {time.time()-tic} s')
35+
36+
achunks = None #(1000, 1000)
37+
bchunks = None #(achunks[1], achunks[0])
38+
ablocks = None #(200, 200)
39+
bblocks = None #(ablocks[1], ablocks[0])
40+
outblocks = None #(ablocks[0], bblocks[1])
41+
outchunks = None #(achunks[0], bchunks[1])
42+
# a = blosc2.linspace(0, 1, dtype=dtype, shape=ashape, chunks=achunks, blocks=ablocks)
43+
# b = blosc2.linspace(0, 1, dtype=dtype, shape=bshape, chunks=bchunks, blocks=bblocks)
44+
a = blosc2.ones(dtype=dtype, shape=ashape, chunks=achunks, blocks=ablocks)
45+
b = blosc2.full(fill_value=2, dtype=dtype, shape=bshape, chunks=bchunks, blocks=bblocks)
46+
tic = time.time()
47+
b2_res = blosc2.matmul(a, b, blocks=outblocks, chunks=outchunks)
48+
print(f'blosc2 normal with default chunks etc. finished in {time.time()-tic} s')

bench/ndarray/stringops_bench.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import time
1313
import numpy as np
1414
import blosc2
15-
from blosc2.lazyexpr import _toggle_miniexpr
15+
from blosc2.utils import _toggle_miniexpr
1616

1717
# nparr = np.random.randint(low=0, high=128, size=(N, 10), dtype=np.uint32)
1818
# nparr = nparr.view('S40').astype('U10')

pyproject.toml

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,7 @@ dependencies = [
3939
"numexpr>=2.14.1; platform_machine != 'wasm32'",
4040
"requests",
4141
]
42-
version = "4.1.3.dev0"
43-
44-
[project.optional-dependencies]
45-
recommended = [
46-
"pyarrow",
47-
]
48-
42+
version = "4.1.1.dev0"
4943
[project.entry-points."array_api"]
5044
blosc2 = "blosc2"
5145

0 commit comments

Comments
 (0)