Conversation
The previous comment reported a time that seemed to have regressed. It was not 8.2ms on main; more like 11ms.
Also notable: LLVM is happy to undo all this shuffle factorization work, fuse the shuffles back together, and just make a big mess. So a new mechanism in this PR is optimization_fence, which abuses LLVM's arithmetic fence intrinsic to prevent fusion of shuffle instructions (it's intended to prevent fusion of floating point ops).
alexreinking
left a comment
A couple comment nits, but otherwise, everything read very clearly. Did you write most of these comments yourself, or did Claude?
I wrote them all myself. And re-reading it all myself, I found a bunch of stuff I didn't like and made more changes. Maybe it existed in the original Python too, but at this point I've rewritten enough of the code that I don't think this counts as coauthored by Claude anymore.
Before:

```
Computing best tile sizes for each type
.................................................
bytes, tile width, tile height, bandwidth (GB/s):
1  8  8  20.9997
1 16  8  20.8329
1  8 16  18.5702
1  8 32  17.2463
1  8 64  14.312
2  8 16  19.2047
2  8  8  18.8368
2 16  8  17.0593
2  8 32  17.0591
2  4  8  15.7681
4  8  8  24.9364
4  4 16  22.9699
4  8 16  22.5743
4  4 32  22.255
4  4  8  20.4468
8  8  8  38.4094
8 16  4  28.4167
8 16  8  27.6184
8  8  4  27.6062
8  8 16  26.8693
```

After:

```
Computing best tile sizes for each type
.................................................
bytes, tile width, tile height, bandwidth (GB/s):
1 16 32  34.1921
1 16 16  31.8399
1  8 16  25.575
1 16 64  25.1665
1 32 16  25.0061
2  8 32  28.2635
2  8 16  27.7648
2 16 16  27.2126
2 16 32  23.9034
2  8  8  23.6345
4  8 16  34.5303
4  8  8  28.3653
4 16  8  26.8521
4  8 32  26.084
4 16 16  24.4519
8  8  8  33.7163
8  8  4  29.1339
8  4 16  26.418
8 16  4  25.4663
8  2  8  24.3949
```
It turns out LLVM was making a mess on ARM too. Adding optimization fences to the base class interleave implementation makes an 8-bit transpose 1.7x faster. See the last commit for the numbers.
Force-pushed 9a09bfa to 3eef5db
Force-pushed 3eef5db to 484bd4c
I rebased on main to trigger the new buildbot workflows.
Yes, sorry, this became a WIP because I wanted to add some functionality to it.
Force-pushed 7452679 to 678a353
To help diagnose occasional illegal instruction errors
This PR now also has a partial implementation of "A decomposition for in-place matrix transposition" by Catanzaro et al. for non-power-of-two interleaves and deinterleaves.
Ready for review. Failure is #8928
Review ping. The changes inside CodeGen_X86 could be deferred to a follow-up PR, but because the changes are exactly segregated by file, the review burden is just the sum of its parts, so it seemed better to get it over with and treat it as one monolithic PR.
alexreinking
left a comment
Still combing through the big files, but there's some low-hanging fruit to address here.
```cpp
// Explicitly vectorized loads from the input. Was necessary before we
// automatically swizzled the 2D load into dense order.
// input.in().compute_at(output, x).vectorize(x).unroll(y);

// Explicit transpose in registers. This used to be the idiom, but is no
// longer necessary because stage_strided_loads should detect the strided
// loads from input.in() and turn it into a transpose.
// input.in().in().reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y);

// TODO: Should not be necessary, but prevents licm from doing something dumb.
```
Is there anything we can do, e.g. with a custom lowering pass, to test that the expected IR gets built? Does the old idiom still work? It should probably be tested not to regress.
Yes, I will add a test that checks all the idioms work.
```cpp
// Set the target features to use for dumping to assembly
target.set_features({Target::NoRuntime, Target::NoAsserts, Target::NoBoundsQuery});

std::cout << "\nbytes, interleave factor, interleave bandwidth (GB/s), deinterleave bandwidth (GB/s):\n";
```
Is there any property at all we can test here?
LLVM is free to do clever things under the hood, so I worry that any assert on the actual performance results risks being a flaky test. The test asserts correctness, and the performance output is just informational right now.
I think it wouldn't hurt to try adding a performance test? If LLVM does something weird, at least we know. The whole point of this PR is to get better and faster interleave codegen; if we don't actually test the result that LLVM produces, we're not really demonstrating the fruits of your work.
I think this might also reveal if the optimization_fence breaks in the future.
src/Simplify_Stmts.cpp
Outdated
```cpp
    // foo[x] = foo[x] or foo[x] = undef is a no-op
    return Evaluate::make(0);
} else if (shuf && shuf->is_concat()) {
    // Break a store of a concat of vector indices into separate stores
```
To what end? Should explain more in the comment.
src/IR.cpp
Outdated
```cpp
int cols = indices[1] - indices[0];
int rows = vectors[0].type().lanes() / cols;
if ((int)indices.size() != rows * cols) {
    return false;
}
```
This looks weird. What if the shuffle is like...
shuffle({10,20,30,40}, {2, 1, 0, 3})
Then cols == -1, rows == -4 and so indices.size() == rows * cols, right? Then, below, the two outer for loops are both apparently infinite. But maybe you luck out by returning false right away? indices[0 * -4 + 0] == 2 != 0 * -1 + 0.
Aha! This is the way to make this code cleaner / safer... to return true, we know that indices[col * rows + row] == row * cols + col for all row/col. At 0/0, that's
indices[0 * rows + 0] == 0 * cols + 0
=> indices[0] == 0
So why don't we check that up front?
```diff
-int cols = indices[1] - indices[0];
-int rows = vectors[0].type().lanes() / cols;
-if ((int)indices.size() != rows * cols) {
-    return false;
-}
+if (indices[0] != 0) {
+    return false;
+}
+int cols = indices[1];
+int rows = vectors[0].type().lanes() / cols;
+if ((int)indices.size() != rows * cols) {
+    return false;
+}
```
It actually also looks like this can divide by 0... need to check that indices[1] != 0, too.
```cpp
/** A fence to prevent fusion of ops by llvm. Designed for floats, but we
 * abuse it to prevent shufflevector fusion too. */
virtual llvm::Value *optimization_fence(llvm::Value *);
```
I think we need to rebase this on main after merging the optimization_fence from the other PR.
Yeah, should be easy because that was copy-pasted out of this branch.
I'd also like to defer this until after #8977 so we have coverage of SVE2. I'm nervous about the interactions with scalable vectors in CodeGen_LLVM.cpp
mcourteaux
left a comment
Reviewed everything except the bit-logic thing in x86. If the tests pass, I hope that's sufficient 😛
Looks mostly very good, but reading through it there were a few places where I got confused, so I suggested some comments explaining things.
Also, to my understanding, the tests for interleaving don't seem to test all of the vector length cases, but only the power-of-two cases?
```cpp
std::vector<int> rotation(vec_elements, 0);
for (int i = 0; i < vec_elements; i++) {
    int k = (i * num_vecs) % vec_elements;
    rotation[k] = (i * num_vecs) / vec_elements;
```
As they are coprime, every index in rotation should have been produced. A for loop of internal_assert(rotation[i] != 0) for all i != 0?
src/CodeGen_LLVM.cpp
Outdated
```cpp
// Using unary shuffles, get each element into the right ultimate
// lane. This works out without collisions because the number of vectors
// and the length of each vector is coprime.
const int num_vecs = (int)v.size();
```
Perhaps add a comment that a signed datatype is required for the bit-logic further down.
```cpp
if (f == 1 || f == num_vecs) {
    for (int i = 2; i < num_vecs; i++) {
        if (num_vecs % i == 0) {
            f = i;
```
This logic seems to find the smallest divisor, instead of the largest power of two, as the comment suggests?
It's the initial value that is the largest power-of-two factor: num_vecs & -num_vecs. I'll pull it out into a helper function so it's less magic.
Okay. I think the word "first" raises some questions for a reader. It seems to be some divide-and-conquer strategy, and we prefer big power-of-two factors at high levels of the recursion, if I understand correctly.
src/CodeGen_LLVM.cpp
Outdated
```cpp
// Use the inverse of Catanzaro's algorithm from above. We slice into
// distinct vectors, then rotate each element into the correct final
// vector, then do a unary permutation of each vector.
std::vector<int> shuffle(vec_elements);
```
Move this declaration down to where it's actually used.
src/Simplify_Stmts.cpp
Outdated
```cpp
    // foo[x] = foo[x] or foo[x] = undef is a no-op
    return Evaluate::make(0);
} else if (shuf && shuf->is_concat()) {
    // Break a store of a concat of vector indices into separate stores
```
Are we sure that concat(ramp(0, 1, 8), ramp(8, 1, 8)) is already simplified to ramp(0, 1, 16) at this point, before we needlessly split those up? What brought you to add this? What happened without this transformation?
If you tile a transpose like .tile(x, y, xi, yi, 8, 8) and vectorize both xi and yi, then the store index becomes a concat of dense ramps. This breaks it up. The benchmark uses this, but I'll also add a new test that clarifies.
```cpp
    return false;
}
const Shuffle &v = (const Shuffle &)e;
return v.vectors.size() == 1 &&
```
This makes me wonder: transpose is only happening on one vector. Is make_transpose(make_concat(a, b), 4) going to cause problems? I hope not, and it just doesn't get lowered to the nice work you did?
In general the simplifier doesn't fuse shuffles, so it should be fine.
test/performance/interleave.cpp
Outdated
```cpp
output(x) = in(x / factor, x % factor);

Var xi, yi;
output.unroll(x, factor, TailStrategy::RoundUp).vectorize(x, t.natural_vector_size<T>(), TailStrategy::RoundUp);
```
This also needs testing with different vector sizes, but perhaps in a correctness test? You have logic for handling non-power-of-two sizes that are a multiple of the native length, but this does not seem explicitly tested right now?
Can you document why you do unroll(factor).vectorize()? I would have expected to vectorize(x, natural * factor)?
```cpp
if (!is_power_of_two(vec_elements) &&
    vec_elements % elems_per_native_vec == 0) {
    // It's not a power of two, but it's a multiple of the native vector
    // length, so slice it and recurse.
```
Yes, the interleave test ends up with input vectors of size, say, 24 when deinterleaving 8-vectors by a factor of three.
Using the strategy in CodeGen_LLVM for big vector interleaves (repeated 2-way interleaves), LLVM generates pretty poor code on x86. This is because x86 has no two-way vector interleave instruction until AVX-512, and that instruction requires a runtime shuffle table, using up a register. The instructions x86 does have that take immediates are weird and hard to think about. It's important to stick to instructions that take immediates because interleaves often happen in high-register-pressure contexts (e.g. block transposes).

This PR redoes vector interleaving for power-of-two blocks on x86 to use only unpckl and shufi/vperm2/vinsert instructions. The algorithm is somewhat complex and requires reasoning about permutations of the bits of the indices of each element. Hopefully it is understandable given the jumbo comment. I first got it working in Python, and Claude correctly translated that to C++ for me, after which I made extensive rewrites.
On my machine, this makes block transposes significantly faster and shorter in terms of code size, and avoids some of the pathological cases on main. E.g. a 16x16 transpose of uint16s with AVX2 on main is 621 instructions total, taking 419 cycles. I'd paste it, but it's just a huge mess of various instructions. In this PR it's 134 instructions and 64 cycles:
This changes what block sizes are best used for transposing. Here are the best block sizes for each type before and after this change:
AVX512:
AVX2:
A good rule of thumb seems to be that you now want to use 512-byte blocks on avx2, and 1024-byte blocks on avx512.