Skip to content

Commit 10f0e1c

Browse files
committed
Update neon_intrinsics.h
Intended to work on both 32-bit (NEON32) and 64-bit (NEON64) ARM targets.
1 parent 190a1cd commit 10f0e1c

1 file changed

Lines changed: 117 additions & 135 deletions

File tree

Lines changed: 117 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -1,148 +1,130 @@
11
#pragma once
2-
#include <arm_neon.h> // NEON intrinsics
2+
#include <arm_neon.h>
33

44
namespace
55
{
6-
typedef float32x4_t f32x4;
7-
8-
//------------------------------------------------------
9-
// Load / Store
10-
//------------------------------------------------------
11-
inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
12-
inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
13-
inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
14-
inline f32x4 v_zero() { return vdupq_n_f32(0.0f); }
15-
inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
16-
17-
//------------------------------------------------------
18-
// Mask helpers
19-
//------------------------------------------------------
20-
inline f32x4 v_mask_xyz()
21-
{
22-
float vals[4] = { 1.0f, 1.0f, 1.0f, 0.0f };
23-
return vld1q_f32(vals);
24-
}
25-
26-
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
27-
{
28-
float tmp[4];
29-
vst1q_f32(tmp, newv);
30-
tmp[3] = vgetq_lane_f32(original, 3);
31-
return vld1q_f32(tmp);
32-
}
33-
34-
//------------------------------------------------------
35-
// Float3 helpers
36-
//------------------------------------------------------
37-
inline f32x4 v_load3_vec(const float* p) // w = 0
38-
{
39-
float vals[4] = { p[0], p[1], p[2], 0.0f };
40-
return vld1q_f32(vals);
41-
}
42-
43-
inline f32x4 v_load3_pos(const float* p) // w = 1
44-
{
45-
float vals[4] = { p[0], p[1], p[2], 1.0f };
46-
return vld1q_f32(vals);
47-
}
48-
49-
inline void v_store3(float* dst, f32x4 v)
50-
{
51-
float tmp[4];
52-
vst1q_f32(tmp, v);
53-
dst[0] = tmp[0];
54-
dst[1] = tmp[1];
55-
dst[2] = tmp[2];
56-
}
57-
58-
//------------------------------------------------------
59-
// Arithmetic
60-
//------------------------------------------------------
61-
inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
62-
inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
63-
inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }
64-
65-
// Fast reciprocal
66-
inline f32x4 v_rcp_nr(f32x4 b)
67-
{
68-
f32x4 r = vrecpeq_f32(b);
69-
// one Newton-Raphson iteration
70-
r = vmulq_f32(r, vrecpsq_f32(b, r));
71-
return r;
72-
}
73-
74-
inline f32x4 v_div(f32x4 a, f32x4 b) { return vmulq_f32(a, v_rcp_nr(b)); }
6+
typedef float32x4_t f32x4;

//------------------------------------------------------
// Load / Store
//------------------------------------------------------

// Load 4 contiguous floats into a vector.
inline f32x4 v_load(const float* p) { return vld1q_f32(p); }

// Store all 4 lanes to contiguous memory.
inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }

// Broadcast a scalar into every lane.
inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }

// Vector with every lane set to 0.0f.
inline f32x4 v_zero() { return vdupq_n_f32(0.0f); }

// Read lane 0 back as a scalar.
inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }
16+
17+
//------------------------------------------------------
18+
// Mask helpers
19+
//------------------------------------------------------
20+
// Returns [1, 1, 1, 0]; multiply by this to zero the w lane.
inline f32x4 v_mask_xyz()
{
    const float32x4_t xyz_only = { 1.0f, 1.0f, 1.0f, 0.0f };
    return xyz_only;
}
26+
27+
// Take x,y,z from `newv` while keeping the w lane of `original`.
//
// Bug fix: the mask was previously built from the *float* value 1.0f and
// reinterpreted as bits (0x3F800000), so vbslq_f32 selected only some of
// the w-lane bits from `original` and mixed the rest from `newv`.
// vbslq_f32 is a bitwise select, so the lanes taken from `original` must
// be all-ones in the mask.
inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
{
    const uint32x4_t keep_w = { 0u, 0u, 0u, 0xFFFFFFFFu };
    return vbslq_f32(keep_w, original, newv);
}
32+
33+
//------------------------------------------------------
34+
// Float3 helpers
35+
//------------------------------------------------------
36+
// Load 3 floats as a direction vector; the w lane is forced to 0.
inline f32x4 v_load3_vec(const float* p) // w = 0
{
    const float32x4_t r = { p[0], p[1], p[2], 0.0f };
    return r;
}
41+
42+
// Load 3 floats as a position (point); the w lane is forced to 1.
inline f32x4 v_load3_pos(const float* p) // w = 1
{
    const float32x4_t r = { p[0], p[1], p[2], 1.0f };
    return r;
}
47+
48+
// Store only the x, y, z lanes to dst[0..2]; memory past dst[2] is
// left untouched.
inline void v_store3(float* dst, f32x4 v)
{
    vst1_f32(dst, vget_low_f32(v)); // lanes 0 and 1
    dst[2] = vgetq_lane_f32(v, 2);  // lane 2
}
56+
57+
//------------------------------------------------------
58+
// Simple Arithmetic
59+
//------------------------------------------------------
60+
//------------------------------------------------------
// Simple Arithmetic
//------------------------------------------------------
inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }

// Lane-wise division. vdivq_f32 is an AArch64-only (NEON64) intrinsic;
// on 32-bit ARM NEON it does not exist, so provide a Newton-Raphson
// refined reciprocal fallback there so this header builds on both
// targets. The fallback is ~full single precision but not
// exactly rounded.
#if defined(__aarch64__)
inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); }
#else
inline f32x4 v_div_exact(f32x4 a, f32x4 b)
{
    f32x4 r = vrecpeq_f32(b);
    r = vmulq_f32(r, vrecpsq_f32(b, r)); // NR step 1
    r = vmulq_f32(r, vrecpsq_f32(b, r)); // NR step 2
    return vmulq_f32(a, r);
}
#endif
64+
65+
//------------------------------------------------------
66+
// Fast recip
67+
//------------------------------------------------------
68+
//------------------------------------------------------
// Fast recip
//------------------------------------------------------
// Approximate 1/b: hardware estimate refined by two Newton-Raphson steps.
inline f32x4 v_rcp_nr(f32x4 b)
{
    f32x4 est = vrecpeq_f32(b);
    for (int step = 0; step < 2; ++step)
        est = vmulq_f32(est, vrecpsq_f32(b, est));
    return est;
}
75+
76+
// a / b computed via the fast reciprocal (approximate, not exactly
// rounded); use v_div_exact when exact division is required.
inline f32x4 v_div(f32x4 a, f32x4 b)
{
    const f32x4 inv_b = v_rcp_nr(b);
    return vmulq_f32(a, inv_b);
}
7580

7681
// Approximate 1/sqrt(x): hardware estimate refined by two
// Newton-Raphson steps.
inline f32x4 v_rsqrt_nr(f32x4 x)
{
    f32x4 est = vrsqrteq_f32(x);
    for (int step = 0; step < 2; ++step)
        est = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(est, est), x));
    return est;
}
88+
89+
//------------------------------------------------------
90+
// Vector intrinsic functions
91+
//------------------------------------------------------
92+
// 4-lane dot product; the scalar result is broadcast to every lane.
inline f32x4 v_dot4(f32x4 a, f32x4 b)
{
    const f32x4 prod = vmulq_f32(a, b);
    // pairwise add: [p0+p1, p2+p3]
    const float32x2_t pairs = vpadd_f32(vget_low_f32(prod), vget_high_f32(prod));
    const float total = vget_lane_f32(pairs, 0) + vget_lane_f32(pairs, 1);
    return vdupq_n_f32(total);
}
13699

137-
inline f32x4 v_hadd4(f32x4 a)
100+
// 3-lane dot product (the w lanes are ignored); the scalar result is
// broadcast to every lane.
//
// Robustness: the w contribution is removed with vsetq_lane_f32 rather
// than multiplying by a {1,1,1,0} float mask — inf * 0 is NaN, so the
// mask-multiply form poisoned the result whenever the (supposedly
// ignored) w lane held inf or NaN.
inline f32x4 v_dot3(f32x4 a, f32x4 b)
{
    f32x4 prod = vmulq_f32(a, b);
    prod = vsetq_lane_f32(0.0f, prod, 3); // discard the w contribution
    const float32x2_t pairs = vpadd_f32(vget_low_f32(prod), vget_high_f32(prod));
    const float total = vget_lane_f32(pairs, 0) + vget_lane_f32(pairs, 1);
    return vdupq_n_f32(total);
}
109+
110+
// 3-component cross product; the w lane of the result is 0.
//
// Bug fix: with d = a*b_yzx - a_yzx*b the lanes of d hold
//   d0 = a0*b1 - a1*b0  (z component)
//   d1 = a1*b2 - a2*b1  (x component)
//   d2 = a2*b0 - a0*b2  (y component)
// so the final shuffle must be (d1, d2, d0). The previous (d2, d0, d1)
// returned a cyclic rotation of the correct vector — e.g.
// x-axis x y-axis gave (0,1,0) instead of (0,0,1).
inline f32x4 v_cross(f32x4 a, f32x4 b)
{
    const float32x4_t a_yzx = { vgetq_lane_f32(a, 1), vgetq_lane_f32(a, 2), vgetq_lane_f32(a, 0), 0.0f };
    const float32x4_t b_yzx = { vgetq_lane_f32(b, 1), vgetq_lane_f32(b, 2), vgetq_lane_f32(b, 0), 0.0f };
    const float32x4_t d = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b));
    const float32x4_t r = { vgetq_lane_f32(d, 1), vgetq_lane_f32(d, 2), vgetq_lane_f32(d, 0), 0.0f };
    return r;
}
141117

142-
// Step 2: add the two elements together
143-
float32x2_t sum1 = vpadd_f32(sum2, sum2); // horizontal add, single scalar in lane 0
118+
// Scale v by 1/|v_xyz| (approximate rsqrt); note the whole vector,
// including the w lane, is multiplied by the same factor.
inline f32x4 v_normalize3(f32x4 v)
{
    const f32x4 len_sq = v_dot3(v, v);
    const f32x4 inv_len = v_rsqrt_nr(len_sq);
    return vmulq_f32(v, inv_len);
}
144123

145-
// Step 3: duplicate the scalar into all 4 lanes
146-
return vcombine_f32(sum1, sum1); // combine two 2-lane vectors into 4-lane vector
124+
// Horizontal add of all 4 lanes; the scalar sum is broadcast to
// every lane.
inline f32x4 v_hadd4(f32x4 a)
{
    // pairwise add: [a0+a1, a2+a3]
    const float32x2_t pairs = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
    const float total = vget_lane_f32(pairs, 0) + vget_lane_f32(pairs, 1);
    return vdupq_n_f32(total);
}
148130
}

0 commit comments

Comments
 (0)