|
#pragma once
#include <arm_neon.h> // ARM NEON SIMD intrinsics

// Internal-linkage helpers over 4-lane float vectors (float32x4_t).
// Convention for 3D math: lanes 0..2 hold x,y,z; lane 3 holds w (or padding).
namespace
{
    typedef float32x4_t f32x4;

    //------------------------------------------------------
    // Load / Store
    //------------------------------------------------------
    inline f32x4 v_load(const float* p) { return vld1q_f32(p); }
    inline void v_store(float* dst, f32x4 v) { vst1q_f32(dst, v); }
    inline f32x4 v_set1(float s) { return vdupq_n_f32(s); }
    inline f32x4 v_zero() { return vdupq_n_f32(0.0f); }
    inline float v_extract0(f32x4 v) { return vgetq_lane_f32(v, 0); }

    //------------------------------------------------------
    // Mask helpers
    //------------------------------------------------------
    // Multiplicative mask [1,1,1,0]: multiply by it to zero the w lane.
    inline f32x4 v_mask_xyz()
    {
        float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
        return mask;
    }

    // Returns newv with its w lane replaced by original's w lane.
    // vbslq_f32 is a BITWISE select, so the lane-3 mask must be all-ones
    // (0xFFFFFFFF). Reinterpreting the float 1.0f (0x3F800000) as the mask
    // would select only some bits of w and corrupt the result.
    inline f32x4 v_preserve_w(f32x4 newv, f32x4 original)
    {
        uint32x4_t sel = {0u, 0u, 0u, 0xFFFFFFFFu};
        return vbslq_f32(sel, original, newv); // set bits -> take 'original'
    }

    //------------------------------------------------------
    // Float3 helpers
    //------------------------------------------------------
    // Load 3 floats as a direction vector; w = 0.
    inline f32x4 v_load3_vec(const float* p)
    {
        float tmp[4] = { p[0], p[1], p[2], 0.0f };
        return vld1q_f32(tmp);
    }

    // Load 3 floats as a homogeneous position; w = 1.
    inline f32x4 v_load3_pos(const float* p)
    {
        float tmp[4] = { p[0], p[1], p[2], 1.0f };
        return vld1q_f32(tmp);
    }

    // Store lanes x,y,z only; dst needs room for 3 floats. The w lane is
    // intentionally dropped (dst[3] is never written).
    inline void v_store3(float* dst, f32x4 v)
    {
        float tmp[4];
        vst1q_f32(tmp, v);
        dst[0] = tmp[0];
        dst[1] = tmp[1];
        dst[2] = tmp[2];
    }

    //------------------------------------------------------
    // Simple Arithmetic
    //------------------------------------------------------
    inline f32x4 v_mul(f32x4 a, f32x4 b) { return vmulq_f32(a, b); }
    // Exact IEEE division; vdivq_f32 exists on AArch64 only (ASIMD),
    // not on 32-bit ARM NEON.
    inline f32x4 v_div_exact(f32x4 a, f32x4 b) { return vdivq_f32(a, b); }
    inline f32x4 v_add(f32x4 a, f32x4 b) { return vaddq_f32(a, b); }
    inline f32x4 v_sub(f32x4 a, f32x4 b) { return vsubq_f32(a, b); }

    //------------------------------------------------------
    // Fast recip
    //------------------------------------------------------
    // Approximate 1/b: hardware estimate refined by two Newton-Raphson
    // steps (vrecpsq_f32 computes the 2 - b*r correction factor).
    // Not IEEE-exact; b must be nonzero.
    inline f32x4 v_rcp_nr(f32x4 b)
    {
        f32x4 r = vrecpeq_f32(b);
        r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson step 1
        r = vmulq_f32(r, vrecpsq_f32(b, r)); // Newton-Raphson step 2
        return r;
    }

    // Approximate a/b via a * (1/b). Faster than v_div_exact and portable
    // to 32-bit NEON, at the cost of ~1 ulp-level error.
    inline f32x4 v_div(f32x4 a, f32x4 b)
    {
        return vmulq_f32(a, v_rcp_nr(b));
    }

    // Approximate 1/sqrt(x): hardware estimate refined by two
    // Newton-Raphson steps (vrsqrtsq_f32 computes (3 - x*r*r)/2).
    // x must be > 0.
    inline f32x4 v_rsqrt_nr(f32x4 x)
    {
        f32x4 r = vrsqrteq_f32(x);
        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, r), x)); // refine step 1
        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, r), x)); // refine step 2
        return r;
    }

    //------------------------------------------------------
    // Vector intrinsic functions
    //------------------------------------------------------
    // 4-lane dot product, result broadcast to all lanes.
    inline f32x4 v_dot4(f32x4 a, f32x4 b)
    {
        f32x4 mul = vmulq_f32(a, b);
        // pairwise add: [m0+m1, m2+m3], then finish the reduction in scalar
        float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
        float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
        return vdupq_n_f32(sum);
    }

    // 3-lane (xyz) dot product, result broadcast to all lanes.
    // The [1,1,1,0] multiplicative mask zeroes the w contribution so the
    // same 4-lane reduction as v_dot4 can be reused.
    inline f32x4 v_dot3(f32x4 a, f32x4 b)
    {
        float32x4_t mask = {1.0f, 1.0f, 1.0f, 0.0f};
        f32x4 mul = vmulq_f32(a, b);
        mul = vmulq_f32(mul, mask);
        float32x2_t sum2 = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
        float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
        return vdupq_n_f32(sum);
    }

    // 3D cross product of the xyz lanes; w of the result is 0.
    // Identity: a x b = (a * b.yzx - a.yzx * b).yzx
    // With c = a*b.yzx - a.yzx*b, the lanes are c0 = z, c1 = x, c2 = y of
    // the cross product, so the final un-shuffle must be (c1, c2, c0).
    inline f32x4 v_cross(f32x4 a, f32x4 b)
    {
        float32x4_t a_yzx = { vgetq_lane_f32(a,1), vgetq_lane_f32(a,2), vgetq_lane_f32(a,0), 0.0f };
        float32x4_t b_yzx = { vgetq_lane_f32(b,1), vgetq_lane_f32(b,2), vgetq_lane_f32(b,0), 0.0f };
        float32x4_t c = vsubq_f32(vmulq_f32(a, b_yzx), vmulq_f32(a_yzx, b));
        // c.yzx -> (cx, cy, cz); w = 0
        return (float32x4_t){ vgetq_lane_f32(c,1), vgetq_lane_f32(c,2), vgetq_lane_f32(c,0), 0.0f };
    }

    // Normalize the xyz lanes using the fast rsqrt; the w lane is scaled
    // too, but v_dot3 ignores w so the xyz result is the unit vector.
    // Approximate (see v_rsqrt_nr); v must not be the zero vector.
    inline f32x4 v_normalize3(f32x4 v)
    {
        f32x4 inv = v_rsqrt_nr(v_dot3(v, v));
        return vmulq_f32(v, inv);
    }

    // Horizontal add of all 4 lanes, result broadcast to all lanes.
    inline f32x4 v_hadd4(f32x4 a)
    {
        float32x2_t sum2 = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
        float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
        return vdupq_n_f32(sum);
    }
}
0 commit comments