[SIMD nf4] update w/ mod_mul_safe

lamphamsy · lamphamsy · commit 9614e743bbdf · 2019-05-22T14:18:20.000+02:00
diff --git a/src/simd_nf4.h b/src/simd_nf4.h
@@ -198,7 +198,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     HalfVecType res;
     VecType vec_a = load_to_reg(a);
     VecType vec_b = load_to_reg(b);
-    store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4));
+    store_low_half_to_mem(&res, mod_mul_safe<uint32_t>(vec_a, vec_b));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -230,7 +230,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)
         VecType _x_p = load_to_reg(_x[i]);
         VecType _y_p = load_to_reg(_y[i]);
 
-        store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4));
+        store_low_half_to_mem(_x + i, mod_mul_safe<uint32_t>(_x_p, _y_p));
     }
 }
 
@@ -248,8 +248,9 @@ inline void hadamard_mul_doubled_rem(
         VecType _x_next_p = load_to_reg(_x_half[i]);
         VecType _y_p = load_to_reg(_y[i]);
 
-        store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4));
-        store_low_half_to_mem(_x_half + i, mod_mul_safe(_x_next_p, _y_p, F4));
+        store_low_half_to_mem(_x + i, mod_mul_safe<uint32_t>(_x_p, _y_p));
+        store_low_half_to_mem(
+            _x_half + i, mod_mul_safe<uint32_t>(_x_next_p, _y_p));
     }
 }
 
@@ -284,7 +285,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)
     VecType res;
     VecType vec_a = load_to_reg(a);
     VecType vec_b = load_to_reg(b);
-    store_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4));
+    store_to_mem(&res, mod_mul_safe<uint32_t>(vec_a, vec_b));
     return reinterpret_cast<__uint128_t>(res);
 }
 
@@ -354,7 +355,7 @@ inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y)
 
     // multiply y to the first half of `x`
     for (i = 0; i < vec_len; ++i) {
-        x[i] = mod_mul_safe(x[i], y[i], F4);
+        x[i] = mod_mul_safe<uint32_t>(x[i], y[i]);
     }
 
     if (rem_len > 0) {

Original file line number	Diff line number	Diff line change
`@@ -198,7 +198,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)`
`198`	`198`	`HalfVecType res;`
`199`	`199`	`VecType vec_a = load_to_reg(a);`
`200`	`200`	`VecType vec_b = load_to_reg(b);`
`201`		`- store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4));`
	`201`	`+ store_low_half_to_mem(&res, mod_mul_safe<uint32_t>(vec_a, vec_b));`
`202`	`202`	`return reinterpret_cast<__uint128_t>(res);`
`203`	`203`	`}`
`204`	`204`
`@@ -230,7 +230,7 @@ inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y)`
`230`	`230`	`VecType _x_p = load_to_reg(_x[i]);`
`231`	`231`	`VecType _y_p = load_to_reg(_y[i]);`
`232`	`232`
`233`		`- store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4));`
	`233`	`+ store_low_half_to_mem(_x + i, mod_mul_safe<uint32_t>(_x_p, _y_p));`
`234`	`234`	`}`
`235`	`235`	`}`
`236`	`236`
`@@ -248,8 +248,9 @@ inline void hadamard_mul_doubled_rem(`
`248`	`248`	`VecType _x_next_p = load_to_reg(_x_half[i]);`
`249`	`249`	`VecType _y_p = load_to_reg(_y[i]);`
`250`	`250`
`251`		`- store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4));`
`252`		`- store_low_half_to_mem(_x_half + i, mod_mul_safe(_x_next_p, _y_p, F4));`
	`251`	`+ store_low_half_to_mem(_x + i, mod_mul_safe<uint32_t>(_x_p, _y_p));`
	`252`	`+ store_low_half_to_mem(`
	`253`	`+ _x_half + i, mod_mul_safe<uint32_t>(_x_next_p, _y_p));`
`253`	`254`	`}`
`254`	`255`	`}`
`255`	`256`
`@@ -284,7 +285,7 @@ inline __uint128_t mul(__uint128_t a, __uint128_t b)`
`284`	`285`	`VecType res;`
`285`	`286`	`VecType vec_a = load_to_reg(a);`
`286`	`287`	`VecType vec_b = load_to_reg(b);`
`287`		`- store_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4));`
	`288`	`+ store_to_mem(&res, mod_mul_safe<uint32_t>(vec_a, vec_b));`
`288`	`289`	`return reinterpret_cast<__uint128_t>(res);`
`289`	`290`	`}`
`290`	`291`
`@@ -354,7 +355,7 @@ inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y)`
`354`	`355`
`355`	`356`	`` // multiply y to the first half of `x` ``
`356`	`357`	`for (i = 0; i < vec_len; ++i) {`
`357`		`- x[i] = mod_mul_safe(x[i], y[i], F4);`
	`358`	`+ x[i] = mod_mul_safe<uint32_t>(x[i], y[i]);`
`358`	`359`	`}`
`359`	`360`
`360`	`361`	`if (rem_len > 0) {`