From a25c9782b18e4f892108b8ac5297c0852d5d67a4 Mon Sep 17 00:00:00 2001
From: Matthieu Gomez <gomez.matthieu@gmail.com>
Date: Fri, 5 Jun 2026 11:10:28 +0200
Subject: [PATCH] Speed up CPU demeaning ~20%: Int32 refs and fast LSMR 2-norm

Two CPU-side optimizations on top of #80, with no change to results.

- FixedEffect: store group refs as Int32 when ngroups <= typemax(Int32),
  halving the dominant memory stream read by the scatter/gather kernels on
  every solver iteration. Every backend (CPU/GPU) and solve_coefficients!
  benefits; ngroups beyond Int32 keeps the original integer type.

- lsmr!: replace LinearAlgebra.norm's overflow-safe (~8x slower) path with a
  SIMD sum-of-squares at the four hot norm call sites, accumulated in Float64
  for accuracy. Restricted to concrete CPU Vector{Float32,Float64}; GPU
  arrays and FixedEffectCoefficients keep the generic norm.

Bumps version to 3.3.0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Project.toml       |  2 +-
 src/FixedEffect.jl |  8 +++++++-
 src/utils/lsmr.jl  | 27 ++++++++++++++++++++++-----
 test/types.jl      |  4 ++--
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/Project.toml b/Project.toml
index a052baf..2ef19b4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "FixedEffects"
 uuid = "c8885935-8500-56a7-9867-7708b20db0eb"
-version = "3.2.0"
+version = "3.3.0"
 
 [deps]
 GroupedArrays = "6407cd72-fade-4a84-8a1e-56e431fc1533"
diff --git a/src/FixedEffect.jl b/src/FixedEffect.jl
index 7e6ae3d..48e205b 100644
--- a/src/FixedEffect.jl
+++ b/src/FixedEffect.jl
@@ -17,7 +17,13 @@ end
 
 function FixedEffect(args...; interaction::AbstractVector = uweights(length(args[1])))
 	g = GroupedArray(args..., sort = nothing)
-	FixedEffect{typeof(g.groups), typeof(interaction)}(g.groups, interaction, g.ngroups)
+	# Store refs as Int32 (refs lie in [0, ngroups]) to halve the dominant memory stream read by the
+	# scatter/gather kernels on every solver iteration. GroupedArrays always builds Int64 groups (it
+	# needs signed sentinels during construction); narrowing here, where FixedEffect manufactures its
+	# own ref representation, lets every backend (CPU/GPU) and solve_coefficients! stream the smaller
+	# type. The rare ngroups > typemax(Int32) keeps the original integer type.
+	refs = g.ngroups > typemax(Int32) ? g.groups : convert(Vector{Int32}, g.groups)
+	FixedEffect{typeof(refs), typeof(interaction)}(refs, interaction, g.ngroups)
 end
 
 Base.show(io::IO, ::FixedEffect) = print(io, "Fixed Effects")
diff --git a/src/utils/lsmr.jl b/src/utils/lsmr.jl
index 7b003c3..a8543f7 100644
--- a/src/utils/lsmr.jl
+++ b/src/utils/lsmr.jl
@@ -5,6 +5,23 @@ struct ConvergenceHistory{T, R}
     residuals::R
 end
 
+# Fast 2-norm for the well-scaled length-N CPU vectors in LSMR.
+# `LinearAlgebra.norm` (BLAS nrm2 / generic_norm2) takes an overflow-safe scaled path that is
+# ~8x slower than a plain SIMD sum-of-squares. LSMR's u/v are unit-normalized every iteration
+# and the residual stays O(data), so the overflow guard is unnecessary here. The sum of squares
+# is accumulated in Float64 even for Float32 inputs: this is more accurate than the in-precision
+# BLAS `snrm2` (avoids ~sqrt(N)*eps(Float32) drift over millions of rows) at negligible cost.
+# Restricted to concrete `Vector{<:BlasFloat}` so GPU arrays (CuVector/MtlVector) and
+# FixedEffectCoefficients keep the generic `norm` (the fast path would trigger scalar GPU indexing).
+@inline function _norm2(x::Vector{T}) where {T<:Union{Float32,Float64}}
+    s = 0.0
+    @inbounds @simd for i in eachindex(x)
+        s += abs2(Float64(x[i]))
+    end
+    return T(sqrt(s))
+end
+_norm2(x) = norm(x)
+
 
 
 ##############################################################################
@@ -62,10 +79,10 @@ function lsmr!(x, A, b, v, h, hbar;
     conlim > 0 ? ctol = convert(Tr, inv(conlim)) : ctol = zero(Tr)
     # form the first vectors u and v (satisfy  β*u = b,  α*v = A'u)    
     u = mul!(b, A, x, -1, 1)
-    β = norm(u)    
+    β = _norm2(u)    
     β > 0 && rmul!(u, inv(β))    
     mul!(v, A', u, 1, 0)
-    α = norm(v)
+    α = _norm2(v)
     α > 0 && rmul!(v, inv(α))
     # Initialize variables for 1st iteration.
     ζbar = α * β
@@ -103,11 +120,11 @@ function lsmr!(x, A, b, v, h, hbar;
         while iter < maxiter
             iter += 1
             mul!(u, A, v, 1, -α)
-            β = norm(u)
+            β = _norm2(u)
             if β > 0
                 rmul!(u, inv(β))
                 mul!(v, A', u, 1, -β)
-                α = norm(v)
+                α = _norm2(v)
                 α > 0 && rmul!(v, inv(α))
             end
         
@@ -190,7 +207,7 @@ function lsmr!(x, A, b, v, h, hbar;
         
             # Compute norms for convergence testing.
             normAr  = abs(ζbar)
-            normx = norm(x)
+            normx = _norm2(x)
         
 
 
diff --git a/test/types.jl b/test/types.jl
index 0a38a87..2908a27 100644
--- a/test/types.jl
+++ b/test/types.jl
@@ -9,14 +9,14 @@ import Base.==
     @test sprint(show, fe1) == "Fixed Effects"
     @test sprint(show, MIME("text/plain"), fe1) == """
         Fixed Effects:
-          refs (10-element Vector{Int64}):
+          refs (10-element Vector{Int32}):
             [1, 2, 3, 4, 5, ... ]
           interaction (UnitWeights):
             none"""
     fe2 = FixedEffect(1:10, interaction=fill(1.23456789, 10))
     @test sprint(show, MIME("text/plain"), fe2) == """
         Fixed Effects:
-          refs (10-element Vector{Int64}):
+          refs (10-element Vector{Int32}):
             [1, 2, 3, 4, 5, ... ]
           interaction (10-element Vector{Float64}):
             [1.23457, 1.23457, 1.23457, 1.23457, 1.23457, ... ]"""