From 417bc132fb52333181b88172efde5d03a898956a Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 14 May 2026 19:52:33 +0000
Subject: [PATCH 01/12] tscore: SIMD ts::memcpy_tolower; use in URL.cc
 cache-key fast path

The bulk ASCII tolower loop used to canonicalize the scheme and host
portions of a URL before hashing into the cache key runs at ~1.5 GB/s
scalar (one byte and one ParseRules table lookup per iteration). The
work is trivially data-parallel and there is no per-byte branching, so
a SIMD kernel that lowercases a whole register at once gives a
straightforward speedup once the input is long enough to amortize the
vector setup.

Add a header-only helper ts::memcpy_tolower under
include/tscore/ink_memcpy_tolower.h with a compile-time-selected
cascade of SIMD bodies: 64-byte AVX-512BW, 32-byte AVX2, 16-byte SSE2
on x86_64, plus 16-byte NEON on ARMv8. Wider bodies fall through to
narrower drain loops, so the worst-case scalar tail is always <16
bytes. Selection is purely compile-time; runtime ifunc dispatch is
left for a follow-up.

The AVX-512BW body uses _mm512_mask_add_epi8 to fuse the conditional
"+0x20 where upper" into a single op, and a masked load/store handles
1..63 leftover bytes in a single SIMD pass (inspired by Tony Finch's
copytolower64.c, https://dotat.at/cgi/git/vectolower.git/). The whole
AVX-512BW block is gated at n >= 64 because the masked load/store has
~7 ns of fixed setup that loses to the narrower paths for short
inputs; below 64 bytes we fall through to the AVX2 + SSE2 cascade.

Semantics match the existing ParseRules::ink_tolower table exactly:
bytes in 'A'..'Z' map to 'a'..'z', all others (including 0x80..0xFF)
pass through unchanged.

Replace the static inline memcpy_tolower in src/proxy/hdrs/URL.cc with
this helper. Baseline x86_64 builds use the 16-byte SSE2 path; builds
that opt into a wider -march (x86-64-v3 = AVX2, x86-64-v4 = AVX-512BW)
get the wider bodies automatically. Sub-16-byte inputs (e.g. short
HTTP schemes like "http") use the scalar tail and see no perf change.

Measured throughput on a 2.0 GHz Ice Lake Xeon Gold 6338, mean ns:

  size   scalar   SSE2     AVX2     AVX-512BW
  ----   ------   ----     ----     ---------
  16 B   10.4     2.15     1.75     1.98
  32 B   15.4     2.90     2.24     2.31
  64 B   28.0     4.43     2.85     2.61
  256 B  113      13.87    7.57     6.20
  1024 B 425      50.47    24.23    17.49

Speedup vs scalar at 1024 B: SSE2 8.4x, AVX2 17.5x, AVX-512BW 24.3x.

A new microbenchmark under tools/benchmark covers correctness across
sizes 0..257 (bracketing each SIMD body size) plus an exhaustive byte-
value sweep that guards against any future widening of the case-fold
range, alongside scalar-vs-SIMD throughput numbers and a config-print
case that emits the selected ISA path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 include/tscore/ink_memcpy_tolower.h         | 163 +++++++++++++++++++
 src/proxy/hdrs/URL.cc                       |  15 +-
 tools/benchmark/CMakeLists.txt              |   3 +
 tools/benchmark/benchmark_memcpy_tolower.cc | 167 ++++++++++++++++++++
 4 files changed, 336 insertions(+), 12 deletions(-)
 create mode 100644 include/tscore/ink_memcpy_tolower.h
 create mode 100644 tools/benchmark/benchmark_memcpy_tolower.cc

diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_memcpy_tolower.h
new file mode 100644
index 00000000000..3c93559ae5e
--- /dev/null
+++ b/include/tscore/ink_memcpy_tolower.h
@@ -0,0 +1,163 @@
+/** @file
+
+  SIMD-accelerated bulk ASCII tolower copy.
+
+  Used on the URL canonicalization fast path for cache-key digests
+  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is
+  the bottleneck for hosts and schemes long enough to vectorize; for
+  shorter inputs the scalar tail handles them with no SIMD overhead.
+
+  Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():
+
+    - Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to
+      'a'..'z'. All other bytes (including 0x80..0xFF) pass through
+      unchanged. There is no UTF-8 case folding.
+
+    - The destination is written byte-for-byte; src and dst must point
+      to non-overlapping regions of size at least @n bytes.
+
+  Implementation note: the bodies are stacked widest-first and each
+  drains its block size before falling through to the next. A build
+  with AVX-512BW gets the 64-byte body as the main loop, then at most
+  one 32-byte AVX2 iteration and one 16-byte SSE2 iteration to drain
+  the remainder before the scalar tail handles 0-15 bytes. Builds
+  without the wider ISAs simply skip those blocks. Selection is purely
+  compile-time; no runtime dispatch.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__)
+#include <immintrin.h>
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+namespace ts
+{
+
+inline void
+memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
+{
+#if defined(__AVX512BW__)
+  // AVX-512BW: 64 bytes per iteration with two key optimizations over the
+  // narrower paths:
+  //   - _mm512_mask_add_epi8 fuses the "+0x20 where upper" into a single
+  //     op (no separate maskz_set1 + or).
+  //   - A masked load/store handles the 1..63-byte tail in a single SIMD
+  //     pass, so we don't need to cascade to AVX2/SSE2 to drain the
+  //     remainder.
+  //
+  // The masked tail does carry ~7 ns of fixed setup cost, which loses to
+  // the cascade on short inputs. Gating the whole block on n >= 64 means
+  // tiny inputs fall through to the AVX2/SSE2 path below, where they keep
+  // the speedup that path already provides.
+  //
+  // Inspired by Tony Finch's copytolower64.c
+  // (https://dotat.at/cgi/git/vectolower.git/).
+  if (n >= 64) {
+    const __m512i A_vec = _mm512_set1_epi8('A');
+    const __m512i Z_vec = _mm512_set1_epi8('Z');
+    const __m512i delta = _mm512_set1_epi8('a' - 'A');
+    do {
+      __m512i   bytes    = _mm512_loadu_epi8(src);
+      __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
+      _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
+      src += 64;
+      dst += 64;
+      n   -= 64;
+    } while (n >= 64);
+    if (n != 0) {
+      auto      len_mask = static_cast<__mmask64>((~0ULL) >> (64 - n));
+      __m512i   bytes    = _mm512_maskz_loadu_epi8(len_mask, src);
+      __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
+      _mm512_mask_storeu_epi8(dst, len_mask, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
+    }
+    return;
+  }
+#endif
+
+#if defined(__AVX2__)
+  // 32 bytes per iteration. Same compare-and-OR pattern as SSE2.
+  {
+    const __m256i a_minus_one = _mm256_set1_epi8('A' - 1);
+    const __m256i z_plus_one  = _mm256_set1_epi8('Z' + 1);
+    const __m256i bit5        = _mm256_set1_epi8(0x20);
+    while (n >= 32) {
+      __m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+      __m256i ge_A  = _mm256_cmpgt_epi8(bytes, a_minus_one);
+      __m256i le_Z  = _mm256_cmpgt_epi8(z_plus_one, bytes);
+      __m256i mask  = _mm256_and_si256(_mm256_and_si256(ge_A, le_Z), bit5);
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(bytes, mask));
+      src += 32;
+      dst += 32;
+      n   -= 32;
+    }
+  }
+#endif
+
+#if defined(__SSE2__)
+  // 16 bytes per iteration. Signed compare works for ASCII A-Z because all
+  // letters live below 0x80; high bytes (0x80..0xFF) compare as negative
+  // and correctly miss the [A,Z] range so they pass through unchanged.
+  {
+    const __m128i a_minus_one = _mm_set1_epi8('A' - 1);
+    const __m128i z_plus_one  = _mm_set1_epi8('Z' + 1);
+    const __m128i bit5        = _mm_set1_epi8(0x20);
+    while (n >= 16) {
+      __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+      __m128i ge_A  = _mm_cmpgt_epi8(bytes, a_minus_one);
+      __m128i le_Z  = _mm_cmpgt_epi8(z_plus_one, bytes);
+      __m128i mask  = _mm_and_si128(_mm_and_si128(ge_A, le_Z), bit5);
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(bytes, mask));
+      src += 16;
+      dst += 16;
+      n   -= 16;
+    }
+  }
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+  // 16 bytes per iteration; unsigned compare available natively.
+  {
+    const uint8x16_t a_minus_one = vdupq_n_u8('A' - 1);
+    const uint8x16_t z_plus_one  = vdupq_n_u8('Z' + 1);
+    const uint8x16_t bit5        = vdupq_n_u8(0x20);
+    while (n >= 16) {
+      uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
+      uint8x16_t ge_A  = vcgtq_u8(bytes, a_minus_one);
+      uint8x16_t le_Z  = vcltq_u8(bytes, z_plus_one);
+      uint8x16_t mask  = vandq_u8(vandq_u8(ge_A, le_Z), bit5);
+      vst1q_u8(reinterpret_cast<uint8_t *>(dst), vorrq_u8(bytes, mask));
+      src += 16;
+      dst += 16;
+      n   -= 16;
+    }
+  }
+#endif
+
+  while (n--) {
+    auto c = static_cast<unsigned char>(*src++);
+    *dst++ = (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : static_cast<char>(c);
+  }
+}
+
+} // namespace ts
diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc
index 68d84b5d481..c8eff4dd69f 100644
--- a/src/proxy/hdrs/URL.cc
+++ b/src/proxy/hdrs/URL.cc
@@ -25,6 +25,7 @@
 #include <new>
 #include "tscore/ink_platform.h"
 #include "tscore/ink_memory.h"
+#include "tscore/ink_memcpy_tolower.h"
 #include "proxy/hdrs/URL.h"
 #include "proxy/hdrs/MIME.h"
 #include "proxy/hdrs/HTTP.h"
@@ -1684,16 +1685,6 @@ url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */)
  *                                                                     *
  ***********************************************************************/
 
-static inline void
-memcpy_tolower(char *d, const char *s, int n)
-{
-  while (n--) {
-    *d = ParseRules::ink_tolower(*s);
-    s++;
-    d++;
-  }
-}
-
 // fast path for CryptoHash, HTTP, no user/password/params/query,
 // no buffer overflow, no unescaping needed
 
@@ -1704,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   char *p;
 
   p = buffer;
-  memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
+  ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
   p    += url->m_len_scheme;
   *p++  = ':';
   *p++  = '/';
@@ -1713,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   *p++ = ':';
   // no password
   *p++ = '@';
-  memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
+  ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
   p    += url->m_len_host;
   *p++  = '/';
   memcpy(p, url->m_ptr_path, url->m_len_path);
diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt
index 49f25fad1c1..84fc111925e 100644
--- a/tools/benchmark/CMakeLists.txt
+++ b/tools/benchmark/CMakeLists.txt
@@ -36,6 +36,9 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li
 add_executable(benchmark_Random benchmark_Random.cc)
 target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore)
 
+add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc)
+target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)
+
 add_executable(benchmark_HostDB benchmark_HostDB.cc)
 target_link_libraries(
   benchmark_HostDB
diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_memcpy_tolower.cc
new file mode 100644
index 00000000000..e2b45b48b5d
--- /dev/null
+++ b/tools/benchmark/benchmark_memcpy_tolower.cc
@@ -0,0 +1,167 @@
+/** @file
+
+  Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar
+  loop equivalent to the prior URL.cc::memcpy_tolower definition.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#define CATCH_CONFIG_ENABLE_BENCHMARKING
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/benchmark/catch_benchmark.hpp>
+
+#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ParseRules.h"
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+
+namespace
+{
+
+// Sizes chosen to mirror the URL.cc hot path:
+//   4-8B   - common HTTP scheme strings ("http", "https")
+//   16-32B - typical host names
+//   64-256B - long host names / cache-key segments
+//   1024B  - stress the inner loop
+constexpr std::array<std::size_t, 8> kSizes{4, 8, 16, 24, 32, 64, 256, 1024};
+
+// Same character distribution we expect from URL/host input: ASCII letters
+// (mixed case), digits, and the small set of non-alpha bytes that legitimately
+// appear in URLs.
+std::vector<char>
+make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL)
+{
+  std::mt19937_64   rng(seed);
+  std::vector<char> v(n);
+  for (std::size_t i = 0; i < n; ++i) {
+    auto r = static_cast<unsigned>(rng() & 0x3FU);
+    if (r < 26U) {
+      v[i] = static_cast<char>('A' + r);
+    } else if (r < 52U) {
+      v[i] = static_cast<char>('a' + (r - 26U));
+    } else {
+      static constexpr char kNonAlpha[] = "0123456789-_./:";
+      v[i]                              = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)];
+    }
+  }
+  return v;
+}
+
+// Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
+// as the baseline the SIMD path is expected to beat.
+inline void
+memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
+{
+  while (n--) {
+    *d = ParseRules::ink_tolower(*s);
+    ++s;
+    ++d;
+  }
+}
+
+} // namespace
+
+TEST_CASE("active SIMD configuration", "[tolower][config]")
+{
+  // Print the compile-time ISA path so the benchmark output makes the
+  // selected configuration obvious. Cascades stack: AVX-512BW builds also
+  // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain.
+  std::cout << "ts::memcpy_tolower compiled with: ";
+#if defined(__AVX512BW__)
+  std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
+#elif defined(__AVX2__)
+  std::cout << "AVX2 (32B body) + SSE2 (16B drain)";
+#elif defined(__SSE2__)
+  std::cout << "SSE2 (16B body)";
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+  std::cout << "NEON (16B body)";
+#else
+  std::cout << "scalar only";
+#endif
+  std::cout << '\n';
+  SUCCEED();
+}
+
+TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]")
+{
+  // Cover sizes that bracket the 16-byte SIMD body: smaller-than, equal-to,
+  // a couple of multiples, and several offsets between multiples.
+  for (std::size_t sz : std::array<std::size_t, 12>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257}) {
+    auto              input = make_mixed_case_ascii(sz, 0xC0FFEE + sz);
+    std::vector<char> expected(sz);
+    std::vector<char> actual(sz);
+
+    memcpy_tolower_scalar(expected.data(), input.data(), sz);
+    ts::memcpy_tolower(actual.data(), input.data(), sz);
+
+    CAPTURE(sz);
+    REQUIRE(actual == expected);
+  }
+}
+
+TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]")
+{
+  // Every byte value from 0..255 should round-trip unchanged unless it is in
+  // 'A'..'Z', in which case it should map to 'a'..'z'. This catches anyone
+  // who later tries to "speed things up" by widening the range to Latin-1.
+  std::array<unsigned char, 256> input;
+  for (std::size_t i = 0; i < 256; ++i) {
+    input[i] = static_cast<unsigned char>(i);
+  }
+  std::array<char, 256> output;
+  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
+
+  for (std::size_t i = 0; i < 256; ++i) {
+    auto in  = static_cast<unsigned char>(i);
+    auto out = static_cast<unsigned char>(output[i]);
+    auto exp = (in >= 'A' && in <= 'Z') ? static_cast<unsigned char>(in | 0x20) : in;
+    CAPTURE(i);
+    REQUIRE(out == exp);
+  }
+}
+
+TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
+{
+  for (std::size_t sz : kSizes) {
+    auto              input = make_mixed_case_ascii(sz);
+    std::vector<char> output_scalar(sz);
+    std::vector<char> output_simd(sz);
+
+    std::string scalar_name = "scalar  " + std::to_string(sz) + "B";
+    BENCHMARK(scalar_name.c_str())
+    {
+      memcpy_tolower_scalar(output_scalar.data(), input.data(), sz);
+      return output_scalar[0];
+    };
+
+    std::string simd_name = "ts::mct " + std::to_string(sz) + "B";
+    BENCHMARK(simd_name.c_str())
+    {
+      ts::memcpy_tolower(output_simd.data(), input.data(), sz);
+      return output_simd[0];
+    };
+  }
+}

From 5f7cc13363302d45d824d4a74552e0750f1f2326 Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 21 May 2026 20:18:47 +0000
Subject: [PATCH 02/12] tscore/QPACK: address #13167 review feedback

- Move ts::memcpy_tolower correctness coverage out of the
  ENABLE_BENCHMARKS-gated benchmark and into a new
  src/tscore/unit_tests/test_ink_memcpy_tolower.cc so ctest exercises
  the scalar and SIMD paths in every build. Covers boundary sizes
  bracketing each SIMD body width, the exhaustive 0..255 byte-value
  sweep, and the in-place (dst == src) form (Copilot).

- Fix the implementation-note comment on ts::memcpy_tolower to
  describe the actual AVX-512BW control flow (gated main loop +
  masked-tail load/store + early return), and document that in-place
  (dst == src) is supported on every path (Copilot).

- Add a Catch::Benchmark::keep_memory barrier in
  benchmark_memcpy_tolower so the compiler can no longer DCE the
  inlined stores past the first observed byte (Copilot).

- Migrate the in-place tolower loop in
  src/proxy/http3/QPACK.cc::_encode_header to ts::memcpy_tolower,
  demonstrating the in-place contract (bryancall).

- Add Tony Finch's copytolower64.c attribution to NOTICE
  (masaori335).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NOTICE                                        |   7 +
 include/tscore/ink_memcpy_tolower.h           |  31 ++--
 src/proxy/http3/QPACK.cc                      |   5 +-
 src/tscore/CMakeLists.txt                     |   1 +
 .../unit_tests/test_ink_memcpy_tolower.cc     | 133 ++++++++++++++++++
 tools/benchmark/benchmark_memcpy_tolower.cc   |  49 ++-----
 6 files changed, 173 insertions(+), 53 deletions(-)
 create mode 100644 src/tscore/unit_tests/test_ink_memcpy_tolower.cc

diff --git a/NOTICE b/NOTICE
index e1537312911..46ade352b22 100644
--- a/NOTICE
+++ b/NOTICE
@@ -133,3 +133,10 @@ Reference implementation: https://github.com/Thesys-lab/sosp23-s3fifo
 
 Highway is a C++ library that provides portable SIMD/vector intrinsics.
 https://github.com/google/highway
+
+~~
+
+include/tscore/ink_memcpy_tolower.h AVX-512BW kernel design (fused
+mask_add and masked-tail load/store) is adapted from Tony Finch's
+copytolower64.c (0BSD OR MIT-0).
+https://dotat.at/cgi/git/vectolower.git/
diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_memcpy_tolower.h
index 3c93559ae5e..578ef3606b5 100644
--- a/include/tscore/ink_memcpy_tolower.h
+++ b/include/tscore/ink_memcpy_tolower.h
@@ -13,16 +13,27 @@
       'a'..'z'. All other bytes (including 0x80..0xFF) pass through
       unchanged. There is no UTF-8 case folding.
 
-    - The destination is written byte-for-byte; src and dst must point
-      to non-overlapping regions of size at least @n bytes.
-
-  Implementation note: the bodies are stacked widest-first and each
-  drains its block size before falling through to the next. A build
-  with AVX-512BW gets the 64-byte body as the main loop, then at most
-  one 32-byte AVX2 iteration and one 16-byte SSE2 iteration to drain
-  the remainder before the scalar tail handles 0-15 bytes. Builds
-  without the wider ISAs simply skip those blocks. Selection is purely
-  compile-time; no runtime dispatch.
+    - In-place use (dst == src) is supported: every SIMD body loads a
+      full block into a register before storing back, and the AVX-512BW
+      masked tail uses masked-load/masked-store at the same offset.
+      Partial overlap where dst != src is not supported.
+
+  Implementation note: selection is purely compile-time; no runtime
+  dispatch. The bodies are stacked widest-first.
+
+    - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the
+      bulk and a single masked load/store finishes any 1..63-byte tail,
+      then we return. When n < 64, we fall through to the AVX2 + SSE2
+      cascade below so tiny inputs avoid the masked tail's fixed setup
+      cost.
+
+    - AVX2 builds: a 32-byte main loop drains to a 16-byte SSE2 step
+      and then to a scalar tail of 0..15 bytes.
+
+    - SSE2 / NEON builds: a single 16-byte main loop drains to a
+      scalar tail.
+
+    - Other targets: scalar only.
 
   @section license License
 
diff --git a/src/proxy/http3/QPACK.cc b/src/proxy/http3/QPACK.cc
index dfdd2d278b3..e5145ecf18d 100644
--- a/src/proxy/http3/QPACK.cc
+++ b/src/proxy/http3/QPACK.cc
@@ -25,6 +25,7 @@
 #include "proxy/hdrs/XPACK.h"
 #include "proxy/http3/QPACK.h"
 #include "tscore/ink_defs.h"
+#include "tscore/ink_memcpy_tolower.h"
 #include "tscore/ink_memory.h"
 
 #define QPACKDebug(fmt, ...)   Dbg(dbg_ctl_qpack, "[%s] " fmt, this->_qc->cids().data(), ##__VA_ARGS__)
@@ -369,9 +370,7 @@ QPACK::_encode_header(const MIMEField &field, uint16_t base_index, IOBufferBlock
 {
   auto  name{field.name_get()};
   char *lowered_name = this->_arena.str_store(name.data(), name.length());
-  for (size_t i = 0; i < name.length(); i++) {
-    lowered_name[i] = ParseRules::ink_tolower(lowered_name[i]);
-  }
+  ts::memcpy_tolower(lowered_name, lowered_name, name.length());
   auto value{field.value_get()};
 
   // TODO Set never_index flag on/off according to encoding headers
diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt
index 3d70052b199..ec90904fc9c 100644
--- a/src/tscore/CMakeLists.txt
+++ b/src/tscore/CMakeLists.txt
@@ -160,6 +160,7 @@ if(BUILD_TESTING)
     unit_tests/test_arena.cc
     unit_tests/test_ink_base64.cc
     unit_tests/test_ink_inet.cc
+    unit_tests/test_ink_memcpy_tolower.cc
     unit_tests/test_ink_memory.cc
     unit_tests/test_ink_string.cc
     unit_tests/test_ink_sys_control.cc
diff --git a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc b/src/tscore/unit_tests/test_ink_memcpy_tolower.cc
new file mode 100644
index 00000000000..1a33b13c681
--- /dev/null
+++ b/src/tscore/unit_tests/test_ink_memcpy_tolower.cc
@@ -0,0 +1,133 @@
+/** @file
+
+  Unit tests for ts::memcpy_tolower.
+
+  Runs as part of the standard test_tscore binary so the helper's SIMD
+  and scalar paths are exercised by ctest in every build, not just when
+  ENABLE_BENCHMARKS is set.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ParseRules.h"
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+namespace
+{
+
+// Same mixed-case ASCII distribution we use in the benchmark, so the unit
+// tests exercise inputs that look like real URL/header bytes.
+std::vector<char>
+make_mixed_case_ascii(std::size_t n, std::uint64_t seed)
+{
+  std::mt19937_64   rng(seed);
+  std::vector<char> v(n);
+  for (std::size_t i = 0; i < n; ++i) {
+    auto r = static_cast<unsigned>(rng() & 0x3FU);
+    if (r < 26U) {
+      v[i] = static_cast<char>('A' + r);
+    } else if (r < 52U) {
+      v[i] = static_cast<char>('a' + (r - 26U));
+    } else {
+      static constexpr char kNonAlpha[] = "0123456789-_./:";
+      v[i]                              = kNonAlpha[r % (sizeof(kNonAlpha) - 1U)];
+    }
+  }
+  return v;
+}
+
+// Byte-at-a-time reference, equivalent to the prior static-inline
+// memcpy_tolower in URL.cc. Anything ts::memcpy_tolower produces must match
+// this for every input we test.
+void
+memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept
+{
+  while (n--) {
+    *d = ParseRules::ink_tolower(*s);
+    ++s;
+    ++d;
+  }
+}
+
+} // namespace
+
+TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]")
+{
+  // Bracket every SIMD body width (16/32/64) with both equal-to and
+  // offset-from-multiple lengths so the cascade transitions and the
+  // AVX-512BW masked tail are all exercised.
+  for (std::size_t sz : std::array<std::size_t, 14>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) {
+    auto              input = make_mixed_case_ascii(sz, 0xC0FFEE + sz);
+    std::vector<char> expected(sz);
+    std::vector<char> actual(sz);
+
+    memcpy_tolower_reference(expected.data(), input.data(), sz);
+    ts::memcpy_tolower(actual.data(), input.data(), sz);
+
+    CAPTURE(sz);
+    REQUIRE(actual == expected);
+  }
+}
+
+TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
+{
+  // Every byte value 0..255 should round-trip unchanged unless it is in
+  // 'A'..'Z', in which case it should map to 'a'..'z'. Guards against any
+  // future "speed-up" that widens the case-fold range past ASCII.
+  std::array<unsigned char, 256> input;
+  for (std::size_t i = 0; i < 256; ++i) {
+    input[i] = static_cast<unsigned char>(i);
+  }
+  std::array<char, 256> output;
+  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
+
+  for (std::size_t i = 0; i < 256; ++i) {
+    auto in  = static_cast<unsigned char>(i);
+    auto out = static_cast<unsigned char>(output[i]);
+    auto exp = (in >= 'A' && in <= 'Z') ? static_cast<unsigned char>(in | 0x20) : in;
+    CAPTURE(i);
+    REQUIRE(out == exp);
+  }
+}
+
+TEST_CASE("ts::memcpy_tolower supports in-place (dst == src)", "[ts_memcpy_tolower]")
+{
+  // In-place use must match what an out-of-place call would have produced.
+  // Run across the same boundary sizes as the basic correctness case so the
+  // SIMD bodies and the AVX-512BW masked load/store are all exercised
+  // in-place.
+  for (std::size_t sz : std::array<std::size_t, 14>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) {
+    auto              input = make_mixed_case_ascii(sz, 0xBADF00D + sz);
+    std::vector<char> expected(sz);
+    std::vector<char> in_place(input);
+
+    memcpy_tolower_reference(expected.data(), input.data(), sz);
+    ts::memcpy_tolower(in_place.data(), in_place.data(), sz);
+
+    CAPTURE(sz);
+    REQUIRE(in_place == expected);
+  }
+}
diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_memcpy_tolower.cc
index e2b45b48b5d..079c0fdcb2f 100644
--- a/tools/benchmark/benchmark_memcpy_tolower.cc
+++ b/tools/benchmark/benchmark_memcpy_tolower.cc
@@ -26,6 +26,7 @@
 
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/benchmark/catch_benchmark.hpp>
+#include <catch2/benchmark/catch_optimizer.hpp>
 
 #include "tscore/ink_memcpy_tolower.h"
 #include "tscore/ParseRules.h"
@@ -87,8 +88,7 @@ memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
 TEST_CASE("active SIMD configuration", "[tolower][config]")
 {
   // Print the compile-time ISA path so the benchmark output makes the
-  // selected configuration obvious. Cascades stack: AVX-512BW builds also
-  // emit the AVX2 and SSE2 drain loops; AVX2 builds emit the SSE2 drain.
+  // selected configuration obvious.
   std::cout << "ts::memcpy_tolower compiled with: ";
 #if defined(__AVX512BW__)
   std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
@@ -105,44 +105,6 @@ TEST_CASE("active SIMD configuration", "[tolower][config]")
   SUCCEED();
 }
 
-TEST_CASE("ts::memcpy_tolower matches scalar reference", "[tolower][correctness]")
-{
-  // Cover sizes that bracket the 16-byte SIMD body: smaller-than, equal-to,
-  // a couple of multiples, and several offsets between multiples.
-  for (std::size_t sz : std::array<std::size_t, 12>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 64, 257}) {
-    auto              input = make_mixed_case_ascii(sz, 0xC0FFEE + sz);
-    std::vector<char> expected(sz);
-    std::vector<char> actual(sz);
-
-    memcpy_tolower_scalar(expected.data(), input.data(), sz);
-    ts::memcpy_tolower(actual.data(), input.data(), sz);
-
-    CAPTURE(sz);
-    REQUIRE(actual == expected);
-  }
-}
-
-TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[tolower][correctness]")
-{
-  // Every byte value from 0..255 should round-trip unchanged unless it is in
-  // 'A'..'Z', in which case it should map to 'a'..'z'. This catches anyone
-  // who later tries to "speed things up" by widening the range to Latin-1.
-  std::array<unsigned char, 256> input;
-  for (std::size_t i = 0; i < 256; ++i) {
-    input[i] = static_cast<unsigned char>(i);
-  }
-  std::array<char, 256> output;
-  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
-
-  for (std::size_t i = 0; i < 256; ++i) {
-    auto in  = static_cast<unsigned char>(i);
-    auto out = static_cast<unsigned char>(output[i]);
-    auto exp = (in >= 'A' && in <= 'Z') ? static_cast<unsigned char>(in | 0x20) : in;
-    CAPTURE(i);
-    REQUIRE(out == exp);
-  }
-}
-
 TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
 {
   for (std::size_t sz : kSizes) {
@@ -150,10 +112,16 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
     std::vector<char> output_scalar(sz);
     std::vector<char> output_simd(sz);
 
+    // Catch::Benchmark::keep_memory clobbers the buffer in the compiler's
+    // model, forcing it to materialize every byte we wrote. Without this an
+    // optimizing compiler can shrink or DCE the inline body's stores past
+    // the first element we observed.
+
     std::string scalar_name = "scalar  " + std::to_string(sz) + "B";
     BENCHMARK(scalar_name.c_str())
     {
       memcpy_tolower_scalar(output_scalar.data(), input.data(), sz);
+      Catch::Benchmark::keep_memory(output_scalar.data());
       return output_scalar[0];
     };
 
@@ -161,6 +129,7 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
     BENCHMARK(simd_name.c_str())
     {
       ts::memcpy_tolower(output_simd.data(), input.data(), sz);
+      Catch::Benchmark::keep_memory(output_simd.data());
       return output_simd[0];
     };
   }

From f489b14848172b7b40fe86dd6edd749bd0ade329 Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 21 May 2026 20:59:37 +0000
Subject: [PATCH 03/12] tscore: rename memcpy_tolower to
 ts::ascii::tolower_{copy,inplace}

memcpy_tolower carried two warts: the "memcpy" prefix implied
non-overlapping by convention with libc memcpy (we explicitly support
the in-place case), and the unqualified name didn't surface the
ASCII-only semantics. Rename the helper to ts::ascii::tolower_copy
and add a thin ts::ascii::tolower_inplace(buf, n) wrapper so call
sites that operate on a single buffer read naturally instead of
passing the same pointer twice.

Rename the header to include/tscore/ink_ascii_tolower.h, the unit
test to src/tscore/unit_tests/test_ink_ascii_tolower.cc, and the
benchmark to tools/benchmark/benchmark_ascii_tolower.cc to match.
Update the two existing call sites (URL.cc fast-path scheme/host and
QPACK::_encode_header in-place name lowercasing) accordingly. No
behavior change: the helper bodies are unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NOTICE                                        |  2 +-
 ...k_memcpy_tolower.h => ink_ascii_tolower.h} | 37 ++++++++++++-------
 src/proxy/hdrs/URL.cc                         |  6 +--
 src/proxy/http3/QPACK.cc                      |  4 +-
 src/tscore/CMakeLists.txt                     |  2 +-
 ...y_tolower.cc => test_ink_ascii_tolower.cc} | 33 ++++++++---------
 tools/benchmark/CMakeLists.txt                |  4 +-
 ..._tolower.cc => benchmark_ascii_tolower.cc} | 20 +++++-----
 8 files changed, 59 insertions(+), 49 deletions(-)
 rename include/tscore/{ink_memcpy_tolower.h => ink_ascii_tolower.h} (84%)
 rename src/tscore/unit_tests/{test_ink_memcpy_tolower.cc => test_ink_ascii_tolower.cc} (76%)
 rename tools/benchmark/{benchmark_memcpy_tolower.cc => benchmark_ascii_tolower.cc} (85%)

diff --git a/NOTICE b/NOTICE
index 46ade352b22..56a0a697d95 100644
--- a/NOTICE
+++ b/NOTICE
@@ -136,7 +136,7 @@ https://github.com/google/highway
 
 ~~
 
-include/tscore/ink_memcpy_tolower.h AVX-512BW kernel design (fused
+include/tscore/ink_ascii_tolower.h AVX-512BW kernel design (fused
 mask_add and masked-tail load/store) is adapted from Tony Finch's
 copytolower64.c (0BSD OR MIT-0).
 https://dotat.at/cgi/git/vectolower.git/
diff --git a/include/tscore/ink_memcpy_tolower.h b/include/tscore/ink_ascii_tolower.h
similarity index 84%
rename from include/tscore/ink_memcpy_tolower.h
rename to include/tscore/ink_ascii_tolower.h
index 578ef3606b5..d98aafb48dc 100644
--- a/include/tscore/ink_memcpy_tolower.h
+++ b/include/tscore/ink_ascii_tolower.h
@@ -3,9 +3,11 @@
   SIMD-accelerated bulk ASCII tolower copy.
 
   Used on the URL canonicalization fast path for cache-key digests
-  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast). The scalar loop is
-  the bottleneck for hosts and schemes long enough to vectorize; for
-  shorter inputs the scalar tail handles them with no SIMD overhead.
+  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast) and any other place
+  that needs to fold ASCII to lowercase over a small-to-moderate
+  buffer. The scalar byte-at-a-time loop is the bottleneck for hosts
+  and schemes long enough to vectorize; for shorter inputs the scalar
+  tail handles them with no SIMD overhead.
 
   Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():
 
@@ -13,13 +15,14 @@
       'a'..'z'. All other bytes (including 0x80..0xFF) pass through
       unchanged. There is no UTF-8 case folding.
 
-    - In-place use (dst == src) is supported: every SIMD body loads a
-      full block into a register before storing back, and the AVX-512BW
-      masked tail uses masked-load/masked-store at the same offset.
-      Partial overlap where dst != src is not supported.
+    - In-place use (dst == src) is supported on every path. Each SIMD
+      body loads a full block into a register before storing back at
+      the same offset, and the AVX-512BW masked tail does masked-load
+      / masked-store at the same offset. Partial overlap where
+      dst != src but the ranges intersect is not supported.
 
   Implementation note: selection is purely compile-time; no runtime
-  dispatch. The bodies are stacked widest-first.
+  dispatch. Bodies are stacked widest-first.
 
     - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the
       bulk and a single masked load/store finishes any 1..63-byte tail,
@@ -64,11 +67,11 @@
 #include <arm_neon.h>
 #endif
 
-namespace ts
+namespace ts::ascii
 {
 
 inline void
-memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
+tolower_copy(char *dst, const char *src, std::size_t n) noexcept
 {
 #if defined(__AVX512BW__)
   // AVX-512BW: 64 bytes per iteration with two key optimizations over the
@@ -84,8 +87,7 @@ memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
   // tiny inputs fall through to the AVX2/SSE2 path below, where they keep
   // the speedup that path already provides.
   //
-  // Inspired by Tony Finch's copytolower64.c
-  // (https://dotat.at/cgi/git/vectolower.git/).
+  // Adapted from Tony Finch's copytolower64.c (see NOTICE).
   if (n >= 64) {
     const __m512i A_vec = _mm512_set1_epi8('A');
     const __m512i Z_vec = _mm512_set1_epi8('Z');
@@ -171,4 +173,13 @@ memcpy_tolower(char *dst, const char *src, std::size_t n) noexcept
   }
 }
 
-} // namespace ts
+// Thin sugar over tolower_copy for the in-place case. Makes call sites
+// like ts::ascii::tolower_inplace(buf, n) read naturally instead of
+// ts::ascii::tolower_copy(buf, buf, n).
+inline void
+tolower_inplace(char *buf, std::size_t n) noexcept
+{
+  tolower_copy(buf, buf, n);
+}
+
+} // namespace ts::ascii
diff --git a/src/proxy/hdrs/URL.cc b/src/proxy/hdrs/URL.cc
index c8eff4dd69f..1c9eb4170b2 100644
--- a/src/proxy/hdrs/URL.cc
+++ b/src/proxy/hdrs/URL.cc
@@ -25,7 +25,7 @@
 #include <new>
 #include "tscore/ink_platform.h"
 #include "tscore/ink_memory.h"
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "proxy/hdrs/URL.h"
 #include "proxy/hdrs/MIME.h"
 #include "proxy/hdrs/HTTP.h"
@@ -1695,7 +1695,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   char *p;
 
   p = buffer;
-  ts::memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
+  ts::ascii::tolower_copy(p, url->m_ptr_scheme, url->m_len_scheme);
   p    += url->m_len_scheme;
   *p++  = ':';
   *p++  = '/';
@@ -1704,7 +1704,7 @@ url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash
   *p++ = ':';
   // no password
   *p++ = '@';
-  ts::memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
+  ts::ascii::tolower_copy(p, url->m_ptr_host, url->m_len_host);
   p    += url->m_len_host;
   *p++  = '/';
   memcpy(p, url->m_ptr_path, url->m_len_path);
diff --git a/src/proxy/http3/QPACK.cc b/src/proxy/http3/QPACK.cc
index e5145ecf18d..92ac2b024f6 100644
--- a/src/proxy/http3/QPACK.cc
+++ b/src/proxy/http3/QPACK.cc
@@ -25,7 +25,7 @@
 #include "proxy/hdrs/XPACK.h"
 #include "proxy/http3/QPACK.h"
 #include "tscore/ink_defs.h"
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "tscore/ink_memory.h"
 
 #define QPACKDebug(fmt, ...)   Dbg(dbg_ctl_qpack, "[%s] " fmt, this->_qc->cids().data(), ##__VA_ARGS__)
@@ -370,7 +370,7 @@ QPACK::_encode_header(const MIMEField &field, uint16_t base_index, IOBufferBlock
 {
   auto  name{field.name_get()};
   char *lowered_name = this->_arena.str_store(name.data(), name.length());
-  ts::memcpy_tolower(lowered_name, lowered_name, name.length());
+  ts::ascii::tolower_inplace(lowered_name, name.length());
   auto value{field.value_get()};
 
   // TODO Set never_index flag on/off according to encoding headers
diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt
index ec90904fc9c..c9d5536c4cb 100644
--- a/src/tscore/CMakeLists.txt
+++ b/src/tscore/CMakeLists.txt
@@ -159,8 +159,8 @@ if(BUILD_TESTING)
     unit_tests/test_Tokenizer.cc
     unit_tests/test_arena.cc
     unit_tests/test_ink_base64.cc
+    unit_tests/test_ink_ascii_tolower.cc
     unit_tests/test_ink_inet.cc
-    unit_tests/test_ink_memcpy_tolower.cc
     unit_tests/test_ink_memory.cc
     unit_tests/test_ink_string.cc
     unit_tests/test_ink_sys_control.cc
diff --git a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc b/src/tscore/unit_tests/test_ink_ascii_tolower.cc
similarity index 76%
rename from src/tscore/unit_tests/test_ink_memcpy_tolower.cc
rename to src/tscore/unit_tests/test_ink_ascii_tolower.cc
index 1a33b13c681..5f2cb8df3c9 100644
--- a/src/tscore/unit_tests/test_ink_memcpy_tolower.cc
+++ b/src/tscore/unit_tests/test_ink_ascii_tolower.cc
@@ -1,6 +1,6 @@
 /** @file
 
-  Unit tests for ts::memcpy_tolower.
+  Unit tests for ts::ascii::tolower_copy and ts::ascii::tolower_inplace.
 
   Runs as part of the standard test_tscore binary so the helper's SIMD
   and scalar paths are exercised by ctest in every build, not just when
@@ -27,7 +27,7 @@
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "tscore/ParseRules.h"
 
 #include <array>
@@ -60,10 +60,10 @@ make_mixed_case_ascii(std::size_t n, std::uint64_t seed)
 }
 
 // Byte-at-a-time reference, equivalent to the prior static-inline
-// memcpy_tolower in URL.cc. Anything ts::memcpy_tolower produces must match
-// this for every input we test.
+// memcpy_tolower in URL.cc. Anything ts::ascii::tolower_copy produces must
+// match this for every input we test.
 void
-memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept
+tolower_reference(char *d, const char *s, std::size_t n) noexcept
 {
   while (n--) {
     *d = ParseRules::ink_tolower(*s);
@@ -74,7 +74,7 @@ memcpy_tolower_reference(char *d, const char *s, std::size_t n) noexcept
 
 } // namespace
 
-TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]")
+TEST_CASE("ts::ascii::tolower_copy matches scalar reference", "[ts_ascii_tolower]")
 {
   // Bracket every SIMD body width (16/32/64) with both equal-to and
   // offset-from-multiple lengths so the cascade transitions and the
@@ -84,15 +84,15 @@ TEST_CASE("ts::memcpy_tolower matches scalar reference", "[ts_memcpy_tolower]")
     std::vector<char> expected(sz);
     std::vector<char> actual(sz);
 
-    memcpy_tolower_reference(expected.data(), input.data(), sz);
-    ts::memcpy_tolower(actual.data(), input.data(), sz);
+    tolower_reference(expected.data(), input.data(), sz);
+    ts::ascii::tolower_copy(actual.data(), input.data(), sz);
 
     CAPTURE(sz);
     REQUIRE(actual == expected);
   }
 }
 
-TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
+TEST_CASE("ts::ascii::tolower_copy preserves non-ASCII bytes", "[ts_ascii_tolower]")
 {
   // Every byte value 0..255 should round-trip unchanged unless it is in
   // 'A'..'Z', in which case it should map to 'a'..'z'. Guards against any
@@ -102,7 +102,7 @@ TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
     input[i] = static_cast<unsigned char>(i);
   }
   std::array<char, 256> output;
-  ts::memcpy_tolower(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
+  ts::ascii::tolower_copy(output.data(), reinterpret_cast<const char *>(input.data()), input.size());
 
   for (std::size_t i = 0; i < 256; ++i) {
     auto in  = static_cast<unsigned char>(i);
@@ -113,19 +113,18 @@ TEST_CASE("ts::memcpy_tolower preserves non-ASCII bytes", "[ts_memcpy_tolower]")
   }
 }
 
-TEST_CASE("ts::memcpy_tolower supports in-place (dst == src)", "[ts_memcpy_tolower]")
+TEST_CASE("ts::ascii::tolower_inplace matches tolower_copy", "[ts_ascii_tolower]")
 {
-  // In-place use must match what an out-of-place call would have produced.
-  // Run across the same boundary sizes as the basic correctness case so the
-  // SIMD bodies and the AVX-512BW masked load/store are all exercised
-  // in-place.
+  // The inplace form must produce the same result as a non-overlapping copy.
+  // Exercise the same boundary sizes so the SIMD bodies and the AVX-512BW
+  // masked load/store are all exercised in-place.
   for (std::size_t sz : std::array<std::size_t, 14>{0, 1, 5, 15, 16, 17, 23, 31, 32, 33, 63, 64, 65, 257}) {
     auto              input = make_mixed_case_ascii(sz, 0xBADF00D + sz);
     std::vector<char> expected(sz);
     std::vector<char> in_place(input);
 
-    memcpy_tolower_reference(expected.data(), input.data(), sz);
-    ts::memcpy_tolower(in_place.data(), in_place.data(), sz);
+    tolower_reference(expected.data(), input.data(), sz);
+    ts::ascii::tolower_inplace(in_place.data(), sz);
 
     CAPTURE(sz);
     REQUIRE(in_place == expected);
diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt
index 84fc111925e..c08e9a94d12 100644
--- a/tools/benchmark/CMakeLists.txt
+++ b/tools/benchmark/CMakeLists.txt
@@ -36,8 +36,8 @@ target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore li
 add_executable(benchmark_Random benchmark_Random.cc)
 target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore)
 
-add_executable(benchmark_memcpy_tolower benchmark_memcpy_tolower.cc)
-target_link_libraries(benchmark_memcpy_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)
+add_executable(benchmark_ascii_tolower benchmark_ascii_tolower.cc)
+target_link_libraries(benchmark_ascii_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)
 
 add_executable(benchmark_HostDB benchmark_HostDB.cc)
 target_link_libraries(
diff --git a/tools/benchmark/benchmark_memcpy_tolower.cc b/tools/benchmark/benchmark_ascii_tolower.cc
similarity index 85%
rename from tools/benchmark/benchmark_memcpy_tolower.cc
rename to tools/benchmark/benchmark_ascii_tolower.cc
index 079c0fdcb2f..4c799713c99 100644
--- a/tools/benchmark/benchmark_memcpy_tolower.cc
+++ b/tools/benchmark/benchmark_ascii_tolower.cc
@@ -1,7 +1,7 @@
 /** @file
 
-  Micro benchmark for ts::memcpy_tolower against a byte-at-a-time scalar
-  loop equivalent to the prior URL.cc::memcpy_tolower definition.
+  Micro benchmark for ts::ascii::tolower_copy against a byte-at-a-time
+  scalar loop equivalent to the prior URL.cc::memcpy_tolower definition.
 
   @section license License
 
@@ -28,7 +28,7 @@
 #include <catch2/benchmark/catch_benchmark.hpp>
 #include <catch2/benchmark/catch_optimizer.hpp>
 
-#include "tscore/ink_memcpy_tolower.h"
+#include "tscore/ink_ascii_tolower.h"
 #include "tscore/ParseRules.h"
 
 #include <array>
@@ -74,7 +74,7 @@ make_mixed_case_ascii(std::size_t n, std::uint64_t seed = 0xABCDEFULL)
 // Mirror of the prior static inline memcpy_tolower() from URL.cc, kept here
 // as the baseline the SIMD path is expected to beat.
 inline void
-memcpy_tolower_scalar(char *d, const char *s, std::size_t n) noexcept
+tolower_scalar(char *d, const char *s, std::size_t n) noexcept
 {
   while (n--) {
     *d = ParseRules::ink_tolower(*s);
@@ -89,7 +89,7 @@ TEST_CASE("active SIMD configuration", "[tolower][config]")
 {
   // Print the compile-time ISA path so the benchmark output makes the
   // selected configuration obvious.
-  std::cout << "ts::memcpy_tolower compiled with: ";
+  std::cout << "ts::ascii::tolower_copy compiled with: ";
 #if defined(__AVX512BW__)
   std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
 #elif defined(__AVX2__)
@@ -105,7 +105,7 @@ TEST_CASE("active SIMD configuration", "[tolower][config]")
   SUCCEED();
 }
 
-TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
+TEST_CASE("tolower throughput", "[bench][tolower]")
 {
   for (std::size_t sz : kSizes) {
     auto              input = make_mixed_case_ascii(sz);
@@ -117,18 +117,18 @@ TEST_CASE("memcpy_tolower throughput", "[bench][tolower]")
     // optimizing compiler can shrink or DCE the inline body's stores past
     // the first element we observed.
 
-    std::string scalar_name = "scalar  " + std::to_string(sz) + "B";
+    std::string scalar_name = "scalar   " + std::to_string(sz) + "B";
     BENCHMARK(scalar_name.c_str())
     {
-      memcpy_tolower_scalar(output_scalar.data(), input.data(), sz);
+      tolower_scalar(output_scalar.data(), input.data(), sz);
       Catch::Benchmark::keep_memory(output_scalar.data());
       return output_scalar[0];
     };
 
-    std::string simd_name = "ts::mct " + std::to_string(sz) + "B";
+    std::string simd_name = "ts::atc  " + std::to_string(sz) + "B";
     BENCHMARK(simd_name.c_str())
     {
-      ts::memcpy_tolower(output_simd.data(), input.data(), sz);
+      ts::ascii::tolower_copy(output_simd.data(), input.data(), sz);
       Catch::Benchmark::keep_memory(output_simd.data());
       return output_simd[0];
     };

From 6286d13a6eff6b6d37cc599a8e8aa0db1c139038 Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 21 May 2026 21:17:31 +0000
Subject: [PATCH 04/12] proxy: migrate HPACK / UrlRewrite tolower loops and add
 behavioral tests

Migrate two more byte-at-a-time ASCII tolower loops to
ts::ascii::tolower_copy. Both call sites use a separate destination
buffer, so the copy form is the right fit:

- hpack_encode_header_block(): lower-cases each MIMEField name before
  encoding to match the HTTP/2 lowercase-header-name requirement.

- UrlRewrite::_mappingLookup(): lower-cases the incoming request host
  into a stack buffer before the table lookup, so the lookup is
  case-insensitive against the lower-cased keys built at config-load
  time. The previous code used libc tolower(int) on signed char values,
  which is technically UB for bytes >= 0x80; the new call avoids that.

The existing unit tests in test_URL, test_HpackIndexingTable, and
test_RemapRules executed the tolower paths but only with inputs that
were already lower-case, so they would have missed a "skip the
lowercasing" regression. Add focused behavioral coverage:

- test_URL.cc: four extra get_hash_test_cases that hash a request with
  uppercase/mixed-case scheme or host and require an equal hash to the
  lower-case form. Includes a 49-byte uppercase host that crosses both
  the 16- and 32-byte SIMD bodies.

- test_RemapRules.cc: a new SCENARIO that builds a UrlRewrite from a
  map for a lower-case host and requires that uppercase, mixed-case,
  and long-uppercase request hosts all match.

- test_HpackIndexingTable.cc: a new TEST_CASE that encodes a long
  mixed-case field name with hpack_encode_header_block and requires
  the encoded byte stream to be identical to encoding the same field
  with an already-lower-case name.

QPACK already exercises the in-place path through its Encoding test
and the helper's own ts::ascii::tolower_inplace unit test covers
in-place semantics exhaustively; an additional focused QPACK test
would need the external .qif fixture infrastructure, which is out of
scope here.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/proxy/hdrs/unit_tests/test_URL.cc         | 33 +++++++++++
 src/proxy/http/remap/UrlRewrite.cc            |  7 +--
 .../http/remap/unit-tests/test_RemapRules.cc  | 55 +++++++++++++++++++
 src/proxy/http2/HPACK.cc                      |  5 +-
 .../unit_tests/test_HpackIndexingTable.cc     | 32 +++++++++++
 5 files changed, 125 insertions(+), 7 deletions(-)

diff --git a/src/proxy/hdrs/unit_tests/test_URL.cc b/src/proxy/hdrs/unit_tests/test_URL.cc
index dc5ff4ade74..8dda4e9501a 100644
--- a/src/proxy/hdrs/unit_tests/test_URL.cc
+++ b/src/proxy/hdrs/unit_tests/test_URL.cc
@@ -659,6 +659,39 @@ std::vector<get_hash_test_case> get_hash_test_cases = {
     !IGNORE_QUERY,
     HAS_EQUAL_HASH,
   },
+  {
+    // Verifies the scheme/host SIMD-tolower path in url_CryptoHash_get_fast:
+    // an uppercase host with a long enough prefix to hit the 16-byte SIMD
+    // body should hash identically to its lowercased form.
+    "Uppercase host: equal hashes",
+    "http://ONE.EXAMPLE.COM/a/path?name=value",
+    "http://one.example.com/a/path?name=value",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Mixed-case host: equal hashes",
+    "http://One.Example.Com/a/path?name=value",
+    "http://one.example.com/a/path?name=value",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Uppercase scheme: equal hashes",
+    "HTTP://one.example.com/a/path?name=value",
+    "http://one.example.com/a/path?name=value",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    // Long uppercase host crosses 16- and 32-byte SIMD body boundaries so
+    // the wider paths (when compiled in) are exercised by this fixture.
+    "Long uppercase host: equal hashes",
+    "http://A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM/a/path",
+    "http://a-very-long-host-name-for-simd.example.com/a/path",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
 };
 
 /** Return the hash related to a URI.
diff --git a/src/proxy/http/remap/UrlRewrite.cc b/src/proxy/http/remap/UrlRewrite.cc
index fbda217462b..d2af3b343e9 100644
--- a/src/proxy/http/remap/UrlRewrite.cc
+++ b/src/proxy/http/remap/UrlRewrite.cc
@@ -22,6 +22,8 @@
 
  */
 
+#include "tscore/ink_ascii_tolower.h"
+
 #include "proxy/http/remap/UrlRewrite.h"
 #include "proxy/http/remap/RemapYamlConfig.h"
 #include "iocore/eventsystem/ConfigProcessor.h"
@@ -931,10 +933,7 @@ UrlRewrite::_mappingLookup(MappingsStore &mappings, URL *request_url, int reques
     return false;
   }
 
-  // lowercase
-  for (int i = 0; i < request_host_len; ++i) {
-    request_host_lower[i] = tolower(request_host[i]);
-  }
+  ts::ascii::tolower_copy(request_host_lower, request_host, request_host_len);
   request_host_lower[request_host_len] = 0;
 
   bool         retval       = false;
diff --git a/src/proxy/http/remap/unit-tests/test_RemapRules.cc b/src/proxy/http/remap/unit-tests/test_RemapRules.cc
index f51e39d0dd6..35ae9c5e8c7 100644
--- a/src/proxy/http/remap/unit-tests/test_RemapRules.cc
+++ b/src/proxy/http/remap/unit-tests/test_RemapRules.cc
@@ -269,3 +269,58 @@ map_with_recv_port http://front.example.com \
     }
   }
 }
+
+SCENARIO("UrlRewrite host lookup is case-insensitive", "[proxy][remap]")
+{
+  // _mappingLookup lower-cases the request host before consulting the hash
+  // table; these scenarios exercise that path with inputs that would not
+  // match in a strict byte-compare. Sized to cross the 16-byte SSE2 body
+  // for hosts that get a real SIMD pass.
+  GIVEN("A forward map with a lowercase source host")
+  {
+    auto        urlrw  = std::make_unique<UrlRewrite>();
+    std::string config = R"RMCFG(
+map http://www.example.com http://origin.example.com
+  )RMCFG";
+
+    auto cpath = write_test_remap(config, "case_insensitive");
+    int  rc    = urlrw->BuildTable(cpath.c_str());
+    REQUIRE(rc == TS_SUCCESS);
+    REQUIRE(urlrw->rule_count() == 1);
+
+    EasyURL             url("http://www.example.com");
+    UrlMappingContainer urlmap;
+
+    THEN("uppercase request host matches the lowercase rule")
+    {
+      const char *host = "WWW.EXAMPLE.COM";
+      REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
+    }
+    THEN("mixed-case request host matches the lowercase rule")
+    {
+      const char *host = "Www.Example.Com";
+      REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
+    }
+  }
+
+  GIVEN("A forward map with a long host that exercises the 16-byte SIMD body")
+  {
+    auto        urlrw  = std::make_unique<UrlRewrite>();
+    std::string config = R"RMCFG(
+map http://a-very-long-host-name-for-simd.example.com http://origin.example.com
+  )RMCFG";
+
+    auto cpath = write_test_remap(config, "case_insensitive_long");
+    int  rc    = urlrw->BuildTable(cpath.c_str());
+    REQUIRE(rc == TS_SUCCESS);
+
+    EasyURL             url("http://a-very-long-host-name-for-simd.example.com");
+    UrlMappingContainer urlmap;
+
+    THEN("an all-uppercase 49-char host (covers >=32 SIMD bytes) matches")
+    {
+      const char *host = "A-VERY-LONG-HOST-NAME-FOR-SIMD.EXAMPLE.COM";
+      REQUIRE(urlrw->forwardMappingLookup(&url.url, 80, host, strlen(host), urlmap));
+    }
+  }
+}
diff --git a/src/proxy/http2/HPACK.cc b/src/proxy/http2/HPACK.cc
index 7e4fd974f57..34ff607ced2 100644
--- a/src/proxy/http2/HPACK.cc
+++ b/src/proxy/http2/HPACK.cc
@@ -23,6 +23,7 @@
 
 #include "proxy/http2/HPACK.h"
 
+#include "tscore/ink_ascii_tolower.h"
 #include "tsutil/LocalBuffer.h"
 #include "swoc/TextView.h"
 
@@ -789,9 +790,7 @@ hpack_encode_header_block(HpackIndexingTable &indexing_table, uint8_t *out_buf,
     int                   name_len      = original_name.size();
     ts::LocalBuffer<char> local_buffer(name_len);
     char                 *lower_name = local_buffer.data();
-    for (int i = 0; i < name_len; i++) {
-      lower_name[i] = ParseRules::ink_tolower(original_name[i]);
-    }
+    ts::ascii::tolower_copy(lower_name, original_name.data(), name_len);
 
     std::string_view name{lower_name, static_cast<size_t>(name_len)};
     std::string_view value = field.value_get();
diff --git a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
index ad373211fb8..7692931b1af 100644
--- a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
+++ b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
@@ -24,6 +24,7 @@
     limitations under the License.
  */
 
+#include <cstring>
 #include <memory>
 #include <string_view>
 
@@ -531,3 +532,34 @@ TEST_CASE("HPACK high level APIs", "[hpack]")
     }
   }
 }
+
+// Validates that hpack_encode_header_block() lower-cases mixed-case field
+// names per RFC 7540 § 8.1.2 before emitting them. The lower-case step is the
+// path that goes through ts::ascii::tolower_copy; if a regression broke the
+// lowercasing, the byte-for-byte comparison below would fail.
+TEST_CASE("HPACK encode lower-cases mixed-case field names", "[hpack]")
+{
+  uint8_t            buf_mixed[BUFSIZE_FOR_REGRESSION_TEST];
+  uint8_t            buf_lower[BUFSIZE_FOR_REGRESSION_TEST];
+  HpackIndexingTable table_mixed(MAX_TABLE_SIZE);
+  HpackIndexingTable table_lower(MAX_TABLE_SIZE);
+
+  // Use a name long enough to exercise the 16-byte SSE2 body when present.
+  auto encode_one = [](HpackIndexingTable &table, uint8_t *buf, const char *name, const char *value) -> int64_t {
+    std::unique_ptr<HTTPHdr, void (*)(HTTPHdr *)> headers(new HTTPHdr, destroy_http_hdr);
+    headers->create(HTTPType::REQUEST);
+    MIMEField *field = mime_field_create(headers->m_heap, headers->m_http->m_fields_impl);
+    field->name_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{name});
+    field->value_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{value});
+    mime_hdr_field_attach(headers->m_http->m_fields_impl, field, 1, nullptr);
+    std::memset(buf, 0, BUFSIZE_FOR_REGRESSION_TEST);
+    return hpack_encode_header_block(table, buf, BUFSIZE_FOR_REGRESSION_TEST, headers.get());
+  };
+
+  int64_t mixed_len = encode_one(table_mixed, buf_mixed, "Long-Custom-Header-Name", "abc");
+  int64_t lower_len = encode_one(table_lower, buf_lower, "long-custom-header-name", "abc");
+
+  REQUIRE(mixed_len > 0);
+  REQUIRE(mixed_len == lower_len);
+  REQUIRE(std::memcmp(buf_mixed, buf_lower, lower_len) == 0);
+}

From 2df3844250eedb1f02944cbd29ef8ac2aac9f041 Mon Sep 17 00:00:00 2001
From: Phong Nguyen <phongn@gmail.com>
Date: Thu, 28 May 2026 19:58:52 +0000
Subject: [PATCH 05/12] Implement ascii::to_lower using Google Highway's SIMD
 library

---
 CMakeLists.txt                             |   8 ++
 include/tscore/ink_ascii_tolower.h         |  49 ++++++-
 include/tscore/ink_config.h.cmake.in       |   1 +
 src/tscore/CMakeLists.txt                  |  12 ++
 src/tscore/ink_ascii_tolower_dispatch.cc   | 147 +++++++++++++++++++++
 tools/benchmark/benchmark_ascii_tolower.cc |  20 +--
 6 files changed, 226 insertions(+), 11 deletions(-)
 create mode 100644 src/tscore/ink_ascii_tolower_dispatch.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d28c3bd289c..6c3d1498cc2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,6 +216,9 @@ set(ENABLE_TPROXY
         anything else to enable."
 )
 option(ENABLE_QUICHE "Use quiche (default OFF)")
+option(ENABLE_HIGHWAY_DISPATCH
+       "Use Google Highway runtime dispatch for SIMD kernels (default OFF, default cascade compile-time selection)"
+)
 
 option(ENABLE_EXAMPLE "Build example directory (default OFF)")
 set(TS_MAX_HOST_NAME_LEN
@@ -327,6 +330,11 @@ elseif(TS_HAS_MIMALLOC)
   link_libraries(mimalloc)
 endif()
 
+if(ENABLE_HIGHWAY_DISPATCH)
+  find_package(hwy CONFIG REQUIRED)
+  set(TS_HAS_HIGHWAY_DISPATCH ${hwy_FOUND})
+endif()
+
 if(ENABLE_QUICHE)
   find_package(quiche REQUIRED)
 
diff --git a/include/tscore/ink_ascii_tolower.h b/include/tscore/ink_ascii_tolower.h
index d98aafb48dc..7159a1de6ae 100644
--- a/include/tscore/ink_ascii_tolower.h
+++ b/include/tscore/ink_ascii_tolower.h
@@ -21,8 +21,24 @@
       / masked-store at the same offset. Partial overlap where
       dst != src but the ranges intersect is not supported.
 
-  Implementation note: selection is purely compile-time; no runtime
-  dispatch. Bodies are stacked widest-first.
+  Two implementations exist, gated by the ENABLE_HIGHWAY_DISPATCH
+  CMake option (TS_HAS_HIGHWAY_DISPATCH at compile time):
+
+    - OFF (default): the compile-time cascade defined below. Selection
+      is purely compile-time; no runtime dispatch. Body is fully
+      inlined into every caller. Best when the build target is broad
+      enough to enable the widest SIMD the production hosts support.
+
+    - ON: an out-of-line dispatched implementation in
+      src/tscore/ink_ascii_tolower_dispatch.cc, built against Google
+      Highway. The header degenerates to a one-line forward shim. At
+      runtime the highest SIMD target supported by the live CPU is
+      selected once and cached. Best when the build target is
+      intentionally conservative (e.g. -march=westmere) but the
+      production CPU fleet is heterogeneous.
+
+  Compile-time cascade (default-off path). Bodies are stacked
+  widest-first.
 
     - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the
       bulk and a single masked load/store finishes any 1..63-byte tail,
@@ -61,6 +77,33 @@
 #include <cstddef>
 #include <cstdint>
 
+#include "tscore/ink_config.h"
+
+#if TS_HAS_HIGHWAY_DISPATCH
+
+namespace ts::ascii
+{
+
+// Out-of-line, runtime-dispatched implementation. Defined in
+// src/tscore/ink_ascii_tolower_dispatch.cc via Highway HWY_EXPORT.
+void tolower_copy_dispatch(char *dst, const char *src, std::size_t n) noexcept;
+
+inline void
+tolower_copy(char *dst, const char *src, std::size_t n) noexcept
+{
+  tolower_copy_dispatch(dst, src, n);
+}
+
+inline void
+tolower_inplace(char *buf, std::size_t n) noexcept
+{
+  tolower_copy_dispatch(buf, buf, n);
+}
+
+} // namespace ts::ascii
+
+#else // !TS_HAS_HIGHWAY_DISPATCH — compile-time cascade
+
 #if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__)
 #include <immintrin.h>
 #elif defined(__ARM_NEON) || defined(__aarch64__)
@@ -183,3 +226,5 @@ tolower_inplace(char *buf, std::size_t n) noexcept
 }
 
 } // namespace ts::ascii
+
+#endif // TS_HAS_HIGHWAY_DISPATCH
diff --git a/include/tscore/ink_config.h.cmake.in b/include/tscore/ink_config.h.cmake.in
index 73c8b860fb9..9867f06a56e 100644
--- a/include/tscore/ink_config.h.cmake.in
+++ b/include/tscore/ink_config.h.cmake.in
@@ -140,6 +140,7 @@ const int DEFAULT_STACKSIZE = @DEFAULT_STACK_SIZE@;
 /* Feature Flags */
 #cmakedefine01 TS_HAS_128BIT_CAS
 #cmakedefine01 TS_HAS_BACKTRACE
+#cmakedefine01 TS_HAS_HIGHWAY_DISPATCH
 #cmakedefine01 TS_HAS_IN6_IS_ADDR_UNSPECIFIED
 #cmakedefine01 TS_HAS_IP_TOS
 #cmakedefine01 TS_HAS_JEMALLOC
diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt
index c9d5536c4cb..0442f93e729 100644
--- a/src/tscore/CMakeLists.txt
+++ b/src/tscore/CMakeLists.txt
@@ -94,6 +94,18 @@ add_library(
 )
 add_library(ts::tscore ALIAS tscore)
 
+# Highway-dispatched implementation of ts::ascii::tolower_copy. Only
+# compiled when ENABLE_HIGHWAY_DISPATCH=ON; otherwise the cascade in
+# include/tscore/ink_ascii_tolower.h provides the implementation
+# entirely inline at the call site. The source dir is added to the
+# include path so Highway's foreach_target.h self-include of
+# ink_ascii_tolower_dispatch.cc resolves.
+if(TS_HAS_HIGHWAY_DISPATCH)
+  target_sources(tscore PRIVATE ink_ascii_tolower_dispatch.cc)
+  target_link_libraries(tscore PRIVATE hwy::hwy)
+  target_include_directories(tscore PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+endif()
+
 # Some plugins link against ts::tscore and therefore need it to be compiled as
 # position independent.
 set_target_properties(tscore PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/src/tscore/ink_ascii_tolower_dispatch.cc b/src/tscore/ink_ascii_tolower_dispatch.cc
new file mode 100644
index 00000000000..7cfc6f8eb83
--- /dev/null
+++ b/src/tscore/ink_ascii_tolower_dispatch.cc
@@ -0,0 +1,147 @@
+/** @file
+
+  Runtime-dispatched implementation of ts::ascii::tolower_copy.
+
+  Enabled via ENABLE_HIGHWAY_DISPATCH=ON at build time. When the option
+  is off, ink_ascii_tolower.h provides the original compile-time cascade
+  and this translation unit is not compiled.
+
+  Why dispatch: when the build target is intentionally conservative
+  (e.g. -march=westmere for binary portability), the compile-time
+  cascade in ink_ascii_tolower.h is locked to SSE2 and cannot exploit
+  AVX2 or AVX-512 on hosts that support them. Highway's foreach_target
+  mechanism emits one body per enabled target inside this TU using
+  per-function GCC target attributes, regardless of the TU's own
+  -march. At runtime, the highest target supported by the live CPU is
+  selected and its function pointer is cached for direct reuse. Effect:
+  a Westmere-compiled binary still runs the AVX-512 body on Ice Lake.
+
+  Per-call cost vs the fully-inlined cascade: ~1.3 ns flat (one direct
+  call to this function, then one indirect call through the cached
+  pointer). The trade for that overhead is unlocking the wider vector
+  path on bulk inputs, which can be ~2x faster on modern CPUs even from
+  a conservative build target.
+
+  Correctness is bit-for-bit identical to the cascade (see the
+  test_ink_ascii_tolower parity suite in src/tscore/unit_tests).
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include "tscore/ink_ascii_tolower.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "ink_ascii_tolower_dispatch.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace ts::ascii
+{
+namespace HWY_NAMESPACE
+{
+  namespace hn = hwy::HWY_NAMESPACE;
+
+  // Templated chunk worker, forced inline so the small-N branch below
+  // stays a per-call-site dead-code-eliminable check rather than a real
+  // jump table. Body is the same as the static-dispatch reference: single
+  // compare via the unsigned(v - 'A') < 26 trick, fused masked add via
+  // MaskedAddOr (lowers to _mm512_mask_add_epi8 on AVX3; falls back to
+  // IfThenElse(Add) on targets without native masked-add), and a
+  // LoadN/StoreN masked tail.
+  template <class D>
+  HWY_ATTR HWY_INLINE void
+  tolower_chunk(D d, char *dst, const char *src, std::size_t n)
+  {
+    using V              = hn::Vec<D>;
+    const V           A  = hn::Set(d, static_cast<hn::TFromD<D>>('A'));
+    const V           R  = hn::Set(d, static_cast<hn::TFromD<D>>(26));
+    const V           B5 = hn::Set(d, static_cast<hn::TFromD<D>>(0x20));
+    const std::size_t N  = hn::Lanes(d);
+
+    const auto *in  = reinterpret_cast<const uint8_t *>(src);
+    auto       *out = reinterpret_cast<uint8_t *>(dst);
+
+    std::size_t i = 0;
+    if (n >= N) {
+      for (; i + N <= n; i += N) {
+        const V    v        = hn::LoadU(d, in + i);
+        const auto is_upper = hn::Lt(hn::Sub(v, A), R);
+        const V    folded   = hn::MaskedAddOr(v, is_upper, v, B5);
+        hn::StoreU(folded, d, out + i);
+      }
+    }
+    if (i < n) {
+      const std::size_t rem      = n - i;
+      const V           v        = hn::LoadN(d, in + i, rem);
+      const auto        is_upper = hn::Lt(hn::Sub(v, A), R);
+      const V           folded   = hn::MaskedAddOr(v, is_upper, v, B5);
+      hn::StoreN(folded, d, out + i, rem);
+    }
+  }
+
+  // Small-N gate. On wide targets (AVX2 = 32 lanes, AVX3 = 64 lanes) the
+  // masked LoadN/StoreN on a sub-vector input pays ~10 ns of fixed setup
+  // before doing any work. Dropping to a 16-byte CappedTag for those
+  // inputs collapses the gate to SSSE3-class ops, which is what the
+  // hand-written cascade falls through to anyway. On targets where the
+  // ScalableTag is already 16 bytes (SSE2/NEON), both branches lower to
+  // the same body and the gate folds out at compile time.
+  HWY_ATTR void
+  tolower_copy_target(char *dst, const char *src, std::size_t n)
+  {
+    const hn::ScalableTag<uint8_t> d_full;
+    if (n < hn::Lanes(d_full)) {
+      const hn::CappedTag<uint8_t, 16> d16;
+      tolower_chunk(d16, dst, src, n);
+    } else {
+      tolower_chunk(d_full, dst, src, n);
+    }
+  }
+
+} // namespace HWY_NAMESPACE
+} // namespace ts::ascii
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace ts::ascii
+{
+
+HWY_EXPORT(tolower_copy_target);
+
+// Cached-pointer dispatch. On first call, the lazy-init lambda invokes
+// the dispatch table (which patches the entry to point to the chosen
+// target) and then captures the now-resolved pointer for direct reuse.
+// Thread-safe via C++11 magic-static semantics; runs the resolver once.
+// Per-call cost after init is one indirect call through the cached
+// pointer.
+void
+tolower_copy_dispatch(char *dst, const char *src, std::size_t n) noexcept
+{
+  using tolower_fn_t           = void (*)(char *, const char *, std::size_t);
+  static const tolower_fn_t fn = []() noexcept {
+    char dummy_dst = 0, dummy_src = 'A';
+    HWY_DYNAMIC_DISPATCH(tolower_copy_target)(&dummy_dst, &dummy_src, 1);
+    return HWY_DYNAMIC_POINTER(tolower_copy_target);
+  }();
+  fn(dst, src, n);
+}
+
+} // namespace ts::ascii
+#endif // HWY_ONCE
diff --git a/tools/benchmark/benchmark_ascii_tolower.cc b/tools/benchmark/benchmark_ascii_tolower.cc
index 4c799713c99..dcc155e4c8c 100644
--- a/tools/benchmark/benchmark_ascii_tolower.cc
+++ b/tools/benchmark/benchmark_ascii_tolower.cc
@@ -87,19 +87,21 @@ tolower_scalar(char *d, const char *s, std::size_t n) noexcept
 
 TEST_CASE("active SIMD configuration", "[tolower][config]")
 {
-  // Print the compile-time ISA path so the benchmark output makes the
-  // selected configuration obvious.
-  std::cout << "ts::ascii::tolower_copy compiled with: ";
-#if defined(__AVX512BW__)
-  std::cout << "AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2 cascade";
+  // Print the configuration so the benchmark output makes the selected
+  // implementation obvious.
+  std::cout << "ts::ascii::tolower_copy implementation: ";
+#if TS_HAS_HIGHWAY_DISPATCH
+  std::cout << "Highway runtime dispatch (selects best available target at startup)";
+#elif defined(__AVX512BW__)
+  std::cout << "compile-time cascade — AVX-512BW (64B body + masked tail, gated at n>=64) + AVX2 + SSE2";
 #elif defined(__AVX2__)
-  std::cout << "AVX2 (32B body) + SSE2 (16B drain)";
+  std::cout << "compile-time cascade — AVX2 (32B body) + SSE2 (16B drain)";
 #elif defined(__SSE2__)
-  std::cout << "SSE2 (16B body)";
+  std::cout << "compile-time cascade — SSE2 (16B body)";
 #elif defined(__ARM_NEON) || defined(__aarch64__)
-  std::cout << "NEON (16B body)";
+  std::cout << "compile-time cascade — NEON (16B body)";
 #else
-  std::cout << "scalar only";
+  std::cout << "compile-time cascade — scalar only";
 #endif
   std::cout << '\n';
   SUCCEED();

From 8f158c548ccaab808bc1e3be5d9f113b5f711047 Mon Sep 17 00:00:00 2001
From: "Phong X. Nguyen" <phongn@gmail.com>
Date: Thu, 11 Jun 2026 03:06:40 +0000
Subject: [PATCH 06/12] Integrate internal or external highway dispatch

---
 CMakeLists.txt | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c3d1498cc2..f102c9346af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -330,11 +330,6 @@ elseif(TS_HAS_MIMALLOC)
   link_libraries(mimalloc)
 endif()
 
-if(ENABLE_HIGHWAY_DISPATCH)
-  find_package(hwy CONFIG REQUIRED)
-  set(TS_HAS_HIGHWAY_DISPATCH ${hwy_FOUND})
-endif()
-
 if(ENABLE_QUICHE)
   find_package(quiche REQUIRED)
 
@@ -455,9 +450,13 @@ if(EXTERNAL_LIBSWOC)
   find_package(libswoc REQUIRED)
 endif()
 
-if(EXTERNAL_HWY)
-  message(STATUS "Looking for external highway")
-  find_package(HWY "1.4.0" CONFIG REQUIRED)
+if(ENABLE_HIGHWAY_DISPATCH)
+  if(EXTERNAL_HWY)
+    message(STATUS "Looking for external highway")
+    find_package(HWY "1.4.0" CONFIG REQUIRED)
+    set(TS_HAS_HIGHWAY_DISPATCH ${hwy_FOUND})
+  endif()
+  set(TS_HAS_HIGHWAY_DISPATCH 1)
 endif()
 
 include(Check128BitCas)

From 37afbe7ff84b435df92092bd21b71428044eae81 Mon Sep 17 00:00:00 2001
From: "Phong X. Nguyen" <phongn@gmail.com>
Date: Thu, 28 May 2026 22:59:10 +0000
Subject: [PATCH 07/12] tscore: SIMD base64 encode/decode via Google Highway

Add an optional runtime-dispatched SIMD path for ats_base64_encode /
ats_base64_decode, gated by the ENABLE_HIGHWAY_DISPATCH CMake option
(off by default; the scalar path is unchanged when off). The scalar
primitives move to ink_base64_scalar.h so the SIMD kernel's scalar
tail and the scalar path share one definition and cannot drift.

The kernels in ink_base64_dispatch.cc use Highway's portable SIMD ops
(foreach_target + HWY_DYNAMIC_DISPATCH), so one source compiles for
SSE4/AVX2/AVX-512 and the best target supported by the live CPU is
chosen at runtime. The algorithms and lookup tables derive from the
simdutf library (Mula/Lemire vectorized base64; aqrit's combined
standard/URL-safe classifier), re-expressed in Highway; see NOTICE.

Decode fuses validation into the SIMD loop (consuming only fully-valid
16-byte blocks and finishing the remainder, including any non-alphabet
truncation, on the scalar tail), so output is byte-for-byte identical
to the scalar decoder, including in-place use and mixed standard/URL
alphabets. encode reuses the scalar encoder for the padded tail.

Tests: unit_tests/test_ink_base64.cc cross-checks the public path
against the scalar reference across sizes that straddle the SIMD
thresholds, both alphabets, in-place decode, truncation at every
position, and undersized output buffers; with the option on these
become SIMD-vs-scalar parity checks. tests/fuzzing/fuzz_base64.cc adds
a libFuzzer target that decodes untrusted input and cross-checks both
paths under sanitizers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                           |   7 +-
 NOTICE                                   |   9 +
 src/tscore/CMakeLists.txt                |  11 +
 src/tscore/ink_base64.cc                 | 179 ++++++---------
 src/tscore/ink_base64_dispatch.cc        | 256 ++++++++++++++++++++++
 src/tscore/ink_base64_dispatch.h         |  49 +++++
 src/tscore/ink_base64_scalar.h           | 166 ++++++++++++++
 src/tscore/unit_tests/test_ink_base64.cc | 265 ++++++++++-------------
 tests/fuzzing/CMakeLists.txt             |   5 +
 tests/fuzzing/fuzz_base64.cc             | 115 ++++++++++
 10 files changed, 798 insertions(+), 264 deletions(-)
 create mode 100644 src/tscore/ink_base64_dispatch.cc
 create mode 100644 src/tscore/ink_base64_dispatch.h
 create mode 100644 src/tscore/ink_base64_scalar.h
 create mode 100644 tests/fuzzing/fuzz_base64.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f102c9346af..229aa1002be 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,7 +217,7 @@ set(ENABLE_TPROXY
 )
 option(ENABLE_QUICHE "Use quiche (default OFF)")
 option(ENABLE_HIGHWAY_DISPATCH
-       "Use Google Highway runtime dispatch for SIMD kernels (default OFF, default cascade compile-time selection)"
+       "Use Google Highway runtime dispatch for SIMD kernels (default OFF; without it the scalar paths are used)"
 )
 
 option(ENABLE_EXAMPLE "Build example directory (default OFF)")
@@ -330,6 +330,11 @@ elseif(TS_HAS_MIMALLOC)
   link_libraries(mimalloc)
 endif()
 
+if(ENABLE_HIGHWAY_DISPATCH)
+  find_package(hwy CONFIG REQUIRED)
+  set(TS_HAS_HIGHWAY_DISPATCH ${hwy_FOUND})
+endif()
+
 if(ENABLE_QUICHE)
   find_package(quiche REQUIRED)
 
diff --git a/NOTICE b/NOTICE
index 56a0a697d95..7fb1e7a6358 100644
--- a/NOTICE
+++ b/NOTICE
@@ -140,3 +140,12 @@ include/tscore/ink_ascii_tolower.h AVX-512BW kernel design (fused
 mask_add and masked-tail load/store) is adapted from Tony Finch's
 copytolower64.c (0BSD OR MIT-0).
 https://dotat.at/cgi/git/vectolower.git/
+
+~~
+
+src/tscore/ink_base64_dispatch.cc contains SIMD base64 encode/decode kernels
+derived from the simdutf library, re-expressed using Google Highway. The
+algorithms (Wojciech Muła and Daniel Lemire's vectorized base64, and aqrit's
+combined standard/URL-safe classifier) and lookup tables originate there.
+simdutf: https://github.com/simdutf/simdutf (Apache-2.0 / MIT / BSL-1.0)
+Copyright (c) 2021 The simdutf authors
diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt
index 0442f93e729..d687348f50a 100644
--- a/src/tscore/CMakeLists.txt
+++ b/src/tscore/CMakeLists.txt
@@ -118,6 +118,16 @@ else()
   target_sources(tscore PRIVATE HKDF_openssl.cc)
 endif()
 
+# Highway-dispatched SIMD base64 (encode/decode). Only compiled when
+# ENABLE_HIGHWAY_DISPATCH=ON; otherwise ink_base64.cc uses the scalar path
+# entirely. The source dir is added to the include path so Highway's
+# foreach_target.h self-include of ink_base64_dispatch.cc resolves.
+if(TS_HAS_HIGHWAY_DISPATCH)
+  target_sources(tscore PRIVATE ink_base64_dispatch.cc)
+  target_link_libraries(tscore PRIVATE hwy::hwy)
+  target_include_directories(tscore PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+endif()
+
 target_link_libraries(
   tscore PUBLIC OpenSSL::Crypto libswoc::libswoc yaml-cpp::yaml-cpp systemtap::systemtap resolv::resolv ts::tsutil
 )
@@ -172,6 +182,7 @@ if(BUILD_TESTING)
     unit_tests/test_arena.cc
     unit_tests/test_ink_base64.cc
     unit_tests/test_ink_ascii_tolower.cc
+    unit_tests/test_ink_base64.cc
     unit_tests/test_ink_inet.cc
     unit_tests/test_ink_memory.cc
     unit_tests/test_ink_string.cc
diff --git a/src/tscore/ink_base64.cc b/src/tscore/ink_base64.cc
index 208626f5262..2c723f42240 100644
--- a/src/tscore/ink_base64.cc
+++ b/src/tscore/ink_base64.cc
@@ -1,6 +1,34 @@
 /** @file
 
-  A brief file description
+  Base64 encoding and decoding as according to RFC1521.  Similar to uudecode.
+
+  The public entry points (ats_base64_encode / ats_base64_decode, also exposed
+  through TSBase64Encode / TSBase64Decode) dispatch between two implementations
+  that share the canonical scalar primitives in ink_base64_scalar.h:
+
+    - A hand-rolled scalar path, always present, used directly and for inputs
+      below the SIMD crossover threshold. It avoids the SIMD path's runtime
+      dispatch overhead, which would otherwise dominate for tiny inputs (e.g.
+      the 8-byte SnowflakeID encode).
+
+    - When ENABLE_HIGHWAY_DISPATCH is on (TS_HAS_HIGHWAY_DISPATCH), a SIMD path
+      in ink_base64_dispatch.cc built on Google Highway, used for larger
+      inputs. It produces output byte-for-byte identical to the scalar path.
+
+  RFC 1521 requires inserting line breaks for long lines.  The basic web
+  authentication scheme does not require them.  This implementation is intended
+  for web-related use, and line breaks are not implemented.
+
+  Contract preserved by both paths:
+
+    - encode: standard RFC 1521 alphabet (`+`, `/`), `=` padding, no line
+      breaks, trailing NUL at outBuffer[length].
+
+    - decode: accepts standard (`+`, `/`) and URL-safe (`-`, `_`) alphabets in
+      the same input; tolerates missing padding; on any non-alphabet byte
+      (whitespace, `=`, or garbage) truncates and returns success with whatever
+      was decoded; trailing NUL at outBuffer[length]; supports in-place decode
+      (dst == src).
 
   @section license License
 
@@ -20,73 +48,42 @@
   See the License for the specific language governing permissions and
   limitations under the License.
  */
-
-/*
- * Base64 encoding and decoding as according to RFC1521.  Similar to uudecode.
- *
- * RFC 1521 requires inserting line breaks for long lines.  The basic web
- * authentication scheme does not require them.  This implementation is
- * intended for web-related use, and line breaks are not implemented.
- *
- */
 #include "tscore/ink_platform.h"
+#include "tscore/ink_config.h"
 #include "tscore/ink_base64.h"
 #include "tscore/ink_assert.h"
 
+#include "ink_base64_scalar.h"
+
+#if TS_HAS_HIGHWAY_DISPATCH
+#include "ink_base64_dispatch.h"
+
+// Inputs at or below these byte counts stay on the scalar path, where they
+// outrun the SIMD path's per-call dispatch overhead. The thresholds are
+// conservative; both paths are correct at every size. inBufferSize for encode
+// is the binary plaintext length; for decode it is the base64-encoded length.
+namespace
+{
+constexpr size_t BASE64_ENCODE_SIMD_THRESHOLD = 24;
+constexpr size_t BASE64_DECODE_SIMD_THRESHOLD = 32;
+} // namespace
+#endif
+
 bool
 ats_base64_encode(const unsigned char *inBuffer, size_t inBufferSize, char *outBuffer, size_t outBufSize, size_t *length)
 {
-  static const char _codes[66] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-  char             *obuf       = outBuffer;
-  char              in_tail[4];
-
   if (outBufSize < ats_base64_encode_dstlen(inBufferSize)) {
     return false;
   }
 
-  while (inBufferSize > 2) {
-    *obuf++ = _codes[(inBuffer[0] >> 2) & 077];
-    *obuf++ = _codes[((inBuffer[0] & 03) << 4) | ((inBuffer[1] >> 4) & 017)];
-    *obuf++ = _codes[((inBuffer[1] & 017) << 2) | ((inBuffer[2] >> 6) & 017)];
-    *obuf++ = _codes[inBuffer[2] & 077];
-
-    inBufferSize -= 3;
-    inBuffer     += 3;
-  }
-
-  /*
-   * We've done all the input groups of three chars.  We're left
-   * with 0, 1, or 2 input chars.  We have to add zero-bits to the
-   * right if we don't have enough input chars.
-   * If 0 chars left, we're done.
-   * If 1 char left, form 2 output chars, and add 2 pad chars to output.
-   * If 2 chars left, form 3 output chars, add 1 pad char to output.
-   */
-  if (inBufferSize == 0) {
-    *obuf = '\0';
-    if (length) {
-      *length = (obuf - outBuffer);
-    }
-  } else {
-    memset(in_tail, 0, sizeof(in_tail));
-    memcpy(in_tail, inBuffer, inBufferSize);
-
-    *(obuf)     = _codes[(in_tail[0] >> 2) & 077];
-    *(obuf + 1) = _codes[((in_tail[0] & 03) << 4) | ((in_tail[1] >> 4) & 017)];
-    *(obuf + 2) = _codes[((in_tail[1] & 017) << 2) | ((in_tail[2] >> 6) & 017)];
-    *(obuf + 3) = _codes[in_tail[2] & 077];
-
-    if (inBufferSize == 1) {
-      *(obuf + 2) = '=';
-    }
-    *(obuf + 3) = '=';
-    *(obuf + 4) = '\0';
-
-    if (length) {
-      *length = (obuf + 4) - outBuffer;
-    }
+#if TS_HAS_HIGHWAY_DISPATCH
+  if (inBufferSize > BASE64_ENCODE_SIMD_THRESHOLD) {
+    ats::base64::encode_dispatch(inBuffer, inBufferSize, outBuffer, length);
+    return true;
   }
+#endif
 
+  ats::base64::encode_scalar(inBuffer, inBufferSize, outBuffer, length);
   return true;
 }
 
@@ -96,76 +93,26 @@ ats_base64_encode(const char *inBuffer, size_t inBufferSize, char *outBuffer, si
   return ats_base64_encode(reinterpret_cast<const unsigned char *>(inBuffer), inBufferSize, outBuffer, outBufSize, length);
 }
 
-/*-------------------------------------------------------------------------
-  This is a reentrant, and malloc free implementation of ats_base64_decode.
-  -------------------------------------------------------------------------*/
-#ifdef DECODE
-#undef DECODE
-#endif
-
-#define DECODE(x)     printableToSixBit[(unsigned char)x]
-#define MAX_PRINT_VAL 63
-
-/* Converts a printable character to it's six bit representation */
-const unsigned char printableToSixBit[256] = {
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 62, 64, 62, 64, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 64, 64, 64, 64, 64, 64,
-  64, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 64, 64, 64, 64, 63,
-  64, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 64, 64, 64, 64, 64,
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
-
 bool
 ats_base64_decode(const char *inBuffer, size_t inBufferSize, unsigned char *outBuffer, size_t outBufSize, size_t *length)
 {
-  size_t         decodedBytes = 0;
-  unsigned char *buf          = outBuffer;
-
-  // Make sure there is sufficient space in the output buffer
   if (outBufSize < ats_base64_decode_dstlen(inBufferSize)) {
     return false;
   }
 
-  // Ignore any trailing ='s or other undecodable characters: consume only the
-  // leading run of base64-alphabet bytes.
-  // TODO: Perhaps that ought to be an error instead?
-  size_t inBytes = 0;
-  while (inBytes < inBufferSize && DECODE(inBuffer[inBytes]) <= MAX_PRINT_VAL) {
-    ++inBytes;
+#if TS_HAS_HIGHWAY_DISPATCH
+  if (inBufferSize > BASE64_DECODE_SIMD_THRESHOLD) {
+    // The SIMD path validates inline and truncates at the first non-alphabet
+    // byte, so no separate alphabet pre-scan is needed here.
+    ats::base64::decode_dispatch(inBuffer, inBufferSize, outBuffer, length);
+    return true;
   }
+#endif
 
-  // Decode complete 4-character groups into 3 bytes each. Process only whole
-  // groups here so the loop never reads past the alphabet prefix; the previous
-  // code ran one extra iteration when inBytes was not a multiple of four (a
-  // read out of bounds of the input) and then read inBuffer[-2].
-  while (inBytes >= 4) {
-    buf[0]        = static_cast<unsigned char>(DECODE(inBuffer[0]) << 2 | DECODE(inBuffer[1]) >> 4);
-    buf[1]        = static_cast<unsigned char>(DECODE(inBuffer[1]) << 4 | DECODE(inBuffer[2]) >> 2);
-    buf[2]        = static_cast<unsigned char>(DECODE(inBuffer[2]) << 6 | DECODE(inBuffer[3]));
-    buf          += 3;
-    inBuffer     += 4;
-    decodedBytes += 3;
-    inBytes      -= 4;
-  }
-
-  // Decode a trailing 2- or 3-character group; a lone trailing character does
-  // not encode a full byte and is dropped (as an RFC 4648 decoder requires).
-  if (inBytes >= 2) {
-    buf[0]        = static_cast<unsigned char>(DECODE(inBuffer[0]) << 2 | DECODE(inBuffer[1]) >> 4);
-    decodedBytes += 1;
-    if (inBytes >= 3) {
-      buf[1]        = static_cast<unsigned char>(DECODE(inBuffer[1]) << 4 | DECODE(inBuffer[2]) >> 2);
-      decodedBytes += 1;
-    }
-  }
-
-  outBuffer[decodedBytes] = '\0';
-
-  if (length) {
-    *length = decodedBytes;
-  }
+  // Ignore any trailing `=`s or other undecodable characters, then decode the
+  // valid alphabet prefix.
+  const size_t valid = ats::base64::count_alphabet_prefix(inBuffer, inBufferSize);
 
+  ats::base64::decode_scalar(inBuffer, valid, outBuffer, length);
   return true;
 }
diff --git a/src/tscore/ink_base64_dispatch.cc b/src/tscore/ink_base64_dispatch.cc
new file mode 100644
index 00000000000..427652bf862
--- /dev/null
+++ b/src/tscore/ink_base64_dispatch.cc
@@ -0,0 +1,256 @@
+/** @file
+
+  Runtime-dispatched SIMD base64 encode/decode, built against Google Highway.
+
+  Enabled by ENABLE_HIGHWAY_DISPATCH=ON. Highway's foreach_target mechanism
+  emits one body per enabled SIMD target from this single source; at runtime
+  the best target supported by the live CPU is selected once and cached, so a
+  conservatively compiled binary (e.g. -march=x86-64) still runs the AVX-512
+  body on capable hardware.
+
+  The SIMD math follows the well-known vectorized base64 algorithms popularized
+  by Wojciech Muła and Daniel Lemire and used by the simdutf library, expressed
+  here in Highway's portable ops (see NOTICE):
+
+    - decode: aqrit's "default_or_url" classifier translates ASCII to 6-bit
+      values for the standard and URL-safe alphabets at once and flags any
+      non-alphabet byte; the 4x6-bit groups are packed to 3 bytes with two
+      pairwise multiply-adds and a shuffle. Validation is fused into the loop:
+      only fully-valid 16-byte blocks are consumed by SIMD, and the remainder
+      (including any non-alphabet truncation point) is finished by the scalar
+      decoder, so output matches the scalar path exactly.
+
+    - encode: the Muła reshuffle splits each 3-byte group into four 6-bit
+      fields with one multiply-high and one multiply-low, then a pshufb-based
+      table maps 6-bit values to the standard alphabet. The 1-2 byte tail and
+      `=` padding are produced by the scalar encoder.
+
+  In-place decode (out == in) is preserved: each block is fully loaded before
+  its bounds-safe StoreN, whose end never passes the next block's load.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include "ink_base64_dispatch.h"
+#include "ink_base64_scalar.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "ink_base64_dispatch.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace ats::base64
+{
+namespace HWY_NAMESPACE
+{
+  namespace hn = hwy::HWY_NAMESPACE;
+
+  // per-128-bit-block constants, replicated to any width via LoadDup128
+  alignas(16) static constexpr int8_t kMul1[16]  = {0x40, 0x01, 0x40, 0x01, 0x40, 0x01, 0x40, 0x01,
+                                                    0x40, 0x01, 0x40, 0x01, 0x40, 0x01, 0x40, 0x01};
+  alignas(16) static constexpr int16_t kMul2[8]  = {0x1000, 0x0001, 0x1000, 0x0001, 0x1000, 0x0001, 0x1000, 0x0001};
+  alignas(16) static constexpr uint8_t kPack[16] = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0x80, 0x80, 0x80, 0x80};
+
+  // ---- DECODE ----
+
+  // aqrit's "default_or_url" classifier (from simdutf's
+  // to_base64_mask<default_or_url=true>): maps ASCII to 6-bit values for both
+  // standard (+ /) and URL-safe (- _) bytes in one pass, matching
+  // printableToSixBit exactly. The raw check mask flags whitespace as invalid
+  // (we omit simdutf's whitespace-XOR correction), giving ATS's truncate-on-
+  // non-alphabet semantics. Sets *ok false if any lane is non-alphabet.
+  alignas(16) static constexpr uint8_t kDeltaAsso[16]   = {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                                           0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16};
+  alignas(16) static constexpr uint8_t kDeltaValues[16] = {0xBF, 0xE0, 0xB9, 0x13, 0x04, 0xBF, 0xBF, 0xB9,
+                                                           0xB9, 0x00, 0xFF, 0x11, 0xFF, 0xBF, 0x10, 0xB9};
+  alignas(16) static constexpr uint8_t kCheckAsso[16]   = {0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                                           0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06};
+  alignas(16) static constexpr uint8_t kCheckValues[16] = {0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xD5, 0xA6,
+                                                           0xB5, 0xA1, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80};
+
+  template <class D>
+  HWY_ATTR HWY_INLINE hn::Vec<D>
+                      classify(D d, hn::Vec<D> v, bool *ok)
+  {
+    const hn::RebindToSigned<D>        di;
+    const hn::Repartition<uint32_t, D> d32;
+
+    const auto shifted    = hn::BitCast(d, hn::ShiftRight<3>(hn::BitCast(d32, v)));
+    auto       delta_hash = hn::AverageRound(hn::TableLookupBytes(hn::LoadDup128(d, kDeltaAsso), v), shifted);
+    delta_hash            = hn::And(delta_hash, hn::Set(d, 0x0F));
+    const auto check_hash = hn::AverageRound(hn::TableLookupBytes(hn::LoadDup128(d, kCheckAsso), v), shifted);
+
+    const auto out =
+      hn::SaturatedAdd(hn::BitCast(di, hn::TableLookupBytes(hn::LoadDup128(d, kDeltaValues), delta_hash)), hn::BitCast(di, v));
+    const auto chk =
+      hn::SaturatedAdd(hn::BitCast(di, hn::TableLookupBytes(hn::LoadDup128(d, kCheckValues), check_hash)), hn::BitCast(di, v));
+
+    *ok = hn::AllFalse(di, hn::Lt(chk, hn::Zero(di))); // chk sign bit set => invalid
+    return hn::BitCast(d, out);
+  }
+
+  // Decode one 16-char block -> 12 bytes. Returns false without storing if the
+  // block holds a non-alphabet byte. The 12 valid output bytes are contiguous
+  // at the front, so a bounds-safe StoreN(12) suffices (no overrun).
+  HWY_ATTR HWY_INLINE bool
+  decode_block16(const char *in, unsigned char *out)
+  {
+    const hn::Full128<uint8_t>                  d;
+    const hn::RebindToSigned<decltype(d)>       d8i;
+    const hn::Repartition<int16_t, decltype(d)> d16;
+    const hn::Repartition<int32_t, decltype(d)> d32;
+
+    const auto v = hn::LoadU(d, reinterpret_cast<const uint8_t *>(in));
+    bool       ok;
+    const auto val = classify(d, v, &ok);
+    if (!ok) {
+      return false;
+    }
+
+    const auto mul1   = hn::BitCast(d8i, hn::Load(d, reinterpret_cast<const uint8_t *>(kMul1)));
+    const auto t0     = hn::SatWidenMulPairwiseAdd(d16, val, mul1);             // maddubs
+    const auto t1     = hn::WidenMulPairwiseAdd(d32, t0, hn::Load(d16, kMul2)); // madd
+    const auto packed = hn::TableLookupBytes(hn::BitCast(d, t1), hn::Load(d, kPack));
+
+    hn::StoreN(packed, d, out, 12);
+    return true;
+  }
+
+  HWY_ATTR void
+  DecodeImpl(const char *in, size_t in_len, unsigned char *out, size_t *out_len)
+  {
+    size_t i = 0, o = 0;
+    for (; i + 16 <= in_len; i += 16, o += 12) {
+      if (!decode_block16(in + i, out + o)) {
+        break;
+      }
+    }
+    // Scalar finishes the remainder: count the alphabet prefix (truncating at
+    // the first non-alphabet byte) then decode 4-groups + a 2/3 char tail.
+    // Identical to running the scalar decoder over the whole alphabet prefix,
+    // because the SIMD loop consumed only fully-valid 4-group-aligned blocks.
+    const size_t rem_valid = count_alphabet_prefix(in + i, in_len - i);
+    size_t       tail_len  = 0;
+    decode_scalar(in + i, rem_valid, out + o, &tail_len);
+    o        += tail_len;
+    out[o]    = '\0';
+    *out_len  = o;
+  }
+
+  // ---- ENCODE ----
+
+  // 6-bit value (0..63) -> standard-alphabet ASCII, from simdutf's
+  // lookup_pshufb_improved (Muła): reduce to a 4-bit class, look up a per-class
+  // offset, add. Standard alphabet (+ /), matching encode_scalar.
+#define U8(x) static_cast<uint8_t>(x)
+  alignas(16) static constexpr uint8_t kShiftLUT[16] = {U8('a' - 26),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('0' - 52),
+                                                        U8('+' - 62),
+                                                        U8('/' - 63),
+                                                        U8('A'),
+                                                        0,
+                                                        0};
+#undef U8
+
+  template <class D>
+  HWY_ATTR HWY_INLINE hn::Vec<D>
+                      to_ascii(D d, hn::Vec<D> idx)
+  {
+    auto       res  = hn::SaturatedSub(idx, hn::Set(d, 51)); // 52..63 -> 1..12, else 0
+    const auto less = hn::Lt(idx, hn::Set(d, 26));           // 0..25 (uppercase class)
+    res             = hn::Or(res, hn::IfThenElseZero(less, hn::Set(d, 13)));
+    res             = hn::TableLookupBytes(hn::LoadDup128(d, kShiftLUT), res);
+    return hn::Add(res, idx);
+  }
+
+  // Encode 12 input bytes per 16-byte block -> 16 ASCII chars. `in` must have
+  // >= 16 readable bytes (over-reads bytes 12..15, only 0..11 used). Muła
+  // reshuffle: spread the 3 bytes of each group across a 32-bit lane, then split
+  // into four 6-bit fields with one mulhi + one mullo (each a pair of per-16-bit
+  // variable shifts).
+  template <class D>
+  HWY_ATTR HWY_INLINE void
+  encode_chunk(D d, const unsigned char *in, char *out)
+  {
+    const hn::Repartition<uint32_t, D> d32;
+    const hn::Repartition<uint16_t, D> d16;
+
+    alignas(16) static constexpr uint8_t kSpread[16] = {1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};
+
+    const auto in32 = hn::BitCast(d32, hn::TableLookupBytes(hn::LoadU(d, in), hn::LoadDup128(d, kSpread)));
+
+    const auto t0  = hn::And(in32, hn::Set(d32, 0x0fc0fc00u));
+    const auto t1  = hn::MulHigh(hn::BitCast(d16, t0), hn::BitCast(d16, hn::Set(d32, 0x04000040u)));
+    const auto t2  = hn::And(in32, hn::Set(d32, 0x003f03f0u));
+    const auto t3  = hn::Mul(hn::BitCast(d16, t2), hn::BitCast(d16, hn::Set(d32, 0x01000010u)));
+    const auto idx = hn::BitCast(d, hn::Or(t1, t3)); // four 6-bit values per 32-bit lane
+
+    hn::StoreU(to_ascii(d, idx), d, reinterpret_cast<uint8_t *>(out));
+  }
+
+  HWY_ATTR void
+  EncodeImpl(const unsigned char *in, size_t in_len, char *out, size_t *out_len)
+  {
+    const hn::Full128<uint8_t> d;
+    size_t                     i = 0, o = 0;
+    while (in_len - i >= 16) {
+      encode_chunk(d, in + i, out + o);
+      i += 12;
+      o += 16;
+    }
+    size_t tail = 0;
+    encode_scalar(in + i, in_len - i, out + o, &tail);
+    o        += tail;
+    *out_len  = o;
+  }
+
+} // namespace HWY_NAMESPACE
+} // namespace ats::base64
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace ats::base64
+{
+HWY_EXPORT(DecodeImpl);
+HWY_EXPORT(EncodeImpl);
+
+void
+decode_dispatch(const char *in, size_t in_len, unsigned char *out, size_t *out_len)
+{
+  HWY_DYNAMIC_DISPATCH(DecodeImpl)(in, in_len, out, out_len);
+}
+
+void
+encode_dispatch(const unsigned char *in, size_t in_len, char *out, size_t *out_len)
+{
+  HWY_DYNAMIC_DISPATCH(EncodeImpl)(in, in_len, out, out_len);
+}
+
+} // namespace ats::base64
+#endif
diff --git a/src/tscore/ink_base64_dispatch.h b/src/tscore/ink_base64_dispatch.h
new file mode 100644
index 00000000000..afd67332f91
--- /dev/null
+++ b/src/tscore/ink_base64_dispatch.h
@@ -0,0 +1,49 @@
+/** @file
+
+  Runtime-dispatched SIMD base64 entry points, implemented in
+  ink_base64_dispatch.cc against Google Highway and selected at runtime via
+  HWY_DYNAMIC_DISPATCH. Only built and used when ENABLE_HIGHWAY_DISPATCH is on
+  (TS_HAS_HIGHWAY_DISPATCH at compile time); ink_base64.cc routes large inputs
+  here and falls back to the scalar path otherwise.
+
+  Both functions produce output byte-for-byte identical to the scalar
+  encode_scalar / count_alphabet_prefix + decode_scalar in ink_base64_scalar.h.
+  decode consumes only fully-validated SIMD blocks and hands the remainder
+  (including any truncation at a non-alphabet byte) to the scalar tail.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace ats::base64
+{
+// Decode `in_len` bytes of base64 input. Output buffer capacity has already
+// been validated by the caller (ats_base64_decode). Writes a trailing NUL at
+// out[*out_len]. Supports in-place use (out == reinterpret_cast<uint8_t*>(in)).
+void decode_dispatch(const char *in, size_t in_len, unsigned char *out, size_t *out_len);
+
+// Encode `in_len` binary bytes. Output buffer capacity already validated.
+// Writes a trailing NUL at out[*out_len].
+void encode_dispatch(const unsigned char *in, size_t in_len, char *out, size_t *out_len);
+
+} // namespace ats::base64
diff --git a/src/tscore/ink_base64_scalar.h b/src/tscore/ink_base64_scalar.h
new file mode 100644
index 00000000000..51049705bdd
--- /dev/null
+++ b/src/tscore/ink_base64_scalar.h
@@ -0,0 +1,166 @@
+/** @file
+
+  Scalar base64 encode/decode primitives, shared by the always-present scalar
+  path in ink_base64.cc and (when ENABLE_HIGHWAY_DISPATCH is on) the SIMD
+  kernel's scalar tail in ink_base64_dispatch.cc. Keeping a single definition
+  here guarantees the two paths cannot drift.
+
+  These are the canonical reference semantics for ATS base64:
+
+    - encode: standard RFC 1521 alphabet (`+`, `/`), `=` padding, no line
+      breaks, trailing NUL at outBuffer[length].
+
+    - decode: accepts standard (`+`, `/`) and URL-safe (`-`, `_`) alphabets
+      mixed in the same input; truncates at the first non-alphabet byte
+      (whitespace, `=`, or garbage); tolerates missing padding; trailing NUL
+      at outBuffer[length]; supports in-place decode (dst == src).
+
+  decode_scalar restructures the historical tail handling: the previous code
+  ran one extra loop iteration past the alphabet prefix when the valid length
+  was not a multiple of four (reading bytes beyond the prefix, potentially out
+  of bounds) and then read inBuffer[-2]. This version processes only complete
+  4-character groups and decodes a 2- or 3-character tail explicitly, dropping
+  a lone trailing character. The decoded length and bytes are identical to the
+  historical code for every well-defined input; only the out-of-bounds reads
+  are removed.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+namespace ats::base64
+{
+// Converts a printable character to its six-bit representation; 64 marks a
+// non-alphabet byte. Both standard (`+`=62, `/`=63) and URL-safe (`-`=62,
+// `_`=63) punctuation are accepted.
+inline constexpr unsigned char printableToSixBit[256] = {
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 62, 64, 62, 64, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 64, 64, 64, 64, 64, 64,
+  64, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 64, 64, 64, 64, 63,
+  64, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64};
+
+inline constexpr unsigned char MAX_PRINT_VAL = 63;
+
+inline unsigned char
+decode_byte(char c)
+{
+  return printableToSixBit[static_cast<uint8_t>(c)];
+}
+
+// Count the leading base64-alphabet bytes (standard or URL-safe). Any byte at
+// or after this index is whitespace, `=`, or garbage and terminates the input.
+inline size_t
+count_alphabet_prefix(const char *inBuffer, size_t inBufferSize)
+{
+  size_t valid = 0;
+
+  while (valid < inBufferSize && decode_byte(inBuffer[valid]) <= MAX_PRINT_VAL) {
+    ++valid;
+  }
+  return valid;
+}
+
+// Hand-rolled scalar encode. Caller has already validated outBufSize.
+inline void
+encode_scalar(const unsigned char *inBuffer, size_t inBufferSize, char *outBuffer, size_t *length)
+{
+  static const char _codes[66] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+  char             *obuf       = outBuffer;
+  char              in_tail[4];
+
+  while (inBufferSize > 2) {
+    *obuf++ = _codes[(inBuffer[0] >> 2) & 077];
+    *obuf++ = _codes[((inBuffer[0] & 03) << 4) | ((inBuffer[1] >> 4) & 017)];
+    *obuf++ = _codes[((inBuffer[1] & 017) << 2) | ((inBuffer[2] >> 6) & 017)];
+    *obuf++ = _codes[inBuffer[2] & 077];
+
+    inBufferSize -= 3;
+    inBuffer     += 3;
+  }
+
+  if (inBufferSize == 0) {
+    *obuf = '\0';
+    if (length) {
+      *length = (obuf - outBuffer);
+    }
+  } else {
+    memset(in_tail, 0, sizeof(in_tail));
+    memcpy(in_tail, inBuffer, inBufferSize);
+
+    *(obuf)     = _codes[(in_tail[0] >> 2) & 077];
+    *(obuf + 1) = _codes[((in_tail[0] & 03) << 4) | ((in_tail[1] >> 4) & 017)];
+    *(obuf + 2) = _codes[((in_tail[1] & 017) << 2) | ((in_tail[2] >> 6) & 017)];
+    *(obuf + 3) = _codes[in_tail[2] & 077];
+
+    if (inBufferSize == 1) {
+      *(obuf + 2) = '=';
+    }
+    *(obuf + 3) = '=';
+    *(obuf + 4) = '\0';
+
+    if (length) {
+      *length = (obuf + 4) - outBuffer;
+    }
+  }
+}
+
+// Hand-rolled scalar decode. Caller has pre-scanned with count_alphabet_prefix
+// so every byte in inBuffer[0..inBufferSize) is in the base64 alphabet, and
+// has validated outBufSize.
+inline void
+decode_scalar(const char *inBuffer, size_t inBufferSize, unsigned char *outBuffer, size_t *length)
+{
+  size_t         decodedBytes = 0;
+  unsigned char *buf          = outBuffer;
+
+  while (inBufferSize >= 4) {
+    buf[0]        = static_cast<unsigned char>(decode_byte(inBuffer[0]) << 2 | decode_byte(inBuffer[1]) >> 4);
+    buf[1]        = static_cast<unsigned char>(decode_byte(inBuffer[1]) << 4 | decode_byte(inBuffer[2]) >> 2);
+    buf[2]        = static_cast<unsigned char>(decode_byte(inBuffer[2]) << 6 | decode_byte(inBuffer[3]));
+    buf          += 3;
+    inBuffer     += 4;
+    decodedBytes += 3;
+    inBufferSize -= 4;
+  }
+
+  if (inBufferSize >= 2) {
+    buf[0]        = static_cast<unsigned char>(decode_byte(inBuffer[0]) << 2 | decode_byte(inBuffer[1]) >> 4);
+    decodedBytes += 1;
+    if (inBufferSize >= 3) {
+      buf[1]        = static_cast<unsigned char>(decode_byte(inBuffer[1]) << 4 | decode_byte(inBuffer[2]) >> 2);
+      decodedBytes += 1;
+    }
+  }
+
+  outBuffer[decodedBytes] = '\0';
+  if (length) {
+    *length = decodedBytes;
+  }
+}
+
+} // namespace ats::base64
diff --git a/src/tscore/unit_tests/test_ink_base64.cc b/src/tscore/unit_tests/test_ink_base64.cc
index bc7ac39816a..48cca44b5be 100644
--- a/src/tscore/unit_tests/test_ink_base64.cc
+++ b/src/tscore/unit_tests/test_ink_base64.cc
@@ -2,10 +2,12 @@
 
   Unit tests for ats_base64_encode / ats_base64_decode.
 
-  Includes a regression test for the out-of-bounds read that occurred when the
-  decodable prefix length was not a multiple of four. The decode helper places
-  the input in an exact-size heap buffer so that, under AddressSanitizer, any
-  read past the input (as the old decoder did) is caught.
+  These run in both build configurations. With ENABLE_HIGHWAY_DISPATCH off,
+  the public entry points are the scalar path and these checks pin its
+  behavior. With it on, large inputs take the Highway SIMD path and every
+  check below becomes a byte-for-byte parity test of SIMD vs. the scalar
+  primitives (the oracle), which is why the sizes deliberately straddle the
+  SIMD thresholds and run up to several KB.
 
   @section license License
 
@@ -30,189 +32,158 @@
 
 #include <tscore/ink_base64.h>
 
+#include "../ink_base64_scalar.h" // scalar oracle: encode_scalar / decode_scalar / count_alphabet_prefix
+
 #include <catch2/catch_test_macros.hpp>
 
-#include <algorithm>
 #include <cstdint>
-#include <cstring>
 #include <random>
 #include <string>
 #include <vector>
 
 namespace
 {
-// Decode `b64` with the input held in an EXACT-size heap buffer, so a read
-// past the input trips AddressSanitizer's red zone (this is what the
-// out-of-bounds bug did for prefixes whose length is not a multiple of four).
+// Deterministic pseudo-random bytes (fixed seed -> reproducible).
 std::string
-decode_tight(const std::string &b64)
+prng_bytes(size_t n, uint32_t seed)
 {
-  std::vector<char> in(b64.size() ? b64.size() : 1);
-  if (!b64.empty()) {
-    std::memcpy(in.data(), b64.data(), b64.size());
+  std::mt19937                            rng(seed);
+  std::uniform_int_distribution<unsigned> d(0, 255);
+  std::string                             s(n, '\0');
+  for (auto &c : s) {
+    c = static_cast<char>(d(rng));
   }
+  return s;
+}
 
-  // ats_base64_decode_dstlen() already includes the trailing NUL, so this is
-  // the exact documented minimum -- no slack, so any write at or past the
-  // bound trips ASan.
-  std::vector<unsigned char> out(ats_base64_decode_dstlen(b64.size()), 0xCC);
-  size_t                     n  = 0;
-  bool                       ok = ats_base64_decode(in.data(), b64.size(), out.data(), out.size(), &n);
-  REQUIRE(ok);
-  REQUIRE(out[n] == '\0'); // trailing NUL contract
+// Encode via the scalar oracle.
+std::string
+oracle_encode(const std::string &bin)
+{
+  std::vector<char> out(ats_base64_encode_dstlen(bin.size()) + 1, '\xCC');
+  size_t            n = 0;
+  ats::base64::encode_scalar(reinterpret_cast<const unsigned char *>(bin.data()), bin.size(), out.data(), &n);
+  return std::string(out.data(), n);
+}
+
+// Decode via the scalar oracle (count alphabet prefix, then decode it).
+std::string
+oracle_decode(const std::string &b64)
+{
+  const size_t               valid = ats::base64::count_alphabet_prefix(b64.data(), b64.size());
+  std::vector<unsigned char> out(ats_base64_decode_dstlen(b64.size()) + 1, 0xCC);
+  size_t                     n = 0;
+  ats::base64::decode_scalar(b64.data(), valid, out.data(), &n);
   return std::string(reinterpret_cast<char *>(out.data()), n);
 }
 
+// Encode via the public entry point (SIMD when enabled).
 std::string
-encode(const std::string &bin)
+public_encode(const std::string &bin)
 {
-  std::vector<char> out(ats_base64_encode_dstlen(bin.size()), '\0');
+  std::vector<char> out(ats_base64_encode_dstlen(bin.size()) + 1, '\xDD');
   size_t            n  = 0;
   bool              ok = ats_base64_encode(bin.data(), bin.size(), out.data(), out.size(), &n);
   REQUIRE(ok);
-  REQUIRE(out[n] == '\0');
+  REQUIRE(out[n] == '\0'); // trailing NUL contract
   return std::string(out.data(), n);
 }
 
-const std::string kAlpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+// Decode via the public entry point (SIMD when enabled).
+std::string
+public_decode(const std::string &b64)
+{
+  std::vector<unsigned char> out(ats_base64_decode_dstlen(b64.size()) + 1, 0xDD);
+  size_t                     n  = 0;
+  bool                       ok = ats_base64_decode(b64.data(), b64.size(), out.data(), out.size(), &n);
+  REQUIRE(ok);
+  REQUIRE(out[n] == '\0');
+  return std::string(reinterpret_cast<char *>(out.data()), n);
+}
+
+const std::string kStd = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+const std::string kUrl = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
 } // namespace
 
 TEST_CASE("ats_base64 known vectors", "[base64]")
 {
-  // RFC 4648 §10.
-  CHECK(encode("") == "");
-  CHECK(encode("f") == "Zg==");
-  CHECK(encode("fo") == "Zm8=");
-  CHECK(encode("foo") == "Zm9v");
-  CHECK(encode("foob") == "Zm9vYg==");
-  CHECK(encode("fooba") == "Zm9vYmE=");
-  CHECK(encode("foobar") == "Zm9vYmFy");
-
-  CHECK(decode_tight("Zm9vYmFy") == "foobar");
+  // RFC 4648 §10 test vectors.
+  CHECK(public_encode("") == "");
+  CHECK(public_encode("f") == "Zg==");
+  CHECK(public_encode("fo") == "Zm8=");
+  CHECK(public_encode("foo") == "Zm9v");
+  CHECK(public_encode("foob") == "Zm9vYg==");
+  CHECK(public_encode("fooba") == "Zm9vYmE=");
+  CHECK(public_encode("foobar") == "Zm9vYmFy");
+
+  CHECK(public_decode("Zg==") == "f");
+  CHECK(public_decode("Zm8=") == "fo");
+  CHECK(public_decode("Zm9vYmFy") == "foobar");
 }
 
-TEST_CASE("ats_base64_decode does not read past a non-4-aligned prefix", "[base64]")
+TEST_CASE("ats_base64 encode parity and round-trip across sizes", "[base64]")
 {
-  // Regression: a decodable prefix whose length is not a multiple of four made
-  // the old decoder run an extra loop iteration past the prefix (out of bounds
-  // of the input when the buffer ends there) and read inBuffer[-2]. These
-  // unpadded inputs exercise prefix lengths 2, 3, 6, 7 (i.e. 2 and 3 mod 4);
-  // decode_tight runs them in exact-size buffers so ASan would catch any
-  // over-read.
-  CHECK(decode_tight("") == "");
-  CHECK(decode_tight("Zg") == "f");          // 2 chars  -> 1 byte
-  CHECK(decode_tight("Zm8") == "fo");        // 3 chars  -> 2 bytes
-  CHECK(decode_tight("Zm9v") == "foo");      // 4 chars  -> 3 bytes
-  CHECK(decode_tight("Zm9vYg") == "foob");   // 6 chars  -> 4 bytes
-  CHECK(decode_tight("Zm9vYmE") == "fooba"); // 7 chars  -> 5 bytes
-
-  // A lone trailing character does not encode a full byte and is dropped.
-  CHECK(decode_tight("Zm9vYg==").size() == 4);
-  CHECK(decode_tight("Q") == "");
-
-  // Sweep every prefix length (hence every length mod 4) in an exact-size
-  // buffer; assert the decoded length matches the RFC formula for that many
-  // alphabet characters and rely on ASan for over-read detection.
-  for (size_t len = 0; len <= 300; ++len) {
-    std::string s;
-    s.reserve(len);
-    for (size_t k = 0; k < len; ++k) {
-      s.push_back(kAlpha[(k * 7 + 3) % 64]);
-    }
-    const size_t rem      = len % 4;
-    const size_t expected = (len / 4) * 3 + (rem ? rem - 1 : 0);
-    INFO("len=" << len);
-    CHECK(decode_tight(s).size() == expected);
+  // Straddle the SIMD thresholds (encode 24, decode 32) and go well past them.
+  std::vector<size_t> sizes;
+  for (size_t n = 0; n <= 96; ++n) {
+    sizes.push_back(n);
+  }
+  for (size_t n : {100u, 127u, 128u, 129u, 200u, 255u, 256u, 511u, 512u, 1000u, 4096u, 4099u}) {
+    sizes.push_back(n);
   }
-}
 
-TEST_CASE("ats_base64 round-trips across sizes", "[base64]")
-{
-  std::mt19937                            rng(20240601);
-  std::uniform_int_distribution<unsigned> byte(0, 255);
+  for (size_t n : sizes) {
+    const std::string bin = prng_bytes(n, static_cast<uint32_t>(n * 2654435761u + 1));
 
-  for (size_t n = 0; n <= 256; ++n) {
-    std::string bin(n, '\0');
-    for (auto &c : bin) {
-      c = static_cast<char>(byte(rng));
-    }
+    const std::string enc_pub = public_encode(bin);
+    const std::string enc_ref = oracle_encode(bin);
     INFO("size=" << n);
-    CHECK(decode_tight(encode(bin)) == bin);
+    CHECK(enc_pub == enc_ref);                               // SIMD encode == scalar encode
+    CHECK(public_decode(enc_pub) == bin);                    // round-trips
+    CHECK(public_decode(enc_pub) == oracle_decode(enc_pub)); // SIMD decode == scalar decode
   }
 }
 
-TEST_CASE("ats_base64_decode accepts the URL-safe alphabet", "[base64]")
+TEST_CASE("ats_base64 decode parity for standard and URL-safe alphabets", "[base64]")
 {
-  // '-' and '_' map to the same six-bit values as '+' and '/', so a URL-safe
-  // string decodes to the same bytes as its standard-alphabet equivalent.
-  CHECK(decode_tight("____") == decode_tight("////"));
-  CHECK(decode_tight("----") == decode_tight("++++"));
-
-  // Round-trip through the URL-safe alphabet: encode (standard), translate
-  // '+'/'/' to '-'/'_', decode, and expect the original bytes back.
-  std::mt19937                            rng(99);
-  std::uniform_int_distribution<unsigned> byte(0, 255);
-  for (size_t n = 0; n <= 200; ++n) {
-    std::string bin(n, '\0');
-    for (auto &c : bin) {
-      c = static_cast<char>(byte(rng));
-    }
-    std::string url = encode(bin);
-    for (auto &c : url) {
-      if (c == '+') {
-        c = '-';
-      } else if (c == '/') {
-        c = '_';
+  for (const std::string *alpha : {&kStd, &kUrl}) {
+    std::mt19937                       rng(0xBEEF);
+    std::uniform_int_distribution<int> pick(0, 63);
+    for (size_t n = 0; n <= 300; ++n) {
+      std::string s;
+      s.reserve(n);
+      for (size_t k = 0; k < n; ++k) {
+        s.push_back((*alpha)[pick(rng)]);
       }
+      INFO("alphabet=" << (alpha == &kUrl ? "url" : "std") << " len=" << n);
+      CHECK(public_decode(s) == oracle_decode(s));
     }
-    INFO("size=" << n);
-    CHECK(decode_tight(url) == bin);
   }
-}
 
-TEST_CASE("ats_base64_decode accepts both alphabets mixed in one input", "[base64]")
-{
-  // The standard and URL-safe punctuation may appear in the same input.
-  std::mt19937                            rng(1234);
-  std::uniform_int_distribution<unsigned> byte(0, 255);
-  std::uniform_int_distribution<int>      coin(0, 1);
-  for (size_t n = 0; n <= 200; ++n) {
-    std::string bin(n, '\0');
-    for (auto &c : bin) {
-      c = static_cast<char>(byte(rng));
-    }
-    std::string enc = encode(bin);
-    for (auto &c : enc) {
-      if (c == '+' && coin(rng)) {
-        c = '-';
-      } else if (c == '/' && coin(rng)) {
-        c = '_';
-      }
-    }
-    INFO("size=" << n);
-    CHECK(decode_tight(enc) == bin);
-  }
+  // Both alphabets mixed within one input must decode identically to scalar.
+  const std::string mixed = "QWxhZGRpbjpvcGVuIHNlc2FtZQ"
+                            "-_+/"
+                            "QUJDREVGabcdef0123456789";
+  CHECK(public_decode(mixed) == oracle_decode(mixed));
 }
 
-TEST_CASE("ats_base64_decode truncates at the first non-alphabet byte", "[base64]")
+TEST_CASE("ats_base64 decode truncates at first non-alphabet byte", "[base64]")
 {
-  // A non-alphabet byte (whitespace, '=', or other garbage) ends the decodable
-  // input; decoding stops there and yields the decode of the prefix before it.
-  // Injecting it at every position also exercises the over-read fix under ASan.
-  const char                        *terminators = " \t\n\r=*@";
-  std::mt19937                       rng(55);
-  std::uniform_int_distribution<int> pick(0, 63);
-  for (size_t len : {1u, 2u, 3u, 4u, 5u, 7u, 8u, 16u, 17u, 31u, 33u, 64u, 100u}) {
+  // A non-alphabet byte (whitespace, '=', or garbage) ends the input; the
+  // public path must match the scalar oracle exactly, including the SIMD path.
+  const char *terminators = " \t\n\r=*@";
+
+  for (size_t len : {1u, 4u, 7u, 16u, 17u, 31u, 32u, 33u, 48u, 63u, 64u, 65u, 96u, 128u, 200u}) {
     std::string base;
     for (size_t k = 0; k < len; ++k) {
-      base.push_back(kAlpha[pick(rng)]);
+      base.push_back(kStd[(k * 7 + 3) % 64]);
     }
     for (size_t pos = 0; pos < len; ++pos) {
       for (const char *t = terminators; *t; ++t) {
         std::string s = base;
         s[pos]        = *t;
-        INFO("len=" << len << " pos=" << pos << " term=" << static_cast<int>(*t));
-        CHECK(decode_tight(s) == decode_tight(base.substr(0, pos)));
+        INFO("len=" << len << " pos=" << pos << " term=" << int(*t));
+        CHECK(public_decode(s) == oracle_decode(s));
       }
     }
   }
@@ -220,19 +191,15 @@ TEST_CASE("ats_base64_decode truncates at the first non-alphabet byte", "[base64
 
 TEST_CASE("ats_base64_decode supports in-place (dst == src)", "[base64]")
 {
-  std::mt19937                            rng(7);
-  std::uniform_int_distribution<unsigned> byte(0, 255);
+  for (size_t n : {0u, 1u, 2u, 3u, 10u, 33u, 48u, 64u, 100u, 257u, 1000u}) {
+    const std::string bin = prng_bytes(n, static_cast<uint32_t>(n + 7));
+    const std::string enc = oracle_encode(bin);
 
-  for (size_t n : {0u, 1u, 2u, 3u, 10u, 33u, 48u, 64u, 100u, 257u}) {
-    std::string bin(n, '\0');
-    for (auto &c : bin) {
-      c = static_cast<char>(byte(rng));
-    }
-    const std::string enc    = encode(bin);
-    const std::string expect = decode_tight(enc);
+    // Decode into a separate buffer for the expected result.
+    const std::string expect = public_decode(enc);
 
-    // Large enough to hold both the input copied in and the decoded output.
-    std::vector<char> buf(std::max(enc.size(), ats_base64_decode_dstlen(enc.size())), '\0');
+    // Decode in place: output overwrites the input buffer.
+    std::vector<char> buf(std::max(enc.size(), ats_base64_decode_dstlen(enc.size())) + 1, '\0');
     std::copy(enc.begin(), enc.end(), buf.begin());
     size_t n_out = 0;
     bool   ok    = ats_base64_decode(buf.data(), enc.size(), reinterpret_cast<unsigned char *>(buf.data()), buf.size(), &n_out);
@@ -245,12 +212,16 @@ TEST_CASE("ats_base64_decode supports in-place (dst == src)", "[base64]")
 TEST_CASE("ats_base64 rejects undersized output buffers", "[base64]")
 {
   const std::string bin = "hello world, base64";
-  const std::string enc = encode(bin);
-  size_t            n   = 0;
+  const std::string enc = oracle_encode(bin);
 
+  size_t            n = 0;
   std::vector<char> small_enc(ats_base64_encode_dstlen(bin.size()) - 1);
   CHECK_FALSE(ats_base64_encode(bin.data(), bin.size(), small_enc.data(), small_enc.size(), &n));
 
   std::vector<unsigned char> small_dec(ats_base64_decode_dstlen(enc.size()) - 1);
   CHECK_FALSE(ats_base64_decode(enc.data(), enc.size(), small_dec.data(), small_dec.size(), &n));
+
+  // Exactly the required size must succeed.
+  std::vector<char> ok_enc(ats_base64_encode_dstlen(bin.size()));
+  CHECK(ats_base64_encode(bin.data(), bin.size(), ok_enc.data(), ok_enc.size(), &n));
 }
diff --git a/tests/fuzzing/CMakeLists.txt b/tests/fuzzing/CMakeLists.txt
index 3639de02409..2d523d5efe3 100644
--- a/tests/fuzzing/CMakeLists.txt
+++ b/tests/fuzzing/CMakeLists.txt
@@ -29,6 +29,7 @@ add_executable(fuzz_proxy_protocol fuzz_proxy_protocol.cc)
 add_executable(fuzz_rec_http fuzz_rec_http.cc)
 add_executable(fuzz_yamlcpp fuzz_yamlcpp.cc)
 add_executable(fuzz_http3frame fuzz_http3frame.cc)
+add_executable(fuzz_base64 fuzz_base64.cc)
 
 #  Need to rewrite the ESI Parser to remove dependencies on TS API
 #target_link_libraries(fuzz_esi PRIVATE esi-common esicore tscore tsapi inkevent http)
@@ -47,6 +48,10 @@ target_link_libraries(fuzz_yamlcpp PRIVATE yaml-cpp)
 target_link_options(fuzz_yamlcpp PRIVATE "-fuse-ld=lld")
 target_link_libraries(fuzz_http3frame PRIVATE ts::tscore ts::quic)
 target_link_options(fuzz_http3frame PRIVATE "-fuse-ld=lld")
+target_link_libraries(fuzz_base64 PRIVATE ts::tscore)
+target_link_options(fuzz_base64 PRIVATE "-fuse-ld=lld")
+# fuzz_base64 cross-checks the public path against the scalar reference header.
+target_include_directories(fuzz_base64 PRIVATE ${CMAKE_SOURCE_DIR}/src/tscore)
 
 target_sources(
   fuzz_hpack PRIVATE ${CMAKE_SOURCE_DIR}/src/proxy/http2/HTTP2.cc ${CMAKE_SOURCE_DIR}/src/proxy/http2/Http2Frame.cc
diff --git a/tests/fuzzing/fuzz_base64.cc b/tests/fuzzing/fuzz_base64.cc
new file mode 100644
index 00000000000..3e8a001c089
--- /dev/null
+++ b/tests/fuzzing/fuzz_base64.cc
@@ -0,0 +1,115 @@
+/** @file
+
+  fuzzing tscore base64 (ats_base64_encode / ats_base64_decode)
+
+  Treats the fuzz input as untrusted base64 and decodes it, then round-trips
+  arbitrary bytes through encode/decode. Every operation is cross-checked
+  against the scalar reference primitives, so any divergence of the public
+  (SIMD, when ENABLE_HIGHWAY_DISPATCH is on) path from the scalar path aborts.
+  Run under AddressSanitizer to catch any out-of-bounds access on the
+  untrusted-input decode path.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <cstddef> // size_t, used by ink_base64.h
+
+#include <tscore/ink_base64.h>
+
+#include "ink_base64_scalar.h" // scalar reference (added to include path by CMake)
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#define kMaxInputLength 65536
+
+namespace
+{
+[[noreturn]] void
+fail(const char *what)
+{
+  std::fprintf(stderr, "base64 fuzz mismatch: %s\n", what);
+  std::abort();
+}
+
+// Decode `in` two ways and require identical results (length, bytes, NUL).
+void
+check_decode(const char *in, size_t len)
+{
+  const size_t cap = ats_base64_decode_dstlen(len);
+
+  std::vector<unsigned char> out_pub(cap + 1, 0xAB);
+  size_t                     n_pub = 0;
+  if (!ats_base64_decode(in, len, out_pub.data(), out_pub.size(), &n_pub)) {
+    fail("decode returned false with sufficient buffer");
+  }
+
+  const size_t               valid = ats::base64::count_alphabet_prefix(in, len);
+  std::vector<unsigned char> out_ref(cap + 1, 0xCD);
+  size_t                     n_ref = 0;
+  ats::base64::decode_scalar(in, valid, out_ref.data(), &n_ref);
+
+  if (n_pub != n_ref || std::memcmp(out_pub.data(), out_ref.data(), n_ref) != 0 || out_pub[n_ref] != '\0') {
+    fail("decode parity");
+  }
+}
+} // namespace
+
+extern "C" int
+LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
+{
+  if (size > kMaxInputLength) {
+    return 0;
+  }
+
+  // 1. Decode the untrusted input directly (any byte values).
+  check_decode(reinterpret_cast<const char *>(data), size);
+
+  // 2. Encode the input bytes; cross-check encode, then decode back and
+  //    confirm the original bytes are recovered.
+  const size_t      ecap = ats_base64_encode_dstlen(size);
+  std::vector<char> enc_pub(ecap + 1, 1);
+  size_t            ne_pub = 0;
+  if (!ats_base64_encode(reinterpret_cast<const char *>(data), size, enc_pub.data(), enc_pub.size(), &ne_pub)) {
+    fail("encode returned false with sufficient buffer");
+  }
+
+  std::vector<char> enc_ref(ecap + 1, 2);
+  size_t            ne_ref = 0;
+  ats::base64::encode_scalar(data, size, enc_ref.data(), &ne_ref);
+  if (ne_pub != ne_ref || std::memcmp(enc_pub.data(), enc_ref.data(), ne_ref + 1) != 0) {
+    fail("encode parity");
+  }
+
+  // Encoded output is pure alphabet (+ padding); decoding it must recover the
+  // original bytes, and the two decode paths must agree.
+  check_decode(enc_pub.data(), ne_pub);
+
+  std::vector<unsigned char> back(ats_base64_decode_dstlen(ne_pub) + 1, 0);
+  size_t                     nb = 0;
+  ats_base64_decode(enc_pub.data(), ne_pub, back.data(), back.size(), &nb);
+  if (nb != size || std::memcmp(back.data(), data, size) != 0) {
+    fail("encode/decode round-trip");
+  }
+
+  return 0;
+}

From 779ce45723feea9852321a9f5d8d5b6f46848bde Mon Sep 17 00:00:00 2001
From: "Phong X. Nguyen" <phongn@gmail.com>
Date: Thu, 11 Jun 2026 03:28:34 +0000
Subject: [PATCH 08/12] Fix merge error

---
 CMakeLists.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 229aa1002be..30cc29440c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -330,11 +330,6 @@ elseif(TS_HAS_MIMALLOC)
   link_libraries(mimalloc)
 endif()
 
-if(ENABLE_HIGHWAY_DISPATCH)
-  find_package(hwy CONFIG REQUIRED)
-  set(TS_HAS_HIGHWAY_DISPATCH ${hwy_FOUND})
-endif()
-
 if(ENABLE_QUICHE)
   find_package(quiche REQUIRED)
 

From 59b3602bfef1ba1df461af32a0f80eaef9d7eff9 Mon Sep 17 00:00:00 2001
From: "Phong X. Nguyen" <phongn@gmail.com>
Date: Thu, 11 Jun 2026 03:45:56 +0000
Subject: [PATCH 09/12] Use old API for backport

---
 src/proxy/http2/unit_tests/test_HpackIndexingTable.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
index 7692931b1af..d682f29abe8 100644
--- a/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
+++ b/src/proxy/http2/unit_tests/test_HpackIndexingTable.cc
@@ -547,10 +547,10 @@ TEST_CASE("HPACK encode lower-cases mixed-case field names", "[hpack]")
   // Use a name long enough to exercise the 16-byte SSE2 body when present.
   auto encode_one = [](HpackIndexingTable &table, uint8_t *buf, const char *name, const char *value) -> int64_t {
     std::unique_ptr<HTTPHdr, void (*)(HTTPHdr *)> headers(new HTTPHdr, destroy_http_hdr);
-    headers->create(HTTPType::REQUEST);
+    headers->create(HTTP_TYPE_REQUEST);
     MIMEField *field = mime_field_create(headers->m_heap, headers->m_http->m_fields_impl);
-    field->name_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{name});
-    field->value_set(headers->m_heap, headers->m_http->m_fields_impl, std::string_view{value});
+    field->name_set(headers->m_heap, headers->m_http->m_fields_impl, name, strlen(name));
+    field->value_set(headers->m_heap, headers->m_http->m_fields_impl, value, strlen(value));
     mime_hdr_field_attach(headers->m_http->m_fields_impl, field, 1, nullptr);
     std::memset(buf, 0, BUFSIZE_FOR_REGRESSION_TEST);
     return hpack_encode_header_block(table, buf, BUFSIZE_FOR_REGRESSION_TEST, headers.get());

From 96cf85428b5a1fc7ac838ef457150ba9c680dbff Mon Sep 17 00:00:00 2001
From: "Phong X. Nguyen" <phongn@gmail.com>
Date: Thu, 11 Jun 2026 03:50:09 +0000
Subject: [PATCH 10/12] fix another merge error

---
 tools/benchmark/CMakeLists.txt | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt
index c08e9a94d12..ef9d8ae5888 100644
--- a/tools/benchmark/CMakeLists.txt
+++ b/tools/benchmark/CMakeLists.txt
@@ -33,20 +33,5 @@ target_link_libraries(benchmark_ProxyAllocator PRIVATE Catch2::Catch2WithMain ts
 add_executable(benchmark_SharedMutex benchmark_SharedMutex.cc)
 target_link_libraries(benchmark_SharedMutex PRIVATE Catch2::Catch2 ts::tscore libswoc::libswoc)
 
-add_executable(benchmark_Random benchmark_Random.cc)
-target_link_libraries(benchmark_Random PRIVATE Catch2::Catch2WithMain ts::tscore)
-
 add_executable(benchmark_ascii_tolower benchmark_ascii_tolower.cc)
 target_link_libraries(benchmark_ascii_tolower PRIVATE Catch2::Catch2WithMain ts::tscore)
-
-add_executable(benchmark_HostDB benchmark_HostDB.cc)
-target_link_libraries(
-  benchmark_HostDB
-  PRIVATE ts::tscore
-          ts::tsutil
-          ts::inkevent
-          ts::http
-          ts::http_remap
-          ts::inkcache
-          ts::inkhostdb
-)

From 9e52929d7883149e7c3d8fc3126fd824bf2eb156 Mon Sep 17 00:00:00 2001
From: "Phong X. Nguyen" <phongn@gmail.com>
Date: Thu, 11 Jun 2026 04:30:57 +0000
Subject: [PATCH 11/12] Additional Claude Fable remediations

---
 CMakeLists.txt                     |   5 +-
 CMakePresets.json                  |   9 ++
 include/tscore/ink_ascii_tolower.h | 197 +++++------------------------
 lib/CMakeLists.txt                 |   2 +-
 src/tscore/CMakeLists.txt          |  23 +---
 src/tscore/ink_base64.cc           |   4 +-
 src/tscore/ink_base64_dispatch.cc  |  22 ++--
 src/tscore/ink_base64_scalar.h     |  13 ++
 8 files changed, 76 insertions(+), 199 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30cc29440c3..7cd4e6d7ce2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -454,9 +454,10 @@ if(ENABLE_HIGHWAY_DISPATCH)
   if(EXTERNAL_HWY)
     message(STATUS "Looking for external highway")
     find_package(HWY "1.4.0" CONFIG REQUIRED)
-    set(TS_HAS_HIGHWAY_DISPATCH ${hwy_FOUND})
+    set(TS_HAS_HIGHWAY_DISPATCH ${HWY_FOUND})
+  else()
+    set(TS_HAS_HIGHWAY_DISPATCH 1)
   endif()
-  set(TS_HAS_HIGHWAY_DISPATCH 1)
 endif()
 
 include(Check128BitCas)
diff --git a/CMakePresets.json b/CMakePresets.json
index cec863644f3..08e316b316b 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -280,6 +280,15 @@
         "CMAKE_BUILD_TYPE": "Release"
       }
     },
+    {
+      "name": "branch-highway",
+      "displayName": "CI branch with Highway SIMD dispatch",
+      "description": "Builds the Highway runtime-dispatched SIMD kernels (base64, ASCII tolower) so their unit tests run as real SIMD-vs-scalar parity checks instead of scalar-vs-scalar.",
+      "inherits": ["branch"],
+      "cacheVariables": {
+        "ENABLE_HIGHWAY_DISPATCH": "ON"
+      }
+    },
     {
       "name": "branch-debug",
       "displayName": "CI branch debug",
diff --git a/include/tscore/ink_ascii_tolower.h b/include/tscore/ink_ascii_tolower.h
index 7159a1de6ae..8d69121796b 100644
--- a/include/tscore/ink_ascii_tolower.h
+++ b/include/tscore/ink_ascii_tolower.h
@@ -1,58 +1,31 @@
 /** @file
 
-  SIMD-accelerated bulk ASCII tolower copy.
+  Bulk ASCII tolower copy.
 
   Used on the URL canonicalization fast path for cache-key digests
-  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast) and any other place
-  that needs to fold ASCII to lowercase over a small-to-moderate
-  buffer. The scalar byte-at-a-time loop is the bottleneck for hosts
-  and schemes long enough to vectorize; for shorter inputs the scalar
-  tail handles them with no SIMD overhead.
+  (src/proxy/hdrs/URL.cc::url_CryptoHash_get_fast) and any other place that
+  needs to fold ASCII to lowercase over a small-to-moderate buffer.
 
   Semantics match a byte-at-a-time loop using ParseRules::ink_tolower():
 
-    - Bytes in 'A'..'Z' (0x41..0x5A) have bit 5 set, mapping them to
-      'a'..'z'. All other bytes (including 0x80..0xFF) pass through
-      unchanged. There is no UTF-8 case folding.
+    - Bytes in 'A'..'Z' (0x41..0x5A) are folded to 'a'..'z' (bit 5 set). All
+      other bytes (including 0x80..0xFF) pass through unchanged. There is no
+      UTF-8 case folding.
 
-    - In-place use (dst == src) is supported on every path. Each SIMD
-      body loads a full block into a register before storing back at
-      the same offset, and the AVX-512BW masked tail does masked-load
-      / masked-store at the same offset. Partial overlap where
-      dst != src but the ranges intersect is not supported.
+    - In-place use (dst == src) is supported. Partial overlap where dst != src
+      but the ranges intersect is not supported.
 
-  Two implementations exist, gated by the ENABLE_HIGHWAY_DISPATCH
-  CMake option (TS_HAS_HIGHWAY_DISPATCH at compile time):
+  Two implementations, gated by the ENABLE_HIGHWAY_DISPATCH CMake option
+  (TS_HAS_HIGHWAY_DISPATCH at compile time):
 
-    - OFF (default): the compile-time cascade defined below. Selection
-      is purely compile-time; no runtime dispatch. Body is fully
-      inlined into every caller. Best when the build target is broad
-      enough to enable the widest SIMD the production hosts support.
+    - ON: the runtime-dispatched SIMD implementation in
+      src/tscore/ink_ascii_tolower_dispatch.cc, built against Google Highway.
+      The highest SIMD target supported by the live CPU is selected once and
+      cached, so a conservatively compiled binary still runs the widest body
+      its CPU supports. This is the canonical accelerated path.
 
-    - ON: an out-of-line dispatched implementation in
-      src/tscore/ink_ascii_tolower_dispatch.cc, built against Google
-      Highway. The header degenerates to a one-line forward shim. At
-      runtime the highest SIMD target supported by the live CPU is
-      selected once and cached. Best when the build target is
-      intentionally conservative (e.g. -march=westmere) but the
-      production CPU fleet is heterogeneous.
-
-  Compile-time cascade (default-off path). Bodies are stacked
-  widest-first.
-
-    - AVX-512BW builds: when n >= 64, a 64-byte main loop handles the
-      bulk and a single masked load/store finishes any 1..63-byte tail,
-      then we return. When n < 64, we fall through to the AVX2 + SSE2
-      cascade below so tiny inputs avoid the masked tail's fixed setup
-      cost.
-
-    - AVX2 builds: a 32-byte main loop drains to a 16-byte SSE2 step
-      and then to a scalar tail of 0..15 bytes.
-
-    - SSE2 / NEON builds: a single 16-byte main loop drains to a
-      scalar tail.
-
-    - Other targets: scalar only.
+    - OFF (default): a portable scalar loop, which the compiler auto-vectorizes
+      for the build's target. No hand-written intrinsics.
 
   @section license License
 
@@ -79,12 +52,12 @@
 
 #include "tscore/ink_config.h"
 
-#if TS_HAS_HIGHWAY_DISPATCH
-
 namespace ts::ascii
 {
 
-// Out-of-line, runtime-dispatched implementation. Defined in
+#if TS_HAS_HIGHWAY_DISPATCH
+
+// Out-of-line, runtime-dispatched SIMD implementation. Defined in
 // src/tscore/ink_ascii_tolower_dispatch.cc via Highway HWY_EXPORT.
 void tolower_copy_dispatch(char *dst, const char *src, std::size_t n) noexcept;
 
@@ -94,130 +67,24 @@ tolower_copy(char *dst, const char *src, std::size_t n) noexcept
   tolower_copy_dispatch(dst, src, n);
 }
 
-inline void
-tolower_inplace(char *buf, std::size_t n) noexcept
-{
-  tolower_copy_dispatch(buf, buf, n);
-}
-
-} // namespace ts::ascii
-
-#else // !TS_HAS_HIGHWAY_DISPATCH — compile-time cascade
-
-#if defined(__AVX512BW__) || defined(__AVX2__) || defined(__SSE2__)
-#include <immintrin.h>
-#elif defined(__ARM_NEON) || defined(__aarch64__)
-#include <arm_neon.h>
-#endif
-
-namespace ts::ascii
-{
+#else // !TS_HAS_HIGHWAY_DISPATCH — portable scalar fallback
 
 inline void
 tolower_copy(char *dst, const char *src, std::size_t n) noexcept
 {
-#if defined(__AVX512BW__)
-  // AVX-512BW: 64 bytes per iteration with two key optimizations over the
-  // narrower paths:
-  //   - _mm512_mask_add_epi8 fuses the "+0x20 where upper" into a single
-  //     op (no separate maskz_set1 + or).
-  //   - A masked load/store handles the 1..63-byte tail in a single SIMD
-  //     pass, so we don't need to cascade to AVX2/SSE2 to drain the
-  //     remainder.
-  //
-  // The masked tail does carry ~7 ns of fixed setup cost, which loses to
-  // the cascade on short inputs. Gating the whole block on n >= 64 means
-  // tiny inputs fall through to the AVX2/SSE2 path below, where they keep
-  // the speedup that path already provides.
-  //
-  // Adapted from Tony Finch's copytolower64.c (see NOTICE).
-  if (n >= 64) {
-    const __m512i A_vec = _mm512_set1_epi8('A');
-    const __m512i Z_vec = _mm512_set1_epi8('Z');
-    const __m512i delta = _mm512_set1_epi8('a' - 'A');
-    do {
-      __m512i   bytes    = _mm512_loadu_epi8(src);
-      __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
-      _mm512_storeu_epi8(dst, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
-      src += 64;
-      dst += 64;
-      n   -= 64;
-    } while (n >= 64);
-    if (n != 0) {
-      auto      len_mask = static_cast<__mmask64>((~0ULL) >> (64 - n));
-      __m512i   bytes    = _mm512_maskz_loadu_epi8(len_mask, src);
-      __mmask64 is_upper = _mm512_cmpge_epi8_mask(bytes, A_vec) & _mm512_cmple_epi8_mask(bytes, Z_vec);
-      _mm512_mask_storeu_epi8(dst, len_mask, _mm512_mask_add_epi8(bytes, is_upper, bytes, delta));
-    }
-    return;
-  }
-#endif
-
-#if defined(__AVX2__)
-  // 32 bytes per iteration. Same compare-and-OR pattern as SSE2.
-  {
-    const __m256i a_minus_one = _mm256_set1_epi8('A' - 1);
-    const __m256i z_plus_one  = _mm256_set1_epi8('Z' + 1);
-    const __m256i bit5        = _mm256_set1_epi8(0x20);
-    while (n >= 32) {
-      __m256i bytes = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
-      __m256i ge_A  = _mm256_cmpgt_epi8(bytes, a_minus_one);
-      __m256i le_Z  = _mm256_cmpgt_epi8(z_plus_one, bytes);
-      __m256i mask  = _mm256_and_si256(_mm256_and_si256(ge_A, le_Z), bit5);
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), _mm256_or_si256(bytes, mask));
-      src += 32;
-      dst += 32;
-      n   -= 32;
-    }
-  }
-#endif
-
-#if defined(__SSE2__)
-  // 16 bytes per iteration. Signed compare works for ASCII A-Z because all
-  // letters live below 0x80; high bytes (0x80..0xFF) compare as negative
-  // and correctly miss the [A,Z] range so they pass through unchanged.
-  {
-    const __m128i a_minus_one = _mm_set1_epi8('A' - 1);
-    const __m128i z_plus_one  = _mm_set1_epi8('Z' + 1);
-    const __m128i bit5        = _mm_set1_epi8(0x20);
-    while (n >= 16) {
-      __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
-      __m128i ge_A  = _mm_cmpgt_epi8(bytes, a_minus_one);
-      __m128i le_Z  = _mm_cmpgt_epi8(z_plus_one, bytes);
-      __m128i mask  = _mm_and_si128(_mm_and_si128(ge_A, le_Z), bit5);
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_or_si128(bytes, mask));
-      src += 16;
-      dst += 16;
-      n   -= 16;
-    }
-  }
-#elif defined(__ARM_NEON) || defined(__aarch64__)
-  // 16 bytes per iteration; unsigned compare available natively.
-  {
-    const uint8x16_t a_minus_one = vdupq_n_u8('A' - 1);
-    const uint8x16_t z_plus_one  = vdupq_n_u8('Z' + 1);
-    const uint8x16_t bit5        = vdupq_n_u8(0x20);
-    while (n >= 16) {
-      uint8x16_t bytes = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
-      uint8x16_t ge_A  = vcgtq_u8(bytes, a_minus_one);
-      uint8x16_t le_Z  = vcltq_u8(bytes, z_plus_one);
-      uint8x16_t mask  = vandq_u8(vandq_u8(ge_A, le_Z), bit5);
-      vst1q_u8(reinterpret_cast<uint8_t *>(dst), vorrq_u8(bytes, mask));
-      src += 16;
-      dst += 16;
-      n   -= 16;
-    }
-  }
-#endif
-
-  while (n--) {
-    auto c = static_cast<unsigned char>(*src++);
-    *dst++ = (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : static_cast<char>(c);
+  // The unsigned (c - 'A') < 26 test is true only for 'A'..'Z'; every other
+  // byte (including 0x80..0xFF) wraps to >= 26 and passes through unchanged.
+  // The compiler auto-vectorizes this loop for the build's target.
+  for (std::size_t i = 0; i < n; ++i) {
+    auto c = static_cast<unsigned char>(src[i]);
+    dst[i] = (static_cast<unsigned char>(c - 'A') < 26) ? static_cast<char>(c | 0x20) : static_cast<char>(c);
   }
 }
 
-// Thin sugar over tolower_copy for the in-place case. Makes call sites
-// like ts::ascii::tolower_inplace(buf, n) read naturally instead of
+#endif // TS_HAS_HIGHWAY_DISPATCH
+
+// Thin sugar over tolower_copy for the in-place case. Makes call sites like
+// ts::ascii::tolower_inplace(buf, n) read naturally instead of
 // ts::ascii::tolower_copy(buf, buf, n).
 inline void
 tolower_inplace(char *buf, std::size_t n) noexcept
@@ -226,5 +93,3 @@ tolower_inplace(char *buf, std::size_t n) noexcept
 }
 
 } // namespace ts::ascii
-
-#endif // TS_HAS_HIGHWAY_DISPATCH
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 17771e4ee5f..68e5b6eb98c 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -93,7 +93,7 @@ add_subdirectory(ls-hpack)
 # CMakeLists.txt slightly modified to fully disable
 # building tests if `HWY_ENABLE_TESTS=OFF`, we intend
 # to upstream this change
-if(NOT EXTERNAL_HWY)
+if(ENABLE_HIGHWAY_DISPATCH AND NOT EXTERNAL_HWY)
   message(STATUS "Using internal highway")
   set(HWY_FORCE_STATIC_LIBS
       ON
diff --git a/src/tscore/CMakeLists.txt b/src/tscore/CMakeLists.txt
index d687348f50a..23141e8fc47 100644
--- a/src/tscore/CMakeLists.txt
+++ b/src/tscore/CMakeLists.txt
@@ -94,18 +94,6 @@ add_library(
 )
 add_library(ts::tscore ALIAS tscore)
 
-# Highway-dispatched implementation of ts::ascii::tolower_copy. Only
-# compiled when ENABLE_HIGHWAY_DISPATCH=ON; otherwise the cascade in
-# include/tscore/ink_ascii_tolower.h provides the implementation
-# entirely inline at the call site. The source dir is added to the
-# include path so Highway's foreach_target.h self-include of
-# ink_ascii_tolower_dispatch.cc resolves.
-if(TS_HAS_HIGHWAY_DISPATCH)
-  target_sources(tscore PRIVATE ink_ascii_tolower_dispatch.cc)
-  target_link_libraries(tscore PRIVATE hwy::hwy)
-  target_include_directories(tscore PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
-endif()
-
 # Some plugins link against ts::tscore and therefore need it to be compiled as
 # position independent.
 set_target_properties(tscore PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
@@ -118,12 +106,13 @@ else()
   target_sources(tscore PRIVATE HKDF_openssl.cc)
 endif()
 
-# Highway-dispatched SIMD base64 (encode/decode). Only compiled when
-# ENABLE_HIGHWAY_DISPATCH=ON; otherwise ink_base64.cc uses the scalar path
-# entirely. The source dir is added to the include path so Highway's
-# foreach_target.h self-include of ink_base64_dispatch.cc resolves.
+# Highway runtime-dispatched SIMD kernels (ASCII tolower, base64). Only
+# compiled when ENABLE_HIGHWAY_DISPATCH=ON; otherwise ink_ascii_tolower.h and
+# ink_base64.cc use their scalar paths. The source dir is added to the include
+# path so Highway's foreach_target.h self-include of each *_dispatch.cc
+# resolves.
 if(TS_HAS_HIGHWAY_DISPATCH)
-  target_sources(tscore PRIVATE ink_base64_dispatch.cc)
+  target_sources(tscore PRIVATE ink_ascii_tolower_dispatch.cc ink_base64_dispatch.cc)
   target_link_libraries(tscore PRIVATE hwy::hwy)
   target_include_directories(tscore PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 endif()
diff --git a/src/tscore/ink_base64.cc b/src/tscore/ink_base64.cc
index 2c723f42240..00e84110d30 100644
--- a/src/tscore/ink_base64.cc
+++ b/src/tscore/ink_base64.cc
@@ -111,8 +111,6 @@ ats_base64_decode(const char *inBuffer, size_t inBufferSize, unsigned char *outB
 
   // Ignore any trailing `=`s or other undecodable characters, then decode the
   // valid alphabet prefix.
-  const size_t valid = ats::base64::count_alphabet_prefix(inBuffer, inBufferSize);
-
-  ats::base64::decode_scalar(inBuffer, valid, outBuffer, length);
+  ats::base64::decode_scalar_prefix(inBuffer, inBufferSize, outBuffer, length);
   return true;
 }
diff --git a/src/tscore/ink_base64_dispatch.cc b/src/tscore/ink_base64_dispatch.cc
index 427652bf862..1fafae0037e 100644
--- a/src/tscore/ink_base64_dispatch.cc
+++ b/src/tscore/ink_base64_dispatch.cc
@@ -142,16 +142,16 @@ namespace HWY_NAMESPACE
         break;
       }
     }
-    // Scalar finishes the remainder: count the alphabet prefix (truncating at
-    // the first non-alphabet byte) then decode 4-groups + a 2/3 char tail.
+    // Scalar finishes the remainder: truncate at the first non-alphabet byte
+    // then decode 4-groups + a 2/3 char tail (and write the trailing NUL).
     // Identical to running the scalar decoder over the whole alphabet prefix,
     // because the SIMD loop consumed only fully-valid 4-group-aligned blocks.
-    const size_t rem_valid = count_alphabet_prefix(in + i, in_len - i);
-    size_t       tail_len  = 0;
-    decode_scalar(in + i, rem_valid, out + o, &tail_len);
-    o        += tail_len;
-    out[o]    = '\0';
-    *out_len  = o;
+    size_t tail_len = 0;
+    decode_scalar_prefix(in + i, in_len - i, out + o, &tail_len);
+    o += tail_len;
+    if (out_len) {
+      *out_len = o;
+    }
   }
 
   // ---- ENCODE ----
@@ -226,8 +226,10 @@ namespace HWY_NAMESPACE
     }
     size_t tail = 0;
     encode_scalar(in + i, in_len - i, out + o, &tail);
-    o        += tail;
-    *out_len  = o;
+    o += tail;
+    if (out_len) {
+      *out_len = o;
+    }
   }
 
 } // namespace HWY_NAMESPACE
diff --git a/src/tscore/ink_base64_scalar.h b/src/tscore/ink_base64_scalar.h
index 51049705bdd..0f89f3eda93 100644
--- a/src/tscore/ink_base64_scalar.h
+++ b/src/tscore/ink_base64_scalar.h
@@ -163,4 +163,17 @@ decode_scalar(const char *inBuffer, size_t inBufferSize, unsigned char *outBuffe
   }
 }
 
+// Decode the leading base64-alphabet run of inBuffer[0..inBufferSize): truncate
+// at the first non-alphabet byte, write the decoded bytes plus trailing NUL to
+// outBuffer, and set *length (if non-null) to the decoded byte count. This is
+// the canonical scalar decode, used directly by ats_base64_decode and as the
+// SIMD path's scalar tail in ink_base64_dispatch.cc.
+inline void
+decode_scalar_prefix(const char *inBuffer, size_t inBufferSize, unsigned char *outBuffer, size_t *length)
+{
+  const size_t valid = count_alphabet_prefix(inBuffer, inBufferSize);
+
+  decode_scalar(inBuffer, valid, outBuffer, length);
+}
+
 } // namespace ats::base64

From f430a25c641cd32253b1e1dc34aa05f07c966a2e Mon Sep 17 00:00:00 2001
From: "Phong X. Nguyen" <phongn@gmail.com>
Date: Thu, 11 Jun 2026 04:36:38 +0000
Subject: [PATCH 12/12] fix wrong namespace

---
 src/tscore/ink_base64.cc                 | 8 ++++----
 src/tscore/ink_base64_dispatch.cc        | 8 ++++----
 src/tscore/ink_base64_dispatch.h         | 4 ++--
 src/tscore/ink_base64_scalar.h           | 4 ++--
 src/tscore/unit_tests/test_ink_base64.cc | 6 +++---
 tests/fuzzing/fuzz_base64.cc             | 6 +++---
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/tscore/ink_base64.cc b/src/tscore/ink_base64.cc
index 00e84110d30..635500cc0c4 100644
--- a/src/tscore/ink_base64.cc
+++ b/src/tscore/ink_base64.cc
@@ -78,12 +78,12 @@ ats_base64_encode(const unsigned char *inBuffer, size_t inBufferSize, char *outB
 
 #if TS_HAS_HIGHWAY_DISPATCH
   if (inBufferSize > BASE64_ENCODE_SIMD_THRESHOLD) {
-    ats::base64::encode_dispatch(inBuffer, inBufferSize, outBuffer, length);
+    ts::base64::encode_dispatch(inBuffer, inBufferSize, outBuffer, length);
     return true;
   }
 #endif
 
-  ats::base64::encode_scalar(inBuffer, inBufferSize, outBuffer, length);
+  ts::base64::encode_scalar(inBuffer, inBufferSize, outBuffer, length);
   return true;
 }
 
@@ -104,13 +104,13 @@ ats_base64_decode(const char *inBuffer, size_t inBufferSize, unsigned char *outB
   if (inBufferSize > BASE64_DECODE_SIMD_THRESHOLD) {
     // The SIMD path validates inline and truncates at the first non-alphabet
     // byte, so no separate alphabet pre-scan is needed here.
-    ats::base64::decode_dispatch(inBuffer, inBufferSize, outBuffer, length);
+    ts::base64::decode_dispatch(inBuffer, inBufferSize, outBuffer, length);
     return true;
   }
 #endif
 
   // Ignore any trailing `=`s or other undecodable characters, then decode the
   // valid alphabet prefix.
-  ats::base64::decode_scalar_prefix(inBuffer, inBufferSize, outBuffer, length);
+  ts::base64::decode_scalar_prefix(inBuffer, inBufferSize, outBuffer, length);
   return true;
 }
diff --git a/src/tscore/ink_base64_dispatch.cc b/src/tscore/ink_base64_dispatch.cc
index 1fafae0037e..21aebdaaddd 100644
--- a/src/tscore/ink_base64_dispatch.cc
+++ b/src/tscore/ink_base64_dispatch.cc
@@ -56,7 +56,7 @@
 #include <hwy/highway.h>
 
 HWY_BEFORE_NAMESPACE();
-namespace ats::base64
+namespace ts::base64
 {
 namespace HWY_NAMESPACE
 {
@@ -233,11 +233,11 @@ namespace HWY_NAMESPACE
   }
 
 } // namespace HWY_NAMESPACE
-} // namespace ats::base64
+} // namespace ts::base64
 HWY_AFTER_NAMESPACE();
 
 #if HWY_ONCE
-namespace ats::base64
+namespace ts::base64
 {
 HWY_EXPORT(DecodeImpl);
 HWY_EXPORT(EncodeImpl);
@@ -254,5 +254,5 @@ encode_dispatch(const unsigned char *in, size_t in_len, char *out, size_t *out_l
   HWY_DYNAMIC_DISPATCH(EncodeImpl)(in, in_len, out, out_len);
 }
 
-} // namespace ats::base64
+} // namespace ts::base64
 #endif
diff --git a/src/tscore/ink_base64_dispatch.h b/src/tscore/ink_base64_dispatch.h
index afd67332f91..98f3eef335d 100644
--- a/src/tscore/ink_base64_dispatch.h
+++ b/src/tscore/ink_base64_dispatch.h
@@ -35,7 +35,7 @@
 #include <cstddef>
 #include <cstdint>
 
-namespace ats::base64
+namespace ts::base64
 {
 // Decode `in_len` bytes of base64 input. Output buffer capacity has already
 // been validated by the caller (ats_base64_decode). Writes a trailing NUL at
@@ -46,4 +46,4 @@ void decode_dispatch(const char *in, size_t in_len, unsigned char *out, size_t *
 // Writes a trailing NUL at out[*out_len].
 void encode_dispatch(const unsigned char *in, size_t in_len, char *out, size_t *out_len);
 
-} // namespace ats::base64
+} // namespace ts::base64
diff --git a/src/tscore/ink_base64_scalar.h b/src/tscore/ink_base64_scalar.h
index 0f89f3eda93..d3ff8d39c3e 100644
--- a/src/tscore/ink_base64_scalar.h
+++ b/src/tscore/ink_base64_scalar.h
@@ -49,7 +49,7 @@
 #include <cstdint>
 #include <cstring>
 
-namespace ats::base64
+namespace ts::base64
 {
 // Converts a printable character to its six-bit representation; 64 marks a
 // non-alphabet byte. Both standard (`+`=62, `/`=63) and URL-safe (`-`=62,
@@ -176,4 +176,4 @@ decode_scalar_prefix(const char *inBuffer, size_t inBufferSize, unsigned char *o
   decode_scalar(inBuffer, valid, outBuffer, length);
 }
 
-} // namespace ats::base64
+} // namespace ts::base64
diff --git a/src/tscore/unit_tests/test_ink_base64.cc b/src/tscore/unit_tests/test_ink_base64.cc
index 48cca44b5be..2e103ff04d4 100644
--- a/src/tscore/unit_tests/test_ink_base64.cc
+++ b/src/tscore/unit_tests/test_ink_base64.cc
@@ -62,7 +62,7 @@ oracle_encode(const std::string &bin)
 {
   std::vector<char> out(ats_base64_encode_dstlen(bin.size()) + 1, '\xCC');
   size_t            n = 0;
-  ats::base64::encode_scalar(reinterpret_cast<const unsigned char *>(bin.data()), bin.size(), out.data(), &n);
+  ts::base64::encode_scalar(reinterpret_cast<const unsigned char *>(bin.data()), bin.size(), out.data(), &n);
   return std::string(out.data(), n);
 }
 
@@ -70,10 +70,10 @@ oracle_encode(const std::string &bin)
 std::string
 oracle_decode(const std::string &b64)
 {
-  const size_t               valid = ats::base64::count_alphabet_prefix(b64.data(), b64.size());
+  const size_t               valid = ts::base64::count_alphabet_prefix(b64.data(), b64.size());
   std::vector<unsigned char> out(ats_base64_decode_dstlen(b64.size()) + 1, 0xCC);
   size_t                     n = 0;
-  ats::base64::decode_scalar(b64.data(), valid, out.data(), &n);
+  ts::base64::decode_scalar(b64.data(), valid, out.data(), &n);
   return std::string(reinterpret_cast<char *>(out.data()), n);
 }
 
diff --git a/tests/fuzzing/fuzz_base64.cc b/tests/fuzzing/fuzz_base64.cc
index 3e8a001c089..969fc1694c3 100644
--- a/tests/fuzzing/fuzz_base64.cc
+++ b/tests/fuzzing/fuzz_base64.cc
@@ -63,10 +63,10 @@ check_decode(const char *in, size_t len)
     fail("decode returned false with sufficient buffer");
   }
 
-  const size_t               valid = ats::base64::count_alphabet_prefix(in, len);
+  const size_t               valid = ts::base64::count_alphabet_prefix(in, len);
   std::vector<unsigned char> out_ref(cap + 1, 0xCD);
   size_t                     n_ref = 0;
-  ats::base64::decode_scalar(in, valid, out_ref.data(), &n_ref);
+  ts::base64::decode_scalar(in, valid, out_ref.data(), &n_ref);
 
   if (n_pub != n_ref || std::memcmp(out_pub.data(), out_ref.data(), n_ref) != 0 || out_pub[n_ref] != '\0') {
     fail("decode parity");
@@ -95,7 +95,7 @@ LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
 
   std::vector<char> enc_ref(ecap + 1, 2);
   size_t            ne_ref = 0;
-  ats::base64::encode_scalar(data, size, enc_ref.data(), &ne_ref);
+  ts::base64::encode_scalar(data, size, enc_ref.data(), &ne_ref);
   if (ne_pub != ne_ref || std::memcmp(enc_pub.data(), enc_ref.data(), ne_ref + 1) != 0) {
     fail("encode parity");
   }