diff --git a/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr b/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr
index edd01df3..7bb4c6a9 100644
--- a/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr
+++ b/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr
@@ -16,6 +16,7 @@ uses
ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas',
ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas',
ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas',
+ ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas',
ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas',
ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas',
ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas',
@@ -183,6 +184,17 @@ uses
ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas',
ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas',
ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas',
+ ClpChaChaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas',
+ ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas',
+ ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas',
+ ClpSalsaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas',
+ ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas',
+ ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas',
+ ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas',
+ ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas',
+ ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas',
+ ClpGcmSivSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas',
+ ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas',
ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas',
ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas',
ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas',
@@ -680,7 +692,9 @@ uses
ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas',
ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas',
ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas',
- ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas',
+ ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas',
+ ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas',
+ ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas',
ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas',
ClpAesNiCcmKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas',
ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas',
diff --git a/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr b/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr
index fb7a8a29..f48218a3 100644
--- a/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr
+++ b/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr
@@ -32,6 +32,7 @@ uses
ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas',
ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas',
ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas',
+ ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas',
ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas',
ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas',
ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas',
@@ -199,6 +200,17 @@ uses
ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas',
ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas',
ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas',
+ ClpChaChaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas',
+ ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas',
+ ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas',
+ ClpSalsaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas',
+ ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas',
+ ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas',
+ ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas',
+ ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas',
+ ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas',
+ ClpGcmSivSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas',
+ ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas',
ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas',
ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas',
ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas',
@@ -696,7 +708,9 @@ uses
ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas',
ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas',
ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas',
- ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas',
+ ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas',
+ ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas',
+ ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas',
ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas',
ClpAesNiCcmKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas',
ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas',
diff --git a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr
index 5cefe272..44c55f12 100644
--- a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr
+++ b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr
@@ -13,6 +13,7 @@ uses
ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas',
ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas',
ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas',
+ ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas',
ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas',
ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas',
ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas',
@@ -180,6 +181,17 @@ uses
ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas',
ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas',
ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas',
+ ClpChaChaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas',
+ ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas',
+ ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas',
+ ClpSalsaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas',
+ ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas',
+ ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas',
+ ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas',
+ ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas',
+ ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas',
+ ClpGcmSivSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas',
+ ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas',
ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas',
ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas',
ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas',
@@ -677,7 +689,9 @@ uses
ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas',
ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas',
ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas',
- ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas',
+ ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas',
+ ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas',
+ ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas',
ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas',
ClpAesNiCcmKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas',
ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas',
diff --git a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj
index ef8b156f..15cf015c 100644
--- a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj
+++ b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj
@@ -356,6 +356,7 @@
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\ClpAesSimd.pas"/>
@@ -523,6 +524,17 @@
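+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas"/>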
@@ -1020,7 +1032,9 @@
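-        <DCCReference Include="..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas"/>
+        <DCCReference Include="..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas"/>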
diff --git a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr
index b224fe2d..2bba9ca1 100644
--- a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr
+++ b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr
@@ -32,6 +32,7 @@ uses
ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas',
ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas',
ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas',
+ ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas',
ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas',
ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas',
ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas',
@@ -199,6 +200,17 @@ uses
ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas',
ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas',
ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas',
+ ClpChaChaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas',
+ ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas',
+ ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas',
+ ClpSalsaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas',
+ ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas',
+ ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas',
+ ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas',
+ ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas',
+ ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas',
+ ClpGcmSivSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas',
+ ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas',
ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas',
ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas',
ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas',
@@ -696,7 +708,9 @@ uses
ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas',
ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas',
ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas',
- ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas',
+ ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas',
+ ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas',
+ ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas',
ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas',
ClpAesNiCcmKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas',
ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas',
diff --git a/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas b/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas
index 9bf5daf9..11b9ba80 100644
--- a/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas
+++ b/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas
@@ -16,11 +16,9 @@
unit BinPolyTests;
-interface
+{$I ..\..\..\..\CryptoLib\src\Include\CryptoLib.inc}
-{$IFDEF FPC}
-{$MODE DELPHI}
-{$ENDIF FPC}
+interface
uses
SysUtils,
@@ -38,7 +36,9 @@ interface
ClpIRandom,
ClpRandom,
ClpBinPolyScalarBackend,
+{$IFDEF CRYPTOLIB_X86_SIMD}
ClpBinPolyX86V128Backend,
+{$ENDIF CRYPTOLIB_X86_SIMD}
ClpBinPolyMulBaseBinomialReduce,
CryptoLibTestBase;
@@ -62,8 +62,14 @@ TPentaCase = record
K3: Int32;
end;
- TTestBinPoly = class(TCryptoLibAlgorithmTestCase)
- strict private
+ /// <summary>
+ /// Shared BinPoly test machinery (constants, RNG helpers, reference
+ /// implementations, generic assert utilities). Backend-agnostic; consumed by
+ /// both the public-API suite (TTestBinPoly) and the per-backend
+ /// vs-scalar suite (TBinPolyBackendTestBase and its subclasses).
+ /// </summary>
+ TBinPolyTestBase = class abstract(TCryptoLibAlgorithmTestCase)
+ strict protected
const
BikeR1 = 12323;
RandomTrials = 16;
@@ -100,10 +106,14 @@ TTestBinPoly = class(TCryptoLibAlgorithmTestCase)
const ARandom: IRandom; const ALabel: string);
procedure RunInvertChecks(const AMul: IBinPolyMul; const AInv: IBinPolyInv;
AN: Int32; const ARandom: IRandom; const ALabel: string);
- procedure RunX86V128VsScalar(AN: Int32; const ARandom: IRandom;
- const AContext: string);
- procedure AssertX86V128MultiplyEquals(AN: Int32;
- const AX, AY: TCryptoLibUInt64Array; const AContext: string);
+ end;
+
+ /// <summary>
+ /// Architecture-neutral BinPoly tests: exercise the public TBinPolys API
+ /// (binomial / trinomial / pentanomial multiply / square / invert, factory
+ /// validation, bit-length). Run on every target.
+ /// </summary>
+ TTestBinPoly = class(TBinPolyTestBase)
published
procedure TestBinomial_Add_AgainstXor_BikeR1;
procedure TestBinomial_AddTo_AgainstXor_BikeR1;
@@ -134,14 +144,52 @@ TTestBinPoly = class(TCryptoLibAlgorithmTestCase)
procedure TestPentanomial_Invert_RoundTrip;
procedure TestInv_Factory_RejectsNullAndDegenerate;
procedure TestBitLengthVar_AgainstReference;
- procedure TestX86V128_Multiply_MatchesScalar;
- procedure TestX86V128_SizeSweep;
- procedure TestX86V128_LSize10_MidWindow;
- procedure TestX86V128_SmallSizes_AllOps;
- procedure TestX86V128_MultiplyByZeroAndOne;
- procedure TestX86V128_EdgeVectors;
end;
+ /// <summary>
+ /// Backend-agnostic suite that diffs a per-arch SIMD IBinPolyMul backend
+ /// against the scalar reference. A concrete per-arch suite supplies three hooks
+ /// (BackendSupported / CreateBackendMul / BackendLabel) and
+ /// registers itself under its architecture guard; the published tests are
+ /// inherited and discovered automatically. Binds to the arch-neutral
+ /// IBinPolyMul interface, never to a concrete backend class.
+ /// </summary>
+ TBinPolyBackendTestBase = class abstract(TBinPolyTestBase)
+ strict protected
+ // ---- architecture hooks (implemented by the concrete per-arch suite) ----
+ function BackendSupported: Boolean; virtual; abstract; // when False, every shared test and helper below exits early without asserting
+ function CreateBackendMul(AN: Int32; const AReduce: IBinPolyReduce)
+ : IBinPolyMul; virtual; abstract; // builds the multiplier that is compared against TBinPolyScalarBackend
+ function BackendLabel: String; virtual; abstract; // short backend name; NOTE(review): presumably meant for diagnostics - confirm intended use
+
+ // ---- shared backend-vs-scalar logic ----
+ procedure RunBackendVsScalar(AN: Int32; const ARandom: IRandom;
+ const AContext: string);
+ procedure AssertBackendMultiplyEquals(AN: Int32;
+ const AX, AY: TCryptoLibUInt64Array; const AContext: string);
+ published
+ procedure TestBackend_Multiply_MatchesScalar;
+ procedure TestBackend_SizeSweep;
+ procedure TestBackend_LSize10_MidWindow;
+ procedure TestBackend_SmallSizes_AllOps;
+ procedure TestBackend_MultiplyByZeroAndOne;
+ procedure TestBackend_EdgeVectors;
+ end;
+
+{$IFDEF CRYPTOLIB_X86_SIMD}
+ /// <summary>
+ /// x86/V128 (PCLMULQDQ) instantiation of the BinPoly backend suite. Registered
+ /// only when CRYPTOLIB_X86_SIMD is defined.
+ /// </summary>
+ TTestBinPolyX86V128 = class(TBinPolyBackendTestBase)
+ strict protected
+ function BackendSupported: Boolean; override; // returns TBinPolyX86V128Backend.IsSupported
+ function CreateBackendMul(AN: Int32; const AReduce: IBinPolyReduce)
+ : IBinPolyMul; override; // returns TBinPolyX86V128Backend.CreateBinPolyMul(AN, AReduce)
+ function BackendLabel: String; override; // returns 'X86V128'
+ end;
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
implementation
const
@@ -239,13 +287,13 @@ implementation
-{ TTestBinPoly }
+{ TBinPolyTestBase }
-procedure TTestBinPoly.AssertUInt64ArraysEqual(ASize: Int32;
+procedure TBinPolyTestBase.AssertUInt64ArraysEqual(ASize: Int32; // fails the test with AContext unless TBinPolys.EqualTo over ASize elements (both arrays at offset 0) returns non-zero
const AExpected, AActual: TCryptoLibUInt64Array; const AContext: string);
begin
Check(TBinPolys.EqualTo(ASize, AExpected, 0, AActual, 0) <> 0, AContext);
end;
-procedure TTestBinPoly.AssertSliceEquals(const AExpected: TCryptoLibUInt64Array;
+procedure TBinPolyTestBase.AssertSliceEquals(const AExpected: TCryptoLibUInt64Array;
const AActual: TCryptoLibUInt64Array; AActualOff, ASize: Int32;
const AContext: string);
var
@@ -255,7 +303,7 @@ procedure TTestBinPoly.AssertSliceEquals(const AExpected: TCryptoLibUInt64Array;
CheckEquals(AExpected[LI], AActual[AActualOff + LI], AContext + ' limb ' + IntToStr(LI));
end;
-procedure TTestBinPoly.AssertGuardZonesEqual(const ABefore, AAfter: TCryptoLibUInt64Array;
+procedure TBinPolyTestBase.AssertGuardZonesEqual(const ABefore, AAfter: TCryptoLibUInt64Array;
ASliceOff, ASliceSize: Int32; const AContext: string);
var
LI: Int32;
@@ -266,7 +314,7 @@ procedure TTestBinPoly.AssertGuardZonesEqual(const ABefore, AAfter: TCryptoLibUI
CheckEquals(ABefore[LI], AAfter[LI], AContext + ' tail guard at ' + IntToStr(LI));
end;
-function TTestBinPoly.PadBuffer(ASliceSize, ASliceOff, APadTail: Int32;
+function TBinPolyTestBase.PadBuffer(ASliceSize, ASliceOff, APadTail: Int32;
const ARandom: IRandom): TCryptoLibUInt64Array;
var
LTotal, LI, LJ: Int32;
@@ -286,7 +334,7 @@ function TTestBinPoly.PadBuffer(ASliceSize, ASliceOff, APadTail: Int32;
end;
end;
-function TTestBinPoly.RandomLimbs(const ARandom: IRandom; ASize: Int32)
+function TBinPolyTestBase.RandomLimbs(const ARandom: IRandom; ASize: Int32)
: TCryptoLibUInt64Array;
var
LI, LJ: Int32;
@@ -305,7 +353,7 @@ function TTestBinPoly.RandomLimbs(const ARandom: IRandom; ASize: Int32)
end;
end;
-function TTestBinPoly.RandomReduced(const ARandom: IRandom; AN: Int32)
+function TBinPolyTestBase.RandomReduced(const ARandom: IRandom; AN: Int32)
: TCryptoLibUInt64Array;
var
LSize, LPartial, LI, LJ: Int32;
@@ -328,7 +376,7 @@ function TTestBinPoly.RandomReduced(const ARandom: IRandom; AN: Int32)
Result[LSize - 1] := Result[LSize - 1] and ((UInt64(1) shl LPartial) - 1);
end;
-function TTestBinPoly.ReferenceBitLength(ASize: Int32;
+function TBinPolyTestBase.ReferenceBitLength(ASize: Int32;
const AX: TCryptoLibUInt64Array): Int32;
var
LBit: Int32;
@@ -339,7 +387,7 @@ function TTestBinPoly.ReferenceBitLength(ASize: Int32;
Result := 0;
end;
-function TTestBinPoly.CarrylessMul(AN: Int32; const AX, AY: TCryptoLibUInt64Array)
+function TBinPolyTestBase.CarrylessMul(AN: Int32; const AX, AY: TCryptoLibUInt64Array)
: TCryptoLibUInt64Array;
var
LSize, LI, LWOff, LBOff, LJ: Int32;
@@ -371,7 +419,7 @@ function TTestBinPoly.CarrylessMul(AN: Int32; const AX, AY: TCryptoLibUInt64Arra
end;
end;
-function TTestBinPoly.ReferenceBinomialMul(AR: Int32;
+function TBinPolyTestBase.ReferenceBinomialMul(AR: Int32;
const AX, AY: TCryptoLibUInt64Array): TCryptoLibUInt64Array;
var
LSize, LP, LQ, LPartial: Int32;
@@ -397,7 +445,7 @@ function TTestBinPoly.ReferenceBinomialMul(AR: Int32;
Result[LSize - 1] := Result[LSize - 1] and LPartialMask;
end;
-function TTestBinPoly.ReferenceTrinomialMul(AN, AK: Int32;
+function TBinPolyTestBase.ReferenceTrinomialMul(AN, AK: Int32;
const AX, AY: TCryptoLibUInt64Array): TCryptoLibUInt64Array;
var
LSize, LP, LQ0, LQ1, LPartial: Int32;
@@ -425,7 +473,7 @@ function TTestBinPoly.ReferenceTrinomialMul(AN, AK: Int32;
Result[LSize - 1] := Result[LSize - 1] and LPartialMask;
end;
-function TTestBinPoly.ReferencePentanomialMul(AN, AK1, AK2, AK3: Int32;
+function TBinPolyTestBase.ReferencePentanomialMul(AN, AK1, AK2, AK3: Int32;
const AX, AY: TCryptoLibUInt64Array): TCryptoLibUInt64Array;
var
LSize, LP, LQ0, LQ1, LQ2, LQ3, LPartial: Int32;
@@ -457,7 +505,7 @@ function TTestBinPoly.ReferencePentanomialMul(AN, AK1, AK2, AK3: Int32;
Result[LSize - 1] := Result[LSize - 1] and LPartialMask;
end;
-procedure TTestBinPoly.RunAllOpsAtOffsets(const AMul: IBinPolyMul; AN: Int32;
+procedure TBinPolyTestBase.RunAllOpsAtOffsets(const AMul: IBinPolyMul; AN: Int32;
const ARandom: IRandom; const ALabel: string);
var
LSize: Int32;
@@ -546,7 +594,7 @@ procedure TTestBinPoly.RunAllOpsAtOffsets(const AMul: IBinPolyMul; AN: Int32;
AssertGuardZonesEqual(LZBufBefore, LZBuf, OffZ, LSize, ALabel + ' AddTo zBuf');
end;
-procedure TTestBinPoly.RunInvertChecks(const AMul: IBinPolyMul; const AInv: IBinPolyInv;
+procedure TBinPolyTestBase.RunInvertChecks(const AMul: IBinPolyMul; const AInv: IBinPolyInv;
AN: Int32; const ARandom: IRandom; const ALabel: string);
var
LSize, LT: Int32;
@@ -1393,28 +1441,28 @@ procedure TTestBinPoly.TestInv_Factory_RejectsNullAndDegenerate;
end;
end;
-// Cross-backend check: the x86/V128 backend must agree with the scalar backend
-// for Multiply, Square and SquareN, even when operands and outputs live at
-// non-zero offsets inside guard-padded buffers (verifies offset handling, that
-// inputs are never clobbered, and that nothing is written outside the result
-// slice).
-procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom;
+// Cross-backend check: the SIMD backend under test must agree with the scalar
+// backend for Multiply, Square and SquareN, even when operands and outputs live
+// at non-zero offsets inside guard-padded buffers (verifies offset handling,
+// that inputs are never clobbered, and that nothing is written outside the
+// result slice).
+procedure TBinPolyBackendTestBase.RunBackendVsScalar(AN: Int32; const ARandom: IRandom;
const AContext: string);
const
SquareNCount = 5;
var
LReduce: IBinPolyReduce;
- LScalar, LX86: IBinPolyMul;
+ LScalar, LSimd: IBinPolyMul;
LSize: Int32;
LX, LY, LRef, LXBuf, LYBuf, LZBuf, LXBefore, LYBefore, LZBefore: TCryptoLibUInt64Array;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then
Exit;
LReduce := TBinPolyMulBaseBinomialReduce.Create(AN);
LScalar := TBinPolyScalarBackend.CreateBinPolyMul(AN, LReduce);
- LX86 := TBinPolyX86V128Backend.CreateBinPolyMul(AN, LReduce);
- LSize := LX86.Size;
+ LSimd := CreateBackendMul(AN, LReduce);
+ LSize := LSimd.Size;
// Multiply
LX := RandomReduced(ARandom, AN);
@@ -1429,7 +1477,7 @@ procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom;
LXBefore := System.Copy(LXBuf);
LYBefore := System.Copy(LYBuf);
LZBefore := System.Copy(LZBuf);
- LX86.Multiply(LXBuf, OffX, LYBuf, OffY, LZBuf, OffZ);
+ LSimd.Multiply(LXBuf, OffX, LYBuf, OffY, LZBuf, OffZ);
AssertSliceEquals(LRef, LZBuf, OffZ, LSize, AContext + ' Multiply');
AssertUInt64ArraysEqual(System.Length(LXBuf), LXBefore, LXBuf, AContext + ' Multiply xBuf clobbered');
AssertUInt64ArraysEqual(System.Length(LYBuf), LYBefore, LYBuf, AContext + ' Multiply yBuf clobbered');
@@ -1444,7 +1492,7 @@ procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom;
TNat.Copy64(LSize, LX, 0, LXBuf, OffX);
LXBefore := System.Copy(LXBuf);
LZBefore := System.Copy(LZBuf);
- LX86.Square(LXBuf, OffX, LZBuf, OffZ);
+ LSimd.Square(LXBuf, OffX, LZBuf, OffZ);
AssertSliceEquals(LRef, LZBuf, OffZ, LSize, AContext + ' Square');
AssertUInt64ArraysEqual(System.Length(LXBuf), LXBefore, LXBuf, AContext + ' Square xBuf clobbered');
AssertGuardZonesEqual(LZBefore, LZBuf, OffZ, LSize, AContext + ' Square zBuf');
@@ -1458,51 +1506,51 @@ procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom;
TNat.Copy64(LSize, LX, 0, LXBuf, OffX);
LXBefore := System.Copy(LXBuf);
LZBefore := System.Copy(LZBuf);
- LX86.SquareN(LXBuf, OffX, SquareNCount, LZBuf, OffZ);
+ LSimd.SquareN(LXBuf, OffX, SquareNCount, LZBuf, OffZ);
AssertSliceEquals(LRef, LZBuf, OffZ, LSize, AContext + ' SquareN');
AssertUInt64ArraysEqual(System.Length(LXBuf), LXBefore, LXBuf, AContext + ' SquareN xBuf clobbered');
AssertGuardZonesEqual(LZBefore, LZBuf, OffZ, LSize, AContext + ' SquareN zBuf');
end;
-procedure TTestBinPoly.AssertX86V128MultiplyEquals(AN: Int32;
+procedure TBinPolyBackendTestBase.AssertBackendMultiplyEquals(AN: Int32; // asserts backend Multiply(AX, AY) equals the scalar backend's result for size AN
const AX, AY: TCryptoLibUInt64Array; const AContext: string);
var
LReduce: IBinPolyReduce;
- LScalar, LX86: IBinPolyMul;
- LScalarZ, LX86Z: TCryptoLibUInt64Array;
+ LScalar, LSimd: IBinPolyMul;
+ LScalarZ, LSimdZ: TCryptoLibUInt64Array;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then // nothing to compare when the per-arch hook reports the backend unavailable
Exit;
LReduce := TBinPolyMulBaseBinomialReduce.Create(AN);
LScalar := TBinPolyScalarBackend.CreateBinPolyMul(AN, LReduce);
- LX86 := TBinPolyX86V128Backend.CreateBinPolyMul(AN, LReduce);
+ LSimd := CreateBackendMul(AN, LReduce); // backend under test shares the same reduce object as the scalar reference
LScalarZ := TBinPolys.Create(LScalar.Size);
- LX86Z := TBinPolys.Create(LX86.Size);
+ LSimdZ := TBinPolys.Create(LSimd.Size);
LScalar.Multiply(AX, 0, AY, 0, LScalarZ, 0);
- LX86.Multiply(AX, 0, AY, 0, LX86Z, 0);
- AssertUInt64ArraysEqual(LScalar.Size, LScalarZ, LX86Z, AContext);
+ LSimd.Multiply(AX, 0, AY, 0, LSimdZ, 0);
+ AssertUInt64ArraysEqual(LScalar.Size, LScalarZ, LSimdZ, BackendLabel + ' ' + AContext); // BackendLabel prefix names the failing per-arch backend in the message
end;
-procedure TTestBinPoly.TestX86V128_Multiply_MatchesScalar;
+procedure TBinPolyBackendTestBase.TestBackend_Multiply_MatchesScalar; // RandomTrials rounds of RunBackendVsScalar at N = BikeR1, RNG seeded with FixedSeed + 2000
var
LRandom: IRandom;
LT: Int32;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then // bail out before creating the RNG when the backend hook reports unsupported
Exit;
LRandom := TRandom.Create(FixedSeed + 2000);
try
for LT := 0 to RandomTrials - 1 do
- RunX86V128VsScalar(BikeR1, LRandom,
+ RunBackendVsScalar(BikeR1, LRandom, // the context string carries the trial index for failure messages
'BikeR1 equivalence trial ' + IntToStr(LT));
finally
LRandom := nil;
end;
end;
-procedure TTestBinPoly.TestX86V128_SizeSweep;
+procedure TBinPolyBackendTestBase.TestBackend_SizeSweep;
const
SweepCases: array [0 .. 17] of TBinCase = (
(CaseName: 'lsize1'; N: 32),
@@ -1527,7 +1575,7 @@ procedure TTestBinPoly.TestX86V128_SizeSweep;
LRandom: IRandom;
LC, LT: Int32;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then
Exit;
for LC := Low(SweepCases) to High(SweepCases) do
@@ -1535,7 +1583,7 @@ procedure TTestBinPoly.TestX86V128_SizeSweep;
LRandom := TRandom.Create(FixedSeed + SweepCases[LC].N);
try
for LT := 0 to RandomTrials - 1 do
- RunX86V128VsScalar(SweepCases[LC].N, LRandom,
+ RunBackendVsScalar(SweepCases[LC].N, LRandom,
SweepCases[LC].CaseName + ' trial ' + IntToStr(LT));
finally
LRandom := nil;
@@ -1543,18 +1591,18 @@ procedure TTestBinPoly.TestX86V128_SizeSweep;
end;
end;
-procedure TTestBinPoly.TestX86V128_LSize10_MidWindow;
+procedure TBinPolyBackendTestBase.TestBackend_LSize10_MidWindow;
var
LRandom: IRandom;
LT: Int32;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then
Exit;
LRandom := TRandom.Create(FixedSeed + 610);
try
for LT := 0 to RandomTrials * 2 - 1 do
- RunX86V128VsScalar(610, LRandom,
+ RunBackendVsScalar(610, LRandom,
'LSize10 mid-window trial ' + IntToStr(LT));
finally
LRandom := nil;
@@ -1563,12 +1611,12 @@ procedure TTestBinPoly.TestX86V128_LSize10_MidWindow;
// Exhaustively exercise every generated small fixed-size kernel (lsize 1..10)
// through all three operations at non-zero offsets.
-procedure TTestBinPoly.TestX86V128_SmallSizes_AllOps;
+procedure TBinPolyBackendTestBase.TestBackend_SmallSizes_AllOps;
var
LRandom: IRandom;
LLSize, LT, LN: Int32;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then
Exit;
for LLSize := 1 to 10 do
@@ -1577,7 +1625,7 @@ procedure TTestBinPoly.TestX86V128_SmallSizes_AllOps;
LRandom := TRandom.Create(FixedSeed + 4000 + LLSize);
try
for LT := 0 to RandomTrials - 1 do
- RunX86V128VsScalar(LN, LRandom,
+ RunBackendVsScalar(LN, LRandom,
'small lsize' + IntToStr(LLSize) + ' trial ' + IntToStr(LT));
finally
LRandom := nil;
@@ -1587,7 +1635,7 @@ procedure TTestBinPoly.TestX86V128_SmallSizes_AllOps;
// Multiplying by zero must yield zero and multiplying by one must be the
// identity, matching the scalar backend, across small, medium and large sizes.
-procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne;
+procedure TBinPolyBackendTestBase.TestBackend_MultiplyByZeroAndOne;
const
Sizes: array [0 .. 5] of Int32 = (64, 320, 608, 672, 1248, BikeR1);
var
@@ -1595,7 +1643,7 @@ procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne;
LC, LSize: Int32;
LX, LZero, LOne: TCryptoLibUInt64Array;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then
Exit;
LRandom := TRandom.Create(FixedSeed + 5000);
@@ -1607,9 +1655,9 @@ procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne;
LZero := TBinPolys.Create(LSize);
LOne := TBinPolys.Create(LSize);
LOne[0] := 1;
- AssertX86V128MultiplyEquals(Sizes[LC], LX, LZero,
+ AssertBackendMultiplyEquals(Sizes[LC], LX, LZero,
'mul-by-zero n=' + IntToStr(Sizes[LC]));
- AssertX86V128MultiplyEquals(Sizes[LC], LX, LOne,
+ AssertBackendMultiplyEquals(Sizes[LC], LX, LOne,
'mul-by-one n=' + IntToStr(Sizes[LC]));
end;
finally
@@ -1620,7 +1668,7 @@ procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne;
// Adversarial bit patterns (all-ones, single high bit, alternating) maximise
// carryless lane interaction and catch lane-splice / shift errors that random
// inputs may miss.
-procedure TTestBinPoly.TestX86V128_EdgeVectors;
+procedure TBinPolyBackendTestBase.TestBackend_EdgeVectors;
const
Sizes: array [0 .. 6] of Int32 = (64, 192, 608, 672, 736, 1248, BikeR1);
var
@@ -1640,7 +1688,7 @@ procedure TTestBinPoly.TestX86V128_EdgeVectors;
end;
begin
- if not TBinPolyX86V128Backend.IsEnabled then
+ if not BackendSupported then
Exit;
for LC := Low(Sizes) to High(Sizes) do
@@ -1664,15 +1712,15 @@ procedure TTestBinPoly.TestX86V128_EdgeVectors;
LLowBit := TBinPolys.Create(LSize);
LLowBit[0] := 1;
- AssertX86V128MultiplyEquals(Sizes[LC], LAllOnes, LAllOnes,
+ AssertBackendMultiplyEquals(Sizes[LC], LAllOnes, LAllOnes,
'edge allones^2 n=' + IntToStr(Sizes[LC]));
- AssertX86V128MultiplyEquals(Sizes[LC], LAllOnes, LAlt,
+ AssertBackendMultiplyEquals(Sizes[LC], LAllOnes, LAlt,
'edge allones*alt n=' + IntToStr(Sizes[LC]));
- AssertX86V128MultiplyEquals(Sizes[LC], LHighBit, LHighBit,
+ AssertBackendMultiplyEquals(Sizes[LC], LHighBit, LHighBit,
'edge highbit^2 n=' + IntToStr(Sizes[LC]));
- AssertX86V128MultiplyEquals(Sizes[LC], LHighBit, LAllOnes,
+ AssertBackendMultiplyEquals(Sizes[LC], LHighBit, LAllOnes,
'edge highbit*allones n=' + IntToStr(Sizes[LC]));
- AssertX86V128MultiplyEquals(Sizes[LC], LAlt, LLowBit,
+ AssertBackendMultiplyEquals(Sizes[LC], LAlt, LLowBit,
'edge alt*lowbit n=' + IntToStr(Sizes[LC]));
end;
end;
@@ -1711,6 +1759,28 @@ procedure TTestBinPoly.TestBitLengthVar_AgainstReference;
end;
end;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+
+{ TTestBinPolyX86V128 }
+
+function TTestBinPolyX86V128.BackendSupported: Boolean; // Gate for the shared backend tests, which exit early when this returns False.
+begin
+ Result := TBinPolyX86V128Backend.IsSupported; // NOTE(review): the pre-refactor guard read IsEnabled - confirm IsSupported is the intended replacement.
+end;
+
+function TTestBinPolyX86V128.CreateBackendMul(AN: Int32; // Factory hook: supplies the multiplier under test.
+ const AReduce: IBinPolyReduce): IBinPolyMul;
+begin
+ Result := TBinPolyX86V128Backend.CreateBinPolyMul(AN, AReduce); // delegates straight to the X86V128 backend factory
+end;
+
+function TTestBinPolyX86V128.BackendLabel: String; // Display name of this backend. NOTE(review): presumably used to tag assertion messages - usage not visible here.
+begin
+ Result := 'X86V128';
+end;
+
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
initialization
{$IFDEF FPC}
@@ -1719,4 +1789,12 @@ initialization
RegisterTest(TTestBinPoly.Suite);
{$ENDIF FPC}
+{$IFDEF CRYPTOLIB_X86_SIMD}
+{$IFDEF FPC}
+ RegisterTest(TTestBinPolyX86V128);
+{$ELSE}
+ RegisterTest(TTestBinPolyX86V128.Suite);
+{$ENDIF FPC}
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
end.
diff --git a/CryptoLib/src/Crypto/ClpAesSimd.pas b/CryptoLib/src/Crypto/ClpAesSimd.pas
new file mode 100644
index 00000000..f8a74536
--- /dev/null
+++ b/CryptoLib/src/Crypto/ClpAesSimd.pas
@@ -0,0 +1,78 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpAesSimd;
+
+{$I ..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpIBlockCipher
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ , ClpAesEngineX86
+{$IFEND}
+ ;
+
+type
+ ///
+ /// Arch-neutral SIMD dispatch facade for hardware-accelerated AES engines.
+ /// SIMD-only by contract: it produces the per-arch hardware engine (e.g.
+ /// AES-NI via TAesEngineX86 on x86)
+ /// when available, or reports "not handled" - it never returns the portable
+ /// scalar engine. The scalar fallback belongs to the caller
+ /// (TAesUtilities), matching the Try*-then-scalar shape used across the
+ /// other SIMD families. Selects the per-arch backend at compile time.
+ ///
+ TAesSimd = class sealed
+ public
+ /// True when a hardware AES engine is available on this build/CPU.
+ class function IsSupported: Boolean; static;
+ ///
+ /// Create the per-arch hardware AES engine when available (returns True with
+ /// AEngine set); otherwise AEngine is nil
+ /// and the caller runs its scalar path (returns False).
+ ///
+ class function TryCreateHardwareEngine(out AEngine: IBlockCipher): Boolean; static;
+ end;
+
+implementation
+
+{ TAesSimd }
+
+class function TAesSimd.IsSupported: Boolean;
+begin
+{$IF NOT DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := False; // no per-arch hardware AES engine is compiled into this build
+{$ELSE}
+  Result := TAesEngineX86.IsSupported; // defer to the x86 engine's own availability check
+{$IFEND}
+end;
+
+class function TAesSimd.TryCreateHardwareEngine(out AEngine: IBlockCipher): Boolean;
+begin
+  // Start from "not handled": no engine handed out, False reported.
+  AEngine := nil;
+  Result := False;
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+  if not TAesEngineX86.IsSupported then
+    Exit; // hardware engine unavailable at run time: caller uses its scalar engine
+  AEngine := TAesEngineX86.Create();
+  Result := True;
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/ClpAesUtilities.pas b/CryptoLib/src/Crypto/ClpAesUtilities.pas
index 141d5d3a..2093a8e6 100644
--- a/CryptoLib/src/Crypto/ClpAesUtilities.pas
+++ b/CryptoLib/src/Crypto/ClpAesUtilities.pas
@@ -22,25 +22,22 @@ interface
uses
ClpIBlockCipher,
- ClpAesEngine
-{$IFDEF CRYPTOLIB_X86_SIMD}
- , ClpAesEngineX86
-{$ENDIF}
- ;
+ ClpAesEngine,
+ ClpAesSimd;
type
///
- /// Factory for the default AES block cipher.
- /// When CRYPTOLIB_X86_SIMD is defined and AES-NI is available at runtime,
- /// returns TAesEngineX86 (hardware AES);
- /// otherwise TAesEngine.
+ /// Factory for the default AES block cipher. Selects the per-arch hardware
+ /// engine at compile time and, when it is available at runtime, returns it
+ /// (e.g. AES-NI via TAesEngineX86 on x86); otherwise the portable
+ /// scalar TAesEngine.
///
TAesUtilities = class sealed(TObject)
public
class function CreateEngine(): IBlockCipher; static;
///
- /// True when the library is built with CRYPTOLIB_X86_SIMD and AES-NI is available
- /// at runtime (TAesEngineX86.IsSupported). Otherwise False.
+ /// True when the build has a per-arch hardware AES engine and it is available
+ /// at runtime (TAesSimd.IsSupported returns True). Otherwise False.
///
class function IsHardwareAccelerated(): Boolean; static;
end;
@@ -50,21 +47,17 @@ implementation
{ TAesUtilities }
class function TAesUtilities.CreateEngine(): IBlockCipher;
+var
+ LEngine: IBlockCipher; // receives the hardware engine when the SIMD facade provides one
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if IsHardwareAccelerated then
- Exit(TAesEngineX86.Create());
-{$ENDIF}
+ if TAesSimd.TryCreateHardwareEngine(LEngine) then // True only when a hardware engine was created
+ Exit(LEngine); // otherwise fall through to the portable scalar engine below
Result := TAesEngine.Create();
end;
class function TAesUtilities.IsHardwareAccelerated(): Boolean;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- Result := TAesEngineX86.IsSupported;
-{$ELSE}
- Result := False;
-{$ENDIF}
+ Result := TAesSimd.IsSupported; // the facade hides the per-arch conditional compilation and runtime check
end;
end.
diff --git a/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas b/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas
index f3cc0941..2c04da98 100644
--- a/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas
+++ b/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas
@@ -29,8 +29,7 @@ interface
ClpChaChaEngine,
ClpPack,
ClpCryptoLibTypes,
- ClpCpuFeatures,
- ClpSimdLevels,
+ ClpChaChaSimd,
ClpByteUtilities;
resourcestring
@@ -58,6 +57,11 @@ TChaCha7539Engine = class(TSalsa20Engine, IChaCha7539Engine, IStreamCipher)
strict private
procedure ImplProcessBlock(const AInBuf: TCryptoLibByteArray; AInOff: Int32;
const AOutBuf: TCryptoLibByteArray; AOutOff: Int32); inline;
+ // Two-block keystream body (SIMD-accelerated when available, else scalar) with
+ // no state validation or 2^38 byte-limit accounting - those belong to the
+ // callers (ProcessBlocks2, ProcessBlocks4), which do them exactly once.
+ procedure ProcessBlocks2Core(const AInBytes: TCryptoLibByteArray; AInOff: Int32;
+ const AOutBytes: TCryptoLibByteArray; AOutOff: Int32);
public
constructor Create();
@@ -82,46 +86,6 @@ TChaCha7539Engine = class(TSalsa20Engine, IChaCha7539Engine, IStreamCipher)
implementation
-{$IFDEF CRYPTOLIB_X86_SIMD}
-procedure ChaCha7539RaiseCounter7539;
-begin
- raise EInvalidOperationCryptoLibException.CreateRes(@SCounterExceeded);
-end;
-
-procedure ChaCha7539ProcessBlocks2Sse2(ARounds: Int32; AState, AIn, AOut: PByte);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_i386.inc}
-{$ENDIF}
-end;
-
-procedure ChaCha7539ProcessBlocks2Avx2(ARounds: Int32; AState, AIn, AOut: PByte);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_i386.inc}
-{$ENDIF}
-end;
-
-procedure ChaCha7539ProcessBlocks4Avx2(ARounds: Int32; AState, AIn, AOut: PByte);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_i386.inc}
-{$ENDIF}
-end;
-{$ENDIF}
-
{ TChaCha7539Engine }
constructor TChaCha7539Engine.Create;
@@ -363,22 +327,17 @@ procedure TChaCha7539Engine.ProcessBlocks2(const AInBytes: TCryptoLibByteArray;
begin
raise EMaxBytesExceededCryptoLibException.CreateRes(@SMaxByteExceeded);
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
- case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of
- TX86SimdLevel.AVX2:
- begin
- ChaCha7539ProcessBlocks2Avx2(FRounds, PByte(@FEngineState[0]), PByte(@AInBytes[AInOff]),
- PByte(@AOutBytes[AOutOff]));
- Exit;
- end;
- TX86SimdLevel.SSE2:
- begin
- ChaCha7539ProcessBlocks2Sse2(FRounds, PByte(@FEngineState[0]), PByte(@AInBytes[AInOff]),
- PByte(@AOutBytes[AOutOff]));
- Exit;
- end;
- end;
-{$ENDIF}
+
+ ProcessBlocks2Core(AInBytes, AInOff, AOutBytes, AOutOff);
+end;
+
+procedure TChaCha7539Engine.ProcessBlocks2Core(const AInBytes: TCryptoLibByteArray;
+ AInOff: Int32; const AOutBytes: TCryptoLibByteArray; AOutOff: Int32);
+begin // Performs no state validation or byte-limit check itself.
+ if TChaChaSimd.TryProcessBlocks2(FRounds, PByte(@FEngineState[0]),
+ PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])) then
+ Exit; // the SIMD facade handled both blocks
+ // Not handled by SIMD: run ImplProcessBlock twice, the second call 64 bytes further on.
ImplProcessBlock(AInBytes, AInOff, AOutBytes, AOutOff);
ImplProcessBlock(AInBytes, AInOff + 64, AOutBytes, AOutOff + 64);
end;
@@ -396,22 +355,17 @@ procedure TChaCha7539Engine.ProcessBlocks4(const AInBytes: TCryptoLibByteArray;
raise EInvalidOperationCryptoLibException.CreateResFmt(@SNotBlockAligned,
[AlgorithmName]);
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
- case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) of
- TX86SimdLevel.AVX2:
- begin
- if (LimitExceeded(UInt32(256))) then
- begin
- raise EMaxBytesExceededCryptoLibException.CreateRes(@SMaxByteExceeded);
- end;
- ChaCha7539ProcessBlocks4Avx2(FRounds, PByte(@FEngineState[0]),
- PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff]));
- Exit;
- end;
+ if (LimitExceeded(UInt32(256))) then
+ begin
+ raise EMaxBytesExceededCryptoLibException.CreateRes(@SMaxByteExceeded);
end;
-{$ENDIF}
- ProcessBlocks2(AInBytes, AInOff, AOutBytes, AOutOff);
- ProcessBlocks2(AInBytes, AInOff + 128, AOutBytes, AOutOff + 128);
+
+ if TChaChaSimd.TryProcessBlocks4(FRounds, PByte(@FEngineState[0]),
+ PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])) then
+ Exit;
+
+ ProcessBlocks2Core(AInBytes, AInOff, AOutBytes, AOutOff);
+ ProcessBlocks2Core(AInBytes, AInOff + 128, AOutBytes, AOutOff + 128);
end;
procedure TChaCha7539Engine.ImplProcessBlock(
diff --git a/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas b/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas
index 3da6bea8..9f26e81d 100644
--- a/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas
+++ b/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas
@@ -26,8 +26,7 @@ interface
ClpIChaChaEngine,
ClpSalsa20Engine,
ClpPack,
- ClpCpuFeatures,
- ClpSimdLevels,
+ ClpChaChaSimd,
ClpCryptoLibTypes;
resourcestring
@@ -84,19 +83,6 @@ TChaChaEngine = class(TSalsa20Engine, IChaChaEngine, IStreamCipher)
implementation
-{$IFDEF CRYPTOLIB_X86_SIMD}
-procedure ChaCha20BlockSse2(ARounds: Int32; AInput, AOut: PByte);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
-{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_i386.inc}
-{$ENDIF}
-end;
-{$ENDIF}
-
{ TChaChaEngine }
procedure TChaChaEngine.ProcessBlocks2(
@@ -136,15 +122,8 @@ class procedure TChaChaEngine.ChaChaCore(ARounds: Int32;
begin
raise EArgumentCryptoLibException.CreateRes(@SRoundsEven);
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
- case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of
- TX86SimdLevel.SSE2:
- begin
- ChaCha20BlockSse2(ARounds, PByte(@AInput[0]), PByte(@AOutput[0]));
- Exit;
- end;
- end;
-{$ENDIF}
+ if TChaChaSimd.TryCore(ARounds, PByte(@AInput[0]), PByte(@AOutput[0])) then
+ Exit;
LX00 := AInput[0];
LX01 := AInput[1];
diff --git a/CryptoLib/src/Crypto/Engines/ClpChaChaSimd.pas b/CryptoLib/src/Crypto/Engines/ClpChaChaSimd.pas
new file mode 100644
index 00000000..1100839b
--- /dev/null
+++ b/CryptoLib/src/Crypto/Engines/ClpChaChaSimd.pas
@@ -0,0 +1,79 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpChaChaSimd;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpCryptoLibTypes
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ , ClpChaChaX86Backend
+{$IFEND}
+ ;
+
+type
+ ///
+ /// Arch-neutral SIMD dispatch facade for the ChaCha family. Selects the
+ /// per-arch backend at compile time; on a build with no
+ /// SIMD backend every entry point degrades to "not handled" so callers run
+ /// their scalar reference path. The ChaCha engines call only this facade and
+ /// stay free of any TCpuFeatures / CRYPTOLIB_*_ASM knowledge.
+ ///
+ TChaChaSimd = class sealed
+ public
+ /// Single-block ChaCha core (ChaCha20 keystream block). False means "not handled": run the scalar core.
+ class function TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean; static;
+ /// Two-block ChaCha7539 keystream (128 bytes). False means "not handled": run the scalar path.
+ class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static;
+ /// Four-block ChaCha7539 keystream (256 bytes). False means "not handled": run the fallback path.
+ class function TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static;
+ end;
+
+implementation
+
+{ TChaChaSimd }
+
+class function TChaChaSimd.TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean;
+begin
+{$IF NOT DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := False; // no SIMD backend in this build: caller runs its scalar core
+{$ELSE}
+  Result := TChaChaX86Backend.TryCore(ARounds, AInput, AOut); // x86 backend decides at run time
+{$IFEND}
+end;
+
+class function TChaChaSimd.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean;
+begin
+{$IF NOT DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := False; // no SIMD backend in this build: caller runs its scalar path
+{$ELSE}
+  Result := TChaChaX86Backend.TryProcessBlocks2(ARounds, AState, AIn, AOut); // x86 backend decides at run time
+{$IFEND}
+end;
+
+class function TChaChaSimd.TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean;
+begin
+{$IF NOT DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := False; // no SIMD backend in this build: caller runs its fallback path
+{$ELSE}
+  Result := TChaChaX86Backend.TryProcessBlocks4(ARounds, AState, AIn, AOut); // x86 backend decides at run time
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Engines/ClpChaChaX86Backend.pas b/CryptoLib/src/Crypto/Engines/ClpChaChaX86Backend.pas
new file mode 100644
index 00000000..27f03d99
--- /dev/null
+++ b/CryptoLib/src/Crypto/Engines/ClpChaChaX86Backend.pas
@@ -0,0 +1,152 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpChaChaX86Backend;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ SysUtils,
+ ClpCpuFeatures,
+ ClpSimdLevels,
+ ClpCryptoLibTypes;
+
+resourcestring
+ SCounterExceeded = 'attempt to increase counter past 2^32';
+
+type
+ ///
+ /// x86 SIMD backend for the ChaCha family: owns the SIMD keystream kernels
+ /// (bodies in Include\Simd\ChaCha\) and the runtime tier selection via
+ /// TCpuFeatures.X86.SelectSlot. Compiles on every target - when built
+ /// without x86 SIMD the Try* entry points return False and the
+ /// callers fall back to their scalar reference path.
+ ///
+ TChaChaX86Backend = class sealed
+ public
+ /// SIMD single-block ChaCha core (ChaCha20 keystream block); runs the SSE2 kernel when SelectSlot picks SSE2.
+ class function TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean; static;
+ /// SIMD two-block ChaCha7539 keystream (128 bytes); runs the AVX2 or SSE2 kernel as picked by SelectSlot.
+ class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static;
+ /// AVX2 four-block ChaCha7539 keystream (256 bytes); returns False when SelectSlot does not pick AVX2.
+ class function TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static;
+ end;
+
+implementation
+
+{$IFDEF CRYPTOLIB_X86_SIMD}
+procedure ChaCha20BlockSse2(ARounds: Int32; AInput, AOut: PByte); // SSE2 single-block kernel; its body comes from the include files below.
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_i386.inc}
+{$ENDIF}
+end; // NOTE(review): SimdProc3Begin_* presumably opens the asm body for a 3-argument routine - include contents not visible here.
+
+procedure ChaCha7539RaiseCounter7539; // NOTE(review): no Pascal caller is visible in this unit - presumably called from the ChaCha7539 asm include bodies; confirm.
+begin
+ raise EInvalidOperationCryptoLibException.CreateRes(@SCounterExceeded); // message: 'attempt to increase counter past 2^32'
+end;
+
+procedure ChaCha7539ProcessBlocks2Sse2(ARounds: Int32; AState, AIn, AOut: PByte); // SSE2 two-block ChaCha7539 kernel; its body comes from the include files below.
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_i386.inc}
+{$ENDIF}
+end; // NOTE(review): SimdProc4Begin_* presumably opens the asm body for a 4-argument routine - include contents not visible here.
+
+procedure ChaCha7539ProcessBlocks2Avx2(ARounds: Int32; AState, AIn, AOut: PByte); // AVX2 two-block ChaCha7539 kernel; its body comes from the include files below.
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_i386.inc}
+{$ENDIF}
+end; // NOTE(review): SimdProc4Begin_* presumably opens the asm body for a 4-argument routine - include contents not visible here.
+
+procedure ChaCha7539ProcessBlocks4Avx2(ARounds: Int32; AState, AIn, AOut: PByte); // AVX2 four-block ChaCha7539 kernel; its body comes from the include files below.
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_i386.inc}
+{$ENDIF}
+end; // NOTE(review): SimdProc4Begin_* presumably opens the asm body for a 4-argument routine - include contents not visible here.
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
+{ TChaChaX86Backend }
+
+class function TChaChaX86Backend.TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean;
+begin
+  // "Not handled" unless the SSE2 slot is selected below.
+  Result := False;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) = TX86SimdLevel.SSE2 then
+  begin
+    // SSE2 is the only tier offered for the single-block core.
+    ChaCha20BlockSse2(ARounds, AInput, AOut);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TChaChaX86Backend.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+var
+  LTier: TX86SimdLevel; // slot chosen by the runtime CPU feature probe
+{$ENDIF}
+begin
+  Result := False; // "not handled" unless a kernel runs below
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  LTier := TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]);
+  if LTier = TX86SimdLevel.AVX2 then
+    ChaCha7539ProcessBlocks2Avx2(ARounds, AState, AIn, AOut)
+  else if LTier = TX86SimdLevel.SSE2 then
+    ChaCha7539ProcessBlocks2Sse2(ARounds, AState, AIn, AOut)
+  else
+    Exit; // neither tier selected: leave it to the caller's scalar path
+  Result := True;
+{$ENDIF}
+end;
+
+class function TChaChaX86Backend.TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean;
+begin
+  // "Not handled" unless the AVX2 slot is selected below.
+  Result := False;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) = TX86SimdLevel.AVX2 then
+  begin
+    // AVX2 is the only tier offered for the four-block path.
+    ChaCha7539ProcessBlocks4Avx2(ARounds, AState, AIn, AOut);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas b/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas
index 3adfab31..222e579d 100644
--- a/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas
+++ b/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas
@@ -30,8 +30,7 @@ interface
ClpICipherParameters,
ClpIParametersWithIV,
ClpPack,
- ClpCpuFeatures,
- ClpSimdLevels,
+ ClpSalsaSimd,
ClpByteUtilities,
ClpCryptoLibTypes;
@@ -146,30 +145,6 @@ TSalsa20Engine = class(TInterfacedObject, ISalsa20Engine, IStreamCipher)
implementation
-{$IFDEF CRYPTOLIB_X86_SIMD}
-procedure Salsa20BlockSse41(ARounds: Int32; AInput, AOut: Pointer);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
-{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
-{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_i386.inc}
-{$ENDIF}
-end;
-
-procedure Salsa20ProcessBlocks2Sse41(ARounds: Int32; AState, AIn, AOut: PByte);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_i386.inc}
-{$ENDIF}
-end;
-{$ENDIF}
-
{ TSalsa20Engine }
constructor TSalsa20Engine.Create;
@@ -344,15 +319,11 @@ procedure TSalsa20Engine.ProcessBlocks2(
const AOutBytes: TCryptoLibByteArray; AOutOff: Int32);
begin
AssertInitialisedAndBlockAligned;
-{$IFDEF CRYPTOLIB_X86_SIMD}
- case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) of
- TX86SimdLevel.SSE41:
- begin
- Salsa20ProcessBlocks2Sse41(FRounds, PByte(@FEngineState[0]), PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff]));
- Exit;
- end;
- end;
-{$ENDIF}
+
+ if TSalsaSimd.TryProcessBlocks2(FRounds, PByte(@FEngineState[0]),
+ PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])) then
+ Exit;
+
ImplProcessBlock(AInBytes, AInOff, AOutBytes, AOutOff);
ImplProcessBlock(AInBytes, AInOff + 64, AOutBytes, AOutOff + 64);
end;
@@ -520,15 +491,8 @@ class procedure TSalsa20Engine.SalsaCore(ARounds: Int32;
begin
raise EArgumentCryptoLibException.CreateRes(@SRoundsMustBeEven);
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
- case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) of
- TX86SimdLevel.SSE41:
- begin
- Salsa20BlockSse41(ARounds, @AInput[0], @AX[0]);
- Exit;
- end;
- end;
-{$ENDIF}
+ if TSalsaSimd.TryCore(ARounds, @AInput[0], @AX[0]) then
+ Exit;
LX00 := AInput[0];
LX01 := AInput[1];
diff --git a/CryptoLib/src/Crypto/Engines/ClpSalsaSimd.pas b/CryptoLib/src/Crypto/Engines/ClpSalsaSimd.pas
new file mode 100644
index 00000000..c69ecba3
--- /dev/null
+++ b/CryptoLib/src/Crypto/Engines/ClpSalsaSimd.pas
@@ -0,0 +1,68 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpSalsaSimd;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpCryptoLibTypes
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ , ClpSalsaX86Backend
+{$IFEND}
+ ;
+
+type
+ ///
+ /// Arch-neutral SIMD dispatch facade for the Salsa20 family. Selects the
+ /// per-arch backend at compile time; on a build with no
+ /// SIMD backend every entry point degrades to "not handled" so callers run
+ /// their scalar reference path. The Salsa20 engine calls only this facade and
+ /// stays free of any TCpuFeatures / CRYPTOLIB_*_ASM knowledge.
+ ///
+ TSalsaSimd = class sealed
+ public
+ /// Single-block Salsa20 core. False means "not handled": run the scalar core.
+ class function TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean; static;
+ /// Two-block Salsa20 keystream (128 bytes). False means "not handled": run the scalar path.
+ class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static;
+ end;
+
+implementation
+
+{ TSalsaSimd }
+
+class function TSalsaSimd.TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean;
+begin
+{$IF NOT DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := False; // no SIMD backend in this build: caller runs its scalar core
+{$ELSE}
+  Result := TSalsaX86Backend.TryCore(ARounds, AInput, AOut); // x86 backend decides at run time
+{$IFEND}
+end;
+
+class function TSalsaSimd.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean;
+begin
+{$IF NOT DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := False; // no SIMD backend in this build: caller runs its scalar path
+{$ELSE}
+  Result := TSalsaX86Backend.TryProcessBlocks2(ARounds, AState, AIn, AOut); // x86 backend decides at run time
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Engines/ClpSalsaX86Backend.pas b/CryptoLib/src/Crypto/Engines/ClpSalsaX86Backend.pas
new file mode 100644
index 00000000..f60a33e0
--- /dev/null
+++ b/CryptoLib/src/Crypto/Engines/ClpSalsaX86Backend.pas
@@ -0,0 +1,100 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpSalsaX86Backend;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpCpuFeatures,
+ ClpSimdLevels,
+ ClpCryptoLibTypes;
+
+type
+ ///
+ /// x86 SIMD backend for the Salsa20 family: owns the SIMD keystream kernels
+ /// (bodies in Include\Simd\Salsa\) and the runtime tier selection via
+ /// TCpuFeatures.X86.SelectSlot. Compiles on every target - when built
+ /// without x86 SIMD the Try* entry points return False and the
+ /// callers fall back to their scalar reference path.
+ ///
+ TSalsaX86Backend = class sealed
+ public
+ /// SIMD single-block Salsa20 core; runs the SSE4.1 kernel when SelectSlot picks SSE41, else returns False.
+ class function TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean; static;
+ /// SIMD two-block Salsa20 keystream (128 bytes); runs the SSE4.1 kernel when SelectSlot picks SSE41, else returns False.
+ class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static;
+ end;
+
+implementation
+
+{$IFDEF CRYPTOLIB_X86_SIMD}
+procedure Salsa20BlockSse41(ARounds: Int32; AInput, AOut: Pointer); // SSE4.1 single-block Salsa20 kernel; its body comes from the include files below.
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_i386.inc}
+{$ENDIF}
+end; // NOTE(review): SimdProc3Begin_* presumably opens the asm body for a 3-argument routine - include contents not visible here.
+
+procedure Salsa20ProcessBlocks2Sse41(ARounds: Int32; AState, AIn, AOut: PByte); // SSE4.1 two-block Salsa20 kernel; its body comes from the include files below.
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_i386.inc}
+{$ENDIF}
+end; // NOTE(review): SimdProc4Begin_* presumably opens the asm body for a 4-argument routine - include contents not visible here.
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
+{ TSalsaX86Backend }
+
+class function TSalsaX86Backend.TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean;
+begin
+  // "Not handled" unless the SSE4.1 slot is selected below.
+  Result := False;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) = TX86SimdLevel.SSE41 then
+  begin
+    // SSE4.1 is the only tier offered for the single-block core.
+    Salsa20BlockSse41(ARounds, AInput, AOut);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TSalsaX86Backend.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean;
+begin
+  // "Not handled" unless the SSE4.1 slot is selected below.
+  Result := False;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) = TX86SimdLevel.SSE41 then
+  begin
+    // SSE4.1 is the only tier offered for the two-block path.
+    Salsa20ProcessBlocks2Sse41(ARounds, AState, AIn, AOut);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas
index d4e57fbf..240a97d2 100644
--- a/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas
+++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas
@@ -34,8 +34,8 @@ interface
ClpPack,
ClpBitOperations,
ClpArrayUtilities,
- ClpCpuFeatures,
- ClpSimdLevels,
+ ClpPoly1305State,
+ ClpPoly1305Simd,
ClpCryptoLibTypes;
resourcestring
@@ -51,23 +51,6 @@ interface
'Poly1305 requires a 128-bit IV when used with a cipher.';
type
- ///
- /// Poly1305 algorithm state in radix-2^26 form (72 bytes; same layout on
- /// every architecture).
- ///
- /// - R0..R4 - clamped 130-bit r split into five 26-bit limbs
- /// - S1..S4 - precomputed 5 * R1..R4 wraparound multipliers
- /// - H0..H4 - 130-bit accumulator in five 26-bit limbs (plus a few carry bits)
- /// - K0..K3 - the Poly1305 "s" key (second half of the 32-byte key)
- ///
- ///
- TPoly1305State = record
- R0, R1, R2, R3, R4: UInt32;
- S1, S2, S3, S4: UInt32;
- H0, H1, H2, H3, H4: UInt32;
- K0, K1, K2, K3: UInt32;
- end;
-
TPoly1305 = class sealed(TMac, IPoly1305, IMac)
strict private
@@ -240,174 +223,6 @@ procedure Poly1305StateProcessBlocksScalar(var AState: TPoly1305State;
end;
end;
-{ AVX2 helpers }
-
-{$IFDEF CRYPTOLIB_X86_SIMD}
-
-// Multiply two 5-limb radix-2^26 numbers ALhs, ARhs modulo 2^130-5,
-// returning the 5-limb result in AProduct. Same field arithmetic as the
-// inner step of Poly1305StateProcessBlock; used here at SetKey time to
-// derive r^2..r^4 for the AVX2 power table. Kept private to this unit
-// since it has no caller outside the AVX2 setup path.
-procedure Poly1305MulLimbs(out AProduct: array of UInt32;
- const ALhs, ARhs: array of UInt32);
-var
- LS1, LS2, LS3, LS4: UInt32;
- LD0, LD1, LD2, LD3, LD4: UInt64;
-begin
- LS1 := ARhs[1] * 5;
- LS2 := ARhs[2] * 5;
- LS3 := ARhs[3] * 5;
- LS4 := ARhs[4] * 5;
-
- LD0 := UInt64(ALhs[0]) * ARhs[0] + UInt64(ALhs[1]) * LS4 +
- UInt64(ALhs[2]) * LS3 + UInt64(ALhs[3]) * LS2 + UInt64(ALhs[4]) * LS1;
- LD1 := UInt64(ALhs[0]) * ARhs[1] + UInt64(ALhs[1]) * ARhs[0] +
- UInt64(ALhs[2]) * LS4 + UInt64(ALhs[3]) * LS3 + UInt64(ALhs[4]) * LS2;
- LD2 := UInt64(ALhs[0]) * ARhs[2] + UInt64(ALhs[1]) * ARhs[1] +
- UInt64(ALhs[2]) * ARhs[0] + UInt64(ALhs[3]) * LS4 + UInt64(ALhs[4]) * LS3;
- LD3 := UInt64(ALhs[0]) * ARhs[3] + UInt64(ALhs[1]) * ARhs[2] +
- UInt64(ALhs[2]) * ARhs[1] + UInt64(ALhs[3]) * ARhs[0] +
- UInt64(ALhs[4]) * LS4;
- LD4 := UInt64(ALhs[0]) * ARhs[4] + UInt64(ALhs[1]) * ARhs[3] +
- UInt64(ALhs[2]) * ARhs[2] + UInt64(ALhs[3]) * ARhs[1] +
- UInt64(ALhs[4]) * ARhs[0];
-
- AProduct[0] := UInt32(LD0) and $3FFFFFF;
- LD1 := LD1 + (LD0 shr 26);
- AProduct[1] := UInt32(LD1) and $3FFFFFF;
- LD2 := LD2 + (LD1 shr 26);
- AProduct[2] := UInt32(LD2) and $3FFFFFF;
- LD3 := LD3 + (LD2 shr 26);
- AProduct[3] := UInt32(LD3) and $3FFFFFF;
- LD4 := LD4 + (LD3 shr 26);
- AProduct[4] := UInt32(LD4) and $3FFFFFF;
- AProduct[0] := AProduct[0] + UInt32(LD4 shr 26) * 5;
- AProduct[1] := AProduct[1] + (AProduct[0] shr 26);
- AProduct[0] := AProduct[0] and $3FFFFFF;
-end;
-
-// (Re)allocate APowTable to the byte size required by the AVX2 4-way
-// bulk kernel and pack the precomputed powers r^1..r^4 of AState.R0..R4
-// into it, in the post-VPERMD layout the kernel expects. Must be called
-// once after Poly1305StateAbsorbR has populated AState.R0..R4 and before
-// the first invocation of Poly1305ProcessBlocksAvx2 for the same key.
-// The exact buffer size and limb layout are private to this routine.
-procedure Poly1305Avx2InitPowerTable(var APowTable: TCryptoLibByteArray;
- const AState: TPoly1305State);
-const
- // 10 rows x 8 lanes x 4 bytes = 320. Rows 0..4 hold the limbs of
- // r^4|r^4|r^4|r^3 | r^4|r^2|r^4|r^1 across the 8 ymm lanes (post-VPERMD
- // layout); rows 5..8 hold the 5x wraparound multipliers; row 9 is
- // padding for the +4 over-read of the last shifted load.
- TableSize = Int32(320);
-type
- TPowTableLayout = array[0..9, 0..7] of UInt32;
- PPowTableLayout = ^TPowTableLayout;
-var
- LTbl: PPowTableLayout;
- Lr1, Lr2, Lr3, Lr4: array[0..4] of UInt32;
- LIdx, LRow, LJ: Int32;
-begin
- System.SetLength(APowTable, TableSize);
- LTbl := PPowTableLayout(APowTable);
-
- Lr1[0] := AState.R0;
- Lr1[1] := AState.R1;
- Lr1[2] := AState.R2;
- Lr1[3] := AState.R3;
- Lr1[4] := AState.R4;
-
- Poly1305MulLimbs(Lr2, Lr1, Lr1);
- Poly1305MulLimbs(Lr3, Lr2, Lr1);
- Poly1305MulLimbs(Lr4, Lr2, Lr2);
-
- // Rows 0..4: limbs of r^k for the 4 powers, post-VPERMD layout.
- for LIdx := 0 to 4 do
- begin
- LTbl^[LIdx, 0] := Lr4[LIdx];
- LTbl^[LIdx, 1] := Lr4[LIdx];
- LTbl^[LIdx, 2] := Lr4[LIdx];
- LTbl^[LIdx, 3] := Lr3[LIdx];
- LTbl^[LIdx, 4] := Lr4[LIdx];
- LTbl^[LIdx, 5] := Lr2[LIdx];
- LTbl^[LIdx, 6] := Lr4[LIdx];
- LTbl^[LIdx, 7] := Lr1[LIdx];
- end;
-
- // Rows 5..8: 5 * limbs[1..4] of r^k (wraparound multipliers).
- for LRow := 5 to 8 do
- begin
- LJ := LRow - 4; // 1..4
- LTbl^[LRow, 0] := Lr4[LJ] * 5;
- LTbl^[LRow, 1] := Lr4[LJ] * 5;
- LTbl^[LRow, 2] := Lr4[LJ] * 5;
- LTbl^[LRow, 3] := Lr3[LJ] * 5;
- LTbl^[LRow, 4] := Lr4[LJ] * 5;
- LTbl^[LRow, 5] := Lr2[LJ] * 5;
- LTbl^[LRow, 6] := Lr4[LJ] * 5;
- LTbl^[LRow, 7] := Lr1[LJ] * 5;
- end;
-
- // Row 9 is unused padding for the +4 over-read of the last shifted load.
- for LIdx := 0 to 7 do
- LTbl^[9, LIdx] := 0;
-end;
-
-// Asm wrapper around the architecture-specific 4-way bulk kernel. The .inc
-// files contain pure assembly (db-encoded VEX with mnemonic comments); the
-// Pascal layer below is just the procedure header + the SimdProc5Begin ABI
-// glue + the kernel body include. ACtx points at the 72-byte R/S/H/K
-// portion of TPoly1305State; APowTable points at the separate 320-byte
-// power table buffer; the kernel never reads the K limbs.
-procedure Poly1305BlocksBulkAvx2Core(ACtx, APowTable, AInp: PByte;
- ALen: NativeUInt; APad: Int32);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc5Begin_x86_64.inc}
-{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc5Begin_i386.inc}
-{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_i386.inc}
-{$ENDIF}
-end;
-
-// Bulk-processing variant for AVX2-capable CPUs. Rounds ANumBlocks down
-// to a multiple of the AVX2 lane count (4) and dispatches the AVX2 kernel
-// for that bulk; the 0..3 leftover blocks are forwarded to
-// Poly1305StateProcessBlocksScalar. When fewer than 4 blocks are
-// available the entire batch is handled by the scalar path. APowTable
-// must point at a buffer already populated by Poly1305Avx2InitPowerTable
-// for the same r as currently in AState.
-procedure Poly1305ProcessBlocksAvx2(var AState: TPoly1305State;
- APowTable: PByte;
- const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32);
-const
- // Minimum number of 16-byte blocks before the AVX2 4-way kernel pays off
- // over the scalar block step; smaller batches go straight to the scalar
- // tail handler below.
- LMinBlocks = Int32(4);
- // Number of 16-byte blocks consumed per AVX2 kernel iteration (one block
- // per 64-bit lane of a 256-bit ymm); used to round the dispatch count
- // down to a multiple supported by the kernel.
- LLaneCount = Int32(4);
-var
- LSimdBlocks: Int32;
-begin
- if ANumBlocks >= LMinBlocks then
- begin
- LSimdBlocks := ANumBlocks and not (LLaneCount - 1);
- Poly1305BlocksBulkAvx2Core(@AState, APowTable, @ABuf[AOff],
- NativeUInt(LSimdBlocks) * 16, 1);
- AOff := AOff + LSimdBlocks * 16;
- ANumBlocks := ANumBlocks - LSimdBlocks;
- end;
- if ANumBlocks > 0 then
- Poly1305StateProcessBlocksScalar(AState, ABuf, AOff, ANumBlocks);
-end;
-
-{$ENDIF CRYPTOLIB_X86_SIMD}
-
{ TPoly1305 }
constructor TPoly1305.Create();
@@ -481,16 +296,10 @@ procedure TPoly1305.SetKey(const AKeyParameter: IKeyParameter;
// Pre-build any SIMD-specific lookup tables for this key, and use the
// (non-)allocation of FPowTable as the dispatch flag for BlockUpdate.
- // Reset to nil first so the scalar path is the postcondition when no
- // SIMD branch matches. To add a new SIMD variant: declare its tier in
- // SelectSlot and delegate to its initializer (which owns sizing + layout).
+ // Reset to nil first so the scalar path is the postcondition when no SIMD
+ // backend claims the key; the facade fills FPowTable iff a SIMD tier applies.
FPowTable := nil;
-{$IFDEF CRYPTOLIB_X86_SIMD}
- case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) of
- TX86SimdLevel.AVX2:
- Poly1305Avx2InitPowerTable(FPowTable, FState);
- end;
-{$ENDIF}
+ TPoly1305Simd.TryInitPowerTable(FPowTable, FState);
end;
function TPoly1305.GetAlgorithmName: String;
@@ -520,7 +329,7 @@ procedure TPoly1305.Update(AInput: Byte);
procedure TPoly1305.BlockUpdate(const AInput: TCryptoLibByteArray;
AInOff, ALen: Int32);
var
- LAvailable, LPos, LRemaining, LNb, LBulkBytes: Int32;
+ LAvailable, LPos, LRemaining, LNb, LBulkBytes, LSimdBlocks: Int32;
begin
TCheck.DataLength(AInput, AInOff, ALen, 'input buffer too short');
@@ -549,13 +358,13 @@ procedure TPoly1305.BlockUpdate(const AInput: TCryptoLibByteArray;
if LNb > 0 then
begin
LBulkBytes := LNb shl 4;
- {$IFDEF CRYPTOLIB_X86_SIMD}
- if FPowTable <> nil then
- Poly1305ProcessBlocksAvx2(FState, PByte(FPowTable), AInput,
- AInOff + LPos, LNb)
- else
- {$ENDIF}
- Poly1305StateProcessBlocksScalar(FState, AInput, AInOff + LPos, LNb);
+ // The SIMD facade consumes a lane-multiple of the blocks (0 when no SIMD
+ // path applies for this key) and the scalar reference handles the tail.
+ LSimdBlocks := TPoly1305Simd.ProcessBulk(FState, PByte(FPowTable), AInput,
+ AInOff + LPos, LNb);
+ if LSimdBlocks < LNb then
+ Poly1305StateProcessBlocksScalar(FState, AInput,
+ AInOff + LPos + LSimdBlocks * 16, LNb - LSimdBlocks);
LPos := LPos + LBulkBytes;
LRemaining := ALen - LPos;
end;
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305Simd.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305Simd.pas
new file mode 100644
index 00000000..568d550d
--- /dev/null
+++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305Simd.pas
@@ -0,0 +1,83 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpPoly1305Simd;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpPoly1305State,
+ ClpCryptoLibTypes
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ , ClpPoly1305X86Backend
+{$IFEND}
+ ;
+
+type
+ /// <summary>
+ /// Arch-neutral SIMD dispatch facade for Poly1305. Selects the per-arch
+ /// backend at compile time; on a build with no SIMD
+ /// backend <c>TryInitPowerTable</c> returns False and <c>ProcessBulk</c>
+ /// consumes zero blocks, so the MAC runs entirely on its scalar reference
+ /// path. The Poly1305 MAC calls only this facade and stays free of any
+ /// TCpuFeatures / CRYPTOLIB_*_ASM knowledge.
+ /// </summary>
+ TPoly1305Simd = class sealed
+ public
+ /// <summary>
+ /// If a SIMD tier is available, build the per-key power table into
+ /// <paramref name="APowTable"/> and return True; otherwise leave it
+ /// untouched and return False (the nil-ness of the caller's table then
+ /// doubles as the "scalar path" dispatch flag).
+ /// </summary>
+ class function TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+ const AState: TPoly1305State): Boolean; static;
+ /// <summary>
+ /// Process the leading lane-multiple of <paramref name="ANumBlocks"/> blocks
+ /// with SIMD and return how many blocks were consumed (0 when no SIMD path
+ /// applies); the caller processes the remainder on its scalar path.
+ /// </summary>
+ class function ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+ const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32; static;
+ end;
+
+implementation
+
+{ TPoly1305Simd }
+
+class function TPoly1305Simd.TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+  const AState: TPoly1305State): Boolean;
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  Result := TPoly1305X86Backend.TryInitPowerTable(APowTable, AState); // x86 backend picks the tier at run time
+{$ELSE}
+  Result := False; // no SIMD backend compiled in: APowTable is left untouched
+{$ENDIF}
+end;
+
+class function TPoly1305Simd.ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+  const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32;
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  Result := TPoly1305X86Backend.ProcessBulk(AState, APowTable, ABuf, AOff, ANumBlocks); // blocks consumed by SIMD
+{$ELSE}
+  Result := 0; // no SIMD backend compiled in: every block is left to the caller
+{$ENDIF}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305State.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305State.pas
new file mode 100644
index 00000000..2c226cb4
--- /dev/null
+++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305State.pas
@@ -0,0 +1,43 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpPoly1305State;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+type
+ /// <summary>
+ /// Poly1305 algorithm state in radix-2^26 form (72 bytes; same layout on
+ /// every architecture).
+ /// <para>
+ /// - R0..R4 - clamped 130-bit r split into five 26-bit limbs
+ /// - S1..S4 - precomputed 5 * R1..R4 wraparound multipliers
+ /// - H0..H4 - 130-bit accumulator in five 26-bit limbs (plus a few carry bits)
+ /// - K0..K3 - the Poly1305 "s" key (second half of the 32-byte key)
+ /// </para>
+ /// </summary>
+ TPoly1305State = record
+ R0, R1, R2, R3, R4: UInt32;
+ S1, S2, S3, S4: UInt32;
+ H0, H1, H2, H3, H4: UInt32;
+ K0, K1, K2, K3: UInt32;
+ end;
+
+implementation
+
+end.
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305X86Backend.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305X86Backend.pas
new file mode 100644
index 00000000..3e726f11
--- /dev/null
+++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305X86Backend.pas
@@ -0,0 +1,237 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpPoly1305X86Backend;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpPoly1305State,
+ ClpCpuFeatures,
+ ClpSimdLevels,
+ ClpCryptoLibTypes;
+
+type
+ /// <summary>
+ /// x86 SIMD backend for Poly1305: owns the AVX2 power-table builder and the
+ /// 4-way bulk kernel (body in Include\Simd\Poly1305\) plus the runtime
+ /// tier selection via <c>TCpuFeatures.X86.SelectSlot</c>. Compiles on every
+ /// target - when built without x86 SIMD <c>TryInitPowerTable</c> returns
+ /// False (leaving the caller on the scalar path) and <c>ProcessBulk</c>
+ /// consumes zero blocks.
+ /// </summary>
+ TPoly1305X86Backend = class sealed
+ public
+ /// <summary>
+ /// If a SIMD tier is available, (re)allocate and populate <paramref name="APowTable"/>
+ /// with the precomputed power table for the r currently in
+ /// <paramref name="AState"/>, and return True. Otherwise leave <paramref name="APowTable"/>
+ /// untouched and return False.
+ /// </summary>
+ class function TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+ const AState: TPoly1305State): Boolean; static;
+ /// <summary>
+ /// Process the leading lane-multiple of <paramref name="ANumBlocks"/> 16-byte
+ /// blocks with the AVX2 kernel and return the number of blocks consumed
+ /// (a multiple of the lane count). Returns 0 - leaving the whole batch to the
+ /// caller's scalar path - when no power table is present, fewer than one lane
+ /// of blocks is available, or the build has no x86 SIMD.
+ /// </summary>
+ class function ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+ const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32; static;
+ end;
+
+implementation
+
+{$IFDEF CRYPTOLIB_X86_SIMD}
+
+// Computes AProduct := ALhs * ARhs modulo 2^130-5, all three being 5-limb
+// radix-2^26 numbers (index 0 = least significant limb). Same field
+// arithmetic as the inner step of Poly1305StateProcessBlock; used at SetKey
+// time to derive r^2..r^4 for the AVX2 power table.
+procedure Poly1305MulLimbs(out AProduct: array of UInt32;
+ const ALhs, ARhs: array of UInt32);
+var
+ LS1, LS2, LS3, LS4: UInt32;
+ LD0, LD1, LD2, LD3, LD4: UInt64;
+begin
+ LS1 := ARhs[1] * 5; // 5 * limb: used for partial products that wrap past 2^130
+ LS2 := ARhs[2] * 5;
+ LS3 := ARhs[3] * 5;
+ LS4 := ARhs[4] * 5;
+
+ LD0 := UInt64(ALhs[0]) * ARhs[0] + UInt64(ALhs[1]) * LS4 + // column 0 (64-bit accumulator)
+ UInt64(ALhs[2]) * LS3 + UInt64(ALhs[3]) * LS2 + UInt64(ALhs[4]) * LS1;
+ LD1 := UInt64(ALhs[0]) * ARhs[1] + UInt64(ALhs[1]) * ARhs[0] + // column 1
+ UInt64(ALhs[2]) * LS4 + UInt64(ALhs[3]) * LS3 + UInt64(ALhs[4]) * LS2;
+ LD2 := UInt64(ALhs[0]) * ARhs[2] + UInt64(ALhs[1]) * ARhs[1] + // column 2
+ UInt64(ALhs[2]) * ARhs[0] + UInt64(ALhs[3]) * LS4 + UInt64(ALhs[4]) * LS3;
+ LD3 := UInt64(ALhs[0]) * ARhs[3] + UInt64(ALhs[1]) * ARhs[2] + // column 3
+ UInt64(ALhs[2]) * ARhs[1] + UInt64(ALhs[3]) * ARhs[0] +
+ UInt64(ALhs[4]) * LS4;
+ LD4 := UInt64(ALhs[0]) * ARhs[4] + UInt64(ALhs[1]) * ARhs[3] + // column 4 (no wrapped terms)
+ UInt64(ALhs[2]) * ARhs[2] + UInt64(ALhs[3]) * ARhs[1] +
+ UInt64(ALhs[4]) * ARhs[0];
+
+ AProduct[0] := UInt32(LD0) and $3FFFFFF; // keep the low 26 bits, carry the rest upward
+ LD1 := LD1 + (LD0 shr 26);
+ AProduct[1] := UInt32(LD1) and $3FFFFFF;
+ LD2 := LD2 + (LD1 shr 26);
+ AProduct[2] := UInt32(LD2) and $3FFFFFF;
+ LD3 := LD3 + (LD2 shr 26);
+ AProduct[3] := UInt32(LD3) and $3FFFFFF;
+ LD4 := LD4 + (LD3 shr 26);
+ AProduct[4] := UInt32(LD4) and $3FFFFFF;
+ AProduct[0] := AProduct[0] + UInt32(LD4 shr 26) * 5; // carry out of the top limb re-enters limb 0 times 5
+ AProduct[1] := AProduct[1] + (AProduct[0] shr 26); // one more carry step; limb 1 is not re-masked here
+ AProduct[0] := AProduct[0] and $3FFFFFF;
+end;
+
+// Builds the AVX2 power table for the key currently held in AState:
+// (re)allocates APowTable to the byte size the 4-way bulk kernel requires
+// and packs r^1..r^4 (derived from AState.R0..R4) into it in the post-VPERMD
+// layout the kernel expects. Must run once after AState.R0..R4 is populated
+// and before the first bulk-kernel invocation for the same key. Buffer size
+// and limb layout are private to this routine.
+procedure Poly1305Avx2InitPowerTable(var APowTable: TCryptoLibByteArray;
+  const AState: TPoly1305State);
+const
+  // 10 rows x 8 lanes x 4 bytes = 320. Rows 0..4 hold the limbs of the
+  // per-lane power, rows 5..8 hold the 5x wraparound multipliers and row 9
+  // is padding for the +4 over-read of the last shifted load.
+  TableSize = Int32(320);
+
+  // Exponent k of the power r^k stored in each of the 8 ymm lanes
+  // (post-VPERMD layout): r^4|r^4|r^4|r^3 | r^4|r^2|r^4|r^1.
+  LanePower: array[0..7] of Int32 = (4, 4, 4, 3, 4, 2, 4, 1);
+type
+  // Five radix-2^26 limbs of one field element.
+  TLimbs = array[0..4] of UInt32;
+  // Row-major [row, lane] view over the raw table bytes.
+  TPowTableLayout = array[0..9, 0..7] of UInt32;
+  PPowTableLayout = ^TPowTableLayout;
+var
+  LTable: PPowTableLayout;
+  // LPowers[k] holds the limbs of r^k.
+  LPowers: array[1..4] of TLimbs;
+  LLane, LLimbIdx: Int32;
+  LLimb: UInt32;
+begin
+  System.SetLength(APowTable, TableSize);
+  LTable := PPowTableLayout(APowTable);
+
+  // r^1 is taken verbatim from the state.
+  LPowers[1][0] := AState.R0;
+  LPowers[1][1] := AState.R1;
+  LPowers[1][2] := AState.R2;
+  LPowers[1][3] := AState.R3;
+  LPowers[1][4] := AState.R4;
+
+  // r^2 = r * r, r^3 = r^2 * r, r^4 = r^2 * r^2.
+  Poly1305MulLimbs(LPowers[2], LPowers[1], LPowers[1]);
+  Poly1305MulLimbs(LPowers[3], LPowers[2], LPowers[1]);
+  Poly1305MulLimbs(LPowers[4], LPowers[2], LPowers[2]);
+
+  // Fill the table one lane (column) at a time; every cell is written
+  // exactly once, so the order of the stores does not matter.
+  for LLane := 0 to 7 do
+  begin
+    for LLimbIdx := 0 to 4 do
+    begin
+      LLimb := LPowers[LanePower[LLane]][LLimbIdx];
+
+      // Rows 0..4: limbs of r^k for this lane's power.
+      LTable^[LLimbIdx, LLane] := LLimb;
+
+      // Rows 5..8: 5 * limbs[1..4] of r^k (wraparound multipliers); limb 0
+      // has no multiplier row.
+      if LLimbIdx >= 1 then
+        LTable^[LLimbIdx + 4, LLane] := LLimb * 5;
+    end;
+
+    // Row 9 is unused padding for the +4 over-read of the last shifted load.
+    LTable^[9, LLane] := 0;
+  end;
+end;
+
+// Asm wrapper around the architecture-specific 4-way bulk kernel. The .inc
+// files contain pure assembly (db-encoded VEX with mnemonic comments); the
+// Pascal layer below is just the procedure header + the SimdProc5Begin ABI
+// glue + the kernel body include. ACtx points at the 72-byte TPoly1305State
+// (the kernel never reads the K limbs); APowTable points at the separate
+// 320-byte power table; AInp/ALen: input bytes (ProcessBulk passes ALen as a multiple of 64).
+procedure Poly1305BlocksBulkAvx2Core(ACtx, APowTable, AInp: PByte;
+ ALen: NativeUInt; APad: Int32); // NOTE(review): APad meaning is defined by the kernel include; ProcessBulk always passes 1
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc5Begin_x86_64.inc}
+{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc5Begin_i386.inc}
+{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_i386.inc}
+{$ENDIF}
+end;
+
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
+{ TPoly1305X86Backend }
+
+class function TPoly1305X86Backend.TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+  const AState: TPoly1305State): Boolean;
+begin
+  // Report "no table built" (APowTable untouched) unless a tier is selected below.
+  Result := False;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  // AVX2 is the only tier offered to SelectSlot for this algorithm.
+  if TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) = TX86SimdLevel.AVX2 then
+  begin
+    Poly1305Avx2InitPowerTable(APowTable, AState);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TPoly1305X86Backend.ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+  const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+const
+  // Blocks consumed per AVX2 kernel iteration (one 16-byte block per 64-bit
+  // lane of a 256-bit ymm); the dispatch count is rounded down to a
+  // multiple of this.
+  LLaneCount = Int32(4);
+  // Smallest batch worth handing to the AVX2 kernel; anything shorter is
+  // left entirely to the caller's scalar path.
+  LMinBlocks = Int32(4);
+  // Size in bytes of one Poly1305 block.
+  LBlockBytes = 16;
+{$ENDIF}
+begin
+  // Zero blocks consumed unless the AVX2 kernel actually runs below.
+  Result := 0;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  // Guard: without a power table or a full lane of blocks there is nothing to do.
+  if (APowTable = nil) or (ANumBlocks < LMinBlocks) then
+    Exit;
+  // Clear the low bits to round down to a whole number of lanes.
+  Result := ANumBlocks and not (LLaneCount - 1);
+  Poly1305BlocksBulkAvx2Core(@AState, APowTable, @ABuf[AOff],
+    NativeUInt(Result) * LBlockBytes, 1);
+{$ENDIF}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas
index b114dc69..b1e3d50e 100644
--- a/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas
+++ b/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas
@@ -79,11 +79,9 @@ TCcmBlockCipher = class(TInterfacedObject, ICcmBlockCipher,
FLastKey: TCryptoLibByteArray;
FAssociatedText: TMemoryStream;
FData: TMemoryStream;
-{$IFDEF CRYPTOLIB_X86_SIMD}
// Cached once per Init; non-nil when the registry resolved a fused
// CCM kernel for the underlying cipher and current direction.
FCcmKernel: IFusedCcmKernel;
-{$ENDIF CRYPTOLIB_X86_SIMD}
class function GetMacSize(ARequestedMacBits: Int32): Int32; static;
procedure CheckNonceReuse(AForEncryption: Boolean;
@@ -92,7 +90,6 @@ TCcmBlockCipher = class(TInterfacedObject, ICcmBlockCipher,
function HasAssociatedText(): Boolean;
function CalculateMac(const AData: TCryptoLibByteArray; ADataOff, ADataLen: Int32;
const AMacBlock: TCryptoLibByteArray): Int32;
-{$IFDEF CRYPTOLIB_X86_SIMD}
// Runs AES CBC-MAC over the CCM header (B_0 || AAD length-prefix ||
// AAD || zero-pad) and writes the post-header 16-byte state into
// AMacState. Matches the scalar CalculateMac contract.
@@ -110,7 +107,6 @@ TCcmBlockCipher = class(TInterfacedObject, ICcmBlockCipher,
AInOff, AInLen, AOutputLen: Int32;
const AOutput: TCryptoLibByteArray; AOutOff: Int32;
const AIV: TCryptoLibByteArray): Boolean;
-{$ENDIF CRYPTOLIB_X86_SIMD}
strict protected
function GetAlgorithmName: String; virtual;
@@ -200,9 +196,7 @@ procedure TCcmBlockCipher.Init(AForEncryption: Boolean;
var
LChoice: TCipherAeadChoice;
LRequestedMacSizeBits: Int32;
-{$IFDEF CRYPTOLIB_X86_SIMD}
LDirection: TFusedModeDirection;
-{$ENDIF CRYPTOLIB_X86_SIMD}
begin
FForEncryption := AForEncryption;
@@ -234,7 +228,6 @@ procedure TCcmBlockCipher.Init(AForEncryption: Boolean;
if (System.Length(FNonce) < 7) or (System.Length(FNonce) > 13) then
raise EArgumentCryptoLibException.CreateRes(@SNonceLengthRange);
-{$IFDEF CRYPTOLIB_X86_SIMD}
FCcmKernel := nil;
if FKeyParam <> nil then
begin
@@ -248,7 +241,6 @@ procedure TCcmBlockCipher.Init(AForEncryption: Boolean;
LDirection := TFusedModeDirection.Decrypt;
TFusedKernelRegistry.TryAcquireCcm(FCipher, LDirection, FCcmKernel);
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
Reset();
end;
@@ -411,7 +403,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray;
LOutputLen := AInLen + FMacSize;
TCheck.OutputLength(AOutput, AOutOff, LOutputLen, SOutputBufferTooShort);
-{$IFDEF CRYPTOLIB_X86_SIMD}
// Fused fast path folds CTR and CBC-MAC into one sweep; the scalar
// path handles the 1..16-byte tail and the tag encryption.
if (FCcmKernel <> nil)
@@ -422,7 +413,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray;
Result := LOutputLen;
Exit;
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
CalculateMac(AInput, AInOff, AInLen, FMacBlock);
@@ -470,7 +460,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray;
LOutputLen := AInLen - FMacSize;
TCheck.OutputLength(AOutput, AOutOff, LOutputLen, SOutputBufferTooShort);
-{$IFDEF CRYPTOLIB_X86_SIMD}
// Fused decrypt twin. Scalar tail handles the trailing 1..16-byte
// block plus the FixedTime tag compare.
if (FCcmKernel <> nil)
@@ -481,7 +470,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray;
Result := LOutputLen;
Exit;
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
System.Move(AInput[AInOff + LOutputLen], FMacBlock[0], FMacSize);
@@ -649,7 +637,6 @@ function TCcmBlockCipher.HasAssociatedText: Boolean;
Result := GetAssociatedTextLength() > 0;
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
procedure TCcmBlockCipher.ComputePostHeaderMacState(AInLen: Int32;
const AMacState: TCryptoLibByteArray);
@@ -865,6 +852,5 @@ function TCcmBlockCipher.ProcessPacketDecryptFused(
end;
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
end.
diff --git a/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas
index 901b42ba..38783a47 100644
--- a/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas
+++ b/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas
@@ -306,7 +306,9 @@ procedure TEaxBlockCipher.Init(AForEncryption: Boolean;
// tag-lookahead in FBufBlock: the first FBlockSize bytes of FBufBlock
// are the block that has just been confirmed as ciphertext (not tag),
// the last FMacSize bytes remain the trailing tag candidate.
-{$IFDEF CRYPTOLIB_X86_SIMD}
+ // Off-SIMD (or any build with no registered fused factory) TryAcquireEax
+ // leaves FEaxKernel nil, so FUseFusedBody is False and the scalar
+ // TCMac / TSicBlockCipher path runs - no compile-time arch gating needed.
FEaxKernel := nil;
if FForEncryption then
TFusedKernelRegistry.TryAcquireEax(FCipher,
@@ -315,10 +317,6 @@ procedure TEaxBlockCipher.Init(AForEncryption: Boolean;
TFusedKernelRegistry.TryAcquireEax(FCipher,
TFusedModeDirection.Decrypt, FEaxKernel);
FUseFusedBody := FEaxKernel <> nil;
-{$ELSE}
- FEaxKernel := nil;
- FUseFusedBody := False;
-{$ENDIF CRYPTOLIB_X86_SIMD}
if FUseFusedBody then
begin
diff --git a/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas
index 42b3bdb9..9127ab92 100644
--- a/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas
+++ b/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas
@@ -34,6 +34,7 @@ interface
ClpIGcmMultiplier,
ClpIGcmExponentiator,
ClpGcmUtilities,
+ ClpGhashSimd,
ClpBasicGcmExponentiator,
ClpTables4kGcmMultiplier,
ClpIBulkBlockCipher,
@@ -46,8 +47,6 @@ interface
ClpPack,
ClpCheck,
ClpBasicGcmMultiplier,
- ClpCpuFeatures,
- ClpIntrinsicsVector,
ClpArrayUtilities,
ClpCryptoLibTypes;
@@ -90,18 +89,14 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher,
public
class function CreateGcmMultiplier(): IGcmMultiplier; static;
///
- /// True when the fused four-block SIMD path may run: PCLMULQDQ + SSSE3 shuffled GHASH,
- /// batched counter AES, and a packed 16-byte XMM layout.
+ /// True when the fused four-block SIMD path may run: hardware shuffled GHASH,
+ /// batched counter AES, and a packed 16-byte vector layout.
///
class function IsFourWaySupported: Boolean; static;
///
/// True when the fused eight-block SIMD path may run (128-byte CTR batch + wider GHASH).
///
class function IsEightWaySupported: Boolean; static;
- ///
- /// True when the 128-bit SSE2 XOR fast path may run for one and two-block steps (with packed layout).
- ///
- class function IsSse2PackedVectorXorSupported: Boolean; static;
strict private
@@ -111,13 +106,12 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher,
// exposes the generic IBulkBlockCipher capability. Drives the
// non-fused 4/8-block CTR dispatchers (GetNextCtrBlocks4/8).
FBulkCipher: IBulkBlockCipher;
-{$IFDEF CRYPTOLIB_X86_SIMD}
// Fused CTR+GHASH kernel resolved via TFusedKernelRegistry at Init
// time. Non-nil only when an accelerator factory accepts the
- // underlying cipher + direction and the fused gate is open.
+ // underlying cipher + direction and the fused gate is open (always nil
+ // off-SIMD; IFusedGcmKernel is arch-neutral).
FGcmKernel: IFusedGcmKernel;
FGcmKernelMinBlocks: Int32;
-{$ENDIF CRYPTOLIB_X86_SIMD}
FMultiplier: IGcmMultiplier;
FExp: IGcmExponentiator;
@@ -195,29 +189,23 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher,
procedure ProcessBlocks8Pipelined(const AInBuf: TCryptoLibByteArray; var AInOff: Int32;
var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32;
ALimit: Int32; AForEncrypt: Boolean);
-{$IFDEF CRYPTOLIB_X86_SIMD}
// =====================================================================
- // Fused AES-NI + 8-way GHASH pipeline (x86-64 and i386).
+ // Fused block-cipher keystream + 8-way GHASH pipeline (provided by the
+ // fused kernel registry; nil kernel -> not used, e.g. off-SIMD).
// =====================================================================
// This outer driver is arch-agnostic: pure Pascal batch orchestration
- // plus one call into an IFusedGcmKernel per 8-block stride. The
- // underlying assembly kernel has two variants keyed on register
- // budget:
- // * x86-64: Gueron-style single-pass 8-wide interleave that keeps
- // 15 of 16 XMM registers simultaneously live (8 AES state +
- // 3 GHASH accumulators + 1 round key + 1 GHASH block +
- // 1 H-power + 1 PCLMUL scratch + 1 byte-reverse mask).
- // * i386: two back-to-back 4-wide halves sharing running Z0/Z1/Z2
- // accumulators, sized to fit in xmm0..xmm7. AES rounds 1..4 in
- // each half carry the 4 pclmul iters so port-0 / port-5 ILP
- // overlap is preserved within the 8-register budget.
- // Both variants expose the same IFusedGcmKernel surface so this
- // driver only sees the kernel interface.
- /// Fills ABlocks[0..127] with eight 16-byte counter blocks (pre-AES form). Used by the FusedILP pipeline where AES is performed inside the fused assembly kernel.
+ // plus one call into an IFusedGcmKernel per 8-block stride. The kernel
+ // fuses CTR-mode keystream generation with the GHASH multiply-reduce in a
+ // single pass, interleaving the two at the instruction level so their
+ // independent execution units overlap. How wide that interleave runs and
+ // how it is scheduled against the available vector-register budget is a
+ // backend detail hidden entirely behind the IFusedGcmKernel surface, so
+ // this driver only ever sees the kernel interface.
+ /// Fills ABlocks[0..127] with eight 16-byte counter blocks (pre-cipher form). Used by the FusedILP pipeline where the block-cipher keystream is produced inside the fused kernel.
procedure FillNextCtrBlocks8Raw(const ABlocks: TCryptoLibByteArray);
///
- /// Pipelined GCM path driven by FGcmKernel (x86-64 and i386). Active
- /// when a fused CTR+GHASH kernel was acquired at Init; otherwise the
+ /// Pipelined GCM path driven by FGcmKernel (when a fused kernel is
+ /// registered). Active when a fused CTR+GHASH kernel was acquired at Init; otherwise the
/// caller falls back to ProcessBlocks8Pipelined. AForEncrypt selects
/// direction: encrypt GHASHes the prior iteration's OUTPUT
/// ciphertext, decrypt GHASHes the prior iteration's INPUT
@@ -227,7 +215,6 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher,
procedure ProcessBlocks8FusedILP(const AInBuf: TCryptoLibByteArray; var AInOff: Int32;
var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32;
ALimit: Int32; AForEncrypt: Boolean);
-{$ENDIF CRYPTOLIB_X86_SIMD}
// ---------------------------------------------------------------------
// Cipher-state setup / per-call initialization.
@@ -249,7 +236,7 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher,
procedure CheckNonceReuse(AForEncryption: Boolean;
const ANewNonce: TCryptoLibByteArray; const AKeyParam: IKeyParameter);
/// Rekey path: initialize the underlying block cipher, compute the hash
- /// subkey H, cache the AES-NI engine (when available), and (re)allocate the
+ /// subkey H, cache the bulk-capable cipher engine (when available), and (re)allocate the
/// 8-way SIMD buffers (FHPow / FWorkCtr / FWorkCtrAhead) on capable hardware.
/// Called only when a new key is supplied.
procedure InitCipherAndHashSubKey(const AKeyParam: IKeyParameter);
@@ -340,79 +327,31 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher,
implementation
-{$IFDEF CRYPTOLIB_X86_SIMD}
-const
- ReverseBytesMask: packed array[0..15] of Byte = (
- $0F, $0E, $0D, $0C, $0B, $0A, $09, $08, $07, $06, $05, $04, $03, $02, $01, $00);
-{$ENDIF}
-
// =======================================================================
-// Class-level CPU feature probes and multiplier factory.
+// Class-level capability probes and multiplier factory. All arch-neutral:
+// the GHASH SIMD facade answers False off-SIMD, so the mode's fast paths
+// simply fall through to their scalar reference code.
// =======================================================================
class function TGcmBlockCipher.IsFourWaySupported: Boolean;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- Result := TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and
- TIntrinsicsVector.IsPacked;
-{$ELSE}
- Result := False;
-{$ENDIF}
+ Result := TGhashSimd.IsShuffledGhashSupported;
end;
class function TGcmBlockCipher.IsEightWaySupported: Boolean;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- Result := TGcmBlockCipher.IsFourWaySupported;
-{$ELSE}
- Result := False;
-{$ENDIF}
+ Result := TGhashSimd.IsShuffledGhashSupported;
end;
-class function TGcmBlockCipher.IsSse2PackedVectorXorSupported: Boolean;
-begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- Result := TCpuFeatures.X86.HasSSE2 and TIntrinsicsVector.IsPacked;
-{$ELSE}
- Result := False;
-{$ENDIF}
-end;
-
-{$IFDEF CRYPTOLIB_X86_SIMD}
-procedure GcmBlockXor128Sse2(PDst, PSrc: PByte);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc2Begin_x86_64.inc}
-{$I ..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc2Begin_i386.inc}
-{$I ..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_i386.inc}
-{$ENDIF}
-end;
-
-procedure GcmBlockReverse128Ssse3(PDst, PSrc, PMask: PByte);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
-{$I ..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
-{$I ..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_i386.inc}
-{$ENDIF}
-end;
-{$ENDIF}
-
{ TGcmBlockCipher }
class function TGcmBlockCipher.CreateGcmMultiplier: IGcmMultiplier;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TCpuFeatures.X86.HasPCLMULQDQ then
+ if TGhashSimd.HasCarrylessMultiply then
begin
Result := TBasicGcmMultiplier.Create();
Exit;
end;
-{$ENDIF}
Result := TTables4kGcmMultiplier.Create();
end;
@@ -510,10 +449,8 @@ procedure TGcmBlockCipher.InitCipherAndHashSubKey(const AKeyParam: IKeyParameter
TBlockCipherBulkUtilities.TryResolveBulkCipher(FCipher, FBulkCipher);
-{$IFDEF CRYPTOLIB_X86_SIMD}
FGcmKernel := nil;
FGcmKernelMinBlocks := 0;
-{$ENDIF CRYPTOLIB_X86_SIMD}
FH := nil;
System.SetLength(FH, BlockSize);
@@ -524,7 +461,6 @@ procedure TGcmBlockCipher.InitCipherAndHashSubKey(const AKeyParam: IKeyParameter
FHPow := nil;
FWorkCtr := nil;
FWorkCtrAhead := nil;
-{$IFDEF CRYPTOLIB_X86_SIMD}
if TGcmBlockCipher.IsFourWaySupported then
begin
System.SetLength(FHPow, 128);
@@ -545,7 +481,6 @@ procedure TGcmBlockCipher.InitCipherAndHashSubKey(const AKeyParam: IKeyParameter
end;
end;
end;
-{$ENDIF}
end;
procedure TGcmBlockCipher.ComputeJ0();
@@ -836,7 +771,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray;
AOutOff := AOutOff + BlockSize;
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
if TGcmBlockCipher.IsEightWaySupported and (ALen >= BlockSize * 8) then
begin
EncryptBlocks8(AInput, AInOff, ALen, AOutput, AOutOff);
@@ -871,7 +805,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray;
end;
end
else
-{$ENDIF}
begin
while ALen >= BlockSize * 2 do
begin
@@ -952,7 +885,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray;
LThresh4 := LBufLen + (BlockSize * 3);
LThresh8 := LBufLen + (BlockSize * 7);
-{$IFDEF CRYPTOLIB_X86_SIMD}
if TGcmBlockCipher.IsEightWaySupported and (ALen >= LThresh8) then
begin
DecryptBlocks8(AInput, AInOff, ALen, AOutput, AOutOff, LThresh8);
@@ -987,7 +919,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray;
end;
end
else
-{$ENDIF}
begin
while ALen >= LThresh2 do
begin
@@ -1161,13 +1092,8 @@ class procedure TGcmBlockCipher.GcmReverse16(const ASrc, ADst: PByte);
var
LI: Int32;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TCpuFeatures.X86.HasSSSE3 then
- begin
- GcmBlockReverse128Ssse3(ADst, ASrc, @ReverseBytesMask[0]);
+ if TGhashSimd.TryBlockReverse128(ADst, ASrc) then
Exit;
- end;
-{$ENDIF}
for LI := 0 to 15 do
ADst[LI] := ASrc[15 - LI];
end;
@@ -1181,13 +1107,8 @@ procedure TGcmBlockCipher.GhashFourShuffledBlocks(PC0, PC16, PC32, PC48: PByte);
LSRev: array[0..15] of Byte;
LPCiph: PByte;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TGcmBlockCipher.IsFourWaySupported then
- begin
- TGcmUtilities.FusedFourShuffledGhash(@FS[0], PC0, @FHPow[64], @ReverseBytesMask[0]);
+ if TGhashSimd.TryFusedFourShuffledGhash(@FS[0], PC0, @FHPow[64]) then
Exit;
- end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
GcmReverse16(@FS[0], @LSRev[0]);
FillChar(LU0, 16, 0);
FillChar(LU1, 16, 0);
@@ -1219,15 +1140,15 @@ procedure TGcmBlockCipher.GhashFourShuffledBlocks(PC0, PC16, PC32, PC48: PByte);
// Fused and pipelined batch routines -- GCM performance core.
// =======================================================================
// Each routine consumes 64 bytes (4-way) or 128 bytes (8-way) of
-// plaintext / ciphertext per iteration. The "fused" variants run AES
+// plaintext / ciphertext per iteration. The "fused" variants run
// counter-keystream generation then GHASH back-to-back. The
-// "pipelined" variants overlap current-batch AES with previous-batch
-// GHASH to reclaim port-0 / port-5 ILP. The FusedILP variant (further
-// below, under CRYPTOLIB_X86_SIMD) pushes this further by interleaving
-// both at the instruction level inside a single assembly kernel,
-// selected per arch (Gueron 8-wide on x86-64, 2x4-wide halves on i386).
-// AForEncrypt selects which buffer feeds GHASH: output ciphertext on
-// encrypt, input ciphertext on decrypt.
+// "pipelined" variants overlap current-batch keystream with previous-batch
+// GHASH to reclaim instruction-level parallelism across the two independent
+// execution units. The FusedILP variant (further below) pushes this further
+// by interleaving both at the instruction level inside a single kernel
+// supplied by the fused-kernel registry (nil off-SIMD, so that path is
+// simply skipped there). AForEncrypt selects which buffer feeds GHASH:
+// output ciphertext on encrypt, input ciphertext on decrypt.
// =======================================================================
// Single-batch fused 4-way GCM step. AForEncrypt=True hashes the output ciphertext;
@@ -1257,13 +1178,8 @@ procedure TGcmBlockCipher.GhashEightShuffledBlocks(PBase: PByte);
LSRev: array [0 .. 15] of Byte;
LPCiph: PByte;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TGcmBlockCipher.IsEightWaySupported then
- begin
- TGcmUtilities.FusedEightShuffledGhash(@FS[0], PBase, @FHPow[0], @ReverseBytesMask[0]);
+ if TGhashSimd.TryFusedEightShuffledGhash(@FS[0], PBase, @FHPow[0]) then
Exit;
- end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
GcmReverse16(@FS[0], @LSRev[0]);
FillChar(LU0, 16, 0);
FillChar(LU1, 16, 0);
@@ -1301,8 +1217,9 @@ procedure TGcmBlockCipher.ProcessBlocks8Fused(const AInBuf: TCryptoLibByteArray;
// Pipeline-by-one fused four-block step. Requires ALen >= ALimit + BlockSize*4*2
// (i.e. at least two 4-block batches remain after honouring the caller's tail
// hold-back) so we can overlap each batch's GHASH with the next batch's
-// CTR-keystream generation via CPU OoO scheduling (AES-NI uses port 0 / GHASH
-// PCLMULQDQ uses port 5 on Intel). After this method returns, 0 or 1 full
+// CTR-keystream generation via CPU out-of-order scheduling (the block-cipher
+// keystream and the GHASH multiply-reduce use independent execution units).
+// After this method returns, 0 or 1 full
// four-block batches remain; the caller's non-pipelined loop handles the tail.
// AForEncrypt=True does XOR then GHASH(output); AForEncrypt=False does
// GHASH(input) then XOR (the only per-direction difference).
@@ -1464,14 +1381,13 @@ class procedure TGcmBlockCipher.FillCtr8BlocksRaw(
System.Move(ACounter[0], ABlocks[112], 16);
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
// =======================================================================
-// Fused AES-NI + 8-way GHASH pipeline (x86-64 and i386).
-// The driver is arch-agnostic: it drives the outer 8-block stride loop
-// and delegates the fused work to whichever IFusedGcmKernel variant the
-// registry resolved (Gueron 8-wide on x86-64, 2x4-wide halves on i386).
-// Register-budget rationale for each variant lives on the matching
-// banner in the class declaration.
+// Fused block-cipher keystream + 8-way GHASH pipeline. The driver is
+// arch-agnostic: it drives the outer 8-block stride loop and delegates the
+// fused work to whichever IFusedGcmKernel the registry resolved (nil
+// off-SIMD, so the callers never invoke this path there). The kernel's
+// internal interleave and register budget are backend details hidden behind
+// the interface; this driver never depends on them.
// =======================================================================
procedure TGcmBlockCipher.FillNextCtrBlocks8Raw(const ABlocks: TCryptoLibByteArray);
@@ -1479,21 +1395,21 @@ procedure TGcmBlockCipher.FillNextCtrBlocks8Raw(const ABlocks: TCryptoLibByteArr
FillCtr8BlocksRaw(FCounter, FCounter32, ABlocks);
end;
-// Fused AES-NI keystream + 8-way GHASH pipeline (x86-64 and i386). The
-// AES engine is always in encrypt mode (CTR keystream) regardless of GCM
-// direction. AForEncrypt selects the per-direction bookkeeping only:
+// Fused block-cipher keystream + 8-way GHASH pipeline. The cipher engine is
+// always in encrypt mode (CTR keystream) regardless of GCM direction.
+// AForEncrypt selects the per-direction bookkeeping only:
// * encrypt: GHASH consumes the prior iteration's OUTPUT ciphertext.
// * decrypt: GHASH consumes the prior iteration's INPUT ciphertext.
-// Dispatches to the AES-128 / AES-192 / AES-256 fused wrapper based on the
-// engine's current round-key schedule length (10 / 12 / 14 rounds). Encrypt
+// Dispatches to the 128 / 192 / 256-bit fused wrapper based on the engine's
+// current round-key schedule length (10 / 12 / 14 rounds). Encrypt
// callers pass ALimit=0 (threshold collapses to BlockSize*16). Decrypt callers
// pass the tail hold-back threshold; the loop leaves at least ALimit bytes for
// the caller to process after the pipelined block.
-// Prime: batch 0 is produced via the regular AES-NI 8-wide kernel + Pascal XOR,
+// Prime: batch 0 is produced via the regular 8-wide keystream path + Pascal XOR,
// leaving its ciphertext reference at LPrevCipher awaiting GHASH in the next
// iteration.
-// Body: each loop iteration invokes the interleaved assembly kernel which
-// (a) AES-encrypts eight fresh counter blocks to keystream,
+// Body: each loop iteration invokes the interleaved fused kernel which
+// (a) encrypts eight fresh counter blocks to keystream,
// (b) XORs the keystream with the current plaintext/ciphertext,
// (c) GHASHes the previous iteration's ciphertext into the running state.
// Tail: the last pending ciphertext is GHASH'd, then the final batch is
@@ -1514,7 +1430,7 @@ procedure TGcmBlockCipher.ProcessBlocks8FusedILP(const AInBuf: TCryptoLibByteArr
LCurrCtrs := FWorkCtr;
LNextCtrs := FWorkCtrAhead;
- // Prime batch 0: regular 8-wide AES-NI into LCurrCtrs (now holds keystream),
+ // Prime batch 0: regular 8-wide keystream into LCurrCtrs (now holds keystream),
// XOR with plaintext/ciphertext at LPOut, defer GHASH of batch 0.
GetNextCtrBlocks8(LCurrCtrs);
LPIn := PByte(AInBuf) + AInOff;
@@ -1570,7 +1486,6 @@ procedure TGcmBlockCipher.ProcessBlocks8FusedILP(const AInBuf: TCryptoLibByteArr
AOutOff := AOutOff + (BlockSize * 8);
ALen := ALen - (BlockSize * 8);
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
// =======================================================================
// Batch dispatchers: route each N-block call to the fastest available
@@ -1583,7 +1498,6 @@ procedure TGcmBlockCipher.EncryptBlocks4(const AInBuf: TCryptoLibByteArray;
var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray;
var AOutOff: Int32);
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
if not TGcmBlockCipher.IsFourWaySupported then
raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']);
if FHPow = nil then
@@ -1597,31 +1511,26 @@ procedure TGcmBlockCipher.EncryptBlocks4(const AInBuf: TCryptoLibByteArray;
ALen := ALen - (BlockSize * 4);
AOutOff := AOutOff + (BlockSize * 4);
end;
-{$ELSE}
- raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']);
-{$ENDIF}
end;
procedure TGcmBlockCipher.EncryptBlocks8(const AInBuf: TCryptoLibByteArray;
var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray;
var AOutOff: Int32);
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
if not TGcmBlockCipher.IsEightWaySupported then
raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']);
if (FHPow = nil) or (System.Length(FHPow) < 128) then
raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockHStateMissing, ['eight']);
if ALen >= BlockSize * 16 then
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- // FusedILP is worthwhile only when the inner loop actually
- // iterates: prime batch + >=1 kernel iter + tail batch = 3 strides
- // of 128 B. Below that the fused asm is bypassed (prime + tail
- // only) and the driver's entry cost regresses small payloads,
- // notably on i386 where register pressure amplifies the overhead.
+ // FusedILP is worthwhile only when the inner loop actually iterates:
+ // prime batch + >=1 kernel iter + tail batch = 3 strides of 128 B. Below
+ // that the fused kernel is bypassed (prime + tail only) and the driver's
+ // entry cost regresses small payloads, especially on register-constrained
+ // targets where that overhead is amplified. FGcmKernel is nil off-SIMD, so
+ // this branch is simply skipped there.
if (FGcmKernel <> nil) and (ALen >= BlockSize * 24) then
ProcessBlocks8FusedILP(AInBuf, AInOff, ALen, AOutBuf, AOutOff, 0, True);
-{$ENDIF}
if ALen >= BlockSize * 16 then
ProcessBlocks8Pipelined(AInBuf, AInOff, ALen, AOutBuf, AOutOff, 0, True);
end;
@@ -1632,9 +1541,6 @@ procedure TGcmBlockCipher.EncryptBlocks8(const AInBuf: TCryptoLibByteArray;
ALen := ALen - (BlockSize * 8);
AOutOff := AOutOff + (BlockSize * 8);
end;
-{$ELSE}
- raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']);
-{$ENDIF}
end;
procedure TGcmBlockCipher.CipherBlock(const AInBuf: TCryptoLibByteArray;
@@ -1648,25 +1554,23 @@ procedure TGcmBlockCipher.CipherBlock(const AInBuf: TCryptoLibByteArray;
LCtrBlock := nil;
System.SetLength(LCtrBlock, BlockSize);
GetNextCtrBlock(LCtrBlock);
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TGcmBlockCipher.IsSse2PackedVectorXorSupported then
+ if TGhashSimd.IsBlockXorSupported then
begin
if AForEncrypt then
begin
System.Move(LCtrBlock[0], AOutBuf[AOutOff], BlockSize);
- GcmBlockXor128Sse2(@AOutBuf[AOutOff], @AInBuf[AInOff]);
- GcmBlockXor128Sse2(@FS[0], @AOutBuf[AOutOff]);
+ TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @AInBuf[AInOff]);
+ TGhashSimd.BlockXor128(@FS[0], @AOutBuf[AOutOff]);
end
else
begin
System.Move(AInBuf[AInOff], AOutBuf[AOutOff], BlockSize);
- GcmBlockXor128Sse2(@AOutBuf[AOutOff], @LCtrBlock[0]);
- GcmBlockXor128Sse2(@FS[0], @AInBuf[AInOff]);
+ TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @LCtrBlock[0]);
+ TGhashSimd.BlockXor128(@FS[0], @AInBuf[AInOff]);
end;
FMultiplier.MultiplyH(FS);
Exit;
end;
-{$ENDIF}
if AForEncrypt then
begin
@@ -1715,7 +1619,6 @@ procedure TGcmBlockCipher.DecryptBlocks4(const AInBuf: TCryptoLibByteArray;
var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray;
var AOutOff: Int32; ALimit: Int32);
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
if not TGcmBlockCipher.IsFourWaySupported then
raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']);
if ALimit < BlockSize * 4 then
@@ -1731,16 +1634,12 @@ procedure TGcmBlockCipher.DecryptBlocks4(const AInBuf: TCryptoLibByteArray;
ALen := ALen - (BlockSize * 4);
AOutOff := AOutOff + (BlockSize * 4);
end;
-{$ELSE}
- raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']);
-{$ENDIF}
end;
procedure TGcmBlockCipher.DecryptBlocks8(const AInBuf: TCryptoLibByteArray;
var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray;
var AOutOff: Int32; ALimit: Int32);
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
if not TGcmBlockCipher.IsEightWaySupported then
raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']);
if ALimit < BlockSize * 8 then
@@ -1749,12 +1648,11 @@ procedure TGcmBlockCipher.DecryptBlocks8(const AInBuf: TCryptoLibByteArray;
raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockHStateMissing, ['eight']);
if ALen >= ALimit + (BlockSize * 8) * 2 then
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
// See EncryptBlocks8: require prime + >=1 kernel iter + tail
// (3 strides of 128 B above ALimit) before entering FusedILP.
+ // FGcmKernel is nil off-SIMD, so this branch is skipped there.
if (FGcmKernel <> nil) and (ALen >= ALimit + (BlockSize * 8) * 3) then
ProcessBlocks8FusedILP(AInBuf, AInOff, ALen, AOutBuf, AOutOff, ALimit, False);
-{$ENDIF}
if ALen >= ALimit + (BlockSize * 8) * 2 then
ProcessBlocks8Pipelined(AInBuf, AInOff, ALen, AOutBuf, AOutOff, ALimit, False);
end;
@@ -1765,9 +1663,6 @@ procedure TGcmBlockCipher.DecryptBlocks8(const AInBuf: TCryptoLibByteArray;
ALen := ALen - (BlockSize * 8);
AOutOff := AOutOff + (BlockSize * 8);
end;
-{$ELSE}
- raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']);
-{$ENDIF}
end;
procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray;
@@ -1781,8 +1676,7 @@ procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray;
LCtrBlock := nil;
System.SetLength(LCtrBlock, BlockSize);
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TGcmBlockCipher.IsSse2PackedVectorXorSupported then
+ if TGhashSimd.IsBlockXorSupported then
begin
for LB := 0 to 1 do
begin
@@ -1790,14 +1684,14 @@ procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray;
if AForEncrypt then
begin
System.Move(LCtrBlock[0], AOutBuf[AOutOff], BlockSize);
- GcmBlockXor128Sse2(@AOutBuf[AOutOff], @AInBuf[AInOff]);
- GcmBlockXor128Sse2(@FS[0], @AOutBuf[AOutOff]);
+ TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @AInBuf[AInOff]);
+ TGhashSimd.BlockXor128(@FS[0], @AOutBuf[AOutOff]);
end
else
begin
System.Move(AInBuf[AInOff], AOutBuf[AOutOff], BlockSize);
- GcmBlockXor128Sse2(@AOutBuf[AOutOff], @LCtrBlock[0]);
- GcmBlockXor128Sse2(@FS[0], @AInBuf[AInOff]);
+ TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @LCtrBlock[0]);
+ TGhashSimd.BlockXor128(@FS[0], @AInBuf[AInOff]);
end;
FMultiplier.MultiplyH(FS);
AInOff := AInOff + BlockSize;
@@ -1805,7 +1699,6 @@ procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray;
end;
Exit;
end;
-{$ENDIF}
for LB := 0 to 1 do
begin
diff --git a/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas
index d4223b35..de7f71d9 100644
--- a/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas
+++ b/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas
@@ -37,6 +37,7 @@ interface
ClpGcmBlockCipher,
ClpGcmUtilities,
ClpGcmSivUtilities,
+ ClpGcmSivSimd,
ClpFusedKernelTypes,
ClpIFusedGcmSivKernel,
ClpFusedKernelRegistry,
@@ -121,7 +122,6 @@ TGcmSivHasher = class(TObject)
FTheNonce: TCryptoLibByteArray;
FTheFlags: Int32;
-{$IFDEF CRYPTOLIB_X86_SIMD}
// POLYVAL H-power table (H^8..H^1 as 16-byte limbs in GHASH
// canonical form, 128 bytes). Populated once per key in DeriveKeys
// when the fused kernel is available; captured by reference by the
@@ -132,7 +132,6 @@ TGcmSivHasher = class(TObject)
// matches the mode's 8-block batch contract.
FGcmSivKernel: IFusedGcmSivKernel;
FGcmSivKernelBatchBytes: Int32;
-{$ENDIF CRYPTOLIB_X86_SIMD}
procedure CheckAeadStatus(ALen: Int32);
procedure CheckStatus(ALen: Int32);
@@ -251,7 +250,6 @@ procedure TGcmSivBlockCipher.TGcmSivHasher.UpdateHash(const ABuffer: TCryptoLibB
FNumActive := 0;
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
// Fused POLYVAL Horner-by-8 fast path for full 128-byte batches.
if (FParent.FGcmSivKernel <> nil) and
(LMyRemaining >= FParent.FGcmSivKernelBatchBytes) then
@@ -266,7 +264,6 @@ procedure TGcmSivBlockCipher.TGcmSivHasher.UpdateHash(const ABuffer: TCryptoLibB
LMyRemaining := LMyRemaining - FParent.FGcmSivKernelBatchBytes;
end;
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
while LMyRemaining >= 16 do
begin
@@ -882,26 +879,29 @@ procedure TGcmSivBlockCipher.DeriveKeys(const AKey: IKeyParameter);
TGcmSivUtilities.MulX(LMyOut);
FTheMultiplier.Init(LMyOut);
-{$IFDEF CRYPTOLIB_X86_SIMD}
- // Precompute the POLYVAL H-power table and resolve the fused kernel
- // for this key. LMyOut is already conditioned for GHASH. The H-power
- // table is captured by reference by the kernel and must outlive it;
- // it is owned by this cipher instance.
+ // Precompute the POLYVAL H-power table and resolve the fused kernel for
+ // this key when a SIMD POLYVAL backend is available. LMyOut is already
+ // conditioned for GHASH; the H-power table is captured by reference by the
+ // kernel and must outlive it (it is owned by this cipher instance).
+ // TGcmSivSimd.IsSupported is False off-SIMD (or with no fused backend), so the
+ // precompute is skipped and TGcmSivHasher.UpdateHash stays on scalar POLYVAL.
FGcmSivKernel := nil;
FGcmSivKernelBatchBytes := 0;
- if System.Length(FHPow128) < 128 then
- System.SetLength(FHPow128, 128);
- TGcmUtilities.InitEightWayHPowFromH(LMyOut, FHPow128);
- if TFusedKernelRegistry.TryAcquireGcmSiv(FTheCipher,
- TFusedModeDirection.Encrypt, @FHPow128[0], FGcmSivKernel) and
- (FGcmSivKernel <> nil) then
+ if TGcmSivSimd.IsSupported then
begin
- if FGcmSivKernel.MinimumBlockCount = 8 then
- FGcmSivKernelBatchBytes := FGcmSivKernel.MinimumBlockCount * BUFLEN
- else
- FGcmSivKernel := nil;
+ if System.Length(FHPow128) < 128 then
+ System.SetLength(FHPow128, 128);
+ TGcmUtilities.InitEightWayHPowFromH(LMyOut, FHPow128);
+ if TFusedKernelRegistry.TryAcquireGcmSiv(FTheCipher,
+ TFusedModeDirection.Encrypt, @FHPow128[0], FGcmSivKernel) and
+ (FGcmSivKernel <> nil) then
+ begin
+ if FGcmSivKernel.MinimumBlockCount = 8 then
+ FGcmSivKernelBatchBytes := FGcmSivKernel.MinimumBlockCount * BUFLEN
+ else
+ FGcmSivKernel := nil;
+ end;
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
FTheFlags := FTheFlags or INITIAL;
end;
diff --git a/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas
index c21d8960..44af0ece 100644
--- a/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas
+++ b/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas
@@ -130,11 +130,9 @@ TOcbBlockCipher = class(TInterfacedObject, IOcbBlockCipher,
procedure CheckNonceReuse(AForEncryption: Boolean;
const ANewNonce: TCryptoLibByteArray; const AKeyParam: IKeyParameter);
-{$IFDEF CRYPTOLIB_X86_SIMD}
procedure ProcessFusedBulk(const AInput: TCryptoLibByteArray;
AInOff: Int32; const AOutput: TCryptoLibByteArray; AOutOff: Int32;
ABlockCount: Int32);
-{$ENDIF CRYPTOLIB_X86_SIMD}
procedure ProcessEightBlocksBulk(const AInput: TCryptoLibByteArray;
AInOff: Int32; const AOutput: TCryptoLibByteArray; AOutOff: Int32);
@@ -526,15 +524,16 @@ function TOcbBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray;
while (LI < ALen) do
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
// Fused-kernel fast path: accelerator-provided AEAD kernel
- // (AES-NI today; ARM / other accelerators pluggable via the
- // registry). Takes priority over the 8-wide bulk-cipher path
- // below whenever at least one kernel-stride batch fits the
- // steady-state window. A single dispatch stages up to
- // FUSED_BATCH_BLOCKS worth of offsets and lets the kernel loop
- // internally in MinimumBlockCount strides, amortising per-call
- // overhead across the whole batch.
+ // (e.g. AES-NI; other accelerators pluggable via the
+ // registry). FOcbKernel is nil when no factory accepted this
+ // cipher / direction (always so off-SIMD), in which case this branch
+ // is skipped and the 8-wide bulk / scalar paths below run unchanged.
+ // Takes priority over the 8-wide bulk-cipher path below whenever at
+ // least one kernel-stride batch fits the steady-state window. A single
+ // dispatch stages up to FUSED_BATCH_BLOCKS worth of offsets and lets
+ // the kernel loop internally in MinimumBlockCount strides, amortising
+ // per-call overhead across the whole batch.
if (FOcbKernel <> nil) and (FMainBlockPos = LSteadyPos) and
((ALen - LI) >= FOcbKernelMinBlocks * BLOCK_SIZE) then
begin
@@ -552,7 +551,6 @@ function TOcbBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray;
LI := LI + LBatchBytes;
Continue;
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
// 8-wide bulk-cipher fast path. Entered only when no fused kernel
// accepted this cipher / direction (FOcbKernel = nil) or the
@@ -582,7 +580,6 @@ function TOcbBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray;
Result := LResultLen;
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
procedure TOcbBlockCipher.ProcessFusedBulk(const AInput: TCryptoLibByteArray;
AInOff: Int32; const AOutput: TCryptoLibByteArray; AOutOff: Int32;
ABlockCount: Int32);
@@ -697,7 +694,6 @@ procedure TOcbBlockCipher.ProcessFusedBulk(const AInput: TCryptoLibByteArray;
FMacSize);
end;
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
procedure TOcbBlockCipher.ProcessEightBlocksBulk(
const AInput: TCryptoLibByteArray; AInOff: Int32;
diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas
index fe05890b..16a2dde9 100644
--- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas
+++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas
@@ -27,11 +27,12 @@ interface
ClpFusedKernelTypes,
ClpIFusedCcmKernel,
ClpFusedKernelRegistry,
- ClpAesNiAeadResolver;
+ ClpAesFusedAeadSimd,
+ ClpAesFusedAeadX86Backend;
type
///
- /// AES-NI + SSSE3 implementation of IFusedCcmKernel.
+ /// AES-NI implementation of IFusedCcmKernel.
/// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386
/// (CRYPTOLIB_I386_ASM); both arms gated collectively by
/// CRYPTOLIB_X86_SIMD.
@@ -287,9 +288,9 @@ function TAesNiCcmKernelFactory.TryCreate(const ACipher: IBlockCipher;
AKernel := nil;
Result := False;
try
- if not TAesNiAeadResolver.CpuSupports then
+ if not TAesFusedAeadSimd.CpuSupports then
Exit;
- if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then
+ if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then
Exit;
// CCM drives CTR and CBC-MAC lanes from the same forward-encrypt
// schedule for both directions. LKeys is consumed only to probe
diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas
index de8253be..8efa8dea 100644
--- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas
+++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas
@@ -27,11 +27,12 @@ interface
ClpFusedKernelTypes,
ClpIFusedEaxKernel,
ClpFusedKernelRegistry,
- ClpAesNiAeadResolver;
+ ClpAesFusedAeadSimd,
+ ClpAesFusedAeadX86Backend;
type
///
- /// AES-NI + SSSE3 implementation of IFusedEaxKernel.
+ /// AES-NI implementation of IFusedEaxKernel.
/// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386
/// (CRYPTOLIB_I386_ASM); both arms gated collectively by
/// CRYPTOLIB_X86_SIMD.
@@ -278,9 +279,9 @@ function TAesNiEaxKernelFactory.TryCreate(const ACipher: IBlockCipher;
AKernel := nil;
Result := False;
try
- if not TAesNiAeadResolver.CpuSupports then
+ if not TAesFusedAeadSimd.CpuSupports then
Exit;
- if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then
+ if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then
Exit;
// EAX drives CTR and OMAC lanes from the same forward-encrypt
// schedule for both directions.
diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas
index 3ca41437..a89b840e 100644
--- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas
+++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas
@@ -27,11 +27,12 @@ interface
ClpFusedKernelTypes,
ClpIFusedGcmKernel,
ClpFusedKernelRegistry,
- ClpAesNiAeadResolver;
+ ClpAesFusedAeadSimd,
+ ClpAesFusedAeadX86Backend;
type
///
- /// AES-NI + PCLMULQDQ + SSSE3 implementation of IFusedGcmKernel.
+ /// AES-NI + PCLMULQDQ implementation of IFusedGcmKernel.
/// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386
/// (CRYPTOLIB_I386_ASM); both arms gated collectively by
/// CRYPTOLIB_X86_SIMD.
@@ -207,9 +208,9 @@ function TAesNiGcmKernelFactory.TryCreate(const ACipher: IBlockCipher;
if AHPowers = nil then
Exit;
{$IFDEF CRYPTOLIB_X86_SIMD}
- if not TAesNiAeadResolver.CpuSupports then
+ if not TAesFusedAeadSimd.CpuSupports then
Exit;
- if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then
+ if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then
Exit;
if not LEngine.TryGetEncKeysPtr(LKeys, LRounds) then
Exit;
diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas
index 52ce7ae5..8d275e08 100644
--- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas
+++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas
@@ -27,11 +27,12 @@ interface
ClpFusedKernelTypes,
ClpIFusedOcbKernel,
ClpFusedKernelRegistry,
- ClpAesNiAeadResolver;
+ ClpAesFusedAeadSimd,
+ ClpAesFusedAeadX86Backend;
type
///
- /// AES-NI + SSSE3 implementation of IFusedOcbKernel.
+ /// AES-NI implementation of IFusedOcbKernel.
/// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386
/// (CRYPTOLIB_I386_ASM); both arms gated collectively by
/// CRYPTOLIB_X86_SIMD.
@@ -268,9 +269,9 @@ function TAesNiOcbKernelFactory.TryCreate(const ACipher: IBlockCipher;
AKernel := nil;
Result := False;
try
- if not TAesNiAeadResolver.CpuSupports then
+ if not TAesFusedAeadSimd.CpuSupports then
Exit;
- if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then
+ if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then
Exit;
if ADirection = TFusedModeDirection.Encrypt then
LHasSchedule := LEngine.TryGetEncKeysPtr(LKeys, LRounds)
diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas
index 14686c56..5c65e931 100644
--- a/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas
+++ b/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas
@@ -26,12 +26,11 @@ interface
ClpFusedKernelTypes,
ClpIFusedGcmSivKernel,
ClpFusedKernelRegistry,
- ClpCpuFeatures,
- ClpIntrinsicsVector;
+ ClpGcmSivSimd;
type
///
- /// PCLMULQDQ + SSSE3 implementation of IFusedGcmSivKernel. Pure
+ /// PCLMULQDQ implementation of IFusedGcmSivKernel. Pure
/// POLYVAL: the factory ignores ACipher identity and only requires
/// a valid pre-computed H-power table from the caller. Ships on
/// both x86_64 and i386.
@@ -63,19 +62,6 @@ TPclmulGcmSivKernelFactory = class sealed(TInterfacedObject,
implementation
-{$IFDEF CRYPTOLIB_X86_SIMD}
-procedure GcmSivPolyvalHornerEight(PFS, PC0, PHPow128, PMask: Pointer);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_i386.inc}
-{$ENDIF}
-end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
-
const
// PSHUFB full-reverse control used by the POLYVAL Horner batch.
GcmSivKernelReverseMask: packed array[0..15] of Byte = (
@@ -99,11 +85,9 @@ function TPclmulGcmSivKernel.MinimumBlockCount: Int32;
procedure TPclmulGcmSivKernel.ProcessPolyvalBatch(AInPtr, AAccumulator: Pointer;
ABlockCount: Int32);
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
if ABlockCount <> FUSED_POLYVAL_MIN_BLOCKS then
Exit;
- GcmSivPolyvalHornerEight(AAccumulator, AInPtr, FHPow128, FMask);
-{$ENDIF CRYPTOLIB_X86_SIMD}
+ TGcmSivSimd.ProcessPolyvalBatch(AAccumulator, AInPtr, FHPow128, FMask); // facade order is (accumulator, input, H-powers, mask); it does nothing when no SIMD backend is compiled in
end;
{ TPclmulGcmSivKernelFactory }
@@ -127,14 +111,11 @@ function TPclmulGcmSivKernelFactory.TryCreate(const ACipher: IBlockCipher;
try
if AHPowers = nil then
Exit;
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if not (TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and
- TIntrinsicsVector.IsPacked) then
+ if not TGcmSivSimd.IsSupported then
Exit;
AKernel := TPclmulGcmSivKernel.Create(AHPowers,
@GcmSivKernelReverseMask[0]);
Result := True;
-{$ENDIF CRYPTOLIB_X86_SIMD}
except
AKernel := nil;
Result := False;
diff --git a/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadSimd.pas b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadSimd.pas
new file mode 100644
index 00000000..e8b1c8b7
--- /dev/null
+++ b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadSimd.pas
@@ -0,0 +1,62 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpAesFusedAeadSimd;
+
+{$I ..\..\..\..\Include\CryptoLib.inc}
+
+interface
+
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+uses
+ ClpAesFusedAeadX86Backend;
+{$IFEND}
+
+type
+ ///
+ /// Arch-neutral capability gate for the fused hardware-AES AEAD pipeline
+ /// (e.g. AES-NI on x86). Selects the per-arch backend at compile time
+ /// (x86 is the only backend wired into this unit) and answers only the arch-neutral question
+ /// - "is a fused hardware-AES path available on this build/CPU?".
+ ///
+ ///
+ /// Engine resolution deliberately does NOT live here: it hands back an
+ /// instruction-set-specific round-key schedule (see IAesEngineX86), so it
+ /// belongs on the per-arch backend (TAesFusedAeadX86Backend.TryResolveEngine)
+ /// that the matching per-arch kernels call. Not exported via the public
+ /// interface surface and never imported by mode units, which stay
+ /// cipher-agnostic.
+ ///
+ TAesFusedAeadSimd = class sealed
+ public
+ /// CPU + build-time gate for the fused hardware-AES AEAD pipeline; always False when CRYPTOLIB_X86_SIMD is not defined.
+ class function CpuSupports: Boolean; static;
+ end;
+
+implementation
+
+{ TAesFusedAeadSimd }
+
+class function TAesFusedAeadSimd.CpuSupports: Boolean; // compile-time backend selection, runtime answer
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TAesFusedAeadX86Backend.CpuSupports; // x86 SIMD build: defer to the x86 backend gate
+{$ELSE}
+ Result := False; // no backend compiled in, so the fused path is never offered
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesNiAeadResolver.pas b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadX86Backend.pas
similarity index 71%
rename from CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesNiAeadResolver.pas
rename to CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadX86Backend.pas
index ee652240..bb5e8aab 100644
--- a/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesNiAeadResolver.pas
+++ b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadX86Backend.pas
@@ -14,7 +14,7 @@
(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
-unit ClpAesNiAeadResolver;
+unit ClpAesFusedAeadX86Backend;
{$I ..\..\..\..\Include\CryptoLib.inc}
@@ -26,35 +26,33 @@ interface
ClpIBlockCipherMode,
ClpIAesEngineX86,
ClpCpuFeatures,
- ClpIntrinsicsVector;
+ ClpIntrinsicsVector;
type
///
- /// Internal helper used exclusively by the in-tree AES-NI AEAD
- /// kernel factories. Not exported via the public interface surface
- /// and never imported by mode units, which stay cipher-agnostic.
+ /// x86 backend for the fused AES-NI AEAD pipeline: owns the CPU/build gate and
+ /// the concrete IAesEngineX86 resolution used by the in-tree AES-NI AEAD
+ /// kernel factories. Compiles on every target - CpuSupports is
+ /// False off x86 and TryResolveEngine then finds no engine.
///
- TAesNiAeadResolver = class sealed(TObject)
+ TAesFusedAeadX86Backend = class sealed
public
- /// CPU + build-time gate for the AES-NI AEAD pipeline.
- /// True only when the build defines CRYPTOLIB_X86_SIMD, the CPU
- /// exposes AES-NI + PCLMULQDQ + SSSE3, and the SIMD intrinsics
- /// layout is packed.
+ /// True only when the build defines CRYPTOLIB_X86_SIMD, the CPU
+ /// exposes AES-NI + PCLMULQDQ + SSSE3, and the SIMD intrinsics layout is packed.
class function CpuSupports: Boolean; static;
- /// Probe ACipher for IAesEngineX86, handling both the
- /// direct case (ACipher itself is the engine) and the wrapped case
- /// (ACipher is an IBlockCipherMode whose UnderlyingCipher is the
- /// engine). AEngine is nil on False.
+ /// Probe ACipher for IAesEngineX86, handling both the direct case
+ /// (ACipher itself is the engine) and the wrapped case (ACipher is an
+ /// IBlockCipherMode whose UnderlyingCipher is the engine). AEngine is nil on False.
class function TryResolveEngine(const ACipher: IBlockCipher;
out AEngine: IAesEngineX86): Boolean; static;
end;
implementation
-{ TAesNiAeadResolver }
+{ TAesFusedAeadX86Backend }
-class function TAesNiAeadResolver.CpuSupports: Boolean;
+class function TAesFusedAeadX86Backend.CpuSupports: Boolean;
begin
{$IFDEF CRYPTOLIB_X86_SIMD}
Result := TCpuFeatures.X86.HasAESNI and TCpuFeatures.X86.HasPCLMULQDQ and
@@ -64,7 +62,7 @@ class function TAesNiAeadResolver.CpuSupports: Boolean;
{$ENDIF CRYPTOLIB_X86_SIMD}
end;
-class function TAesNiAeadResolver.TryResolveEngine(const ACipher: IBlockCipher;
+class function TAesFusedAeadX86Backend.TryResolveEngine(const ACipher: IBlockCipher;
out AEngine: IAesEngineX86): Boolean;
var
LMode: IBlockCipherMode;
diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivSimd.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivSimd.pas
new file mode 100644
index 00000000..0bc82c8c
--- /dev/null
+++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivSimd.pas
@@ -0,0 +1,65 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpGcmSivSimd;
+
+{$I ..\..\..\Include\CryptoLib.inc}
+
+interface
+
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+uses
+ ClpGcmSivX86Backend;
+{$IFEND}
+
+type
+ ///
+ /// Arch-neutral SIMD dispatch facade for the AES-GCM-SIV POLYVAL batch kernel.
+ /// Selects the per-arch backend at compile time (x86 is the only one wired in); on a
+ /// build with no SIMD backend IsSupported is False (so the fused
+ /// GCM-SIV kernel factory declines) and ProcessPolyvalBatch is a no-op.
+ /// The kernel unit calls only this facade and stays free of any
+ /// TCpuFeatures / CRYPTOLIB_*_ASM knowledge.
+ ///
+ TGcmSivSimd = class sealed
+ public
+ /// True when a POLYVAL batch kernel is usable on this build and CPU; always False when CRYPTOLIB_X86_SIMD is not defined.
+ class function IsSupported: Boolean; static;
+ /// Eight-block POLYVAL Horner batch. PFS = accumulator, PC0 = input blocks, PHPow128 = H-power table, PMask = byte-reverse mask (as passed by TPclmulGcmSivKernel). Precondition: IsSupported; otherwise nothing is written.
+ class procedure ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); static;
+ end;
+
+implementation
+
+{ TGcmSivSimd }
+
+class function TGcmSivSimd.IsSupported: Boolean; // compile-time backend selection, runtime answer
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGcmSivX86Backend.IsSupported; // x86 SIMD build: defer to the backend CPU-feature gate
+{$ELSE}
+ Result := False; // no backend compiled in, so the kernel factory declines
+{$IFEND}
+end;
+
+class procedure TGcmSivSimd.ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); // does nothing when no SIMD backend is compiled in; callers must gate on IsSupported
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ TGcmSivX86Backend.ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask); // arguments forwarded unchanged
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivX86Backend.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivX86Backend.pas
new file mode 100644
index 00000000..78a08894
--- /dev/null
+++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivX86Backend.pas
@@ -0,0 +1,77 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpGcmSivX86Backend;
+
+{$I ..\..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpCpuFeatures,
+ ClpIntrinsicsVector;
+
+type
+ ///
+ /// x86 SIMD backend for the AES-GCM-SIV POLYVAL batch kernel: owns the
+ /// PCLMULQDQ 8-block Horner kernel (body in Include\Simd\GcmSiv\)
+ /// and the runtime capability gate. Compiles on every target - IsSupported
+ /// returns False off x86 (so the fused-kernel factory declines to build a
+ /// kernel) and ProcessPolyvalBatch is a no-op there.
+ ///
+ TGcmSivX86Backend = class sealed
+ public
+ /// True when the POLYVAL kernel is usable: the CPU reports PCLMULQDQ and SSSE3 and the intrinsics vector layout is packed.
+ class function IsSupported: Boolean; static;
+ /// Eight-block POLYVAL Horner batch; forwards the four pointers unchanged to the assembly kernel. Precondition: IsSupported (not re-checked here).
+ class procedure ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); static;
+ end;
+
+implementation
+
+{$IFDEF CRYPTOLIB_X86_SIMD}
+procedure GcmSivPolyvalHornerEight(PFS, PC0, PHPow128, PMask: Pointer); // unit-private kernel; its body comes from the per-arch include files below
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_i386.inc}
+{$ENDIF}
+end; // closes whichever arch body was included above
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
+{ TGcmSivX86Backend }
+
+class function TGcmSivX86Backend.IsSupported: Boolean; // runtime CPU gate for the POLYVAL kernel
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+ Result := TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and // both CPU features are required
+ TIntrinsicsVector.IsPacked; // and the intrinsics vector layout must be packed
+{$ELSE}
+ Result := False; // built without x86 SIMD: never supported
+{$ENDIF}
+end;
+
+class procedure TGcmSivX86Backend.ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); // no capability check here; does nothing when built without x86 SIMD
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+ GcmSivPolyvalHornerEight(PFS, PC0, PHPow128, PMask); // straight hand-off to the assembly kernel
+{$ENDIF}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas
index fd991e67..bc574749 100644
--- a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas
+++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas
@@ -26,14 +26,12 @@ interface
ClpPack,
ClpBinaryPrimitives,
ClpInterleave,
- ClpCpuFeatures,
+ ClpGhashSimd,
ClpCryptoLibTypes,
- ClpIntrinsicsVector,
ClpByteUtilities;
resourcestring
- SPclmulqdqMultiplyExtIsNotAvailable = 'PCLMULQDQ multiply-ext is not available on this target';
- SFusedGhashRequiresSsse3 = 'fused %s-way GHASH requires SSSE3, PCLMULQDQ, and packed XMM layout';
+ SCarrylessMultiplyExtIsNotAvailable = 'Carryless multiply-ext is not available on this target';
type
TFieldElement = record
@@ -71,16 +69,9 @@ TGcmUtilities = class sealed(TObject)
class procedure Reduce3(PZ0, PZ1, PZ2, PSVector16: PByte); static;
/// Xor three 16-byte limbs with three 16-byte slices from a 48-byte MultiplyExt output.
class procedure XorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte); static;
-{$IFDEF CRYPTOLIB_X86_SIMD}
/// HPow[0..7] = H^8..H^1 as 16-byte limbs at offsets 0,16,...,112 (index 0 = H^8). Four-way fused GHASH uses offsets 64..112 (H^4..H^1).
class procedure InitEightWayHPowFromH(const AH: TCryptoLibByteArray; const AHPow128: TCryptoLibByteArray); static;
- /// Fused GHASH for four 16-byte ciphertext blocks. PFS 16-byte in/out (canonical); PC0 points to 64 contiguous ciphertext bytes; PHPow64 = H^4..H^1 (64 bytes); PMask = 16-byte SSSE3 PSHUFB control.
- class procedure FusedFourShuffledGhash(PFS, PC0, PHPow64, PMask: PByte); static;
- /// Fused GHASH for eight 16-byte ciphertext blocks. PHPow128 = H^8..H^1 (128 bytes at FHPow[0]).
- class procedure FusedEightShuffledGhash(PFS, PC0, PHPow128, PMask: PByte); static;
-{$ENDIF CRYPTOLIB_X86_SIMD}
-
class procedure &Xor(const AX, AY: TCryptoLibByteArray); overload; static;
class procedure &Xor(const AX, AY: TCryptoLibByteArray; AYOff: Int32); overload; static;
class procedure &Xor(const AX, AY: TCryptoLibByteArray; AYOff, AYLen: Int32); overload; static;
@@ -93,103 +84,6 @@ TGcmUtilities = class sealed(TObject)
implementation
-{$IFDEF CRYPTOLIB_X86_SIMD}
-type
- TGcmPartial128 = record
- T3, T2, T1, T0: UInt64;
- end;
-
-procedure GcmPclmulFieldPartial(PX, PY, POut: Pointer);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_i386.inc}
-{$ENDIF}
-end;
-
-procedure GcmPclmulMultiplyExtBytes(PX, PY, POut48: Pointer);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_i386.inc}
-{$ENDIF}
-end;
-
-procedure GcmReduce3FoldSse2(PZ0, PZ1, PZ2, POut: Pointer);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_i386.inc}
-{$ENDIF}
-end;
-
-procedure GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48: Pointer);
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_i386.inc}
-{$ENDIF}
-end;
-
-procedure GcmGhashFourFull(PFS, PC0, PHPow64, PMask: Pointer);
-{$DEFINE GCM_GHASH_FULL_BLOCKS_4}
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc}
-{$ENDIF}
-{$UNDEF GCM_GHASH_FULL_BLOCKS_4}
-end;
-
-procedure GcmGhashEightFull(PFS, PC0, PHPow128, PMask: Pointer);
-{$DEFINE GCM_GHASH_FULL_BLOCKS_8}
-{$IFDEF CRYPTOLIB_X86_64_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc}
-{$ENDIF}
-{$IFDEF CRYPTOLIB_I386_ASM}
-{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
-{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc}
-{$ENDIF}
-{$UNDEF GCM_GHASH_FULL_BLOCKS_8}
-end;
-
-procedure GcmPclmulReducePartial(const APartial: TGcmPartial128; var AZ: TFieldElement);
-var
- LT3, LT2, LT1, LT0: UInt64;
- LZ0, LZ1, LZ2: UInt64;
-begin
- LT3 := APartial.T3;
- LT2 := APartial.T2;
- LT1 := APartial.T1;
- LT0 := APartial.T0;
- LT1 := LT1 xor LT3 xor (LT3 shr 1) xor (LT3 shr 2) xor (LT3 shr 7);
- LT2 := LT2 xor (LT3 shl 63) xor (LT3 shl 62) xor (LT3 shl 57);
- LZ0 := (LT0 shl 1) or (LT1 shr 63);
- LZ1 := (LT1 shl 1) or (LT2 shr 63);
- LZ2 := LT2 shl 1;
- LZ0 := LZ0 xor LZ2 xor (LZ2 shr 1) xor (LZ2 shr 2) xor (LZ2 shr 7);
- LZ1 := LZ1 xor (LT2 shl 63) xor (LT2 shl 58);
- AZ.N0 := LZ0;
- AZ.N1 := LZ1;
-end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
-
{ TGcmUtilities }
class procedure TGcmUtilities.One(out AX: TFieldElement);
@@ -243,18 +137,10 @@ class procedure TGcmUtilities.Multiply(var AX: TFieldElement; var AY: TFieldElem
LX0R, LX1R, LY0R, LY1R: UInt64;
LZ0, LZ1, LZ2, LZ3: UInt64;
LH0, LH1, LH2, LH3, LH4, LH5: UInt64;
- {$IFDEF CRYPTOLIB_X86_SIMD}
- LPartial: TGcmPartial128;
- {$ENDIF}
begin
- {$IFDEF CRYPTOLIB_X86_SIMD}
- if TCpuFeatures.X86.HasPCLMULQDQ then
- begin
- GcmPclmulFieldPartial(@AX, @AY, @LPartial);
- GcmPclmulReducePartial(LPartial, AX);
+ if TGhashSimd.TryMultiply(@AX, @AY) then
Exit;
- end;
- {$ENDIF}
+
LX0 := AX.N0;
LX1 := AX.N1;
LY0 := AY.N0;
@@ -413,27 +299,18 @@ class function TGcmUtilities.ImplMul64(AX, AY: UInt64): UInt64;
class procedure TGcmUtilities.MultiplyExt(PX, PY, POut48: PByte);
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TCpuFeatures.X86.HasPCLMULQDQ then
- begin
- GcmPclmulMultiplyExtBytes(PX, PY, POut48);
+ if TGhashSimd.TryMultiplyExt(PX, PY, POut48) then // True means the SIMD facade handled the multiply
Exit;
- end;
-{$ENDIF}
- raise EInvalidOperationCryptoLibException.CreateRes(@SPclmulqdqMultiplyExtIsNotAvailable);
+ raise EInvalidOperationCryptoLibException.CreateRes(@SCarrylessMultiplyExtIsNotAvailable); // this routine has no scalar fallback, so an unavailable backend is an error
end;
class procedure TGcmUtilities.XorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte);
var
LK: Int32;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TCpuFeatures.X86.HasSSE2 then
- begin
- GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48);
+ if TGhashSimd.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48) then
Exit;
- end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
+
for LK := 0 to 1 do
begin
TByteUtilities.XorTo(8, PSrc48 + LK * 8, PA0 + LK * 8);
@@ -449,13 +326,9 @@ class procedure TGcmUtilities.Reduce3(PZ0, PZ1, PZ2, PSVector16: PByte);
LI: Int32;
LT3, LT2, LT1, LT0, LZ0, LZ1, LZ2: UInt64;
begin
-{$IFDEF CRYPTOLIB_X86_SIMD}
- if TCpuFeatures.X86.HasSSE2 then
- begin
- GcmReduce3FoldSse2(PZ0, PZ1, PZ2, PSVector16);
+ if TGhashSimd.TryReduce3(PZ0, PZ1, PZ2, PSVector16) then
Exit;
- end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
+
System.Move(PZ0^, B0[0], 16);
System.Move(PZ1^, B1[0], 16);
System.Move(PZ2^, B2[0], 16);
@@ -482,31 +355,6 @@ class procedure TGcmUtilities.Reduce3(PZ0, PZ1, PZ2, PSVector16: PByte);
TBinaryPrimitives.WriteUInt64LittleEndian(PSVector16, 8, LZ0);
end;
-{$IFDEF CRYPTOLIB_X86_SIMD}
-class procedure TGcmUtilities.FusedFourShuffledGhash(PFS, PC0, PHPow64, PMask: PByte);
-begin
- if (not TCpuFeatures.X86.HasSSSE3) or (not TCpuFeatures.X86.HasPCLMULQDQ) or
- (not TIntrinsicsVector.IsPacked) then
- raise EInvalidOperationCryptoLibException.CreateResFmt
- (@SFusedGhashRequiresSsse3, ['four']);
-
- // Monolithic kernel: byte-reverse + state fold + 4-way multiply-accumulate +
- // Reduce3 + byte-reverse back, all in a single assembly body (one call boundary).
- GcmGhashFourFull(PFS, PC0, PHPow64, PMask);
-end;
-
-class procedure TGcmUtilities.FusedEightShuffledGhash(PFS, PC0, PHPow128, PMask: PByte);
-begin
- if (not TCpuFeatures.X86.HasSSSE3) or (not TCpuFeatures.X86.HasPCLMULQDQ) or
- (not TIntrinsicsVector.IsPacked) then
- raise EInvalidOperationCryptoLibException.CreateResFmt
- (@SFusedGhashRequiresSsse3, ['eight']);
-
- // Monolithic kernel: byte-reverse + state fold + 8-way multiply-accumulate +
- // Reduce3 + byte-reverse back, all in a single assembly body (one call boundary).
- GcmGhashEightFull(PFS, PC0, PHPow128, PMask);
-end;
-
class procedure TGcmUtilities.InitEightWayHPowFromH(const AH: TCryptoLibByteArray;
const AHPow128: TCryptoLibByteArray);
var
@@ -557,6 +405,4 @@ class procedure TGcmUtilities.InitEightWayHPowFromH(const AH: TCryptoLibByteArra
TPack.UInt64_To_LE(LF1.N0, AHPow128, 120);
end;
-{$ENDIF CRYPTOLIB_X86_SIMD}
-
end.
diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashSimd.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashSimd.pas
new file mode 100644
index 00000000..7eaaf77f
--- /dev/null
+++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashSimd.pas
@@ -0,0 +1,167 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpGhashSimd;
+
+{$I ..\..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpCryptoLibTypes
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ , ClpGhashX86Backend
+{$IFEND}
+ ;
+
+type
+ ///
+ /// Arch-neutral SIMD dispatch facade for the GHASH / GF(2^128) field kernels
+ /// behind TGcmUtilities. Selects the per-arch backend at compile time;
+ /// on a build with no SIMD backend every function here
+ /// returns False (and BlockXor128 does nothing), so callers stay on their
+ /// scalar reference path. TGcmUtilities calls only this facade and stays free of any
+ /// TCpuFeatures / CRYPTOLIB_*_ASM knowledge.
+ ///
+ TGhashSimd = class sealed
+ public
+ /// Carryless multiply-reduce: PX := PX * PY in GF(2^128). Returns False when not handled, leaving the caller to run its scalar multiply.
+ class function TryMultiply(PX, PY: Pointer): Boolean; static;
+ /// Carryless multiply to three 128-bit limbs (48 bytes). Returns False when not handled; TGcmUtilities.MultiplyExt then raises.
+ class function TryMultiplyExt(PX, PY, POut48: PByte): Boolean; static;
+ /// Fold + reduce of three 128-bit limbs into one block. Returns False when not handled.
+ class function TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean; static;
+ /// Xor three 16-byte limbs with three slices of a 48-byte MultiplyExt output. Returns False when not handled.
+ class function TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean; static;
+ /// Fused 4-way GHASH over 64 contiguous ciphertext bytes. Returns False when not handled.
+ class function TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean; static;
+ /// Fused 8-way GHASH over 128 contiguous ciphertext bytes. Returns False when not handled.
+ class function TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean; static;
+
+ /// True when the shuffled/fused GHASH path (4-/8-way) is usable on this build/CPU. Gates the batch dispatch and the H-power precompute.
+ class function IsShuffledGhashSupported: Boolean; static;
+ /// True when the packed 128-bit block-XOR fast path is usable.
+ class function IsBlockXorSupported: Boolean; static;
+ /// True when a hardware carryless (polynomial) multiply is available (selects the carryless-multiply GCM multiplier over the 4K-table one). Only the x86 backend is wired into this unit; every other build reports False.
+ class function HasCarrylessMultiply: Boolean; static;
+ /// XOR one 128-bit block: PDst := PDst xor PSrc. Precondition: IsBlockXorSupported; without a compiled-in backend the call leaves PDst untouched.
+ class procedure BlockXor128(PDst, PSrc: PByte); static;
+ /// Full byte-reverse of one 128-bit block from PSrc into PDst; False when unavailable.
+ class function TryBlockReverse128(PDst, PSrc: PByte): Boolean; static;
+ end;
+
+implementation
+
+{ TGhashSimd }
+
+class function TGhashSimd.TryMultiply(PX, PY: Pointer): Boolean; // True only if a backend performed the multiply
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.TryMultiply(PX, PY); // x86 SIMD build: the backend decides
+{$ELSE}
+ Result := False; // no backend compiled in: caller runs its scalar multiply
+{$IFEND}
+end;
+
+class function TGhashSimd.TryMultiplyExt(PX, PY, POut48: PByte): Boolean; // True only if a backend performed the multiply
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.TryMultiplyExt(PX, PY, POut48); // x86 SIMD build: the backend decides
+{$ELSE}
+ Result := False; // no backend compiled in: TGcmUtilities.MultiplyExt raises on False
+{$IFEND}
+end;
+
+class function TGhashSimd.TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean; // True only if a backend performed the reduction
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.TryReduce3(PZ0, PZ1, PZ2, PSVector16); // x86 SIMD build: the backend decides
+{$ELSE}
+ Result := False; // no backend compiled in: caller runs its scalar reduction
+{$IFEND}
+end;
+
+class function TGhashSimd.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean; // True only if a backend performed the xor
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48); // x86 SIMD build: the backend decides
+{$ELSE}
+ Result := False; // no backend compiled in: caller runs its scalar xor loop
+{$IFEND}
+end;
+
+class function TGhashSimd.TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean; // True only if a backend ran the 4-way kernel
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.TryFusedFourShuffledGhash(PFS, PC0, PHPow64); // x86 SIMD build: the backend decides
+{$ELSE}
+ Result := False; // no backend compiled in: fused path unavailable
+{$IFEND}
+end;
+
+class function TGhashSimd.TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean; // True only if a backend ran the 8-way kernel
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.TryFusedEightShuffledGhash(PFS, PC0, PHPow128); // x86 SIMD build: the backend decides
+{$ELSE}
+ Result := False; // no backend compiled in: fused path unavailable
+{$IFEND}
+end;
+
+class function TGhashSimd.IsShuffledGhashSupported: Boolean; // capability query only, performs no work
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.IsShuffledGhashSupported; // x86 SIMD build: defer to the backend gate
+{$ELSE}
+ Result := False; // no backend compiled in
+{$IFEND}
+end;
+
+class function TGhashSimd.IsBlockXorSupported: Boolean; // capability query; callers check this before BlockXor128
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.IsBlockXorSupported; // x86 SIMD build: defer to the backend gate
+{$ELSE}
+ Result := False; // no backend compiled in
+{$IFEND}
+end;
+
+class function TGhashSimd.HasCarrylessMultiply: Boolean; // capability query only, performs no work
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.HasCarrylessMultiply; // x86 SIMD build: defer to the backend gate
+{$ELSE}
+ Result := False; // only the x86 backend is wired in, so every other build reports False
+{$IFEND}
+end;
+
+class procedure TGhashSimd.BlockXor128(PDst, PSrc: PByte); // does nothing when no SIMD backend is compiled in; check IsBlockXorSupported first
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ TGhashX86Backend.BlockXor128(PDst, PSrc); // arguments forwarded unchanged
+{$IFEND}
+end;
+
+class function TGhashSimd.TryBlockReverse128(PDst, PSrc: PByte): Boolean; // True only if a backend performed the byte reversal
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ Result := TGhashX86Backend.TryBlockReverse128(PDst, PSrc); // x86 SIMD build: the backend decides
+{$ELSE}
+ Result := False; // no backend compiled in: caller must reverse the block itself
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashX86Backend.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashX86Backend.pas
new file mode 100644
index 00000000..d63cea50
--- /dev/null
+++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashX86Backend.pas
@@ -0,0 +1,335 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpGhashX86Backend;
+
+{$I ..\..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpCpuFeatures,
+ ClpIntrinsicsVector,
+ ClpCryptoLibTypes;
+
+type
+ /// <summary>
+ /// x86 SIMD backend for the GHASH / GF(2^128) field operations behind
+ /// TGcmUtilities: owns the SIMD GHASH kernels (bodies in
+ /// Include\Simd\Gcm\) and the runtime capability gates. Compiles on
+ /// every target - every Boolean entry point returns False (leaving the caller
+ /// on its scalar reference path) when built without x86 SIMD or on a CPU lacking
+ /// the required instruction set; BlockXor128 alone performs no check. All entry
+ /// points take raw pointers (the kernel ABI), so no field-element record is needed.
+ /// </summary>
+ TGhashX86Backend = class sealed
+ public
+ /// <summary>PCLMULQDQ carryless multiply-reduce: PX := PX * PY in GF(2^128). Returns False, writing nothing, without PCLMULQDQ.</summary>
+ class function TryMultiply(PX, PY: Pointer): Boolean; static;
+ /// <summary>PCLMULQDQ carryless multiply to three 128-bit limbs (48 bytes) at POut48. Gated on PCLMULQDQ.</summary>
+ class function TryMultiplyExt(PX, PY, POut48: PByte): Boolean; static;
+ /// <summary>SIMD fold + reduce of three 128-bit limbs into one block. Gated on SSE2.</summary>
+ class function TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean; static;
+ /// <summary>SIMD xor of three 16-byte limbs with three slices of a 48-byte MultiplyExt output. Gated on SSE2.</summary>
+ class function TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean; static;
+ /// <summary>PCLMULQDQ fused 4-way GHASH (needs SSSE3, PCLMULQDQ and packed vector layout). Uses the backend's own byte-reverse mask.</summary>
+ class function TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean; static;
+ /// <summary>PCLMULQDQ fused 8-way GHASH (needs SSSE3, PCLMULQDQ and packed vector layout). Uses the backend's own byte-reverse mask.</summary>
+ class function TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean; static;
+
+ /// <summary>True when the fused shuffled-GHASH path is usable on this CPU (PCLMULQDQ, SSSE3, packed vector layout). Gates the 4-/8-way batch dispatch and the H-power precompute.</summary>
+ class function IsShuffledGhashSupported: Boolean; static;
+ /// <summary>True when the packed 128-bit block XOR fast path is usable (SSE2 and packed vector layout).</summary>
+ class function IsBlockXorSupported: Boolean; static;
+ /// <summary>True when the PCLMULQDQ carryless multiply is available (selects the carryless-multiply GCM multiplier over the 4K-table one).</summary>
+ class function HasCarrylessMultiply: Boolean; static;
+ /// <summary>SIMD XOR of one 128-bit block: PDst := PDst xor PSrc. Precondition: IsBlockXorSupported (not re-checked here; a no-op in non-SIMD builds).</summary>
+ class procedure BlockXor128(PDst, PSrc: PByte); static;
+ /// <summary>Full byte-reverse of one 128-bit block from PSrc into PDst; returns False when unavailable on this CPU (gated on SSSE3).</summary>
+ class function TryBlockReverse128(PDst, PSrc: PByte): Boolean; static;
+ end;
+
+implementation
+
+{$IFDEF CRYPTOLIB_X86_SIMD}
+type
+ TGcmPartial128 = record // four-word product buffer: TryMultiply passes it to GcmPclmulFieldPartial as POut, then GcmPclmulReducePartial reads it
+ T3, T2, T1, T0: UInt64; // declaration order places T3 at offset 0 and T0 at offset 24
+ end;
+
+ // Raw two-limb GF(2^128) field element, layout-compatible with the caller's
+ // field-element record (N0 at offset 0, N1 at offset 8). Used only to write the
+ // reduced product back through the caller-supplied pointer without depending on
+ // the field-element type declared in ClpGcmUtilities.
+ TGcmFieldRaw = record
+ N0, N1: UInt64; // written by GcmPclmulReducePartial through PGcmFieldRaw(PX)^ in TryMultiply
+ end;
+ PGcmFieldRaw = ^TGcmFieldRaw;
+
+procedure GcmPclmulFieldPartial(PX, PY, POut: Pointer); // whole body comes from the include pair selected below; TryMultiply passes a TGcmPartial128 as POut
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_i386.inc}
+{$ENDIF}
+end; // NOTE(review): assumes exactly one of the two *_ASM symbols is defined whenever CRYPTOLIB_X86_SIMD is - confirm in CryptoLib.inc
+
+procedure GcmPclmulMultiplyExtBytes(PX, PY, POut48: Pointer); // whole body comes from the include pair selected below; called by TryMultiplyExt after the PCLMULQDQ check
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_i386.inc}
+{$ENDIF}
+end; // closes the included body; no Pascal statements of its own
+
+procedure GcmReduce3FoldSse2(PZ0, PZ1, PZ2, POut: Pointer); // whole body comes from the include pair selected below; TryReduce3 passes its PSVector16 as POut after the SSE2 check
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_i386.inc}
+{$ENDIF}
+end; // closes the included body; no Pascal statements of its own
+
+procedure GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48: Pointer); // whole body comes from the include pair selected below; called by TryXorMultiplyExtLimbs48 after the SSE2 check
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_i386.inc}
+{$ENDIF}
+end; // closes the included body; no Pascal statements of its own
+
+procedure GcmGhashFourFull(PFS, PC0, PHPow64, PMask: Pointer); // TryFusedFourShuffledGhash passes @ReverseBytesMask[0] as PMask
+{$DEFINE GCM_GHASH_FULL_BLOCKS_4} // set only around the shared GcmGhashFull include - presumably selects its 4-block variant (include not visible here)
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc}
+{$ENDIF}
+{$UNDEF GCM_GHASH_FULL_BLOCKS_4} // cleared again so the symbol cannot leak into the 8-block instantiation of the same include
+end;
+
+procedure GcmGhashEightFull(PFS, PC0, PHPow128, PMask: Pointer); // TryFusedEightShuffledGhash passes @ReverseBytesMask[0] as PMask
+{$DEFINE GCM_GHASH_FULL_BLOCKS_8} // set only around the shared GcmGhashFull include - presumably selects its 8-block variant (include not visible here)
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc}
+{$ENDIF}
+{$UNDEF GCM_GHASH_FULL_BLOCKS_8} // cleared again so the symbol does not stay defined for the rest of the unit
+end;
+
+const
+ // Byte-reverse shuffle control (entry i holds 15 - i), shared by the block byte-reverse
+ // and the fused shuffled-GHASH kernels. Owned here, not by the mode: a SIMD detail.
+ // NOTE(review): nothing here requests 16-byte alignment - confirm the kernels load it unaligned.
+ ReverseBytesMask: packed array[0..15] of Byte = (
+ $0F, $0E, $0D, $0C, $0B, $0A, $09, $08, $07, $06, $05, $04, $03, $02, $01, $00);
+
+procedure GcmBlockXor128Sse2(PDst, PSrc: PByte); // whole body comes from the include pair selected below; TGhashX86Backend.BlockXor128 calls it with no CPU check
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc2Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc2Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_i386.inc}
+{$ENDIF}
+end; // closes the included body; no Pascal statements of its own
+
+procedure GcmBlockReverse128Ssse3(PDst, PSrc, PMask: PByte); // whole body comes from the include pair selected below; TryBlockReverse128 passes @ReverseBytesMask[0] as PMask
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_i386.inc}
+{$ENDIF}
+end; // closes the included body; no Pascal statements of its own
+
+// Folds the four-word carryless product produced by GcmPclmulFieldPartial down
+// to a 128-bit field element modulo the GCM polynomial, shifting the result
+// left by one bit on the way. Plain UInt64 arithmetic, no SIMD.
+procedure GcmPclmulReducePartial(const APartial: TGcmPartial128; var AZ: TGcmFieldRaw);
+var
+  LTop, LMid1, LMid2: UInt64;
+  LOut0, LOut1, LSpill: UInt64;
+begin
+  LTop := APartial.T3;
+  // Step 1: fold T3 into the two middle words.
+  LMid1 := APartial.T1 xor LTop xor (LTop shr 1) xor (LTop shr 2) xor (LTop shr 7);
+  LMid2 := APartial.T2 xor (LTop shl 63) xor (LTop shl 62) xor (LTop shl 57);
+  // Step 2: shift the remaining three words left by one bit.
+  LOut0 := (APartial.T0 shl 1) or (LMid1 shr 63);
+  LOut1 := (LMid1 shl 1) or (LMid2 shr 63);
+  LSpill := LMid2 shl 1;
+  // Step 3: fold the spill word; its bit 0 is clear, so two shifts of LMid2 suffice below.
+  LOut0 := LOut0 xor LSpill xor (LSpill shr 1) xor (LSpill shr 2) xor (LSpill shr 7);
+  LOut1 := LOut1 xor (LMid2 shl 63) xor (LMid2 shl 58);
+  AZ.N0 := LOut0;
+  AZ.N1 := LOut1;
+end;
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
+{ TGhashX86Backend }
+
+class function TGhashX86Backend.TryMultiply(PX, PY: Pointer): Boolean;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+var
+  LProduct: TGcmPartial128; // unreduced product handed from the kernel to the scalar reduction
+{$ENDIF}
+begin
+  Result := False;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if not TCpuFeatures.X86.HasPCLMULQDQ then
+    Exit;
+  // Multiply into the local buffer, then reduce straight back into PX.
+  GcmPclmulFieldPartial(PX, PY, @LProduct);
+  GcmPclmulReducePartial(LProduct, PGcmFieldRaw(PX)^);
+  Result := True;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryMultiplyExt(PX, PY, POut48: PByte): Boolean;
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  // The capability flag doubles as the return value; the kernel runs only when it is set.
+  Result := TCpuFeatures.X86.HasPCLMULQDQ;
+  if Result then
+    GcmPclmulMultiplyExtBytes(PX, PY, POut48);
+{$ELSE}
+  Result := False;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean;
+begin
+  Result := False; // stays False in non-SIMD builds and on CPUs without SSE2
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if not TCpuFeatures.X86.HasSSE2 then
+    Exit;
+  // PSVector16 is handed to the kernel as its output pointer.
+  GcmReduce3FoldSse2(PZ0, PZ1, PZ2, PSVector16);
+  Result := True;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean;
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  // SSE2 is the only requirement; the flag doubles as the return value.
+  Result := TCpuFeatures.X86.HasSSE2;
+  if Result then
+    GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48);
+{$ELSE}
+  Result := False;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean;
+begin
+  Result := False;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if not (TCpuFeatures.X86.HasSSSE3 and TCpuFeatures.X86.HasPCLMULQDQ and TIntrinsicsVector.IsPacked) then
+    Exit;
+  // One assembly body does the whole batch - byte-reverse, state fold, 4-way
+  // multiply-accumulate, Reduce3 and byte-reverse back - so there is a single
+  // call boundary. The shuffle mask is the backend's own ReverseBytesMask.
+  GcmGhashFourFull(PFS, PC0, PHPow64, @ReverseBytesMask[0]);
+  Result := True;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean;
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  // The gate result is the return value. When it holds, one assembly body does
+  // the whole batch - byte-reverse, state fold, 8-way multiply-accumulate,
+  // Reduce3 and byte-reverse back - behind a single call boundary.
+  Result := TCpuFeatures.X86.HasSSSE3 and TCpuFeatures.X86.HasPCLMULQDQ and TIntrinsicsVector.IsPacked;
+  if Result then
+    GcmGhashEightFull(PFS, PC0, PHPow128, @ReverseBytesMask[0]);
+{$ELSE}
+  Result := False;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.IsShuffledGhashSupported: Boolean;
+begin
+{$IFNDEF CRYPTOLIB_X86_SIMD}
+  Result := False;
+{$ELSE}
+  // The same three requirements the fused 4-/8-way entry points test before running.
+  Result := TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and TIntrinsicsVector.IsPacked;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.IsBlockXorSupported: Boolean;
+begin
+{$IFNDEF CRYPTOLIB_X86_SIMD}
+  Result := False; // x86 SIMD compiled out
+{$ELSE}
+  Result := TCpuFeatures.X86.HasSSE2 and TIntrinsicsVector.IsPacked;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.HasCarrylessMultiply: Boolean;
+begin
+{$IFNDEF CRYPTOLIB_X86_SIMD}
+  Result := False; // x86 SIMD compiled out
+{$ELSE}
+  Result := TCpuFeatures.X86.HasPCLMULQDQ;
+{$ENDIF}
+end;
+
+class procedure TGhashX86Backend.BlockXor128(PDst, PSrc: PByte);
+begin // no CPU-feature test is made here: the declared precondition is that the caller checked IsBlockXorSupported
+{$IFDEF CRYPTOLIB_X86_SIMD}
+ GcmBlockXor128Sse2(PDst, PSrc);
+{$ENDIF} // without CRYPTOLIB_X86_SIMD the body is empty and PDst is left unchanged
+end;
+
+class function TGhashX86Backend.TryBlockReverse128(PDst, PSrc: PByte): Boolean;
+begin
+  Result := False; // stays False in non-SIMD builds and on CPUs without SSSE3
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if not TCpuFeatures.X86.HasSSSE3 then
+    Exit;
+  // The kernel gets the backend's own ReverseBytesMask as its shuffle control.
+  GcmBlockReverse128Ssse3(PDst, PSrc, @ReverseBytesMask[0]);
+  Result := True;
+{$ENDIF}
+end;
+
+end.
diff --git a/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas b/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas
index 9e50cf43..d796433a 100644
--- a/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas
+++ b/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas
@@ -32,9 +32,9 @@ interface
/// interface is the cipher-side companion to IBulkBlockCipherMode: modes
/// (CTR/SIC, CBC, ECB, the non-fused GCM CTR dispatcher, ...) query for it
/// via Supports(FCipher, IBulkBlockCipher, FBulkCipher) and let the engine
- /// own the "best batch size" decision (8-wide / 4-wide / 1-wide ladders on
- /// AES-NI today; a hypothetical AVX-512 16-wide or ARMv8 engine would just
- /// plug in here with no mode-side changes).
+ /// own the "best batch size" decision (e.g. 8-wide / 4-wide / 1-wide ladders
+ /// on AES-NI). Any other engine - a wider vector or an ARM Crypto-Extensions
+ /// engine - plugs in here with no mode-side changes.
///
///
/// Contract: ProcessBlocks(..., ABlockCount) produces byte-identical output
diff --git a/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas b/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas
index 77366c01..691036ec 100644
--- a/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas
+++ b/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas
@@ -27,8 +27,8 @@ interface
type
///
/// Architecture-neutral capability interface for hardware-accelerated AES
- /// engines (AES-NI on x86 via TAesEngineX86 today; a NEON/Crypto-Ext
- /// ARMv8 engine could implement the same surface tomorrow). It exposes the
+ /// engines (e.g. AES-NI on x86 via TAesEngineX86; a NEON/Crypto-Ext
+ /// ARMv8 engine could implement the same surface). It exposes the
/// fixed-width 4-/8-block batch entry points and the raw-pointer single-block
/// overload that sit beneath the generic
/// ladder. The aliasing contract for every overload here is identical to
diff --git a/CryptoLib/src/Math/BinPoly/ClpBinPolySimd.pas b/CryptoLib/src/Math/BinPoly/ClpBinPolySimd.pas
new file mode 100644
index 00000000..9444a480
--- /dev/null
+++ b/CryptoLib/src/Math/BinPoly/ClpBinPolySimd.pas
@@ -0,0 +1,69 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpBinPolySimd;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+ ClpIBinPolyMul
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+ , ClpBinPolyX86V128Backend
+{$IFEND}
+ ;
+
+type
+ /// <summary>
+ /// Arch-neutral SIMD dispatch facade for binary-polynomial multiplication.
+ /// SIMD-only by contract: it produces the per-arch SIMD multiplier
+ /// when available, or reports "not handled" - it never
+ /// returns the portable scalar backend. The scalar fallback belongs to the
+ /// caller (TBinPolys), matching the Try*-then-scalar shape used across
+ /// the other SIMD families. Selects the per-arch backend at compile time.
+ /// </summary>
+ TBinPolySimd = class sealed
+ public
+ /// <summary>
+ /// Build a SIMD IBinPolyMul for degree <c>AN</c> under the
+ /// given reduction <c>AReduce</c> when a SIMD backend is available (returns True with
+ /// <c>AMul</c> set); otherwise <c>AMul</c> is nil and
+ /// the caller runs its scalar path (returns False).
+ /// </summary>
+ class function TryCreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce;
+ out AMul: IBinPolyMul): Boolean; static;
+ end;
+
+implementation
+
+{ TBinPolySimd }
+
+class function TBinPolySimd.TryCreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce;
+  out AMul: IBinPolyMul): Boolean;
+begin
+  Result := False;
+  AMul := nil; // the out parameter is defined on every path, including "not handled"
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+  if not TBinPolyX86V128Backend.IsSupported then
+    Exit;
+  // Backend is usable on this CPU: let it build the multiplier.
+  AMul := TBinPolyX86V128Backend.CreateBinPolyMul(AN, AReduce);
+  Result := True;
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas b/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas
index 8248675c..b950a8d0 100644
--- a/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas
+++ b/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas
@@ -31,7 +31,7 @@ interface
ClpCryptoLibTypes;
resourcestring
- SX86V128BackendRequiresPclmulqdqSupport = 'X86.V128 backend requires PCLMULQDQ support on this target';
+ SX86V128BackendRequiresCarrylessMultiply = 'X86.V128 backend requires carryless-multiply support on this target';
type
///
@@ -39,7 +39,7 @@ interface
///
TBinPolyX86V128Backend = class sealed
public
- class function IsEnabled: Boolean; static;
+ class function IsSupported: Boolean; static;
class function CreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce): IBinPolyMul; static;
end;
@@ -47,7 +47,7 @@ implementation
{ TBinPolyX86V128Backend }
-class function TBinPolyX86V128Backend.IsEnabled: Boolean;
+class function TBinPolyX86V128Backend.IsSupported: Boolean;
begin
{$IFDEF CRYPTOLIB_X86_SIMD}
Result := TCpuFeatures.X86.HasPCLMULQDQ and TIntrinsicsVector.IsPacked;
@@ -60,8 +60,8 @@ class function TBinPolyX86V128Backend.CreateBinPolyMul(AN: Int32; const AReduce:
var
LSize: Int32;
begin
- if not IsEnabled then
- raise EInvalidOperationCryptoLibException.CreateRes(@SX86V128BackendRequiresPclmulqdqSupport);
+ if not IsSupported then
+ raise EInvalidOperationCryptoLibException.CreateRes(@SX86V128BackendRequiresCarrylessMultiply);
LSize := (AN + 63) shr 6;
case LSize of
diff --git a/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas b/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas
index 6512431d..3b97cf8f 100644
--- a/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas
+++ b/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas
@@ -31,8 +31,8 @@ interface
ClpBinPolyMulBaseBinomialReduce,
ClpBinPolyMulBaseTrinomialReduce,
ClpBinPolyMulBasePentanomialReduce,
+ ClpBinPolySimd,
ClpBinPolyScalarBackend,
- ClpBinPolyX86V128Backend,
ClpItohTsujiiInv;
type
@@ -41,7 +41,7 @@ interface
/// (Size, Create, Add, AddTo, etc.) sit at the top level;
/// factories classified by reduction polynomial shape live under the nested
/// TBinPolysMul class, and inversion factories under TBinPolysInv
- /// (Itoh-Tsujii today).
+ /// (Itoh-Tsujii).
///
///
/// Internal library surface — consumed by the generic F2m field layer and other
@@ -259,10 +259,8 @@ class function TBinPolys.BitLengthVar(ASize: Int32; const AX: TCryptoLibUInt64Ar
class function TBinPolys.TBinPolysMul.CreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce): IBinPolyMul;
begin
- {$IFDEF CRYPTOLIB_X86_SIMD}
- if TBinPolyX86V128Backend.IsEnabled then
- Exit(TBinPolyX86V128Backend.CreateBinPolyMul(AN, AReduce));
- {$ENDIF}
+ if TBinPolySimd.TryCreateBinPolyMul(AN, AReduce, Result) then
+ Exit;
Result := TBinPolyScalarBackend.CreateBinPolyMul(AN, AReduce);
end;
diff --git a/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk b/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk
index a91176b3..e40523c7 100644
--- a/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk
+++ b/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk
@@ -41,6 +41,7 @@ contains
ClpIAesEngineX86 in '..\..\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas',
ClpAesEngineX86 in '..\..\Crypto\Engines\ClpAesEngineX86.pas',
ClpAesUtilities in '..\..\Crypto\ClpAesUtilities.pas',
+ ClpAesSimd in '..\..\Crypto\ClpAesSimd.pas',
ClpAesLightEngine in '..\..\Crypto\Engines\ClpAesLightEngine.pas',
ClpAgreementUtilities in '..\..\Crypto\Agreements\ClpAgreementUtilities.pas',
ClpArgon2ParametersGenerator in '..\..\Crypto\Generators\ClpArgon2ParametersGenerator.pas',
@@ -194,6 +195,17 @@ contains
ClpIDHGenerators in '..\..\Interfaces\Crypto\Generators\ClpIDHGenerators.pas',
ClpIDHParameters in '..\..\Interfaces\Crypto\Parameters\ClpIDHParameters.pas',
ClpIBackingHashProvider in '..\..\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas',
+ ClpChaChaSimd in '..\..\Crypto\Engines\ClpChaChaSimd.pas',
+ ClpChaChaX86Backend in '..\..\Crypto\Engines\ClpChaChaX86Backend.pas',
+ ClpSalsaSimd in '..\..\Crypto\Engines\ClpSalsaSimd.pas',
+ ClpSalsaX86Backend in '..\..\Crypto\Engines\ClpSalsaX86Backend.pas',
+ ClpPoly1305State in '..\..\Crypto\Macs\ClpPoly1305State.pas',
+ ClpPoly1305Simd in '..\..\Crypto\Macs\ClpPoly1305Simd.pas',
+ ClpPoly1305X86Backend in '..\..\Crypto\Macs\ClpPoly1305X86Backend.pas',
+ ClpGhashSimd in '..\..\Crypto\Modes\Gcm\ClpGhashSimd.pas',
+ ClpGhashX86Backend in '..\..\Crypto\Modes\Gcm\ClpGhashX86Backend.pas',
+ ClpGcmSivSimd in '..\..\Crypto\Modes\Gcm\ClpGcmSivSimd.pas',
+ ClpGcmSivX86Backend in '..\..\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas',
ClpIDigest in '..\..\Interfaces\Crypto\Digests\ClpIDigest.pas',
ClpIDigestFactory in '..\..\Interfaces\Crypto\Operators\ClpIDigestFactory.pas',
ClpIDigestRandomGenerator in '..\..\Interfaces\Rngs\ClpIDigestRandomGenerator.pas',
@@ -679,7 +691,9 @@ contains
ClpIFusedEaxKernel in '..\..\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas',
ClpIFusedGcmSivKernel in '..\..\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas',
ClpFusedKernelRegistry in '..\..\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas',
- ClpAesNiAeadResolver in '..\..\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas',
+ ClpAesFusedAeadSimd in '..\..\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas',
+ ClpAesFusedAeadX86Backend in '..\..\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas',
+ ClpBinPolySimd in '..\..\Math\BinPoly\ClpBinPolySimd.pas',
ClpAesNiOcbKernel in '..\..\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas',
ClpAesNiCcmKernel in '..\..\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas',
ClpAesNiEaxKernel in '..\..\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas',
diff --git a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk
index 144504b4..5d141cbc 100644
--- a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk
+++ b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk
@@ -31,7 +31,7 @@
Acknowledgements:
Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring the development of this library "/>
-
+
@@ -2734,8 +2734,8 @@ Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring the devel
-
-
+
+
@@ -3133,6 +3133,62 @@ Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring the devel
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas
index 88041881..42a0be66 100644
--- a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas
+++ b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas
@@ -220,7 +220,7 @@ interface
ClpBlockCipherBulkUtilities, ClpCipherModeParameterUtilities,
ClpGcmSivUtilities, ClpFusedKernelTypes, ClpIFusedGcmKernel,
ClpIFusedOcbKernel, ClpIFusedCcmKernel, ClpIFusedEaxKernel,
- ClpIFusedGcmSivKernel, ClpFusedKernelRegistry, ClpAesNiAeadResolver,
+ ClpIFusedGcmSivKernel, ClpFusedKernelRegistry, ClpAesFusedAeadSimd,
ClpAesNiOcbKernel, ClpAesNiCcmKernel, ClpAesNiEaxKernel, ClpAesNiGcmKernel,
ClpPclmulGcmSivKernel, ClpFusedKernelDefaults, ClpXChaCha20Engine,
ClpIXChaCha20Engine, ClpXChaCha20Poly1305, ClpIXChaCha20Poly1305,
@@ -252,7 +252,10 @@ interface
ClpISP800SecureRandomBuilder, ClpSP800SecureRandomBuilder,
ClpECDHRawAgreement, ClpECDHCRawAgreement, ClpIECDHRawAgreement,
ClpIECDHCRawAgreement, ClpGF256Aes, ClpIAesHardwareEngine,
- ClpIBackingHashProvider;
+ ClpIBackingHashProvider, ClpChaChaSimd, ClpChaChaX86Backend, ClpSalsaSimd,
+ ClpSalsaX86Backend, ClpPoly1305State, ClpPoly1305Simd,
+ ClpPoly1305X86Backend, ClpGhashSimd, ClpGhashX86Backend, ClpGcmSivSimd,
+ ClpGcmSivX86Backend, ClpAesFusedAeadX86Backend, ClpBinPolySimd, ClpAesSimd;
implementation