diff --git a/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr b/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr index edd01df3..7bb4c6a9 100644 --- a/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr +++ b/CryptoLib.Benchmark/Delphi/CryptoLib.BenchmarkConsole.dpr @@ -16,6 +16,7 @@ uses ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas', ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas', ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas', + ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas', ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas', ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas', ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas', @@ -183,6 +184,17 @@ uses ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas', ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas', ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas', + ClpChaChaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas', + ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas', + ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas', + ClpSalsaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas', + ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas', + ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas', + ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas', + ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas', + ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas', + ClpGcmSivSimd in 
'..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas', + ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas', ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas', ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas', ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas', @@ -680,7 +692,9 @@ uses ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas', ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas', ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas', - ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas', + ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas', + ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas', + ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas', ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas', ClpAesNiCcmKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas', ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas', diff --git a/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr b/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr index fb7a8a29..f48218a3 100644 --- a/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr +++ b/CryptoLib.Examples/Delphi.Examples/CryptoLib.Examples.dpr @@ -32,6 +32,7 @@ uses ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas', ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas', ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas', + ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas', 
ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas', ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas', ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas', @@ -199,6 +200,17 @@ uses ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas', ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas', ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas', + ClpChaChaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas', + ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas', + ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas', + ClpSalsaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas', + ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas', + ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas', + ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas', + ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas', + ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas', + ClpGcmSivSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas', + ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas', ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas', ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas', ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas', @@ -696,7 +708,9 @@ uses ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas', ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas', 
ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas', - ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas', + ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas', + ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas', + ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas', ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas', ClpAesNiCcmKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas', ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas', diff --git a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr index 5cefe272..44c55f12 100644 --- a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr +++ b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dpr @@ -13,6 +13,7 @@ uses ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas', ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas', ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas', + ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas', ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas', ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas', ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas', @@ -180,6 +181,17 @@ uses ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas', ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas', ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas', + ClpChaChaSimd in 
'..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas', + ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas', + ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas', + ClpSalsaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas', + ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas', + ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas', + ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas', + ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas', + ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas', + ClpGcmSivSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas', + ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas', ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas', ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas', ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas', @@ -677,7 +689,9 @@ uses ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas', ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas', ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas', - ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas', + ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas', + ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas', + ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas', ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas', ClpAesNiCcmKernel in 
'..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas', ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas', diff --git a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj index ef8b156f..15cf015c 100644 --- a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj +++ b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.Mobile.dproj @@ -356,6 +356,7 @@ + @@ -523,6 +524,17 @@ + + + + + + + + + + + @@ -1020,7 +1032,9 @@ - + + + diff --git a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr index b224fe2d..2bba9ca1 100644 --- a/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr +++ b/CryptoLib.Tests/Delphi.Tests/CryptoLib.Tests.dpr @@ -32,6 +32,7 @@ uses ClpIAesEngineX86 in '..\..\CryptoLib\src\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas', ClpAesEngineX86 in '..\..\CryptoLib\src\Crypto\Engines\ClpAesEngineX86.pas', ClpAesUtilities in '..\..\CryptoLib\src\Crypto\ClpAesUtilities.pas', + ClpAesSimd in '..\..\CryptoLib\src\Crypto\ClpAesSimd.pas', ClpAesLightEngine in '..\..\CryptoLib\src\Crypto\Engines\ClpAesLightEngine.pas', ClpAgreementUtilities in '..\..\CryptoLib\src\Crypto\Agreements\ClpAgreementUtilities.pas', ClpArgon2ParametersGenerator in '..\..\CryptoLib\src\Crypto\Generators\ClpArgon2ParametersGenerator.pas', @@ -199,6 +200,17 @@ uses ClpIDHGenerators in '..\..\CryptoLib\src\Interfaces\Crypto\Generators\ClpIDHGenerators.pas', ClpIDHParameters in '..\..\CryptoLib\src\Interfaces\Crypto\Parameters\ClpIDHParameters.pas', ClpIBackingHashProvider in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas', + ClpChaChaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaSimd.pas', + ClpChaChaX86Backend in '..\..\CryptoLib\src\Crypto\Engines\ClpChaChaX86Backend.pas', + ClpSalsaSimd in '..\..\CryptoLib\src\Crypto\Engines\ClpSalsaSimd.pas', + ClpSalsaX86Backend in 
'..\..\CryptoLib\src\Crypto\Engines\ClpSalsaX86Backend.pas', + ClpPoly1305State in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305State.pas', + ClpPoly1305Simd in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305Simd.pas', + ClpPoly1305X86Backend in '..\..\CryptoLib\src\Crypto\Macs\ClpPoly1305X86Backend.pas', + ClpGhashSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashSimd.pas', + ClpGhashX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGhashX86Backend.pas', + ClpGcmSivSimd in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivSimd.pas', + ClpGcmSivX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas', ClpIDigest in '..\..\CryptoLib\src\Interfaces\Crypto\Digests\ClpIDigest.pas', ClpIDigestFactory in '..\..\CryptoLib\src\Interfaces\Crypto\Operators\ClpIDigestFactory.pas', ClpIDigestRandomGenerator in '..\..\CryptoLib\src\Interfaces\Rngs\ClpIDigestRandomGenerator.pas', @@ -696,7 +708,9 @@ uses ClpIFusedEaxKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas', ClpIFusedGcmSivKernel in '..\..\CryptoLib\src\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas', ClpFusedKernelRegistry in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas', - ClpAesNiAeadResolver in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas', + ClpAesFusedAeadSimd in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas', + ClpAesFusedAeadX86Backend in '..\..\CryptoLib\src\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas', + ClpBinPolySimd in '..\..\CryptoLib\src\Math\BinPoly\ClpBinPolySimd.pas', ClpAesNiOcbKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas', ClpAesNiCcmKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas', ClpAesNiEaxKernel in '..\..\CryptoLib\src\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas', diff --git a/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas b/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas index 
9bf5daf9..11b9ba80 100644 --- a/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas +++ b/CryptoLib.Tests/src/Math/BinPoly/BinPolyTests.pas @@ -16,11 +16,9 @@ unit BinPolyTests; -interface +{$I ..\..\..\..\CryptoLib\src\Include\CryptoLib.inc} -{$IFDEF FPC} -{$MODE DELPHI} -{$ENDIF FPC} +interface uses SysUtils, @@ -38,7 +36,9 @@ interface ClpIRandom, ClpRandom, ClpBinPolyScalarBackend, +{$IFDEF CRYPTOLIB_X86_SIMD} ClpBinPolyX86V128Backend, +{$ENDIF CRYPTOLIB_X86_SIMD} ClpBinPolyMulBaseBinomialReduce, CryptoLibTestBase; @@ -62,8 +62,14 @@ TPentaCase = record K3: Int32; end; - TTestBinPoly = class(TCryptoLibAlgorithmTestCase) - strict private + /// + /// Shared BinPoly test machinery (constants, RNG helpers, reference + /// implementations, generic assert utilities). Backend-agnostic; consumed by + /// both the public-API suite (TTestBinPoly) and the per-backend + /// vs-scalar suite (TBinPolyBackendTestBase and its subclasses). + /// + TBinPolyTestBase = class abstract(TCryptoLibAlgorithmTestCase) + strict protected const BikeR1 = 12323; RandomTrials = 16; @@ -100,10 +106,14 @@ TTestBinPoly = class(TCryptoLibAlgorithmTestCase) const ARandom: IRandom; const ALabel: string); procedure RunInvertChecks(const AMul: IBinPolyMul; const AInv: IBinPolyInv; AN: Int32; const ARandom: IRandom; const ALabel: string); - procedure RunX86V128VsScalar(AN: Int32; const ARandom: IRandom; - const AContext: string); - procedure AssertX86V128MultiplyEquals(AN: Int32; - const AX, AY: TCryptoLibUInt64Array; const AContext: string); + end; + + /// + /// Architecture-neutral BinPoly tests: exercise the public TBinPolys API + /// (binomial / trinomial / pentanomial multiply / square / invert, factory + /// validation, bit-length). Run on every target. 
+ /// + TTestBinPoly = class(TBinPolyTestBase) published procedure TestBinomial_Add_AgainstXor_BikeR1; procedure TestBinomial_AddTo_AgainstXor_BikeR1; @@ -134,14 +144,52 @@ TTestBinPoly = class(TCryptoLibAlgorithmTestCase) procedure TestPentanomial_Invert_RoundTrip; procedure TestInv_Factory_RejectsNullAndDegenerate; procedure TestBitLengthVar_AgainstReference; - procedure TestX86V128_Multiply_MatchesScalar; - procedure TestX86V128_SizeSweep; - procedure TestX86V128_LSize10_MidWindow; - procedure TestX86V128_SmallSizes_AllOps; - procedure TestX86V128_MultiplyByZeroAndOne; - procedure TestX86V128_EdgeVectors; end; + /// + /// Backend-agnostic suite that diffs a per-arch SIMD IBinPolyMul backend + /// against the scalar reference. A concrete per-arch suite supplies three hooks + /// (BackendSupported / CreateBackendMul / BackendLabel) and + /// registers itself under its architecture guard; the published tests are + /// inherited and discovered automatically. Binds to the arch-neutral + /// IBinPolyMul interface, never to a concrete backend class. 
+ /// + TBinPolyBackendTestBase = class abstract(TBinPolyTestBase) + strict protected + // ---- architecture hooks (implemented by the concrete per-arch suite) ---- + function BackendSupported: Boolean; virtual; abstract; { when this returns False every published TestBackend_* method exits immediately without asserting anything, so the suite passes vacuously } + function CreateBackendMul(AN: Int32; const AReduce: IBinPolyReduce) + : IBinPolyMul; virtual; abstract; + function BackendLabel: String; virtual; abstract; { NOTE(review): no call to BackendLabel appears in any hunk of this diff - confirm it is consumed somewhere (e.g. prefixed onto the assertion context strings) or drop the hook } + + // ---- shared backend-vs-scalar logic ---- + procedure RunBackendVsScalar(AN: Int32; const ARandom: IRandom; + const AContext: string); + procedure AssertBackendMultiplyEquals(AN: Int32; + const AX, AY: TCryptoLibUInt64Array; const AContext: string); + published + procedure TestBackend_Multiply_MatchesScalar; + procedure TestBackend_SizeSweep; + procedure TestBackend_LSize10_MidWindow; + procedure TestBackend_SmallSizes_AllOps; + procedure TestBackend_MultiplyByZeroAndOne; + procedure TestBackend_EdgeVectors; + end; + +{$IFDEF CRYPTOLIB_X86_SIMD} + /// + /// x86/V128 (PCLMULQDQ) instantiation of the BinPoly backend suite. Registered + /// only when CRYPTOLIB_X86_SIMD is defined. 
+ /// + TTestBinPolyX86V128 = class(TBinPolyBackendTestBase) + strict protected + function BackendSupported: Boolean; override; + function CreateBackendMul(AN: Int32; const AReduce: IBinPolyReduce) + : IBinPolyMul; override; + function BackendLabel: String; override; + end; +{$ENDIF CRYPTOLIB_X86_SIMD} + implementation const @@ -239,13 +287,13 @@ implementation { TTestBinPoly } -procedure TTestBinPoly.AssertUInt64ArraysEqual(ASize: Int32; +procedure TBinPolyTestBase.AssertUInt64ArraysEqual(ASize: Int32; const AExpected, AActual: TCryptoLibUInt64Array; const AContext: string); begin Check(TBinPolys.EqualTo(ASize, AExpected, 0, AActual, 0) <> 0, AContext); end; -procedure TTestBinPoly.AssertSliceEquals(const AExpected: TCryptoLibUInt64Array; +procedure TBinPolyTestBase.AssertSliceEquals(const AExpected: TCryptoLibUInt64Array; const AActual: TCryptoLibUInt64Array; AActualOff, ASize: Int32; const AContext: string); var @@ -255,7 +303,7 @@ procedure TTestBinPoly.AssertSliceEquals(const AExpected: TCryptoLibUInt64Array; CheckEquals(AExpected[LI], AActual[AActualOff + LI], AContext + ' limb ' + IntToStr(LI)); end; -procedure TTestBinPoly.AssertGuardZonesEqual(const ABefore, AAfter: TCryptoLibUInt64Array; +procedure TBinPolyTestBase.AssertGuardZonesEqual(const ABefore, AAfter: TCryptoLibUInt64Array; ASliceOff, ASliceSize: Int32; const AContext: string); var LI: Int32; @@ -266,7 +314,7 @@ procedure TTestBinPoly.AssertGuardZonesEqual(const ABefore, AAfter: TCryptoLibUI CheckEquals(ABefore[LI], AAfter[LI], AContext + ' tail guard at ' + IntToStr(LI)); end; -function TTestBinPoly.PadBuffer(ASliceSize, ASliceOff, APadTail: Int32; +function TBinPolyTestBase.PadBuffer(ASliceSize, ASliceOff, APadTail: Int32; const ARandom: IRandom): TCryptoLibUInt64Array; var LTotal, LI, LJ: Int32; @@ -286,7 +334,7 @@ function TTestBinPoly.PadBuffer(ASliceSize, ASliceOff, APadTail: Int32; end; end; -function TTestBinPoly.RandomLimbs(const ARandom: IRandom; ASize: Int32) +function 
TBinPolyTestBase.RandomLimbs(const ARandom: IRandom; ASize: Int32) : TCryptoLibUInt64Array; var LI, LJ: Int32; @@ -305,7 +353,7 @@ function TTestBinPoly.RandomLimbs(const ARandom: IRandom; ASize: Int32) end; end; -function TTestBinPoly.RandomReduced(const ARandom: IRandom; AN: Int32) +function TBinPolyTestBase.RandomReduced(const ARandom: IRandom; AN: Int32) : TCryptoLibUInt64Array; var LSize, LPartial, LI, LJ: Int32; @@ -328,7 +376,7 @@ function TTestBinPoly.RandomReduced(const ARandom: IRandom; AN: Int32) Result[LSize - 1] := Result[LSize - 1] and ((UInt64(1) shl LPartial) - 1); end; -function TTestBinPoly.ReferenceBitLength(ASize: Int32; +function TBinPolyTestBase.ReferenceBitLength(ASize: Int32; const AX: TCryptoLibUInt64Array): Int32; var LBit: Int32; @@ -339,7 +387,7 @@ function TTestBinPoly.ReferenceBitLength(ASize: Int32; Result := 0; end; -function TTestBinPoly.CarrylessMul(AN: Int32; const AX, AY: TCryptoLibUInt64Array) +function TBinPolyTestBase.CarrylessMul(AN: Int32; const AX, AY: TCryptoLibUInt64Array) : TCryptoLibUInt64Array; var LSize, LI, LWOff, LBOff, LJ: Int32; @@ -371,7 +419,7 @@ function TTestBinPoly.CarrylessMul(AN: Int32; const AX, AY: TCryptoLibUInt64Arra end; end; -function TTestBinPoly.ReferenceBinomialMul(AR: Int32; +function TBinPolyTestBase.ReferenceBinomialMul(AR: Int32; const AX, AY: TCryptoLibUInt64Array): TCryptoLibUInt64Array; var LSize, LP, LQ, LPartial: Int32; @@ -397,7 +445,7 @@ function TTestBinPoly.ReferenceBinomialMul(AR: Int32; Result[LSize - 1] := Result[LSize - 1] and LPartialMask; end; -function TTestBinPoly.ReferenceTrinomialMul(AN, AK: Int32; +function TBinPolyTestBase.ReferenceTrinomialMul(AN, AK: Int32; const AX, AY: TCryptoLibUInt64Array): TCryptoLibUInt64Array; var LSize, LP, LQ0, LQ1, LPartial: Int32; @@ -425,7 +473,7 @@ function TTestBinPoly.ReferenceTrinomialMul(AN, AK: Int32; Result[LSize - 1] := Result[LSize - 1] and LPartialMask; end; -function TTestBinPoly.ReferencePentanomialMul(AN, AK1, AK2, AK3: Int32; 
+function TBinPolyTestBase.ReferencePentanomialMul(AN, AK1, AK2, AK3: Int32; const AX, AY: TCryptoLibUInt64Array): TCryptoLibUInt64Array; var LSize, LP, LQ0, LQ1, LQ2, LQ3, LPartial: Int32; @@ -457,7 +505,7 @@ function TTestBinPoly.ReferencePentanomialMul(AN, AK1, AK2, AK3: Int32; Result[LSize - 1] := Result[LSize - 1] and LPartialMask; end; -procedure TTestBinPoly.RunAllOpsAtOffsets(const AMul: IBinPolyMul; AN: Int32; +procedure TBinPolyTestBase.RunAllOpsAtOffsets(const AMul: IBinPolyMul; AN: Int32; const ARandom: IRandom; const ALabel: string); var LSize: Int32; @@ -546,7 +594,7 @@ procedure TTestBinPoly.RunAllOpsAtOffsets(const AMul: IBinPolyMul; AN: Int32; AssertGuardZonesEqual(LZBufBefore, LZBuf, OffZ, LSize, ALabel + ' AddTo zBuf'); end; -procedure TTestBinPoly.RunInvertChecks(const AMul: IBinPolyMul; const AInv: IBinPolyInv; +procedure TBinPolyTestBase.RunInvertChecks(const AMul: IBinPolyMul; const AInv: IBinPolyInv; AN: Int32; const ARandom: IRandom; const ALabel: string); var LSize, LT: Int32; @@ -1393,28 +1441,28 @@ procedure TTestBinPoly.TestInv_Factory_RejectsNullAndDegenerate; end; end; -// Cross-backend check: the x86/V128 backend must agree with the scalar backend -// for Multiply, Square and SquareN, even when operands and outputs live at -// non-zero offsets inside guard-padded buffers (verifies offset handling, that -// inputs are never clobbered, and that nothing is written outside the result -// slice). -procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom; +// Cross-backend check: the SIMD backend under test must agree with the scalar +// backend for Multiply, Square and SquareN, even when operands and outputs live +// at non-zero offsets inside guard-padded buffers (verifies offset handling, +// that inputs are never clobbered, and that nothing is written outside the +// result slice). 
+procedure TBinPolyBackendTestBase.RunBackendVsScalar(AN: Int32; const ARandom: IRandom; const AContext: string); const SquareNCount = 5; var LReduce: IBinPolyReduce; - LScalar, LX86: IBinPolyMul; + LScalar, LSimd: IBinPolyMul; LSize: Int32; LX, LY, LRef, LXBuf, LYBuf, LZBuf, LXBefore, LYBefore, LZBefore: TCryptoLibUInt64Array; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; LReduce := TBinPolyMulBaseBinomialReduce.Create(AN); LScalar := TBinPolyScalarBackend.CreateBinPolyMul(AN, LReduce); - LX86 := TBinPolyX86V128Backend.CreateBinPolyMul(AN, LReduce); - LSize := LX86.Size; + LSimd := CreateBackendMul(AN, LReduce); + LSize := LSimd.Size; // Multiply LX := RandomReduced(ARandom, AN); @@ -1429,7 +1477,7 @@ procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom; LXBefore := System.Copy(LXBuf); LYBefore := System.Copy(LYBuf); LZBefore := System.Copy(LZBuf); - LX86.Multiply(LXBuf, OffX, LYBuf, OffY, LZBuf, OffZ); + LSimd.Multiply(LXBuf, OffX, LYBuf, OffY, LZBuf, OffZ); AssertSliceEquals(LRef, LZBuf, OffZ, LSize, AContext + ' Multiply'); AssertUInt64ArraysEqual(System.Length(LXBuf), LXBefore, LXBuf, AContext + ' Multiply xBuf clobbered'); AssertUInt64ArraysEqual(System.Length(LYBuf), LYBefore, LYBuf, AContext + ' Multiply yBuf clobbered'); @@ -1444,7 +1492,7 @@ procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom; TNat.Copy64(LSize, LX, 0, LXBuf, OffX); LXBefore := System.Copy(LXBuf); LZBefore := System.Copy(LZBuf); - LX86.Square(LXBuf, OffX, LZBuf, OffZ); + LSimd.Square(LXBuf, OffX, LZBuf, OffZ); AssertSliceEquals(LRef, LZBuf, OffZ, LSize, AContext + ' Square'); AssertUInt64ArraysEqual(System.Length(LXBuf), LXBefore, LXBuf, AContext + ' Square xBuf clobbered'); AssertGuardZonesEqual(LZBefore, LZBuf, OffZ, LSize, AContext + ' Square zBuf'); @@ -1458,51 +1506,51 @@ procedure TTestBinPoly.RunX86V128VsScalar(AN: Int32; const ARandom: IRandom; TNat.Copy64(LSize, LX, 0, LXBuf, OffX); 
LXBefore := System.Copy(LXBuf); LZBefore := System.Copy(LZBuf); - LX86.SquareN(LXBuf, OffX, SquareNCount, LZBuf, OffZ); + LSimd.SquareN(LXBuf, OffX, SquareNCount, LZBuf, OffZ); AssertSliceEquals(LRef, LZBuf, OffZ, LSize, AContext + ' SquareN'); AssertUInt64ArraysEqual(System.Length(LXBuf), LXBefore, LXBuf, AContext + ' SquareN xBuf clobbered'); AssertGuardZonesEqual(LZBefore, LZBuf, OffZ, LSize, AContext + ' SquareN zBuf'); end; -procedure TTestBinPoly.AssertX86V128MultiplyEquals(AN: Int32; +procedure TBinPolyBackendTestBase.AssertBackendMultiplyEquals(AN: Int32; const AX, AY: TCryptoLibUInt64Array; const AContext: string); var LReduce: IBinPolyReduce; - LScalar, LX86: IBinPolyMul; - LScalarZ, LX86Z: TCryptoLibUInt64Array; + LScalar, LSimd: IBinPolyMul; + LScalarZ, LSimdZ: TCryptoLibUInt64Array; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; LReduce := TBinPolyMulBaseBinomialReduce.Create(AN); LScalar := TBinPolyScalarBackend.CreateBinPolyMul(AN, LReduce); - LX86 := TBinPolyX86V128Backend.CreateBinPolyMul(AN, LReduce); + LSimd := CreateBackendMul(AN, LReduce); LScalarZ := TBinPolys.Create(LScalar.Size); - LX86Z := TBinPolys.Create(LX86.Size); + LSimdZ := TBinPolys.Create(LSimd.Size); LScalar.Multiply(AX, 0, AY, 0, LScalarZ, 0); - LX86.Multiply(AX, 0, AY, 0, LX86Z, 0); - AssertUInt64ArraysEqual(LScalar.Size, LScalarZ, LX86Z, AContext); + LSimd.Multiply(AX, 0, AY, 0, LSimdZ, 0); + AssertUInt64ArraysEqual(LScalar.Size, LScalarZ, LSimdZ, AContext); end; -procedure TTestBinPoly.TestX86V128_Multiply_MatchesScalar; +procedure TBinPolyBackendTestBase.TestBackend_Multiply_MatchesScalar; var LRandom: IRandom; LT: Int32; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; LRandom := TRandom.Create(FixedSeed + 2000); try for LT := 0 to RandomTrials - 1 do - RunX86V128VsScalar(BikeR1, LRandom, + RunBackendVsScalar(BikeR1, LRandom, 'BikeR1 equivalence trial ' + IntToStr(LT)); finally LRandom := nil; 
end; end; -procedure TTestBinPoly.TestX86V128_SizeSweep; +procedure TBinPolyBackendTestBase.TestBackend_SizeSweep; const SweepCases: array [0 .. 17] of TBinCase = ( (CaseName: 'lsize1'; N: 32), @@ -1527,7 +1575,7 @@ procedure TTestBinPoly.TestX86V128_SizeSweep; LRandom: IRandom; LC, LT: Int32; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; for LC := Low(SweepCases) to High(SweepCases) do @@ -1535,7 +1583,7 @@ procedure TTestBinPoly.TestX86V128_SizeSweep; LRandom := TRandom.Create(FixedSeed + SweepCases[LC].N); try for LT := 0 to RandomTrials - 1 do - RunX86V128VsScalar(SweepCases[LC].N, LRandom, + RunBackendVsScalar(SweepCases[LC].N, LRandom, SweepCases[LC].CaseName + ' trial ' + IntToStr(LT)); finally LRandom := nil; @@ -1543,18 +1591,18 @@ procedure TTestBinPoly.TestX86V128_SizeSweep; end; end; -procedure TTestBinPoly.TestX86V128_LSize10_MidWindow; +procedure TBinPolyBackendTestBase.TestBackend_LSize10_MidWindow; var LRandom: IRandom; LT: Int32; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; LRandom := TRandom.Create(FixedSeed + 610); try for LT := 0 to RandomTrials * 2 - 1 do - RunX86V128VsScalar(610, LRandom, + RunBackendVsScalar(610, LRandom, 'LSize10 mid-window trial ' + IntToStr(LT)); finally LRandom := nil; @@ -1563,12 +1611,12 @@ procedure TTestBinPoly.TestX86V128_LSize10_MidWindow; // Exhaustively exercise every generated small fixed-size kernel (lsize 1..10) // through all three operations at non-zero offsets. 
-procedure TTestBinPoly.TestX86V128_SmallSizes_AllOps; +procedure TBinPolyBackendTestBase.TestBackend_SmallSizes_AllOps; var LRandom: IRandom; LLSize, LT, LN: Int32; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; for LLSize := 1 to 10 do @@ -1577,7 +1625,7 @@ procedure TTestBinPoly.TestX86V128_SmallSizes_AllOps; LRandom := TRandom.Create(FixedSeed + 4000 + LLSize); try for LT := 0 to RandomTrials - 1 do - RunX86V128VsScalar(LN, LRandom, + RunBackendVsScalar(LN, LRandom, 'small lsize' + IntToStr(LLSize) + ' trial ' + IntToStr(LT)); finally LRandom := nil; @@ -1587,7 +1635,7 @@ procedure TTestBinPoly.TestX86V128_SmallSizes_AllOps; // Multiplying by zero must yield zero and multiplying by one must be the // identity, matching the scalar backend, across small, medium and large sizes. -procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne; +procedure TBinPolyBackendTestBase.TestBackend_MultiplyByZeroAndOne; const Sizes: array [0 .. 5] of Int32 = (64, 320, 608, 672, 1248, BikeR1); var @@ -1595,7 +1643,7 @@ procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne; LC, LSize: Int32; LX, LZero, LOne: TCryptoLibUInt64Array; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; LRandom := TRandom.Create(FixedSeed + 5000); @@ -1607,9 +1655,9 @@ procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne; LZero := TBinPolys.Create(LSize); LOne := TBinPolys.Create(LSize); LOne[0] := 1; - AssertX86V128MultiplyEquals(Sizes[LC], LX, LZero, + AssertBackendMultiplyEquals(Sizes[LC], LX, LZero, 'mul-by-zero n=' + IntToStr(Sizes[LC])); - AssertX86V128MultiplyEquals(Sizes[LC], LX, LOne, + AssertBackendMultiplyEquals(Sizes[LC], LX, LOne, 'mul-by-one n=' + IntToStr(Sizes[LC])); end; finally @@ -1620,7 +1668,7 @@ procedure TTestBinPoly.TestX86V128_MultiplyByZeroAndOne; // Adversarial bit patterns (all-ones, single high bit, alternating) maximise // carryless lane interaction and catch lane-splice / shift errors that 
random // inputs may miss. -procedure TTestBinPoly.TestX86V128_EdgeVectors; +procedure TBinPolyBackendTestBase.TestBackend_EdgeVectors; const Sizes: array [0 .. 6] of Int32 = (64, 192, 608, 672, 736, 1248, BikeR1); var @@ -1640,7 +1688,7 @@ procedure TTestBinPoly.TestX86V128_EdgeVectors; end; begin - if not TBinPolyX86V128Backend.IsEnabled then + if not BackendSupported then Exit; for LC := Low(Sizes) to High(Sizes) do @@ -1664,15 +1712,15 @@ procedure TTestBinPoly.TestX86V128_EdgeVectors; LLowBit := TBinPolys.Create(LSize); LLowBit[0] := 1; - AssertX86V128MultiplyEquals(Sizes[LC], LAllOnes, LAllOnes, + AssertBackendMultiplyEquals(Sizes[LC], LAllOnes, LAllOnes, 'edge allones^2 n=' + IntToStr(Sizes[LC])); - AssertX86V128MultiplyEquals(Sizes[LC], LAllOnes, LAlt, + AssertBackendMultiplyEquals(Sizes[LC], LAllOnes, LAlt, 'edge allones*alt n=' + IntToStr(Sizes[LC])); - AssertX86V128MultiplyEquals(Sizes[LC], LHighBit, LHighBit, + AssertBackendMultiplyEquals(Sizes[LC], LHighBit, LHighBit, 'edge highbit^2 n=' + IntToStr(Sizes[LC])); - AssertX86V128MultiplyEquals(Sizes[LC], LHighBit, LAllOnes, + AssertBackendMultiplyEquals(Sizes[LC], LHighBit, LAllOnes, 'edge highbit*allones n=' + IntToStr(Sizes[LC])); - AssertX86V128MultiplyEquals(Sizes[LC], LAlt, LLowBit, + AssertBackendMultiplyEquals(Sizes[LC], LAlt, LLowBit, 'edge alt*lowbit n=' + IntToStr(Sizes[LC])); end; end; @@ -1711,6 +1759,28 @@ procedure TTestBinPoly.TestBitLengthVar_AgainstReference; end; end; +{$IFDEF CRYPTOLIB_X86_SIMD} + +{ TTestBinPolyX86V128 } + +function TTestBinPolyX86V128.BackendSupported: Boolean; +begin + Result := TBinPolyX86V128Backend.IsSupported; { NOTE(review): the tests removed by this patch gated on TBinPolyX86V128Backend.IsEnabled - confirm IsSupported exists on the backend class and is the intended runtime gate, otherwise the suite may run on CPUs the old check skipped } +end; + +function TTestBinPolyX86V128.CreateBackendMul(AN: Int32; + const AReduce: IBinPolyReduce): IBinPolyMul; +begin + Result := TBinPolyX86V128Backend.CreateBinPolyMul(AN, AReduce); +end; + +function TTestBinPolyX86V128.BackendLabel: String; +begin + Result := 'X86V128'; +end; + +{$ENDIF CRYPTOLIB_X86_SIMD} + initialization {$IFDEF FPC} @@ -1719,4 
+1789,12 @@ initialization RegisterTest(TTestBinPoly.Suite); {$ENDIF FPC} +{$IFDEF CRYPTOLIB_X86_SIMD} +{$IFDEF FPC} + RegisterTest(TTestBinPolyX86V128); +{$ELSE} + RegisterTest(TTestBinPolyX86V128.Suite); +{$ENDIF FPC} +{$ENDIF CRYPTOLIB_X86_SIMD} + end. diff --git a/CryptoLib/src/Crypto/ClpAesSimd.pas b/CryptoLib/src/Crypto/ClpAesSimd.pas new file mode 100644 index 00000000..f8a74536 --- /dev/null +++ b/CryptoLib/src/Crypto/ClpAesSimd.pas @@ -0,0 +1,78 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. * } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpAesSimd; + +{$I ..\Include\CryptoLib.inc} + +interface + +uses + ClpIBlockCipher +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + , ClpAesEngineX86 +{$IFEND} + ; + +type + /// + /// Arch-neutral SIMD dispatch facade for hardware-accelerated AES engines. + /// SIMD-only by contract: it produces the per-arch hardware engine (e.g. + /// AES-NI via TAesEngineX86 on x86) + /// when available, or reports "not handled" - it never returns the portable + /// scalar engine. The scalar fallback belongs to the caller + /// (TAesUtilities), matching the Try*-then-scalar shape used across the + /// other SIMD families. Selects the per-arch backend at compile time. + /// + TAesSimd = class sealed + public + /// True when a hardware AES engine is available on this build/CPU. 
+ class function IsSupported: Boolean; static; + /// + /// Create the per-arch hardware AES engine when available (returns True with + /// AEngine set); otherwise AEngine is nil + /// and the caller runs its scalar path (returns False). + /// + class function TryCreateHardwareEngine(out AEngine: IBlockCipher): Boolean; static; + end; + +implementation + +{ TAesSimd } + +class function TAesSimd.IsSupported: Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TAesEngineX86.IsSupported; +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TAesSimd.TryCreateHardwareEngine(out AEngine: IBlockCipher): Boolean; +begin + AEngine := nil; +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + if TAesEngineX86.IsSupported then + begin + AEngine := TAesEngineX86.Create(); + Exit(True); + end; +{$IFEND} + Result := False; +end; + +end. diff --git a/CryptoLib/src/Crypto/ClpAesUtilities.pas b/CryptoLib/src/Crypto/ClpAesUtilities.pas index 141d5d3a..2093a8e6 100644 --- a/CryptoLib/src/Crypto/ClpAesUtilities.pas +++ b/CryptoLib/src/Crypto/ClpAesUtilities.pas @@ -22,25 +22,22 @@ interface uses ClpIBlockCipher, - ClpAesEngine -{$IFDEF CRYPTOLIB_X86_SIMD} - , ClpAesEngineX86 -{$ENDIF} - ; + ClpAesEngine, + ClpAesSimd; type /// - /// Factory for the default AES block cipher. - /// When CRYPTOLIB_X86_SIMD is defined and AES-NI is available at runtime, - /// returns TAesEngineX86 (hardware AES); - /// otherwise TAesEngine. + /// Factory for the default AES block cipher. Selects the per-arch hardware + /// engine at compile time and, when it is available at runtime, returns it + /// (e.g. AES-NI via TAesEngineX86 on x86); otherwise the portable + /// scalar TAesEngine. /// TAesUtilities = class sealed(TObject) public class function CreateEngine(): IBlockCipher; static; /// - /// True when the library is built with CRYPTOLIB_X86_SIMD and AES-NI is available - /// at runtime (TAesEngineX86.IsSupported). Otherwise False. 
+ /// True when the build has a per-arch hardware AES engine and it is available + /// at runtime (its IsSupported is True). Otherwise False. /// class function IsHardwareAccelerated(): Boolean; static; end; @@ -50,21 +47,17 @@ implementation { TAesUtilities } class function TAesUtilities.CreateEngine(): IBlockCipher; +var + LEngine: IBlockCipher; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if IsHardwareAccelerated then - Exit(TAesEngineX86.Create()); -{$ENDIF} + if TAesSimd.TryCreateHardwareEngine(LEngine) then + Exit(LEngine); Result := TAesEngine.Create(); end; class function TAesUtilities.IsHardwareAccelerated(): Boolean; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - Result := TAesEngineX86.IsSupported; -{$ELSE} - Result := False; -{$ENDIF} + Result := TAesSimd.IsSupported; end; end. diff --git a/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas b/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas index f3cc0941..2c04da98 100644 --- a/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas +++ b/CryptoLib/src/Crypto/Engines/ClpChaCha7539Engine.pas @@ -29,8 +29,7 @@ interface ClpChaChaEngine, ClpPack, ClpCryptoLibTypes, - ClpCpuFeatures, - ClpSimdLevels, + ClpChaChaSimd, ClpByteUtilities; resourcestring @@ -58,6 +57,11 @@ TChaCha7539Engine = class(TSalsa20Engine, IChaCha7539Engine, IStreamCipher) strict private procedure ImplProcessBlock(const AInBuf: TCryptoLibByteArray; AInOff: Int32; const AOutBuf: TCryptoLibByteArray; AOutOff: Int32); inline; + // Two-block keystream body (SIMD-accelerated when available, else scalar) with + // no state validation or 2^38 byte-limit accounting - those belong to the + // callers (ProcessBlocks2, ProcessBlocks4), which do them exactly once. 
+ procedure ProcessBlocks2Core(const AInBytes: TCryptoLibByteArray; AInOff: Int32; + const AOutBytes: TCryptoLibByteArray; AOutOff: Int32); public constructor Create(); @@ -82,46 +86,6 @@ TChaCha7539Engine = class(TSalsa20Engine, IChaCha7539Engine, IStreamCipher) implementation -{$IFDEF CRYPTOLIB_X86_SIMD} -procedure ChaCha7539RaiseCounter7539; -begin - raise EInvalidOperationCryptoLibException.CreateRes(@SCounterExceeded); -end; - -procedure ChaCha7539ProcessBlocks2Sse2(ARounds: Int32; AState, AIn, AOut: PByte); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_i386.inc} -{$ENDIF} -end; - -procedure ChaCha7539ProcessBlocks2Avx2(ARounds: Int32; AState, AIn, AOut: PByte); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_i386.inc} -{$ENDIF} -end; - -procedure ChaCha7539ProcessBlocks4Avx2(ARounds: Int32; AState, AIn, AOut: PByte); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_i386.inc} -{$ENDIF} -end; -{$ENDIF} - { TChaCha7539Engine } constructor TChaCha7539Engine.Create; @@ -363,22 +327,17 @@ procedure TChaCha7539Engine.ProcessBlocks2(const AInBytes: TCryptoLibByteArray; begin raise EMaxBytesExceededCryptoLibException.CreateRes(@SMaxByteExceeded); end; -{$IFDEF 
CRYPTOLIB_X86_SIMD} - case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of - TX86SimdLevel.AVX2: - begin - ChaCha7539ProcessBlocks2Avx2(FRounds, PByte(@FEngineState[0]), PByte(@AInBytes[AInOff]), - PByte(@AOutBytes[AOutOff])); - Exit; - end; - TX86SimdLevel.SSE2: - begin - ChaCha7539ProcessBlocks2Sse2(FRounds, PByte(@FEngineState[0]), PByte(@AInBytes[AInOff]), - PByte(@AOutBytes[AOutOff])); - Exit; - end; - end; -{$ENDIF} + + ProcessBlocks2Core(AInBytes, AInOff, AOutBytes, AOutOff); +end; + +procedure TChaCha7539Engine.ProcessBlocks2Core(const AInBytes: TCryptoLibByteArray; + AInOff: Int32; const AOutBytes: TCryptoLibByteArray; AOutOff: Int32); +begin + if TChaChaSimd.TryProcessBlocks2(FRounds, PByte(@FEngineState[0]), + PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])) then + Exit; + ImplProcessBlock(AInBytes, AInOff, AOutBytes, AOutOff); ImplProcessBlock(AInBytes, AInOff + 64, AOutBytes, AOutOff + 64); end; @@ -396,22 +355,17 @@ procedure TChaCha7539Engine.ProcessBlocks4(const AInBytes: TCryptoLibByteArray; raise EInvalidOperationCryptoLibException.CreateResFmt(@SNotBlockAligned, [AlgorithmName]); end; -{$IFDEF CRYPTOLIB_X86_SIMD} - case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) of - TX86SimdLevel.AVX2: - begin - if (LimitExceeded(UInt32(256))) then - begin - raise EMaxBytesExceededCryptoLibException.CreateRes(@SMaxByteExceeded); - end; - ChaCha7539ProcessBlocks4Avx2(FRounds, PByte(@FEngineState[0]), - PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])); - Exit; - end; + if (LimitExceeded(UInt32(256))) then + begin + raise EMaxBytesExceededCryptoLibException.CreateRes(@SMaxByteExceeded); end; -{$ENDIF} - ProcessBlocks2(AInBytes, AInOff, AOutBytes, AOutOff); - ProcessBlocks2(AInBytes, AInOff + 128, AOutBytes, AOutOff + 128); + + if TChaChaSimd.TryProcessBlocks4(FRounds, PByte(@FEngineState[0]), + PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])) then + Exit; + + ProcessBlocks2Core(AInBytes, AInOff, AOutBytes, AOutOff); + 
ProcessBlocks2Core(AInBytes, AInOff + 128, AOutBytes, AOutOff + 128); end; procedure TChaCha7539Engine.ImplProcessBlock( diff --git a/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas b/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas index 3da6bea8..9f26e81d 100644 --- a/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas +++ b/CryptoLib/src/Crypto/Engines/ClpChaChaEngine.pas @@ -26,8 +26,7 @@ interface ClpIChaChaEngine, ClpSalsa20Engine, ClpPack, - ClpCpuFeatures, - ClpSimdLevels, + ClpChaChaSimd, ClpCryptoLibTypes; resourcestring @@ -84,19 +83,6 @@ TChaChaEngine = class(TSalsa20Engine, IChaChaEngine, IStreamCipher) implementation -{$IFDEF CRYPTOLIB_X86_SIMD} -procedure ChaCha20BlockSse2(ARounds: Int32; AInput, AOut: PByte); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc} -{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_i386.inc} -{$ENDIF} -end; -{$ENDIF} - { TChaChaEngine } procedure TChaChaEngine.ProcessBlocks2( @@ -136,15 +122,8 @@ class procedure TChaChaEngine.ChaChaCore(ARounds: Int32; begin raise EArgumentCryptoLibException.CreateRes(@SRoundsEven); end; -{$IFDEF CRYPTOLIB_X86_SIMD} - case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of - TX86SimdLevel.SSE2: - begin - ChaCha20BlockSse2(ARounds, PByte(@AInput[0]), PByte(@AOutput[0])); - Exit; - end; - end; -{$ENDIF} + if TChaChaSimd.TryCore(ARounds, PByte(@AInput[0]), PByte(@AOutput[0])) then + Exit; LX00 := AInput[0]; LX01 := AInput[1]; diff --git a/CryptoLib/src/Crypto/Engines/ClpChaChaSimd.pas b/CryptoLib/src/Crypto/Engines/ClpChaChaSimd.pas new file mode 100644 index 00000000..1100839b --- /dev/null +++ b/CryptoLib/src/Crypto/Engines/ClpChaChaSimd.pas @@ -0,0 +1,79 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu 
Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. * } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpChaChaSimd; + +{$I ..\..\Include\CryptoLib.inc} + +interface + +uses + ClpCryptoLibTypes +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + , ClpChaChaX86Backend +{$IFEND} + ; + +type + /// + /// Arch-neutral SIMD dispatch facade for the ChaCha family. Selects the + /// per-arch backend at compile time; on a build with no + /// SIMD backend every entry point degrades to "not handled" so callers run + /// their scalar reference path. The ChaCha engines call only this facade and + /// stay free of any TCpuFeatures / CRYPTOLIB_*_ASM knowledge. + /// + TChaChaSimd = class sealed + public + /// Single-block ChaCha core (ChaCha20 keystream block). + class function TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean; static; + /// Two-block ChaCha7539 keystream (128 bytes). + class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static; + /// Four-block ChaCha7539 keystream (256 bytes). 
+ class function TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static; + end; + +implementation + +{ TChaChaSimd } + +class function TChaChaSimd.TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TChaChaX86Backend.TryCore(ARounds, AInput, AOut); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TChaChaSimd.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TChaChaX86Backend.TryProcessBlocks2(ARounds, AState, AIn, AOut); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TChaChaSimd.TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TChaChaX86Backend.TryProcessBlocks4(ARounds, AState, AIn, AOut); +{$ELSE} + Result := False; +{$IFEND} +end; + +end. diff --git a/CryptoLib/src/Crypto/Engines/ClpChaChaX86Backend.pas b/CryptoLib/src/Crypto/Engines/ClpChaChaX86Backend.pas new file mode 100644 index 00000000..27f03d99 --- /dev/null +++ b/CryptoLib/src/Crypto/Engines/ClpChaChaX86Backend.pas @@ -0,0 +1,152 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. 
* } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpChaChaX86Backend; + +{$I ..\..\Include\CryptoLib.inc} + +interface + +uses + SysUtils, + ClpCpuFeatures, + ClpSimdLevels, + ClpCryptoLibTypes; + +resourcestring + SCounterExceeded = 'attempt to increase counter past 2^32'; + +type + /// + /// x86 SIMD backend for the ChaCha family: owns the SIMD keystream kernels + /// (bodies in Include\Simd\ChaCha\) and the runtime tier selection via + /// TCpuFeatures.X86.SelectSlot. Compiles on every target - when built + /// without x86 SIMD the Try* entry points return False and the + /// callers fall back to their scalar reference path. + /// + TChaChaX86Backend = class sealed + public + /// SIMD single-block ChaCha core (ChaCha20 keystream block). + class function TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean; static; + /// SIMD two-block ChaCha7539 keystream (128 bytes). + class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static; + /// AVX2 four-block ChaCha7539 keystream (256 bytes). 
+ class function TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static; + end; + +implementation + +{$IFDEF CRYPTOLIB_X86_SIMD} +procedure ChaCha20BlockSse2(ARounds: Int32; AInput, AOut: PByte); +{$IFDEF CRYPTOLIB_X86_64_ASM} +{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_x86_64.inc} +{$ENDIF} +{$IFDEF CRYPTOLIB_I386_ASM} +{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha20BlockSse2_i386.inc} +{$ENDIF} +end; + +procedure ChaCha7539RaiseCounter7539; +begin + raise EInvalidOperationCryptoLibException.CreateRes(@SCounterExceeded); +end; + +procedure ChaCha7539ProcessBlocks2Sse2(ARounds: Int32; AState, AIn, AOut: PByte); +{$IFDEF CRYPTOLIB_X86_64_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_x86_64.inc} +{$ENDIF} +{$IFDEF CRYPTOLIB_I386_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Sse2_i386.inc} +{$ENDIF} +end; + +procedure ChaCha7539ProcessBlocks2Avx2(ARounds: Int32; AState, AIn, AOut: PByte); +{$IFDEF CRYPTOLIB_X86_64_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_x86_64.inc} +{$ENDIF} +{$IFDEF CRYPTOLIB_I386_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks2Avx2_i386.inc} +{$ENDIF} +end; + +procedure ChaCha7539ProcessBlocks4Avx2(ARounds: Int32; AState, AIn, AOut: PByte); +{$IFDEF CRYPTOLIB_X86_64_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_x86_64.inc} +{$ENDIF} +{$IFDEF CRYPTOLIB_I386_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} +{$I ..\..\Include\Simd\ChaCha\ChaCha7539ProcessBlocks4Avx2_i386.inc} +{$ENDIF} +end; +{$ENDIF CRYPTOLIB_X86_SIMD} + +{ TChaChaX86Backend 
} + +class function TChaChaX86Backend.TryCore(ARounds: Int32; AInput, AOut: PByte): Boolean; +begin +{$IFDEF CRYPTOLIB_X86_SIMD} + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: + begin + ChaCha20BlockSse2(ARounds, AInput, AOut); + Exit(True); + end; + end; +{$ENDIF} + Result := False; +end; + +class function TChaChaX86Backend.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; +begin +{$IFDEF CRYPTOLIB_X86_SIMD} + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of + TX86SimdLevel.AVX2: + begin + ChaCha7539ProcessBlocks2Avx2(ARounds, AState, AIn, AOut); + Exit(True); + end; + TX86SimdLevel.SSE2: + begin + ChaCha7539ProcessBlocks2Sse2(ARounds, AState, AIn, AOut); + Exit(True); + end; + end; +{$ENDIF} + Result := False; +end; + +class function TChaChaX86Backend.TryProcessBlocks4(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; +begin +{$IFDEF CRYPTOLIB_X86_SIMD} + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) of + TX86SimdLevel.AVX2: + begin + ChaCha7539ProcessBlocks4Avx2(ARounds, AState, AIn, AOut); + Exit(True); + end; + end; +{$ENDIF} + Result := False; +end; + +end. 
diff --git a/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas b/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas index 3adfab31..222e579d 100644 --- a/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas +++ b/CryptoLib/src/Crypto/Engines/ClpSalsa20Engine.pas @@ -30,8 +30,7 @@ interface ClpICipherParameters, ClpIParametersWithIV, ClpPack, - ClpCpuFeatures, - ClpSimdLevels, + ClpSalsaSimd, ClpByteUtilities, ClpCryptoLibTypes; @@ -146,30 +145,6 @@ TSalsa20Engine = class(TInterfacedObject, ISalsa20Engine, IStreamCipher) implementation -{$IFDEF CRYPTOLIB_X86_SIMD} -procedure Salsa20BlockSse41(ARounds: Int32; AInput, AOut: Pointer); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc} -{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc} -{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_i386.inc} -{$ENDIF} -end; - -procedure Salsa20ProcessBlocks2Sse41(ARounds: Int32; AState, AIn, AOut: PByte); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_i386.inc} -{$ENDIF} -end; -{$ENDIF} - { TSalsa20Engine } constructor TSalsa20Engine.Create; @@ -344,15 +319,11 @@ procedure TSalsa20Engine.ProcessBlocks2( const AOutBytes: TCryptoLibByteArray; AOutOff: Int32); begin AssertInitialisedAndBlockAligned; -{$IFDEF CRYPTOLIB_X86_SIMD} - case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) of - TX86SimdLevel.SSE41: - begin - Salsa20ProcessBlocks2Sse41(FRounds, PByte(@FEngineState[0]), PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])); - Exit; - end; - end; -{$ENDIF} + + if TSalsaSimd.TryProcessBlocks2(FRounds, PByte(@FEngineState[0]), + PByte(@AInBytes[AInOff]), PByte(@AOutBytes[AOutOff])) then + 
Exit; + ImplProcessBlock(AInBytes, AInOff, AOutBytes, AOutOff); ImplProcessBlock(AInBytes, AInOff + 64, AOutBytes, AOutOff + 64); end; @@ -520,15 +491,8 @@ class procedure TSalsa20Engine.SalsaCore(ARounds: Int32; begin raise EArgumentCryptoLibException.CreateRes(@SRoundsMustBeEven); end; -{$IFDEF CRYPTOLIB_X86_SIMD} - case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) of - TX86SimdLevel.SSE41: - begin - Salsa20BlockSse41(ARounds, @AInput[0], @AX[0]); - Exit; - end; - end; -{$ENDIF} + if TSalsaSimd.TryCore(ARounds, @AInput[0], @AX[0]) then + Exit; LX00 := AInput[0]; LX01 := AInput[1]; diff --git a/CryptoLib/src/Crypto/Engines/ClpSalsaSimd.pas b/CryptoLib/src/Crypto/Engines/ClpSalsaSimd.pas new file mode 100644 index 00000000..c69ecba3 --- /dev/null +++ b/CryptoLib/src/Crypto/Engines/ClpSalsaSimd.pas @@ -0,0 +1,68 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. * } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpSalsaSimd; + +{$I ..\..\Include\CryptoLib.inc} + +interface + +uses + ClpCryptoLibTypes +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + , ClpSalsaX86Backend +{$IFEND} + ; + +type + /// + /// Arch-neutral SIMD dispatch facade for the Salsa20 family. Selects the + /// per-arch backend at compile time; on a build with no + /// SIMD backend every entry point degrades to "not handled" so callers run + /// their scalar reference path. 
The Salsa20 engine calls only this facade and + /// stays free of any TCpuFeatures / CRYPTOLIB_*_ASM knowledge. + /// + TSalsaSimd = class sealed + public + /// Single-block Salsa20 core. + class function TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean; static; + /// Two-block Salsa20 keystream (128 bytes). + class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static; + end; + +implementation + +{ TSalsaSimd } + +class function TSalsaSimd.TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TSalsaX86Backend.TryCore(ARounds, AInput, AOut); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TSalsaSimd.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TSalsaX86Backend.TryProcessBlocks2(ARounds, AState, AIn, AOut); +{$ELSE} + Result := False; +{$IFEND} +end; + +end. diff --git a/CryptoLib/src/Crypto/Engines/ClpSalsaX86Backend.pas b/CryptoLib/src/Crypto/Engines/ClpSalsaX86Backend.pas new file mode 100644 index 00000000..f60a33e0 --- /dev/null +++ b/CryptoLib/src/Crypto/Engines/ClpSalsaX86Backend.pas @@ -0,0 +1,100 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. 
* } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpSalsaX86Backend; + +{$I ..\..\Include\CryptoLib.inc} + +interface + +uses + ClpCpuFeatures, + ClpSimdLevels, + ClpCryptoLibTypes; + +type + /// + /// x86 SIMD backend for the Salsa20 family: owns the SIMD keystream kernels + /// (bodies in Include\Simd\Salsa\) and the runtime tier selection via + /// TCpuFeatures.X86.SelectSlot. Compiles on every target - when built + /// without x86 SIMD the Try* entry points return False and the + /// callers fall back to their scalar reference path. + /// + TSalsaX86Backend = class sealed + public + /// SIMD single-block Salsa20 core. + class function TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean; static; + /// SIMD two-block Salsa20 keystream (128 bytes). 
+ class function TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; static; + end; + +implementation + +{$IFDEF CRYPTOLIB_X86_SIMD} +procedure Salsa20BlockSse41(ARounds: Int32; AInput, AOut: Pointer); +{$IFDEF CRYPTOLIB_X86_64_ASM} +{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc} +{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_x86_64.inc} +{$ENDIF} +{$IFDEF CRYPTOLIB_I386_ASM} +{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc} +{$I ..\..\Include\Simd\Salsa\Salsa20BlockSse41_i386.inc} +{$ENDIF} +end; + +procedure Salsa20ProcessBlocks2Sse41(ARounds: Int32; AState, AIn, AOut: PByte); +{$IFDEF CRYPTOLIB_X86_64_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} +{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_x86_64.inc} +{$ENDIF} +{$IFDEF CRYPTOLIB_I386_ASM} +{$I ..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} +{$I ..\..\Include\Simd\Salsa\Salsa20ProcessBlocks2Sse41_i386.inc} +{$ENDIF} +end; +{$ENDIF CRYPTOLIB_X86_SIMD} + +{ TSalsaX86Backend } + +class function TSalsaX86Backend.TryCore(ARounds: Int32; AInput, AOut: Pointer): Boolean; +begin +{$IFDEF CRYPTOLIB_X86_SIMD} + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) of + TX86SimdLevel.SSE41: + begin + Salsa20BlockSse41(ARounds, AInput, AOut); + Exit(True); + end; + end; +{$ENDIF} + Result := False; +end; + +class function TSalsaX86Backend.TryProcessBlocks2(ARounds: Int32; AState, AIn, AOut: PByte): Boolean; +begin +{$IFDEF CRYPTOLIB_X86_SIMD} + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE41]) of + TX86SimdLevel.SSE41: + begin + Salsa20ProcessBlocks2Sse41(ARounds, AState, AIn, AOut); + Exit(True); + end; + end; +{$ENDIF} + Result := False; +end; + +end. 
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas index d4e57fbf..240a97d2 100644 --- a/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas +++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305.pas @@ -34,8 +34,8 @@ interface ClpPack, ClpBitOperations, ClpArrayUtilities, - ClpCpuFeatures, - ClpSimdLevels, + ClpPoly1305State, + ClpPoly1305Simd, ClpCryptoLibTypes; resourcestring @@ -51,23 +51,6 @@ interface 'Poly1305 requires a 128-bit IV when used with a cipher.'; type - /// - /// Poly1305 algorithm state in radix-2^26 form (72 bytes; same layout on - /// every architecture). - /// - /// R0..R4 - clamped 130-bit r split into five 26-bit limbs - /// S1..S4 - precomputed 5 * R1..R4 wraparound multipliers - /// H0..H4 - 130-bit accumulator in five 26-bit limbs (plus a few carry bits) - /// K0..K3 - the Poly1305 "s" key (second half of the 32-byte key) - /// - /// - TPoly1305State = record - R0, R1, R2, R3, R4: UInt32; - S1, S2, S3, S4: UInt32; - H0, H1, H2, H3, H4: UInt32; - K0, K1, K2, K3: UInt32; - end; - TPoly1305 = class sealed(TMac, IPoly1305, IMac) strict private @@ -240,174 +223,6 @@ procedure Poly1305StateProcessBlocksScalar(var AState: TPoly1305State; end; end; -{ AVX2 helpers } - -{$IFDEF CRYPTOLIB_X86_SIMD} - -// Multiply two 5-limb radix-2^26 numbers ALhs, ARhs modulo 2^130-5, -// returning the 5-limb result in AProduct. Same field arithmetic as the -// inner step of Poly1305StateProcessBlock; used here at SetKey time to -// derive r^2..r^4 for the AVX2 power table. Kept private to this unit -// since it has no caller outside the AVX2 setup path. 
-procedure Poly1305MulLimbs(out AProduct: array of UInt32; - const ALhs, ARhs: array of UInt32); -var - LS1, LS2, LS3, LS4: UInt32; - LD0, LD1, LD2, LD3, LD4: UInt64; -begin - LS1 := ARhs[1] * 5; - LS2 := ARhs[2] * 5; - LS3 := ARhs[3] * 5; - LS4 := ARhs[4] * 5; - - LD0 := UInt64(ALhs[0]) * ARhs[0] + UInt64(ALhs[1]) * LS4 + - UInt64(ALhs[2]) * LS3 + UInt64(ALhs[3]) * LS2 + UInt64(ALhs[4]) * LS1; - LD1 := UInt64(ALhs[0]) * ARhs[1] + UInt64(ALhs[1]) * ARhs[0] + - UInt64(ALhs[2]) * LS4 + UInt64(ALhs[3]) * LS3 + UInt64(ALhs[4]) * LS2; - LD2 := UInt64(ALhs[0]) * ARhs[2] + UInt64(ALhs[1]) * ARhs[1] + - UInt64(ALhs[2]) * ARhs[0] + UInt64(ALhs[3]) * LS4 + UInt64(ALhs[4]) * LS3; - LD3 := UInt64(ALhs[0]) * ARhs[3] + UInt64(ALhs[1]) * ARhs[2] + - UInt64(ALhs[2]) * ARhs[1] + UInt64(ALhs[3]) * ARhs[0] + - UInt64(ALhs[4]) * LS4; - LD4 := UInt64(ALhs[0]) * ARhs[4] + UInt64(ALhs[1]) * ARhs[3] + - UInt64(ALhs[2]) * ARhs[2] + UInt64(ALhs[3]) * ARhs[1] + - UInt64(ALhs[4]) * ARhs[0]; - - AProduct[0] := UInt32(LD0) and $3FFFFFF; - LD1 := LD1 + (LD0 shr 26); - AProduct[1] := UInt32(LD1) and $3FFFFFF; - LD2 := LD2 + (LD1 shr 26); - AProduct[2] := UInt32(LD2) and $3FFFFFF; - LD3 := LD3 + (LD2 shr 26); - AProduct[3] := UInt32(LD3) and $3FFFFFF; - LD4 := LD4 + (LD3 shr 26); - AProduct[4] := UInt32(LD4) and $3FFFFFF; - AProduct[0] := AProduct[0] + UInt32(LD4 shr 26) * 5; - AProduct[1] := AProduct[1] + (AProduct[0] shr 26); - AProduct[0] := AProduct[0] and $3FFFFFF; -end; - -// (Re)allocate APowTable to the byte size required by the AVX2 4-way -// bulk kernel and pack the precomputed powers r^1..r^4 of AState.R0..R4 -// into it, in the post-VPERMD layout the kernel expects. Must be called -// once after Poly1305StateAbsorbR has populated AState.R0..R4 and before -// the first invocation of Poly1305ProcessBlocksAvx2 for the same key. -// The exact buffer size and limb layout are private to this routine. 
-procedure Poly1305Avx2InitPowerTable(var APowTable: TCryptoLibByteArray; - const AState: TPoly1305State); -const - // 10 rows x 8 lanes x 4 bytes = 320. Rows 0..4 hold the limbs of - // r^4|r^4|r^4|r^3 | r^4|r^2|r^4|r^1 across the 8 ymm lanes (post-VPERMD - // layout); rows 5..8 hold the 5x wraparound multipliers; row 9 is - // padding for the +4 over-read of the last shifted load. - TableSize = Int32(320); -type - TPowTableLayout = array[0..9, 0..7] of UInt32; - PPowTableLayout = ^TPowTableLayout; -var - LTbl: PPowTableLayout; - Lr1, Lr2, Lr3, Lr4: array[0..4] of UInt32; - LIdx, LRow, LJ: Int32; -begin - System.SetLength(APowTable, TableSize); - LTbl := PPowTableLayout(APowTable); - - Lr1[0] := AState.R0; - Lr1[1] := AState.R1; - Lr1[2] := AState.R2; - Lr1[3] := AState.R3; - Lr1[4] := AState.R4; - - Poly1305MulLimbs(Lr2, Lr1, Lr1); - Poly1305MulLimbs(Lr3, Lr2, Lr1); - Poly1305MulLimbs(Lr4, Lr2, Lr2); - - // Rows 0..4: limbs of r^k for the 4 powers, post-VPERMD layout. - for LIdx := 0 to 4 do - begin - LTbl^[LIdx, 0] := Lr4[LIdx]; - LTbl^[LIdx, 1] := Lr4[LIdx]; - LTbl^[LIdx, 2] := Lr4[LIdx]; - LTbl^[LIdx, 3] := Lr3[LIdx]; - LTbl^[LIdx, 4] := Lr4[LIdx]; - LTbl^[LIdx, 5] := Lr2[LIdx]; - LTbl^[LIdx, 6] := Lr4[LIdx]; - LTbl^[LIdx, 7] := Lr1[LIdx]; - end; - - // Rows 5..8: 5 * limbs[1..4] of r^k (wraparound multipliers). - for LRow := 5 to 8 do - begin - LJ := LRow - 4; // 1..4 - LTbl^[LRow, 0] := Lr4[LJ] * 5; - LTbl^[LRow, 1] := Lr4[LJ] * 5; - LTbl^[LRow, 2] := Lr4[LJ] * 5; - LTbl^[LRow, 3] := Lr3[LJ] * 5; - LTbl^[LRow, 4] := Lr4[LJ] * 5; - LTbl^[LRow, 5] := Lr2[LJ] * 5; - LTbl^[LRow, 6] := Lr4[LJ] * 5; - LTbl^[LRow, 7] := Lr1[LJ] * 5; - end; - - // Row 9 is unused padding for the +4 over-read of the last shifted load. - for LIdx := 0 to 7 do - LTbl^[9, LIdx] := 0; -end; - -// Asm wrapper around the architecture-specific 4-way bulk kernel. 
The .inc -// files contain pure assembly (db-encoded VEX with mnemonic comments); the -// Pascal layer below is just the procedure header + the SimdProc5Begin ABI -// glue + the kernel body include. ACtx points at the 72-byte R/S/H/K -// portion of TPoly1305State; APowTable points at the separate 320-byte -// power table buffer; the kernel never reads the K limbs. -procedure Poly1305BlocksBulkAvx2Core(ACtx, APowTable, AInp: PByte; - ALen: NativeUInt; APad: Int32); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc5Begin_x86_64.inc} -{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc5Begin_i386.inc} -{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_i386.inc} -{$ENDIF} -end; - -// Bulk-processing variant for AVX2-capable CPUs. Rounds ANumBlocks down -// to a multiple of the AVX2 lane count (4) and dispatches the AVX2 kernel -// for that bulk; the 0..3 leftover blocks are forwarded to -// Poly1305StateProcessBlocksScalar. When fewer than 4 blocks are -// available the entire batch is handled by the scalar path. APowTable -// must point at a buffer already populated by Poly1305Avx2InitPowerTable -// for the same r as currently in AState. -procedure Poly1305ProcessBlocksAvx2(var AState: TPoly1305State; - APowTable: PByte; - const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32); -const - // Minimum number of 16-byte blocks before the AVX2 4-way kernel pays off - // over the scalar block step; smaller batches go straight to the scalar - // tail handler below. - LMinBlocks = Int32(4); - // Number of 16-byte blocks consumed per AVX2 kernel iteration (one block - // per 64-bit lane of a 256-bit ymm); used to round the dispatch count - // down to a multiple supported by the kernel. 
- LLaneCount = Int32(4); -var - LSimdBlocks: Int32; -begin - if ANumBlocks >= LMinBlocks then - begin - LSimdBlocks := ANumBlocks and not (LLaneCount - 1); - Poly1305BlocksBulkAvx2Core(@AState, APowTable, @ABuf[AOff], - NativeUInt(LSimdBlocks) * 16, 1); - AOff := AOff + LSimdBlocks * 16; - ANumBlocks := ANumBlocks - LSimdBlocks; - end; - if ANumBlocks > 0 then - Poly1305StateProcessBlocksScalar(AState, ABuf, AOff, ANumBlocks); -end; - -{$ENDIF CRYPTOLIB_X86_SIMD} - { TPoly1305 } constructor TPoly1305.Create(); @@ -481,16 +296,10 @@ procedure TPoly1305.SetKey(const AKeyParameter: IKeyParameter; // Pre-build any SIMD-specific lookup tables for this key, and use the // (non-)allocation of FPowTable as the dispatch flag for BlockUpdate. - // Reset to nil first so the scalar path is the postcondition when no - // SIMD branch matches. To add a new SIMD variant: declare its tier in - // SelectSlot and delegate to its initializer (which owns sizing + layout). + // Reset to nil first so the scalar path is the postcondition when no SIMD + // backend claims the key; the facade fills FPowTable iff a SIMD tier applies. 
FPowTable := nil; -{$IFDEF CRYPTOLIB_X86_SIMD} - case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) of - TX86SimdLevel.AVX2: - Poly1305Avx2InitPowerTable(FPowTable, FState); - end; -{$ENDIF} + TPoly1305Simd.TryInitPowerTable(FPowTable, FState); end; function TPoly1305.GetAlgorithmName: String; @@ -520,7 +329,7 @@ procedure TPoly1305.Update(AInput: Byte); procedure TPoly1305.BlockUpdate(const AInput: TCryptoLibByteArray; AInOff, ALen: Int32); var - LAvailable, LPos, LRemaining, LNb, LBulkBytes: Int32; + LAvailable, LPos, LRemaining, LNb, LBulkBytes, LSimdBlocks: Int32; begin TCheck.DataLength(AInput, AInOff, ALen, 'input buffer too short'); @@ -549,13 +358,13 @@ procedure TPoly1305.BlockUpdate(const AInput: TCryptoLibByteArray; if LNb > 0 then begin LBulkBytes := LNb shl 4; - {$IFDEF CRYPTOLIB_X86_SIMD} - if FPowTable <> nil then - Poly1305ProcessBlocksAvx2(FState, PByte(FPowTable), AInput, - AInOff + LPos, LNb) - else - {$ENDIF} - Poly1305StateProcessBlocksScalar(FState, AInput, AInOff + LPos, LNb); + // The SIMD facade consumes a lane-multiple of the blocks (0 when no SIMD + // path applies for this key) and the scalar reference handles the tail. 
+    LSimdBlocks := TPoly1305Simd.ProcessBulk(FState, PByte(FPowTable), AInput,
+      AInOff + LPos, LNb);
+    if LSimdBlocks < LNb then
+      Poly1305StateProcessBlocksScalar(FState, AInput,
+        AInOff + LPos + LSimdBlocks * 16, LNb - LSimdBlocks);
     LPos := LPos + LBulkBytes;
     LRemaining := ALen - LPos;
   end;
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305Simd.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305Simd.pas
new file mode 100644
index 00000000..568d550d
--- /dev/null
+++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305Simd.pas
@@ -0,0 +1,83 @@
+{ *********************************************************************************** }
+{ *                                CryptoLib Library                                * }
+{ *                          Author - Ugochukwu Mmaduekwe                           * }
+{ *                                Github Repository                                * }
+{ *                                                                                 * }
+{ *  Distributed under the MIT software license, see the accompanying file LICENSE  * }
+{ *           or visit http://www.opensource.org/licenses/mit-license.php.          * }
+{ *                                                                                 * }
+{ *                                Acknowledgements:                                * }
+{ *                                                                                 * }
+{ *     Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring      * }
+{ *                         the development of this library                         * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpPoly1305Simd;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+  ClpPoly1305State,
+  ClpCryptoLibTypes
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+  , ClpPoly1305X86Backend // backend unit is pulled in only on x86 SIMD builds
+{$IFEND}
+  ;
+
+type
+  /// <summary>
+  /// Arch-neutral SIMD dispatch facade for Poly1305. Selects the per-arch
+  /// backend at compile time; on a build with no SIMD
+  /// backend <c>TryInitPowerTable</c> returns False and <c>ProcessBulk</c>
+  /// consumes zero blocks, so the MAC runs entirely on its scalar reference
+  /// path. The Poly1305 MAC calls only this facade and stays free of any
+  /// <c>TCpuFeatures</c> / <c>CRYPTOLIB_*_ASM</c> knowledge.
+  /// </summary>
+  TPoly1305Simd = class sealed
+  public
+    /// <summary>
+    /// If a SIMD tier is available, build the per-key power table into
+    /// <paramref name="APowTable"/> and return True; otherwise leave it
+    /// untouched and return False (the nil-ness of the caller's table then
+    /// doubles as the "scalar path" dispatch flag).
+    /// </summary>
+    class function TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+      const AState: TPoly1305State): Boolean; static;
+    /// <summary>
+    /// Process the leading lane-multiple of <paramref name="ANumBlocks"/> blocks
+    /// with SIMD and return how many blocks were consumed (0 when no SIMD path
+    /// applies); the caller processes the remainder on its scalar path.
+    /// </summary>
+    class function ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+      const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32; static;
+  end;
+
+implementation
+
+{ TPoly1305Simd }
+
+class function TPoly1305Simd.TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+  const AState: TPoly1305State): Boolean;
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := TPoly1305X86Backend.TryInitPowerTable(APowTable, AState); // pure forwarder
+{$ELSE}
+  Result := False; // no backend compiled in: APowTable is not touched
+{$IFEND}
+end;
+
+class function TPoly1305Simd.ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+  const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32;
+begin
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+  Result := TPoly1305X86Backend.ProcessBulk(AState, APowTable, ABuf, AOff, ANumBlocks); // pure forwarder
+{$ELSE}
+  Result := 0; // zero blocks consumed: the caller's scalar path handles the whole batch
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305State.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305State.pas
new file mode 100644
index 00000000..2c226cb4
--- /dev/null
+++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305State.pas
@@ -0,0 +1,43 @@
+{ *********************************************************************************** }
+{ *                                CryptoLib Library                                * }
+{ *                          Author - Ugochukwu Mmaduekwe                           * }
+{ *                                Github Repository                                * }
+{ *                                                                                 * }
+{ *  Distributed under the MIT software license, see the accompanying file LICENSE  * }
+{ *           or visit http://www.opensource.org/licenses/mit-license.php.          * }
+{ *                                                                                 * }
+{ *                                Acknowledgements:                                * }
+{ *                                                                                 * }
+{ *     Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring      * }
+{ *                         the development of this library                         * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpPoly1305State;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+type
+  /// <summary>
+  /// Poly1305 algorithm state in radix-2^26 form (72 bytes; same layout on
+  /// every architecture).
+  /// <list type="bullet">
+  /// <item>R0..R4 - clamped 130-bit r split into five 26-bit limbs</item>
+  /// <item>S1..S4 - precomputed 5 * R1..R4 wraparound multipliers</item>
+  /// <item>H0..H4 - 130-bit accumulator in five 26-bit limbs (plus a few carry bits)</item>
+  /// <item>K0..K3 - the Poly1305 "s" key (second half of the 32-byte key)</item>
+  /// </list>
+  /// </summary>
+  TPoly1305State = record // the x86 backend passes @AState to its asm bulk kernel: keep field order and size
+    R0, R1, R2, R3, R4: UInt32;
+    S1, S2, S3, S4: UInt32;
+    H0, H1, H2, H3, H4: UInt32;
+    K0, K1, K2, K3: UInt32; // 18 UInt32 fields in total = 72 bytes
+  end;
+
+implementation
+
+end.
diff --git a/CryptoLib/src/Crypto/Macs/ClpPoly1305X86Backend.pas b/CryptoLib/src/Crypto/Macs/ClpPoly1305X86Backend.pas
new file mode 100644
index 00000000..3e726f11
--- /dev/null
+++ b/CryptoLib/src/Crypto/Macs/ClpPoly1305X86Backend.pas
@@ -0,0 +1,237 @@
+{ *********************************************************************************** }
+{ *                                CryptoLib Library                                * }
+{ *                          Author - Ugochukwu Mmaduekwe                           * }
+{ *                                Github Repository                                * }
+{ *                                                                                 * }
+{ *  Distributed under the MIT software license, see the accompanying file LICENSE  * }
+{ *           or visit http://www.opensource.org/licenses/mit-license.php.          * }
+{ *                                                                                 * }
+{ *                                Acknowledgements:                                * }
+{ *                                                                                 * }
+{ *     Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring      * }
+{ *                         the development of this library                         * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpPoly1305X86Backend;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+  ClpPoly1305State,
+  ClpCpuFeatures,
+  ClpSimdLevels,
+  ClpCryptoLibTypes;
+
+type
+  /// <summary>
+  /// x86 SIMD backend for Poly1305: owns the AVX2 power-table builder and the
+  /// 4-way bulk kernel (body in Include\Simd\Poly1305\) plus the runtime
+  /// tier selection via <c>TCpuFeatures.X86.SelectSlot</c>. Compiles on every
+  /// target - when built without x86 SIMD <c>TryInitPowerTable</c> returns
+  /// False (leaving the caller on the scalar path) and <c>ProcessBulk</c>
+  /// consumes zero blocks.
+  /// </summary>
+  TPoly1305X86Backend = class sealed
+  public
+    /// <summary>
+    /// If a SIMD tier is available, (re)allocate and populate
+    /// <paramref name="APowTable"/> with the precomputed power table for the r currently in
+    /// <paramref name="AState"/>, and return True. Otherwise leave
+    /// <paramref name="APowTable"/> untouched and return False.
+    /// </summary>
+    class function TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+      const AState: TPoly1305State): Boolean; static;
+    /// <summary>
+    /// Process the leading lane-multiple of <paramref name="ANumBlocks"/> 16-byte
+    /// blocks with the AVX2 kernel and return the number of blocks consumed
+    /// (a multiple of the lane count). Returns 0 - leaving the whole batch to the
+    /// caller's scalar path - when no power table is present, fewer than one lane
+    /// of blocks is available, or the build has no x86 SIMD.
+    /// </summary>
+    class function ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+      const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32; static;
+  end;
+
+implementation
+
+{$IFDEF CRYPTOLIB_X86_SIMD}
+
+// Multiply two 5-limb radix-2^26 numbers ALhs, ARhs modulo 2^130-5,
+// returning the 5-limb result in AProduct. Same field arithmetic as the
+// inner step of Poly1305StateProcessBlock; used here at SetKey time to
+// derive r^2..r^4 for the AVX2 power table.
+procedure Poly1305MulLimbs(out AProduct: array of UInt32;
+  const ALhs, ARhs: array of UInt32);
+var
+  LS1, LS2, LS3, LS4: UInt32;
+  LD0, LD1, LD2, LD3, LD4: UInt64;
+begin
+  LS1 := ARhs[1] * 5; // 5 * limb: multiplier for terms that wrap past 2^130 (2^130 = 5 mod 2^130-5)
+  LS2 := ARhs[2] * 5;
+  LS3 := ARhs[3] * 5;
+  LS4 := ARhs[4] * 5;
+
+  LD0 := UInt64(ALhs[0]) * ARhs[0] + UInt64(ALhs[1]) * LS4 + // 64-bit column sums of the schoolbook product
+    UInt64(ALhs[2]) * LS3 + UInt64(ALhs[3]) * LS2 + UInt64(ALhs[4]) * LS1;
+  LD1 := UInt64(ALhs[0]) * ARhs[1] + UInt64(ALhs[1]) * ARhs[0] +
+    UInt64(ALhs[2]) * LS4 + UInt64(ALhs[3]) * LS3 + UInt64(ALhs[4]) * LS2;
+  LD2 := UInt64(ALhs[0]) * ARhs[2] + UInt64(ALhs[1]) * ARhs[1] +
+    UInt64(ALhs[2]) * ARhs[0] + UInt64(ALhs[3]) * LS4 + UInt64(ALhs[4]) * LS3;
+  LD3 := UInt64(ALhs[0]) * ARhs[3] + UInt64(ALhs[1]) * ARhs[2] +
+    UInt64(ALhs[2]) * ARhs[1] + UInt64(ALhs[3]) * ARhs[0] +
+    UInt64(ALhs[4]) * LS4;
+  LD4 := UInt64(ALhs[0]) * ARhs[4] + UInt64(ALhs[1]) * ARhs[3] +
+    UInt64(ALhs[2]) * ARhs[2] + UInt64(ALhs[3]) * ARhs[1] +
+    UInt64(ALhs[4]) * ARhs[0];
+
+  AProduct[0] := UInt32(LD0) and $3FFFFFF; // keep the low 26 bits, carry the rest into the next column
+  LD1 := LD1 + (LD0 shr 26);
+  AProduct[1] := UInt32(LD1) and $3FFFFFF;
+  LD2 := LD2 + (LD1 shr 26);
+  AProduct[2] := UInt32(LD2) and $3FFFFFF;
+  LD3 := LD3 + (LD2 shr 26);
+  AProduct[3] := UInt32(LD3) and $3FFFFFF;
+  LD4 := LD4 + (LD3 shr 26);
+  AProduct[4] := UInt32(LD4) and $3FFFFFF;
+  AProduct[0] := AProduct[0] + UInt32(LD4 shr 26) * 5; // carry out of limb 4 wraps to limb 0, times 5
+  AProduct[1] := AProduct[1] + (AProduct[0] shr 26); // limb 1 is not re-masked after this carry
+  AProduct[0] := AProduct[0] and $3FFFFFF;
+end;
+
+// (Re)allocate APowTable to the byte size required by the AVX2 4-way
+// bulk kernel and pack the precomputed powers r^1..r^4 of AState.R0..R4
+// into it, in the post-VPERMD layout the kernel expects. Must be called
+// once after AState.R0..R4 is populated and before the first invocation of
+// the bulk kernel for the same key. The exact buffer size and limb layout
+// are private to this routine.
+procedure Poly1305Avx2InitPowerTable(var APowTable: TCryptoLibByteArray;
+  const AState: TPoly1305State);
+const
+  // 10 rows x 8 lanes x 4 bytes = 320. Rows 0..4 hold the limbs of
+  // r^4|r^4|r^4|r^3 | r^4|r^2|r^4|r^1 across the 8 ymm lanes (post-VPERMD
+  // layout); rows 5..8 hold the 5x wraparound multipliers; row 9 is
+  // padding for the +4 over-read of the last shifted load.
+  TableSize = Int32(320);
+type
+  TPowTableLayout = array[0..9, 0..7] of UInt32;
+  PPowTableLayout = ^TPowTableLayout;
+var
+  LTbl: PPowTableLayout;
+  Lr1, Lr2, Lr3, Lr4: array[0..4] of UInt32;
+  LIdx, LRow, LJ: Int32;
+begin
+  System.SetLength(APowTable, TableSize);
+  LTbl := PPowTableLayout(APowTable); // view the byte buffer as 10 rows of 8 UInt32
+
+  Lr1[0] := AState.R0;
+  Lr1[1] := AState.R1;
+  Lr1[2] := AState.R2;
+  Lr1[3] := AState.R3;
+  Lr1[4] := AState.R4;
+
+  Poly1305MulLimbs(Lr2, Lr1, Lr1); // r^2 = r * r
+  Poly1305MulLimbs(Lr3, Lr2, Lr1); // r^3 = r^2 * r
+  Poly1305MulLimbs(Lr4, Lr2, Lr2); // r^4 = r^2 * r^2
+
+  // Rows 0..4: limbs of r^k for the 4 powers, post-VPERMD layout.
+  for LIdx := 0 to 4 do
+  begin
+    LTbl^[LIdx, 0] := Lr4[LIdx];
+    LTbl^[LIdx, 1] := Lr4[LIdx];
+    LTbl^[LIdx, 2] := Lr4[LIdx];
+    LTbl^[LIdx, 3] := Lr3[LIdx];
+    LTbl^[LIdx, 4] := Lr4[LIdx];
+    LTbl^[LIdx, 5] := Lr2[LIdx];
+    LTbl^[LIdx, 6] := Lr4[LIdx];
+    LTbl^[LIdx, 7] := Lr1[LIdx];
+  end;
+
+  // Rows 5..8: 5 * limbs[1..4] of r^k (wraparound multipliers).
+  for LRow := 5 to 8 do
+  begin
+    LJ := LRow - 4; // 1..4
+    LTbl^[LRow, 0] := Lr4[LJ] * 5;
+    LTbl^[LRow, 1] := Lr4[LJ] * 5;
+    LTbl^[LRow, 2] := Lr4[LJ] * 5;
+    LTbl^[LRow, 3] := Lr3[LJ] * 5;
+    LTbl^[LRow, 4] := Lr4[LJ] * 5;
+    LTbl^[LRow, 5] := Lr2[LJ] * 5;
+    LTbl^[LRow, 6] := Lr4[LJ] * 5;
+    LTbl^[LRow, 7] := Lr1[LJ] * 5;
+  end;
+
+  // Row 9 is unused padding for the +4 over-read of the last shifted load.
+  for LIdx := 0 to 7 do
+    LTbl^[9, LIdx] := 0;
+end;
+
+// Asm wrapper around the architecture-specific 4-way bulk kernel. The .inc
+// files contain pure assembly (db-encoded VEX with mnemonic comments); the
+// Pascal layer below is just the procedure header + the SimdProc5Begin ABI
+// glue + the kernel body include. ACtx points at the 72-byte R/S/H/K
+// portion of TPoly1305State; APowTable points at the separate 320-byte
+// power table buffer; the kernel never reads the K limbs.
+procedure Poly1305BlocksBulkAvx2Core(ACtx, APowTable, AInp: PByte;
+  ALen: NativeUInt; APad: Int32);
+{$IFDEF CRYPTOLIB_X86_64_ASM} // NOTE(review): assumes CRYPTOLIB_X86_SIMD implies exactly one of the two *_ASM symbols - confirm in CryptoLib.inc
+{$I ..\..\Include\Simd\Common\SimdProc5Begin_x86_64.inc}
+{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\Include\Simd\Common\SimdProc5Begin_i386.inc}
+{$I ..\..\Include\Simd\Poly1305\Poly1305BlocksBulkAvx2Core_i386.inc}
+{$ENDIF}
+end;
+
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
+{ TPoly1305X86Backend }
+
+class function TPoly1305X86Backend.TryInitPowerTable(var APowTable: TCryptoLibByteArray;
+  const AState: TPoly1305State): Boolean;
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) of // AVX2 is the only tier offered here
+    TX86SimdLevel.AVX2:
+      begin
+        Poly1305Avx2InitPowerTable(APowTable, AState);
+        Exit(True);
+      end;
+  end;
+{$ENDIF}
+  Result := False; // no tier selected (or no x86 SIMD build): APowTable is not touched
+end;
+
+class function TPoly1305X86Backend.ProcessBulk(var AState: TPoly1305State; APowTable: PByte;
+  const ABuf: TCryptoLibByteArray; AOff, ANumBlocks: Int32): Int32;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+const
+  // Minimum number of 16-byte blocks before the AVX2 4-way kernel pays off
+  // over the scalar block step; smaller batches go straight to the caller's
+  // scalar tail.
+  LMinBlocks = Int32(4);
+  // Number of 16-byte blocks consumed per AVX2 kernel iteration (one block
+  // per 64-bit lane of a 256-bit ymm); used to round the dispatch count
+  // down to a multiple supported by the kernel.
+  LLaneCount = Int32(4);
+var
+  LSimdBlocks: Int32;
+{$ENDIF}
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if (APowTable <> nil) and (ANumBlocks >= LMinBlocks) then
+  begin
+    LSimdBlocks := ANumBlocks and not (LLaneCount - 1); // round down to a multiple of LLaneCount (mask trick needs a power of two)
+    Poly1305BlocksBulkAvx2Core(@AState, APowTable, @ABuf[AOff],
+      NativeUInt(LSimdBlocks) * 16, 1); // length in bytes; APad = 1 is presumably the 2^128 pad bit for full blocks - TODO confirm against the kernel .inc
+    Exit(LSimdBlocks);
+  end;
+{$ENDIF}
+  Result := 0; // nothing consumed: the caller runs the whole batch on its scalar path
+end;
+
+end.
diff --git a/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas index b114dc69..b1e3d50e 100644 --- a/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas +++ b/CryptoLib/src/Crypto/Modes/ClpCcmBlockCipher.pas @@ -79,11 +79,9 @@ TCcmBlockCipher = class(TInterfacedObject, ICcmBlockCipher, FLastKey: TCryptoLibByteArray; FAssociatedText: TMemoryStream; FData: TMemoryStream; -{$IFDEF CRYPTOLIB_X86_SIMD} // Cached once per Init; non-nil when the registry resolved a fused // CCM kernel for the underlying cipher and current direction. FCcmKernel: IFusedCcmKernel; -{$ENDIF CRYPTOLIB_X86_SIMD} class function GetMacSize(ARequestedMacBits: Int32): Int32; static; procedure CheckNonceReuse(AForEncryption: Boolean; @@ -92,7 +90,6 @@ TCcmBlockCipher = class(TInterfacedObject, ICcmBlockCipher, function HasAssociatedText(): Boolean; function CalculateMac(const AData: TCryptoLibByteArray; ADataOff, ADataLen: Int32; const AMacBlock: TCryptoLibByteArray): Int32; -{$IFDEF CRYPTOLIB_X86_SIMD} // Runs AES CBC-MAC over the CCM header (B_0 || AAD length-prefix || // AAD || zero-pad) and writes the post-header 16-byte state into // AMacState. Matches the scalar CalculateMac contract. 
@@ -110,7 +107,6 @@ TCcmBlockCipher = class(TInterfacedObject, ICcmBlockCipher, AInOff, AInLen, AOutputLen: Int32; const AOutput: TCryptoLibByteArray; AOutOff: Int32; const AIV: TCryptoLibByteArray): Boolean; -{$ENDIF CRYPTOLIB_X86_SIMD} strict protected function GetAlgorithmName: String; virtual; @@ -200,9 +196,7 @@ procedure TCcmBlockCipher.Init(AForEncryption: Boolean; var LChoice: TCipherAeadChoice; LRequestedMacSizeBits: Int32; -{$IFDEF CRYPTOLIB_X86_SIMD} LDirection: TFusedModeDirection; -{$ENDIF CRYPTOLIB_X86_SIMD} begin FForEncryption := AForEncryption; @@ -234,7 +228,6 @@ procedure TCcmBlockCipher.Init(AForEncryption: Boolean; if (System.Length(FNonce) < 7) or (System.Length(FNonce) > 13) then raise EArgumentCryptoLibException.CreateRes(@SNonceLengthRange); -{$IFDEF CRYPTOLIB_X86_SIMD} FCcmKernel := nil; if FKeyParam <> nil then begin @@ -248,7 +241,6 @@ procedure TCcmBlockCipher.Init(AForEncryption: Boolean; LDirection := TFusedModeDirection.Decrypt; TFusedKernelRegistry.TryAcquireCcm(FCipher, LDirection, FCcmKernel); end; -{$ENDIF CRYPTOLIB_X86_SIMD} Reset(); end; @@ -411,7 +403,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray; LOutputLen := AInLen + FMacSize; TCheck.OutputLength(AOutput, AOutOff, LOutputLen, SOutputBufferTooShort); -{$IFDEF CRYPTOLIB_X86_SIMD} // Fused fast path folds CTR and CBC-MAC into one sweep; the scalar // path handles the 1..16-byte tail and the tag encryption. if (FCcmKernel <> nil) @@ -422,7 +413,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray; Result := LOutputLen; Exit; end; -{$ENDIF CRYPTOLIB_X86_SIMD} CalculateMac(AInput, AInOff, AInLen, FMacBlock); @@ -470,7 +460,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray; LOutputLen := AInLen - FMacSize; TCheck.OutputLength(AOutput, AOutOff, LOutputLen, SOutputBufferTooShort); -{$IFDEF CRYPTOLIB_X86_SIMD} // Fused decrypt twin. 
Scalar tail handles the trailing 1..16-byte // block plus the FixedTime tag compare. if (FCcmKernel <> nil) @@ -481,7 +470,6 @@ function TCcmBlockCipher.ProcessPacket(const AInput: TCryptoLibByteArray; Result := LOutputLen; Exit; end; -{$ENDIF CRYPTOLIB_X86_SIMD} System.Move(AInput[AInOff + LOutputLen], FMacBlock[0], FMacSize); @@ -649,7 +637,6 @@ function TCcmBlockCipher.HasAssociatedText: Boolean; Result := GetAssociatedTextLength() > 0; end; -{$IFDEF CRYPTOLIB_X86_SIMD} procedure TCcmBlockCipher.ComputePostHeaderMacState(AInLen: Int32; const AMacState: TCryptoLibByteArray); @@ -865,6 +852,5 @@ function TCcmBlockCipher.ProcessPacketDecryptFused( end; end; -{$ENDIF CRYPTOLIB_X86_SIMD} end. diff --git a/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas index 901b42ba..38783a47 100644 --- a/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas +++ b/CryptoLib/src/Crypto/Modes/ClpEaxBlockCipher.pas @@ -306,7 +306,9 @@ procedure TEaxBlockCipher.Init(AForEncryption: Boolean; // tag-lookahead in FBufBlock: the first FBlockSize bytes of FBufBlock // are the block that has just been confirmed as ciphertext (not tag), // the last FMacSize bytes remain the trailing tag candidate. -{$IFDEF CRYPTOLIB_X86_SIMD} + // Off-SIMD (or any build with no registered fused factory) TryAcquireEax + // leaves FEaxKernel nil, so FUseFusedBody is False and the scalar + // TCMac / TSicBlockCipher path runs - no compile-time arch gating needed. 
FEaxKernel := nil; if FForEncryption then TFusedKernelRegistry.TryAcquireEax(FCipher, @@ -315,10 +317,6 @@ procedure TEaxBlockCipher.Init(AForEncryption: Boolean; TFusedKernelRegistry.TryAcquireEax(FCipher, TFusedModeDirection.Decrypt, FEaxKernel); FUseFusedBody := FEaxKernel <> nil; -{$ELSE} - FEaxKernel := nil; - FUseFusedBody := False; -{$ENDIF CRYPTOLIB_X86_SIMD} if FUseFusedBody then begin diff --git a/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas index 42b3bdb9..9127ab92 100644 --- a/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas +++ b/CryptoLib/src/Crypto/Modes/ClpGcmBlockCipher.pas @@ -34,6 +34,7 @@ interface ClpIGcmMultiplier, ClpIGcmExponentiator, ClpGcmUtilities, + ClpGhashSimd, ClpBasicGcmExponentiator, ClpTables4kGcmMultiplier, ClpIBulkBlockCipher, @@ -46,8 +47,6 @@ interface ClpPack, ClpCheck, ClpBasicGcmMultiplier, - ClpCpuFeatures, - ClpIntrinsicsVector, ClpArrayUtilities, ClpCryptoLibTypes; @@ -90,18 +89,14 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher, public class function CreateGcmMultiplier(): IGcmMultiplier; static; /// - /// True when the fused four-block SIMD path may run: PCLMULQDQ + SSSE3 shuffled GHASH, - /// batched counter AES, and a packed 16-byte XMM layout. + /// True when the fused four-block SIMD path may run: hardware shuffled GHASH, + /// batched counter AES, and a packed 16-byte vector layout. /// class function IsFourWaySupported: Boolean; static; /// /// True when the fused eight-block SIMD path may run (128-byte CTR batch + wider GHASH). /// class function IsEightWaySupported: Boolean; static; - /// - /// True when the 128-bit SSE2 XOR fast path may run for one and two-block steps (with packed layout). - /// - class function IsSse2PackedVectorXorSupported: Boolean; static; strict private @@ -111,13 +106,12 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher, // exposes the generic IBulkBlockCipher capability. 
Drives the // non-fused 4/8-block CTR dispatchers (GetNextCtrBlocks4/8). FBulkCipher: IBulkBlockCipher; -{$IFDEF CRYPTOLIB_X86_SIMD} // Fused CTR+GHASH kernel resolved via TFusedKernelRegistry at Init // time. Non-nil only when an accelerator factory accepts the - // underlying cipher + direction and the fused gate is open. + // underlying cipher + direction and the fused gate is open (always nil + // off-SIMD; IFusedGcmKernel is arch-neutral). FGcmKernel: IFusedGcmKernel; FGcmKernelMinBlocks: Int32; -{$ENDIF CRYPTOLIB_X86_SIMD} FMultiplier: IGcmMultiplier; FExp: IGcmExponentiator; @@ -195,29 +189,23 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher, procedure ProcessBlocks8Pipelined(const AInBuf: TCryptoLibByteArray; var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32; ALimit: Int32; AForEncrypt: Boolean); -{$IFDEF CRYPTOLIB_X86_SIMD} // ===================================================================== - // Fused AES-NI + 8-way GHASH pipeline (x86-64 and i386). + // Fused block-cipher keystream + 8-way GHASH pipeline (provided by the + // fused kernel registry; nil kernel -> not used, e.g. off-SIMD). // ===================================================================== // This outer driver is arch-agnostic: pure Pascal batch orchestration - // plus one call into an IFusedGcmKernel per 8-block stride. The - // underlying assembly kernel has two variants keyed on register - // budget: - // * x86-64: Gueron-style single-pass 8-wide interleave that keeps - // 15 of 16 XMM registers simultaneously live (8 AES state + - // 3 GHASH accumulators + 1 round key + 1 GHASH block + - // 1 H-power + 1 PCLMUL scratch + 1 byte-reverse mask). - // * i386: two back-to-back 4-wide halves sharing running Z0/Z1/Z2 - // accumulators, sized to fit in xmm0..xmm7. AES rounds 1..4 in - // each half carry the 4 pclmul iters so port-0 / port-5 ILP - // overlap is preserved within the 8-register budget. 
- // Both variants expose the same IFusedGcmKernel surface so this - // driver only sees the kernel interface. - /// Fills ABlocks[0..127] with eight 16-byte counter blocks (pre-AES form). Used by the FusedILP pipeline where AES is performed inside the fused assembly kernel. + // plus one call into an IFusedGcmKernel per 8-block stride. The kernel + // fuses CTR-mode keystream generation with the GHASH multiply-reduce in a + // single pass, interleaving the two at the instruction level so their + // independent execution units overlap. How wide that interleave runs and + // how it is scheduled against the available vector-register budget is a + // backend detail hidden entirely behind the IFusedGcmKernel surface, so + // this driver only ever sees the kernel interface. + /// Fills ABlocks[0..127] with eight 16-byte counter blocks (pre-cipher form). Used by the FusedILP pipeline where the block-cipher keystream is produced inside the fused kernel. procedure FillNextCtrBlocks8Raw(const ABlocks: TCryptoLibByteArray); /// - /// Pipelined GCM path driven by FGcmKernel (x86-64 and i386). Active - /// when a fused CTR+GHASH kernel was acquired at Init; otherwise the + /// Pipelined GCM path driven by FGcmKernel (when a fused kernel is + /// registered). Active when a fused CTR+GHASH kernel was acquired at Init; otherwise the /// caller falls back to ProcessBlocks8Pipelined. AForEncrypt selects /// direction: encrypt GHASHes the prior iteration's OUTPUT /// ciphertext, decrypt GHASHes the prior iteration's INPUT @@ -227,7 +215,6 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher, procedure ProcessBlocks8FusedILP(const AInBuf: TCryptoLibByteArray; var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32; ALimit: Int32; AForEncrypt: Boolean); -{$ENDIF CRYPTOLIB_X86_SIMD} // --------------------------------------------------------------------- // Cipher-state setup / per-call initialization. 
@@ -249,7 +236,7 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher, procedure CheckNonceReuse(AForEncryption: Boolean; const ANewNonce: TCryptoLibByteArray; const AKeyParam: IKeyParameter); /// Rekey path: initialize the underlying block cipher, compute the hash - /// subkey H, cache the AES-NI engine (when available), and (re)allocate the + /// subkey H, cache the bulk-capable cipher engine (when available), and (re)allocate the /// 8-way SIMD buffers (FHPow / FWorkCtr / FWorkCtrAhead) on capable hardware. /// Called only when a new key is supplied. procedure InitCipherAndHashSubKey(const AKeyParam: IKeyParameter); @@ -340,79 +327,31 @@ TGcmBlockCipher = class(TInterfacedObject, IGcmBlockCipher, implementation -{$IFDEF CRYPTOLIB_X86_SIMD} -const - ReverseBytesMask: packed array[0..15] of Byte = ( - $0F, $0E, $0D, $0C, $0B, $0A, $09, $08, $07, $06, $05, $04, $03, $02, $01, $00); -{$ENDIF} - // ======================================================================= -// Class-level CPU feature probes and multiplier factory. +// Class-level capability probes and multiplier factory. All arch-neutral: +// the GHASH SIMD facade answers False off-SIMD, so the mode's fast paths +// simply fall through to their scalar reference code. 
// ======================================================================= class function TGcmBlockCipher.IsFourWaySupported: Boolean; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - Result := TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and - TIntrinsicsVector.IsPacked; -{$ELSE} - Result := False; -{$ENDIF} + Result := TGhashSimd.IsShuffledGhashSupported; end; class function TGcmBlockCipher.IsEightWaySupported: Boolean; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - Result := TGcmBlockCipher.IsFourWaySupported; -{$ELSE} - Result := False; -{$ENDIF} + Result := TGhashSimd.IsShuffledGhashSupported; end; -class function TGcmBlockCipher.IsSse2PackedVectorXorSupported: Boolean; -begin -{$IFDEF CRYPTOLIB_X86_SIMD} - Result := TCpuFeatures.X86.HasSSE2 and TIntrinsicsVector.IsPacked; -{$ELSE} - Result := False; -{$ENDIF} -end; - -{$IFDEF CRYPTOLIB_X86_SIMD} -procedure GcmBlockXor128Sse2(PDst, PSrc: PByte); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc2Begin_x86_64.inc} -{$I ..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc2Begin_i386.inc} -{$I ..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_i386.inc} -{$ENDIF} -end; - -procedure GcmBlockReverse128Ssse3(PDst, PSrc, PMask: PByte); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc} -{$I ..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\Include\Simd\Common\SimdProc3Begin_i386.inc} -{$I ..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_i386.inc} -{$ENDIF} -end; -{$ENDIF} - { TGcmBlockCipher } class function TGcmBlockCipher.CreateGcmMultiplier: IGcmMultiplier; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if TCpuFeatures.X86.HasPCLMULQDQ then + if TGhashSimd.HasCarrylessMultiply then begin Result := TBasicGcmMultiplier.Create(); Exit; end; -{$ENDIF} Result := TTables4kGcmMultiplier.Create(); end; @@ -510,10 +449,8 @@ procedure 
TGcmBlockCipher.InitCipherAndHashSubKey(const AKeyParam: IKeyParameter TBlockCipherBulkUtilities.TryResolveBulkCipher(FCipher, FBulkCipher); -{$IFDEF CRYPTOLIB_X86_SIMD} FGcmKernel := nil; FGcmKernelMinBlocks := 0; -{$ENDIF CRYPTOLIB_X86_SIMD} FH := nil; System.SetLength(FH, BlockSize); @@ -524,7 +461,6 @@ procedure TGcmBlockCipher.InitCipherAndHashSubKey(const AKeyParam: IKeyParameter FHPow := nil; FWorkCtr := nil; FWorkCtrAhead := nil; -{$IFDEF CRYPTOLIB_X86_SIMD} if TGcmBlockCipher.IsFourWaySupported then begin System.SetLength(FHPow, 128); @@ -545,7 +481,6 @@ procedure TGcmBlockCipher.InitCipherAndHashSubKey(const AKeyParam: IKeyParameter end; end; end; -{$ENDIF} end; procedure TGcmBlockCipher.ComputeJ0(); @@ -836,7 +771,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray; AOutOff := AOutOff + BlockSize; end; -{$IFDEF CRYPTOLIB_X86_SIMD} if TGcmBlockCipher.IsEightWaySupported and (ALen >= BlockSize * 8) then begin EncryptBlocks8(AInput, AInOff, ALen, AOutput, AOutOff); @@ -871,7 +805,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray; end; end else -{$ENDIF} begin while ALen >= BlockSize * 2 do begin @@ -952,7 +885,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray; LThresh4 := LBufLen + (BlockSize * 3); LThresh8 := LBufLen + (BlockSize * 7); -{$IFDEF CRYPTOLIB_X86_SIMD} if TGcmBlockCipher.IsEightWaySupported and (ALen >= LThresh8) then begin DecryptBlocks8(AInput, AInOff, ALen, AOutput, AOutOff, LThresh8); @@ -987,7 +919,6 @@ function TGcmBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray; end; end else -{$ENDIF} begin while ALen >= LThresh2 do begin @@ -1161,13 +1092,8 @@ class procedure TGcmBlockCipher.GcmReverse16(const ASrc, ADst: PByte); var LI: Int32; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if TCpuFeatures.X86.HasSSSE3 then - begin - GcmBlockReverse128Ssse3(ADst, ASrc, @ReverseBytesMask[0]); + if TGhashSimd.TryBlockReverse128(ADst, ASrc) then Exit; - end; -{$ENDIF} for LI 
:= 0 to 15 do ADst[LI] := ASrc[15 - LI]; end; @@ -1181,13 +1107,8 @@ procedure TGcmBlockCipher.GhashFourShuffledBlocks(PC0, PC16, PC32, PC48: PByte); LSRev: array[0..15] of Byte; LPCiph: PByte; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if TGcmBlockCipher.IsFourWaySupported then - begin - TGcmUtilities.FusedFourShuffledGhash(@FS[0], PC0, @FHPow[64], @ReverseBytesMask[0]); + if TGhashSimd.TryFusedFourShuffledGhash(@FS[0], PC0, @FHPow[64]) then Exit; - end; -{$ENDIF CRYPTOLIB_X86_SIMD} GcmReverse16(@FS[0], @LSRev[0]); FillChar(LU0, 16, 0); FillChar(LU1, 16, 0); @@ -1219,15 +1140,15 @@ procedure TGcmBlockCipher.GhashFourShuffledBlocks(PC0, PC16, PC32, PC48: PByte); // Fused and pipelined batch routines -- GCM performance core. // ======================================================================= // Each routine consumes 64 bytes (4-way) or 128 bytes (8-way) of -// plaintext / ciphertext per iteration. The "fused" variants run AES +// plaintext / ciphertext per iteration. The "fused" variants run // counter-keystream generation then GHASH back-to-back. The -// "pipelined" variants overlap current-batch AES with previous-batch -// GHASH to reclaim port-0 / port-5 ILP. The FusedILP variant (further -// below, under CRYPTOLIB_X86_SIMD) pushes this further by interleaving -// both at the instruction level inside a single assembly kernel, -// selected per arch (Gueron 8-wide on x86-64, 2x4-wide halves on i386). -// AForEncrypt selects which buffer feeds GHASH: output ciphertext on -// encrypt, input ciphertext on decrypt. +// "pipelined" variants overlap current-batch keystream with previous-batch +// GHASH to reclaim instruction-level parallelism across the two independent +// execution units. The FusedILP variant (further below) pushes this further +// by interleaving both at the instruction level inside a single kernel +// supplied by the fused-kernel registry (nil off-SIMD, so that path is +// simply skipped there). 
AForEncrypt selects which buffer feeds GHASH: +// output ciphertext on encrypt, input ciphertext on decrypt. // ======================================================================= // Single-batch fused 4-way GCM step. AForEncrypt=True hashes the output ciphertext; @@ -1257,13 +1178,8 @@ procedure TGcmBlockCipher.GhashEightShuffledBlocks(PBase: PByte); LSRev: array [0 .. 15] of Byte; LPCiph: PByte; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if TGcmBlockCipher.IsEightWaySupported then - begin - TGcmUtilities.FusedEightShuffledGhash(@FS[0], PBase, @FHPow[0], @ReverseBytesMask[0]); + if TGhashSimd.TryFusedEightShuffledGhash(@FS[0], PBase, @FHPow[0]) then Exit; - end; -{$ENDIF CRYPTOLIB_X86_SIMD} GcmReverse16(@FS[0], @LSRev[0]); FillChar(LU0, 16, 0); FillChar(LU1, 16, 0); @@ -1301,8 +1217,9 @@ procedure TGcmBlockCipher.ProcessBlocks8Fused(const AInBuf: TCryptoLibByteArray; // Pipeline-by-one fused four-block step. Requires ALen >= ALimit + BlockSize*4*2 // (i.e. at least two 4-block batches remain after honouring the caller's tail // hold-back) so we can overlap each batch's GHASH with the next batch's -// CTR-keystream generation via CPU OoO scheduling (AES-NI uses port 0 / GHASH -// PCLMULQDQ uses port 5 on Intel). After this method returns, 0 or 1 full +// CTR-keystream generation via CPU out-of-order scheduling (the block-cipher +// keystream and the GHASH multiply-reduce use independent execution units). +// After this method returns, 0 or 1 full // four-block batches remain; the caller's non-pipelined loop handles the tail. // AForEncrypt=True does XOR then GHASH(output); AForEncrypt=False does // GHASH(input) then XOR (the only per-direction difference). @@ -1464,14 +1381,13 @@ class procedure TGcmBlockCipher.FillCtr8BlocksRaw( System.Move(ACounter[0], ABlocks[112], 16); end; -{$IFDEF CRYPTOLIB_X86_SIMD} // ======================================================================= -// Fused AES-NI + 8-way GHASH pipeline (x86-64 and i386). 
-// The driver is arch-agnostic: it drives the outer 8-block stride loop -// and delegates the fused work to whichever IFusedGcmKernel variant the -// registry resolved (Gueron 8-wide on x86-64, 2x4-wide halves on i386). -// Register-budget rationale for each variant lives on the matching -// banner in the class declaration. +// Fused block-cipher keystream + 8-way GHASH pipeline. The driver is +// arch-agnostic: it drives the outer 8-block stride loop and delegates the +// fused work to whichever IFusedGcmKernel the registry resolved (nil +// off-SIMD, so the callers never invoke this path there). The kernel's +// internal interleave and register budget are backend details behind the +// interface, summarised on the matching banner in the class declaration. // ======================================================================= procedure TGcmBlockCipher.FillNextCtrBlocks8Raw(const ABlocks: TCryptoLibByteArray); @@ -1479,21 +1395,21 @@ procedure TGcmBlockCipher.FillNextCtrBlocks8Raw(const ABlocks: TCryptoLibByteArr FillCtr8BlocksRaw(FCounter, FCounter32, ABlocks); end; -// Fused AES-NI keystream + 8-way GHASH pipeline (x86-64 and i386). The -// AES engine is always in encrypt mode (CTR keystream) regardless of GCM -// direction. AForEncrypt selects the per-direction bookkeeping only: +// Fused block-cipher keystream + 8-way GHASH pipeline. The cipher engine is +// always in encrypt mode (CTR keystream) regardless of GCM direction. +// AForEncrypt selects the per-direction bookkeeping only: // * encrypt: GHASH consumes the prior iteration's OUTPUT ciphertext. // * decrypt: GHASH consumes the prior iteration's INPUT ciphertext. -// Dispatches to the AES-128 / AES-192 / AES-256 fused wrapper based on the -// engine's current round-key schedule length (10 / 12 / 14 rounds). Encrypt +// Dispatches to the 128 / 192 / 256-bit fused wrapper based on the engine's +// current round-key schedule length (10 / 12 / 14 rounds). 
Encrypt // callers pass ALimit=0 (threshold collapses to BlockSize*16). Decrypt callers // pass the tail hold-back threshold; the loop leaves at least ALimit bytes for // the caller to process after the pipelined block. -// Prime: batch 0 is produced via the regular AES-NI 8-wide kernel + Pascal XOR, +// Prime: batch 0 is produced via the regular 8-wide keystream path + Pascal XOR, // leaving its ciphertext reference at LPrevCipher awaiting GHASH in the next // iteration. -// Body: each loop iteration invokes the interleaved assembly kernel which -// (a) AES-encrypts eight fresh counter blocks to keystream, +// Body: each loop iteration invokes the interleaved fused kernel which +// (a) encrypts eight fresh counter blocks to keystream, // (b) XORs the keystream with the current plaintext/ciphertext, // (c) GHASHes the previous iteration's ciphertext into the running state. // Tail: the last pending ciphertext is GHASH'd, then the final batch is @@ -1514,7 +1430,7 @@ procedure TGcmBlockCipher.ProcessBlocks8FusedILP(const AInBuf: TCryptoLibByteArr LCurrCtrs := FWorkCtr; LNextCtrs := FWorkCtrAhead; - // Prime batch 0: regular 8-wide AES-NI into LCurrCtrs (now holds keystream), + // Prime batch 0: regular 8-wide keystream into LCurrCtrs (now holds keystream), // XOR with plaintext/ciphertext at LPOut, defer GHASH of batch 0. 
GetNextCtrBlocks8(LCurrCtrs); LPIn := PByte(AInBuf) + AInOff; @@ -1570,7 +1486,6 @@ procedure TGcmBlockCipher.ProcessBlocks8FusedILP(const AInBuf: TCryptoLibByteArr AOutOff := AOutOff + (BlockSize * 8); ALen := ALen - (BlockSize * 8); end; -{$ENDIF CRYPTOLIB_X86_SIMD} // ======================================================================= // Batch dispatchers: route each N-block call to the fastest available @@ -1583,7 +1498,6 @@ procedure TGcmBlockCipher.EncryptBlocks4(const AInBuf: TCryptoLibByteArray; var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32); begin -{$IFDEF CRYPTOLIB_X86_SIMD} if not TGcmBlockCipher.IsFourWaySupported then raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']); if FHPow = nil then @@ -1597,31 +1511,26 @@ procedure TGcmBlockCipher.EncryptBlocks4(const AInBuf: TCryptoLibByteArray; ALen := ALen - (BlockSize * 4); AOutOff := AOutOff + (BlockSize * 4); end; -{$ELSE} - raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']); -{$ENDIF} end; procedure TGcmBlockCipher.EncryptBlocks8(const AInBuf: TCryptoLibByteArray; var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32); begin -{$IFDEF CRYPTOLIB_X86_SIMD} if not TGcmBlockCipher.IsEightWaySupported then raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']); if (FHPow = nil) or (System.Length(FHPow) < 128) then raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockHStateMissing, ['eight']); if ALen >= BlockSize * 16 then begin -{$IFDEF CRYPTOLIB_X86_SIMD} - // FusedILP is worthwhile only when the inner loop actually - // iterates: prime batch + >=1 kernel iter + tail batch = 3 strides - // of 128 B. Below that the fused asm is bypassed (prime + tail - // only) and the driver's entry cost regresses small payloads, - // notably on i386 where register pressure amplifies the overhead. 
+ // FusedILP is worthwhile only when the inner loop actually iterates: + // prime batch + >=1 kernel iter + tail batch = 3 strides of 128 B. Below + // that the fused kernel is bypassed (prime + tail only) and the driver's + // entry cost regresses small payloads, especially on register-constrained + // targets where that overhead is amplified. FGcmKernel is nil off-SIMD, so + // this branch is simply skipped there. if (FGcmKernel <> nil) and (ALen >= BlockSize * 24) then ProcessBlocks8FusedILP(AInBuf, AInOff, ALen, AOutBuf, AOutOff, 0, True); -{$ENDIF} if ALen >= BlockSize * 16 then ProcessBlocks8Pipelined(AInBuf, AInOff, ALen, AOutBuf, AOutOff, 0, True); end; @@ -1632,9 +1541,6 @@ procedure TGcmBlockCipher.EncryptBlocks8(const AInBuf: TCryptoLibByteArray; ALen := ALen - (BlockSize * 8); AOutOff := AOutOff + (BlockSize * 8); end; -{$ELSE} - raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']); -{$ENDIF} end; procedure TGcmBlockCipher.CipherBlock(const AInBuf: TCryptoLibByteArray; @@ -1648,25 +1554,23 @@ procedure TGcmBlockCipher.CipherBlock(const AInBuf: TCryptoLibByteArray; LCtrBlock := nil; System.SetLength(LCtrBlock, BlockSize); GetNextCtrBlock(LCtrBlock); -{$IFDEF CRYPTOLIB_X86_SIMD} - if TGcmBlockCipher.IsSse2PackedVectorXorSupported then + if TGhashSimd.IsBlockXorSupported then begin if AForEncrypt then begin System.Move(LCtrBlock[0], AOutBuf[AOutOff], BlockSize); - GcmBlockXor128Sse2(@AOutBuf[AOutOff], @AInBuf[AInOff]); - GcmBlockXor128Sse2(@FS[0], @AOutBuf[AOutOff]); + TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @AInBuf[AInOff]); + TGhashSimd.BlockXor128(@FS[0], @AOutBuf[AOutOff]); end else begin System.Move(AInBuf[AInOff], AOutBuf[AOutOff], BlockSize); - GcmBlockXor128Sse2(@AOutBuf[AOutOff], @LCtrBlock[0]); - GcmBlockXor128Sse2(@FS[0], @AInBuf[AInOff]); + TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @LCtrBlock[0]); + TGhashSimd.BlockXor128(@FS[0], @AInBuf[AInOff]); end; FMultiplier.MultiplyH(FS); Exit; end; -{$ENDIF} 
if AForEncrypt then begin @@ -1715,7 +1619,6 @@ procedure TGcmBlockCipher.DecryptBlocks4(const AInBuf: TCryptoLibByteArray; var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32; ALimit: Int32); begin -{$IFDEF CRYPTOLIB_X86_SIMD} if not TGcmBlockCipher.IsFourWaySupported then raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']); if ALimit < BlockSize * 4 then @@ -1731,16 +1634,12 @@ procedure TGcmBlockCipher.DecryptBlocks4(const AInBuf: TCryptoLibByteArray; ALen := ALen - (BlockSize * 4); AOutOff := AOutOff + (BlockSize * 4); end; -{$ELSE} - raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['four']); -{$ENDIF} end; procedure TGcmBlockCipher.DecryptBlocks8(const AInBuf: TCryptoLibByteArray; var AInOff: Int32; var ALen: Int32; const AOutBuf: TCryptoLibByteArray; var AOutOff: Int32; ALimit: Int32); begin -{$IFDEF CRYPTOLIB_X86_SIMD} if not TGcmBlockCipher.IsEightWaySupported then raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']); if ALimit < BlockSize * 8 then @@ -1749,12 +1648,11 @@ procedure TGcmBlockCipher.DecryptBlocks8(const AInBuf: TCryptoLibByteArray; raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockHStateMissing, ['eight']); if ALen >= ALimit + (BlockSize * 8) * 2 then begin -{$IFDEF CRYPTOLIB_X86_SIMD} // See EncryptBlocks8: require prime + >=1 kernel iter + tail // (3 strides of 128 B above ALimit) before entering FusedILP. + // FGcmKernel is nil off-SIMD, so this branch is skipped there. 
if (FGcmKernel <> nil) and (ALen >= ALimit + (BlockSize * 8) * 3) then ProcessBlocks8FusedILP(AInBuf, AInOff, ALen, AOutBuf, AOutOff, ALimit, False); -{$ENDIF} if ALen >= ALimit + (BlockSize * 8) * 2 then ProcessBlocks8Pipelined(AInBuf, AInOff, ALen, AOutBuf, AOutOff, ALimit, False); end; @@ -1765,9 +1663,6 @@ procedure TGcmBlockCipher.DecryptBlocks8(const AInBuf: TCryptoLibByteArray; ALen := ALen - (BlockSize * 8); AOutOff := AOutOff + (BlockSize * 8); end; -{$ELSE} - raise EInvalidOperationCryptoLibException.CreateResFmt(@SGcmBlockPathNotSupported, ['eight']); -{$ENDIF} end; procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray; @@ -1781,8 +1676,7 @@ procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray; LCtrBlock := nil; System.SetLength(LCtrBlock, BlockSize); -{$IFDEF CRYPTOLIB_X86_SIMD} - if TGcmBlockCipher.IsSse2PackedVectorXorSupported then + if TGhashSimd.IsBlockXorSupported then begin for LB := 0 to 1 do begin @@ -1790,14 +1684,14 @@ procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray; if AForEncrypt then begin System.Move(LCtrBlock[0], AOutBuf[AOutOff], BlockSize); - GcmBlockXor128Sse2(@AOutBuf[AOutOff], @AInBuf[AInOff]); - GcmBlockXor128Sse2(@FS[0], @AOutBuf[AOutOff]); + TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @AInBuf[AInOff]); + TGhashSimd.BlockXor128(@FS[0], @AOutBuf[AOutOff]); end else begin System.Move(AInBuf[AInOff], AOutBuf[AOutOff], BlockSize); - GcmBlockXor128Sse2(@AOutBuf[AOutOff], @LCtrBlock[0]); - GcmBlockXor128Sse2(@FS[0], @AInBuf[AInOff]); + TGhashSimd.BlockXor128(@AOutBuf[AOutOff], @LCtrBlock[0]); + TGhashSimd.BlockXor128(@FS[0], @AInBuf[AInOff]); end; FMultiplier.MultiplyH(FS); AInOff := AInOff + BlockSize; @@ -1805,7 +1699,6 @@ procedure TGcmBlockCipher.CipherBlocks2(const AInBuf: TCryptoLibByteArray; end; Exit; end; -{$ENDIF} for LB := 0 to 1 do begin diff --git a/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas index 
d4223b35..de7f71d9 100644 --- a/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas +++ b/CryptoLib/src/Crypto/Modes/ClpGcmSivBlockCipher.pas @@ -37,6 +37,7 @@ interface ClpGcmBlockCipher, ClpGcmUtilities, ClpGcmSivUtilities, + ClpGcmSivSimd, ClpFusedKernelTypes, ClpIFusedGcmSivKernel, ClpFusedKernelRegistry, @@ -121,7 +122,6 @@ TGcmSivHasher = class(TObject) FTheNonce: TCryptoLibByteArray; FTheFlags: Int32; -{$IFDEF CRYPTOLIB_X86_SIMD} // POLYVAL H-power table (H^8..H^1 as 16-byte limbs in GHASH // canonical form, 128 bytes). Populated once per key in DeriveKeys // when the fused kernel is available; captured by reference by the @@ -132,7 +132,6 @@ TGcmSivHasher = class(TObject) // matches the mode's 8-block batch contract. FGcmSivKernel: IFusedGcmSivKernel; FGcmSivKernelBatchBytes: Int32; -{$ENDIF CRYPTOLIB_X86_SIMD} procedure CheckAeadStatus(ALen: Int32); procedure CheckStatus(ALen: Int32); @@ -251,7 +250,6 @@ procedure TGcmSivBlockCipher.TGcmSivHasher.UpdateHash(const ABuffer: TCryptoLibB FNumActive := 0; end; -{$IFDEF CRYPTOLIB_X86_SIMD} // Fused POLYVAL Horner-by-8 fast path for full 128-byte batches. if (FParent.FGcmSivKernel <> nil) and (LMyRemaining >= FParent.FGcmSivKernelBatchBytes) then @@ -266,7 +264,6 @@ procedure TGcmSivBlockCipher.TGcmSivHasher.UpdateHash(const ABuffer: TCryptoLibB LMyRemaining := LMyRemaining - FParent.FGcmSivKernelBatchBytes; end; end; -{$ENDIF CRYPTOLIB_X86_SIMD} while LMyRemaining >= 16 do begin @@ -882,26 +879,29 @@ procedure TGcmSivBlockCipher.DeriveKeys(const AKey: IKeyParameter); TGcmSivUtilities.MulX(LMyOut); FTheMultiplier.Init(LMyOut); -{$IFDEF CRYPTOLIB_X86_SIMD} - // Precompute the POLYVAL H-power table and resolve the fused kernel - // for this key. LMyOut is already conditioned for GHASH. The H-power - // table is captured by reference by the kernel and must outlive it; - // it is owned by this cipher instance. 
+ // Precompute the POLYVAL H-power table and resolve the fused kernel for + // this key when a SIMD POLYVAL backend is available. LMyOut is already + // conditioned for GHASH; the H-power table is captured by reference by the + // kernel and must outlive it (it is owned by this cipher instance). + // TGcmSivSimd.IsSupported is False off-SIMD (or with no fused backend), so the + // precompute is skipped and TGcmSivHasher.UpdateHash stays on scalar POLYVAL. FGcmSivKernel := nil; FGcmSivKernelBatchBytes := 0; - if System.Length(FHPow128) < 128 then - System.SetLength(FHPow128, 128); - TGcmUtilities.InitEightWayHPowFromH(LMyOut, FHPow128); - if TFusedKernelRegistry.TryAcquireGcmSiv(FTheCipher, - TFusedModeDirection.Encrypt, @FHPow128[0], FGcmSivKernel) and - (FGcmSivKernel <> nil) then + if TGcmSivSimd.IsSupported then begin - if FGcmSivKernel.MinimumBlockCount = 8 then - FGcmSivKernelBatchBytes := FGcmSivKernel.MinimumBlockCount * BUFLEN - else - FGcmSivKernel := nil; + if System.Length(FHPow128) < 128 then + System.SetLength(FHPow128, 128); + TGcmUtilities.InitEightWayHPowFromH(LMyOut, FHPow128); + if TFusedKernelRegistry.TryAcquireGcmSiv(FTheCipher, + TFusedModeDirection.Encrypt, @FHPow128[0], FGcmSivKernel) and + (FGcmSivKernel <> nil) then + begin + if FGcmSivKernel.MinimumBlockCount = 8 then + FGcmSivKernelBatchBytes := FGcmSivKernel.MinimumBlockCount * BUFLEN + else + FGcmSivKernel := nil; + end; end; -{$ENDIF CRYPTOLIB_X86_SIMD} FTheFlags := FTheFlags or INITIAL; end; diff --git a/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas b/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas index c21d8960..44af0ece 100644 --- a/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas +++ b/CryptoLib/src/Crypto/Modes/ClpOcbBlockCipher.pas @@ -130,11 +130,9 @@ TOcbBlockCipher = class(TInterfacedObject, IOcbBlockCipher, procedure CheckNonceReuse(AForEncryption: Boolean; const ANewNonce: TCryptoLibByteArray; const AKeyParam: IKeyParameter); -{$IFDEF CRYPTOLIB_X86_SIMD} procedure 
ProcessFusedBulk(const AInput: TCryptoLibByteArray; AInOff: Int32; const AOutput: TCryptoLibByteArray; AOutOff: Int32; ABlockCount: Int32); -{$ENDIF CRYPTOLIB_X86_SIMD} procedure ProcessEightBlocksBulk(const AInput: TCryptoLibByteArray; AInOff: Int32; const AOutput: TCryptoLibByteArray; AOutOff: Int32); @@ -526,15 +524,16 @@ function TOcbBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray; while (LI < ALen) do begin -{$IFDEF CRYPTOLIB_X86_SIMD} // Fused-kernel fast path: accelerator-provided AEAD kernel - // (AES-NI today; ARM / other accelerators pluggable via the - // registry). Takes priority over the 8-wide bulk-cipher path - // below whenever at least one kernel-stride batch fits the - // steady-state window. A single dispatch stages up to - // FUSED_BATCH_BLOCKS worth of offsets and lets the kernel loop - // internally in MinimumBlockCount strides, amortising per-call - // overhead across the whole batch. + // (e.g. AES-NI; other accelerators pluggable via the + // registry). FOcbKernel is nil when no factory accepted this + // cipher / direction (always so off-SIMD), in which case this branch + // is skipped and the 8-wide bulk / scalar paths below run unchanged. + // Takes priority over the 8-wide bulk-cipher path below whenever at + // least one kernel-stride batch fits the steady-state window. A single + // dispatch stages up to FUSED_BATCH_BLOCKS worth of offsets and lets + // the kernel loop internally in MinimumBlockCount strides, amortising + // per-call overhead across the whole batch. if (FOcbKernel <> nil) and (FMainBlockPos = LSteadyPos) and ((ALen - LI) >= FOcbKernelMinBlocks * BLOCK_SIZE) then begin @@ -552,7 +551,6 @@ function TOcbBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray; LI := LI + LBatchBytes; Continue; end; -{$ENDIF CRYPTOLIB_X86_SIMD} // 8-wide bulk-cipher fast path. 
Entered only when no fused kernel // accepted this cipher / direction (FOcbKernel = nil) or the @@ -582,7 +580,6 @@ function TOcbBlockCipher.ProcessBytes(const AInput: TCryptoLibByteArray; Result := LResultLen; end; -{$IFDEF CRYPTOLIB_X86_SIMD} procedure TOcbBlockCipher.ProcessFusedBulk(const AInput: TCryptoLibByteArray; AInOff: Int32; const AOutput: TCryptoLibByteArray; AOutOff: Int32; ABlockCount: Int32); @@ -697,7 +694,6 @@ procedure TOcbBlockCipher.ProcessFusedBulk(const AInput: TCryptoLibByteArray; FMacSize); end; end; -{$ENDIF CRYPTOLIB_X86_SIMD} procedure TOcbBlockCipher.ProcessEightBlocksBulk( const AInput: TCryptoLibByteArray; AInOff: Int32; diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas index fe05890b..16a2dde9 100644 --- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas +++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiCcmKernel.pas @@ -27,11 +27,12 @@ interface ClpFusedKernelTypes, ClpIFusedCcmKernel, ClpFusedKernelRegistry, - ClpAesNiAeadResolver; + ClpAesFusedAeadSimd, + ClpAesFusedAeadX86Backend; type /// - /// AES-NI + SSSE3 implementation of IFusedCcmKernel. + /// AES-NI implementation of IFusedCcmKernel. /// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386 /// (CRYPTOLIB_I386_ASM); both arms gated collectively by /// CRYPTOLIB_X86_SIMD. @@ -287,9 +288,9 @@ function TAesNiCcmKernelFactory.TryCreate(const ACipher: IBlockCipher; AKernel := nil; Result := False; try - if not TAesNiAeadResolver.CpuSupports then + if not TAesFusedAeadSimd.CpuSupports then Exit; - if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then + if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then Exit; // CCM drives CTR and CBC-MAC lanes from the same forward-encrypt // schedule for both directions. 
LKeys is consumed only to probe diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas index de8253be..8efa8dea 100644 --- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas +++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiEaxKernel.pas @@ -27,11 +27,12 @@ interface ClpFusedKernelTypes, ClpIFusedEaxKernel, ClpFusedKernelRegistry, - ClpAesNiAeadResolver; + ClpAesFusedAeadSimd, + ClpAesFusedAeadX86Backend; type /// - /// AES-NI + SSSE3 implementation of IFusedEaxKernel. + /// AES-NI implementation of IFusedEaxKernel. /// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386 /// (CRYPTOLIB_I386_ASM); both arms gated collectively by /// CRYPTOLIB_X86_SIMD. @@ -278,9 +279,9 @@ function TAesNiEaxKernelFactory.TryCreate(const ACipher: IBlockCipher; AKernel := nil; Result := False; try - if not TAesNiAeadResolver.CpuSupports then + if not TAesFusedAeadSimd.CpuSupports then Exit; - if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then + if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then Exit; // EAX drives CTR and OMAC lanes from the same forward-encrypt // schedule for both directions. diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas index 3ca41437..a89b840e 100644 --- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas +++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiGcmKernel.pas @@ -27,11 +27,12 @@ interface ClpFusedKernelTypes, ClpIFusedGcmKernel, ClpFusedKernelRegistry, - ClpAesNiAeadResolver; + ClpAesFusedAeadSimd, + ClpAesFusedAeadX86Backend; type /// - /// AES-NI + PCLMULQDQ + SSSE3 implementation of IFusedGcmKernel. + /// AES-NI + PCLMULQDQ implementation of IFusedGcmKernel. /// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386 /// (CRYPTOLIB_I386_ASM); both arms gated collectively by /// CRYPTOLIB_X86_SIMD. 
@@ -207,9 +208,9 @@ function TAesNiGcmKernelFactory.TryCreate(const ACipher: IBlockCipher; if AHPowers = nil then Exit; {$IFDEF CRYPTOLIB_X86_SIMD} - if not TAesNiAeadResolver.CpuSupports then + if not TAesFusedAeadSimd.CpuSupports then Exit; - if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then + if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then Exit; if not LEngine.TryGetEncKeysPtr(LKeys, LRounds) then Exit; diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas index 52ce7ae5..8d275e08 100644 --- a/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas +++ b/CryptoLib/src/Crypto/Modes/Fused/ClpAesNiOcbKernel.pas @@ -27,11 +27,12 @@ interface ClpFusedKernelTypes, ClpIFusedOcbKernel, ClpFusedKernelRegistry, - ClpAesNiAeadResolver; + ClpAesFusedAeadSimd, + ClpAesFusedAeadX86Backend; type /// - /// AES-NI + SSSE3 implementation of IFusedOcbKernel. + /// AES-NI implementation of IFusedOcbKernel. /// Available on x86_64 (CRYPTOLIB_X86_64_ASM) and i386 /// (CRYPTOLIB_I386_ASM); both arms gated collectively by /// CRYPTOLIB_X86_SIMD. 
@@ -268,9 +269,9 @@ function TAesNiOcbKernelFactory.TryCreate(const ACipher: IBlockCipher; AKernel := nil; Result := False; try - if not TAesNiAeadResolver.CpuSupports then + if not TAesFusedAeadSimd.CpuSupports then Exit; - if not TAesNiAeadResolver.TryResolveEngine(ACipher, LEngine) then + if not TAesFusedAeadX86Backend.TryResolveEngine(ACipher, LEngine) then Exit; if ADirection = TFusedModeDirection.Encrypt then LHasSchedule := LEngine.TryGetEncKeysPtr(LKeys, LRounds) diff --git a/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas b/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas index 14686c56..5c65e931 100644 --- a/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas +++ b/CryptoLib/src/Crypto/Modes/Fused/ClpPclmulGcmSivKernel.pas @@ -26,12 +26,11 @@ interface ClpFusedKernelTypes, ClpIFusedGcmSivKernel, ClpFusedKernelRegistry, - ClpCpuFeatures, - ClpIntrinsicsVector; + ClpGcmSivSimd; type /// - /// PCLMULQDQ + SSSE3 implementation of IFusedGcmSivKernel. Pure + /// PCLMULQDQ implementation of IFusedGcmSivKernel. Pure /// POLYVAL: the factory ignores ACipher identity and only requires /// a valid pre-computed H-power table from the caller. Ships on /// both x86_64 and i386. @@ -63,19 +62,6 @@ TPclmulGcmSivKernelFactory = class sealed(TInterfacedObject, implementation -{$IFDEF CRYPTOLIB_X86_SIMD} -procedure GcmSivPolyvalHornerEight(PFS, PC0, PHPow128, PMask: Pointer); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_i386.inc} -{$ENDIF} -end; -{$ENDIF CRYPTOLIB_X86_SIMD} - const // PSHUFB full-reverse control used by the POLYVAL Horner batch. 
GcmSivKernelReverseMask: packed array[0..15] of Byte = ( @@ -99,11 +85,9 @@ function TPclmulGcmSivKernel.MinimumBlockCount: Int32; procedure TPclmulGcmSivKernel.ProcessPolyvalBatch(AInPtr, AAccumulator: Pointer; ABlockCount: Int32); begin -{$IFDEF CRYPTOLIB_X86_SIMD} if ABlockCount <> FUSED_POLYVAL_MIN_BLOCKS then Exit; - GcmSivPolyvalHornerEight(AAccumulator, AInPtr, FHPow128, FMask); -{$ENDIF CRYPTOLIB_X86_SIMD} + TGcmSivSimd.ProcessPolyvalBatch(AAccumulator, AInPtr, FHPow128, FMask); end; { TPclmulGcmSivKernelFactory } @@ -127,14 +111,11 @@ function TPclmulGcmSivKernelFactory.TryCreate(const ACipher: IBlockCipher; try if AHPowers = nil then Exit; -{$IFDEF CRYPTOLIB_X86_SIMD} - if not (TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and - TIntrinsicsVector.IsPacked) then + if not TGcmSivSimd.IsSupported then Exit; AKernel := TPclmulGcmSivKernel.Create(AHPowers, @GcmSivKernelReverseMask[0]); Result := True; -{$ENDIF CRYPTOLIB_X86_SIMD} except AKernel := nil; Result := False; diff --git a/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadSimd.pas b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadSimd.pas new file mode 100644 index 00000000..e8b1c8b7 --- /dev/null +++ b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadSimd.pas @@ -0,0 +1,62 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. 
* } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpAesFusedAeadSimd; + +{$I ..\..\..\..\Include\CryptoLib.inc} + +interface + +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} +uses + ClpAesFusedAeadX86Backend; +{$IFEND} + +type + /// + /// Arch-neutral capability gate for the fused hardware-AES AEAD pipeline + /// (e.g. AES-NI on x86). Selects the per-arch + /// backend at compile time and answers only the genuinely arch-neutral question + /// - "is a fused hardware-AES path available on this build/CPU?". + /// + /// + /// Engine resolution deliberately does NOT live here: it hands back an + /// instruction-set-specific round-key schedule (see IAesEngineX86), so it + /// belongs on the per-arch backend (TAesFusedAeadX86Backend.TryResolveEngine) + /// that the matching per-arch kernels call. Not exported via the public + /// interface surface and never imported by mode units, which stay + /// cipher-agnostic. + /// + TAesFusedAeadSimd = class sealed + public + /// CPU + build-time gate for the fused hardware-AES AEAD pipeline on this arch. + class function CpuSupports: Boolean; static; + end; + +implementation + +{ TAesFusedAeadSimd } + +class function TAesFusedAeadSimd.CpuSupports: Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TAesFusedAeadX86Backend.CpuSupports; +{$ELSE} + Result := False; +{$IFEND} +end; + +end. 
diff --git a/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesNiAeadResolver.pas b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadX86Backend.pas similarity index 71% rename from CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesNiAeadResolver.pas rename to CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadX86Backend.pas index ee652240..bb5e8aab 100644 --- a/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesNiAeadResolver.pas +++ b/CryptoLib/src/Crypto/Modes/Fused/Internal/ClpAesFusedAeadX86Backend.pas @@ -14,7 +14,7 @@ (* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) -unit ClpAesNiAeadResolver; +unit ClpAesFusedAeadX86Backend; {$I ..\..\..\..\Include\CryptoLib.inc} @@ -26,35 +26,33 @@ interface ClpIBlockCipherMode, ClpIAesEngineX86, ClpCpuFeatures, - ClpIntrinsicsVector; + ClpIntrinsicsVector; type /// - /// Internal helper used exclusively by the in-tree AES-NI AEAD - /// kernel factories. Not exported via the public interface surface - /// and never imported by mode units, which stay cipher-agnostic. + /// x86 backend for the fused AES-NI AEAD pipeline: owns the CPU/build gate and + /// the concrete IAesEngineX86 resolution used by the in-tree AES-NI AEAD + /// kernel factories. Compiles on every target - CpuSupports is + /// False off x86 and TryResolveEngine then finds no engine. /// - TAesNiAeadResolver = class sealed(TObject) + TAesFusedAeadX86Backend = class sealed public - /// CPU + build-time gate for the AES-NI AEAD pipeline. - /// True only when the build defines CRYPTOLIB_X86_SIMD, the CPU - /// exposes AES-NI + PCLMULQDQ + SSSE3, and the SIMD intrinsics - /// layout is packed. + /// True only when the build defines CRYPTOLIB_X86_SIMD, the CPU + /// exposes hardware AES + carryless multiply, and the SIMD intrinsics layout is packed. 
class function CpuSupports: Boolean; static; - /// Probe ACipher for IAesEngineX86, handling both the - /// direct case (ACipher itself is the engine) and the wrapped case - /// (ACipher is an IBlockCipherMode whose UnderlyingCipher is the - /// engine). AEngine is nil on False. + /// Probe ACipher for IAesEngineX86, handling both the direct case + /// (ACipher itself is the engine) and the wrapped case (ACipher is an + /// IBlockCipherMode whose UnderlyingCipher is the engine). AEngine is nil on False. class function TryResolveEngine(const ACipher: IBlockCipher; out AEngine: IAesEngineX86): Boolean; static; end; implementation -{ TAesNiAeadResolver } +{ TAesFusedAeadX86Backend } -class function TAesNiAeadResolver.CpuSupports: Boolean; +class function TAesFusedAeadX86Backend.CpuSupports: Boolean; begin {$IFDEF CRYPTOLIB_X86_SIMD} Result := TCpuFeatures.X86.HasAESNI and TCpuFeatures.X86.HasPCLMULQDQ and @@ -64,7 +62,7 @@ class function TAesNiAeadResolver.CpuSupports: Boolean; {$ENDIF CRYPTOLIB_X86_SIMD} end; -class function TAesNiAeadResolver.TryResolveEngine(const ACipher: IBlockCipher; +class function TAesFusedAeadX86Backend.TryResolveEngine(const ACipher: IBlockCipher; out AEngine: IAesEngineX86): Boolean; var LMode: IBlockCipherMode; diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivSimd.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivSimd.pas new file mode 100644 index 00000000..0bc82c8c --- /dev/null +++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivSimd.pas @@ -0,0 +1,65 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. 
* } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpGcmSivSimd; + +{$I ..\..\..\Include\CryptoLib.inc} + +interface + +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} +uses + ClpGcmSivX86Backend; +{$IFEND} + +type + /// + /// Arch-neutral SIMD dispatch facade for the AES-GCM-SIV POLYVAL batch kernel. + /// Selects the per-arch backend at compile time; on a + /// build with no SIMD backend IsSupported is False (so the fused + /// GCM-SIV kernel factory declines) and ProcessPolyvalBatch is a no-op. + /// The kernel unit calls only this facade and stays free of any + /// TCpuFeatures / CRYPTOLIB_*_ASM knowledge. + /// + TGcmSivSimd = class sealed + public + /// True when a POLYVAL batch kernel is usable on this CPU. + class function IsSupported: Boolean; static; + /// Eight-block POLYVAL Horner batch. Precondition: IsSupported. + class procedure ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); static; + end; + +implementation + +{ TGcmSivSimd } + +class function TGcmSivSimd.IsSupported: Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGcmSivX86Backend.IsSupported; +{$ELSE} + Result := False; +{$IFEND} +end; + +class procedure TGcmSivSimd.ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + TGcmSivX86Backend.ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask); +{$IFEND} +end; + +end. 
diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivX86Backend.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivX86Backend.pas new file mode 100644 index 00000000..78a08894 --- /dev/null +++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmSivX86Backend.pas @@ -0,0 +1,77 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. * } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpGcmSivX86Backend; + +{$I ..\..\..\Include\CryptoLib.inc} + +interface + +uses + ClpCpuFeatures, + ClpIntrinsicsVector; + +type + /// + /// x86 SIMD backend for the AES-GCM-SIV POLYVAL batch kernel: owns the + /// PCLMULQDQ 8-block Horner kernel (body in Include\Simd\GcmSiv\) + /// and the runtime capability gate. Compiles on every target - IsSupported + /// returns False off x86 (so the fused-kernel factory declines to build a + /// kernel) and ProcessPolyvalBatch is a no-op there. + /// + TGcmSivX86Backend = class sealed + public + /// True when the PCLMULQDQ POLYVAL kernel is usable on this CPU. + class function IsSupported: Boolean; static; + /// Eight-block POLYVAL Horner batch. Precondition: IsSupported. 
+ class procedure ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); static; + end; + +implementation + +{$IFDEF CRYPTOLIB_X86_SIMD} +procedure GcmSivPolyvalHornerEight(PFS, PC0, PHPow128, PMask: Pointer); +{$IFDEF CRYPTOLIB_X86_64_ASM} +{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} +{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_x86_64.inc} +{$ENDIF} +{$IFDEF CRYPTOLIB_I386_ASM} +{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} +{$I ..\..\..\Include\Simd\GcmSiv\PolyvalHornerEight_i386.inc} +{$ENDIF} +end; +{$ENDIF CRYPTOLIB_X86_SIMD} + +{ TGcmSivX86Backend } + +class function TGcmSivX86Backend.IsSupported: Boolean; +begin +{$IFDEF CRYPTOLIB_X86_SIMD} + Result := TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and + TIntrinsicsVector.IsPacked; +{$ELSE} + Result := False; +{$ENDIF} +end; + +class procedure TGcmSivX86Backend.ProcessPolyvalBatch(PFS, PC0, PHPow128, PMask: Pointer); +begin +{$IFDEF CRYPTOLIB_X86_SIMD} + GcmSivPolyvalHornerEight(PFS, PC0, PHPow128, PMask); +{$ENDIF} +end; + +end. 
diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas index fd991e67..bc574749 100644 --- a/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas +++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGcmUtilities.pas @@ -26,14 +26,12 @@ interface ClpPack, ClpBinaryPrimitives, ClpInterleave, - ClpCpuFeatures, + ClpGhashSimd, ClpCryptoLibTypes, - ClpIntrinsicsVector, ClpByteUtilities; resourcestring - SPclmulqdqMultiplyExtIsNotAvailable = 'PCLMULQDQ multiply-ext is not available on this target'; - SFusedGhashRequiresSsse3 = 'fused %s-way GHASH requires SSSE3, PCLMULQDQ, and packed XMM layout'; + SCarrylessMultiplyExtIsNotAvailable = 'Carryless multiply-ext is not available on this target'; type TFieldElement = record @@ -71,16 +69,9 @@ TGcmUtilities = class sealed(TObject) class procedure Reduce3(PZ0, PZ1, PZ2, PSVector16: PByte); static; /// Xor three 16-byte limbs with three 16-byte slices from a 48-byte MultiplyExt output. class procedure XorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte); static; -{$IFDEF CRYPTOLIB_X86_SIMD} /// HPow[0..7] = H^8..H^1 as 16-byte limbs at offsets 0,16,...,112 (index 0 = H^8). Four-way fused GHASH uses offsets 64..112 (H^4..H^1). class procedure InitEightWayHPowFromH(const AH: TCryptoLibByteArray; const AHPow128: TCryptoLibByteArray); static; - /// Fused GHASH for four 16-byte ciphertext blocks. PFS 16-byte in/out (canonical); PC0 points to 64 contiguous ciphertext bytes; PHPow64 = H^4..H^1 (64 bytes); PMask = 16-byte SSSE3 PSHUFB control. - class procedure FusedFourShuffledGhash(PFS, PC0, PHPow64, PMask: PByte); static; - /// Fused GHASH for eight 16-byte ciphertext blocks. PHPow128 = H^8..H^1 (128 bytes at FHPow[0]). 
- class procedure FusedEightShuffledGhash(PFS, PC0, PHPow128, PMask: PByte); static; -{$ENDIF CRYPTOLIB_X86_SIMD} - class procedure &Xor(const AX, AY: TCryptoLibByteArray); overload; static; class procedure &Xor(const AX, AY: TCryptoLibByteArray; AYOff: Int32); overload; static; class procedure &Xor(const AX, AY: TCryptoLibByteArray; AYOff, AYLen: Int32); overload; static; @@ -93,103 +84,6 @@ TGcmUtilities = class sealed(TObject) implementation -{$IFDEF CRYPTOLIB_X86_SIMD} -type - TGcmPartial128 = record - T3, T2, T1, T0: UInt64; - end; - -procedure GcmPclmulFieldPartial(PX, PY, POut: Pointer); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_i386.inc} -{$ENDIF} -end; - -procedure GcmPclmulMultiplyExtBytes(PX, PY, POut48: Pointer); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_i386.inc} -{$ENDIF} -end; - -procedure GcmReduce3FoldSse2(PZ0, PZ1, PZ2, POut: Pointer); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_i386.inc} -{$ENDIF} -end; - -procedure GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48: Pointer); -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I 
..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_i386.inc} -{$ENDIF} -end; - -procedure GcmGhashFourFull(PFS, PC0, PHPow64, PMask: Pointer); -{$DEFINE GCM_GHASH_FULL_BLOCKS_4} -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc} -{$ENDIF} -{$UNDEF GCM_GHASH_FULL_BLOCKS_4} -end; - -procedure GcmGhashEightFull(PFS, PC0, PHPow128, PMask: Pointer); -{$DEFINE GCM_GHASH_FULL_BLOCKS_8} -{$IFDEF CRYPTOLIB_X86_64_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc} -{$ENDIF} -{$IFDEF CRYPTOLIB_I386_ASM} -{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc} -{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc} -{$ENDIF} -{$UNDEF GCM_GHASH_FULL_BLOCKS_8} -end; - -procedure GcmPclmulReducePartial(const APartial: TGcmPartial128; var AZ: TFieldElement); -var - LT3, LT2, LT1, LT0: UInt64; - LZ0, LZ1, LZ2: UInt64; -begin - LT3 := APartial.T3; - LT2 := APartial.T2; - LT1 := APartial.T1; - LT0 := APartial.T0; - LT1 := LT1 xor LT3 xor (LT3 shr 1) xor (LT3 shr 2) xor (LT3 shr 7); - LT2 := LT2 xor (LT3 shl 63) xor (LT3 shl 62) xor (LT3 shl 57); - LZ0 := (LT0 shl 1) or (LT1 shr 63); - LZ1 := (LT1 shl 1) or (LT2 shr 63); - LZ2 := LT2 shl 1; - LZ0 := LZ0 xor LZ2 xor (LZ2 shr 1) xor (LZ2 shr 2) xor (LZ2 shr 7); - LZ1 := LZ1 xor (LT2 shl 63) xor (LT2 shl 58); - AZ.N0 := LZ0; - AZ.N1 := LZ1; -end; -{$ENDIF CRYPTOLIB_X86_SIMD} - { TGcmUtilities } class procedure TGcmUtilities.One(out AX: TFieldElement); @@ -243,18 +137,10 @@ class procedure TGcmUtilities.Multiply(var AX: TFieldElement; var AY: TFieldElem LX0R, LX1R, LY0R, LY1R: UInt64; LZ0, LZ1, LZ2, LZ3: UInt64; LH0, LH1, LH2, LH3, LH4, LH5: UInt64; - {$IFDEF 
CRYPTOLIB_X86_SIMD} - LPartial: TGcmPartial128; - {$ENDIF} begin - {$IFDEF CRYPTOLIB_X86_SIMD} - if TCpuFeatures.X86.HasPCLMULQDQ then - begin - GcmPclmulFieldPartial(@AX, @AY, @LPartial); - GcmPclmulReducePartial(LPartial, AX); + if TGhashSimd.TryMultiply(@AX, @AY) then Exit; - end; - {$ENDIF} + LX0 := AX.N0; LX1 := AX.N1; LY0 := AY.N0; @@ -413,27 +299,18 @@ class function TGcmUtilities.ImplMul64(AX, AY: UInt64): UInt64; class procedure TGcmUtilities.MultiplyExt(PX, PY, POut48: PByte); begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if TCpuFeatures.X86.HasPCLMULQDQ then - begin - GcmPclmulMultiplyExtBytes(PX, PY, POut48); + if TGhashSimd.TryMultiplyExt(PX, PY, POut48) then Exit; - end; -{$ENDIF} - raise EInvalidOperationCryptoLibException.CreateRes(@SPclmulqdqMultiplyExtIsNotAvailable); + raise EInvalidOperationCryptoLibException.CreateRes(@SCarrylessMultiplyExtIsNotAvailable); end; class procedure TGcmUtilities.XorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte); var LK: Int32; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if TCpuFeatures.X86.HasSSE2 then - begin - GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48); + if TGhashSimd.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48) then Exit; - end; -{$ENDIF CRYPTOLIB_X86_SIMD} + for LK := 0 to 1 do begin TByteUtilities.XorTo(8, PSrc48 + LK * 8, PA0 + LK * 8); @@ -449,13 +326,9 @@ class procedure TGcmUtilities.Reduce3(PZ0, PZ1, PZ2, PSVector16: PByte); LI: Int32; LT3, LT2, LT1, LT0, LZ0, LZ1, LZ2: UInt64; begin -{$IFDEF CRYPTOLIB_X86_SIMD} - if TCpuFeatures.X86.HasSSE2 then - begin - GcmReduce3FoldSse2(PZ0, PZ1, PZ2, PSVector16); + if TGhashSimd.TryReduce3(PZ0, PZ1, PZ2, PSVector16) then Exit; - end; -{$ENDIF CRYPTOLIB_X86_SIMD} + System.Move(PZ0^, B0[0], 16); System.Move(PZ1^, B1[0], 16); System.Move(PZ2^, B2[0], 16); @@ -482,31 +355,6 @@ class procedure TGcmUtilities.Reduce3(PZ0, PZ1, PZ2, PSVector16: PByte); TBinaryPrimitives.WriteUInt64LittleEndian(PSVector16, 8, LZ0); end; -{$IFDEF CRYPTOLIB_X86_SIMD} -class procedure 
TGcmUtilities.FusedFourShuffledGhash(PFS, PC0, PHPow64, PMask: PByte); -begin - if (not TCpuFeatures.X86.HasSSSE3) or (not TCpuFeatures.X86.HasPCLMULQDQ) or - (not TIntrinsicsVector.IsPacked) then - raise EInvalidOperationCryptoLibException.CreateResFmt - (@SFusedGhashRequiresSsse3, ['four']); - - // Monolithic kernel: byte-reverse + state fold + 4-way multiply-accumulate + - // Reduce3 + byte-reverse back, all in a single assembly body (one call boundary). - GcmGhashFourFull(PFS, PC0, PHPow64, PMask); -end; - -class procedure TGcmUtilities.FusedEightShuffledGhash(PFS, PC0, PHPow128, PMask: PByte); -begin - if (not TCpuFeatures.X86.HasSSSE3) or (not TCpuFeatures.X86.HasPCLMULQDQ) or - (not TIntrinsicsVector.IsPacked) then - raise EInvalidOperationCryptoLibException.CreateResFmt - (@SFusedGhashRequiresSsse3, ['eight']); - - // Monolithic kernel: byte-reverse + state fold + 8-way multiply-accumulate + - // Reduce3 + byte-reverse back, all in a single assembly body (one call boundary). - GcmGhashEightFull(PFS, PC0, PHPow128, PMask); -end; - class procedure TGcmUtilities.InitEightWayHPowFromH(const AH: TCryptoLibByteArray; const AHPow128: TCryptoLibByteArray); var @@ -557,6 +405,4 @@ class procedure TGcmUtilities.InitEightWayHPowFromH(const AH: TCryptoLibByteArra TPack.UInt64_To_LE(LF1.N0, AHPow128, 120); end; -{$ENDIF CRYPTOLIB_X86_SIMD} - end. diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashSimd.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashSimd.pas new file mode 100644 index 00000000..7eaaf77f --- /dev/null +++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashSimd.pas @@ -0,0 +1,167 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. 
* } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpGhashSimd; + +{$I ..\..\..\Include\CryptoLib.inc} + +interface + +uses + ClpCryptoLibTypes +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + , ClpGhashX86Backend +{$IFEND} + ; + +type + /// + /// Arch-neutral SIMD dispatch facade for the GHASH / GF(2^128) field kernels + /// behind TGcmUtilities. Selects the per-arch backend at compile time; + /// on a build with no SIMD backend every entry point + /// returns False, so the field operations run on their scalar reference + /// path. TGcmUtilities calls only this facade and stays free of any + /// TCpuFeatures / CRYPTOLIB_*_ASM knowledge. + /// + TGhashSimd = class sealed + public + /// Carryless multiply-reduce: PX := PX * PY in GF(2^128). + class function TryMultiply(PX, PY: Pointer): Boolean; static; + /// Carryless multiply to three 128-bit limbs (48 bytes). + class function TryMultiplyExt(PX, PY, POut48: PByte): Boolean; static; + /// Fold + reduce of three 128-bit limbs into one block. + class function TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean; static; + /// Xor three 16-byte limbs with three slices of a 48-byte MultiplyExt output. + class function TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean; static; + /// Fused 4-way GHASH over 64 contiguous ciphertext bytes. + class function TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean; static; + /// Fused 8-way GHASH over 128 contiguous ciphertext bytes. + class function TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean; static; + + /// True when the shuffled/fused GHASH path (4-/8-way) is usable on this build/CPU. Gates the batch dispatch and the H-power precompute. 
+ class function IsShuffledGhashSupported: Boolean; static; + /// True when the packed 128-bit block-XOR fast path is usable. + class function IsBlockXorSupported: Boolean; static; + /// True when a hardware carryless (polynomial) multiply is available (selects the carryless-multiply GCM multiplier over the 4K-table one). Backed by PCLMULQDQ on x86 and PMULL on arm. + class function HasCarrylessMultiply: Boolean; static; + /// XOR one 128-bit block: PDst := PDst xor PSrc. Precondition: IsBlockXorSupported. + class procedure BlockXor128(PDst, PSrc: PByte); static; + /// Full byte-reverse of one 128-bit block from PSrc into PDst; False when unavailable. + class function TryBlockReverse128(PDst, PSrc: PByte): Boolean; static; + end; + +implementation + +{ TGhashSimd } + +class function TGhashSimd.TryMultiply(PX, PY: Pointer): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.TryMultiply(PX, PY); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TGhashSimd.TryMultiplyExt(PX, PY, POut48: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.TryMultiplyExt(PX, PY, POut48); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TGhashSimd.TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.TryReduce3(PZ0, PZ1, PZ2, PSVector16); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TGhashSimd.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TGhashSimd.TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.TryFusedFourShuffledGhash(PFS, PC0, PHPow64); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function 
TGhashSimd.TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.TryFusedEightShuffledGhash(PFS, PC0, PHPow128); +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TGhashSimd.IsShuffledGhashSupported: Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.IsShuffledGhashSupported; +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TGhashSimd.IsBlockXorSupported: Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.IsBlockXorSupported; +{$ELSE} + Result := False; +{$IFEND} +end; + +class function TGhashSimd.HasCarrylessMultiply: Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.HasCarrylessMultiply; +{$ELSE} + Result := False; +{$IFEND} +end; + +class procedure TGhashSimd.BlockXor128(PDst, PSrc: PByte); +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + TGhashX86Backend.BlockXor128(PDst, PSrc); +{$IFEND} +end; + +class function TGhashSimd.TryBlockReverse128(PDst, PSrc: PByte): Boolean; +begin +{$IF DEFINED(CRYPTOLIB_X86_SIMD)} + Result := TGhashX86Backend.TryBlockReverse128(PDst, PSrc); +{$ELSE} + Result := False; +{$IFEND} +end; + +end. diff --git a/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashX86Backend.pas b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashX86Backend.pas new file mode 100644 index 00000000..d63cea50 --- /dev/null +++ b/CryptoLib/src/Crypto/Modes/Gcm/ClpGhashX86Backend.pas @@ -0,0 +1,335 @@ +{ *********************************************************************************** } +{ * CryptoLib Library * } +{ * Author - Ugochukwu Mmaduekwe * } +{ * Github Repository * } +{ * * } +{ * Distributed under the MIT software license, see the accompanying file LICENSE * } +{ * or visit http://www.opensource.org/licenses/mit-license.php. 
* } +{ * * } +{ * Acknowledgements: * } +{ * * } +{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * } +{ * the development of this library * } +{ * ******************************************************************************* * } + +(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) + +unit ClpGhashX86Backend; + +{$I ..\..\..\Include\CryptoLib.inc} + +interface + +uses + ClpCpuFeatures, + ClpIntrinsicsVector, + ClpCryptoLibTypes; + +type + /// + /// x86 SIMD backend for the GHASH / GF(2^128) field operations behind + /// TGcmUtilities: owns the SIMD GHASH kernels (bodies in + /// Include\Simd\Gcm\) and the runtime capability gates. Compiles on + /// every target - every entry point returns False (leaving the caller on + /// its scalar reference path) when built without x86 SIMD or on a CPU lacking + /// the required instruction set. All entry points work on raw pointers (the + /// kernel ABI), so this unit carries no dependency on the field-element record. + /// + TGhashX86Backend = class sealed + public + /// PCLMULQDQ carryless multiply-reduce: PX := PX * PY in GF(2^128). + class function TryMultiply(PX, PY: Pointer): Boolean; static; + /// PCLMULQDQ carryless multiply to three 128-bit limbs (48 bytes). + class function TryMultiplyExt(PX, PY, POut48: PByte): Boolean; static; + /// SIMD fold + reduce of three 128-bit limbs into one block. + class function TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean; static; + /// SIMD xor of three 16-byte limbs with three slices of a 48-byte MultiplyExt output. + class function TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean; static; + /// PCLMULQDQ fused 4-way GHASH (requires packed vector layout). Uses the backend's own byte-reverse mask. + class function TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean; static; + /// PCLMULQDQ fused 8-way GHASH (requires packed vector layout). Uses the backend's own byte-reverse mask. 
+ class function TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean; static; + + /// True when the fused shuffled-GHASH path is usable on this CPU (needs packed vector layout). Gates the 4-/8-way batch dispatch and the H-power precompute. + class function IsShuffledGhashSupported: Boolean; static; + /// True when the packed 128-bit block XOR fast path is usable. + class function IsBlockXorSupported: Boolean; static; + /// True when the PCLMULQDQ carryless multiply is available (selects the carryless-multiply GCM multiplier over the 4K-table one). + class function HasCarrylessMultiply: Boolean; static; + /// SIMD XOR of one 128-bit block: PDst := PDst xor PSrc. Precondition: IsBlockXorSupported. + class procedure BlockXor128(PDst, PSrc: PByte); static; + /// Full byte-reverse of one 128-bit block from PSrc into PDst; returns False when unavailable on this CPU. + class function TryBlockReverse128(PDst, PSrc: PByte): Boolean; static; + end; + +implementation + +{$IFDEF CRYPTOLIB_X86_SIMD} +type + TGcmPartial128 = record + T3, T2, T1, T0: UInt64; + end; + + // Raw two-limb GF(2^128) field element, layout-compatible with the caller's + // field-element record (N0 at offset 0, N1 at offset 8). Used only to write the + // reduced product back through the caller-supplied pointer without depending on + // the field-element type declared in ClpGcmUtilities. 
+  TGcmFieldRaw = record
+    N0, N1: UInt64; // two 64-bit limbs, N0 declared first
+  end;
+  PGcmFieldRaw = ^TGcmFieldRaw; // TryMultiply views the caller's PX through this to store the reduced product
+
+procedure GcmPclmulFieldPartial(PX, PY, POut: Pointer); // kernel body comes from the .inc pair of the active target; TryMultiply passes a TGcmPartial128 as POut
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulPartial_i386.inc}
+{$ENDIF}
+end; // NOTE(review): assumes exactly one of the two *_ASM symbols is defined whenever CRYPTOLIB_X86_SIMD is - confirm in CryptoLib.inc
+
+procedure GcmPclmulMultiplyExtBytes(PX, PY, POut48: Pointer); // kernel behind TryMultiplyExt (gated there on HasPCLMULQDQ)
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmPclmulMultiplyExt_i386.inc}
+{$ENDIF}
+end;
+
+procedure GcmReduce3FoldSse2(PZ0, PZ1, PZ2, POut: Pointer); // kernel behind TryReduce3 (gated there on HasSSE2)
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmReduce3FoldSse2_i386.inc}
+{$ENDIF}
+end;
+
+procedure GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48: Pointer); // kernel behind TryXorMultiplyExtLimbs48 (gated there on HasSSE2)
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmXorMultiplyExtLimbs48Sse2_i386.inc}
+{$ENDIF}
+end;
+
+procedure GcmGhashFourFull(PFS, PC0, PHPow64, PMask: Pointer); // kernel behind TryFusedFourShuffledGhash, which passes @ReverseBytesMask[0] as PMask
+{$DEFINE GCM_GHASH_FULL_BLOCKS_4} // scoped by the $UNDEF below; presumably selects the 4-block variant of the shared GcmGhashFull include - confirm in the .inc
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc}
+{$ENDIF}
+{$UNDEF GCM_GHASH_FULL_BLOCKS_4}
+end;
+
+procedure GcmGhashEightFull(PFS, PC0, PHPow128, PMask: Pointer); // kernel behind TryFusedEightShuffledGhash, which passes @ReverseBytesMask[0] as PMask
+{$DEFINE GCM_GHASH_FULL_BLOCKS_8} // same include files as GcmGhashFourFull; only this symbol differs
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc4Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmGhashFull_i386.inc}
+{$ENDIF}
+{$UNDEF GCM_GHASH_FULL_BLOCKS_8}
+end;
+
+const
+  // Byte-reverse shuffle control shared by the block byte-reverse and the fused
+  // shuffled-GHASH kernels. Owned here (not by the mode) - it is a SIMD
+  // implementation detail.
+  ReverseBytesMask: packed array[0..15] of Byte = (
+    $0F, $0E, $0D, $0C, $0B, $0A, $09, $08, $07, $06, $05, $04, $03, $02, $01, $00); // entry i holds 15 - i
+
+procedure GcmBlockXor128Sse2(PDst, PSrc: PByte); // kernel behind BlockXor128, which calls it without any runtime CPU check
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc2Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc2Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockXor128Sse2_i386.inc}
+{$ENDIF}
+end;
+
+procedure GcmBlockReverse128Ssse3(PDst, PSrc, PMask: PByte); // kernel behind TryBlockReverse128 (gated there on HasSSSE3); PMask receives @ReverseBytesMask[0]
+{$IFDEF CRYPTOLIB_X86_64_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_x86_64.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_x86_64.inc}
+{$ENDIF}
+{$IFDEF CRYPTOLIB_I386_ASM}
+{$I ..\..\..\Include\Simd\Common\SimdProc3Begin_i386.inc}
+{$I ..\..\..\Include\Simd\Gcm\GcmBlockReverse128Ssse3_i386.inc}
+{$ENDIF}
+end;
+
+// Scalar reduction of the 256-bit carryless product produced by
+// GcmPclmulFieldPartial into a 128-bit field element (radix-free bit reflection
+// reduction modulo the GCM polynomial). Pure UInt64 arithmetic.
+procedure GcmPclmulReducePartial(const APartial: TGcmPartial128; var AZ: TGcmFieldRaw);
+var
+  LP3, LP2, LP1, LP0: UInt64;
+  LShifted2: UInt64;
+begin
+  // All four input limbs are read into locals before AZ is written.
+  LP3 := APartial.T3;
+  LP0 := APartial.T0;
+  // First fold: spread T3 into T1 (shr 1/2/7 taps) and push the bits those
+  // right shifts drop into T2 (shl 63/62/57).
+  LP1 := APartial.T1 xor LP3 xor (LP3 shr 1) xor (LP3 shr 2) xor (LP3 shr 7);
+  LP2 := APartial.T2 xor (LP3 shl 63) xor (LP3 shl 62) xor (LP3 shl 57);
+  // Second fold works on the limbs moved up by one bit; LShifted2 is the
+  // shifted T2 and is folded into N0 with the same taps.
+  LShifted2 := LP2 shl 1;
+  AZ.N0 := ((LP0 shl 1) or (LP1 shr 63)) xor LShifted2 xor (LShifted2 shr 1) xor
+    (LShifted2 shr 2) xor (LShifted2 shr 7);
+  AZ.N1 := ((LP1 shl 1) or (LP2 shr 63)) xor (LP2 shl 63) xor (LP2 shl 58);
+end;
+{$ENDIF CRYPTOLIB_X86_SIMD}
+
+{ TGhashX86Backend }
+
+class function TGhashX86Backend.TryMultiply(PX, PY: Pointer): Boolean;
+{$IFDEF CRYPTOLIB_X86_SIMD}
+var
+  LProduct: TGcmPartial128;
+{$ENDIF}
+begin
+  Result := False; // stays False without x86 SIMD or without PCLMULQDQ
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.HasPCLMULQDQ then
+  begin
+    GcmPclmulFieldPartial(PX, PY, @LProduct); // unreduced product into the local
+    GcmPclmulReducePartial(LProduct, PGcmFieldRaw(PX)^); // reduced result overwrites PX^
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryMultiplyExt(PX, PY, POut48: PByte): Boolean;
+begin
+  Result := False; // stays False without x86 SIMD or without PCLMULQDQ
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.HasPCLMULQDQ then
+  begin
+    GcmPclmulMultiplyExtBytes(PX, PY, POut48);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryReduce3(PZ0, PZ1, PZ2, PSVector16: PByte): Boolean;
+begin
+  Result := False; // stays False without x86 SIMD or without SSE2
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.HasSSE2 then
+  begin
+    GcmReduce3FoldSse2(PZ0, PZ1, PZ2, PSVector16);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryXorMultiplyExtLimbs48(PA0, PA1, PA2, PSrc48: PByte): Boolean;
+begin
+  Result := False; // stays False without x86 SIMD or without SSE2
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.HasSSE2 then
+  begin
+    GcmXorMultiplyExtLimbs48Sse2(PA0, PA1, PA2, PSrc48);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryFusedFourShuffledGhash(PFS, PC0, PHPow64: PByte): Boolean;
+begin
+  Result := False; // stays False unless the shuffled-GHASH gate passes
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if IsShuffledGhashSupported then
+  begin
+    // One assembly body (one call boundary) does the whole 4-way step: byte-reverse,
+    // state fold, multiply-accumulate, Reduce3 and the byte-reverse back.
+    GcmGhashFourFull(PFS, PC0, PHPow64, @ReverseBytesMask[0]);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryFusedEightShuffledGhash(PFS, PC0, PHPow128: PByte): Boolean;
+begin
+  Result := False; // stays False unless the shuffled-GHASH gate passes
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if IsShuffledGhashSupported then
+  begin
+    // One assembly body (one call boundary) does the whole 8-way step: byte-reverse,
+    // state fold, multiply-accumulate, Reduce3 and the byte-reverse back.
+    GcmGhashEightFull(PFS, PC0, PHPow128, @ReverseBytesMask[0]);
+    Result := True;
+  end;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.IsShuffledGhashSupported: Boolean;
+begin
+{$IFNDEF CRYPTOLIB_X86_SIMD}
+  Result := False;
+{$ELSE}
+  Result := TCpuFeatures.X86.HasPCLMULQDQ and TCpuFeatures.X86.HasSSSE3 and
+    TIntrinsicsVector.IsPacked;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.IsBlockXorSupported: Boolean;
+begin
+{$IFNDEF CRYPTOLIB_X86_SIMD}
+  Result := False;
+{$ELSE}
+  Result := TCpuFeatures.X86.HasSSE2 and TIntrinsicsVector.IsPacked;
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.HasCarrylessMultiply: Boolean;
+begin
+{$IFNDEF CRYPTOLIB_X86_SIMD}
+  Result := False;
+{$ELSE}
+  Result := TCpuFeatures.X86.HasPCLMULQDQ;
+{$ENDIF}
+end;
+
+class procedure TGhashX86Backend.BlockXor128(PDst, PSrc: PByte);
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  GcmBlockXor128Sse2(PDst, PSrc); // no CPU check here: callers gate on IsBlockXorSupported
+{$ENDIF}
+end;
+
+class function TGhashX86Backend.TryBlockReverse128(PDst, PSrc: PByte): Boolean;
+begin
+{$IFDEF CRYPTOLIB_X86_SIMD}
+  if TCpuFeatures.X86.HasSSSE3 then
+  begin
+    GcmBlockReverse128Ssse3(PDst, PSrc, @ReverseBytesMask[0]);
Exit(True); + end; +{$ENDIF} + Result := False; +end; + +end. diff --git a/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas b/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas index 9e50cf43..d796433a 100644 --- a/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas +++ b/CryptoLib/src/Interfaces/Crypto/ClpIBulkBlockCipher.pas @@ -32,9 +32,9 @@ interface /// interface is the cipher-side companion to IBulkBlockCipherMode: modes /// (CTR/SIC, CBC, ECB, the non-fused GCM CTR dispatcher, ...) query for it /// via Supports(FCipher, IBulkBlockCipher, FBulkCipher) and let the engine - /// own the "best batch size" decision (8-wide / 4-wide / 1-wide ladders on - /// AES-NI today; a hypothetical AVX-512 16-wide or ARMv8 engine would just - /// plug in here with no mode-side changes). + /// own the "best batch size" decision (e.g. 8-wide / 4-wide / 1-wide ladders + /// on AES-NI). Any other engine - a wider vector or an ARM Crypto-Extensions + /// engine - plugs in here with no mode-side changes. /// /// /// Contract: ProcessBlocks(..., ABlockCount) produces byte-identical output diff --git a/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas b/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas index 77366c01..691036ec 100644 --- a/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas +++ b/CryptoLib/src/Interfaces/Crypto/Engines/ClpIAesHardwareEngine.pas @@ -27,8 +27,8 @@ interface type /// /// Architecture-neutral capability interface for hardware-accelerated AES - /// engines (AES-NI on x86 via TAesEngineX86 today; a NEON/Crypto-Ext - /// ARMv8 engine could implement the same surface tomorrow). It exposes the + /// engines (e.g. AES-NI on x86 via TAesEngineX86; a NEON/Crypto-Ext + /// ARMv8 engine could implement the same surface). It exposes the /// fixed-width 4-/8-block batch entry points and the raw-pointer single-block /// overload that sit beneath the generic /// ladder. 
The aliasing contract for every overload here is identical to
diff --git a/CryptoLib/src/Math/BinPoly/ClpBinPolySimd.pas b/CryptoLib/src/Math/BinPoly/ClpBinPolySimd.pas
new file mode 100644
index 00000000..9444a480
--- /dev/null
+++ b/CryptoLib/src/Math/BinPoly/ClpBinPolySimd.pas
@@ -0,0 +1,69 @@
+{ *********************************************************************************** }
+{ * CryptoLib Library * }
+{ * Author - Ugochukwu Mmaduekwe * }
+{ * Github Repository * }
+{ * * }
+{ * Distributed under the MIT software license, see the accompanying file LICENSE * }
+{ * or visit http://www.opensource.org/licenses/mit-license.php. * }
+{ * * }
+{ * Acknowledgements: * }
+{ * * }
+{ * Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring * }
+{ * the development of this library * }
+{ * ******************************************************************************* * }
+
+(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
+
+unit ClpBinPolySimd;
+
+{$I ..\..\Include\CryptoLib.inc}
+
+interface
+
+uses
+  ClpIBinPolyMul
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+  , ClpBinPolyX86V128Backend
+{$IFEND}
+  ;
+
+type
+  /// <summary>
+  /// Architecture-neutral SIMD entry point for binary-polynomial multiplication.
+  /// TryCreateBinPolyMul either hands back a SIMD-backed IBinPolyMul or reports
+  /// "not handled"; it never constructs the portable scalar multiplier, so the
+  /// scalar fallback stays with the caller. The backend unit is chosen at
+  /// compile time: ClpBinPolyX86V128Backend is only pulled into the uses clause
+  /// when CRYPTOLIB_X86_SIMD is defined.
+  /// </summary>
+  TBinPolySimd = class sealed
+  public
+    /// <summary>
+    /// Tries to build a SIMD IBinPolyMul for degree <paramref name="AN"/> using
+    /// <paramref name="AReduce"/>. Returns True with <paramref name="AMul"/> set when
+    /// the backend reports IsSupported; otherwise returns False with
+    /// <paramref name="AMul"/> = nil and the caller runs its scalar path.
+    /// </summary>
+    class function TryCreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce;
+      out AMul: IBinPolyMul): Boolean; static;
+  end;
+
+implementation
+
+{ TBinPolySimd }
+
+class function TBinPolySimd.TryCreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce;
+  out AMul: IBinPolyMul): Boolean;
+begin
+  AMul := nil; // the "not handled" answer always carries a nil multiplier
+  Result := False;
+{$IF DEFINED(CRYPTOLIB_X86_SIMD)}
+  if TBinPolyX86V128Backend.IsSupported then
+  begin
+    AMul := TBinPolyX86V128Backend.CreateBinPolyMul(AN, AReduce);
+    Result := True;
+  end;
+{$IFEND}
+end;
+
+end.
diff --git a/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas b/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas index 8248675c..b950a8d0 100644 --- a/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas +++ b/CryptoLib/src/Math/BinPoly/ClpBinPolyX86V128Backend.pas @@ -31,7 +31,7 @@ interface ClpCryptoLibTypes; resourcestring - SX86V128BackendRequiresPclmulqdqSupport = 'X86.V128 backend requires PCLMULQDQ support on this target'; + SX86V128BackendRequiresCarrylessMultiply = 'X86.V128 backend requires carryless-multiply support on this target'; type /// @@ -39,7 +39,7 @@ interface /// TBinPolyX86V128Backend = class sealed public - class function IsEnabled: Boolean; static; + class function IsSupported: Boolean; static; class function CreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce): IBinPolyMul; static; end; @@ -47,7 +47,7 @@ implementation { TBinPolyX86V128Backend } -class function TBinPolyX86V128Backend.IsEnabled: Boolean; +class function TBinPolyX86V128Backend.IsSupported: Boolean; begin {$IFDEF CRYPTOLIB_X86_SIMD} Result := TCpuFeatures.X86.HasPCLMULQDQ and TIntrinsicsVector.IsPacked; @@ -60,8 +60,8 @@ class function TBinPolyX86V128Backend.CreateBinPolyMul(AN: Int32; const AReduce: var LSize: Int32; begin - if not IsEnabled then - raise EInvalidOperationCryptoLibException.CreateRes(@SX86V128BackendRequiresPclmulqdqSupport); + if not IsSupported then + raise 
EInvalidOperationCryptoLibException.CreateRes(@SX86V128BackendRequiresCarrylessMultiply); LSize := (AN + 63) shr 6; case LSize of diff --git a/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas b/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas index 6512431d..3b97cf8f 100644 --- a/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas +++ b/CryptoLib/src/Math/BinPoly/ClpBinPolys.pas @@ -31,8 +31,8 @@ interface ClpBinPolyMulBaseBinomialReduce, ClpBinPolyMulBaseTrinomialReduce, ClpBinPolyMulBasePentanomialReduce, + ClpBinPolySimd, ClpBinPolyScalarBackend, - ClpBinPolyX86V128Backend, ClpItohTsujiiInv; type @@ -41,7 +41,7 @@ interface /// (Size, Create, Add, AddTo, etc.) sit at the top level; /// factories classified by reduction polynomial shape live under the nested /// TBinPolysMul class, and inversion factories under TBinPolysInv - /// (Itoh-Tsujii today). + /// (Itoh-Tsujii). /// /// /// Internal library surface — consumed by the generic F2m field layer and other @@ -259,10 +259,8 @@ class function TBinPolys.BitLengthVar(ASize: Int32; const AX: TCryptoLibUInt64Ar class function TBinPolys.TBinPolysMul.CreateBinPolyMul(AN: Int32; const AReduce: IBinPolyReduce): IBinPolyMul; begin - {$IFDEF CRYPTOLIB_X86_SIMD} - if TBinPolyX86V128Backend.IsEnabled then - Exit(TBinPolyX86V128Backend.CreateBinPolyMul(AN, AReduce)); - {$ENDIF} + if TBinPolySimd.TryCreateBinPolyMul(AN, AReduce, Result) then + Exit; Result := TBinPolyScalarBackend.CreateBinPolyMul(AN, AReduce); end; diff --git a/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk b/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk index a91176b3..e40523c7 100644 --- a/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk +++ b/CryptoLib/src/Packages/Delphi/CryptoLib4PascalPackage.dpk @@ -41,6 +41,7 @@ contains ClpIAesEngineX86 in '..\..\Interfaces\Crypto\Engines\ClpIAesEngineX86.pas', ClpAesEngineX86 in '..\..\Crypto\Engines\ClpAesEngineX86.pas', ClpAesUtilities in '..\..\Crypto\ClpAesUtilities.pas', + ClpAesSimd in 
'..\..\Crypto\ClpAesSimd.pas', ClpAesLightEngine in '..\..\Crypto\Engines\ClpAesLightEngine.pas', ClpAgreementUtilities in '..\..\Crypto\Agreements\ClpAgreementUtilities.pas', ClpArgon2ParametersGenerator in '..\..\Crypto\Generators\ClpArgon2ParametersGenerator.pas', @@ -194,6 +195,17 @@ contains ClpIDHGenerators in '..\..\Interfaces\Crypto\Generators\ClpIDHGenerators.pas', ClpIDHParameters in '..\..\Interfaces\Crypto\Parameters\ClpIDHParameters.pas', ClpIBackingHashProvider in '..\..\Interfaces\Crypto\Digests\ClpIBackingHashProvider.pas', + ClpChaChaSimd in '..\..\Crypto\Engines\ClpChaChaSimd.pas', + ClpChaChaX86Backend in '..\..\Crypto\Engines\ClpChaChaX86Backend.pas', + ClpSalsaSimd in '..\..\Crypto\Engines\ClpSalsaSimd.pas', + ClpSalsaX86Backend in '..\..\Crypto\Engines\ClpSalsaX86Backend.pas', + ClpPoly1305State in '..\..\Crypto\Macs\ClpPoly1305State.pas', + ClpPoly1305Simd in '..\..\Crypto\Macs\ClpPoly1305Simd.pas', + ClpPoly1305X86Backend in '..\..\Crypto\Macs\ClpPoly1305X86Backend.pas', + ClpGhashSimd in '..\..\Crypto\Modes\Gcm\ClpGhashSimd.pas', + ClpGhashX86Backend in '..\..\Crypto\Modes\Gcm\ClpGhashX86Backend.pas', + ClpGcmSivSimd in '..\..\Crypto\Modes\Gcm\ClpGcmSivSimd.pas', + ClpGcmSivX86Backend in '..\..\Crypto\Modes\Gcm\ClpGcmSivX86Backend.pas', ClpIDigest in '..\..\Interfaces\Crypto\Digests\ClpIDigest.pas', ClpIDigestFactory in '..\..\Interfaces\Crypto\Operators\ClpIDigestFactory.pas', ClpIDigestRandomGenerator in '..\..\Interfaces\Rngs\ClpIDigestRandomGenerator.pas', @@ -679,7 +691,9 @@ contains ClpIFusedEaxKernel in '..\..\Interfaces\Crypto\Modes\Fused\ClpIFusedEaxKernel.pas', ClpIFusedGcmSivKernel in '..\..\Interfaces\Crypto\Modes\Fused\ClpIFusedGcmSivKernel.pas', ClpFusedKernelRegistry in '..\..\Crypto\Modes\Fused\ClpFusedKernelRegistry.pas', - ClpAesNiAeadResolver in '..\..\Crypto\Modes\Fused\Internal\ClpAesNiAeadResolver.pas', + ClpAesFusedAeadSimd in '..\..\Crypto\Modes\Fused\Internal\ClpAesFusedAeadSimd.pas', + ClpAesFusedAeadX86Backend in 
'..\..\Crypto\Modes\Fused\Internal\ClpAesFusedAeadX86Backend.pas', + ClpBinPolySimd in '..\..\Math\BinPoly\ClpBinPolySimd.pas', ClpAesNiOcbKernel in '..\..\Crypto\Modes\Fused\ClpAesNiOcbKernel.pas', ClpAesNiCcmKernel in '..\..\Crypto\Modes\Fused\ClpAesNiCcmKernel.pas', ClpAesNiEaxKernel in '..\..\Crypto\Modes\Fused\ClpAesNiEaxKernel.pas', diff --git a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk index 144504b4..5d141cbc 100644 --- a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk +++ b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.lpk @@ -31,7 +31,7 @@ Acknowledgements: Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring the development of this library "/> - + @@ -2734,8 +2734,8 @@ Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring the devel - - + + @@ -3133,6 +3133,62 @@ Thanks to Sphere 10 Software (http://www.sphere10.com/) for sponsoring the devel + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas index 88041881..42a0be66 100644 --- a/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas +++ b/CryptoLib/src/Packages/FPC/CryptoLib4PascalPackage.pas @@ -220,7 +220,7 @@ interface ClpBlockCipherBulkUtilities, ClpCipherModeParameterUtilities, ClpGcmSivUtilities, ClpFusedKernelTypes, ClpIFusedGcmKernel, ClpIFusedOcbKernel, ClpIFusedCcmKernel, ClpIFusedEaxKernel, - ClpIFusedGcmSivKernel, ClpFusedKernelRegistry, ClpAesNiAeadResolver, + ClpIFusedGcmSivKernel, ClpFusedKernelRegistry, ClpAesFusedAeadSimd, ClpAesNiOcbKernel, ClpAesNiCcmKernel, ClpAesNiEaxKernel, ClpAesNiGcmKernel, ClpPclmulGcmSivKernel, ClpFusedKernelDefaults, ClpXChaCha20Engine, ClpIXChaCha20Engine, ClpXChaCha20Poly1305, ClpIXChaCha20Poly1305, @@ -252,7 +252,10 @@ interface ClpISP800SecureRandomBuilder, 
ClpSP800SecureRandomBuilder, ClpECDHRawAgreement, ClpECDHCRawAgreement, ClpIECDHRawAgreement, ClpIECDHCRawAgreement, ClpGF256Aes, ClpIAesHardwareEngine, - ClpIBackingHashProvider; + ClpIBackingHashProvider, ClpChaChaSimd, ClpChaChaX86Backend, ClpSalsaSimd, + ClpSalsaX86Backend, ClpPoly1305State, ClpPoly1305Simd, + ClpPoly1305X86Backend, ClpGhashSimd, ClpGhashX86Backend, ClpGcmSivSimd, + ClpGcmSivX86Backend, ClpAesFusedAeadX86Backend, ClpBinPolySimd, ClpAesSimd; implementation