diff --git a/go.mod b/go.mod index f4aeefd800b..61aa4c16fe2 100644 --- a/go.mod +++ b/go.mod @@ -65,7 +65,7 @@ require ( go.opentelemetry.io/otel/sdk v1.44.0 go.opentelemetry.io/otel/trace v1.44.0 go.uber.org/atomic v1.11.0 - golang.org/x/net v0.55.0 + golang.org/x/net v0.56.0 golang.org/x/sync v0.21.0 golang.org/x/time v0.15.0 google.golang.org/grpc v1.81.1 @@ -290,13 +290,13 @@ require ( go.yaml.in/yaml/v3 v3.0.4 // indirect go4.org/intern v0.0.0-20230525184215-6c62f75575cb // indirect go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect - golang.org/x/crypto v0.51.0 // indirect + golang.org/x/crypto v0.53.0 // indirect golang.org/x/exp v0.0.0-20260218203240-3dfff04db8fa // indirect - golang.org/x/mod v0.35.0 // indirect + golang.org/x/mod v0.36.0 // indirect golang.org/x/oauth2 v0.36.0 // indirect - golang.org/x/sys v0.45.0 // indirect - golang.org/x/text v0.37.0 // indirect - golang.org/x/tools v0.44.0 // indirect + golang.org/x/sys v0.46.0 // indirect + golang.org/x/text v0.38.0 // indirect + golang.org/x/tools v0.45.0 // indirect gonum.org/v1/gonum v0.17.0 // indirect google.golang.org/api v0.252.0 // indirect google.golang.org/genproto v0.0.0-20250603155806-513f23925822 // indirect diff --git a/go.sum b/go.sum index 50d53bbd501..c7f9a5369c7 100644 --- a/go.sum +++ b/go.sum @@ -1166,8 +1166,8 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220411220226-7b82a4e95df4/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= -golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= +golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= +golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -1205,8 +1205,8 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= -golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= +golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4= +golang.org/x/mod v0.36.0/go.mod h1:moc6ELqsWcOw5Ef3xVprK5ul/MvtVvkIXLziUOICjUQ= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1255,8 +1255,8 @@ golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220520000938-2e3eb7b945c2/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= -golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= -golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= +golang.org/x/net v0.56.0 h1:Rw8j/hFzGvJUZwNBXnAtf5sVDVt+65SK2C7IxCxZt5o= +golang.org/x/net v0.56.0/go.mod h1:D3Ku6r+V6JROoZK144D2XfMHFcMq/0zSfLelVTCFKec= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1376,12 +1376,12 @@ golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= -golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= +golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= -golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= +golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc= +golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1391,8 +1391,8 @@ golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= -golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1454,8 +1454,8 @@ golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= -golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= -golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= +golang.org/x/tools v0.45.0 h1:18qN3FAooORvApf5XjCXgsuayZOEtXf6JK18I3+ONa8= +golang.org/x/tools v0.45.0/go.mod h1:LuUGqqaXcXMEFEruIVJVm5mgDD8vww/z/SR1gQ4uE/0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/vendor/golang.org/x/crypto/blake2b/go125.go b/vendor/golang.org/x/crypto/blake2b/go125.go deleted file mode 100644 index 67e990b7e16..00000000000 --- a/vendor/golang.org/x/crypto/blake2b/go125.go +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2025 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.25 - -package blake2b - -import "hash" - -var _ hash.XOF = (*xof)(nil) diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go index b850e772e16..bfe546b60a2 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go @@ -20,7 +20,7 @@ func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) var ( - useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2 + useAVX2 = cpu.X86.HasSSSE3 && cpu.X86.HasAVX2 && cpu.X86.HasBMI2 ) // setupState writes a ChaCha20 input matrix to state. See @@ -47,7 +47,7 @@ func setupState(state *[16]uint32, key *[32]byte, nonce []byte) { } func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { - if !cpu.X86.HasSSSE3 { + if !useAVX2 { return c.sealGeneric(dst, nonce, plaintext, additionalData) } @@ -66,7 +66,7 @@ func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) [] } func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { - if !cpu.X86.HasSSSE3 { + if !useAVX2 { return c.openGeneric(dst, nonce, ciphertext, additionalData) } diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s index fd5ee845f9f..c703c13471a 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -6,27 +6,6 @@ // func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // Hack: Must declare #define macros inside of a function due to Avo constraints - // ROL rotates the uint32s in register R left by N bits, using temporary T. - #define ROL(N, R, T) \ - MOVO R, T; \ - PSLLL $(N), T; \ - PSRLL $(32-(N)), R; \ - PXOR T, R - - // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. - #ifdef GOAMD64_v2 - #define ROL8(R, T) PSHUFB ·rol8<>(SB), R - #else - #define ROL8(R, T) ROL(8, R, T) - #endif - - // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. - #ifdef GOAMD64_v2 - #define ROL16(R, T) PSHUFB ·rol16<>(SB), R - #else - #define ROL16(R, T) ROL(16, R, T) - #endif XORQ R10, R10 XORQ R11, R11 XORQ R12, R12 @@ -192,676 +171,112 @@ hashADDone: // Requires: AVX, AVX2, BMI2, CMOV, SSE2 TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access - MOVQ SP, BP - ADDQ $0x20, BP - ANDQ $-32, BP - MOVQ dst_base+0(FP), DI - MOVQ key_base+24(FP), R8 - MOVQ src_base+48(FP), SI - MOVQ src_len+56(FP), BX - MOVQ ad_base+72(FP), CX - - // Check for AVX2 support - CMPB ·useAVX2+0(SB), $0x01 - JE chacha20Poly1305Open_AVX2 + MOVQ SP, BP + ADDQ $0x20, BP + ANDQ $-32, BP + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + VZEROUPPER + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VBROADCASTI128 16(R8), Y14 + VBROADCASTI128 32(R8), Y12 + VBROADCASTI128 48(R8), Y4 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ BX, $0x80 - JBE openSSE128 - - // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X9, X13 - - // Store state on stack for future use - MOVO X3, 32(BP) - MOVO X6, 48(BP) - MOVO X9, 128(BP) - MOVQ $0x0000000a, R9 - -openSSEPreparePolyKey: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - DECQ R9 - JNE openSSEPreparePolyKey + CMPQ BX, $0xc0 + JBE openAVX2192 + CMPQ BX, $0x00000140 + JBE openAVX2320 + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 + +openAVX2PreparePolyKey: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL 32(BP), X3 + // Clamp and store poly key + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVO X0, (BP) - MOVO X3, 16(BP) + // Stream for the first 64 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 - // Hash AAD + // Hash AD + first 64 bytes MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) + XORQ CX, CX -openSSEMainLoop: - CMPQ BX, $0x00000100 - JB openSSEMainLoopDone - - // Load state, increment counter blocks - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash - // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $0x00000004, CX - MOVQ SI, R9 - -openSSEInternalLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - LEAQ 16(R9), R9 - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ CX - JGE openSSEInternalLoop - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(R9), R9 - CMPQ CX, $-6 - JG openSSEInternalLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - - // Load - xor - store - MOVO X15, 64(BP) - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU X0, (DI) - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU X3, 16(DI) - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU X6, 32(DI) - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X9, 48(DI) - MOVOU 64(SI), X9 - PXOR X9, X1 - MOVOU X1, 64(DI) - MOVOU 80(SI), X9 - PXOR X9, X4 - MOVOU X4, 80(DI) - MOVOU 96(SI), X9 - PXOR X9, X7 - MOVOU X7, 96(DI) - MOVOU 112(SI), X9 - PXOR X9, X10 - MOVOU X10, 112(DI) - MOVOU 128(SI), X9 - PXOR X9, X2 - MOVOU X2, 128(DI) - MOVOU 144(SI), X9 - PXOR X9, X5 - MOVOU X5, 144(DI) - MOVOU 160(SI), X9 - PXOR X9, X8 - MOVOU X8, 160(DI) - MOVOU 176(SI), X9 - PXOR X9, X11 - MOVOU X11, 176(DI) - MOVOU 192(SI), X9 - PXOR X9, X12 - MOVOU X12, 192(DI) - MOVOU 208(SI), X9 - PXOR X9, X13 - MOVOU X13, 208(DI) - MOVOU 224(SI), X9 - PXOR X9, X14 - MOVOU X14, 224(DI) - MOVOU 240(SI), X9 - PXOR 64(BP), X9 - MOVOU X9, 240(DI) - LEAQ 256(SI), SI - LEAQ 256(DI), DI - SUBQ $0x00000100, BX - JMP openSSEMainLoop - -openSSEMainLoopDone: - // Handle the various tail sizes efficiently - TESTQ BX, BX - JE openSSEFinalize - CMPQ BX, $0x40 - JBE openSSETail64 - CMPQ BX, $0x80 - JBE openSSETail128 - CMPQ BX, $0xc0 - JBE openSSETail192 - JMP openSSETail256 - -openSSEFinalize: - // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), R10 - ADCQ src_len+56(FP), R11 +openAVX2InitialHash64: + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -878,3187 +293,494 @@ openSSEFinalize: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 - // Final reduce - MOVQ R10, R13 - MOVQ R11, R14 - MOVQ R12, R15 - SUBQ $-5, R10 - SBBQ $-1, R11 - SBBQ $0x03, R12 - CMOVQCS R13, R10 - CMOVQCS R14, R11 - CMOVQCS R15, R12 - - // Add in the "s" part of the key - ADDQ 16(BP), R10 - ADCQ 24(BP), R11 - - // Finally, constant time compare to the tag at the end of the message - XORQ AX, AX - MOVQ $0x00000001, DX - XORQ (SI), R10 - XORQ 8(SI), R11 - ORQ R11, R10 - CMOVQEQ DX, AX + // Decrypt the first 64 bytes + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX - // Return true iff tags are equal - MOVB AX, ret+96(FP) - RET +openAVX2MainLoop: + CMPQ BX, $0x00000200 + JB openAVX2MainLoopDone -openSSE128: - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X3, X13 - MOVO X6, X14 - MOVO X10, X15 - MOVQ $0x0000000a, R9 - -openSSE128InnerCipherLoop: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ R9 - JNE openSSE128InnerCipherLoop - - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL X13, X3 - PADDL X13, X4 - PADDL X13, X5 - PADDL X14, X7 - PADDL X14, X8 - PADDL X15, X10 - PADDL ·sseIncMask<>+0(SB), X15 - PADDL X15, X11 - - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVOU X0, (BP) - MOVOU X3, 16(BP) + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - -openSSE128Open: - CMPQ BX, $0x10 - JB openSSETail16 - SUBQ $0x10, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - - // Load for decryption - MOVOU (SI), X12 - PXOR X12, X1 - MOVOU X1, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Shift the stream "left" - MOVO X4, X1 - MOVO X7, X4 - MOVO X10, X7 - MOVO X2, X10 - MOVO X5, X2 - MOVO X8, X5 - MOVO X11, X8 - JMP openSSE128Open - -openSSETail16: - TESTQ BX, BX - JE openSSEFinalize - - // We can safely load the CT from the end, because it is padded with the MAC - MOVQ BX, R9 - SHLQ $0x04, R9 - LEAQ ·andMask<>+0(SB), R13 - MOVOU (SI), X12 - ADDQ BX, SI - PAND -16(R13)(R9*1), X12 - MOVO X12, 64(BP) - MOVQ X12, R13 - MOVQ 72(BP), R14 - PXOR X1, X12 - - // We can only store one byte at a time, since plaintext can be shorter than 16 bytes -openSSETail16Store: - MOVQ X12, R8 - MOVB R8, (DI) - PSRLDQ $0x01, X12 - INCQ DI - DECQ BX - JNE openSSETail16Store - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - JMP openSSEFinalize - -openSSETail64: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - XORQ R9, R9 - MOVQ BX, CX - CMPQ CX, $0x10 - JB openSSETail64LoopB - -openSSETail64LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - SUBQ $0x10, CX - -openSSETail64LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - CMPQ CX, $0x10 - JAE openSSETail64LoopA - CMPQ R9, $0xa0 - JNE openSSETail64LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL 32(BP), X3 - PADDL 48(BP), X6 - PADDL 80(BP), X9 - -openSSETail64DecLoop: - CMPQ BX, $0x10 - JB openSSETail64DecLoopDone - SUBQ $0x10, BX - MOVOU (SI), X12 - PXOR X12, X0 - MOVOU X0, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - MOVO X3, X0 - MOVO X6, X3 - MOVO X9, X6 - JMP openSSETail64DecLoop - -openSSETail64DecLoopDone: - MOVO X0, X1 - JMP openSSETail16 - -openSSETail128: - MOVO ·chacha20Constants<>+0(SB), X1 - MOVO 32(BP), X4 - MOVO 48(BP), X7 - MOVO 128(BP), X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 80(BP) - MOVO X1, X0 - MOVO X4, X3 - MOVO X7, X6 - MOVO X10, X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 96(BP) - XORQ R9, R9 - MOVQ BX, CX - ANDQ $-16, CX - -openSSETail128LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSETail128LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - CMPQ R9, CX - JB openSSETail128LoopA - CMPQ R9, $0xa0 - JNE openSSETail128LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 96(BP), X9 - PADDL 80(BP), X10 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, (DI) - MOVOU X4, 16(DI) - MOVOU X7, 32(DI) - MOVOU X10, 48(DI) - SUBQ $0x40, BX - LEAQ 64(SI), SI - LEAQ 64(DI), DI - JMP openSSETail64DecLoop - -openSSETail192: - MOVO ·chacha20Constants<>+0(SB), X2 - MOVO 32(BP), X5 - MOVO 48(BP), X8 - MOVO 128(BP), X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X11, 80(BP) - MOVO X2, X1 - MOVO X5, X4 - MOVO X8, X7 - MOVO X11, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - MOVO X1, X0 - MOVO X4, X3 - MOVO X7, X6 - MOVO X10, X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 112(BP) - MOVQ BX, CX - MOVQ $0x000000a0, R9 - CMPQ CX, $0xa0 - CMOVQGT R9, CX - ANDQ $-16, CX - XORQ R9, R9 - -openSSLTail192LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSLTail192LoopB: - ADDQ $0x10, R9 - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - CMPQ R9, CX - JB openSSLTail192LoopA - CMPQ R9, $0xa0 - JNE openSSLTail192LoopB - CMPQ BX, $0xb0 - JB openSSLTail192Store - ADDQ 160(SI), R10 - ADCQ 168(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - CMPQ BX, $0xc0 - JB openSSLTail192Store - ADDQ 176(SI), R10 - ADCQ 184(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openSSLTail192Store: - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 32(BP), X5 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 48(BP), X8 - PADDL 112(BP), X9 - PADDL 96(BP), X10 - PADDL 80(BP), X11 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X2 - PXOR X13, X5 - PXOR X14, X8 - PXOR X15, X11 - MOVOU X2, (DI) - MOVOU X5, 16(DI) - MOVOU X8, 32(DI) - MOVOU X11, 48(DI) - MOVOU 64(SI), X12 - MOVOU 80(SI), X13 - MOVOU 96(SI), X14 - MOVOU 112(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - SUBQ $0x80, BX - LEAQ 128(SI), SI - LEAQ 128(DI), DI - JMP openSSETail64DecLoop - -openSSETail256: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - XORQ R9, R9 - -openSSETail256Loop: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - ADDQ $0x10, R9 - CMPQ R9, $0xa0 - JB openSSETail256Loop - MOVQ BX, CX - ANDQ $-16, CX - -openSSETail256HashLoop: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ $0x10, R9 - CMPQ R9, CX - JB openSSETail256HashLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - MOVO X15, 64(BP) - - // Load - xor - store - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVOU 128(SI), X0 - MOVOU 144(SI), X3 - MOVOU 160(SI), X6 - MOVOU 176(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 128(DI) - MOVOU X5, 144(DI) - MOVOU X8, 160(DI) - MOVOU X11, 176(DI) - LEAQ 192(SI), SI - LEAQ 192(DI), DI - SUBQ $0xc0, BX - MOVO X12, X0 - MOVO X13, X3 - MOVO X14, X6 - MOVO 64(BP), X9 - JMP openSSETail64DecLoop - -chacha20Poly1305Open_AVX2: - VZEROUPPER - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x70 - BYTE $0x10 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x20 - BYTE $0xc4 - BYTE $0xc2 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x30 - VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 - - // Special optimization, for very short buffers - CMPQ BX, $0xc0 - JBE openAVX2192 - CMPQ BX, $0x00000140 - JBE openAVX2320 - - // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA Y14, 32(BP) - VMOVDQA Y12, 64(BP) - VMOVDQA Y4, 192(BP) - MOVQ $0x0000000a, R9 - -openAVX2PreparePolyKey: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x04, Y4, Y4, Y4 - DECQ R9 - JNE openAVX2PreparePolyKey - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD 32(BP), Y14, Y14 - VPADDD 64(BP), Y12, Y12 - VPADDD 192(BP), Y4, Y4 - VPERM2I128 $0x02, Y0, Y14, Y3 - - // Clamp and store poly key - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for the first 64 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - - // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - XORQ CX, CX - -openAVX2InitialHash64: - ADDQ (SI)(CX*1), R10 - ADCQ 8(SI)(CX*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ $0x10, CX - CMPQ CX, $0x40 - JNE openAVX2InitialHash64 - - // Decrypt the first 64 bytes - VPXOR (SI), Y0, Y0 - VPXOR 32(SI), Y14, Y14 - VMOVDQU Y0, (DI) - VMOVDQU Y14, 32(DI) - LEAQ 64(SI), SI - LEAQ 64(DI), DI - SUBQ $0x40, BX - -openAVX2MainLoop: - CMPQ BX, $0x00000200 - JB openAVX2MainLoopDone - - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA Y0, Y7 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA Y14, Y11 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA Y12, Y15 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) - VMOVDQA Y3, 192(BP) - XORQ CX, CX - -openAVX2InternalLoop: - ADDQ (SI)(CX*1), R10 - ADCQ 8(SI)(CX*1), R11 - ADCQ $0x01, R12 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 - VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - ADDQ 16(SI)(CX*1), R10 - ADCQ 24(SI)(CX*1), R11 - ADCQ $0x01, R12 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 - VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x04, Y11, Y11, Y11 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - VPALIGNR $0x0c, Y3, Y3, Y3 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - ADDQ 32(SI)(CX*1), R10 - ADCQ 40(SI)(CX*1), R11 - ADCQ $0x01, R12 - LEAQ 48(CX), CX - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 - VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 - VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x0c, Y11, Y11, Y11 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - VPALIGNR $0x04, Y3, Y3, Y3 - CMPQ CX, $0x000001e0 - JNE openAVX2InternalLoop - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 32(BP), Y11, Y11 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 64(BP), Y15, Y15 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPADDD 192(BP), Y3, Y3 - VMOVDQA Y15, 224(BP) - - // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - ADDQ 480(SI), R10 - ADCQ 488(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPERM2I128 $0x02, Y0, Y14, Y15 - VPERM2I128 $0x13, Y0, Y14, Y14 - VPERM2I128 $0x02, Y12, Y4, Y0 - VPERM2I128 $0x13, Y12, Y4, Y12 - VPXOR (SI), Y15, Y15 - VPXOR 32(SI), Y0, Y0 - VPXOR 64(SI), Y14, Y14 - VPXOR 96(SI), Y12, Y12 - VMOVDQU Y15, (DI) - VMOVDQU Y0, 32(DI) - VMOVDQU Y14, 64(DI) - VMOVDQU Y12, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR 128(SI), Y0, Y0 - VPXOR 160(SI), Y14, Y14 - VPXOR 192(SI), Y12, Y12 - VPXOR 224(SI), Y4, Y4 - VMOVDQU Y0, 128(DI) - VMOVDQU Y14, 160(DI) - VMOVDQU Y12, 192(DI) - VMOVDQU Y4, 224(DI) - - // and here - ADDQ 496(SI), R10 - ADCQ 504(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - VPXOR 256(SI), Y0, Y0 - VPXOR 288(SI), Y14, Y14 - VPXOR 320(SI), Y12, Y12 - VPXOR 352(SI), Y4, Y4 - VMOVDQU Y0, 256(DI) - VMOVDQU Y14, 288(DI) - VMOVDQU Y12, 320(DI) - VMOVDQU Y4, 352(DI) - VPERM2I128 $0x02, Y7, Y11, Y0 - VPERM2I128 $0x02, 224(BP), Y3, Y14 - VPERM2I128 $0x13, Y7, Y11, Y12 - VPERM2I128 $0x13, 224(BP), Y3, Y4 - VPXOR 384(SI), Y0, Y0 - VPXOR 416(SI), Y14, Y14 - VPXOR 448(SI), Y12, Y12 - VPXOR 480(SI), Y4, Y4 - VMOVDQU Y0, 384(DI) - VMOVDQU Y14, 416(DI) - VMOVDQU Y12, 448(DI) - VMOVDQU Y4, 480(DI) - LEAQ 512(SI), SI - LEAQ 512(DI), DI - SUBQ $0x00000200, BX - JMP openAVX2MainLoop - -openAVX2MainLoopDone: - // Handle the various tail sizes efficiently - TESTQ BX, BX - JE openSSEFinalize - CMPQ BX, $0x80 - JBE openAVX2Tail128 - CMPQ BX, $0x00000100 - JBE openAVX2Tail256 - CMPQ BX, $0x00000180 - JBE openAVX2Tail384 - JMP openAVX2Tail512 - -openAVX2192: - VMOVDQA Y0, Y5 - VMOVDQA Y14, Y9 - VMOVDQA Y12, Y13 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y0, Y6 - VMOVDQA Y14, Y10 - VMOVDQA Y12, Y8 - VMOVDQA Y4, Y2 - VMOVDQA Y1, Y15 - MOVQ $0x0000000a, R9 - -openAVX2192InnerCipherLoop: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - DECQ R9 - JNE openAVX2192InnerCipherLoop - VPADDD Y6, Y0, Y0 - VPADDD Y6, Y5, Y5 - VPADDD Y10, Y14, Y14 - VPADDD Y10, Y9, Y9 - VPADDD Y8, Y12, Y12 - VPADDD Y8, Y13, Y13 - VPADDD Y2, Y4, Y4 - VPADDD Y15, Y1, Y1 - VPERM2I128 $0x02, Y0, Y14, Y3 - - // Clamp and store poly key - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for up to 192 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - VPERM2I128 $0x02, Y5, Y9, Y12 - VPERM2I128 $0x02, Y13, Y1, Y4 - VPERM2I128 $0x13, Y5, Y9, Y5 - VPERM2I128 $0x13, Y13, Y1, Y9 - -openAVX2ShortOpen: - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - -openAVX2ShortOpenLoop: - CMPQ BX, $0x20 - JB openAVX2ShortTail32 - SUBQ $0x20, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ 16(SI), R10 - ADCQ 24(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Load for decryption - VPXOR (SI), Y0, Y0 - VMOVDQU Y0, (DI) - LEAQ 32(SI), SI - LEAQ 32(DI), DI - - // Shift stream left - VMOVDQA Y14, Y0 - VMOVDQA Y12, Y14 - VMOVDQA Y4, Y12 - VMOVDQA Y5, Y4 - VMOVDQA Y9, Y5 - VMOVDQA Y13, Y9 - VMOVDQA Y1, Y13 - VMOVDQA Y6, Y1 - VMOVDQA Y10, Y6 - JMP openAVX2ShortOpenLoop - -openAVX2ShortTail32: - CMPQ BX, $0x10 - VMOVDQA X0, X1 - JB openAVX2ShortDone - SUBQ $0x10, BX - - // Load for hashing - ADDQ (SI), R10 - ADCQ 8(SI), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Load for decryption - VPXOR (SI), X0, X12 - VMOVDQU X12, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - VPERM2I128 $0x11, Y0, Y0, Y0 - VMOVDQA X0, X1 - -openAVX2ShortDone: - VZEROUPPER - JMP openSSETail16 - -openAVX2320: - VMOVDQA Y0, Y5 - VMOVDQA Y14, Y9 - VMOVDQA Y12, Y13 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y0, Y6 - VMOVDQA Y14, Y10 - VMOVDQA Y12, Y8 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VMOVDQA Y14, Y7 - VMOVDQA Y12, Y11 - VMOVDQA Y4, Y15 - MOVQ $0x0000000a, R9 - -openAVX2320InnerCipherLoop: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - DECQ R9 - JNE openAVX2320InnerCipherLoop - VMOVDQA ·chacha20Constants<>+0(SB), Y3 - VPADDD Y3, Y0, Y0 - VPADDD Y3, Y5, Y5 - VPADDD Y3, Y6, Y6 - VPADDD Y7, Y14, Y14 - VPADDD Y7, Y9, Y9 - VPADDD Y7, Y10, Y10 - VPADDD Y11, Y12, Y12 - VPADDD Y11, Y13, Y13 - VPADDD Y11, Y8, Y8 - VMOVDQA ·avx2IncMask<>+0(SB), Y3 - VPADDD Y15, Y4, Y4 - VPADDD Y3, Y15, Y15 - VPADDD Y15, Y1, Y1 - VPADDD Y3, Y15, Y15 - VPADDD Y15, Y2, Y2 - - // Clamp and store poly key - VPERM2I128 $0x02, Y0, Y14, Y3 - VPAND ·polyClampMask<>+0(SB), Y3, Y3 - VMOVDQA Y3, (BP) - - // Stream for up to 320 bytes - VPERM2I128 $0x13, Y0, Y14, Y0 - VPERM2I128 $0x13, Y12, Y4, Y14 - VPERM2I128 $0x02, Y5, Y9, Y12 - VPERM2I128 $0x02, Y13, Y1, Y4 - VPERM2I128 $0x13, Y5, Y9, Y5 - VPERM2I128 $0x13, Y13, Y1, Y9 - VPERM2I128 $0x02, Y6, Y10, Y13 - VPERM2I128 $0x02, Y8, Y2, Y1 - VPERM2I128 $0x13, Y6, Y10, Y6 - VPERM2I128 $0x13, Y8, Y2, Y10 - JMP openAVX2ShortOpen - -openAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>+0(SB), Y5 - VMOVDQA 32(BP), Y9 - VMOVDQA 64(BP), Y13 - VMOVDQA 192(BP), Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 - VMOVDQA Y1, Y4 - XORQ R9, R9 - MOVQ BX, CX - ANDQ $-16, CX - TESTQ CX, CX - JE openAVX2Tail128LoopB - -openAVX2Tail128LoopA: - ADDQ (SI)(R9*1), R10 - ADCQ 8(SI)(R9*1), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - -openAVX2Tail128LoopB: - ADDQ $0x10, R9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y1, Y1, Y1 - CMPQ R9, CX - JB openAVX2Tail128LoopA - CMPQ R9, $0xa0 - JNE openAVX2Tail128LoopB - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD 32(BP), Y9, Y9 - VPADDD 64(BP), Y13, Y13 - VPADDD Y4, Y1, Y1 - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - -openAVX2TailLoop: - CMPQ BX, $0x20 - JB openAVX2Tail - SUBQ $0x20, BX - - // Load for decryption - VPXOR (SI), Y0, Y0 - VMOVDQU Y0, (DI) - LEAQ 32(SI), SI - LEAQ 32(DI), DI - VMOVDQA Y14, Y0 - VMOVDQA Y12, Y14 - VMOVDQA Y4, Y12 - JMP openAVX2TailLoop - -openAVX2Tail: - CMPQ BX, $0x10 - VMOVDQA X0, X1 - JB openAVX2TailDone - SUBQ $0x10, BX - - // Load for decryption - VPXOR (SI), X0, X12 - VMOVDQU X12, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - VPERM2I128 $0x11, Y0, Y0, Y0 - VMOVDQA X0, X1 - -openAVX2TailDone: - VZEROUPPER - JMP openSSETail16 - -openAVX2Tail256: - VMOVDQA ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VMOVDQA Y4, Y7 - VMOVDQA Y1, Y11 - - // Compute the number of iterations that will hash data - MOVQ BX, 224(BP) - MOVQ BX, CX - SUBQ $0x80, CX - SHRQ $0x04, CX - MOVQ $0x0000000a, R9 - CMPQ CX, $0x0a - CMOVQGT R9, CX - MOVQ SI, BX - XORQ R9, R9 - -openAVX2Tail256LoopA: - ADDQ (BX), R10 - ADCQ 8(BX), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(BX), BX - -openAVX2Tail256LoopB: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 +openAVX2InternalLoop: + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 - INCQ R9 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - CMPQ R9, CX - JB openAVX2Tail256LoopA - CMPQ R9, $0x0a - JNE openAVX2Tail256LoopB - MOVQ BX, R9 - SUBQ SI, BX - MOVQ BX, CX - MOVQ 224(BP), BX - -openAVX2Tail256Hash: - ADDQ $0x10, CX - CMPQ CX, BX - JGT openAVX2Tail256HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(R9), R9 - JMP openAVX2Tail256Hash + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 + JNE openAVX2InternalLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) -openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD Y7, Y4, Y4 - VPADDD Y11, Y1, Y1 - VPERM2I128 $0x02, Y0, Y14, Y6 - VPERM2I128 $0x02, Y12, Y4, Y10 - VPERM2I128 $0x13, Y0, Y14, Y8 - VPERM2I128 $0x13, Y12, Y4, Y2 + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) VPERM2I128 $0x02, Y5, Y9, Y0 VPERM2I128 $0x02, Y13, Y1, Y14 VPERM2I128 $0x13, Y5, Y9, Y12 VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR (SI), Y6, Y6 - VPXOR 32(SI), Y10, Y10 - VPXOR 64(SI), Y8, Y8 - VPXOR 96(SI), Y2, Y2 - VMOVDQU Y6, (DI) - VMOVDQU Y10, 32(DI) - VMOVDQU Y8, 64(DI) - VMOVDQU Y2, 96(DI) - LEAQ 128(SI), SI - LEAQ 128(DI), DI - SUBQ $0x80, BX - JMP openAVX2TailLoop + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) -openAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) + // and here + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX + JMP openAVX2MainLoop - // Compute the number of iterations that will hash two blocks of data - MOVQ BX, 224(BP) - MOVQ BX, CX - SUBQ $0x00000100, CX - SHRQ $0x04, CX - ADDQ $0x06, CX - MOVQ $0x0000000a, R9 - CMPQ CX, $0x0a - CMOVQGT R9, CX - MOVQ SI, BX - XORQ R9, R9 +openAVX2MainLoopDone: + // Handle the various tail sizes efficiently + TESTQ BX, BX + JE openSSEFinalize + CMPQ BX, $0x80 + JBE openAVX2Tail128 + CMPQ BX, $0x00000100 + JBE openAVX2Tail256 + CMPQ BX, $0x00000180 + JBE openAVX2Tail384 + JMP openAVX2Tail512 -openAVX2Tail384LoopB: - ADDQ (BX), R10 - ADCQ 8(BX), R11 +openSSEFinalize: + // Hash in the PT, AAD lengths + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 IMULQ R12, R15 - MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -4075,174 +797,261 @@ openAVX2Tail384LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(BX), BX -openAVX2Tail384LoopA: - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x04, Y14, Y14, Y14 - VPALIGNR $0x04, Y9, Y9, Y9 - VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x0c, Y4, Y4, Y4 - VPALIGNR $0x0c, Y1, Y1, Y1 - VPALIGNR $0x0c, Y2, Y2, Y2 - ADDQ (BX), R10 - ADCQ 8(BX), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(BX), BX - INCQ R9 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x0c, Y14, Y3 - VPSRLD $0x14, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y14, Y0, Y0 - VPXOR Y0, Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPADDD Y4, Y12, Y12 - VPXOR Y12, Y14, Y14 - VPSLLD $0x07, Y14, Y3 - VPSRLD $0x19, Y14, Y14 - VPXOR Y3, Y14, Y14 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x0c, Y9, Y3 - VPSRLD $0x14, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y9, Y5, Y5 - VPXOR Y5, Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPADDD Y1, Y13, Y13 - VPXOR Y13, Y9, Y9 - VPSLLD $0x07, Y9, Y3 - VPSRLD $0x19, Y9, Y9 - VPXOR Y3, Y9, Y9 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x0c, Y10, Y3 - VPSRLD $0x14, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPADDD Y10, Y6, Y6 - VPXOR Y6, Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPADDD Y2, Y8, Y8 - VPXOR Y8, Y10, Y10 - VPSLLD $0x07, Y10, Y3 - VPSRLD $0x19, Y10, Y10 - VPXOR Y3, Y10, Y10 - VPALIGNR $0x0c, Y14, Y14, Y14 - VPALIGNR $0x0c, Y9, Y9, Y9 - VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x08, Y12, Y12, Y12 - VPALIGNR $0x08, Y13, Y13, Y13 - VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x04, Y4, Y4, Y4 - VPALIGNR $0x04, Y1, Y1, Y1 - VPALIGNR $0x04, Y2, Y2, Y2 - CMPQ R9, CX - JB openAVX2Tail384LoopB - CMPQ R9, $0x0a - JNE openAVX2Tail384LoopA - MOVQ BX, R9 - SUBQ SI, BX - MOVQ BX, CX - MOVQ 224(BP), BX + // Final reduce + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 + + // Add in the "s" part of the key + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 + + // Finally, constant time compare to the tag at the end of the message + XORQ AX, AX + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 + CMOVQEQ DX, AX + + // Return true iff tags are equal + MOVB AX, ret+96(FP) + RET + +openSSETail16: + TESTQ BX, BX + JE openSSEFinalize + + // We can safely load the CT from the end, because it is padded with the MAC + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 + + // We can only store one byte at a time, since plaintext can be shorter than 16 bytes +openSSETail16Store: + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX + JNE openSSETail16Store + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP openSSEFinalize -openAVX2Tail384Hash: - ADDQ $0x10, CX - CMPQ CX, BX - JGT openAVX2Tail384HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 +openAVX2192: + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 + +openAVX2192InnerCipherLoop: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 + JNE openAVX2192InnerCipherLoop + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + + // Clamp and store poly key + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) + + // Stream for up to 192 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + +openAVX2ShortOpen: + // Hash + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + +openAVX2ShortOpenLoop: + CMPQ BX, $0x20 + JB openAVX2ShortTail32 + SUBQ $0x20, BX + + // Load for hashing + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4275,83 +1084,34 @@ openAVX2Tail384Hash: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 - JMP openAVX2Tail384Hash -openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 - VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD 32(BP), Y14, Y14 - VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 64(BP), Y12, Y12 - VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPERM2I128 $0x02, Y0, Y14, Y3 - VPERM2I128 $0x02, Y12, Y4, Y7 - VPERM2I128 $0x13, Y0, Y14, Y11 - VPERM2I128 $0x13, Y12, Y4, Y15 - VPXOR (SI), Y3, Y3 - VPXOR 32(SI), Y7, Y7 - VPXOR 64(SI), Y11, Y11 - VPXOR 96(SI), Y15, Y15 - VMOVDQU Y3, (DI) - VMOVDQU Y7, 32(DI) - VMOVDQU Y11, 64(DI) - VMOVDQU Y15, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y3 - VPERM2I128 $0x02, Y13, Y1, Y7 - VPERM2I128 $0x13, Y5, Y9, Y11 - VPERM2I128 $0x13, Y13, Y1, Y15 - VPXOR 128(SI), Y3, Y3 - VPXOR 160(SI), Y7, Y7 - VPXOR 192(SI), Y11, Y11 - VPXOR 224(SI), Y15, Y15 - VMOVDQU Y3, 128(DI) - VMOVDQU Y7, 160(DI) - VMOVDQU Y11, 192(DI) - VMOVDQU Y15, 224(DI) - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - LEAQ 256(SI), SI - LEAQ 256(DI), DI - SUBQ $0x00000100, BX - JMP openAVX2TailLoop + // Load for decryption + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI -openAVX2Tail512: - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - VMOVDQA Y0, Y5 - VMOVDQA Y0, Y6 - VMOVDQA Y0, Y7 - VMOVDQA 32(BP), Y14 - VMOVDQA Y14, Y9 - VMOVDQA Y14, Y10 - VMOVDQA Y14, Y11 - VMOVDQA 64(BP), Y12 - VMOVDQA Y12, Y13 - VMOVDQA Y12, Y8 - VMOVDQA Y12, Y15 - VMOVDQA 192(BP), Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 - VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 - VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 - VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 - VMOVDQA Y4, 96(BP) - VMOVDQA Y1, 128(BP) - VMOVDQA Y2, 160(BP) - VMOVDQA Y3, 192(BP) - XORQ CX, CX - MOVQ SI, R9 + // Shift stream left + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 + JMP openAVX2ShortOpenLoop + +openAVX2ShortTail32: + CMPQ BX, $0x10 + VMOVDQA X0, X1 + JB openAVX2ShortDone + SUBQ $0x10, BX -openAVX2Tail512LoopB: - ADDQ (R9), R10 - ADCQ 8(R9), R11 + // Load for hashing + ADDQ (SI), R10 + ADCQ 8(SI), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4384,252 +1144,202 @@ openAVX2Tail512LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 -openAVX2Tail512LoopA: + // Load for decryption + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 + +openAVX2ShortDone: + VZEROUPPER + JMP openSSETail16 + +openAVX2320: + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 + +openAVX2320InnerCipherLoop: VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 + VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 - ADDQ (R9), R10 - ADCQ 8(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 + VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 - VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 + VPSLLD $0x07, Y14, Y3 VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y10, Y10 VPALIGNR $0x04, Y14, Y14, Y14 VPALIGNR $0x04, Y9, Y9, Y9 VPALIGNR $0x04, Y10, Y10, Y10 - VPALIGNR $0x04, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x0c, Y4, Y4, Y4 VPALIGNR $0x0c, Y1, Y1, Y1 VPALIGNR $0x0c, Y2, Y2, Y2 - VPALIGNR $0x0c, Y3, Y3, Y3 VPADDD Y14, Y0, Y0 - VPADDD Y9, Y5, Y5 - VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 VPSHUFB ·rol16<>+0(SB), Y4, Y4 - VPSHUFB ·rol16<>+0(SB), Y1, Y1 - VPSHUFB ·rol16<>+0(SB), Y2, Y2 - VPSHUFB ·rol16<>+0(SB), Y3, Y3 VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 - VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 - VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - ADDQ 16(R9), R10 - ADCQ 24(R9), R11 - ADCQ $0x01, R12 - MOVQ (BP), DX - MOVQ DX, R15 - MULXQ R10, R13, R14 - IMULQ R12, R15 - MULXQ R11, AX, DX - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), DX - MULXQ R10, R10, AX - ADDQ R10, R14 - MULXQ R11, R11, R8 - ADCQ R11, R15 - ADCQ $0x00, R8 - IMULQ R12, DX - ADDQ AX, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 32(R9), R9 - VMOVDQA Y15, 224(BP) - VPSLLD $0x0c, Y14, Y15 + VPSLLD $0x0c, Y14, Y3 VPSRLD $0x14, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x0c, Y9, Y15 - VPSRLD $0x14, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x0c, Y10, Y15 - VPSRLD $0x14, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x0c, Y11, Y15 - VPSRLD $0x14, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y14, Y14 VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 VPADDD Y10, Y6, Y6 - VPADDD Y11, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y5, Y1, Y1 VPXOR Y6, Y2, Y2 - VPXOR Y7, Y3, Y3 - VPSHUFB ·rol8<>+0(SB), Y4, Y4 - VPSHUFB ·rol8<>+0(SB), Y1, Y1 VPSHUFB ·rol8<>+0(SB), Y2, Y2 - VPSHUFB ·rol8<>+0(SB), Y3, Y3 - VPADDD Y4, Y12, Y12 - VPADDD Y1, Y13, Y13 VPADDD Y2, Y8, Y8 - VPADDD Y3, Y15, Y15 - VPXOR Y12, Y14, Y14 - VPXOR Y13, Y9, Y9 VPXOR Y8, Y10, Y10 - VPXOR Y15, Y11, Y11 - VMOVDQA Y15, 224(BP) - VPSLLD $0x07, Y14, Y15 - VPSRLD $0x19, Y14, Y14 - VPXOR Y15, Y14, Y14 - VPSLLD $0x07, Y9, Y15 - VPSRLD $0x19, Y9, Y9 - VPXOR Y15, Y9, Y9 - VPSLLD $0x07, Y10, Y15 + VPSLLD $0x07, Y10, Y3 VPSRLD $0x19, Y10, Y10 - VPXOR Y15, Y10, Y10 - VPSLLD $0x07, Y11, Y15 - VPSRLD $0x19, Y11, Y11 - VPXOR Y15, Y11, Y11 - VMOVDQA 224(BP), Y15 + VPXOR Y3, Y10, Y10 VPALIGNR $0x0c, Y14, Y14, Y14 VPALIGNR $0x0c, Y9, Y9, Y9 VPALIGNR $0x0c, Y10, Y10, Y10 - VPALIGNR $0x0c, Y11, Y11, Y11 VPALIGNR $0x08, Y12, Y12, Y12 VPALIGNR $0x08, Y13, Y13, Y13 VPALIGNR $0x08, Y8, Y8, Y8 - VPALIGNR $0x08, Y15, Y15, Y15 VPALIGNR $0x04, Y4, Y4, Y4 VPALIGNR $0x04, Y1, Y1, Y1 VPALIGNR $0x04, Y2, Y2, Y2 - VPALIGNR $0x04, Y3, Y3, Y3 - INCQ CX - CMPQ CX, $0x04 - JLT openAVX2Tail512LoopB - CMPQ CX, $0x0a - JNE openAVX2Tail512LoopA - MOVQ BX, CX - SUBQ $0x00000180, CX - ANDQ $-16, CX + DECQ R9 + JNE openAVX2320InnerCipherLoop + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 -openAVX2Tail512HashLoop: - TESTQ CX, CX - JE openAVX2Tail512HashEnd - ADDQ (R9), R10 - ADCQ 8(R9), R11 + // Clamp and store poly key + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) + + // Stream for up to 320 bytes + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 + JMP openAVX2ShortOpen + +openAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB + +openAVX2Tail128LoopA: + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 ADCQ $0x01, R12 MOVQ (BP), DX MOVQ DX, R15 @@ -4662,884 +1372,137 @@ openAVX2Tail512HashLoop: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(R9), R9 - SUBQ $0x10, CX - JMP openAVX2Tail512HashLoop -openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 +openAVX2Tail128LoopB: + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 - VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 - VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 - VPADDD 32(BP), Y14, Y14 VPADDD 32(BP), Y9, Y9 - VPADDD 32(BP), Y10, Y10 - VPADDD 32(BP), Y11, Y11 - VPADDD 64(BP), Y12, Y12 VPADDD 64(BP), Y13, Y13 - VPADDD 64(BP), Y8, Y8 - VPADDD 64(BP), Y15, Y15 - VPADDD 96(BP), Y4, Y4 - VPADDD 128(BP), Y1, Y1 - VPADDD 160(BP), Y2, Y2 - VPADDD 192(BP), Y3, Y3 - VMOVDQA Y15, 224(BP) - VPERM2I128 $0x02, Y0, Y14, Y15 - VPERM2I128 $0x13, Y0, Y14, Y14 - VPERM2I128 $0x02, Y12, Y4, Y0 - VPERM2I128 $0x13, Y12, Y4, Y12 - VPXOR (SI), Y15, Y15 - VPXOR 32(SI), Y0, Y0 - VPXOR 64(SI), Y14, Y14 - VPXOR 96(SI), Y12, Y12 - VMOVDQU Y15, (DI) - VMOVDQU Y0, 32(DI) - VMOVDQU Y14, 64(DI) - VMOVDQU Y12, 96(DI) - VPERM2I128 $0x02, Y5, Y9, Y0 - VPERM2I128 $0x02, Y13, Y1, Y14 - VPERM2I128 $0x13, Y5, Y9, Y12 - VPERM2I128 $0x13, Y13, Y1, Y4 - VPXOR 128(SI), Y0, Y0 - VPXOR 160(SI), Y14, Y14 - VPXOR 192(SI), Y12, Y12 - VPXOR 224(SI), Y4, Y4 - VMOVDQU Y0, 128(DI) - VMOVDQU Y14, 160(DI) - VMOVDQU Y12, 192(DI) - VMOVDQU Y4, 224(DI) - VPERM2I128 $0x02, Y6, Y10, Y0 - VPERM2I128 $0x02, Y8, Y2, Y14 - VPERM2I128 $0x13, Y6, Y10, Y12 - VPERM2I128 $0x13, Y8, Y2, Y4 - VPXOR 256(SI), Y0, Y0 - VPXOR 288(SI), Y14, Y14 - VPXOR 320(SI), Y12, Y12 - VPXOR 352(SI), Y4, Y4 - VMOVDQU Y0, 256(DI) - VMOVDQU Y14, 288(DI) - VMOVDQU Y12, 320(DI) - VMOVDQU Y4, 352(DI) - VPERM2I128 $0x02, Y7, Y11, Y0 - VPERM2I128 $0x02, 224(BP), Y3, Y14 - VPERM2I128 $0x13, Y7, Y11, Y12 - VPERM2I128 $0x13, 224(BP), Y3, Y4 - LEAQ 384(SI), SI - LEAQ 384(DI), DI - SUBQ $0x00000180, BX - JMP openAVX2TailLoop - -DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 -GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 - -DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff -DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc -DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff -DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff -GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 - -DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 -DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 -GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 - -DATA ·andMask<>+0(SB)/8, $0x00000000000000ff -DATA ·andMask<>+8(SB)/8, $0x0000000000000000 -DATA ·andMask<>+16(SB)/8, $0x000000000000ffff -DATA ·andMask<>+24(SB)/8, $0x0000000000000000 -DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+40(SB)/8, $0x0000000000000000 -DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+56(SB)/8, $0x0000000000000000 -DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+72(SB)/8, $0x0000000000000000 -DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+88(SB)/8, $0x0000000000000000 -DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+104(SB)/8, $0x0000000000000000 -DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+120(SB)/8, $0x0000000000000000 -DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+136(SB)/8, $0x00000000000000ff -DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+152(SB)/8, $0x000000000000ffff -DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff -GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 -DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 -DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 -DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 -DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 -GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 +openAVX2TailLoop: + CMPQ BX, $0x20 + JB openAVX2Tail + SUBQ $0x20, BX -DATA ·rol16<>+0(SB)/8, $0x0504070601000302 -DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a -DATA ·rol16<>+16(SB)/8, $0x0504070601000302 -DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a -GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + // Load for decryption + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + JMP openAVX2TailLoop -DATA ·rol8<>+0(SB)/8, $0x0605040702010003 -DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b -DATA ·rol8<>+16(SB)/8, $0x0605040702010003 -DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b -GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 +openAVX2Tail: + CMPQ BX, $0x10 + VMOVDQA X0, X1 + JB openAVX2TailDone + SUBQ $0x10, BX -DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 -DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 -DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 -DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 -GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + // Load for decryption + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 -// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) -// Requires: AVX, AVX2, BMI2, CMOV, SSE2 -TEXT ·chacha20Poly1305Seal(SB), $288-96 - MOVQ SP, BP - ADDQ $0x20, BP - ANDQ $-32, BP - MOVQ dst_base+0(FP), DI - MOVQ key_base+24(FP), R8 - MOVQ src_base+48(FP), SI - MOVQ src_len+56(FP), BX - MOVQ ad_base+72(FP), CX - CMPB ·useAVX2+0(SB), $0x01 - JE chacha20Poly1305Seal_AVX2 +openAVX2TailDone: + VZEROUPPER + JMP openSSETail16 - // Special optimization, for very short buffers - CMPQ BX, $0x80 - JBE sealSSE128 - - // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - - // Store state on stack for future use - MOVO X3, 32(BP) - MOVO X6, 48(BP) - - // Load state, increment counter blocks - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - MOVQ $0x0000000a, R9 - -sealSSEIntroLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ R9 - JNE sealSSEIntroLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - - // Clamp and store the key - PAND ·polyClampMask<>+0(SB), X0 - MOVO X0, (BP) - MOVO X3, 16(BP) - - // Hash AAD - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, (DI) - MOVOU X4, 16(DI) - MOVOU X7, 32(DI) - MOVOU X10, 48(DI) - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 64(DI) - MOVOU X5, 80(DI) - MOVOU X8, 96(DI) - MOVOU X11, 112(DI) - MOVQ $0x00000080, CX - SUBQ $0x80, BX - LEAQ 128(SI), SI - MOVO X12, X1 - MOVO X13, X4 - MOVO X14, X7 - MOVO X15, X10 - CMPQ BX, $0x40 - JBE sealSSE128SealHash - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X12 - PXOR X3, X13 - PXOR X6, X14 - PXOR X9, X15 - MOVOU X12, 128(DI) - MOVOU X13, 144(DI) - MOVOU X14, 160(DI) - MOVOU X15, 176(DI) - ADDQ $0x40, CX - SUBQ $0x40, BX - LEAQ 64(SI), SI - MOVQ $0x00000002, CX - MOVQ $0x00000008, R9 - CMPQ BX, $0x40 - JBE sealSSETail64 - CMPQ BX, $0x80 - JBE sealSSETail128 - CMPQ BX, $0xc0 - JBE sealSSETail192 - -sealSSEMainLoop: - // Load state, increment counter blocks - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X2, X12 - MOVO X5, X13 - MOVO X8, X14 - MOVO X11, X15 - PADDL ·sseIncMask<>+0(SB), X15 - - // Store counters - MOVO X9, 80(BP) - MOVO X10, 96(BP) - MOVO X11, 112(BP) - MOVO X15, 128(BP) - -sealSSEInnerLoop: - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail256: + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + + // Compute the number of iterations that will hash data + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + +openAVX2Tail256LoopA: + ADDQ (BX), R10 + ADCQ 8(BX), R11 ADCQ $0x01, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x0c - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - LEAQ 16(DI), DI - MOVO X14, 64(BP) - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X3 - PXOR X14, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X14) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X3 - PXOR X14, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X4 - PXOR X14, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X14) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X4 - PXOR X14, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x0c, X14 - PSRLL $0x14, X5 - PXOR X14, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X14) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X14 - PSLLL $0x07, X14 - PSRLL $0x19, X5 - PXOR X14, X5 - MOVO 64(BP), X14 - MOVO X7, 64(BP) - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 - PADDD X13, X12 - PXOR X12, X15 - ROL16(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x0c, X7 - PSRLL $0x14, X13 - PXOR X7, X13 - PADDD X13, X12 - PXOR X12, X15 - ROL8(X15, X7) - PADDD X15, X14 - PXOR X14, X13 - MOVO X13, X7 - PSLLL $0x07, X7 - PSRLL $0x19, X13 - PXOR X7, X13 - MOVO 64(BP), X7 MOVQ R13, R10 MOVQ R14, R11 MOVQ R15, R12 @@ -5555,112 +1518,117 @@ sealSSEInnerLoop: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x04 - DECQ R9 - JGE sealSSEInnerLoop - ADDQ (DI), R10 - ADCQ 8(DI), R11 + LEAQ 16(BX), BX + +openAVX2Tail256LoopB: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX + +openAVX2Tail256Hash: + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -5672,143 +1640,94 @@ sealSSEInnerLoop: SHRQ $0x02, R8, R15 SHRQ $0x02, R8 ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - DECQ CX - JG sealSSEInnerLoop - - // Add in the state - PADDD ·chacha20Constants<>+0(SB), X0 - PADDD ·chacha20Constants<>+0(SB), X1 - PADDD ·chacha20Constants<>+0(SB), X2 - PADDD ·chacha20Constants<>+0(SB), X12 - PADDD 32(BP), X3 - PADDD 32(BP), X4 - PADDD 32(BP), X5 - PADDD 32(BP), X13 - PADDD 48(BP), X6 - PADDD 48(BP), X7 - PADDD 48(BP), X8 - PADDD 48(BP), X14 - PADDD 80(BP), X9 - PADDD 96(BP), X10 - PADDD 112(BP), X11 - PADDD 128(BP), X15 - MOVO X15, 64(BP) - - // Load - xor - store - MOVOU (SI), X15 - PXOR X15, X0 - MOVOU 16(SI), X15 - PXOR X15, X3 - MOVOU 32(SI), X15 - PXOR X15, X6 - MOVOU 48(SI), X15 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVO 64(BP), X15 - MOVOU 64(SI), X0 - MOVOU 80(SI), X3 - MOVOU 96(SI), X6 - MOVOU 112(SI), X9 - PXOR X0, X1 - PXOR X3, X4 - PXOR X6, X7 - PXOR X9, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVOU 128(SI), X0 - MOVOU 144(SI), X3 - MOVOU 160(SI), X6 - MOVOU 176(SI), X9 - PXOR X0, X2 - PXOR X3, X5 - PXOR X6, X8 - PXOR X9, X11 - MOVOU X2, 128(DI) - MOVOU X5, 144(DI) - MOVOU X8, 160(DI) - MOVOU X11, 176(DI) - ADDQ $0xc0, SI - MOVQ $0x000000c0, CX - SUBQ $0xc0, BX - MOVO X12, X1 - MOVO X13, X4 - MOVO X14, X7 - MOVO X15, X10 - CMPQ BX, $0x40 - JBE sealSSE128SealHash - MOVOU (SI), X0 - MOVOU 16(SI), X3 - MOVOU 32(SI), X6 - MOVOU 48(SI), X9 - PXOR X0, X12 - PXOR X3, X13 - PXOR X6, X14 - PXOR X9, X15 - MOVOU X12, 192(DI) - MOVOU X13, 208(DI) - MOVOU X14, 224(DI) - MOVOU X15, 240(DI) - LEAQ 64(SI), SI - SUBQ $0x40, BX - MOVQ $0x00000006, CX - MOVQ $0x00000004, R9 - CMPQ BX, $0xc0 - JG sealSSEMainLoop - MOVQ BX, CX - TESTQ BX, BX - JE sealSSE128SealHash - MOVQ $0x00000006, CX - CMPQ BX, $0x40 - JBE sealSSETail64 - CMPQ BX, $0x80 - JBE sealSSETail128 - JMP sealSSETail192 - -sealSSETail64: - MOVO ·chacha20Constants<>+0(SB), X1 - MOVO 32(BP), X4 - MOVO 48(BP), X7 - MOVO 128(BP), X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 80(BP) - -sealSSETail64LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + +openAVX2Tail256HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + +openAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare six blocks + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + + // Compute the number of iterations that will hash two blocks of data + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + +openAVX2Tail384LoopB: + ADDQ (BX), R10 + ADCQ 8(BX), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -5825,175 +1744,190 @@ sealSSETail64LoopA: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI + LEAQ 16(BX), BX + +openAVX2Tail384LoopA: + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX -sealSSETail64LoopB: - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x0c, X13 - PSRLL $0x14, X4 - PXOR X13, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x07, X13 - PSRLL $0x19, X4 - PXOR X13, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x0c, X13 - PSRLL $0x14, X4 - PXOR X13, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X13) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X13 - PSLLL $0x07, X13 - PSRLL $0x19, X4 - PXOR X13, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - DECQ CX - JG sealSSETail64LoopA - DECQ R9 - JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X4 - PADDL 48(BP), X7 - PADDL 80(BP), X10 - JMP sealSSE128Seal - -sealSSETail128: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - -sealSSETail128LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail384Hash: + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6010,430 +1944,99 @@ sealSSETail128LoopA: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash -sealSSETail128LoopB: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - DECQ CX - JG sealSSETail128LoopA - DECQ R9 - JGE sealSSETail128LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 80(BP), X9 - PADDL 96(BP), X10 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X0 - PXOR X13, X3 - PXOR X14, X6 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVQ $0x00000040, CX - LEAQ 64(SI), SI - SUBQ $0x40, BX - JMP sealSSE128SealHash - -sealSSETail192: - MOVO ·chacha20Constants<>+0(SB), X0 - MOVO 32(BP), X3 - MOVO 48(BP), X6 - MOVO 128(BP), X9 - PADDL ·sseIncMask<>+0(SB), X9 - MOVO X9, 80(BP) - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X10, 96(BP) - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X11, 112(BP) - -sealSSETail192LoopA: - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - LEAQ 16(DI), DI +openAVX2Tail384HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX + JMP openAVX2TailLoop -sealSSETail192LoopB: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - ADDQ (DI), R10 - ADCQ 8(DI), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 +openAVX2Tail512: + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 + +openAVX2Tail512LoopB: + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6450,465 +2053,268 @@ sealSSETail192LoopB: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 - LEAQ 16(DI), DI - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ CX - JG sealSSETail192LoopA - DECQ R9 - JGE sealSSETail192LoopB - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL 32(BP), X3 - PADDL 32(BP), X4 - PADDL 32(BP), X5 - PADDL 48(BP), X6 - PADDL 48(BP), X7 - PADDL 48(BP), X8 - PADDL 80(BP), X9 - PADDL 96(BP), X10 - PADDL 112(BP), X11 - MOVOU (SI), X12 - MOVOU 16(SI), X13 - MOVOU 32(SI), X14 - MOVOU 48(SI), X15 - PXOR X12, X0 - PXOR X13, X3 - PXOR X14, X6 - PXOR X15, X9 - MOVOU X0, (DI) - MOVOU X3, 16(DI) - MOVOU X6, 32(DI) - MOVOU X9, 48(DI) - MOVOU 64(SI), X12 - MOVOU 80(SI), X13 - MOVOU 96(SI), X14 - MOVOU 112(SI), X15 - PXOR X12, X1 - PXOR X13, X4 - PXOR X14, X7 - PXOR X15, X10 - MOVOU X1, 64(DI) - MOVOU X4, 80(DI) - MOVOU X7, 96(DI) - MOVOU X10, 112(DI) - MOVO X2, X1 - MOVO X5, X4 - MOVO X8, X7 - MOVO X11, X10 - MOVQ $0x00000080, CX - LEAQ 128(SI), SI - SUBQ $0x80, BX - JMP sealSSE128SealHash - -sealSSE128: - MOVOU ·chacha20Constants<>+0(SB), X0 - MOVOU 16(R8), X3 - MOVOU 32(R8), X6 - MOVOU 48(R8), X9 - MOVO X0, X1 - MOVO X3, X4 - MOVO X6, X7 - MOVO X9, X10 - PADDL ·sseIncMask<>+0(SB), X10 - MOVO X1, X2 - MOVO X4, X5 - MOVO X7, X8 - MOVO X10, X11 - PADDL ·sseIncMask<>+0(SB), X11 - MOVO X3, X13 - MOVO X6, X14 - MOVO X10, X15 - MOVQ $0x0000000a, R9 - -sealSSE128InnerCipherLoop: - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x04 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x0c - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - PADDD X3, X0 - PXOR X0, X9 - ROL16(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X3 - PXOR X12, X3 - PADDD X3, X0 - PXOR X0, X9 - ROL8(X9, X12) - PADDD X9, X6 - PXOR X6, X3 - MOVO X3, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X3 - PXOR X12, X3 - PADDD X4, X1 - PXOR X1, X10 - ROL16(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X4 - PXOR X12, X4 - PADDD X4, X1 - PXOR X1, X10 - ROL8(X10, X12) - PADDD X10, X7 - PXOR X7, X4 - MOVO X4, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X4 - PXOR X12, X4 - PADDD X5, X2 - PXOR X2, X11 - ROL16(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x0c, X12 - PSRLL $0x14, X5 - PXOR X12, X5 - PADDD X5, X2 - PXOR X2, X11 - ROL8(X11, X12) - PADDD X11, X8 - PXOR X8, X5 - MOVO X5, X12 - PSLLL $0x07, X12 - PSRLL $0x19, X5 - PXOR X12, X5 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xe4 - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xed - BYTE $0x0c - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xf6 - BYTE $0x08 - BYTE $0x66 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xff - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc0 - BYTE $0x08 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xc9 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xd2 - BYTE $0x04 - BYTE $0x66 - BYTE $0x45 - BYTE $0x0f - BYTE $0x3a - BYTE $0x0f - BYTE $0xdb - BYTE $0x04 - DECQ R9 - JNE sealSSE128InnerCipherLoop - - // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>+0(SB), X0 - PADDL ·chacha20Constants<>+0(SB), X1 - PADDL ·chacha20Constants<>+0(SB), X2 - PADDL X13, X3 - PADDL X13, X4 - PADDL X13, X5 - PADDL X14, X7 - PADDL X14, X8 - PADDL X15, X10 - PADDL ·sseIncMask<>+0(SB), X15 - PADDL X15, X11 - PAND ·polyClampMask<>+0(SB), X0 - MOVOU X0, (BP) - MOVOU X3, 16(BP) + LEAQ 16(R9), R9 - // Hash - MOVQ ad_len+80(FP), R9 - CALL polyHashADInternal<>(SB) - XORQ CX, CX +openAVX2Tail512LoopA: + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 + JLT openAVX2Tail512LoopB + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX -sealSSE128SealHash: - CMPQ CX, $0x10 - JB sealSSE128Seal - ADDQ (DI), R10 - ADCQ 8(DI), R11 +openAVX2Tail512HashLoop: + TESTQ CX, CX + JE openAVX2Tail512HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 IMULQ R12, R15 + MULXQ R11, AX, DX ADDQ AX, R14 ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 ADCQ DX, R8 MOVQ R13, R10 MOVQ R14, R11 @@ -6925,238 +2331,162 @@ sealSSE128SealHash: ADDQ R15, R10 ADCQ R8, R11 ADCQ $0x00, R12 + LEAQ 16(R9), R9 SUBQ $0x10, CX - ADDQ $0x10, DI - JMP sealSSE128SealHash - -sealSSE128Seal: - CMPQ BX, $0x10 - JB sealSSETail - SUBQ $0x10, BX - - // Load for decryption - MOVOU (SI), X12 - PXOR X12, X1 - MOVOU X1, (DI) - LEAQ 16(SI), SI - LEAQ 16(DI), DI - - // Extract for hashing - MOVQ X1, R13 - PSRLDQ $0x08, X1 - MOVQ X1, R14 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - - // Shift the stream "left" - MOVO X4, X1 - MOVO X7, X4 - MOVO X10, X7 - MOVO X2, X10 - MOVO X5, X2 - MOVO X8, X5 - MOVO X11, X8 - JMP sealSSE128Seal + JMP openAVX2Tail512HashLoop -sealSSETail: - TESTQ BX, BX - JE sealSSEFinalize +openAVX2Tail512HashEnd: + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop - // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ BX, R9 - SHLQ $0x04, R9 - LEAQ ·andMask<>+0(SB), R13 - MOVQ BX, CX - LEAQ -1(SI)(BX*1), SI - XORQ R15, R15 - XORQ R8, R8 - XORQ AX, AX +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 -sealSSETailLoadLoop: - SHLQ $0x08, R15, R8 - SHLQ $0x08, R15 - MOVB (SI), AX - XORQ AX, R15 - LEAQ -1(SI), SI - DECQ CX - JNE sealSSETailLoadLoop - MOVQ R15, 64(BP) - MOVQ R8, 72(BP) - PXOR 64(BP), X1 - MOVOU X1, (DI) - MOVOU -16(R13)(R9*1), X12 - PAND X12, X1 - MOVQ X1, R13 - PSRLDQ $0x08, X1 - MOVQ X1, R14 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 - ADDQ BX, DI +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 -sealSSEFinalize: - // Hash in the buffer lengths - ADDQ ad_len+80(FP), R10 - ADCQ src_len+56(FP), R11 - ADCQ $0x01, R12 - MOVQ (BP), AX - MOVQ AX, R15 - MULQ R10 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ (BP), AX - MULQ R11 - IMULQ R12, R15 - ADDQ AX, R14 - ADCQ DX, R15 - MOVQ 8(BP), AX - MOVQ AX, R8 - MULQ R10 - ADDQ AX, R14 - ADCQ $0x00, DX - MOVQ DX, R10 - MOVQ 8(BP), AX - MULQ R11 - ADDQ AX, R15 - ADCQ $0x00, DX - IMULQ R12, R8 - ADDQ R10, R15 - ADCQ DX, R8 - MOVQ R13, R10 - MOVQ R14, R11 - MOVQ R15, R12 - ANDQ $0x03, R12 - MOVQ R15, R13 - ANDQ $-4, R13 - MOVQ R8, R14 - SHRQ $0x02, R8, R15 - SHRQ $0x02, R8 - ADDQ R13, R10 - ADCQ R14, R11 - ADCQ $0x00, R12 - ADDQ R15, R10 - ADCQ R8, R11 - ADCQ $0x00, R12 +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 - // Final reduce - MOVQ R10, R13 - MOVQ R11, R14 - MOVQ R12, R15 - SUBQ $-5, R10 - SBBQ $-1, R11 - SBBQ $0x03, R12 - CMOVQCS R13, R10 - CMOVQCS R14, R11 - CMOVQCS R15, R12 +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 - // Add in the "s" part of the key - ADDQ 16(BP), R10 - ADCQ 24(BP), R11 +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 - // Finally store the tag at the end of the message - MOVQ R10, (DI) - MOVQ R11, 8(DI) - RET +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 -chacha20Poly1305Seal_AVX2: +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 + MOVQ SP, BP + ADDQ $0x20, BP + ANDQ $-32, BP + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX VZEROUPPER - VMOVDQU ·chacha20Constants<>+0(SB), Y0 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x70 - BYTE $0x10 - BYTE $0xc4 - BYTE $0x42 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x20 - BYTE $0xc4 - BYTE $0xc2 - BYTE $0x7d - BYTE $0x5a - BYTE $0x60 - BYTE $0x30 - VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VBROADCASTI128 16(R8), Y14 + VBROADCASTI128 32(R8), Y12 + VBROADCASTI128 48(R8), Y4 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers CMPQ BX, $0x000000c0 @@ -8170,6 +3500,144 @@ sealAVX2InternalLoopStart: JBE sealAVX2Tail384 JMP sealAVX2Tail512 +sealSSETail: + TESTQ BX, BX + JE sealSSEFinalize + + // We can only load the PT one byte at a time to avoid read after end of buffer + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 + XORQ AX, AX + +sealSSETailLoadLoop: + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX + JNE sealSSETailLoadLoop + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI + +sealSSEFinalize: + // Hash in the buffer lengths + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + + // Final reduce + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 + + // Add in the "s" part of the key + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 + + // Finally store the tag at the end of the message + MOVQ R10, (DI) + MOVQ R11, 8(DI) + RET + seal192AVX2: VMOVDQA Y0, Y5 VMOVDQA Y14, Y9 diff --git a/vendor/golang.org/x/crypto/pkcs12/crypto.go b/vendor/golang.org/x/crypto/pkcs12/crypto.go index 212538cb5a8..3f307323cc3 100644 --- a/vendor/golang.org/x/crypto/pkcs12/crypto.go +++ b/vendor/golang.org/x/crypto/pkcs12/crypto.go @@ -80,6 +80,10 @@ func pbDecrypterFor(algorithm pkix.AlgorithmIdentifier, password []byte) (cipher return nil, 0, err } + if params.Iterations < 0 || params.Iterations > maxIterations { + return nil, 0, NotImplementedError("iteration count is invalid or too high") + } + key := cipherType.deriveKey(params.Salt, password, params.Iterations) iv := cipherType.deriveIV(params.Salt, password, params.Iterations) diff --git a/vendor/golang.org/x/crypto/pkcs12/mac.go b/vendor/golang.org/x/crypto/pkcs12/mac.go index 5f38aa7de83..84039cee1e5 100644 --- a/vendor/golang.org/x/crypto/pkcs12/mac.go +++ b/vendor/golang.org/x/crypto/pkcs12/mac.go @@ -27,11 +27,19 @@ var ( oidSHA1 = asn1.ObjectIdentifier([]int{1, 3, 14, 3, 2, 26}) ) +// maxIterations is a safety limit to prevent CPU exhaustion from +// crafted PKCS#12 files with unreasonable iteration counts. +const maxIterations = 1 << 20 // ~1 million + func verifyMac(macData *macData, message, password []byte) error { if !macData.Mac.Algorithm.Algorithm.Equal(oidSHA1) { return NotImplementedError("unknown digest algorithm: " + macData.Mac.Algorithm.Algorithm.String()) } + if macData.Iterations < 0 || macData.Iterations > maxIterations { + return NotImplementedError("iteration count is invalid or too high") + } + key := pbkdf(sha1Sum, 20, 64, macData.MacSalt, password, macData.Iterations, 3, 20) mac := hmac.New(sha1.New, key) diff --git a/vendor/golang.org/x/net/http2/server_wrap.go b/vendor/golang.org/x/net/http2/server_wrap.go index a7a09551c43..737f1f0573b 100644 --- a/vendor/golang.org/x/net/http2/server_wrap.go +++ b/vendor/golang.org/x/net/http2/server_wrap.go @@ -10,9 +10,11 @@ package http2 import ( "context" + "crypto/tls" "errors" "net" "net/http" + "slices" "sync" "time" ) @@ -44,6 +46,20 @@ func configureServer(s *http.Server, conf *Server) error { h2.IdleTimeout = h1.ReadTimeout } } + + // Register h2 and http/1.1 ALPN protocols on s.TLSConfig, matching + // the pre-wrapping implementation in server.go, so that TLS listeners + // built from s.TLSConfig still negotiate HTTP/2. + if s.TLSConfig == nil { + s.TLSConfig = new(tls.Config) + } + if !slices.Contains(s.TLSConfig.NextProtos, NextProtoTLS) { + s.TLSConfig.NextProtos = append(s.TLSConfig.NextProtos, NextProtoTLS) + } + if !slices.Contains(s.TLSConfig.NextProtos, "http/1.1") { + s.TLSConfig.NextProtos = append(s.TLSConfig.NextProtos, "http/1.1") + } + conf.state = &serverInternalState{ s1: s, } diff --git a/vendor/golang.org/x/net/http2/transport_wrap.go b/vendor/golang.org/x/net/http2/transport_wrap.go index d25d99bdbb0..eab2e6b0731 100644 --- a/vendor/golang.org/x/net/http2/transport_wrap.go +++ b/vendor/golang.org/x/net/http2/transport_wrap.go @@ -22,8 +22,8 @@ import ( ) func configureTransport(t1 *http.Transport) error { - // ConfigureTransport is a no-op: The http.Transport already supports HTTP/2. - return nil + _, err := configureTransports(t1) + return err } func configureTransports(t1 *http.Transport) (*Transport, error) { @@ -31,6 +31,17 @@ func configureTransports(t1 *http.Transport) (*Transport, error) { // linked to the http.Transport's. tr2 := &Transport{} tr2.configure(t1) + // Enable HTTP/2 on the transport, as the pre-wrapping implementation did: + // net/http does not auto-enable it for a transport with a custom + // TLSClientConfig or dialer. + if t1.TLSClientConfig == nil { + t1.TLSClientConfig = &tls.Config{} + } + if t1.Protocols == nil { + t1.Protocols = new(http.Protocols) + t1.Protocols.SetHTTP1(true) + } + t1.Protocols.SetHTTP2(true) return tr2, nil } diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go index d11d5b96a49..526a0d5f434 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go @@ -6397,3 +6397,79 @@ const ( MPOL_PREFERRED_MANY = 0x5 MPOL_WEIGHTED_INTERLEAVE = 0x6 ) + +const ( + GPIO_V2_GET_LINEINFO_IOCTL = 0xc100b405 + GPIO_V2_GET_LINE_IOCTL = 0xc250b407 + GPIO_V2_LINE_GET_VALUES_IOCTL = 0xc010b40e + GPIO_V2_LINE_SET_VALUES_IOCTL = 0xc010b40f + GPIO_V2_GET_LINEINFO_WATCH_IOCTL = 0xc100b406 + GPIO_GET_LINEINFO_UNWATCH_IOCTL = 0xc004b40c +) +const ( + GPIO_V2_LINE_ATTR_ID_FLAGS = 0x1 + GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES = 0x2 + GPIO_V2_LINE_ATTR_ID_DEBOUNCE = 0x3 + GPIO_V2_LINE_CHANGED_REQUESTED = 0x1 + GPIO_V2_LINE_CHANGED_RELEASED = 0x2 + GPIO_V2_LINE_CHANGED_CONFIG = 0x3 + GPIO_V2_LINE_EVENT_RISING_EDGE = 0x1 + GPIO_V2_LINE_EVENT_FALLING_EDGE = 0x2 +) + +type GPIOChipInfo struct { + Name [32]byte + Label [32]byte + Lines uint32 +} +type GPIOV2LineValues struct { + Bits uint64 + Mask uint64 +} +type GPIOV2LineAttribute struct { + Id uint32 + _ uint32 + Flags uint64 +} +type GPIOV2LineConfigAttribute struct { + Attr GPIOV2LineAttribute + Mask uint64 +} +type GPIOV2LineConfig struct { + Flags uint64 + Num_attrs uint32 + _ [5]uint32 + Attrs [10]GPIOV2LineConfigAttribute +} +type GPIOV2LineRequest struct { + Offsets [64]uint32 + Consumer [32]byte + Config GPIOV2LineConfig + Num_lines uint32 + Event_buffer_size uint32 + _ [5]uint32 + Fd int32 +} +type GPIOV2LineInfo struct { + Name [32]byte + Consumer [32]byte + Offset uint32 + Num_attrs uint32 + Flags uint64 + Attrs [10]GPIOV2LineAttribute + _ [4]uint32 +} +type GPIOV2LineInfoChanged struct { + Info GPIOV2LineInfo + Timestamp_ns uint64 + Event_type uint32 + _ [5]uint32 +} +type GPIOV2LineEvent struct { + Timestamp_ns uint64 + Id uint32 + Offset uint32 + Seqno uint32 + Line_seqno uint32 + _ [6]uint32 +} diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go index 97ef790deb2..aede1de7f2e 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go @@ -711,3 +711,7 @@ type SysvShmDesc struct { _ uint32 _ uint32 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go index 90b50da680f..bb3bc4dc2c4 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go @@ -725,3 +725,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go index acda1368510..1fdf4c5175c 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go @@ -705,3 +705,7 @@ type SysvShmDesc struct { _ uint32 _ uint32 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go index ef7a99e1f9d..063e6f0b41e 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go @@ -704,3 +704,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go index 966063dfc13..9cf836c708f 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go @@ -705,3 +705,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go index dc53b20b743..1d222fcb312 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go @@ -710,3 +710,7 @@ type SysvShmDesc struct { Ctime_high uint16 _ uint16 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go index 9ad0aa8c31e..912cc4ab63b 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go @@ -707,3 +707,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go index 29d55493d55..1e358ef34f2 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go @@ -707,3 +707,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go index a4d9e158488..df59f32f5e4 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go @@ -710,3 +710,7 @@ type SysvShmDesc struct { Ctime_high uint16 _ uint16 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go index f8a29777162..29355aa0bfe 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go @@ -718,3 +718,7 @@ type SysvShmDesc struct { _ uint32 _ [4]byte } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go index 4158d6c4eee..c6083a15d7d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go @@ -713,3 +713,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go index 1035af49f78..6321cc76266 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go @@ -713,3 +713,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go index 2297125d3c7..b44f402feb6 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go @@ -792,3 +792,7 @@ const ( RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE = 0x6 RISCV_HWPROBE_WHICH_CPUS = 0x1 ) + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go index 8481e9bd98d..b22c795a646 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go @@ -727,3 +727,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x8044b401 +) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go index a6828a03104..0b18075b53e 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go @@ -708,3 +708,7 @@ type SysvShmDesc struct { _ uint64 _ uint64 } + +const ( + GPIO_GET_CHIPINFO_IOCTL = 0x4044b401 +) diff --git a/vendor/golang.org/x/tools/go/ast/edge/edge.go b/vendor/golang.org/x/tools/go/ast/edge/edge.go index 4f6ccfd6e5e..8dc4dd1502b 100644 --- a/vendor/golang.org/x/tools/go/ast/edge/edge.go +++ b/vendor/golang.org/x/tools/go/ast/edge/edge.go @@ -12,7 +12,7 @@ import ( "reflect" ) -// A Kind describes a field of an ast.Node struct. +// A Kind describes a field of an [ast.Node] struct. type Kind uint8 // String returns a description of the edge kind. @@ -41,21 +41,25 @@ func (k Kind) Get(n ast.Node, idx int) ast.Node { panic(fmt.Sprintf("%v.Get(%T): invalid node type", k, n)) } v := reflect.ValueOf(n).Elem().Field(fieldInfos[k].index) - if idx != -1 { - v = v.Index(idx) // asserts valid index - } else { - // (The type assertion below asserts that v is not a slice.) + + if v.Kind() == reflect.Slice { + v = v.Index(idx) // asserts valid idx + } else if idx != -1 { + panic(fmt.Sprintf("%v, Get(%T, %d): cannot index non-slice", v, n, idx)) } - return v.Interface().(ast.Node) // may be nil + + out, _ := v.Interface().(ast.Node) // may be nil + return out } +// Each [Kind] is named Type_Field, where Type is the +// [ast.Node] struct type and Field is the name of the field const ( Invalid Kind = iota // for nodes at the root of the traversal - // Kinds are sorted alphabetically. - // Numbering is not stable. - // Each is named Type_Field, where Type is the - // ast.Node struct type and Field is the name of the field + // As of Go1.26 these kinds are sorted alphabetically, but + // numbering must be stable, so any new addition of const should + // use a new value (be added at the end of the list). ArrayType_Elt ArrayType_Len diff --git a/vendor/golang.org/x/tools/go/packages/golist.go b/vendor/golang.org/x/tools/go/packages/golist.go index a6c17cf6346..8e60cbbed7b 100644 --- a/vendor/golang.org/x/tools/go/packages/golist.go +++ b/vendor/golang.org/x/tools/go/packages/golist.go @@ -207,11 +207,10 @@ func goListDriver(cfg *Config, runner *gocommand.Runner, overlay string, pattern // doesn't exist. extractQueries: for _, pattern := range patterns { - eqidx := strings.Index(pattern, "=") - if eqidx < 0 { + query, value, ok := strings.Cut(pattern, "=") + if !ok { restPatterns = append(restPatterns, pattern) } else { - query, value := pattern[:eqidx], pattern[eqidx+len("="):] switch query { case "file": containFiles = append(containFiles, value) @@ -563,8 +562,18 @@ func (state *golistState) createDriverResponse(words ...string) (*DriverResponse } else { // golang/go#38990: go list silently fails to do cgo processing pkg.CompiledGoFiles = nil + + var msg strings.Builder + fmt.Fprintf(&msg, "go list failed to return CompiledGoFiles for %q.\n", p.Name) + + for _, err := range p.DepsErrors { + msg.WriteString(strings.TrimSpace(err.Err)) + msg.WriteByte('\n') + } + + msg.WriteString("This may indicate failure to perform cgo processing; try building at the command line. See https://golang.org/issue/38990.") pkg.Errors = append(pkg.Errors, Error{ - Msg: "go list failed to return CompiledGoFiles. This may indicate failure to perform cgo processing; try building at the command line. See https://golang.org/issue/38990.", + Msg: msg.String(), Kind: ListError, }) } diff --git a/vendor/golang.org/x/tools/go/packages/packages.go b/vendor/golang.org/x/tools/go/packages/packages.go index 412ba06b56d..de683684ab1 100644 --- a/vendor/golang.org/x/tools/go/packages/packages.go +++ b/vendor/golang.org/x/tools/go/packages/packages.go @@ -539,6 +539,11 @@ type Package struct { // depsErrors is the DepsErrors field from the go list response, if any. depsErrors []*packagesinternal.PackageError + + // exportDataError is the error encountered reading export data, if any. + // Decoding export data should ordinarily be infallible, so this typically + // indicates a producer/consumer version skew. + exportDataError error } // Module provides module information for a package. @@ -1073,10 +1078,11 @@ func (ld *loader) loadPackage(lpkg *loaderPackage) { } // TODO(adonovan): this condition looks wrong: - // I think it should be lpkg.needtypes && !lpg.needsrc, + // I think it should be lpkg.needtypes && !lpkg.needsrc, // so that NeedSyntax without NeedTypes can be satisfied by export data. if !lpkg.needsrc { if err := ld.loadFromExportData(lpkg); err != nil { + lpkg.exportDataError = err lpkg.Errors = append(lpkg.Errors, Error{ Pos: "-", Msg: err.Error(), @@ -1215,7 +1221,13 @@ func (ld *loader) loadPackage(lpkg *loaderPackage) { if ipkg.Types != nil && ipkg.Types.Complete() { return ipkg.Types, nil } - log.Fatalf("internal error: package %q without types was imported from %q", path, lpkg) + + // If types are unavailable, there must be an export data error. + if ipkg.exportDataError != nil { + return nil, ipkg.exportDataError + } + + log.Fatalf("internal error: expected complete types for package %q", path) panic("unreachable") }) diff --git a/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go b/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go index 77aad553d5b..0d6d0bced0f 100644 --- a/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go +++ b/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go @@ -24,8 +24,10 @@ package objectpath import ( + "encoding/binary" "fmt" "go/types" + "slices" "strconv" "strings" @@ -124,7 +126,66 @@ func For(obj types.Object) (Path, error) { // An Encoder amortizes the cost of encoding the paths of multiple objects. // The zero value of an Encoder is ready to use. type Encoder struct { - scopeMemo map[*types.Scope][]types.Object // memoization of scopeObjects + pkgIndex map[*types.Package]*pkgIndex +} + +// A traversal encapsulates the state of a single traversal of the object/type graph. +type traversal struct { + pkg *types.Package + ix *pkgIndex // non-nil if we are building the index + + target types.Object // the sought symbol (if ix == nil) + found Path // the found path (if ix == nil) + + // These maps are used to short circuit cycles through + // interface methods, such as occur in the following example: + // + // type I interface { f() interface{I} } + // + // See golang/go#68046 for details. + seenTParamNames map[*types.TypeName]bool // global cycle breaking through type parameters + seenMethods map[*types.Func]bool // global cycle breaking through recursive interfaces +} + +// A pkgIndex holds a compressed index of objectpaths of all symbols +// (fields, methods, params) requiring search for an entire package. +// +// The first time a search for a given package is requested, we simply +// traverse the type graph for the target object, maintaining the +// current object path as a stack. If we find the target object, we +// save the path and terminate the main loop (but it's not worth +// breaking out of the current recursion). +// +// On the second search (a pkgIndex exists but its data is nil), we +// build an index of the traversal, which we use for all subsequent +// searches. +// +// The traversal index is encoded in the data field as a list of records, +// one per node, in preorder. Records are of two types: +// +// - A record for a package-level object consists of a pair +// (parent, nameIndex uvarint), where parent is zero and +// nameIndex is the index of the object's name in the sorted +// pkg.Scope().Names() slice. +// +// - A record for a nested node (a segment of an object path) +// consists of (parent uvarint, op byte, index uvarint), where +// parent is the index of the record for the parent node, +// op is the destructuring operator, and index (if op = [AFMTr]) +// is its integer operand. +// +// Since data[0] = 0 all nodes have positive offsets. In effect the +// encoding is a trie in which each node stores one path segment +// and points to the node for its prefix. +// +// TODO(adonovan): opt: evaluate an only 2-level tree with nodes for +// package-level objects and the-rest-of-the-path. One calculation +// suggested that it might be similar speed but 30% more compact. +type pkgIndex struct { + pkg *types.Package + data []byte // encoding of traversal; nil if not yet constructed + scopeNames []string // memo of pkg.Scope().Names() to avoid O(n) alloc/sort at lookup + offsets map[types.Object]uint32 // each object's node offset within encoded traversal data } // For returns the path to an object relative to its package, @@ -211,10 +272,9 @@ func (enc *Encoder) For(obj types.Object) (Path, error) { if pkg == nil { return "", fmt.Errorf("predeclared %s has no path", obj) } - scope := pkg.Scope() // 2. package-level object? - if scope.Lookup(obj.Name()) == obj { + if pkg.Scope().Lookup(obj.Name()) == obj { // Only exported objects (and non-exported types) have a path. // Non-exported types may be referenced by other objects. if _, ok := obj.(*types.TypeName); !ok && !obj.Exported() { @@ -232,19 +292,18 @@ func (enc *Encoder) For(obj types.Object) (Path, error) { // have a path. return "", fmt.Errorf("no path for %v", obj) } + case *types.Const, // Only package-level constants have a path. *types.Label, // Labels are function-local. *types.PkgName: // PkgNames are file-local. return "", fmt.Errorf("no path for %v", obj) case *types.Var: - // Could be: - // - a field (obj.IsField()) - // - a func parameter or result - // - a local var. - // Sadly there is no way to distinguish - // a param/result from a local - // so we must proceed to the find. + // A var, if not package-level, must be a + // parameter (incl. receiver) or result, or a struct field. + if obj.Kind() == types.LocalVar { + return "", fmt.Errorf("no path for local %v", obj) + } case *types.Func: // A func, if not package-level, must be a method. @@ -261,89 +320,311 @@ func (enc *Encoder) For(obj types.Object) (Path, error) { panic(obj) } - // 4. Search the API for the path to the var (field/param/result) or method. + // 4. Search the object/type graph for the path to + // the var (field/param/result) or method. + ix, ok := enc.pkgIndex[pkg] + if !ok { + // First search: don't build an index, just traverse. + // This avoids allocation in [For], whose Encoder + // lives for a single call. + ix = &pkgIndex{pkg: pkg} + + if enc.pkgIndex == nil { + enc.pkgIndex = make(map[*types.Package]*pkgIndex) + } + enc.pkgIndex[pkg] = ix // build the index next time + + f := traversal{pkg: pkg, target: obj} + f.traverse() + + if f.found != "" { + return f.found, nil + } + } else { + // Second search: build an index while traversing. + if ix.data == nil { + ix.offsets = make(map[types.Object]uint32) + ix.data = []byte{0} // offset 0 is sentinel + (&traversal{pkg: pkg, ix: ix}).traverse() + } + + // Second and later searches: consult the index. + if offset, ok := ix.offsets[obj]; ok { + return ix.path(offset), nil + } + } + + return "", fmt.Errorf("can't find path for %v in %s", obj, pkg.Path()) +} + +// traverse performs a complete traversal of all symbols reachable from the package. +func (tr *traversal) traverse() { + scope := tr.pkg.Scope() + names := scope.Names() + if tr.ix != nil { + tr.ix.scopeNames = names + } + + empty := make([]byte, 0, 48) // initial space for stack (ix == nil) - // First inspect package-level named types. + // First inspect package-level type names. // In the presence of path aliases, these give // the best paths because non-types may // refer to types, but not the reverse. - empty := make([]byte, 0, 48) // initial space - objs := enc.scopeObjects(scope) - for _, o := range objs { - tname, ok := o.(*types.TypeName) - if !ok { - continue // handle non-types in second pass + for i, name := range names { + if tr.found != "" { + return // found (ix == nil) } - path := append(empty, o.Name()...) - path = append(path, opType) - - T := o.Type() - if alias, ok := T.(*types.Alias); ok { - if r := findTypeParam(obj, alias.TypeParams(), path, opTypeParam); r != nil { - return Path(r), nil - } - if r := find(obj, alias.Rhs(), append(path, opRhs)); r != nil { - return Path(r), nil - } + obj := scope.Lookup(name) + if _, ok := obj.(*types.TypeName); !ok { + continue // handle non-types in second pass + } - } else if tname.IsAlias() { - // legacy alias - if r := find(obj, T, path); r != nil { - return Path(r), nil - } + // emit (name, opType) + var path []byte + var offset uint32 + if tr.ix == nil { + path = append(empty, name...) + path = append(path, opType) + } else { + offset = tr.ix.emitPackageLevel(i) + tr.ix.offsets[obj] = offset + offset = tr.ix.emitPathSegment(offset, opType, -1) + } - } else if named, ok := T.(*types.Named); ok { - // defined (named) type - if r := findTypeParam(obj, named.TypeParams(), path, opTypeParam); r != nil { - return Path(r), nil - } - if r := find(obj, named.Underlying(), append(path, opUnderlying)); r != nil { - return Path(r), nil - } + // A TypeName (for Named or Alias) may have type parameters. + switch t := obj.Type().(type) { + case *types.Alias: + tr.tparams(t.TypeParams(), path, offset, opTypeParam) + tr.typ(path, offset, opRhs, -1, t.Rhs()) + case *types.Named: + tr.tparams(t.TypeParams(), path, offset, opTypeParam) + tr.typ(path, offset, opUnderlying, -1, t.Underlying()) } } // Then inspect everything else: - // non-types, and declared methods of defined types. - for _, o := range objs { - path := append(empty, o.Name()...) - if _, ok := o.(*types.TypeName); !ok { - if o.Exported() { + // exported non-types, and declared methods of defined types. + for i, name := range names { + if tr.found != "" { + return // found (ix == nil) + } + + obj := scope.Lookup(name) + + if tname, ok := obj.(*types.TypeName); !ok { + if obj.Exported() { // exported non-type (const, var, func) - if r := find(obj, o.Type(), append(path, opType)); r != nil { - return Path(r), nil + var path []byte + var offset uint32 + if tr.ix == nil { + path = append(empty, name...) + } else { + offset = tr.ix.emitPackageLevel(i) + tr.ix.offsets[obj] = offset } + tr.typ(path, offset, opType, -1, obj.Type()) } - continue - } - // Inspect declared methods of defined types. - if T, ok := types.Unalias(o.Type()).(*types.Named); ok { - path = append(path, opType) + } else if T, ok := types.Unalias(tname.Type()).(*types.Named); ok { + // defined type + var path []byte + var offset uint32 + if tr.ix == nil { + path = append(empty, name...) + path = append(path, opType) + } else { + // Inv: map entry for obj was populated in first pass. + offset = tr.ix.emitPathSegment(tr.ix.offsets[obj], opType, -1) + } + + // Inspect declared methods of defined types. + // // The method index here is always with respect // to the underlying go/types data structures, // which ultimately derives from source order // and must be preserved by export data. for i := 0; i < T.NumMethods(); i++ { m := T.Method(i) - path2 := appendOpArg(path, opMethod, i) - if m == obj { - return Path(path2), nil // found declared method - } - if r := find(obj, m.Type(), append(path2, opType)); r != nil { - return Path(r), nil + tr.object(path, offset, opMethod, i, m) + } + } + } +} + +func (tr *traversal) visitType(path []byte, offset uint32, T types.Type) { + switch T := T.(type) { + case *types.Alias: + tr.typ(path, offset, opRhs, -1, T.Rhs()) + + case *types.Basic, *types.Named: + // Named types belonging to pkg were handled already, + // so T must belong to another package. No path. + return + + case *types.Pointer, *types.Slice, *types.Array, *types.Chan: + type hasElem interface{ Elem() types.Type } // note: includes Map + tr.typ(path, offset, opElem, -1, T.(hasElem).Elem()) + + case *types.Map: + tr.typ(path, offset, opKey, -1, T.Key()) + tr.typ(path, offset, opElem, -1, T.Elem()) + + case *types.Signature: + tr.tparams(T.RecvTypeParams(), path, offset, opRecvTypeParam) + tr.tparams(T.TypeParams(), path, offset, opTypeParam) + tr.typ(path, offset, opParams, -1, T.Params()) + tr.typ(path, offset, opResults, -1, T.Results()) + + case *types.Struct: + for i := 0; i < T.NumFields(); i++ { + tr.object(path, offset, opField, i, T.Field(i)) + } + + case *types.Tuple: + for i := 0; i < T.Len(); i++ { + tr.object(path, offset, opAt, i, T.At(i)) + } + + case *types.Interface: + for i := 0; i < T.NumMethods(); i++ { + m := T.Method(i) + if m.Pkg() != nil && m.Pkg() != tr.pkg { + continue // embedded method from another package + } + if !tr.seenMethods[m] { + if tr.seenMethods == nil { + tr.seenMethods = make(map[*types.Func]bool) } + tr.seenMethods[m] = true + tr.object(path, offset, opMethod, i, m) } } + + case *types.TypeParam: + tname := T.Obj() + if tname.Pkg() != nil && tname.Pkg() != tr.pkg { + return // type parameter from another package + } + if !tr.seenTParamNames[tname] { + if tr.seenTParamNames == nil { + tr.seenTParamNames = make(map[*types.TypeName]bool) + } + tr.seenTParamNames[tname] = true + tr.object(path, offset, opObj, -1, tname) + tr.typ(path, offset, opConstraint, -1, T.Constraint()) + } } +} - return "", fmt.Errorf("can't find path for %v in %s", obj, pkg.Path()) +func (tr *traversal) tparams(list *types.TypeParamList, path []byte, offset uint32, op byte) { + for i := 0; i < list.Len(); i++ { + tr.typ(path, offset, op, i, list.At(i)) + } +} + +// typ descends the type graph edge (op, index), then proceeds to traverse type t. +func (tr *traversal) typ(path []byte, offset uint32, op byte, index int, t types.Type) { + if tr.ix == nil { + path = appendOpArg(path, op, index) + } else { + offset = tr.ix.emitPathSegment(offset, op, index) + } + tr.visitType(path, offset, t) +} + +// object descends the type graph edge (op, index), records object +// obj, then proceeds to traverse its type. +func (tr *traversal) object(path []byte, offset uint32, op byte, index int, obj types.Object) { + if tr.ix == nil { + path = appendOpArg(path, op, index) + if obj == tr.target && tr.found == "" { + tr.found = Path(path) + } + path = append(path, opType) + } else { + offset = tr.ix.emitPathSegment(offset, op, index) + if _, ok := tr.ix.offsets[obj]; !ok { + tr.ix.offsets[obj] = offset + } + offset = tr.ix.emitPathSegment(offset, opType, -1) + } + tr.visitType(path, offset, obj.Type()) +} + +// emitPackageLevel encodes a record for a package-level symbol, +// identified by its index in ix.scopeNames. +func (p *pkgIndex) emitPackageLevel(index int) uint32 { + off := uint32(len(p.data)) + p.data = append(p.data, 0) // zero varint => no parent + p.data = binary.AppendUvarint(p.data, uint64(index)) + return off +} + +// emitPathSegment emits a record for a non-initial object path segment. +func (p *pkgIndex) emitPathSegment(parent uint32, op byte, index int) uint32 { + off := uint32(len(p.data)) + p.data = binary.AppendUvarint(p.data, uint64(parent)) + p.data = append(p.data, op) + switch op { + case opAt, opField, opMethod, opTypeParam, opRecvTypeParam: + p.data = binary.AppendUvarint(p.data, uint64(index)) + } + return off +} + +// path returns the Path for the encoded node at the specified offset. +func (p *pkgIndex) path(offset uint32) Path { + var elems []string // path elements in reverse + for { + // Read parent index. + parent, n := binary.Uvarint(p.data[offset:]) + offset += uint32(n) + + if parent == 0 { + break // root (end of path) + } + + op := p.data[offset] + offset++ + + // The [AFMTr] operators have a numeric operand. + switch op { + case opAt, opField, opMethod, opTypeParam, opRecvTypeParam: + val, n := binary.Uvarint(p.data[offset:]) + offset += uint32(n) + elems = append(elems, strconv.Itoa(int(val))) + } + + elems = append(elems, string([]byte{op})) + + offset = uint32(parent) + } + idx, _ := binary.Uvarint(p.data[offset:]) + + // Convert index to Path string. + name := p.scopeNames[idx] + sz := len(name) + for _, elem := range elems { + sz += len(elem) + } + var buf strings.Builder + buf.Grow(sz) + buf.WriteString(name) + for _, elem := range slices.Backward(elems) { + buf.WriteString(elem) + } + return Path(buf.String()) } -func appendOpArg(path []byte, op byte, arg int) []byte { +// appendOpArg appends (op, index) to the object path. +// A negative index is ignored. +func appendOpArg(path []byte, op byte, index int) []byte { path = append(path, op) - path = strconv.AppendInt(path, int64(arg), 10) + if index >= 0 { + path = strconv.AppendInt(path, int64(index), 10) + } return path } @@ -442,138 +723,6 @@ func (enc *Encoder) concreteMethod(meth *types.Func) (Path, bool) { // panic(fmt.Sprintf("couldn't find method %s on type %s; methods: %#v", meth, named, enc.namedMethods(named))) } -// find finds obj within type T, returning the path to it, or nil if not found. -// -// The seen map is used to short circuit cycles through type parameters. If -// nil, it will be allocated as necessary. -// -// The seenMethods map is used internally to short circuit cycles through -// interface methods, such as occur in the following example: -// -// type I interface { f() interface{I} } -// -// See golang/go#68046 for details. -func find(obj types.Object, T types.Type, path []byte) []byte { - return (&finder{obj: obj}).find(T, path) -} - -// finder closes over search state for a call to find. -type finder struct { - obj types.Object // the sought object - seenTParamNames map[*types.TypeName]bool // for cycle breaking through type parameters - seenMethods map[*types.Func]bool // for cycle breaking through recursive interfaces -} - -func (f *finder) find(T types.Type, path []byte) []byte { - switch T := T.(type) { - case *types.Alias: - return f.find(types.Unalias(T), path) - case *types.Basic, *types.Named: - // Named types belonging to pkg were handled already, - // so T must belong to another package. No path. - return nil - case *types.Pointer: - return f.find(T.Elem(), append(path, opElem)) - case *types.Slice: - return f.find(T.Elem(), append(path, opElem)) - case *types.Array: - return f.find(T.Elem(), append(path, opElem)) - case *types.Chan: - return f.find(T.Elem(), append(path, opElem)) - case *types.Map: - if r := f.find(T.Key(), append(path, opKey)); r != nil { - return r - } - return f.find(T.Elem(), append(path, opElem)) - case *types.Signature: - if r := f.findTypeParam(T.RecvTypeParams(), path, opRecvTypeParam); r != nil { - return r - } - if r := f.findTypeParam(T.TypeParams(), path, opTypeParam); r != nil { - return r - } - if r := f.find(T.Params(), append(path, opParams)); r != nil { - return r - } - return f.find(T.Results(), append(path, opResults)) - case *types.Struct: - for i := 0; i < T.NumFields(); i++ { - fld := T.Field(i) - path2 := appendOpArg(path, opField, i) - if fld == f.obj { - return path2 // found field var - } - if r := f.find(fld.Type(), append(path2, opType)); r != nil { - return r - } - } - return nil - case *types.Tuple: - for i := 0; i < T.Len(); i++ { - v := T.At(i) - path2 := appendOpArg(path, opAt, i) - if v == f.obj { - return path2 // found param/result var - } - if r := f.find(v.Type(), append(path2, opType)); r != nil { - return r - } - } - return nil - case *types.Interface: - for i := 0; i < T.NumMethods(); i++ { - m := T.Method(i) - if f.seenMethods[m] { - continue // break cycles (see TestIssue70418) - } - path2 := appendOpArg(path, opMethod, i) - if m == f.obj { - return path2 // found interface method - } - if f.seenMethods == nil { - f.seenMethods = make(map[*types.Func]bool) - } - f.seenMethods[m] = true - if r := f.find(m.Type(), append(path2, opType)); r != nil { - return r - } - } - return nil - case *types.TypeParam: - name := T.Obj() - if f.seenTParamNames[name] { - return nil - } - if name == f.obj { - return append(path, opObj) - } - if f.seenTParamNames == nil { - f.seenTParamNames = make(map[*types.TypeName]bool) - } - f.seenTParamNames[name] = true - if r := f.find(T.Constraint(), append(path, opConstraint)); r != nil { - return r - } - return nil - } - panic(T) -} - -func findTypeParam(obj types.Object, list *types.TypeParamList, path []byte, op byte) []byte { - return (&finder{obj: obj}).findTypeParam(list, path, op) -} - -func (f *finder) findTypeParam(list *types.TypeParamList, path []byte, op byte) []byte { - for i := 0; i < list.Len(); i++ { - tparam := list.At(i) - path2 := appendOpArg(path, op, i) - if r := f.find(tparam, path2); r != nil { - return r - } - } - return nil -} - // Object returns the object denoted by path p within the package pkg. func Object(pkg *types.Package, p Path) (types.Object, error) { pathstr := string(p) @@ -708,7 +857,7 @@ func Object(pkg *types.Package, p Path) (types.Object, error) { } tparams := hasTypeParams.TypeParams() if n := tparams.Len(); index >= n { - return nil, fmt.Errorf("tuple index %d out of range [0-%d)", index, n) + return nil, fmt.Errorf("type parameter index %d out of range [0-%d)", index, n) } t = tparams.At(index) @@ -719,7 +868,7 @@ func Object(pkg *types.Package, p Path) (types.Object, error) { } rtparams := sig.RecvTypeParams() if n := rtparams.Len(); index >= n { - return nil, fmt.Errorf("tuple index %d out of range [0-%d)", index, n) + return nil, fmt.Errorf("receiver type parameter index %d out of range [0-%d)", index, n) } t = rtparams.At(index) @@ -794,23 +943,3 @@ func Object(pkg *types.Package, p Path) (types.Object, error) { return obj, nil // success } - -// scopeObjects is a memoization of scope objects. -// Callers must not modify the result. -func (enc *Encoder) scopeObjects(scope *types.Scope) []types.Object { - m := enc.scopeMemo - if m == nil { - m = make(map[*types.Scope][]types.Object) - enc.scopeMemo = m - } - objs, ok := m[scope] - if !ok { - names := scope.Names() // allocates and sorts - objs = make([]types.Object, len(names)) - for i, name := range names { - objs[i] = scope.Lookup(name) - } - m[scope] = objs - } - return objs -} diff --git a/vendor/golang.org/x/tools/internal/gcimporter/ureader.go b/vendor/golang.org/x/tools/internal/gcimporter/ureader.go index 3db62b89089..5d3b7c867a3 100644 --- a/vendor/golang.org/x/tools/internal/gcimporter/ureader.go +++ b/vendor/golang.org/x/tools/internal/gcimporter/ureader.go @@ -11,6 +11,7 @@ import ( "go/token" "go/types" "sort" + "strings" "golang.org/x/tools/internal/aliases" "golang.org/x/tools/internal/pkgbits" @@ -523,6 +524,12 @@ func (pr *pkgReader) objIdx(idx pkgbits.Index) (*types.Package, string) { return objPkg, objName } + // TODO(mark): This, like the above splitVargenSuffix, is not ideal. + // Ignore generic methods promoted to global scope. + if strings.Contains(objName, ".") { + return objPkg, objName + } + if objPkg.Scope().Lookup(objName) == nil { dict := pr.objDictIdx(idx) @@ -554,15 +561,11 @@ func (pr *pkgReader) objIdx(idx pkgbits.Index) (*types.Package, string) { case pkgbits.ObjFunc: pos := r.pos() - var rtparams []*types.TypeParam - var recv *types.Var - if r.Version().Has(pkgbits.GenericMethods) && r.Bool() { - r.selector() - rtparams = r.typeParamNames(true) - recv = r.param() + if r.Version().Has(pkgbits.GenericMethods) { + assert(!r.Bool()) // generic methods are read in their defining type } tparams := r.typeParamNames(false) - sig := r.signature(recv, rtparams, tparams) + sig := r.signature(nil, nil, tparams) declare(types.NewFunc(pos, objPkg, objName, sig)) case pkgbits.ObjType: @@ -630,6 +633,29 @@ func (pr *pkgReader) objIdx(idx pkgbits.Index) (*types.Package, string) { named.AddMethod(r.method()) } + if r.Version().Has(pkgbits.GenericMethods) { + for range r.Len() { + // Careful: objIdx is used to read in package-scoped declarations, which + // methods are not. Instead, decode it here. This makes it easier to + // associate it with the type and avoids the main objIdx loop. + idx := r.Reloc(pkgbits.RelocObj) + + r := pr.tempReader(pkgbits.RelocObj, idx, pkgbits.SyncObject1) + r.dict = pr.objDictIdx(idx) + + pos := r.pos() + assert(r.Bool()) // generic method + pkg, name := r.selector() + rtparams := r.typeParamNames(true) + recv := r.param() + tparams := r.typeParamNames(false) + sig := r.signature(recv, rtparams, tparams) + + pr.retireReader(r) + named.AddMethod(types.NewFunc(pos, pkg, name, sig)) + } + } + case pkgbits.ObjVar: pos := r.pos() typ := r.typ() @@ -653,7 +679,7 @@ func (pr *pkgReader) objDictIdx(idx pkgbits.Index) *readerDict { } nreceivers := 0 - if r.Version().Has(pkgbits.GenericMethods) && r.Bool() { + if r.Version().Has(pkgbits.GenericMethods) { nreceivers = r.Len() } nexplicits := r.Len() diff --git a/vendor/golang.org/x/tools/internal/gocommand/version.go b/vendor/golang.org/x/tools/internal/gocommand/version.go index cce290c4194..d82f13a7e60 100644 --- a/vendor/golang.org/x/tools/internal/gocommand/version.go +++ b/vendor/golang.org/x/tools/internal/gocommand/version.go @@ -8,6 +8,7 @@ import ( "context" "fmt" "regexp" + "slices" "strings" ) @@ -41,9 +42,9 @@ func GoVersion(ctx context.Context, inv Invocation, r *Runner) (int, error) { } // Split up "[go1.1 go1.15]" and return highest go1.X value. tags := strings.Fields(stdout[1 : len(stdout)-2]) - for i := len(tags) - 1; i >= 0; i-- { + for _, tag := range slices.Backward(tags) { var version int - if _, err := fmt.Sscanf(tags[i], "go1.%d", &version); err != nil { + if _, err := fmt.Sscanf(tag, "go1.%d", &version); err != nil { continue } return version, nil diff --git a/vendor/modules.txt b/vendor/modules.txt index 5818a5e8e13..f0fb58f94ce 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1628,7 +1628,7 @@ go4.org/intern # go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 ## explicit; go 1.11 go4.org/unsafe/assume-no-moving-gc -# golang.org/x/crypto v0.51.0 +# golang.org/x/crypto v0.53.0 ## explicit; go 1.25.0 golang.org/x/crypto/argon2 golang.org/x/crypto/bcrypt @@ -1647,10 +1647,10 @@ golang.org/x/crypto/pkcs12/internal/rc2 ## explicit; go 1.25.0 golang.org/x/exp/constraints golang.org/x/exp/slices -# golang.org/x/mod v0.35.0 +# golang.org/x/mod v0.36.0 ## explicit; go 1.25.0 golang.org/x/mod/semver -# golang.org/x/net v0.55.0 +# golang.org/x/net v0.56.0 ## explicit; go 1.25.0 golang.org/x/net/bpf golang.org/x/net/context @@ -1688,13 +1688,13 @@ golang.org/x/oauth2/jwt golang.org/x/sync/errgroup golang.org/x/sync/semaphore golang.org/x/sync/singleflight -# golang.org/x/sys v0.45.0 +# golang.org/x/sys v0.46.0 ## explicit; go 1.25.0 golang.org/x/sys/cpu golang.org/x/sys/unix golang.org/x/sys/windows golang.org/x/sys/windows/registry -# golang.org/x/text v0.37.0 +# golang.org/x/text v0.38.0 ## explicit; go 1.25.0 golang.org/x/text/cases golang.org/x/text/internal @@ -1710,7 +1710,7 @@ golang.org/x/text/unicode/norm # golang.org/x/time v0.15.0 ## explicit; go 1.25.0 golang.org/x/time/rate -# golang.org/x/tools v0.44.0 +# golang.org/x/tools v0.45.0 ## explicit; go 1.25.0 golang.org/x/tools/go/ast/edge golang.org/x/tools/go/ast/inspector