diff --git a/src/crypto/tls/cfkem.go b/src/crypto/tls/cfkem.go
new file mode 100644
index 00000000000..cc6dc016864
--- /dev/null
+++ b/src/crypto/tls/cfkem.go
@@ -0,0 +1,113 @@
+// Copyright 2022 Cloudflare, Inc. All rights reserved. Use of this source code
+// is governed by a BSD-style license that can be found in the LICENSE file.
+//
+// Glue to add Circl's (post-quantum) hybrid KEMs.
+//
+// To enable, set CurvePreferences with the desired scheme as the first
+// element, using the CurveIDs defined below:
+//
+//	import (
+//		"crypto/tls"
+//
+//	[...]
+//
+//	config.CurvePreferences = []tls.CurveID{
+//		tls.X25519Kyber512Draft00,
+//		tls.X25519,
+//		tls.CurveP256,
+//	}
+
+package tls
+
+import (
+	"fmt"
+	"io"
+
+	"crypto/ecdh"
+
+	"github.com/cloudflare/circl/kem"
+	"github.com/cloudflare/circl/kem/hybrid"
+)
+
+// Either *ecdh.PrivateKey or kem.PrivateKey
+type clientKeySharePrivate interface{}
+
+var (
+	X25519Kyber512Draft00 = CurveID(0xfe30)
+	X25519Kyber768Draft00 = CurveID(0xfe31)
+	P256Kyber768Draft00   = CurveID(0xfe32)
+	invalidCurveID        = CurveID(0)
+)
+
+func kemSchemeKeyToCurveID(s kem.Scheme) CurveID {
+	switch s.Name() {
+	case "Kyber512-X25519":
+		return X25519Kyber512Draft00
+	case "Kyber768-X25519":
+		return X25519Kyber768Draft00
+	case "P256Kyber768Draft00":
+		return P256Kyber768Draft00
+	default:
+		return invalidCurveID
+	}
+}
+
+// Extracts the CurveID from a clientKeySharePrivate.
+func clientKeySharePrivateCurveID(ks clientKeySharePrivate) CurveID {
+	switch v := ks.(type) {
+	case kem.PrivateKey:
+		ret := kemSchemeKeyToCurveID(v.Scheme())
+		if ret == invalidCurveID {
+			panic("cfkem: internal error: don't know CurveID for this KEM")
+		}
+		return ret
+	case *ecdh.PrivateKey:
+		ret, ok := curveIDForCurve(v.Curve())
+		if !ok {
+			panic("cfkem: internal error: unknown curve")
+		}
+		return ret
+	default:
+		panic("cfkem: internal error: unknown clientKeySharePrivate")
+	}
+}
+
+// Returns the Circl scheme for the given CurveID, or nil if unsupported.
+func curveIdToCirclScheme(id CurveID) kem.Scheme {
+	switch id {
+	case X25519Kyber512Draft00:
+		return hybrid.Kyber512X25519()
+	case X25519Kyber768Draft00:
+		return hybrid.Kyber768X25519()
+	case P256Kyber768Draft00:
+		return hybrid.P256Kyber768Draft00()
+	}
+	return nil
+}
+
+// Generates a new shared secret and encapsulates it for the packed public key
+// ppk using randomness from rnd. The returned alert is only meaningful when
+// err is non-nil.
+func encapsulateForKem(scheme kem.Scheme, rnd io.Reader, ppk []byte) (
+	ct, ss []byte, alert alert, err error) {
+	pk, err := scheme.UnmarshalBinaryPublicKey(ppk)
+	if err != nil {
+		return nil, nil, alertIllegalParameter, fmt.Errorf("unpack pk: %w", err)
+	}
+	seed := make([]byte, scheme.EncapsulationSeedSize())
+	if _, err := io.ReadFull(rnd, seed); err != nil {
+		return nil, nil, alertInternalError, fmt.Errorf("random: %w", err)
+	}
+	ct, ss, err = scheme.EncapsulateDeterministically(pk, seed)
+	return ct, ss, alertIllegalParameter, err
+}
+
+// Generates a new keypair using randomness from rnd.
+func generateKemKeyPair(scheme kem.Scheme, rnd io.Reader) (
+	kem.PublicKey, kem.PrivateKey, error) {
+	seed := make([]byte, scheme.SeedSize())
+	if _, err := io.ReadFull(rnd, seed); err != nil {
+		return nil, nil, err
+	}
+	pk, sk := scheme.DeriveKeyPair(seed)
+	return pk, sk, nil
+}
diff --git a/src/crypto/tls/cfkem_test.go b/src/crypto/tls/cfkem_test.go
new file mode 100644
index 00000000000..f3877229903
--- /dev/null
+++ b/src/crypto/tls/cfkem_test.go
@@ -0,0 +1,119 @@
+// Copyright 2022 Cloudflare, Inc.
All rights reserved. Use of this source code +// is governed by a BSD-style license that can be found in the LICENSE file. + +package tls + +import ( + "fmt" + "testing" + + "github.com/cloudflare/circl/kem" + "github.com/cloudflare/circl/kem/hybrid" +) + +func testHybridKEX(t *testing.T, scheme kem.Scheme, clientPQ, serverPQ, + clientTLS12, serverTLS12 bool) { + var clientSelectedKEX *CurveID + var retry bool + + rsaCert := Certificate{ + Certificate: [][]byte{testRSACertificate}, + PrivateKey: testRSAPrivateKey, + } + serverCerts := []Certificate{rsaCert} + + clientConfig := testConfig.Clone() + if clientPQ { + clientConfig.CurvePreferences = []CurveID{ + kemSchemeKeyToCurveID(scheme), + X25519, + } + } + clientConfig.CFEventHandler = func(ev CFEvent) { + switch e := ev.(type) { + case CFEventTLSNegotiatedNamedKEX: + clientSelectedKEX = &e.KEX + case CFEventTLS13HRR: + retry = true + } + } + if clientTLS12 { + clientConfig.MaxVersion = VersionTLS12 + } + + serverConfig := testConfig.Clone() + if serverPQ { + serverConfig.CurvePreferences = []CurveID{ + kemSchemeKeyToCurveID(scheme), + X25519, + } + } + if serverTLS12 { + serverConfig.MaxVersion = VersionTLS12 + } + serverConfig.Certificates = serverCerts + + c, s := localPipe(t) + done := make(chan error) + defer c.Close() + + go func() { + defer s.Close() + done <- Server(s, serverConfig).Handshake() + }() + + cli := Client(c, clientConfig) + clientErr := cli.Handshake() + serverErr := <-done + if clientErr != nil { + t.Errorf("client error: %s", clientErr) + } + if serverErr != nil { + t.Errorf("server error: %s", serverErr) + } + + var expectedKEX CurveID + var expectedRetry bool + + if clientPQ && serverPQ && !clientTLS12 && !serverTLS12 { + expectedKEX = kemSchemeKeyToCurveID(scheme) + } else { + expectedKEX = X25519 + } + if !clientTLS12 && clientPQ && !serverPQ { + expectedRetry = true + } + + if clientSelectedKEX == nil { + t.Error("No KEX happened?") + } + + if *clientSelectedKEX != expectedKEX { + t.Errorf("failed to negotiate: expected %d, got %d", + expectedKEX, *clientSelectedKEX) + } + if expectedRetry != retry { + t.Errorf("Expected retry=%v, got retry=%v", expectedRetry, retry) + } +} + +func TestHybridKEX(t *testing.T) { + run := func(scheme kem.Scheme, clientPQ, serverPQ, clientTLS12, serverTLS12 bool) { + t.Run(fmt.Sprintf("%s serverPQ:%v clientPQ:%v serverTLS12:%v clientTLS12:%v", scheme.Name(), + serverPQ, clientPQ, serverTLS12, clientTLS12), func(t *testing.T) { + testHybridKEX(t, scheme, clientPQ, serverPQ, clientTLS12, serverTLS12) + }) + } + for _, scheme := range []kem.Scheme{ + hybrid.Kyber512X25519(), + hybrid.Kyber768X25519(), + hybrid.P256Kyber768Draft00(), + } { + run(scheme, true, true, false, false) + run(scheme, true, false, false, false) + run(scheme, false, true, false, false) + run(scheme, true, true, true, false) + run(scheme, true, true, false, true) + run(scheme, true, true, true, true) + } +} diff --git a/src/crypto/tls/handshake_client.go b/src/crypto/tls/handshake_client.go index a5b43b96ca8..cfe1c25ec31 100644 --- a/src/crypto/tls/handshake_client.go +++ b/src/crypto/tls/handshake_client.go @@ -8,7 +8,6 @@ import ( "bytes" "context" "crypto" - "crypto/ecdh" "crypto/ecdsa" "crypto/ed25519" "crypto/rsa" @@ -38,7 +37,7 @@ type clientHandshakeState struct { var testingOnlyForceClientHelloSignatureAlgorithms []SignatureScheme -func (c *Conn) makeClientHello(minVersion uint16) (*clientHelloMsg, *ecdh.PrivateKey, error) { +func (c *Conn) makeClientHello(minVersion uint16) (*clientHelloMsg, 
clientKeySharePrivate, error) { config := c.config if len(config.ServerName) == 0 && !config.InsecureSkipVerify { return nil, nil, errors.New("tls: either ServerName or InsecureSkipVerify must be specified in the tls.Config") @@ -127,7 +126,7 @@ func (c *Conn) makeClientHello(minVersion uint16) (*clientHelloMsg, *ecdh.Privat hello.supportedSignatureAlgorithms = testingOnlyForceClientHelloSignatureAlgorithms } - var key *ecdh.PrivateKey + var secret clientKeySharePrivate if hello.supportedVersions[0] == VersionTLS13 { if hasAESGCMHardwareSupport { hello.cipherSuites = append(hello.cipherSuites, defaultCipherSuitesTLS13...) @@ -136,19 +135,36 @@ func (c *Conn) makeClientHello(minVersion uint16) (*clientHelloMsg, *ecdh.Privat } curveID := config.curvePreferences()[0] - if _, ok := curveForCurveID(curveID); !ok { - return nil, nil, errors.New("tls: CurvePreferences includes unsupported curve") - } - key, err = generateECDHEKey(config.rand(), curveID) - if err != nil { - return nil, nil, err + if scheme := curveIdToCirclScheme(curveID); scheme != nil { + pk, sk, err := generateKemKeyPair(scheme, config.rand()) + if err != nil { + return nil, nil, fmt.Errorf("generateKemKeyPair %s: %w", + scheme.Name(), err) + } + packedPk, err := pk.MarshalBinary() + if err != nil { + return nil, nil, fmt.Errorf("pack circl public key %s: %w", + scheme.Name(), err) + } + hello.keyShares = []keyShare{{group: curveID, data: packedPk}} + secret = sk + } else { + if _, ok := curveForCurveID(curveID); !ok { + return nil, nil, errors.New("tls: CurvePreferences includes unsupported curve") + } + key, err := generateECDHEKey(config.rand(), curveID) + if err != nil { + return nil, nil, err + } + hello.keyShares = []keyShare{{group: curveID, data: key.PublicKey().Bytes()}} + secret = key } - hello.keyShares = []keyShare{{group: curveID, data: key.PublicKey().Bytes()}} + hello.delegatedCredentialSupported = config.SupportDelegatedCredential hello.supportedSignatureAlgorithmsDC = supportedSignatureAlgorithmsDC } - return hello, key, nil + return hello, secret, nil } func (c *Conn) clientHandshake(ctx context.Context) (err error) { @@ -239,16 +255,16 @@ func (c *Conn) clientHandshake(ctx context.Context) (err error) { if c.vers == VersionTLS13 { hs := &clientHandshakeStateTLS13{ - c: c, - ctx: ctx, - serverHello: serverHello, - hello: hello, - ecdheKey: ecdheKey, - helloInner: helloInner, - session: session, - earlySecret: earlySecret, - binderKey: binderKey, - hsTimings: hsTimings, + c: c, + ctx: ctx, + serverHello: serverHello, + hello: hello, + helloInner: helloInner, + keySharePrivate: ecdheKey, + session: session, + earlySecret: earlySecret, + binderKey: binderKey, + hsTimings: hsTimings, } // In TLS 1.3, session tickets are delivered after the handshake. 
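The hunks above let the first CurvePreferences entry name a Circl hybrid scheme, so enabling the post-quantum key exchange is purely a configuration change. A minimal client sketch, assuming a build against this patched crypto/tls (hostname and address are placeholders):

	package main

	import (
		"crypto/tls"
		"log"
	)

	func main() {
		cfg := &tls.Config{
			ServerName: "example.com", // placeholder
			// Offer the hybrid group first; X25519 stays as the
			// classical fallback, reached via HelloRetryRequest when
			// the server does not support the hybrid KEM.
			CurvePreferences: []tls.CurveID{
				tls.X25519Kyber768Draft00,
				tls.X25519,
			},
		}
		conn, err := tls.Dial("tcp", "example.com:443", cfg)
		if err != nil {
			log.Fatal(err)
		}
		defer conn.Close()
	}
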
@@ -581,6 +597,16 @@ func (hs *clientHandshakeState) doFullHandshake() error { return err } + if eccKex, ok := keyAgreement.(*ecdheKeyAgreement); ok { + curveId, ok := curveIDForCurve(eccKex.key.Curve()) + if !ok { + panic("internal error: unknown curve") + } + c.handleCFEvent(CFEventTLSNegotiatedNamedKEX{ + KEX: curveId, + }) + } + msg, err = c.readHandshake(&hs.finishedHash) if err != nil { return err diff --git a/src/crypto/tls/handshake_client_tls13.go b/src/crypto/tls/handshake_client_tls13.go index cc428c522ca..990fc83667b 100644 --- a/src/crypto/tls/handshake_client_tls13.go +++ b/src/crypto/tls/handshake_client_tls13.go @@ -16,19 +16,22 @@ import ( "fmt" "hash" "time" + + circlKem "github.com/cloudflare/circl/kem" ) type clientHandshakeStateTLS13 struct { - c *Conn - ctx context.Context - serverHello *serverHelloMsg - hello *clientHelloMsg - helloInner *clientHelloMsg - ecdheKey *ecdh.PrivateKey - - session *ClientSessionState - earlySecret []byte - binderKey []byte + c *Conn + ctx context.Context + serverHello *serverHelloMsg + hello *clientHelloMsg + helloInner *clientHelloMsg + keySharePrivate clientKeySharePrivate + + session *ClientSessionState + earlySecret []byte + binderKey []byte + selectedGroup CurveID certReq *certificateRequestMsgTLS13 usingPSK bool @@ -102,7 +105,7 @@ func (hs *clientHandshakeStateTLS13) handshake() error { } // Consistency check on the presence of a keyShare and its parameters. - if hs.ecdheKey == nil || len(hs.hello.keyShares) != 1 { + if hs.keySharePrivate == nil || len(hs.hello.keyShares) != 1 { return c.sendAlert(alertInternalError) } @@ -285,6 +288,8 @@ func (hs *clientHandshakeStateTLS13) sendDummyChangeCipherSpec() error { func (hs *clientHandshakeStateTLS13) processHelloRetryRequest() error { c := hs.c + c.handleCFEvent(CFEventTLS13HRR{}) + // The first ClientHello gets double-hashed into the transcript upon a // HelloRetryRequest. (The idea is that the server might offload transcript // storage to the client in the cookie.) See RFC 8446, Section 4.4.1. 
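The two events these hunks emit, CFEventTLSNegotiatedNamedKEX once a key share is agreed and CFEventTLS13HRR on a retry, surface through the fork's existing Config.CFEventHandler hook; cfkem_test.go uses the same pattern. A sketch of an application-side handler, extending the config sketch above (the log output is illustrative):

	cfg.CFEventHandler = func(ev tls.CFEvent) {
		switch e := ev.(type) {
		case tls.CFEventTLSNegotiatedNamedKEX:
			// e.KEX is the CurveID of the agreed group,
			// e.g. 0xfe31 for X25519Kyber768Draft00.
			log.Printf("negotiated KEX group: %#04x", uint16(e.KEX))
		case tls.CFEventTLS13HRR:
			log.Println("HelloRetryRequest: regenerating key share")
		}
	}
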
@@ -372,21 +377,38 @@ func (hs *clientHandshakeStateTLS13) processHelloRetryRequest() error { c.sendAlert(alertIllegalParameter) return errors.New("tls: server selected unsupported group") } - if sentID, _ := curveIDForCurve(hs.ecdheKey.Curve()); sentID == curveID { + if clientKeySharePrivateCurveID(hs.keySharePrivate) == curveID { c.sendAlert(alertIllegalParameter) return errors.New("tls: server sent an unnecessary HelloRetryRequest key_share") } - if _, ok := curveForCurveID(curveID); !ok { - c.sendAlert(alertInternalError) - return errors.New("tls: CurvePreferences includes unsupported curve") - } - key, err := generateECDHEKey(c.config.rand(), curveID) - if err != nil { - c.sendAlert(alertInternalError) - return err + if scheme := curveIdToCirclScheme(curveID); scheme != nil { + pk, sk, err := generateKemKeyPair(scheme, c.config.rand()) + if err != nil { + c.sendAlert(alertInternalError) + return fmt.Errorf("HRR generateKemKeyPair %s: %w", + scheme.Name(), err) + } + packedPk, err := pk.MarshalBinary() + if err != nil { + c.sendAlert(alertInternalError) + return fmt.Errorf("HRR pack circl public key %s: %w", + scheme.Name(), err) + } + hs.keySharePrivate = sk + hello.keyShares = []keyShare{{group: curveID, data: packedPk}} + } else { + if _, ok := curveForCurveID(curveID); !ok { + c.sendAlert(alertInternalError) + return errors.New("tls: CurvePreferences includes unsupported curve") + } + key, err := generateECDHEKey(c.config.rand(), curveID) + if err != nil { + c.sendAlert(alertInternalError) + return err + } + hs.keySharePrivate = key + hello.keyShares = []keyShare{{group: curveID, data: key.PublicKey().Bytes()}} } - hs.ecdheKey = key - hello.keyShares = []keyShare{{group: curveID, data: key.PublicKey().Bytes()}} } hello.raw = nil @@ -529,11 +551,15 @@ func (hs *clientHandshakeStateTLS13) processServerHello() error { c.sendAlert(alertIllegalParameter) return errors.New("tls: server did not send a key share") } - if sentID, _ := curveIDForCurve(hs.ecdheKey.Curve()); hs.serverHello.serverShare.group != sentID { + if hs.serverHello.serverShare.group != clientKeySharePrivateCurveID(hs.keySharePrivate) { c.sendAlert(alertIllegalParameter) return errors.New("tls: server selected unsupported group") } + c.handleCFEvent(CFEventTLSNegotiatedNamedKEX{ + KEX: hs.serverHello.serverShare.group, + }) + if !hs.serverHello.selectedIdentityPresent { return nil } @@ -577,15 +603,24 @@ func (hs *clientHandshakeStateTLS13) processServerHello() error { func (hs *clientHandshakeStateTLS13) establishHandshakeKeys() error { c := hs.c - peerKey, err := hs.ecdheKey.Curve().NewPublicKey(hs.serverHello.serverShare.data) - if err != nil { - c.sendAlert(alertIllegalParameter) - return errors.New("tls: invalid server key share") + var sharedKey []byte + var err error + if key, ok := hs.keySharePrivate.(*ecdh.PrivateKey); ok { + peerKey, err := key.Curve().NewPublicKey(hs.serverHello.serverShare.data) + if err == nil { + sharedKey, _ = key.ECDH(peerKey) + } + } else if sk, ok := hs.keySharePrivate.(circlKem.PrivateKey); ok { + sharedKey, err = sk.Scheme().Decapsulate(sk, hs.serverHello.serverShare.data) + if err != nil { + c.sendAlert(alertIllegalParameter) + return fmt.Errorf("%s decaps: %w", sk.Scheme().Name(), err) + } } - sharedKey, err := hs.ecdheKey.ECDH(peerKey) - if err != nil { + + if sharedKey == nil { c.sendAlert(alertIllegalParameter) - return errors.New("tls: invalid server key share") + return fmt.Errorf("tls: invalid server key share") } earlySecret := hs.earlySecret diff --git 
a/src/crypto/tls/handshake_server.go b/src/crypto/tls/handshake_server.go index 75429a00264..dcfcfb25646 100644 --- a/src/crypto/tls/handshake_server.go +++ b/src/crypto/tls/handshake_server.go @@ -634,6 +634,15 @@ func (hs *serverHandshakeState) doFullHandshake() error { c.sendAlert(alertHandshakeFailure) return err } + if eccKex, ok := keyAgreement.(*ecdheKeyAgreement); ok { + curveId, ok := curveIDForCurve(eccKex.key.Curve()) + if !ok { + panic("internal error: unknown curve") + } + c.handleCFEvent(CFEventTLSNegotiatedNamedKEX{ + KEX: curveId, + }) + } hs.masterSecret = masterFromPreMasterSecret(c.vers, hs.suite, preMasterSecret, hs.clientHello.random, hs.hello.random) if err := c.config.writeKeyLog(keyLogLabelTLS12, hs.clientHello.random, hs.masterSecret); err != nil { c.sendAlert(alertInternalError) diff --git a/src/crypto/tls/handshake_server_tls13.go b/src/crypto/tls/handshake_server_tls13.go index f1488a2fd81..228239fc295 100644 --- a/src/crypto/tls/handshake_server_tls13.go +++ b/src/crypto/tls/handshake_server_tls13.go @@ -33,6 +33,7 @@ type serverHandshakeStateTLS13 struct { suite *cipherSuiteTLS13 cert *Certificate sigAlg SignatureScheme + selectedGroup CurveID earlySecret []byte sharedKey []byte handshakeSecret []byte @@ -283,28 +284,39 @@ GroupSelection: clientKeyShare = &hs.clientHello.keyShares[0] } - if _, ok := curveForCurveID(selectedGroup); !ok { + if _, ok := curveForCurveID(selectedGroup); selectedGroup != X25519 && curveIdToCirclScheme(selectedGroup) == nil && !ok { c.sendAlert(alertInternalError) return errors.New("tls: CurvePreferences includes unsupported curve") } - key, err := generateECDHEKey(c.config.rand(), selectedGroup) - if err != nil { - c.sendAlert(alertInternalError) - return err - } - hs.hello.serverShare = keyShare{group: selectedGroup, data: key.PublicKey().Bytes()} - peerKey, err := key.Curve().NewPublicKey(clientKeyShare.data) - if err != nil { - c.sendAlert(alertIllegalParameter) - return errors.New("tls: invalid client key share") + if kem := curveIdToCirclScheme(selectedGroup); kem != nil { + ct, ss, alert, err := encapsulateForKem(kem, c.config.rand(), clientKeyShare.data) + if err != nil { + c.sendAlert(alert) + return fmt.Errorf("%s encap: %w", kem.Name(), err) + } + hs.hello.serverShare = keyShare{group: selectedGroup, data: ct} + hs.sharedKey = ss + } else { + key, err := generateECDHEKey(c.config.rand(), selectedGroup) + if err != nil { + c.sendAlert(alertInternalError) + return err + } + hs.hello.serverShare = keyShare{group: selectedGroup, data: key.PublicKey().Bytes()} + peerKey, err := key.Curve().NewPublicKey(clientKeyShare.data) + if err == nil { + hs.sharedKey, _ = key.ECDH(peerKey) + } } - hs.sharedKey, err = key.ECDH(peerKey) - if err != nil { + if hs.sharedKey == nil { c.sendAlert(alertIllegalParameter) return errors.New("tls: invalid client key share") } c.serverName = hs.clientHello.serverName + c.handleCFEvent(CFEventTLSNegotiatedNamedKEX{ + KEX: selectedGroup, + }) hs.hsTimings.ProcessClientHello = hs.hsTimings.elapsedTime() @@ -536,6 +548,8 @@ func (hs *serverHandshakeStateTLS13) sendDummyChangeCipherSpec() error { func (hs *serverHandshakeStateTLS13) doHelloRetryRequest(selectedGroup CurveID) error { c := hs.c + c.handleCFEvent(CFEventTLS13HRR{}) + // The first ClientHello gets double-hashed into the transcript upon a // HelloRetryRequest. See RFC 8446, Section 4.4.1. 
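Together with the client-side changes, the server path above reduces to a plain KEM round trip: the ClientHello key_share carries the packed public key, encapsulateForKem produces the ciphertext for the ServerHello, and the client decapsulates to recover the same secret. Stripped of handshake plumbing, the flow against the vendored Circl API looks like this standalone sketch:

	package main

	import (
		"bytes"
		"fmt"

		"github.com/cloudflare/circl/kem/hybrid"
	)

	func main() {
		scheme := hybrid.Kyber768X25519()

		// Client: the packed public key becomes the ClientHello key_share.
		pk, sk, _ := scheme.GenerateKeyPair()

		// Server: ct rides in the ServerHello key_share; ss is the
		// concatenated X25519 ‖ Kyber768 shared secret.
		ct, ssServer, _ := scheme.Encapsulate(pk)

		// Client: decapsulation recovers the identical secret.
		ssClient, _ := scheme.Decapsulate(sk, ct)

		fmt.Println("secrets match:", bytes.Equal(ssServer, ssClient))
	}
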
if err := transcriptMsg(hs.clientHello, hs.transcript); err != nil { diff --git a/src/crypto/tls/key_agreement.go b/src/crypto/tls/key_agreement.go index 73f3eecf6dc..b33b0f2d9ab 100644 --- a/src/crypto/tls/key_agreement.go +++ b/src/crypto/tls/key_agreement.go @@ -169,7 +169,7 @@ type ecdheKeyAgreement struct { func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { var curveID CurveID for _, c := range clientHello.supportedCurves { - if config.supportsCurve(c) { + if config.supportsCurve(c) && curveIdToCirclScheme(c) == nil { curveID = c break } diff --git a/src/crypto/tls/tls_cf.go b/src/crypto/tls/tls_cf.go index ef9c8c7c187..65db04e02d4 100644 --- a/src/crypto/tls/tls_cf.go +++ b/src/crypto/tls/tls_cf.go @@ -223,3 +223,24 @@ type CFEventECHPublicNameMismatch struct{} func (e CFEventECHPublicNameMismatch) Name() string { return "ech public name does not match outer sni" } + +// For backwards compatibility. +type CFEventTLS13NegotiatedKEX = CFEventTLSNegotiatedNamedKEX + +// CFEventTLSNegotiatedNamedKEX is emitted when a key agreement mechanism has been +// established that uses a named group. This includes all key agreements +// in TLSv1.3, but excludes RSA and DH in TLS 1.2 and earlier. +type CFEventTLSNegotiatedNamedKEX struct { + KEX CurveID +} + +func (e CFEventTLSNegotiatedNamedKEX) Name() string { + return "CFEventTLSNegotiatedNamedKEX" +} + +// CFEventTLS13HRR is emitted when a HRR is sent or received +type CFEventTLS13HRR struct{} + +func (e CFEventTLS13HRR) Name() string { + return "CFEventTLS13HRR" +} diff --git a/src/vendor/github.com/cloudflare/circl/kem/hybrid/ckem.go b/src/vendor/github.com/cloudflare/circl/kem/hybrid/ckem.go new file mode 100644 index 00000000000..c0620e8db99 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/kem/hybrid/ckem.go @@ -0,0 +1,207 @@ +package hybrid + +// TODO move over to crypto/ecdh once we can assume Go 1.20. 
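ckem.go below builds the P-256 KEM on the deprecated crypto/elliptic scalar-multiplication API, hence the TODO above. For comparison, the crypto/ecdh (Go 1.20+) equivalent of the shared-secret computation that cPublicKey.X performs would be roughly the following hypothetical helper (not part of this patch):

	import "crypto/ecdh"

	// sharedP256 returns the X coordinate of the shared point, which is
	// what cPublicKey.X computes below via elliptic.ScalarMult.
	func sharedP256(sk *ecdh.PrivateKey, peer []byte) ([]byte, error) {
		pk, err := ecdh.P256().NewPublicKey(peer) // rejects invalid points
		if err != nil {
			return nil, err
		}
		return sk.ECDH(pk)
	}
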
+ +import ( + "crypto/elliptic" + cryptoRand "crypto/rand" + "crypto/subtle" + "math/big" + + "github.com/cloudflare/circl/kem" + "github.com/cloudflare/circl/xof" +) + +type cPublicKey struct { + scheme *cScheme + x, y *big.Int +} +type cPrivateKey struct { + scheme *cScheme + key []byte +} +type cScheme struct { + curve elliptic.Curve +} + +var p256Kem = &cScheme{elliptic.P256()} + +func (sch *cScheme) scSize() int { + return (sch.curve.Params().N.BitLen() + 7) / 8 +} + +func (sch *cScheme) ptSize() int { + return (sch.curve.Params().BitSize + 7) / 8 +} + +func (sch *cScheme) Name() string { + return sch.curve.Params().Name +} + +func (sch *cScheme) PublicKeySize() int { + return 2*sch.ptSize() + 1 +} + +func (sch *cScheme) PrivateKeySize() int { + return sch.scSize() +} + +func (sch *cScheme) SeedSize() int { + return sch.PrivateKeySize() +} + +func (sch *cScheme) SharedKeySize() int { + return sch.ptSize() +} + +func (sch *cScheme) CiphertextSize() int { + return sch.PublicKeySize() +} + +func (sch *cScheme) EncapsulationSeedSize() int { + return sch.SeedSize() +} + +func (sk *cPrivateKey) Scheme() kem.Scheme { return sk.scheme } +func (pk *cPublicKey) Scheme() kem.Scheme { return pk.scheme } + +func (sk *cPrivateKey) MarshalBinary() ([]byte, error) { + ret := make([]byte, len(sk.key)) + copy(ret, sk.key) + return ret, nil +} + +func (sk *cPrivateKey) Equal(other kem.PrivateKey) bool { + oth, ok := other.(*cPrivateKey) + if !ok { + return false + } + if oth.scheme != sk.scheme { + return false + } + return subtle.ConstantTimeCompare(oth.key, sk.key) == 1 +} + +func (sk *cPrivateKey) Public() kem.PublicKey { + x, y := sk.scheme.curve.ScalarBaseMult(sk.key) + return &cPublicKey{ + sk.scheme, + x, + y, + } +} + +func (pk *cPublicKey) Equal(other kem.PublicKey) bool { + oth, ok := other.(*cPublicKey) + if !ok { + return false + } + if oth.scheme != pk.scheme { + return false + } + return oth.x.Cmp(pk.x) == 0 && oth.y.Cmp(pk.y) == 0 +} + +func (pk *cPublicKey) MarshalBinary() ([]byte, error) { + return elliptic.Marshal(pk.scheme.curve, pk.x, pk.y), nil +} + +func (sch *cScheme) GenerateKeyPair() (kem.PublicKey, kem.PrivateKey, error) { + seed := make([]byte, sch.SeedSize()) + _, err := cryptoRand.Read(seed) + if err != nil { + return nil, nil, err + } + pk, sk := sch.DeriveKeyPair(seed) + return pk, sk, nil +} + +func (sch *cScheme) DeriveKeyPair(seed []byte) (kem.PublicKey, kem.PrivateKey) { + if len(seed) != sch.SeedSize() { + panic(kem.ErrSeedSize) + } + h := xof.SHAKE256.New() + _, _ = h.Write(seed) + key, x, y, err := elliptic.GenerateKey(sch.curve, h) + if err != nil { + panic(err) + } + + sk := cPrivateKey{scheme: sch, key: key} + pk := cPublicKey{scheme: sch, x: x, y: y} + + return &pk, &sk +} + +func (sch *cScheme) Encapsulate(pk kem.PublicKey) (ct, ss []byte, err error) { + seed := make([]byte, sch.EncapsulationSeedSize()) + _, err = cryptoRand.Read(seed) + if err != nil { + return + } + return sch.EncapsulateDeterministically(pk, seed) +} + +func (pk *cPublicKey) X(sk *cPrivateKey) []byte { + if pk.scheme != sk.scheme { + panic(kem.ErrTypeMismatch) + } + + sharedKey := make([]byte, pk.scheme.SharedKeySize()) + xShared, _ := pk.scheme.curve.ScalarMult(pk.x, pk.y, sk.key) + xShared.FillBytes(sharedKey) + return sharedKey +} + +func (sch *cScheme) EncapsulateDeterministically( + pk kem.PublicKey, seed []byte, +) (ct, ss []byte, err error) { + if len(seed) != sch.EncapsulationSeedSize() { + return nil, nil, kem.ErrSeedSize + } + pub, ok := pk.(*cPublicKey) + if !ok || pub.scheme != 
sch {
+		return nil, nil, kem.ErrTypeMismatch
+	}
+
+	pk2, sk2 := sch.DeriveKeyPair(seed)
+	ss = pub.X(sk2.(*cPrivateKey))
+	ct, _ = pk2.MarshalBinary()
+	return
+}
+
+func (sch *cScheme) Decapsulate(sk kem.PrivateKey, ct []byte) ([]byte, error) {
+	if len(ct) != sch.CiphertextSize() {
+		return nil, kem.ErrCiphertextSize
+	}
+
+	priv, ok := sk.(*cPrivateKey)
+	if !ok || priv.scheme != sch {
+		return nil, kem.ErrTypeMismatch
+	}
+
+	pk, err := sch.UnmarshalBinaryPublicKey(ct)
+	if err != nil {
+		return nil, err
+	}
+
+	ss := pk.(*cPublicKey).X(priv)
+	return ss, nil
+}
+
+func (sch *cScheme) UnmarshalBinaryPublicKey(buf []byte) (kem.PublicKey, error) {
+	if len(buf) != sch.PublicKeySize() {
+		return nil, kem.ErrPubKeySize
+	}
+	x, y := elliptic.Unmarshal(sch.curve, buf)
+	return &cPublicKey{sch, x, y}, nil
+}
+
+func (sch *cScheme) UnmarshalBinaryPrivateKey(buf []byte) (kem.PrivateKey, error) {
+	if len(buf) != sch.PrivateKeySize() {
+		return nil, kem.ErrPrivKeySize
+	}
+	ret := cPrivateKey{sch, make([]byte, sch.PrivateKeySize())}
+	copy(ret.key, buf)
+	return &ret, nil
+}
diff --git a/src/vendor/github.com/cloudflare/circl/kem/hybrid/hybrid.go b/src/vendor/github.com/cloudflare/circl/kem/hybrid/hybrid.go
new file mode 100644
index 00000000000..be8251c7f74
--- /dev/null
+++ b/src/vendor/github.com/cloudflare/circl/kem/hybrid/hybrid.go
@@ -0,0 +1,344 @@
+// Package hybrid defines several hybrid classical/quantum KEMs.
+//
+// KEMs are combined by simple concatenation of shared secrets, ciphertexts,
+// public keys, etc.; see
+//
+//	https://datatracker.ietf.org/doc/draft-ietf-tls-hybrid-design/
+//	https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-56Cr2.pdf
+//
+// Note that this is only fine if the shared secret is used in its entirety
+// in the next step, such as being hashed or used as a key.
+//
+// For deriving a KEM keypair deterministically and encapsulating
+// deterministically, we expand a single seed to both using SHAKE256,
+// so that a non-uniform seed (such as a shared secret generated by a hybrid
+// KEM where one of the KEMs is weak) doesn't impact just one of the KEMs.
+//
+// Of our XOF (SHAKE256), we desire two security properties:
+//
+//  1. The internal state of the XOF should be big enough so that we
+//     do not lose entropy.
+//  2. From one of the new seeds, we shouldn't be able to derive
+//     the other or the original seed.
+//
+// SHAKE256, and all siblings in the SHA3 family, have a 200B internal
+// state, so (1) is fine if our seeds are less than 200B.
+// If SHAKE256 is computationally indistinguishable from a random
+// sponge, then it affords us 256b security against (2) by the
+// flat sponge claim [https://keccak.team/files/SpongeFunctions.pdf].
+// None of the implemented schemes claim more than 256b security
+// and so SHAKE256 will do fine.
+package hybrid
+
+import (
+	"errors"
+
+	"github.com/cloudflare/circl/internal/sha3"
+	"github.com/cloudflare/circl/kem"
+	"github.com/cloudflare/circl/kem/kyber/kyber1024"
+	"github.com/cloudflare/circl/kem/kyber/kyber512"
+	"github.com/cloudflare/circl/kem/kyber/kyber768"
+)
+
+var ErrUninitialized = errors.New("public or private key not initialized")
+
+// Returns the hybrid KEM of Kyber512Draft00 and X25519.
+func Kyber512X25519() kem.Scheme { return kyber512X }
+
+// Returns the hybrid KEM of Kyber768Draft00 and X25519.
+func Kyber768X25519() kem.Scheme { return kyber768X }
+
+// Returns the hybrid KEM of Kyber768Draft00 and X448.
+func Kyber768X448() kem.Scheme { return kyber768X4 } + +// Returns the hybrid KEM of Kyber1024Draft00 and X448. +func Kyber1024X448() kem.Scheme { return kyber1024X } + +// Returns the hybrid KEM of Kyber768Draft00 and P-256. +func P256Kyber768Draft00() kem.Scheme { return p256Kyber768Draft00 } + +var p256Kyber768Draft00 kem.Scheme = &scheme{ + "P256Kyber768Draft00", + p256Kem, + kyber768.Scheme(), +} + +var kyber512X kem.Scheme = &scheme{ + "Kyber512-X25519", + x25519Kem, + kyber512.Scheme(), +} + +var kyber768X kem.Scheme = &scheme{ + "Kyber768-X25519", + x25519Kem, + kyber768.Scheme(), +} + +var kyber768X4 kem.Scheme = &scheme{ + "Kyber768-X448", + x448Kem, + kyber768.Scheme(), +} + +var kyber1024X kem.Scheme = &scheme{ + "Kyber1024-X448", + x448Kem, + kyber1024.Scheme(), +} + +// Public key of a hybrid KEM. +type publicKey struct { + scheme *scheme + first kem.PublicKey + second kem.PublicKey +} + +// Private key of a hybrid KEM. +type privateKey struct { + scheme *scheme + first kem.PrivateKey + second kem.PrivateKey +} + +// Scheme for a hybrid KEM. +type scheme struct { + name string + first kem.Scheme + second kem.Scheme +} + +func (sch *scheme) Name() string { return sch.name } +func (sch *scheme) PublicKeySize() int { + return sch.first.PublicKeySize() + sch.second.PublicKeySize() +} + +func (sch *scheme) PrivateKeySize() int { + return sch.first.PrivateKeySize() + sch.second.PrivateKeySize() +} + +func (sch *scheme) SeedSize() int { + first := sch.first.SeedSize() + second := sch.second.SeedSize() + ret := second + if first > second { + ret = first + } + return ret +} + +func (sch *scheme) SharedKeySize() int { + return sch.first.SharedKeySize() + sch.second.SharedKeySize() +} + +func (sch *scheme) CiphertextSize() int { + return sch.first.CiphertextSize() + sch.second.CiphertextSize() +} + +func (sch *scheme) EncapsulationSeedSize() int { + first := sch.first.EncapsulationSeedSize() + second := sch.second.EncapsulationSeedSize() + ret := second + if first > second { + ret = first + } + return ret +} + +func (sk *privateKey) Scheme() kem.Scheme { return sk.scheme } +func (pk *publicKey) Scheme() kem.Scheme { return pk.scheme } + +func (sk *privateKey) MarshalBinary() ([]byte, error) { + if sk.first == nil || sk.second == nil { + return nil, ErrUninitialized + } + first, err := sk.first.MarshalBinary() + if err != nil { + return nil, err + } + second, err := sk.second.MarshalBinary() + if err != nil { + return nil, err + } + return append(first, second...), nil +} + +func (sk *privateKey) Equal(other kem.PrivateKey) bool { + oth, ok := other.(*privateKey) + if !ok { + return false + } + if sk.first == nil && sk.second == nil && oth.first == nil && oth.second == nil { + return true + } + if sk.first == nil || sk.second == nil || oth.first == nil || oth.second == nil { + return false + } + return sk.first.Equal(oth.first) && sk.second.Equal(oth.second) +} + +func (sk *privateKey) Public() kem.PublicKey { + return &publicKey{sk.scheme, sk.first.Public(), sk.second.Public()} +} + +func (pk *publicKey) Equal(other kem.PublicKey) bool { + oth, ok := other.(*publicKey) + if !ok { + return false + } + if pk.first == nil && pk.second == nil && oth.first == nil && oth.second == nil { + return true + } + if pk.first == nil || pk.second == nil || oth.first == nil || oth.second == nil { + return false + } + return pk.first.Equal(oth.first) && pk.second.Equal(oth.second) +} + +func (pk *publicKey) MarshalBinary() ([]byte, error) { + if pk.first == nil || pk.second == nil { + return nil, 
ErrUninitialized + } + first, err := pk.first.MarshalBinary() + if err != nil { + return nil, err + } + second, err := pk.second.MarshalBinary() + if err != nil { + return nil, err + } + return append(first, second...), nil +} + +func (sch *scheme) GenerateKeyPair() (kem.PublicKey, kem.PrivateKey, error) { + pk1, sk1, err := sch.first.GenerateKeyPair() + if err != nil { + return nil, nil, err + } + pk2, sk2, err := sch.second.GenerateKeyPair() + if err != nil { + return nil, nil, err + } + + return &publicKey{sch, pk1, pk2}, &privateKey{sch, sk1, sk2}, nil +} + +func (sch *scheme) DeriveKeyPair(seed []byte) (kem.PublicKey, kem.PrivateKey) { + if len(seed) != sch.SeedSize() { + panic(kem.ErrSeedSize) + } + h := sha3.NewShake256() + _, _ = h.Write(seed) + first := make([]byte, sch.first.SeedSize()) + second := make([]byte, sch.second.SeedSize()) + _, _ = h.Read(first) + _, _ = h.Read(second) + + pk1, sk1 := sch.first.DeriveKeyPair(first) + pk2, sk2 := sch.second.DeriveKeyPair(second) + + return &publicKey{sch, pk1, pk2}, &privateKey{sch, sk1, sk2} +} + +func (sch *scheme) Encapsulate(pk kem.PublicKey) (ct, ss []byte, err error) { + pub, ok := pk.(*publicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + + ct1, ss1, err := sch.first.Encapsulate(pub.first) + if err != nil { + return nil, nil, err + } + + ct2, ss2, err := sch.second.Encapsulate(pub.second) + if err != nil { + return nil, nil, err + } + + return append(ct1, ct2...), append(ss1, ss2...), nil +} + +func (sch *scheme) EncapsulateDeterministically( + pk kem.PublicKey, seed []byte, +) (ct, ss []byte, err error) { + if len(seed) != sch.EncapsulationSeedSize() { + return nil, nil, kem.ErrSeedSize + } + + h := sha3.NewShake256() + _, _ = h.Write(seed) + first := make([]byte, sch.first.EncapsulationSeedSize()) + second := make([]byte, sch.second.EncapsulationSeedSize()) + _, _ = h.Read(first) + _, _ = h.Read(second) + + pub, ok := pk.(*publicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + + ct1, ss1, err := sch.first.EncapsulateDeterministically(pub.first, first) + if err != nil { + return nil, nil, err + } + ct2, ss2, err := sch.second.EncapsulateDeterministically(pub.second, second) + if err != nil { + return nil, nil, err + } + return append(ct1, ct2...), append(ss1, ss2...), nil +} + +func (sch *scheme) Decapsulate(sk kem.PrivateKey, ct []byte) ([]byte, error) { + if len(ct) != sch.CiphertextSize() { + return nil, kem.ErrCiphertextSize + } + + priv, ok := sk.(*privateKey) + if !ok { + return nil, kem.ErrTypeMismatch + } + + firstSize := sch.first.CiphertextSize() + ss1, err := sch.first.Decapsulate(priv.first, ct[:firstSize]) + if err != nil { + return nil, err + } + ss2, err := sch.second.Decapsulate(priv.second, ct[firstSize:]) + if err != nil { + return nil, err + } + return append(ss1, ss2...), nil +} + +func (sch *scheme) UnmarshalBinaryPublicKey(buf []byte) (kem.PublicKey, error) { + if len(buf) != sch.PublicKeySize() { + return nil, kem.ErrPubKeySize + } + firstSize := sch.first.PublicKeySize() + pk1, err := sch.first.UnmarshalBinaryPublicKey(buf[:firstSize]) + if err != nil { + return nil, err + } + pk2, err := sch.second.UnmarshalBinaryPublicKey(buf[firstSize:]) + if err != nil { + return nil, err + } + return &publicKey{sch, pk1, pk2}, nil +} + +func (sch *scheme) UnmarshalBinaryPrivateKey(buf []byte) (kem.PrivateKey, error) { + if len(buf) != sch.PrivateKeySize() { + return nil, kem.ErrPrivKeySize + } + firstSize := sch.first.PrivateKeySize() + sk1, err := 
sch.first.UnmarshalBinaryPrivateKey(buf[:firstSize]) + if err != nil { + return nil, err + } + sk2, err := sch.second.UnmarshalBinaryPrivateKey(buf[firstSize:]) + if err != nil { + return nil, err + } + return &privateKey{sch, sk1, sk2}, nil +} diff --git a/src/vendor/github.com/cloudflare/circl/kem/hybrid/xkem.go b/src/vendor/github.com/cloudflare/circl/kem/hybrid/xkem.go new file mode 100644 index 00000000000..919fb8a9c21 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/kem/hybrid/xkem.go @@ -0,0 +1,208 @@ +package hybrid + +import ( + "bytes" + cryptoRand "crypto/rand" + "crypto/subtle" + + "github.com/cloudflare/circl/dh/x25519" + "github.com/cloudflare/circl/dh/x448" + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/kem" +) + +type xPublicKey struct { + scheme *xScheme + key []byte +} +type xPrivateKey struct { + scheme *xScheme + key []byte +} +type xScheme struct { + size int +} + +var ( + x25519Kem = &xScheme{x25519.Size} + x448Kem = &xScheme{x448.Size} +) + +func (sch *xScheme) Name() string { + switch sch.size { + case x25519.Size: + return "X25519" + case x448.Size: + return "X448" + } + panic(kem.ErrTypeMismatch) +} + +func (sch *xScheme) PublicKeySize() int { return sch.size } +func (sch *xScheme) PrivateKeySize() int { return sch.size } +func (sch *xScheme) SeedSize() int { return sch.size } +func (sch *xScheme) SharedKeySize() int { return sch.size } +func (sch *xScheme) CiphertextSize() int { return sch.size } +func (sch *xScheme) EncapsulationSeedSize() int { return sch.size } + +func (sk *xPrivateKey) Scheme() kem.Scheme { return sk.scheme } +func (pk *xPublicKey) Scheme() kem.Scheme { return pk.scheme } + +func (sk *xPrivateKey) MarshalBinary() ([]byte, error) { + ret := make([]byte, len(sk.key)) + copy(ret, sk.key) + return ret, nil +} + +func (sk *xPrivateKey) Equal(other kem.PrivateKey) bool { + oth, ok := other.(*xPrivateKey) + if !ok { + return false + } + if oth.scheme != sk.scheme { + return false + } + return subtle.ConstantTimeCompare(oth.key, sk.key) == 1 +} + +func (sk *xPrivateKey) Public() kem.PublicKey { + pk := xPublicKey{sk.scheme, make([]byte, sk.scheme.size)} + switch sk.scheme.size { + case x25519.Size: + var sk2, pk2 x25519.Key + copy(sk2[:], sk.key) + x25519.KeyGen(&pk2, &sk2) + copy(pk.key, pk2[:]) + case x448.Size: + var sk2, pk2 x448.Key + copy(sk2[:], sk.key) + x448.KeyGen(&pk2, &sk2) + copy(pk.key, pk2[:]) + } + return &pk +} + +func (pk *xPublicKey) Equal(other kem.PublicKey) bool { + oth, ok := other.(*xPublicKey) + if !ok { + return false + } + if oth.scheme != pk.scheme { + return false + } + return bytes.Equal(oth.key, pk.key) +} + +func (pk *xPublicKey) MarshalBinary() ([]byte, error) { + ret := make([]byte, pk.scheme.size) + copy(ret, pk.key) + return ret, nil +} + +func (sch *xScheme) GenerateKeyPair() (kem.PublicKey, kem.PrivateKey, error) { + seed := make([]byte, sch.SeedSize()) + _, err := cryptoRand.Read(seed) + if err != nil { + return nil, nil, err + } + pk, sk := sch.DeriveKeyPair(seed) + return pk, sk, nil +} + +func (sch *xScheme) DeriveKeyPair(seed []byte) (kem.PublicKey, kem.PrivateKey) { + if len(seed) != sch.SeedSize() { + panic(kem.ErrSeedSize) + } + sk := xPrivateKey{scheme: sch, key: make([]byte, sch.size)} + + h := sha3.NewShake256() + _, _ = h.Write(seed) + _, _ = h.Read(sk.key) + + return sk.Public(), &sk +} + +func (sch *xScheme) Encapsulate(pk kem.PublicKey) (ct, ss []byte, err error) { + seed := make([]byte, sch.EncapsulationSeedSize()) + _, err = cryptoRand.Read(seed) + if err 
!= nil { + return + } + return sch.EncapsulateDeterministically(pk, seed) +} + +func (pk *xPublicKey) X(sk *xPrivateKey) []byte { + if pk.scheme != sk.scheme { + panic(kem.ErrTypeMismatch) + } + + switch pk.scheme.size { + case x25519.Size: + var ss2, pk2, sk2 x25519.Key + copy(pk2[:], pk.key) + copy(sk2[:], sk.key) + x25519.Shared(&ss2, &sk2, &pk2) + return ss2[:] + case x448.Size: + var ss2, pk2, sk2 x448.Key + copy(pk2[:], pk.key) + copy(sk2[:], sk.key) + x448.Shared(&ss2, &sk2, &pk2) + return ss2[:] + } + panic(kem.ErrTypeMismatch) +} + +func (sch *xScheme) EncapsulateDeterministically( + pk kem.PublicKey, seed []byte, +) (ct, ss []byte, err error) { + if len(seed) != sch.EncapsulationSeedSize() { + return nil, nil, kem.ErrSeedSize + } + pub, ok := pk.(*xPublicKey) + if !ok || pub.scheme != sch { + return nil, nil, kem.ErrTypeMismatch + } + + pk2, sk2 := sch.DeriveKeyPair(seed) + ss = pub.X(sk2.(*xPrivateKey)) + ct, _ = pk2.MarshalBinary() + return +} + +func (sch *xScheme) Decapsulate(sk kem.PrivateKey, ct []byte) ([]byte, error) { + if len(ct) != sch.CiphertextSize() { + return nil, kem.ErrCiphertextSize + } + + priv, ok := sk.(*xPrivateKey) + if !ok || priv.scheme != sch { + return nil, kem.ErrTypeMismatch + } + + pk, err := sch.UnmarshalBinaryPublicKey(ct) + if err != nil { + return nil, err + } + + ss := pk.(*xPublicKey).X(priv) + return ss, nil +} + +func (sch *xScheme) UnmarshalBinaryPublicKey(buf []byte) (kem.PublicKey, error) { + if len(buf) != sch.PublicKeySize() { + return nil, kem.ErrPubKeySize + } + ret := xPublicKey{sch, make([]byte, sch.size)} + copy(ret.key, buf) + return &ret, nil +} + +func (sch *xScheme) UnmarshalBinaryPrivateKey(buf []byte) (kem.PrivateKey, error) { + if len(buf) != sch.PrivateKeySize() { + return nil, kem.ErrPrivKeySize + } + ret := xPrivateKey{sch, make([]byte, sch.size)} + copy(ret.key, buf) + return &ret, nil +} diff --git a/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber1024/kyber.go b/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber1024/kyber.go new file mode 100644 index 00000000000..223c842ed40 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber1024/kyber.go @@ -0,0 +1,402 @@ +// Code generated from pkg.templ.go. DO NOT EDIT. + +// Package kyber1024 implements the IND-CCA2 secure key encapsulation mechanism +// Kyber1024.CCAKEM as submitted to round 3 of the NIST PQC competition and +// described in +// +// https://pq-crystals.org/kyber/data/kyber-specification-round3.pdf +package kyber1024 + +import ( + "bytes" + "crypto/subtle" + "io" + + cryptoRand "crypto/rand" + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/kem" + cpapke "github.com/cloudflare/circl/pke/kyber/kyber1024" +) + +const ( + // Size of seed for NewKeyFromSeed + KeySeedSize = cpapke.KeySeedSize + 32 + + // Size of seed for EncapsulateTo. + EncapsulationSeedSize = 32 + + // Size of the established shared key. + SharedKeySize = 32 + + // Size of the encapsulated shared key. + CiphertextSize = cpapke.CiphertextSize + + // Size of a packed public key. + PublicKeySize = cpapke.PublicKeySize + + // Size of a packed private key. 
+ PrivateKeySize = cpapke.PrivateKeySize + cpapke.PublicKeySize + 64 +) + +// Type of a Kyber1024.CCAKEM public key +type PublicKey struct { + pk *cpapke.PublicKey + + hpk [32]byte // H(pk) +} + +// Type of a Kyber1024.CCAKEM private key +type PrivateKey struct { + sk *cpapke.PrivateKey + pk *cpapke.PublicKey + hpk [32]byte // H(pk) + z [32]byte +} + +// NewKeyFromSeed derives a public/private keypair deterministically +// from the given seed. +// +// Panics if seed is not of length KeySeedSize. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + var sk PrivateKey + var pk PublicKey + + if len(seed) != KeySeedSize { + panic("seed must be of length KeySeedSize") + } + + pk.pk, sk.sk = cpapke.NewKeyFromSeed(seed[:cpapke.KeySeedSize]) + sk.pk = pk.pk + copy(sk.z[:], seed[cpapke.KeySeedSize:]) + + // Compute H(pk) + var ppk [cpapke.PublicKeySize]byte + sk.pk.Pack(ppk[:]) + h := sha3.New256() + h.Write(ppk[:]) + h.Read(sk.hpk[:]) + copy(pk.hpk[:], sk.hpk[:]) + + return &pk, &sk +} + +// GenerateKeyPair generates public and private keys using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. +func GenerateKeyPair(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [KeySeedSize]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := NewKeyFromSeed(seed[:]) + return pk, sk, nil +} + +// EncapsulateTo generates a shared key and ciphertext that contains it +// for the public key using randomness from seed and writes the shared key +// to ss and ciphertext to ct. +// +// Panics if ss, ct or seed are not of length SharedKeySize, CiphertextSize +// and EncapsulationSeedSize respectively. +// +// seed may be nil, in which case crypto/rand.Reader is used to generate one. +func (pk *PublicKey) EncapsulateTo(ct, ss []byte, seed []byte) { + if seed == nil { + seed = make([]byte, EncapsulationSeedSize) + cryptoRand.Read(seed[:]) + } else { + if len(seed) != EncapsulationSeedSize { + panic("seed must be of length EncapsulationSeedSize") + } + } + + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + + if len(ss) != SharedKeySize { + panic("ss must be of length SharedKeySize") + } + + // m = H(seed) + var m [32]byte + h := sha3.New256() + h.Write(seed[:]) + h.Read(m[:]) + + // (K', r) = G(m ‖ H(pk)) + var kr [64]byte + g := sha3.New512() + g.Write(m[:]) + g.Write(pk.hpk[:]) + g.Read(kr[:]) + + // c = Kyber.CPAPKE.Enc(pk, m, r) + pk.pk.EncryptTo(ct, m[:], kr[32:]) + + // Compute H(c) and put in second slot of kr, which will be (K', H(c)). + h.Reset() + h.Write(ct[:CiphertextSize]) + h.Read(kr[32:]) + + // K = KDF(K' ‖ H(c)) + kdf := sha3.NewShake256() + kdf.Write(kr[:]) + kdf.Read(ss[:SharedKeySize]) +} + +// DecapsulateTo computes the shared key which is encapsulated in ct +// for the private key. +// +// Panics if ct or ss are not of length CiphertextSize and SharedKeySize +// respectively. 
+func (sk *PrivateKey) DecapsulateTo(ss, ct []byte) { + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + + if len(ss) != SharedKeySize { + panic("ss must be of length SharedKeySize") + } + + // m' = Kyber.CPAPKE.Dec(sk, ct) + var m2 [32]byte + sk.sk.DecryptTo(m2[:], ct) + + // (K'', r') = G(m' ‖ H(pk)) + var kr2 [64]byte + g := sha3.New512() + g.Write(m2[:]) + g.Write(sk.hpk[:]) + g.Read(kr2[:]) + + // c' = Kyber.CPAPKE.Enc(pk, m', r') + var ct2 [CiphertextSize]byte + sk.pk.EncryptTo(ct2[:], m2[:], kr2[32:]) + + // Compute H(c) and put in second slot of kr2, which will be (K'', H(c)). + h := sha3.New256() + h.Write(ct[:CiphertextSize]) + h.Read(kr2[32:]) + + // Replace K'' by z in the first slot of kr2 if c ≠ c'. + subtle.ConstantTimeCopy( + 1-subtle.ConstantTimeCompare(ct, ct2[:]), + kr2[:32], + sk.z[:], + ) + + // K = KDF(K''/z, H(c)) + kdf := sha3.NewShake256() + kdf.Write(kr2[:]) + kdf.Read(ss[:SharedKeySize]) +} + +// Packs sk to buf. +// +// Panics if buf is not of size PrivateKeySize. +func (sk *PrivateKey) Pack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of length PrivateKeySize") + } + + sk.sk.Pack(buf[:cpapke.PrivateKeySize]) + buf = buf[cpapke.PrivateKeySize:] + sk.pk.Pack(buf[:cpapke.PublicKeySize]) + buf = buf[cpapke.PublicKeySize:] + copy(buf, sk.hpk[:]) + buf = buf[32:] + copy(buf, sk.z[:]) +} + +// Unpacks sk from buf. +// +// Panics if buf is not of size PrivateKeySize. +func (sk *PrivateKey) Unpack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of length PrivateKeySize") + } + + sk.sk = new(cpapke.PrivateKey) + sk.sk.Unpack(buf[:cpapke.PrivateKeySize]) + buf = buf[cpapke.PrivateKeySize:] + sk.pk = new(cpapke.PublicKey) + sk.pk.Unpack(buf[:cpapke.PublicKeySize]) + buf = buf[cpapke.PublicKeySize:] + copy(sk.hpk[:], buf[:32]) + copy(sk.z[:], buf[32:]) +} + +// Packs pk to buf. +// +// Panics if buf is not of size PublicKeySize. +func (pk *PublicKey) Pack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of length PublicKeySize") + } + + pk.pk.Pack(buf) +} + +// Unpacks pk from buf. +// +// Panics if buf is not of size PublicKeySize. +func (pk *PublicKey) Unpack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of length PublicKeySize") + } + + pk.pk = new(cpapke.PublicKey) + pk.pk.Unpack(buf) + + // Compute cached H(pk) + h := sha3.New256() + h.Write(buf) + h.Read(pk.hpk[:]) +} + +// Boilerplate down below for the KEM scheme API. + +type scheme struct{} + +var sch kem.Scheme = &scheme{} + +// Scheme returns a KEM interface. 
+func Scheme() kem.Scheme { return sch } + +func (*scheme) Name() string { return "Kyber1024" } +func (*scheme) PublicKeySize() int { return PublicKeySize } +func (*scheme) PrivateKeySize() int { return PrivateKeySize } +func (*scheme) SeedSize() int { return KeySeedSize } +func (*scheme) SharedKeySize() int { return SharedKeySize } +func (*scheme) CiphertextSize() int { return CiphertextSize } +func (*scheme) EncapsulationSeedSize() int { return EncapsulationSeedSize } + +func (sk *PrivateKey) Scheme() kem.Scheme { return sch } +func (pk *PublicKey) Scheme() kem.Scheme { return sch } + +func (sk *PrivateKey) MarshalBinary() ([]byte, error) { + var ret [PrivateKeySize]byte + sk.Pack(ret[:]) + return ret[:], nil +} + +func (sk *PrivateKey) Equal(other kem.PrivateKey) bool { + oth, ok := other.(*PrivateKey) + if !ok { + return false + } + if sk.pk == nil && oth.pk == nil { + return true + } + if sk.pk == nil || oth.pk == nil { + return false + } + if !bytes.Equal(sk.hpk[:], oth.hpk[:]) || + !bytes.Equal(sk.z[:], oth.z[:]) { + return false + } + return sk.sk.Equal(oth.sk) +} + +func (pk *PublicKey) Equal(other kem.PublicKey) bool { + oth, ok := other.(*PublicKey) + if !ok { + return false + } + if pk.pk == nil && oth.pk == nil { + return true + } + if pk.pk == nil || oth.pk == nil { + return false + } + return bytes.Equal(pk.hpk[:], oth.hpk[:]) +} + +func (sk *PrivateKey) Public() kem.PublicKey { + pk := new(PublicKey) + pk.pk = sk.pk + copy(pk.hpk[:], sk.hpk[:]) + return pk +} + +func (pk *PublicKey) MarshalBinary() ([]byte, error) { + var ret [PublicKeySize]byte + pk.Pack(ret[:]) + return ret[:], nil +} + +func (*scheme) GenerateKeyPair() (kem.PublicKey, kem.PrivateKey, error) { + return GenerateKeyPair(cryptoRand.Reader) +} + +func (*scheme) DeriveKeyPair(seed []byte) (kem.PublicKey, kem.PrivateKey) { + if len(seed) != KeySeedSize { + panic(kem.ErrSeedSize) + } + return NewKeyFromSeed(seed[:]) +} + +func (*scheme) Encapsulate(pk kem.PublicKey) (ct, ss []byte, err error) { + ct = make([]byte, CiphertextSize) + ss = make([]byte, SharedKeySize) + + pub, ok := pk.(*PublicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + pub.EncapsulateTo(ct, ss, nil) + return +} + +func (*scheme) EncapsulateDeterministically(pk kem.PublicKey, seed []byte) ( + ct, ss []byte, err error) { + if len(seed) != EncapsulationSeedSize { + return nil, nil, kem.ErrSeedSize + } + + ct = make([]byte, CiphertextSize) + ss = make([]byte, SharedKeySize) + + pub, ok := pk.(*PublicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + pub.EncapsulateTo(ct, ss, seed) + return +} + +func (*scheme) Decapsulate(sk kem.PrivateKey, ct []byte) ([]byte, error) { + if len(ct) != CiphertextSize { + return nil, kem.ErrCiphertextSize + } + + priv, ok := sk.(*PrivateKey) + if !ok { + return nil, kem.ErrTypeMismatch + } + ss := make([]byte, SharedKeySize) + priv.DecapsulateTo(ss, ct) + return ss, nil +} + +func (*scheme) UnmarshalBinaryPublicKey(buf []byte) (kem.PublicKey, error) { + if len(buf) != PublicKeySize { + return nil, kem.ErrPubKeySize + } + var ret PublicKey + ret.Unpack(buf) + return &ret, nil +} + +func (*scheme) UnmarshalBinaryPrivateKey(buf []byte) (kem.PrivateKey, error) { + if len(buf) != PrivateKeySize { + return nil, kem.ErrPrivKeySize + } + var ret PrivateKey + ret.Unpack(buf) + return &ret, nil +} diff --git a/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber512/kyber.go b/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber512/kyber.go new file mode 100644 index 00000000000..8cc1ec76643 --- 
/dev/null +++ b/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber512/kyber.go @@ -0,0 +1,402 @@ +// Code generated from pkg.templ.go. DO NOT EDIT. + +// Package kyber512 implements the IND-CCA2 secure key encapsulation mechanism +// Kyber512.CCAKEM as submitted to round 3 of the NIST PQC competition and +// described in +// +// https://pq-crystals.org/kyber/data/kyber-specification-round3.pdf +package kyber512 + +import ( + "bytes" + "crypto/subtle" + "io" + + cryptoRand "crypto/rand" + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/kem" + cpapke "github.com/cloudflare/circl/pke/kyber/kyber512" +) + +const ( + // Size of seed for NewKeyFromSeed + KeySeedSize = cpapke.KeySeedSize + 32 + + // Size of seed for EncapsulateTo. + EncapsulationSeedSize = 32 + + // Size of the established shared key. + SharedKeySize = 32 + + // Size of the encapsulated shared key. + CiphertextSize = cpapke.CiphertextSize + + // Size of a packed public key. + PublicKeySize = cpapke.PublicKeySize + + // Size of a packed private key. + PrivateKeySize = cpapke.PrivateKeySize + cpapke.PublicKeySize + 64 +) + +// Type of a Kyber512.CCAKEM public key +type PublicKey struct { + pk *cpapke.PublicKey + + hpk [32]byte // H(pk) +} + +// Type of a Kyber512.CCAKEM private key +type PrivateKey struct { + sk *cpapke.PrivateKey + pk *cpapke.PublicKey + hpk [32]byte // H(pk) + z [32]byte +} + +// NewKeyFromSeed derives a public/private keypair deterministically +// from the given seed. +// +// Panics if seed is not of length KeySeedSize. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + var sk PrivateKey + var pk PublicKey + + if len(seed) != KeySeedSize { + panic("seed must be of length KeySeedSize") + } + + pk.pk, sk.sk = cpapke.NewKeyFromSeed(seed[:cpapke.KeySeedSize]) + sk.pk = pk.pk + copy(sk.z[:], seed[cpapke.KeySeedSize:]) + + // Compute H(pk) + var ppk [cpapke.PublicKeySize]byte + sk.pk.Pack(ppk[:]) + h := sha3.New256() + h.Write(ppk[:]) + h.Read(sk.hpk[:]) + copy(pk.hpk[:], sk.hpk[:]) + + return &pk, &sk +} + +// GenerateKeyPair generates public and private keys using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. +func GenerateKeyPair(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [KeySeedSize]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := NewKeyFromSeed(seed[:]) + return pk, sk, nil +} + +// EncapsulateTo generates a shared key and ciphertext that contains it +// for the public key using randomness from seed and writes the shared key +// to ss and ciphertext to ct. +// +// Panics if ss, ct or seed are not of length SharedKeySize, CiphertextSize +// and EncapsulationSeedSize respectively. +// +// seed may be nil, in which case crypto/rand.Reader is used to generate one. 
+func (pk *PublicKey) EncapsulateTo(ct, ss []byte, seed []byte) { + if seed == nil { + seed = make([]byte, EncapsulationSeedSize) + cryptoRand.Read(seed[:]) + } else { + if len(seed) != EncapsulationSeedSize { + panic("seed must be of length EncapsulationSeedSize") + } + } + + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + + if len(ss) != SharedKeySize { + panic("ss must be of length SharedKeySize") + } + + // m = H(seed) + var m [32]byte + h := sha3.New256() + h.Write(seed[:]) + h.Read(m[:]) + + // (K', r) = G(m ‖ H(pk)) + var kr [64]byte + g := sha3.New512() + g.Write(m[:]) + g.Write(pk.hpk[:]) + g.Read(kr[:]) + + // c = Kyber.CPAPKE.Enc(pk, m, r) + pk.pk.EncryptTo(ct, m[:], kr[32:]) + + // Compute H(c) and put in second slot of kr, which will be (K', H(c)). + h.Reset() + h.Write(ct[:CiphertextSize]) + h.Read(kr[32:]) + + // K = KDF(K' ‖ H(c)) + kdf := sha3.NewShake256() + kdf.Write(kr[:]) + kdf.Read(ss[:SharedKeySize]) +} + +// DecapsulateTo computes the shared key which is encapsulated in ct +// for the private key. +// +// Panics if ct or ss are not of length CiphertextSize and SharedKeySize +// respectively. +func (sk *PrivateKey) DecapsulateTo(ss, ct []byte) { + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + + if len(ss) != SharedKeySize { + panic("ss must be of length SharedKeySize") + } + + // m' = Kyber.CPAPKE.Dec(sk, ct) + var m2 [32]byte + sk.sk.DecryptTo(m2[:], ct) + + // (K'', r') = G(m' ‖ H(pk)) + var kr2 [64]byte + g := sha3.New512() + g.Write(m2[:]) + g.Write(sk.hpk[:]) + g.Read(kr2[:]) + + // c' = Kyber.CPAPKE.Enc(pk, m', r') + var ct2 [CiphertextSize]byte + sk.pk.EncryptTo(ct2[:], m2[:], kr2[32:]) + + // Compute H(c) and put in second slot of kr2, which will be (K'', H(c)). + h := sha3.New256() + h.Write(ct[:CiphertextSize]) + h.Read(kr2[32:]) + + // Replace K'' by z in the first slot of kr2 if c ≠ c'. + subtle.ConstantTimeCopy( + 1-subtle.ConstantTimeCompare(ct, ct2[:]), + kr2[:32], + sk.z[:], + ) + + // K = KDF(K''/z, H(c)) + kdf := sha3.NewShake256() + kdf.Write(kr2[:]) + kdf.Read(ss[:SharedKeySize]) +} + +// Packs sk to buf. +// +// Panics if buf is not of size PrivateKeySize. +func (sk *PrivateKey) Pack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of length PrivateKeySize") + } + + sk.sk.Pack(buf[:cpapke.PrivateKeySize]) + buf = buf[cpapke.PrivateKeySize:] + sk.pk.Pack(buf[:cpapke.PublicKeySize]) + buf = buf[cpapke.PublicKeySize:] + copy(buf, sk.hpk[:]) + buf = buf[32:] + copy(buf, sk.z[:]) +} + +// Unpacks sk from buf. +// +// Panics if buf is not of size PrivateKeySize. +func (sk *PrivateKey) Unpack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of length PrivateKeySize") + } + + sk.sk = new(cpapke.PrivateKey) + sk.sk.Unpack(buf[:cpapke.PrivateKeySize]) + buf = buf[cpapke.PrivateKeySize:] + sk.pk = new(cpapke.PublicKey) + sk.pk.Unpack(buf[:cpapke.PublicKeySize]) + buf = buf[cpapke.PublicKeySize:] + copy(sk.hpk[:], buf[:32]) + copy(sk.z[:], buf[32:]) +} + +// Packs pk to buf. +// +// Panics if buf is not of size PublicKeySize. +func (pk *PublicKey) Pack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of length PublicKeySize") + } + + pk.pk.Pack(buf) +} + +// Unpacks pk from buf. +// +// Panics if buf is not of size PublicKeySize. 
+func (pk *PublicKey) Unpack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of length PublicKeySize") + } + + pk.pk = new(cpapke.PublicKey) + pk.pk.Unpack(buf) + + // Compute cached H(pk) + h := sha3.New256() + h.Write(buf) + h.Read(pk.hpk[:]) +} + +// Boilerplate down below for the KEM scheme API. + +type scheme struct{} + +var sch kem.Scheme = &scheme{} + +// Scheme returns a KEM interface. +func Scheme() kem.Scheme { return sch } + +func (*scheme) Name() string { return "Kyber512" } +func (*scheme) PublicKeySize() int { return PublicKeySize } +func (*scheme) PrivateKeySize() int { return PrivateKeySize } +func (*scheme) SeedSize() int { return KeySeedSize } +func (*scheme) SharedKeySize() int { return SharedKeySize } +func (*scheme) CiphertextSize() int { return CiphertextSize } +func (*scheme) EncapsulationSeedSize() int { return EncapsulationSeedSize } + +func (sk *PrivateKey) Scheme() kem.Scheme { return sch } +func (pk *PublicKey) Scheme() kem.Scheme { return sch } + +func (sk *PrivateKey) MarshalBinary() ([]byte, error) { + var ret [PrivateKeySize]byte + sk.Pack(ret[:]) + return ret[:], nil +} + +func (sk *PrivateKey) Equal(other kem.PrivateKey) bool { + oth, ok := other.(*PrivateKey) + if !ok { + return false + } + if sk.pk == nil && oth.pk == nil { + return true + } + if sk.pk == nil || oth.pk == nil { + return false + } + if !bytes.Equal(sk.hpk[:], oth.hpk[:]) || + !bytes.Equal(sk.z[:], oth.z[:]) { + return false + } + return sk.sk.Equal(oth.sk) +} + +func (pk *PublicKey) Equal(other kem.PublicKey) bool { + oth, ok := other.(*PublicKey) + if !ok { + return false + } + if pk.pk == nil && oth.pk == nil { + return true + } + if pk.pk == nil || oth.pk == nil { + return false + } + return bytes.Equal(pk.hpk[:], oth.hpk[:]) +} + +func (sk *PrivateKey) Public() kem.PublicKey { + pk := new(PublicKey) + pk.pk = sk.pk + copy(pk.hpk[:], sk.hpk[:]) + return pk +} + +func (pk *PublicKey) MarshalBinary() ([]byte, error) { + var ret [PublicKeySize]byte + pk.Pack(ret[:]) + return ret[:], nil +} + +func (*scheme) GenerateKeyPair() (kem.PublicKey, kem.PrivateKey, error) { + return GenerateKeyPair(cryptoRand.Reader) +} + +func (*scheme) DeriveKeyPair(seed []byte) (kem.PublicKey, kem.PrivateKey) { + if len(seed) != KeySeedSize { + panic(kem.ErrSeedSize) + } + return NewKeyFromSeed(seed[:]) +} + +func (*scheme) Encapsulate(pk kem.PublicKey) (ct, ss []byte, err error) { + ct = make([]byte, CiphertextSize) + ss = make([]byte, SharedKeySize) + + pub, ok := pk.(*PublicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + pub.EncapsulateTo(ct, ss, nil) + return +} + +func (*scheme) EncapsulateDeterministically(pk kem.PublicKey, seed []byte) ( + ct, ss []byte, err error) { + if len(seed) != EncapsulationSeedSize { + return nil, nil, kem.ErrSeedSize + } + + ct = make([]byte, CiphertextSize) + ss = make([]byte, SharedKeySize) + + pub, ok := pk.(*PublicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + pub.EncapsulateTo(ct, ss, seed) + return +} + +func (*scheme) Decapsulate(sk kem.PrivateKey, ct []byte) ([]byte, error) { + if len(ct) != CiphertextSize { + return nil, kem.ErrCiphertextSize + } + + priv, ok := sk.(*PrivateKey) + if !ok { + return nil, kem.ErrTypeMismatch + } + ss := make([]byte, SharedKeySize) + priv.DecapsulateTo(ss, ct) + return ss, nil +} + +func (*scheme) UnmarshalBinaryPublicKey(buf []byte) (kem.PublicKey, error) { + if len(buf) != PublicKeySize { + return nil, kem.ErrPubKeySize + } + var ret PublicKey + ret.Unpack(buf) + return &ret, nil +} + +func 
(*scheme) UnmarshalBinaryPrivateKey(buf []byte) (kem.PrivateKey, error) { + if len(buf) != PrivateKeySize { + return nil, kem.ErrPrivKeySize + } + var ret PrivateKey + ret.Unpack(buf) + return &ret, nil +} diff --git a/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber768/kyber.go b/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber768/kyber.go new file mode 100644 index 00000000000..98c402799c4 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/kem/kyber/kyber768/kyber.go @@ -0,0 +1,402 @@ +// Code generated from pkg.templ.go. DO NOT EDIT. + +// Package kyber768 implements the IND-CCA2 secure key encapsulation mechanism +// Kyber768.CCAKEM as submitted to round 3 of the NIST PQC competition and +// described in +// +// https://pq-crystals.org/kyber/data/kyber-specification-round3.pdf +package kyber768 + +import ( + "bytes" + "crypto/subtle" + "io" + + cryptoRand "crypto/rand" + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/kem" + cpapke "github.com/cloudflare/circl/pke/kyber/kyber768" +) + +const ( + // Size of seed for NewKeyFromSeed + KeySeedSize = cpapke.KeySeedSize + 32 + + // Size of seed for EncapsulateTo. + EncapsulationSeedSize = 32 + + // Size of the established shared key. + SharedKeySize = 32 + + // Size of the encapsulated shared key. + CiphertextSize = cpapke.CiphertextSize + + // Size of a packed public key. + PublicKeySize = cpapke.PublicKeySize + + // Size of a packed private key. + PrivateKeySize = cpapke.PrivateKeySize + cpapke.PublicKeySize + 64 +) + +// Type of a Kyber768.CCAKEM public key +type PublicKey struct { + pk *cpapke.PublicKey + + hpk [32]byte // H(pk) +} + +// Type of a Kyber768.CCAKEM private key +type PrivateKey struct { + sk *cpapke.PrivateKey + pk *cpapke.PublicKey + hpk [32]byte // H(pk) + z [32]byte +} + +// NewKeyFromSeed derives a public/private keypair deterministically +// from the given seed. +// +// Panics if seed is not of length KeySeedSize. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + var sk PrivateKey + var pk PublicKey + + if len(seed) != KeySeedSize { + panic("seed must be of length KeySeedSize") + } + + pk.pk, sk.sk = cpapke.NewKeyFromSeed(seed[:cpapke.KeySeedSize]) + sk.pk = pk.pk + copy(sk.z[:], seed[cpapke.KeySeedSize:]) + + // Compute H(pk) + var ppk [cpapke.PublicKeySize]byte + sk.pk.Pack(ppk[:]) + h := sha3.New256() + h.Write(ppk[:]) + h.Read(sk.hpk[:]) + copy(pk.hpk[:], sk.hpk[:]) + + return &pk, &sk +} + +// GenerateKeyPair generates public and private keys using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. +func GenerateKeyPair(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [KeySeedSize]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := NewKeyFromSeed(seed[:]) + return pk, sk, nil +} + +// EncapsulateTo generates a shared key and ciphertext that contains it +// for the public key using randomness from seed and writes the shared key +// to ss and ciphertext to ct. +// +// Panics if ss, ct or seed are not of length SharedKeySize, CiphertextSize +// and EncapsulationSeedSize respectively. +// +// seed may be nil, in which case crypto/rand.Reader is used to generate one. 
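+//
+// A complete round trip with the matching private key looks like this
+// (an illustrative sketch; error handling elided):
+//
+//	pk, sk, _ := GenerateKeyPair(nil)
+//	ct := make([]byte, CiphertextSize)
+//	ss := make([]byte, SharedKeySize)
+//	pk.EncapsulateTo(ct, ss, nil)
+//	ss2 := make([]byte, SharedKeySize)
+//	sk.DecapsulateTo(ss2, ct)
+//	// ss and ss2 now hold the same shared key.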
+func (pk *PublicKey) EncapsulateTo(ct, ss []byte, seed []byte) { + if seed == nil { + seed = make([]byte, EncapsulationSeedSize) + cryptoRand.Read(seed[:]) + } else { + if len(seed) != EncapsulationSeedSize { + panic("seed must be of length EncapsulationSeedSize") + } + } + + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + + if len(ss) != SharedKeySize { + panic("ss must be of length SharedKeySize") + } + + // m = H(seed) + var m [32]byte + h := sha3.New256() + h.Write(seed[:]) + h.Read(m[:]) + + // (K', r) = G(m ‖ H(pk)) + var kr [64]byte + g := sha3.New512() + g.Write(m[:]) + g.Write(pk.hpk[:]) + g.Read(kr[:]) + + // c = Kyber.CPAPKE.Enc(pk, m, r) + pk.pk.EncryptTo(ct, m[:], kr[32:]) + + // Compute H(c) and put in second slot of kr, which will be (K', H(c)). + h.Reset() + h.Write(ct[:CiphertextSize]) + h.Read(kr[32:]) + + // K = KDF(K' ‖ H(c)) + kdf := sha3.NewShake256() + kdf.Write(kr[:]) + kdf.Read(ss[:SharedKeySize]) +} + +// DecapsulateTo computes the shared key which is encapsulated in ct +// for the private key. +// +// Panics if ct or ss are not of length CiphertextSize and SharedKeySize +// respectively. +func (sk *PrivateKey) DecapsulateTo(ss, ct []byte) { + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + + if len(ss) != SharedKeySize { + panic("ss must be of length SharedKeySize") + } + + // m' = Kyber.CPAPKE.Dec(sk, ct) + var m2 [32]byte + sk.sk.DecryptTo(m2[:], ct) + + // (K'', r') = G(m' ‖ H(pk)) + var kr2 [64]byte + g := sha3.New512() + g.Write(m2[:]) + g.Write(sk.hpk[:]) + g.Read(kr2[:]) + + // c' = Kyber.CPAPKE.Enc(pk, m', r') + var ct2 [CiphertextSize]byte + sk.pk.EncryptTo(ct2[:], m2[:], kr2[32:]) + + // Compute H(c) and put in second slot of kr2, which will be (K'', H(c)). + h := sha3.New256() + h.Write(ct[:CiphertextSize]) + h.Read(kr2[32:]) + + // Replace K'' by z in the first slot of kr2 if c ≠ c'. + subtle.ConstantTimeCopy( + 1-subtle.ConstantTimeCompare(ct, ct2[:]), + kr2[:32], + sk.z[:], + ) + + // K = KDF(K''/z, H(c)) + kdf := sha3.NewShake256() + kdf.Write(kr2[:]) + kdf.Read(ss[:SharedKeySize]) +} + +// Packs sk to buf. +// +// Panics if buf is not of size PrivateKeySize. +func (sk *PrivateKey) Pack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of length PrivateKeySize") + } + + sk.sk.Pack(buf[:cpapke.PrivateKeySize]) + buf = buf[cpapke.PrivateKeySize:] + sk.pk.Pack(buf[:cpapke.PublicKeySize]) + buf = buf[cpapke.PublicKeySize:] + copy(buf, sk.hpk[:]) + buf = buf[32:] + copy(buf, sk.z[:]) +} + +// Unpacks sk from buf. +// +// Panics if buf is not of size PrivateKeySize. +func (sk *PrivateKey) Unpack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of length PrivateKeySize") + } + + sk.sk = new(cpapke.PrivateKey) + sk.sk.Unpack(buf[:cpapke.PrivateKeySize]) + buf = buf[cpapke.PrivateKeySize:] + sk.pk = new(cpapke.PublicKey) + sk.pk.Unpack(buf[:cpapke.PublicKeySize]) + buf = buf[cpapke.PublicKeySize:] + copy(sk.hpk[:], buf[:32]) + copy(sk.z[:], buf[32:]) +} + +// Packs pk to buf. +// +// Panics if buf is not of size PublicKeySize. +func (pk *PublicKey) Pack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of length PublicKeySize") + } + + pk.pk.Pack(buf) +} + +// Unpacks pk from buf. +// +// Panics if buf is not of size PublicKeySize. 
+func (pk *PublicKey) Unpack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of length PublicKeySize") + } + + pk.pk = new(cpapke.PublicKey) + pk.pk.Unpack(buf) + + // Compute cached H(pk) + h := sha3.New256() + h.Write(buf) + h.Read(pk.hpk[:]) +} + +// Boilerplate down below for the KEM scheme API. + +type scheme struct{} + +var sch kem.Scheme = &scheme{} + +// Scheme returns a KEM interface. +func Scheme() kem.Scheme { return sch } + +func (*scheme) Name() string { return "Kyber768" } +func (*scheme) PublicKeySize() int { return PublicKeySize } +func (*scheme) PrivateKeySize() int { return PrivateKeySize } +func (*scheme) SeedSize() int { return KeySeedSize } +func (*scheme) SharedKeySize() int { return SharedKeySize } +func (*scheme) CiphertextSize() int { return CiphertextSize } +func (*scheme) EncapsulationSeedSize() int { return EncapsulationSeedSize } + +func (sk *PrivateKey) Scheme() kem.Scheme { return sch } +func (pk *PublicKey) Scheme() kem.Scheme { return sch } + +func (sk *PrivateKey) MarshalBinary() ([]byte, error) { + var ret [PrivateKeySize]byte + sk.Pack(ret[:]) + return ret[:], nil +} + +func (sk *PrivateKey) Equal(other kem.PrivateKey) bool { + oth, ok := other.(*PrivateKey) + if !ok { + return false + } + if sk.pk == nil && oth.pk == nil { + return true + } + if sk.pk == nil || oth.pk == nil { + return false + } + if !bytes.Equal(sk.hpk[:], oth.hpk[:]) || + !bytes.Equal(sk.z[:], oth.z[:]) { + return false + } + return sk.sk.Equal(oth.sk) +} + +func (pk *PublicKey) Equal(other kem.PublicKey) bool { + oth, ok := other.(*PublicKey) + if !ok { + return false + } + if pk.pk == nil && oth.pk == nil { + return true + } + if pk.pk == nil || oth.pk == nil { + return false + } + return bytes.Equal(pk.hpk[:], oth.hpk[:]) +} + +func (sk *PrivateKey) Public() kem.PublicKey { + pk := new(PublicKey) + pk.pk = sk.pk + copy(pk.hpk[:], sk.hpk[:]) + return pk +} + +func (pk *PublicKey) MarshalBinary() ([]byte, error) { + var ret [PublicKeySize]byte + pk.Pack(ret[:]) + return ret[:], nil +} + +func (*scheme) GenerateKeyPair() (kem.PublicKey, kem.PrivateKey, error) { + return GenerateKeyPair(cryptoRand.Reader) +} + +func (*scheme) DeriveKeyPair(seed []byte) (kem.PublicKey, kem.PrivateKey) { + if len(seed) != KeySeedSize { + panic(kem.ErrSeedSize) + } + return NewKeyFromSeed(seed[:]) +} + +func (*scheme) Encapsulate(pk kem.PublicKey) (ct, ss []byte, err error) { + ct = make([]byte, CiphertextSize) + ss = make([]byte, SharedKeySize) + + pub, ok := pk.(*PublicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + pub.EncapsulateTo(ct, ss, nil) + return +} + +func (*scheme) EncapsulateDeterministically(pk kem.PublicKey, seed []byte) ( + ct, ss []byte, err error) { + if len(seed) != EncapsulationSeedSize { + return nil, nil, kem.ErrSeedSize + } + + ct = make([]byte, CiphertextSize) + ss = make([]byte, SharedKeySize) + + pub, ok := pk.(*PublicKey) + if !ok { + return nil, nil, kem.ErrTypeMismatch + } + pub.EncapsulateTo(ct, ss, seed) + return +} + +func (*scheme) Decapsulate(sk kem.PrivateKey, ct []byte) ([]byte, error) { + if len(ct) != CiphertextSize { + return nil, kem.ErrCiphertextSize + } + + priv, ok := sk.(*PrivateKey) + if !ok { + return nil, kem.ErrTypeMismatch + } + ss := make([]byte, SharedKeySize) + priv.DecapsulateTo(ss, ct) + return ss, nil +} + +func (*scheme) UnmarshalBinaryPublicKey(buf []byte) (kem.PublicKey, error) { + if len(buf) != PublicKeySize { + return nil, kem.ErrPubKeySize + } + var ret PublicKey + ret.Unpack(buf) + return &ret, nil +} + +func 
(*scheme) UnmarshalBinaryPrivateKey(buf []byte) (kem.PrivateKey, error) { + if len(buf) != PrivateKeySize { + return nil, kem.ErrPrivKeySize + } + var ret PrivateKey + ret.Unpack(buf) + return &ret, nil +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.go new file mode 100644 index 00000000000..79629160a56 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.go @@ -0,0 +1,302 @@ +//go:build amd64 +// +build amd64 + +package common + +import ( + "golang.org/x/sys/cpu" +) + +// ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also +// the values int16(zeta * 62209) for each zeta, which is used in +// Montgomery reduction. There is some duplication and reordering as +// compared to Zetas to make it more convenient for use with AVX2. +var ZetasAVX2 = [...]int16{ + // level 1: int16(Zetas[1]*62209) and Zetas[1] + 31499, 2571, + + // level 2 + // + // int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3] + 14746, 2970, 788, 1812, + + // level 3, like level 2. + 13525, 1493, -12402, 1422, 28191, 287, -16694, 202, + + 0, 0, // padding + + // layer 4. offset: 1*16 + // + // The precomputed multiplication and zetas are grouped by 16 at a + // time as used in the set of butterflies, etc. + -20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906, + 27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758, + 3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158, + 622, 622, 622, 622, 622, 622, 622, 622, + -3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799, + -15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690, + 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, + 182, 182, 182, 182, 182, 182, 182, 182, + 10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359, + 962, 962, 962, 962, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127, + -11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201, + 31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164, + 1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + + // layer 5. offset: 9*16 + -5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364, + -26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057, + 573, 573, 573, 573, 2004, 2004, 2004, 2004, + 264, 264, 264, 264, 383, 383, 383, 383, + 5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102, + 21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241, + 2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458, + 1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199, + -28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313, + -10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800, + 2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017, + 732, 732, 732, 732, 608, 608, 608, 608, + 18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859, + 26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162, + 1787, 1787, 1787, 1787, 411, 411, 411, 411, + 3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758, + + // layer 6.
offset: 17*16 + -5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967, + -23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081, + 1223, 1223, 652, 652, 2777, 2777, 1015, 1015, + 2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785, + -12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441, + 9135, 9135, -649, -649, -25986, -25986, 27837, 27837, + 516, 516, 3321, 3321, 3009, 3009, 2663, 2663, + 1711, 1711, 2167, 2167, 126, 126, 1469, 1469, + 19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898, + -28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250, + 2476, 2476, 3239, 3239, 3058, 3058, 830, 830, + 107, 107, 1908, 1908, 3082, 3082, 2378, 2378, + 13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756, + 16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914, + 2931, 2931, 961, 961, 1821, 1821, 2604, 2604, + 448, 448, 2264, 2264, 677, 677, 2054, 2054, + + // layer 7. offset: 25*16 + -334, 11182, -11477, 13387, -32226, -14233, 20494, -21655, + -27738, 13131, 945, -4586, -14882, 23093, 6182, 5493, + 2226, 430, 555, 843, 2078, 871, 1550, 105, + 422, 587, 177, 3094, 3038, 2869, 1574, 1653, + 32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639, + -18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618, + 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, + 1739, 644, 2457, 349, 418, 329, 3173, 3254, + -31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384, + -20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470, + 817, 1097, 603, 610, 1322, 2044, 1864, 384, + 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, + 10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442, + 31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132, + 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, + 3221, 3021, 996, 991, 958, 1869, 1522, 1628, + + // layer 1 inverse + 23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637, + -17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336, + 1628, 1522, 1869, 958, 991, 996, 3021, 3221, + 478, 2459, 2475, 1819, 794, 2051, 1799, 2144, + 14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926, + -32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183, + 1670, 2142, 220, 2455, 1994, 1218, 3193, 2114, + 384, 1864, 2044, 1322, 610, 603, 1097, 817, + -12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485, + 12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011, + 3254, 3173, 329, 418, 349, 2457, 644, 1739, + 1119, 2727, 1483, 2552, 3182, 1159, 778, 3083, + 5493, 6182, 23093, -14882, -4586, 945, 13131, -27738, + -21655, 20494, -14233, -32226, 13387, -11477, 11182, -334, + 1653, 1574, 2869, 3038, 3094, 177, 587, 422, + 105, 1550, 871, 2078, 843, 555, 430, 2226, + + // layer 2 inverse + -17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832, + -12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427, + 2054, 2054, 677, 677, 2264, 2264, 448, 448, + 2604, 2604, 1821, 1821, 961, 961, 2931, 2931, + 18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309, + -8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884, + 2378, 2378, 3082, 3082, 1908, 1908, 107, 107, + 830, 830, 3058, 3058, 3239, 3239, 2476, 2476, + 27837, 27837, -25986, -25986, -649, -649, 9135, 9135, + -12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796, + 1469, 1469, 126, 126, 2167, 2167, 1711, 1711, + 2663, 2663, 3009, 3009, 3321, 3321, 516, 516, + 25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564, + 30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689, + 1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036, + 1015, 1015, 2777, 2777, 652, 652, 1223, 1223, + + // layer 3 inverse + 
-16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676, + 8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427, + 1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124, + 411, 411, 411, 411, 1787, 1787, 1787, 1787, + 8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532, + 24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072, + 608, 608, 608, 608, 732, 732, 732, 732, + 1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648, + -26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439, + -1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572, + 3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727, + 1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500, + -29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360, + 17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827, + 383, 383, 383, 383, 264, 264, 264, 264, + 2004, 2004, 2004, 2004, 573, 573, 573, 573, + + // layer 4 inverse + 31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164, + -11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201, + 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855, + 1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359, + 10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690, + 2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127, + 962, 962, 962, 962, 962, 962, 962, 962, + -15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690, + -3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799, + 182, 182, 182, 182, 182, 182, 182, 182, + 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, + 27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758, + -20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906, + 622, 622, 622, 622, 622, 622, 622, 622, + 3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158, + + // layer 5 inverse + -16694, 202, 28191, 287, -12402, 1422, 13525, 1493, + + // layer 6 inverse + 788, 1812, 14746, 2970, + + // layer 7 inverse + 31499, 2571, +} + +// Sets p to a + b. Does not normalize coefficients. +func (p *Poly) Add(a, b *Poly) { + if cpu.X86.HasAVX2 { + addAVX2( + (*[N]int16)(p), + (*[N]int16)(a), + (*[N]int16)(b), + ) + } else { + p.addGeneric(a, b) + } +} + +// Sets p to a - b. Does not normalize coefficients. +func (p *Poly) Sub(a, b *Poly) { + if cpu.X86.HasAVX2 { + subAVX2( + (*[N]int16)(p), + (*[N]int16)(a), + (*[N]int16)(b), + ) + } else { + p.subGeneric(a, b) + } +} + +// Executes an in-place forward "NTT" on p. +// +// Assumes the coefficients are in absolute value ≤q. The resulting +// coefficients are in absolute value ≤7q. If the input is in Montgomery +// form, then the result is in Montgomery form and so (by linearity of the NTT) +// if the input is in regular form, then the result is also in regular form. +// The order of coefficients will be "tangled". These can be put back into +// their proper order by calling Detangle(). +func (p *Poly) NTT() { + if cpu.X86.HasAVX2 { + nttAVX2((*[N]int16)(p)) + } else { + p.nttGeneric() + } +} + +// Executes an in-place inverse "NTT" on p and multiply by the Montgomery +// factor R. +// +// Requires coefficients to be in "tangled" order, see Tangle(). +// Assumes the coefficients are in absolute value ≤q. The resulting +// coefficients are in absolute value ≤q. If the input is in Montgomery +// form, then the result is in Montgomery form and so (by linearity) +// if the input is in regular form, then the result is also in regular form. +func (p *Poly) InvNTT() { + if cpu.X86.HasAVX2 { + invNttAVX2((*[N]int16)(p)) + } else { + p.invNTTGeneric() + } +} + +// Sets p to the "pointwise" multiplication of a and b. 
+// +// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in +// Montgomery form. Products between coefficients of a and b must be strictly +// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and +// bounded in absolute value by 2q. +// +// Requires a and b to be in "tangled" order, see Tangle(). p will be in +// tangled order as well. +func (p *Poly) MulHat(a, b *Poly) { + if cpu.X86.HasAVX2 { + mulHatAVX2( + (*[N]int16)(p), + (*[N]int16)(a), + (*[N]int16)(b), + ) + } else { + p.mulHatGeneric(a, b) + } +} + +// Puts p into the right form to be used with (among others) InvNTT(). +func (p *Poly) Tangle() { + if cpu.X86.HasAVX2 { + tangleAVX2((*[N]int16)(p)) + } + + // When AVX2 is not available, we use the standard order. +} + +// Puts p back into standard form. +func (p *Poly) Detangle() { + if cpu.X86.HasAVX2 { + detangleAVX2((*[N]int16)(p)) + } + + // When AVX2 is not available, we use the standard order. +} + +// Almost normalizes coefficients. +// +// Ensures each coefficient is in {0, …, q}. +func (p *Poly) BarrettReduce() { + if cpu.X86.HasAVX2 { + barrettReduceAVX2((*[N]int16)(p)) + } else { + p.barrettReduceGeneric() + } +} + +// Normalizes coefficients. +// +// Ensures each coefficient is in {0, …, q-1}. +func (p *Poly) Normalize() { + if cpu.X86.HasAVX2 { + normalizeAVX2((*[N]int16)(p)) + } else { + p.normalizeGeneric() + } +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.s b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.s new file mode 100644 index 00000000000..d8205465e17 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.s @@ -0,0 +1,2354 @@ +// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT. 
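+//
+// Informal note on the arithmetic below: q = 3329 = 0x0d01 and
+// 62209 = q⁻¹ mod 2¹⁶. The recurring four-instruction pattern
+//
+//	VPMULLW a, b', lo   // lo = a·b·q⁻¹ mod 2¹⁶, with b' = b·62209 precomputed in ZetasAVX2
+//	VPMULHW a, b, hi    // hi = floor(a·b / 2¹⁶)
+//	VPMULHW lo, q, m    // m = floor(lo·q / 2¹⁶)
+//	VPSUBW  m, hi, r    // r ≡ a·b·2⁻¹⁶ (mod q)
+//
+// is a vectorized Montgomery multiplication, and the constant
+// 0x4ebf = 20159 ≈ 2²⁶/q used with VPMULHW and VPSRAW $0x0a implements
+// the Barrett reductions.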
+ +// +build amd64 + +#include "textflag.h" + +// func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16) +// Requires: AVX, AVX2 +TEXT ·addAVX2(SB), NOSPLIT, $0-24 + MOVQ p+0(FP), AX + MOVQ a+8(FP), CX + MOVQ b+16(FP), DX + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y6 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y10 + VMOVDQU 192(CX), Y12 + VMOVDQU 224(CX), Y14 + VMOVDQU (DX), Y1 + VMOVDQU 32(DX), Y3 + VMOVDQU 64(DX), Y5 + VMOVDQU 96(DX), Y7 + VMOVDQU 128(DX), Y9 + VMOVDQU 160(DX), Y11 + VMOVDQU 192(DX), Y13 + VMOVDQU 224(DX), Y15 + VPADDW Y0, Y1, Y1 + VPADDW Y2, Y3, Y3 + VPADDW Y4, Y5, Y5 + VPADDW Y6, Y7, Y7 + VPADDW Y8, Y9, Y9 + VPADDW Y10, Y11, Y11 + VPADDW Y12, Y13, Y13 + VPADDW Y14, Y15, Y15 + VMOVDQU Y1, (AX) + VMOVDQU Y3, 32(AX) + VMOVDQU Y5, 64(AX) + VMOVDQU Y7, 96(AX) + VMOVDQU Y9, 128(AX) + VMOVDQU Y11, 160(AX) + VMOVDQU Y13, 192(AX) + VMOVDQU Y15, 224(AX) + VMOVDQU 256(CX), Y0 + VMOVDQU 288(CX), Y2 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y10 + VMOVDQU 448(CX), Y12 + VMOVDQU 480(CX), Y14 + VMOVDQU 256(DX), Y1 + VMOVDQU 288(DX), Y3 + VMOVDQU 320(DX), Y5 + VMOVDQU 352(DX), Y7 + VMOVDQU 384(DX), Y9 + VMOVDQU 416(DX), Y11 + VMOVDQU 448(DX), Y13 + VMOVDQU 480(DX), Y15 + VPADDW Y0, Y1, Y1 + VPADDW Y2, Y3, Y3 + VPADDW Y4, Y5, Y5 + VPADDW Y6, Y7, Y7 + VPADDW Y8, Y9, Y9 + VPADDW Y10, Y11, Y11 + VPADDW Y12, Y13, Y13 + VPADDW Y14, Y15, Y15 + VMOVDQU Y1, 256(AX) + VMOVDQU Y3, 288(AX) + VMOVDQU Y5, 320(AX) + VMOVDQU Y7, 352(AX) + VMOVDQU Y9, 384(AX) + VMOVDQU Y11, 416(AX) + VMOVDQU Y13, 448(AX) + VMOVDQU Y15, 480(AX) + RET + +// func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16) +// Requires: AVX, AVX2 +TEXT ·subAVX2(SB), NOSPLIT, $0-24 + MOVQ p+0(FP), AX + MOVQ a+8(FP), CX + MOVQ b+16(FP), DX + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y6 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y10 + VMOVDQU 192(CX), Y12 + VMOVDQU 224(CX), Y14 + VMOVDQU (DX), Y1 + VMOVDQU 32(DX), Y3 + VMOVDQU 64(DX), Y5 + VMOVDQU 96(DX), Y7 + VMOVDQU 128(DX), Y9 + VMOVDQU 160(DX), Y11 + VMOVDQU 192(DX), Y13 + VMOVDQU 224(DX), Y15 + VPSUBW Y1, Y0, Y1 + VPSUBW Y3, Y2, Y3 + VPSUBW Y5, Y4, Y5 + VPSUBW Y7, Y6, Y7 + VPSUBW Y9, Y8, Y9 + VPSUBW Y11, Y10, Y11 + VPSUBW Y13, Y12, Y13 + VPSUBW Y15, Y14, Y15 + VMOVDQU Y1, (AX) + VMOVDQU Y3, 32(AX) + VMOVDQU Y5, 64(AX) + VMOVDQU Y7, 96(AX) + VMOVDQU Y9, 128(AX) + VMOVDQU Y11, 160(AX) + VMOVDQU Y13, 192(AX) + VMOVDQU Y15, 224(AX) + VMOVDQU 256(CX), Y0 + VMOVDQU 288(CX), Y2 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y10 + VMOVDQU 448(CX), Y12 + VMOVDQU 480(CX), Y14 + VMOVDQU 256(DX), Y1 + VMOVDQU 288(DX), Y3 + VMOVDQU 320(DX), Y5 + VMOVDQU 352(DX), Y7 + VMOVDQU 384(DX), Y9 + VMOVDQU 416(DX), Y11 + VMOVDQU 448(DX), Y13 + VMOVDQU 480(DX), Y15 + VPSUBW Y1, Y0, Y1 + VPSUBW Y3, Y2, Y3 + VPSUBW Y5, Y4, Y5 + VPSUBW Y7, Y6, Y7 + VPSUBW Y9, Y8, Y9 + VPSUBW Y11, Y10, Y11 + VPSUBW Y13, Y12, Y13 + VPSUBW Y15, Y14, Y15 + VMOVDQU Y1, 256(AX) + VMOVDQU Y3, 288(AX) + VMOVDQU Y5, 320(AX) + VMOVDQU Y7, 352(AX) + VMOVDQU Y9, 384(AX) + VMOVDQU Y11, 416(AX) + VMOVDQU Y13, 448(AX) + VMOVDQU Y15, 480(AX) + RET + +// func nttAVX2(p *[256]int16) +// Requires: AVX, AVX2 +TEXT ·nttAVX2(SB), NOSPLIT, $0-8 + MOVQ p+0(FP), AX + LEAQ ·ZetasAVX2+0(SB), CX + MOVL $0x00000d01, DX + VMOVD DX, X0 + VPBROADCASTW X0, Y15 + VPBROADCASTW (CX), Y0 + VPBROADCASTW 2(CX), Y1 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VMOVDQU 64(AX), Y9 + VMOVDQU 96(AX), Y10 + VMOVDQU 
256(AX), Y11 + VMOVDQU 288(AX), Y12 + VMOVDQU 320(AX), Y13 + VMOVDQU 352(AX), Y14 + VPMULLW Y11, Y0, Y2 + VPMULLW Y12, Y0, Y3 + VPMULLW Y13, Y0, Y4 + VPMULLW Y14, Y0, Y5 + VPMULHW Y11, Y1, Y11 + VPMULHW Y12, Y1, Y12 + VPMULHW Y13, Y1, Y13 + VPMULHW Y14, Y1, Y14 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y11, Y2 + VPSUBW Y3, Y12, Y3 + VPSUBW Y4, Y13, Y4 + VPSUBW Y5, Y14, Y5 + VPSUBW Y2, Y7, Y11 + VPSUBW Y3, Y8, Y12 + VPSUBW Y4, Y9, Y13 + VPSUBW Y5, Y10, Y14 + VPADDW Y2, Y7, Y7 + VPADDW Y3, Y8, Y8 + VPADDW Y4, Y9, Y9 + VPADDW Y5, Y10, Y10 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + VMOVDQU Y9, 64(AX) + VMOVDQU Y10, 96(AX) + VMOVDQU Y11, 256(AX) + VMOVDQU Y12, 288(AX) + VMOVDQU Y13, 320(AX) + VMOVDQU Y14, 352(AX) + VMOVDQU 128(AX), Y7 + VMOVDQU 160(AX), Y8 + VMOVDQU 192(AX), Y9 + VMOVDQU 224(AX), Y10 + VMOVDQU 384(AX), Y11 + VMOVDQU 416(AX), Y12 + VMOVDQU 448(AX), Y13 + VMOVDQU 480(AX), Y14 + VPMULLW Y11, Y0, Y2 + VPMULLW Y12, Y0, Y3 + VPMULLW Y13, Y0, Y4 + VPMULLW Y14, Y0, Y5 + VPMULHW Y11, Y1, Y11 + VPMULHW Y12, Y1, Y12 + VPMULHW Y13, Y1, Y13 + VPMULHW Y14, Y1, Y14 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y11, Y2 + VPSUBW Y3, Y12, Y3 + VPSUBW Y4, Y13, Y4 + VPSUBW Y5, Y14, Y5 + VPSUBW Y2, Y7, Y11 + VPSUBW Y3, Y8, Y12 + VPSUBW Y4, Y9, Y13 + VPSUBW Y5, Y10, Y14 + VPADDW Y2, Y7, Y7 + VPADDW Y3, Y8, Y8 + VPADDW Y4, Y9, Y9 + VPADDW Y5, Y10, Y10 + VMOVDQU Y7, 128(AX) + VMOVDQU Y8, 160(AX) + VMOVDQU Y9, 192(AX) + VMOVDQU Y10, 224(AX) + VMOVDQU Y11, 384(AX) + VMOVDQU Y12, 416(AX) + VMOVDQU Y13, 448(AX) + VMOVDQU Y14, 480(AX) + VPBROADCASTW 4(CX), Y0 + VPBROADCASTW 6(CX), Y1 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VMOVDQU 64(AX), Y9 + VMOVDQU 96(AX), Y10 + VMOVDQU 128(AX), Y11 + VMOVDQU 160(AX), Y12 + VMOVDQU 192(AX), Y13 + VMOVDQU 224(AX), Y14 + VPMULLW Y11, Y0, Y2 + VPMULLW Y12, Y0, Y3 + VPMULLW Y13, Y0, Y4 + VPMULLW Y14, Y0, Y5 + VPMULHW Y11, Y1, Y11 + VPMULHW Y12, Y1, Y12 + VPMULHW Y13, Y1, Y13 + VPMULHW Y14, Y1, Y14 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y11, Y2 + VPSUBW Y3, Y12, Y3 + VPSUBW Y4, Y13, Y4 + VPSUBW Y5, Y14, Y5 + VPSUBW Y2, Y7, Y11 + VPSUBW Y3, Y8, Y12 + VPSUBW Y4, Y9, Y13 + VPSUBW Y5, Y10, Y14 + VPADDW Y2, Y7, Y7 + VPADDW Y3, Y8, Y8 + VPADDW Y4, Y9, Y9 + VPADDW Y5, Y10, Y10 + VPBROADCASTW 12(CX), Y0 + VPBROADCASTW 14(CX), Y1 + VPBROADCASTW 16(CX), Y2 + VPBROADCASTW 18(CX), Y3 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y13, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y13, Y3, Y13 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y9, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y13, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y9 + VPSUBW Y5, Y8, Y10 + VPSUBW Y6, Y11, Y13 + VPSUBW Y0, Y12, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y8, Y8 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y12, Y12 + VMOVDQU 32(CX), Y0 + VMOVDQU 64(CX), Y1 + VMOVDQU 96(CX), Y2 + VMOVDQU 128(CX), Y3 + VPERM2I128 $0x20, Y9, Y7, Y4 + VPERM2I128 $0x31, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VPERM2I128 $0x20, Y10, Y8, Y4 + VPERM2I128 $0x31, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VPERM2I128 $0x20, Y13, Y11, Y4 + VPERM2I128 $0x31, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VPERM2I128 $0x20, Y14, Y12, Y4 + VPERM2I128 $0x31, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPMULLW Y8, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y12, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y8, Y1, Y8 + 
VPMULHW Y10, Y1, Y10 + VPMULHW Y12, Y3, Y12 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y8, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y12, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y8 + VPSUBW Y5, Y9, Y10 + VPSUBW Y6, Y11, Y12 + VPSUBW Y0, Y13, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y9, Y9 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y13, Y13 + VMOVDQU 288(CX), Y0 + VMOVDQU 320(CX), Y1 + VMOVDQU 352(CX), Y2 + VMOVDQU 384(CX), Y3 + VPUNPCKLQDQ Y8, Y7, Y4 + VPUNPCKHQDQ Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPUNPCKLQDQ Y10, Y9, Y4 + VPUNPCKHQDQ Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPUNPCKLQDQ Y12, Y11, Y4 + VPUNPCKHQDQ Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPUNPCKLQDQ Y14, Y13, Y4 + VPUNPCKHQDQ Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y13, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y13, Y3, Y13 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y9, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y13, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y9 + VPSUBW Y5, Y8, Y10 + VPSUBW Y6, Y11, Y13 + VPSUBW Y0, Y12, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y8, Y8 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y12, Y12 + VMOVDQU 544(CX), Y0 + VMOVDQU 576(CX), Y1 + VMOVDQU 608(CX), Y2 + VMOVDQU 640(CX), Y3 + VMOVSLDUP Y9, Y4 + VPBLENDD $0xaa, Y4, Y7, Y4 + VPSRLQ $0x20, Y7, Y7 + VPBLENDD $0xaa, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VMOVSLDUP Y10, Y4 + VPBLENDD $0xaa, Y4, Y8, Y4 + VPSRLQ $0x20, Y8, Y8 + VPBLENDD $0xaa, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VMOVSLDUP Y13, Y4 + VPBLENDD $0xaa, Y4, Y11, Y4 + VPSRLQ $0x20, Y11, Y11 + VPBLENDD $0xaa, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VMOVSLDUP Y14, Y4 + VPBLENDD $0xaa, Y4, Y12, Y4 + VPSRLQ $0x20, Y12, Y12 + VPBLENDD $0xaa, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPMULLW Y8, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y12, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y8, Y1, Y8 + VPMULHW Y10, Y1, Y10 + VPMULHW Y12, Y3, Y12 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y8, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y12, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y8 + VPSUBW Y5, Y9, Y10 + VPSUBW Y6, Y11, Y12 + VPSUBW Y0, Y13, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y9, Y9 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y13, Y13 + VMOVDQU 800(CX), Y0 + VMOVDQU 832(CX), Y1 + VMOVDQU 864(CX), Y2 + VMOVDQU 896(CX), Y3 + VPSLLD $0x10, Y8, Y4 + VPBLENDW $0xaa, Y4, Y7, Y4 + VPSRLD $0x10, Y7, Y7 + VPBLENDW $0xaa, Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPSLLD $0x10, Y10, Y4 + VPBLENDW $0xaa, Y4, Y9, Y4 + VPSRLD $0x10, Y9, Y9 + VPBLENDW $0xaa, Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPSLLD $0x10, Y12, Y4 + VPBLENDW $0xaa, Y4, Y11, Y4 + VPSRLD $0x10, Y11, Y11 + VPBLENDW $0xaa, Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPSLLD $0x10, Y14, Y4 + VPBLENDW $0xaa, Y4, Y13, Y4 + VPSRLD $0x10, Y13, Y13 + VPBLENDW $0xaa, Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y13, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y13, Y3, Y13 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y9, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y13, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y9 + VPSUBW Y5, Y8, Y10 + VPSUBW Y6, Y11, Y13 + VPSUBW Y0, Y12, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y8, Y8 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y12, Y12 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + VMOVDQU Y9, 
64(AX) + VMOVDQU Y10, 96(AX) + VMOVDQU Y11, 128(AX) + VMOVDQU Y12, 160(AX) + VMOVDQU Y13, 192(AX) + VMOVDQU Y14, 224(AX) + VPBROADCASTW 8(CX), Y0 + VPBROADCASTW 10(CX), Y1 + VMOVDQU 256(AX), Y7 + VMOVDQU 288(AX), Y8 + VMOVDQU 320(AX), Y9 + VMOVDQU 352(AX), Y10 + VMOVDQU 384(AX), Y11 + VMOVDQU 416(AX), Y12 + VMOVDQU 448(AX), Y13 + VMOVDQU 480(AX), Y14 + VPMULLW Y11, Y0, Y2 + VPMULLW Y12, Y0, Y3 + VPMULLW Y13, Y0, Y4 + VPMULLW Y14, Y0, Y5 + VPMULHW Y11, Y1, Y11 + VPMULHW Y12, Y1, Y12 + VPMULHW Y13, Y1, Y13 + VPMULHW Y14, Y1, Y14 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y11, Y2 + VPSUBW Y3, Y12, Y3 + VPSUBW Y4, Y13, Y4 + VPSUBW Y5, Y14, Y5 + VPSUBW Y2, Y7, Y11 + VPSUBW Y3, Y8, Y12 + VPSUBW Y4, Y9, Y13 + VPSUBW Y5, Y10, Y14 + VPADDW Y2, Y7, Y7 + VPADDW Y3, Y8, Y8 + VPADDW Y4, Y9, Y9 + VPADDW Y5, Y10, Y10 + VPBROADCASTW 20(CX), Y0 + VPBROADCASTW 22(CX), Y1 + VPBROADCASTW 24(CX), Y2 + VPBROADCASTW 26(CX), Y3 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y13, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y13, Y3, Y13 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y9, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y13, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y9 + VPSUBW Y5, Y8, Y10 + VPSUBW Y6, Y11, Y13 + VPSUBW Y0, Y12, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y8, Y8 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y12, Y12 + VMOVDQU 160(CX), Y0 + VMOVDQU 192(CX), Y1 + VMOVDQU 224(CX), Y2 + VMOVDQU 256(CX), Y3 + VPERM2I128 $0x20, Y9, Y7, Y4 + VPERM2I128 $0x31, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VPERM2I128 $0x20, Y10, Y8, Y4 + VPERM2I128 $0x31, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VPERM2I128 $0x20, Y13, Y11, Y4 + VPERM2I128 $0x31, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VPERM2I128 $0x20, Y14, Y12, Y4 + VPERM2I128 $0x31, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPMULLW Y8, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y12, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y8, Y1, Y8 + VPMULHW Y10, Y1, Y10 + VPMULHW Y12, Y3, Y12 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y8, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y12, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y8 + VPSUBW Y5, Y9, Y10 + VPSUBW Y6, Y11, Y12 + VPSUBW Y0, Y13, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y9, Y9 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y13, Y13 + VMOVDQU 416(CX), Y0 + VMOVDQU 448(CX), Y1 + VMOVDQU 480(CX), Y2 + VMOVDQU 512(CX), Y3 + VPUNPCKLQDQ Y8, Y7, Y4 + VPUNPCKHQDQ Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPUNPCKLQDQ Y10, Y9, Y4 + VPUNPCKHQDQ Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPUNPCKLQDQ Y12, Y11, Y4 + VPUNPCKHQDQ Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPUNPCKLQDQ Y14, Y13, Y4 + VPUNPCKHQDQ Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y13, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y13, Y3, Y13 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y9, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y13, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y9 + VPSUBW Y5, Y8, Y10 + VPSUBW Y6, Y11, Y13 + VPSUBW Y0, Y12, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y8, Y8 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y12, Y12 + VMOVDQU 672(CX), Y0 + VMOVDQU 704(CX), Y1 + VMOVDQU 736(CX), Y2 + VMOVDQU 768(CX), Y3 + VMOVSLDUP Y9, Y4 + VPBLENDD $0xaa, Y4, Y7, Y4 + VPSRLQ $0x20, Y7, Y7 + VPBLENDD $0xaa, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VMOVSLDUP Y10, Y4 + VPBLENDD 
$0xaa, Y4, Y8, Y4 + VPSRLQ $0x20, Y8, Y8 + VPBLENDD $0xaa, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VMOVSLDUP Y13, Y4 + VPBLENDD $0xaa, Y4, Y11, Y4 + VPSRLQ $0x20, Y11, Y11 + VPBLENDD $0xaa, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VMOVSLDUP Y14, Y4 + VPBLENDD $0xaa, Y4, Y12, Y4 + VPSRLQ $0x20, Y12, Y12 + VPBLENDD $0xaa, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPMULLW Y8, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y12, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y8, Y1, Y8 + VPMULHW Y10, Y1, Y10 + VPMULHW Y12, Y3, Y12 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y8, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y12, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y8 + VPSUBW Y5, Y9, Y10 + VPSUBW Y6, Y11, Y12 + VPSUBW Y0, Y13, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y9, Y9 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y13, Y13 + VMOVDQU 928(CX), Y0 + VMOVDQU 960(CX), Y1 + VMOVDQU 992(CX), Y2 + VMOVDQU 1024(CX), Y3 + VPSLLD $0x10, Y8, Y4 + VPBLENDW $0xaa, Y4, Y7, Y4 + VPSRLD $0x10, Y7, Y7 + VPBLENDW $0xaa, Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPSLLD $0x10, Y10, Y4 + VPBLENDW $0xaa, Y4, Y9, Y4 + VPSRLD $0x10, Y9, Y9 + VPBLENDW $0xaa, Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPSLLD $0x10, Y12, Y4 + VPBLENDW $0xaa, Y4, Y11, Y4 + VPSRLD $0x10, Y11, Y11 + VPBLENDW $0xaa, Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPSLLD $0x10, Y14, Y4 + VPBLENDW $0xaa, Y4, Y13, Y4 + VPSRLD $0x10, Y13, Y13 + VPBLENDW $0xaa, Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULLW Y13, Y2, Y6 + VPMULLW Y14, Y2, Y0 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y13, Y3, Y13 + VPMULHW Y14, Y3, Y14 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPMULHW Y6, Y15, Y6 + VPMULHW Y0, Y15, Y0 + VPSUBW Y4, Y9, Y4 + VPSUBW Y5, Y10, Y5 + VPSUBW Y6, Y13, Y6 + VPSUBW Y0, Y14, Y0 + VPSUBW Y4, Y7, Y9 + VPSUBW Y5, Y8, Y10 + VPSUBW Y6, Y11, Y13 + VPSUBW Y0, Y12, Y14 + VPADDW Y4, Y7, Y7 + VPADDW Y5, Y8, Y8 + VPADDW Y6, Y11, Y11 + VPADDW Y0, Y12, Y12 + VMOVDQU Y7, 256(AX) + VMOVDQU Y8, 288(AX) + VMOVDQU Y9, 320(AX) + VMOVDQU Y10, 352(AX) + VMOVDQU Y11, 384(AX) + VMOVDQU Y12, 416(AX) + VMOVDQU Y13, 448(AX) + VMOVDQU Y14, 480(AX) + RET + +// func invNttAVX2(p *[256]int16) +// Requires: AVX, AVX2 +TEXT ·invNttAVX2(SB), NOSPLIT, $0-8 + MOVQ p+0(FP), AX + LEAQ ·ZetasAVX2+0(SB), CX + MOVL $0x00000d01, DX + VMOVD DX, X0 + VPBROADCASTW X0, Y15 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VMOVDQU 64(AX), Y9 + VMOVDQU 96(AX), Y10 + VMOVDQU 128(AX), Y11 + VMOVDQU 160(AX), Y12 + VMOVDQU 192(AX), Y13 + VMOVDQU 224(AX), Y14 + VMOVDQU 1056(CX), Y0 + VMOVDQU 1088(CX), Y1 + VMOVDQU 1120(CX), Y2 + VMOVDQU 1152(CX), Y3 + VPSUBW Y7, Y9, Y4 + VPSUBW Y8, Y10, Y5 + VPSUBW Y11, Y13, Y6 + VPADDW Y7, Y9, Y7 + VPADDW Y8, Y10, Y8 + VPADDW Y11, Y13, Y11 + VPMULLW Y4, Y0, Y9 + VPMULLW Y5, Y0, Y10 + VPSUBW Y12, Y14, Y0 + VPMULLW Y6, Y2, Y13 + VPADDW Y12, Y14, Y12 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y9, Y4, Y9 + VPSUBW Y10, Y5, Y10 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y0, Y14 + VMOVDQU 1312(CX), Y0 + VMOVDQU 1344(CX), Y1 + VMOVDQU 1376(CX), Y2 + VMOVDQU 1408(CX), Y3 + VPSLLD $0x10, Y8, Y4 + VPBLENDW $0xaa, Y4, Y7, Y4 + VPSRLD $0x10, Y7, Y7 + VPBLENDW $0xaa, Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPSLLD $0x10, Y10, Y4 + VPBLENDW $0xaa, Y4, Y9, Y4 + VPSRLD $0x10, Y9, Y9 + VPBLENDW $0xaa, Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPSLLD $0x10, Y12, Y4 + VPBLENDW $0xaa, Y4, Y11, Y4 + VPSRLD $0x10, 
Y11, Y11 + VPBLENDW $0xaa, Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPSLLD $0x10, Y14, Y4 + VPBLENDW $0xaa, Y4, Y13, Y4 + VPSRLD $0x10, Y13, Y13 + VPBLENDW $0xaa, Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPSUBW Y7, Y8, Y4 + VPSUBW Y9, Y10, Y5 + VPSUBW Y11, Y12, Y6 + VPADDW Y7, Y8, Y7 + VPADDW Y9, Y10, Y9 + VPADDW Y11, Y12, Y11 + VPMULLW Y4, Y0, Y8 + VPMULLW Y5, Y0, Y10 + VPSUBW Y13, Y14, Y0 + VPMULLW Y6, Y2, Y12 + VPADDW Y13, Y14, Y13 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y8, Y15, Y8 + VPMULHW Y10, Y15, Y10 + VPMULHW Y12, Y15, Y12 + VPMULHW Y14, Y15, Y14 + VPSUBW Y8, Y4, Y8 + VPSUBW Y10, Y5, Y10 + VPSUBW Y12, Y6, Y12 + VPSUBW Y14, Y0, Y14 + VMOVDQU 1568(CX), Y0 + VMOVDQU 1600(CX), Y1 + VMOVDQU 1632(CX), Y2 + VMOVDQU 1664(CX), Y3 + VMOVSLDUP Y9, Y4 + VPBLENDD $0xaa, Y4, Y7, Y4 + VPSRLQ $0x20, Y7, Y7 + VPBLENDD $0xaa, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VMOVSLDUP Y10, Y4 + VPBLENDD $0xaa, Y4, Y8, Y4 + VPSRLQ $0x20, Y8, Y8 + VPBLENDD $0xaa, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VMOVSLDUP Y13, Y4 + VPBLENDD $0xaa, Y4, Y11, Y4 + VPSRLQ $0x20, Y11, Y11 + VPBLENDD $0xaa, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VMOVSLDUP Y14, Y4 + VPBLENDD $0xaa, Y4, Y12, Y4 + VPSRLQ $0x20, Y12, Y12 + VPBLENDD $0xaa, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPSUBW Y7, Y9, Y4 + VPSUBW Y8, Y10, Y5 + VPSUBW Y11, Y13, Y6 + VPADDW Y7, Y9, Y7 + VPADDW Y8, Y10, Y8 + VPADDW Y11, Y13, Y11 + VPMULLW Y4, Y0, Y9 + VPMULLW Y5, Y0, Y10 + VPSUBW Y12, Y14, Y0 + VPMULLW Y6, Y2, Y13 + VPADDW Y12, Y14, Y12 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y9, Y4, Y9 + VPSUBW Y10, Y5, Y10 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y0, Y14 + MOVL $0x00004ebf, DX + VMOVD DX, X0 + VPBROADCASTW X0, Y4 + VPMULHW Y4, Y7, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y7, Y7 + VPMULHW Y4, Y11, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y11, Y11 + VMOVDQU 1824(CX), Y0 + VMOVDQU 1856(CX), Y1 + VMOVDQU 1888(CX), Y2 + VMOVDQU 1920(CX), Y3 + VPUNPCKLQDQ Y8, Y7, Y4 + VPUNPCKHQDQ Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPUNPCKLQDQ Y10, Y9, Y4 + VPUNPCKHQDQ Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPUNPCKLQDQ Y12, Y11, Y4 + VPUNPCKHQDQ Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPUNPCKLQDQ Y14, Y13, Y4 + VPUNPCKHQDQ Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPSUBW Y7, Y8, Y4 + VPSUBW Y9, Y10, Y5 + VPSUBW Y11, Y12, Y6 + VPADDW Y7, Y8, Y7 + VPADDW Y9, Y10, Y9 + VPADDW Y11, Y12, Y11 + VPMULLW Y4, Y0, Y8 + VPMULLW Y5, Y0, Y10 + VPSUBW Y13, Y14, Y0 + VPMULLW Y6, Y2, Y12 + VPADDW Y13, Y14, Y13 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y8, Y15, Y8 + VPMULHW Y10, Y15, Y10 + VPMULHW Y12, Y15, Y12 + VPMULHW Y14, Y15, Y14 + VPSUBW Y8, Y4, Y8 + VPSUBW Y10, Y5, Y10 + VPSUBW Y12, Y6, Y12 + VPSUBW Y14, Y0, Y14 + VPBROADCASTW 2080(CX), Y0 + VPBROADCASTW 2082(CX), Y1 + VPBROADCASTW 2084(CX), Y2 + VPBROADCASTW 2086(CX), Y3 + VPERM2I128 $0x20, Y9, Y7, Y4 + VPERM2I128 $0x31, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VPERM2I128 $0x20, Y10, Y8, Y4 + VPERM2I128 $0x31, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VPERM2I128 $0x20, Y13, Y11, Y4 + VPERM2I128 $0x31, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VPERM2I128 $0x20, Y14, Y12, Y4 + VPERM2I128 $0x31, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPSUBW Y7, Y9, Y4 + VPSUBW Y8, Y10, Y5 + VPSUBW Y11, Y13, Y6 + VPADDW Y7, Y9, Y7 + VPADDW Y8, Y10, Y8 + VPADDW Y11, Y13, Y11 + VPMULLW Y4, Y0, Y9 + VPMULLW Y5, Y0, 
Y10 + VPSUBW Y12, Y14, Y0 + VPMULLW Y6, Y2, Y13 + VPADDW Y12, Y14, Y12 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y9, Y4, Y9 + VPSUBW Y10, Y5, Y10 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y0, Y14 + MOVL $0x00004ebf, DX + VMOVD DX, X0 + VPBROADCASTW X0, Y4 + VPMULHW Y4, Y7, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y7, Y7 + VPMULHW Y4, Y11, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y11, Y11 + VPBROADCASTW 2096(CX), Y0 + VPBROADCASTW 2098(CX), Y1 + VPSUBW Y7, Y11, Y4 + VPSUBW Y8, Y12, Y5 + VPSUBW Y9, Y13, Y6 + VPADDW Y7, Y11, Y7 + VPADDW Y8, Y12, Y8 + VPADDW Y9, Y13, Y9 + VPMULLW Y4, Y0, Y11 + VPMULLW Y5, Y0, Y12 + VPSUBW Y10, Y14, Y2 + VPMULLW Y6, Y0, Y13 + VPADDW Y10, Y14, Y10 + VPMULLW Y2, Y0, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y1, Y6 + VPMULHW Y2, Y1, Y2 + VPMULHW Y11, Y15, Y11 + VPMULHW Y12, Y15, Y12 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y11, Y4, Y11 + VPSUBW Y12, Y5, Y12 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y2, Y14 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + VMOVDQU Y9, 64(AX) + VMOVDQU Y10, 96(AX) + VMOVDQU Y11, 128(AX) + VMOVDQU Y12, 160(AX) + VMOVDQU Y13, 192(AX) + VMOVDQU Y14, 224(AX) + VMOVDQU 256(AX), Y7 + VMOVDQU 288(AX), Y8 + VMOVDQU 320(AX), Y9 + VMOVDQU 352(AX), Y10 + VMOVDQU 384(AX), Y11 + VMOVDQU 416(AX), Y12 + VMOVDQU 448(AX), Y13 + VMOVDQU 480(AX), Y14 + VMOVDQU 1184(CX), Y0 + VMOVDQU 1216(CX), Y1 + VMOVDQU 1248(CX), Y2 + VMOVDQU 1280(CX), Y3 + VPSUBW Y7, Y9, Y4 + VPSUBW Y8, Y10, Y5 + VPSUBW Y11, Y13, Y6 + VPADDW Y7, Y9, Y7 + VPADDW Y8, Y10, Y8 + VPADDW Y11, Y13, Y11 + VPMULLW Y4, Y0, Y9 + VPMULLW Y5, Y0, Y10 + VPSUBW Y12, Y14, Y0 + VPMULLW Y6, Y2, Y13 + VPADDW Y12, Y14, Y12 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y9, Y4, Y9 + VPSUBW Y10, Y5, Y10 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y0, Y14 + VMOVDQU 1440(CX), Y0 + VMOVDQU 1472(CX), Y1 + VMOVDQU 1504(CX), Y2 + VMOVDQU 1536(CX), Y3 + VPSLLD $0x10, Y8, Y4 + VPBLENDW $0xaa, Y4, Y7, Y4 + VPSRLD $0x10, Y7, Y7 + VPBLENDW $0xaa, Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPSLLD $0x10, Y10, Y4 + VPBLENDW $0xaa, Y4, Y9, Y4 + VPSRLD $0x10, Y9, Y9 + VPBLENDW $0xaa, Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPSLLD $0x10, Y12, Y4 + VPBLENDW $0xaa, Y4, Y11, Y4 + VPSRLD $0x10, Y11, Y11 + VPBLENDW $0xaa, Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPSLLD $0x10, Y14, Y4 + VPBLENDW $0xaa, Y4, Y13, Y4 + VPSRLD $0x10, Y13, Y13 + VPBLENDW $0xaa, Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPSUBW Y7, Y8, Y4 + VPSUBW Y9, Y10, Y5 + VPSUBW Y11, Y12, Y6 + VPADDW Y7, Y8, Y7 + VPADDW Y9, Y10, Y9 + VPADDW Y11, Y12, Y11 + VPMULLW Y4, Y0, Y8 + VPMULLW Y5, Y0, Y10 + VPSUBW Y13, Y14, Y0 + VPMULLW Y6, Y2, Y12 + VPADDW Y13, Y14, Y13 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y8, Y15, Y8 + VPMULHW Y10, Y15, Y10 + VPMULHW Y12, Y15, Y12 + VPMULHW Y14, Y15, Y14 + VPSUBW Y8, Y4, Y8 + VPSUBW Y10, Y5, Y10 + VPSUBW Y12, Y6, Y12 + VPSUBW Y14, Y0, Y14 + VMOVDQU 1696(CX), Y0 + VMOVDQU 1728(CX), Y1 + VMOVDQU 1760(CX), Y2 + VMOVDQU 1792(CX), Y3 + VMOVSLDUP Y9, Y4 + VPBLENDD $0xaa, Y4, Y7, Y4 + VPSRLQ $0x20, Y7, Y7 + VPBLENDD $0xaa, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VMOVSLDUP Y10, Y4 + VPBLENDD $0xaa, Y4, Y8, Y4 + VPSRLQ $0x20, Y8, Y8 + 
VPBLENDD $0xaa, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VMOVSLDUP Y13, Y4 + VPBLENDD $0xaa, Y4, Y11, Y4 + VPSRLQ $0x20, Y11, Y11 + VPBLENDD $0xaa, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VMOVSLDUP Y14, Y4 + VPBLENDD $0xaa, Y4, Y12, Y4 + VPSRLQ $0x20, Y12, Y12 + VPBLENDD $0xaa, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPSUBW Y7, Y9, Y4 + VPSUBW Y8, Y10, Y5 + VPSUBW Y11, Y13, Y6 + VPADDW Y7, Y9, Y7 + VPADDW Y8, Y10, Y8 + VPADDW Y11, Y13, Y11 + VPMULLW Y4, Y0, Y9 + VPMULLW Y5, Y0, Y10 + VPSUBW Y12, Y14, Y0 + VPMULLW Y6, Y2, Y13 + VPADDW Y12, Y14, Y12 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y9, Y4, Y9 + VPSUBW Y10, Y5, Y10 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y0, Y14 + MOVL $0x00004ebf, DX + VMOVD DX, X0 + VPBROADCASTW X0, Y4 + VPMULHW Y4, Y7, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y7, Y7 + VPMULHW Y4, Y11, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y11, Y11 + VMOVDQU 1952(CX), Y0 + VMOVDQU 1984(CX), Y1 + VMOVDQU 2016(CX), Y2 + VMOVDQU 2048(CX), Y3 + VPUNPCKLQDQ Y8, Y7, Y4 + VPUNPCKHQDQ Y8, Y7, Y8 + VMOVDQA Y4, Y7 + VPUNPCKLQDQ Y10, Y9, Y4 + VPUNPCKHQDQ Y10, Y9, Y10 + VMOVDQA Y4, Y9 + VPUNPCKLQDQ Y12, Y11, Y4 + VPUNPCKHQDQ Y12, Y11, Y12 + VMOVDQA Y4, Y11 + VPUNPCKLQDQ Y14, Y13, Y4 + VPUNPCKHQDQ Y14, Y13, Y14 + VMOVDQA Y4, Y13 + VPSUBW Y7, Y8, Y4 + VPSUBW Y9, Y10, Y5 + VPSUBW Y11, Y12, Y6 + VPADDW Y7, Y8, Y7 + VPADDW Y9, Y10, Y9 + VPADDW Y11, Y12, Y11 + VPMULLW Y4, Y0, Y8 + VPMULLW Y5, Y0, Y10 + VPSUBW Y13, Y14, Y0 + VPMULLW Y6, Y2, Y12 + VPADDW Y13, Y14, Y13 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y8, Y15, Y8 + VPMULHW Y10, Y15, Y10 + VPMULHW Y12, Y15, Y12 + VPMULHW Y14, Y15, Y14 + VPSUBW Y8, Y4, Y8 + VPSUBW Y10, Y5, Y10 + VPSUBW Y12, Y6, Y12 + VPSUBW Y14, Y0, Y14 + VPBROADCASTW 2088(CX), Y0 + VPBROADCASTW 2090(CX), Y1 + VPBROADCASTW 2092(CX), Y2 + VPBROADCASTW 2094(CX), Y3 + VPERM2I128 $0x20, Y9, Y7, Y4 + VPERM2I128 $0x31, Y9, Y7, Y9 + VMOVDQA Y4, Y7 + VPERM2I128 $0x20, Y10, Y8, Y4 + VPERM2I128 $0x31, Y10, Y8, Y10 + VMOVDQA Y4, Y8 + VPERM2I128 $0x20, Y13, Y11, Y4 + VPERM2I128 $0x31, Y13, Y11, Y13 + VMOVDQA Y4, Y11 + VPERM2I128 $0x20, Y14, Y12, Y4 + VPERM2I128 $0x31, Y14, Y12, Y14 + VMOVDQA Y4, Y12 + VPSUBW Y7, Y9, Y4 + VPSUBW Y8, Y10, Y5 + VPSUBW Y11, Y13, Y6 + VPADDW Y7, Y9, Y7 + VPADDW Y8, Y10, Y8 + VPADDW Y11, Y13, Y11 + VPMULLW Y4, Y0, Y9 + VPMULLW Y5, Y0, Y10 + VPSUBW Y12, Y14, Y0 + VPMULLW Y6, Y2, Y13 + VPADDW Y12, Y14, Y12 + VPMULLW Y0, Y2, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y3, Y6 + VPMULHW Y0, Y3, Y0 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y9, Y4, Y9 + VPSUBW Y10, Y5, Y10 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y0, Y14 + MOVL $0x00004ebf, DX + VMOVD DX, X0 + VPBROADCASTW X0, Y4 + VPMULHW Y4, Y7, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y7, Y7 + VPMULHW Y4, Y11, Y5 + VPSRAW $0x0a, Y5, Y5 + VPMULLW Y15, Y5, Y5 + VPSUBW Y5, Y11, Y11 + VPBROADCASTW 2100(CX), Y0 + VPBROADCASTW 2102(CX), Y1 + VPSUBW Y7, Y11, Y4 + VPSUBW Y8, Y12, Y5 + VPSUBW Y9, Y13, Y6 + VPADDW Y7, Y11, Y7 + VPADDW Y8, Y12, Y8 + VPADDW Y9, Y13, Y9 + VPMULLW Y4, Y0, Y11 + VPMULLW Y5, Y0, Y12 + VPSUBW Y10, Y14, Y2 + VPMULLW Y6, Y0, Y13 + VPADDW Y10, Y14, Y10 + VPMULLW Y2, Y0, Y14 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y6, Y1, Y6 + VPMULHW Y2, Y1, Y2 + 
VPMULHW Y11, Y15, Y11 + VPMULHW Y12, Y15, Y12 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y11, Y4, Y11 + VPSUBW Y12, Y5, Y12 + VPSUBW Y13, Y6, Y13 + VPSUBW Y14, Y2, Y14 + VMOVDQU Y7, 256(AX) + VMOVDQU Y8, 288(AX) + VMOVDQU Y9, 320(AX) + VMOVDQU Y10, 352(AX) + VMOVDQU Y11, 384(AX) + VMOVDQU Y12, 416(AX) + VMOVDQU Y13, 448(AX) + VMOVDQU Y14, 480(AX) + VPBROADCASTW 2104(CX), Y0 + VPBROADCASTW 2106(CX), Y1 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VMOVDQU 64(AX), Y9 + VMOVDQU 96(AX), Y10 + VMOVDQU 256(AX), Y11 + VMOVDQU 288(AX), Y12 + VMOVDQU 320(AX), Y13 + VMOVDQU 352(AX), Y14 + VPSUBW Y7, Y11, Y2 + VPSUBW Y8, Y12, Y3 + VPSUBW Y9, Y13, Y4 + VPADDW Y7, Y11, Y7 + VPADDW Y8, Y12, Y8 + VPADDW Y9, Y13, Y9 + VPMULLW Y2, Y0, Y11 + VPMULLW Y3, Y0, Y12 + VPSUBW Y10, Y14, Y5 + VPMULLW Y4, Y0, Y13 + VPADDW Y10, Y14, Y10 + VPMULLW Y5, Y0, Y14 + VPMULHW Y2, Y1, Y2 + VPMULHW Y3, Y1, Y3 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y11, Y15, Y11 + VPMULHW Y12, Y15, Y12 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y11, Y2, Y11 + VPSUBW Y12, Y3, Y12 + VPSUBW Y13, Y4, Y13 + VPSUBW Y14, Y5, Y14 + MOVL $0xffffd8a1, DX + VMOVD DX, X0 + VPBROADCASTW X0, Y0 + MOVL $0x000005a1, DX + VMOVD DX, X1 + VPBROADCASTW X1, Y1 + VPMULLW Y7, Y0, Y2 + VPMULLW Y8, Y0, Y3 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULHW Y7, Y1, Y7 + VPMULHW Y8, Y1, Y8 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y7, Y7 + VPSUBW Y3, Y8, Y8 + VPSUBW Y4, Y9, Y9 + VPSUBW Y5, Y10, Y10 + VPMULLW Y11, Y0, Y2 + VPMULLW Y12, Y0, Y3 + VPMULLW Y13, Y0, Y4 + VPMULLW Y14, Y0, Y5 + VPMULHW Y11, Y1, Y11 + VPMULHW Y12, Y1, Y12 + VPMULHW Y13, Y1, Y13 + VPMULHW Y14, Y1, Y14 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y11, Y11 + VPSUBW Y3, Y12, Y12 + VPSUBW Y4, Y13, Y13 + VPSUBW Y5, Y14, Y14 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + VMOVDQU Y9, 64(AX) + VMOVDQU Y10, 96(AX) + VMOVDQU Y11, 256(AX) + VMOVDQU Y12, 288(AX) + VMOVDQU Y13, 320(AX) + VMOVDQU Y14, 352(AX) + VPBROADCASTW 2104(CX), Y0 + VPBROADCASTW 2106(CX), Y1 + VMOVDQU 128(AX), Y7 + VMOVDQU 160(AX), Y8 + VMOVDQU 192(AX), Y9 + VMOVDQU 224(AX), Y10 + VMOVDQU 384(AX), Y11 + VMOVDQU 416(AX), Y12 + VMOVDQU 448(AX), Y13 + VMOVDQU 480(AX), Y14 + VPSUBW Y7, Y11, Y2 + VPSUBW Y8, Y12, Y3 + VPSUBW Y9, Y13, Y4 + VPADDW Y7, Y11, Y7 + VPADDW Y8, Y12, Y8 + VPADDW Y9, Y13, Y9 + VPMULLW Y2, Y0, Y11 + VPMULLW Y3, Y0, Y12 + VPSUBW Y10, Y14, Y5 + VPMULLW Y4, Y0, Y13 + VPADDW Y10, Y14, Y10 + VPMULLW Y5, Y0, Y14 + VPMULHW Y2, Y1, Y2 + VPMULHW Y3, Y1, Y3 + VPMULHW Y4, Y1, Y4 + VPMULHW Y5, Y1, Y5 + VPMULHW Y11, Y15, Y11 + VPMULHW Y12, Y15, Y12 + VPMULHW Y13, Y15, Y13 + VPMULHW Y14, Y15, Y14 + VPSUBW Y11, Y2, Y11 + VPSUBW Y12, Y3, Y12 + VPSUBW Y13, Y4, Y13 + VPSUBW Y14, Y5, Y14 + MOVL $0xffffd8a1, CX + VMOVD CX, X0 + VPBROADCASTW X0, Y0 + MOVL $0x000005a1, CX + VMOVD CX, X1 + VPBROADCASTW X1, Y1 + VPMULLW Y7, Y0, Y2 + VPMULLW Y8, Y0, Y3 + VPMULLW Y9, Y0, Y4 + VPMULLW Y10, Y0, Y5 + VPMULHW Y7, Y1, Y7 + VPMULHW Y8, Y1, Y8 + VPMULHW Y9, Y1, Y9 + VPMULHW Y10, Y1, Y10 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y7, Y7 + VPSUBW Y3, Y8, Y8 + VPSUBW Y4, Y9, Y9 + VPSUBW Y5, Y10, Y10 + VPMULLW Y11, Y0, Y2 + VPMULLW Y12, Y0, Y3 + VPMULLW Y13, Y0, Y4 + VPMULLW Y14, Y0, Y5 + VPMULHW Y11, Y1, Y11 + VPMULHW Y12, Y1, Y12 + VPMULHW Y13, Y1, Y13 + VPMULHW Y14, Y1, Y14 + VPMULHW Y2, Y15, Y2 + VPMULHW Y3, 
Y15, Y3 + VPMULHW Y4, Y15, Y4 + VPMULHW Y5, Y15, Y5 + VPSUBW Y2, Y11, Y11 + VPSUBW Y3, Y12, Y12 + VPSUBW Y4, Y13, Y13 + VPSUBW Y5, Y14, Y14 + VMOVDQU Y7, 128(AX) + VMOVDQU Y8, 160(AX) + VMOVDQU Y9, 192(AX) + VMOVDQU Y10, 224(AX) + VMOVDQU Y11, 384(AX) + VMOVDQU Y12, 416(AX) + VMOVDQU Y13, 448(AX) + VMOVDQU Y14, 480(AX) + RET + +// func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16) +// Requires: AVX, AVX2 +TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24 + MOVQ p+0(FP), AX + MOVQ a+8(FP), CX + MOVQ b+16(FP), DX + LEAQ ·ZetasAVX2+0(SB), BX + MOVL $0xfffff301, BP + VMOVD BP, X0 + VPBROADCASTW X0, Y14 + MOVL $0x00000d01, BP + VMOVD BP, X0 + VPBROADCASTW X0, Y15 + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU (DX), Y4 + VMOVDQU 32(DX), Y5 + VMOVDQU 64(DX), Y6 + VMOVDQU 96(DX), Y7 + VPMULLW Y1, Y5, Y8 + VPMULLW Y0, Y4, Y9 + VPMULLW Y0, Y5, Y10 + VPMULLW Y1, Y4, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y1, Y5, Y12 + VPMULHW Y0, Y4, Y13 + VPMULHW Y0, Y5, Y0 + VPMULHW Y1, Y4, Y1 + VMOVDQA Y12, Y4 + VMOVDQA Y13, Y5 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y4, Y4 + VPSUBW Y9, Y5, Y5 + VPSUBW Y10, Y0, Y0 + VPSUBW Y11, Y1, Y1 + VMOVDQU 800(BX), Y12 + VMOVDQU 832(BX), Y13 + VPMULLW Y4, Y12, Y8 + VPMULHW Y4, Y13, Y4 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y4, Y4 + VPADDW Y4, Y5, Y4 + VPADDW Y0, Y1, Y5 + VPMULLW Y3, Y7, Y8 + VPMULLW Y2, Y6, Y9 + VPMULLW Y2, Y7, Y10 + VPMULLW Y3, Y6, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y3, Y7, Y12 + VPMULHW Y2, Y6, Y13 + VPMULHW Y2, Y7, Y2 + VPMULHW Y3, Y6, Y3 + VMOVDQA Y12, Y6 + VMOVDQA Y13, Y7 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y6, Y6 + VPSUBW Y9, Y7, Y7 + VPSUBW Y10, Y2, Y2 + VPSUBW Y11, Y3, Y3 + VMOVDQU 800(BX), Y12 + VMOVDQU 832(BX), Y13 + VPMULLW Y6, Y12, Y8 + VPMULHW Y6, Y13, Y6 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y6, Y6 + VPSUBW Y6, Y7, Y6 + VPADDW Y2, Y3, Y7 + VMOVDQU Y4, (AX) + VMOVDQU Y5, 32(AX) + VMOVDQU Y6, 64(AX) + VMOVDQU Y7, 96(AX) + VMOVDQU 128(CX), Y0 + VMOVDQU 160(CX), Y1 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VMOVDQU 128(DX), Y4 + VMOVDQU 160(DX), Y5 + VMOVDQU 192(DX), Y6 + VMOVDQU 224(DX), Y7 + VPMULLW Y1, Y5, Y8 + VPMULLW Y0, Y4, Y9 + VPMULLW Y0, Y5, Y10 + VPMULLW Y1, Y4, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y1, Y5, Y12 + VPMULHW Y0, Y4, Y13 + VPMULHW Y0, Y5, Y0 + VPMULHW Y1, Y4, Y1 + VMOVDQA Y12, Y4 + VMOVDQA Y13, Y5 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y4, Y4 + VPSUBW Y9, Y5, Y5 + VPSUBW Y10, Y0, Y0 + VPSUBW Y11, Y1, Y1 + VMOVDQU 864(BX), Y12 + VMOVDQU 896(BX), Y13 + VPMULLW Y4, Y12, Y8 + VPMULHW Y4, Y13, Y4 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y4, Y4 + VPADDW Y4, Y5, Y4 + VPADDW Y0, Y1, Y5 + VPMULLW Y3, Y7, Y8 + VPMULLW Y2, Y6, Y9 + VPMULLW Y2, Y7, Y10 + VPMULLW Y3, Y6, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y3, Y7, Y12 + VPMULHW Y2, Y6, Y13 + VPMULHW Y2, Y7, Y2 + VPMULHW Y3, Y6, Y3 + VMOVDQA Y12, Y6 + VMOVDQA Y13, Y7 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y6, Y6 + VPSUBW Y9, Y7, Y7 + VPSUBW Y10, Y2, Y2 + VPSUBW Y11, Y3, Y3 + VMOVDQU 864(BX), Y12 + VMOVDQU 896(BX), Y13 + VPMULLW 
Y6, Y12, Y8 + VPMULHW Y6, Y13, Y6 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y6, Y6 + VPSUBW Y6, Y7, Y6 + VPADDW Y2, Y3, Y7 + VMOVDQU Y4, 128(AX) + VMOVDQU Y5, 160(AX) + VMOVDQU Y6, 192(AX) + VMOVDQU Y7, 224(AX) + VMOVDQU 256(CX), Y0 + VMOVDQU 288(CX), Y1 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VMOVDQU 256(DX), Y4 + VMOVDQU 288(DX), Y5 + VMOVDQU 320(DX), Y6 + VMOVDQU 352(DX), Y7 + VPMULLW Y1, Y5, Y8 + VPMULLW Y0, Y4, Y9 + VPMULLW Y0, Y5, Y10 + VPMULLW Y1, Y4, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y1, Y5, Y12 + VPMULHW Y0, Y4, Y13 + VPMULHW Y0, Y5, Y0 + VPMULHW Y1, Y4, Y1 + VMOVDQA Y12, Y4 + VMOVDQA Y13, Y5 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y4, Y4 + VPSUBW Y9, Y5, Y5 + VPSUBW Y10, Y0, Y0 + VPSUBW Y11, Y1, Y1 + VMOVDQU 928(BX), Y12 + VMOVDQU 960(BX), Y13 + VPMULLW Y4, Y12, Y8 + VPMULHW Y4, Y13, Y4 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y4, Y4 + VPADDW Y4, Y5, Y4 + VPADDW Y0, Y1, Y5 + VPMULLW Y3, Y7, Y8 + VPMULLW Y2, Y6, Y9 + VPMULLW Y2, Y7, Y10 + VPMULLW Y3, Y6, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y3, Y7, Y12 + VPMULHW Y2, Y6, Y13 + VPMULHW Y2, Y7, Y2 + VPMULHW Y3, Y6, Y3 + VMOVDQA Y12, Y6 + VMOVDQA Y13, Y7 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y6, Y6 + VPSUBW Y9, Y7, Y7 + VPSUBW Y10, Y2, Y2 + VPSUBW Y11, Y3, Y3 + VMOVDQU 928(BX), Y12 + VMOVDQU 960(BX), Y13 + VPMULLW Y6, Y12, Y8 + VPMULHW Y6, Y13, Y6 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y6, Y6 + VPSUBW Y6, Y7, Y6 + VPADDW Y2, Y3, Y7 + VMOVDQU Y4, 256(AX) + VMOVDQU Y5, 288(AX) + VMOVDQU Y6, 320(AX) + VMOVDQU Y7, 352(AX) + VMOVDQU 384(CX), Y0 + VMOVDQU 416(CX), Y1 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VMOVDQU 384(DX), Y4 + VMOVDQU 416(DX), Y5 + VMOVDQU 448(DX), Y6 + VMOVDQU 480(DX), Y7 + VPMULLW Y1, Y5, Y8 + VPMULLW Y0, Y4, Y9 + VPMULLW Y0, Y5, Y10 + VPMULLW Y1, Y4, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y1, Y5, Y12 + VPMULHW Y0, Y4, Y13 + VPMULHW Y0, Y5, Y0 + VPMULHW Y1, Y4, Y1 + VMOVDQA Y12, Y4 + VMOVDQA Y13, Y5 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y4, Y4 + VPSUBW Y9, Y5, Y5 + VPSUBW Y10, Y0, Y0 + VPSUBW Y11, Y1, Y1 + VMOVDQU 992(BX), Y12 + VMOVDQU 1024(BX), Y13 + VPMULLW Y4, Y12, Y8 + VPMULHW Y4, Y13, Y4 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y4, Y4 + VPADDW Y4, Y5, Y4 + VPADDW Y0, Y1, Y5 + VPMULLW Y3, Y7, Y8 + VPMULLW Y2, Y6, Y9 + VPMULLW Y2, Y7, Y10 + VPMULLW Y3, Y6, Y11 + VPMULLW Y8, Y14, Y8 + VPMULLW Y9, Y14, Y9 + VPMULLW Y10, Y14, Y10 + VPMULLW Y11, Y14, Y11 + VPMULHW Y3, Y7, Y12 + VPMULHW Y2, Y6, Y13 + VPMULHW Y2, Y7, Y2 + VPMULHW Y3, Y6, Y3 + VMOVDQA Y12, Y6 + VMOVDQA Y13, Y7 + VPMULHW Y8, Y15, Y8 + VPMULHW Y9, Y15, Y9 + VPMULHW Y10, Y15, Y10 + VPMULHW Y11, Y15, Y11 + VPSUBW Y8, Y6, Y6 + VPSUBW Y9, Y7, Y7 + VPSUBW Y10, Y2, Y2 + VPSUBW Y11, Y3, Y3 + VMOVDQU 992(BX), Y12 + VMOVDQU 1024(BX), Y13 + VPMULLW Y6, Y12, Y8 + VPMULHW Y6, Y13, Y6 + VPMULHW Y8, Y15, Y8 + VPSUBW Y8, Y6, Y6 + VPSUBW Y6, Y7, Y6 + VPADDW Y2, Y3, Y7 + VMOVDQU Y4, 384(AX) + VMOVDQU Y5, 416(AX) + VMOVDQU Y6, 448(AX) + VMOVDQU Y7, 480(AX) + RET + +// func detangleAVX2(p *[256]int16) +// Requires: AVX, AVX2 +TEXT ·detangleAVX2(SB), NOSPLIT, $0-8 + MOVQ p+0(FP), AX + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + 
VMOVDQU 160(AX), Y5 + VMOVDQU 192(AX), Y6 + VMOVDQU 224(AX), Y7 + VPSLLD $0x10, Y1, Y8 + VPBLENDW $0xaa, Y8, Y0, Y8 + VPSRLD $0x10, Y0, Y0 + VPBLENDW $0xaa, Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPSLLD $0x10, Y3, Y8 + VPBLENDW $0xaa, Y8, Y2, Y8 + VPSRLD $0x10, Y2, Y2 + VPBLENDW $0xaa, Y3, Y2, Y3 + VMOVDQA Y8, Y2 + VPSLLD $0x10, Y5, Y8 + VPBLENDW $0xaa, Y8, Y4, Y8 + VPSRLD $0x10, Y4, Y4 + VPBLENDW $0xaa, Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPSLLD $0x10, Y7, Y8 + VPBLENDW $0xaa, Y8, Y6, Y8 + VPSRLD $0x10, Y6, Y6 + VPBLENDW $0xaa, Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VMOVSLDUP Y2, Y8 + VPBLENDD $0xaa, Y8, Y0, Y8 + VPSRLQ $0x20, Y0, Y0 + VPBLENDD $0xaa, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VMOVSLDUP Y3, Y8 + VPBLENDD $0xaa, Y8, Y1, Y8 + VPSRLQ $0x20, Y1, Y1 + VPBLENDD $0xaa, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VMOVSLDUP Y6, Y8 + VPBLENDD $0xaa, Y8, Y4, Y8 + VPSRLQ $0x20, Y4, Y4 + VPBLENDD $0xaa, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VMOVSLDUP Y7, Y8 + VPBLENDD $0xaa, Y8, Y5, Y8 + VPSRLQ $0x20, Y5, Y5 + VPBLENDD $0xaa, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VPUNPCKLQDQ Y1, Y0, Y8 + VPUNPCKHQDQ Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPUNPCKLQDQ Y3, Y2, Y8 + VPUNPCKHQDQ Y3, Y2, Y3 + VMOVDQA Y8, Y2 + VPUNPCKLQDQ Y5, Y4, Y8 + VPUNPCKHQDQ Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPUNPCKLQDQ Y7, Y6, Y8 + VPUNPCKHQDQ Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VPERM2I128 $0x20, Y2, Y0, Y8 + VPERM2I128 $0x31, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VPERM2I128 $0x20, Y3, Y1, Y8 + VPERM2I128 $0x31, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VPERM2I128 $0x20, Y6, Y4, Y8 + VPERM2I128 $0x31, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VPERM2I128 $0x20, Y7, Y5, Y8 + VPERM2I128 $0x31, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VMOVDQU Y0, (AX) + VMOVDQU Y1, 32(AX) + VMOVDQU Y2, 64(AX) + VMOVDQU Y3, 96(AX) + VMOVDQU Y4, 128(AX) + VMOVDQU Y5, 160(AX) + VMOVDQU Y6, 192(AX) + VMOVDQU Y7, 224(AX) + VMOVDQU 256(AX), Y0 + VMOVDQU 288(AX), Y1 + VMOVDQU 320(AX), Y2 + VMOVDQU 352(AX), Y3 + VMOVDQU 384(AX), Y4 + VMOVDQU 416(AX), Y5 + VMOVDQU 448(AX), Y6 + VMOVDQU 480(AX), Y7 + VPSLLD $0x10, Y1, Y8 + VPBLENDW $0xaa, Y8, Y0, Y8 + VPSRLD $0x10, Y0, Y0 + VPBLENDW $0xaa, Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPSLLD $0x10, Y3, Y8 + VPBLENDW $0xaa, Y8, Y2, Y8 + VPSRLD $0x10, Y2, Y2 + VPBLENDW $0xaa, Y3, Y2, Y3 + VMOVDQA Y8, Y2 + VPSLLD $0x10, Y5, Y8 + VPBLENDW $0xaa, Y8, Y4, Y8 + VPSRLD $0x10, Y4, Y4 + VPBLENDW $0xaa, Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPSLLD $0x10, Y7, Y8 + VPBLENDW $0xaa, Y8, Y6, Y8 + VPSRLD $0x10, Y6, Y6 + VPBLENDW $0xaa, Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VMOVSLDUP Y2, Y8 + VPBLENDD $0xaa, Y8, Y0, Y8 + VPSRLQ $0x20, Y0, Y0 + VPBLENDD $0xaa, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VMOVSLDUP Y3, Y8 + VPBLENDD $0xaa, Y8, Y1, Y8 + VPSRLQ $0x20, Y1, Y1 + VPBLENDD $0xaa, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VMOVSLDUP Y6, Y8 + VPBLENDD $0xaa, Y8, Y4, Y8 + VPSRLQ $0x20, Y4, Y4 + VPBLENDD $0xaa, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VMOVSLDUP Y7, Y8 + VPBLENDD $0xaa, Y8, Y5, Y8 + VPSRLQ $0x20, Y5, Y5 + VPBLENDD $0xaa, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VPUNPCKLQDQ Y1, Y0, Y8 + VPUNPCKHQDQ Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPUNPCKLQDQ Y3, Y2, Y8 + VPUNPCKHQDQ Y3, Y2, Y3 + VMOVDQA Y8, Y2 + VPUNPCKLQDQ Y5, Y4, Y8 + VPUNPCKHQDQ Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPUNPCKLQDQ Y7, Y6, Y8 + VPUNPCKHQDQ Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VPERM2I128 $0x20, Y2, Y0, Y8 + VPERM2I128 $0x31, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VPERM2I128 $0x20, Y3, Y1, Y8 + VPERM2I128 $0x31, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VPERM2I128 $0x20, Y6, Y4, Y8 + VPERM2I128 $0x31, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VPERM2I128 $0x20, Y7, Y5, Y8 + VPERM2I128 $0x31, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VMOVDQU Y0, 256(AX) + VMOVDQU Y1, 288(AX) + VMOVDQU Y2, 320(AX) + 
VMOVDQU Y3, 352(AX) + VMOVDQU Y4, 384(AX) + VMOVDQU Y5, 416(AX) + VMOVDQU Y6, 448(AX) + VMOVDQU Y7, 480(AX) + RET + +// func tangleAVX2(p *[256]int16) +// Requires: AVX, AVX2 +TEXT ·tangleAVX2(SB), NOSPLIT, $0-8 + MOVQ p+0(FP), AX + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VMOVDQU 128(AX), Y4 + VMOVDQU 160(AX), Y5 + VMOVDQU 192(AX), Y6 + VMOVDQU 224(AX), Y7 + VPERM2I128 $0x20, Y2, Y0, Y8 + VPERM2I128 $0x31, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VPERM2I128 $0x20, Y3, Y1, Y8 + VPERM2I128 $0x31, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VPERM2I128 $0x20, Y6, Y4, Y8 + VPERM2I128 $0x31, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VPERM2I128 $0x20, Y7, Y5, Y8 + VPERM2I128 $0x31, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VPUNPCKLQDQ Y1, Y0, Y8 + VPUNPCKHQDQ Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPUNPCKLQDQ Y3, Y2, Y8 + VPUNPCKHQDQ Y3, Y2, Y3 + VMOVDQA Y8, Y2 + VPUNPCKLQDQ Y5, Y4, Y8 + VPUNPCKHQDQ Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPUNPCKLQDQ Y7, Y6, Y8 + VPUNPCKHQDQ Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VMOVSLDUP Y2, Y8 + VPBLENDD $0xaa, Y8, Y0, Y8 + VPSRLQ $0x20, Y0, Y0 + VPBLENDD $0xaa, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VMOVSLDUP Y3, Y8 + VPBLENDD $0xaa, Y8, Y1, Y8 + VPSRLQ $0x20, Y1, Y1 + VPBLENDD $0xaa, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VMOVSLDUP Y6, Y8 + VPBLENDD $0xaa, Y8, Y4, Y8 + VPSRLQ $0x20, Y4, Y4 + VPBLENDD $0xaa, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VMOVSLDUP Y7, Y8 + VPBLENDD $0xaa, Y8, Y5, Y8 + VPSRLQ $0x20, Y5, Y5 + VPBLENDD $0xaa, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VPSLLD $0x10, Y1, Y8 + VPBLENDW $0xaa, Y8, Y0, Y8 + VPSRLD $0x10, Y0, Y0 + VPBLENDW $0xaa, Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPSLLD $0x10, Y3, Y8 + VPBLENDW $0xaa, Y8, Y2, Y8 + VPSRLD $0x10, Y2, Y2 + VPBLENDW $0xaa, Y3, Y2, Y3 + VMOVDQA Y8, Y2 + VPSLLD $0x10, Y5, Y8 + VPBLENDW $0xaa, Y8, Y4, Y8 + VPSRLD $0x10, Y4, Y4 + VPBLENDW $0xaa, Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPSLLD $0x10, Y7, Y8 + VPBLENDW $0xaa, Y8, Y6, Y8 + VPSRLD $0x10, Y6, Y6 + VPBLENDW $0xaa, Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VMOVDQU Y0, (AX) + VMOVDQU Y1, 32(AX) + VMOVDQU Y2, 64(AX) + VMOVDQU Y3, 96(AX) + VMOVDQU Y4, 128(AX) + VMOVDQU Y5, 160(AX) + VMOVDQU Y6, 192(AX) + VMOVDQU Y7, 224(AX) + VMOVDQU 256(AX), Y0 + VMOVDQU 288(AX), Y1 + VMOVDQU 320(AX), Y2 + VMOVDQU 352(AX), Y3 + VMOVDQU 384(AX), Y4 + VMOVDQU 416(AX), Y5 + VMOVDQU 448(AX), Y6 + VMOVDQU 480(AX), Y7 + VPERM2I128 $0x20, Y2, Y0, Y8 + VPERM2I128 $0x31, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VPERM2I128 $0x20, Y3, Y1, Y8 + VPERM2I128 $0x31, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VPERM2I128 $0x20, Y6, Y4, Y8 + VPERM2I128 $0x31, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VPERM2I128 $0x20, Y7, Y5, Y8 + VPERM2I128 $0x31, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VPUNPCKLQDQ Y1, Y0, Y8 + VPUNPCKHQDQ Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPUNPCKLQDQ Y3, Y2, Y8 + VPUNPCKHQDQ Y3, Y2, Y3 + VMOVDQA Y8, Y2 + VPUNPCKLQDQ Y5, Y4, Y8 + VPUNPCKHQDQ Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPUNPCKLQDQ Y7, Y6, Y8 + VPUNPCKHQDQ Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VMOVSLDUP Y2, Y8 + VPBLENDD $0xaa, Y8, Y0, Y8 + VPSRLQ $0x20, Y0, Y0 + VPBLENDD $0xaa, Y2, Y0, Y2 + VMOVDQA Y8, Y0 + VMOVSLDUP Y3, Y8 + VPBLENDD $0xaa, Y8, Y1, Y8 + VPSRLQ $0x20, Y1, Y1 + VPBLENDD $0xaa, Y3, Y1, Y3 + VMOVDQA Y8, Y1 + VMOVSLDUP Y6, Y8 + VPBLENDD $0xaa, Y8, Y4, Y8 + VPSRLQ $0x20, Y4, Y4 + VPBLENDD $0xaa, Y6, Y4, Y6 + VMOVDQA Y8, Y4 + VMOVSLDUP Y7, Y8 + VPBLENDD $0xaa, Y8, Y5, Y8 + VPSRLQ $0x20, Y5, Y5 + VPBLENDD $0xaa, Y7, Y5, Y7 + VMOVDQA Y8, Y5 + VPSLLD $0x10, Y1, Y8 + VPBLENDW $0xaa, Y8, Y0, Y8 + VPSRLD $0x10, Y0, Y0 + VPBLENDW $0xaa, Y1, Y0, Y1 + VMOVDQA Y8, Y0 + VPSLLD $0x10, Y3, Y8 + VPBLENDW $0xaa, Y8, Y2, Y8 + VPSRLD $0x10, Y2, Y2 + VPBLENDW $0xaa, Y3, 
Y2, Y3 + VMOVDQA Y8, Y2 + VPSLLD $0x10, Y5, Y8 + VPBLENDW $0xaa, Y8, Y4, Y8 + VPSRLD $0x10, Y4, Y4 + VPBLENDW $0xaa, Y5, Y4, Y5 + VMOVDQA Y8, Y4 + VPSLLD $0x10, Y7, Y8 + VPBLENDW $0xaa, Y8, Y6, Y8 + VPSRLD $0x10, Y6, Y6 + VPBLENDW $0xaa, Y7, Y6, Y7 + VMOVDQA Y8, Y6 + VMOVDQU Y0, 256(AX) + VMOVDQU Y1, 288(AX) + VMOVDQU Y2, 320(AX) + VMOVDQU Y3, 352(AX) + VMOVDQU Y4, 384(AX) + VMOVDQU Y5, 416(AX) + VMOVDQU Y6, 448(AX) + VMOVDQU Y7, 480(AX) + RET + +// func barrettReduceAVX2(p *[256]int16) +// Requires: AVX, AVX2 +TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8 + MOVQ p+0(FP), AX + MOVL $0x00000d01, CX + VMOVD CX, X0 + VPBROADCASTW X0, Y9 + MOVL $0x00004ebf, CX + VMOVD CX, X0 + VPBROADCASTW X0, Y8 + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VMOVDQU Y0, (AX) + VMOVDQU Y1, 32(AX) + VMOVDQU Y2, 64(AX) + VMOVDQU Y3, 96(AX) + VMOVDQU 128(AX), Y0 + VMOVDQU 160(AX), Y1 + VMOVDQU 192(AX), Y2 + VMOVDQU 224(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VMOVDQU Y0, 128(AX) + VMOVDQU Y1, 160(AX) + VMOVDQU Y2, 192(AX) + VMOVDQU Y3, 224(AX) + VMOVDQU 256(AX), Y0 + VMOVDQU 288(AX), Y1 + VMOVDQU 320(AX), Y2 + VMOVDQU 352(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VMOVDQU Y0, 256(AX) + VMOVDQU Y1, 288(AX) + VMOVDQU Y2, 320(AX) + VMOVDQU Y3, 352(AX) + VMOVDQU 384(AX), Y0 + VMOVDQU 416(AX), Y1 + VMOVDQU 448(AX), Y2 + VMOVDQU 480(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VMOVDQU Y0, 384(AX) + VMOVDQU Y1, 416(AX) + VMOVDQU Y2, 448(AX) + VMOVDQU Y3, 480(AX) + RET + +// func normalizeAVX2(p *[256]int16) +// Requires: AVX, AVX2 +TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8 + MOVQ p+0(FP), AX + MOVL $0x00000d01, CX + VMOVD CX, X0 + VPBROADCASTW X0, Y9 + MOVL $0x00004ebf, CX + VMOVD CX, X0 + VPBROADCASTW X0, Y8 + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y1 + VMOVDQU 64(AX), Y2 + VMOVDQU 96(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VPSUBW Y9, Y0, Y0 + VPSUBW Y9, Y1, Y1 + VPSUBW Y9, Y2, Y2 + VPSUBW Y9, Y3, Y3 + VPSRAW $0x0f, Y0, Y4 + VPSRAW $0x0f, Y1, Y5 + 
VPSRAW $0x0f, Y2, Y6 + VPSRAW $0x0f, Y3, Y7 + VPAND Y4, Y9, Y4 + VPAND Y5, Y9, Y5 + VPAND Y6, Y9, Y6 + VPAND Y7, Y9, Y7 + VPADDW Y0, Y4, Y0 + VPADDW Y1, Y5, Y1 + VPADDW Y2, Y6, Y2 + VPADDW Y3, Y7, Y3 + VMOVDQU Y0, (AX) + VMOVDQU Y1, 32(AX) + VMOVDQU Y2, 64(AX) + VMOVDQU Y3, 96(AX) + VMOVDQU 128(AX), Y0 + VMOVDQU 160(AX), Y1 + VMOVDQU 192(AX), Y2 + VMOVDQU 224(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VPSUBW Y9, Y0, Y0 + VPSUBW Y9, Y1, Y1 + VPSUBW Y9, Y2, Y2 + VPSUBW Y9, Y3, Y3 + VPSRAW $0x0f, Y0, Y4 + VPSRAW $0x0f, Y1, Y5 + VPSRAW $0x0f, Y2, Y6 + VPSRAW $0x0f, Y3, Y7 + VPAND Y4, Y9, Y4 + VPAND Y5, Y9, Y5 + VPAND Y6, Y9, Y6 + VPAND Y7, Y9, Y7 + VPADDW Y0, Y4, Y0 + VPADDW Y1, Y5, Y1 + VPADDW Y2, Y6, Y2 + VPADDW Y3, Y7, Y3 + VMOVDQU Y0, 128(AX) + VMOVDQU Y1, 160(AX) + VMOVDQU Y2, 192(AX) + VMOVDQU Y3, 224(AX) + VMOVDQU 256(AX), Y0 + VMOVDQU 288(AX), Y1 + VMOVDQU 320(AX), Y2 + VMOVDQU 352(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VPSUBW Y9, Y0, Y0 + VPSUBW Y9, Y1, Y1 + VPSUBW Y9, Y2, Y2 + VPSUBW Y9, Y3, Y3 + VPSRAW $0x0f, Y0, Y4 + VPSRAW $0x0f, Y1, Y5 + VPSRAW $0x0f, Y2, Y6 + VPSRAW $0x0f, Y3, Y7 + VPAND Y4, Y9, Y4 + VPAND Y5, Y9, Y5 + VPAND Y6, Y9, Y6 + VPAND Y7, Y9, Y7 + VPADDW Y0, Y4, Y0 + VPADDW Y1, Y5, Y1 + VPADDW Y2, Y6, Y2 + VPADDW Y3, Y7, Y3 + VMOVDQU Y0, 256(AX) + VMOVDQU Y1, 288(AX) + VMOVDQU Y2, 320(AX) + VMOVDQU Y3, 352(AX) + VMOVDQU 384(AX), Y0 + VMOVDQU 416(AX), Y1 + VMOVDQU 448(AX), Y2 + VMOVDQU 480(AX), Y3 + VPMULHW Y8, Y0, Y4 + VPMULHW Y8, Y1, Y5 + VPMULHW Y8, Y2, Y6 + VPMULHW Y8, Y3, Y7 + VPSRAW $0x0a, Y4, Y4 + VPSRAW $0x0a, Y5, Y5 + VPSRAW $0x0a, Y6, Y6 + VPSRAW $0x0a, Y7, Y7 + VPMULLW Y9, Y4, Y4 + VPMULLW Y9, Y5, Y5 + VPMULLW Y9, Y6, Y6 + VPMULLW Y9, Y7, Y7 + VPSUBW Y4, Y0, Y0 + VPSUBW Y5, Y1, Y1 + VPSUBW Y6, Y2, Y2 + VPSUBW Y7, Y3, Y3 + VPSUBW Y9, Y0, Y0 + VPSUBW Y9, Y1, Y1 + VPSUBW Y9, Y2, Y2 + VPSUBW Y9, Y3, Y3 + VPSRAW $0x0f, Y0, Y4 + VPSRAW $0x0f, Y1, Y5 + VPSRAW $0x0f, Y2, Y6 + VPSRAW $0x0f, Y3, Y7 + VPAND Y4, Y9, Y4 + VPAND Y5, Y9, Y5 + VPAND Y6, Y9, Y6 + VPAND Y7, Y9, Y7 + VPADDW Y0, Y4, Y0 + VPADDW Y1, Y5, Y1 + VPADDW Y2, Y6, Y2 + VPADDW Y3, Y7, Y3 + VMOVDQU Y0, 384(AX) + VMOVDQU Y1, 416(AX) + VMOVDQU Y2, 448(AX) + VMOVDQU Y3, 480(AX) + RET diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/field.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/field.go new file mode 100644 index 00000000000..33744dff73f --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/field.go @@ -0,0 +1,74 @@ +package common + +// Given -2¹⁵ q ≤ x < 2¹⁵ q, returns -q < y < q with x 2⁻¹⁶ = y (mod q). +func montReduce(x int32) int16 { + // This is Montgomery reduction with R=2¹⁶. + // + // Note gcd(2¹⁶, q) = 1 as q is prime. Write q' := 62209 = q⁻¹ mod R. 
+ // First we compute + // + // m := ((x mod R) q') mod R + // = x q' mod R + // = int16(x q') + // = int16(int32(x) * int32(q')) + // + // Note that x q' might be as big as 2³² and could overflow the int32 + // multiplication in the last line. However for any int32s a and b, + // we have int32(int64(a)*int64(b)) = int32(a*b) and so the result is ok. + m := int16(x * 62209) + + // Note that x - m q is divisible by R; indeed modulo R we have + // + // x - m q ≡ x - x q' q ≡ x - x q⁻¹ q ≡ x - x = 0. + // + // We return y := (x - m q) / R. Note that y is indeed correct as + // modulo q we have + // + // y ≡ x R⁻¹ - m q R⁻¹ = x R⁻¹ + // + // and as both -2¹⁵ q ≤ m q, x < 2¹⁵ q, we have + // -2¹⁶ q ≤ x - m q < 2¹⁶ q and so -q ≤ (x - m q) / R < q as desired. + return int16(uint32(x-int32(m)*int32(Q)) >> 16) +} + +// Given any x, returns x R mod q where R=2¹⁶. +func toMont(x int16) int16 { + // Note |1353 x| ≤ 1353 2¹⁵ ≤ 13318 q ≤ 2¹⁵ q and so we're within + // the bounds of montReduce. + return montReduce(int32(x) * 1353) // 1353 = R² mod q. +} + +// Given any x, compute 0 ≤ y ≤ q with x = y (mod q). +// +// Beware: we might have barrettReduce(x) = q ≠ 0 for some x. In fact, +// this happens if and only if x = -nq for some positive integer n. +func barrettReduce(x int16) int16 { + // This is standard Barrett reduction. + // + // For any x we have x mod q = x - ⌊x/q⌋ q. We will use 20159/2²⁶ as + // an approximation of 1/q. Note that 0 ≤ 20159/2²⁶ - 1/q ≤ 0.135/2²⁶ + // and so | x 20159/2²⁶ - x/q | ≤ 2⁻¹⁰ for |x| ≤ 2¹⁶. For all x + // not a multiple of q, the number x/q is further than 1/q from any integer + // and so ⌊x 20159/2²⁶⌋ = ⌊x/q⌋. If x is a multiple of q and x is positive, + // then x 20159/2²⁶ is larger than x/q so ⌊x 20159/2²⁶⌋ = ⌊x/q⌋ as well. + // Finally, if x is a negative multiple of q, then ⌊x 20159/2²⁶⌋ = ⌊x/q⌋-1. + // Thus + // [ q if x=-nq for pos. integer n + // x - ⌊x 20159/2²⁶⌋ q = [ + // [ x mod q otherwise + // + // To actually compute this, note that + // + // ⌊x 20159/2²⁶⌋ = (20159 x) >> 26. + return x - int16((int32(x)*20159)>>26)*Q +} + +// Returns x if x < q and x - q otherwise. Assumes x ≥ -29439. +func csubq(x int16) int16 { + x -= Q // no overflow due to assumption x ≥ -29439. + // If x is positive, then x >> 15 = 0. If x is negative, + // then uint16(x >> 15) = 2¹⁶-1. So this will add back in q + // if x was smaller than q. + x += (x >> 15) & Q + return x +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/generic.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/generic.go new file mode 100644 index 00000000000..2b742b95a38 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/generic.go @@ -0,0 +1,77 @@ +//go:build !amd64 +// +build !amd64 + +package common + +// Sets p to a + b. Does not normalize coefficients. +func (p *Poly) Add(a, b *Poly) { + p.addGeneric(a, b) +} + +// Sets p to a - b. Does not normalize coefficients. +func (p *Poly) Sub(a, b *Poly) { + p.subGeneric(a, b) +} + +// Executes an in-place forward "NTT" on p. +// +// Assumes the coefficients are in absolute value ≤q. The resulting +// coefficients are in absolute value ≤7q. If the input is in Montgomery +// form, then the result is in Montgomery form and so (by linearity of the NTT) +// if the input is in regular form, then the result is also in regular form. +// The order of coefficients will be "tangled". These can be put back into +// their proper order by calling Detangle().
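The constants in field.go interlock: 62209 = q⁻¹ mod 2¹⁶, 1353 = R² mod q, and 20159 ≈ 2²⁶/q (the same values the AVX2 routines broadcast, e.g. 0x4ebf = 20159 and 0x0d01 = q). The following standalone program re-implements the three scalar reductions and brute-forces their contracts; it is an illustrative sketch, not part of the patch.

package main

import "fmt"

const q = 3329

// montReduce, barrettReduce and csubq, copied from field.go above.
func montReduce(x int32) int16 {
	m := int16(x * 62209)
	return int16(uint32(x-int32(m)*q) >> 16)
}

func barrettReduce(x int16) int16 {
	return x - int16((int32(x)*20159)>>26)*q
}

func csubq(x int16) int16 {
	x -= q
	x += (x >> 15) & q
	return x
}

func main() {
	qv, qInv := uint16(3329), uint16(62209)
	fmt.Println(qv*qInv == 1) // q·q' ≡ 1 (mod 2¹⁶)

	// montReduce: for -2¹⁵q ≤ x < 2¹⁵q the result y satisfies
	// y·2¹⁶ ≡ x (mod q) and -q < y < q.
	for x := int32(-(1 << 15) * q); x < (1<<15)*q; x++ {
		y := int64(montReduce(x))
		if y <= -q || y >= q || (y<<16-int64(x))%q != 0 {
			panic("montReduce")
		}
	}

	// barrettReduce: 0 ≤ y ≤ q with y ≡ x (mod q), for every int16 x;
	// csubq then brings the possibly-q value into {0, …, q-1}.
	for x := -32768; x < 32768; x++ {
		y := barrettReduce(int16(x))
		if y < 0 || y > q || (int(y)-x)%q != 0 {
			panic("barrettReduce")
		}
		if z := csubq(y); z < 0 || z > q-1 || (int(z)-x)%q != 0 {
			panic("csubq")
		}
	}
	fmt.Println("ok")
}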
+func (p *Poly) NTT() { + p.nttGeneric() +} + +// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery +// factor R. +// +// Requires coefficients to be in "tangled" order, see Tangle(). +// Assumes the coefficients are in absolute value ≤q. The resulting +// coefficients are in absolute value ≤q. If the input is in Montgomery +// form, then the result is in Montgomery form and so (by linearity) +// if the input is in regular form, then the result is also in regular form. +func (p *Poly) InvNTT() { + p.invNTTGeneric() +} + +// Sets p to the "pointwise" multiplication of a and b. +// +// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in +// Montgomery form. Products between coefficients of a and b must be strictly +// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and +// bounded in absolute value by 2q. +// +// Requires a and b to be in "tangled" order, see Tangle(). p will be in +// tangled order as well. +func (p *Poly) MulHat(a, b *Poly) { + p.mulHatGeneric(a, b) +} + +// Puts p into the right form to be used with (among others) InvNTT(). +func (p *Poly) Tangle() { + // In the generic implementation there is no advantage to using a + // different order, so we use the standard order everywhere. +} + +// Puts p back into standard form. +func (p *Poly) Detangle() { + // In the generic implementation there is no advantage to using a + // different order, so we use the standard order everywhere. +} + +// Almost normalizes coefficients. +// +// Ensures each coefficient is in {0, …, q}. +func (p *Poly) BarrettReduce() { + p.barrettReduceGeneric() +} + +// Normalizes coefficients. +// +// Ensures each coefficient is in {0, …, q-1}. +func (p *Poly) Normalize() { + p.normalizeGeneric() +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/ntt.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/ntt.go new file mode 100644 index 00000000000..c1abaf237f1 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/ntt.go @@ -0,0 +1,193 @@ +package common + +// Zetas lists precomputed powers of the primitive root of unity in +// Montgomery representation used for the NTT: +// +// Zetas[i] = ζᵇʳᵛ⁽ⁱ⁾ R mod q +// +// where ζ = 17, brv(i) is the bitreversal of a 7-bit number and R=2¹⁶ mod q. +// +// The following Python code generates the Zetas array: +// +// q = 13*2**8 + 1; zeta = 17 +// R = 2**16 % q # Montgomery const. +// def brv(x): return int(''.join(reversed(bin(x)[2:].zfill(7))),2) +// print([(pow(zeta, brv(i), q)*R)%q for i in range(128)]) +var Zetas = [128]int16{ + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, + 962, 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, + 2648, 1017, 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, + 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, + 1469, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, + 2604, 448, 2264, 677, 2054, 2226, 430, 555, 843, 2078, 871, 1550, + 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, 3083, 778, 1159, + 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, + 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, + 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, + 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628, +} + +// InvNTTReductions keeps track of which coefficients to apply Barrett +// reduction to in Poly.InvNTT().
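For reference, the Python generator in the Zetas comment above ports directly to Go. This standalone sketch re-derives the table (illustrative only):

package main

import "fmt"

const q = 3329 // 13·2⁸ + 1

// brv reverses the bits of a 7-bit number.
func brv(x int) int {
	r := 0
	for i := 0; i < 7; i++ {
		r = r<<1 | x>>i&1
	}
	return r
}

func main() {
	const zeta = 17
	const R = (1 << 16) % q // Montgomery constant R mod q = 2285

	var zetas [128]int16
	for i := range zetas {
		p := 1
		for j := 0; j < brv(i); j++ {
			p = p * zeta % q // ζᵇʳᵛ⁽ⁱ⁾ mod q by repeated multiplication
		}
		zetas[i] = int16(p * R % q)
	}
	fmt.Println(zetas[:4]) // [2285 2571 2970 1812], matching Zetas above
}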
+// +// Generated lazily: once a butterfly is computed which is about to +// overflow the int16, the largest coefficient is reduced. If that is +// not enough, the other coefficient is reduced as well. +// +// This is actually optimal, as proven in https://eprint.iacr.org/2020/1377.pdf +var InvNTTReductions = [...]int{ + -1, // after layer 1 + -1, // after layer 2 + 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, + 241, -1, // after layer 3 + 0, 1, 32, 33, 34, 35, 64, 65, 96, 97, 98, 99, 128, 129, 160, 161, 162, 163, + 192, 193, 224, 225, 226, 227, -1, // after layer 4 + 2, 3, 66, 67, 68, 69, 70, 71, 130, 131, 194, 195, 196, 197, 198, + 199, -1, // after layer 5 + 4, 5, 6, 7, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, -1, // after layer 6 + -1, // after layer 7 +} + +// Executes an in-place forward "NTT" on p. +// +// Assumes the coefficients are in absolute value ≤q. The resulting +// coefficients are in absolute value ≤7q. If the input is in Montgomery +// form, then the result is in Montgomery form and so (by linearity of the NTT) +// if the input is in regular form, then the result is also in regular form. +// The order of coefficients will be "tangled". These can be put back into +// their proper order by calling Detangle(). +func (p *Poly) nttGeneric() { + // Note that ℤ_q does not have a primitive 512ᵗʰ root of unity (as 512 + // does not divide into q-1) and so we cannot do a regular NTT. ℤ_q + // does have a primitive 256ᵗʰ root of unity, the smallest of which + // is ζ := 17. + // + // Recall that our base ring R := ℤ_q[x] / (x²⁵⁶ + 1). The polynomial + // x²⁵⁶+1 will not split completely (as its roots would be 512ᵗʰ roots + // of unity.) However, it does split almost (using ζ¹²⁸ = -1): + // + // x²⁵⁶ + 1 = (x²)¹²⁸ - ζ¹²⁸ + // = ((x²)⁶⁴ - ζ⁶⁴)((x²)⁶⁴ + ζ⁶⁴) + // = ((x²)³² - ζ³²)((x²)³² + ζ³²)((x²)³² - ζ⁹⁶)((x²)³² + ζ⁹⁶) + // ⋮ + // = (x² - ζ)(x² + ζ)(x² - ζ⁶⁵)(x² + ζ⁶⁵) … (x² + ζ¹²⁷) + // + // Note that the powers of ζ that appear (from the second line down) are + // in binary + // + // 0100000 1100000 + // 0010000 1010000 0110000 1110000 + // 0001000 1001000 0101000 1101000 0011000 1011000 0111000 1111000 + // … + // + // That is: brv(2), brv(3), brv(4), …, where brv(x) denotes the 7-bit + // bitreversal of x. These powers of ζ are given by the Zetas array. + // + // The polynomials x² ± ζⁱ are irreducible and coprime, hence by + // the Chinese Remainder Theorem we know + // + // ℤ_q[x]/(x²⁵⁶+1) → ℤ_q[x]/(x²-ζ) x … x ℤ_q[x]/(x²+ζ¹²⁷) + // + // given by a ↦ ( a mod x²-ζ, …, a mod x²+ζ¹²⁷ ) + // is an isomorphism, which is the "NTT". It can be efficiently computed by + // + // + // a ↦ ( a mod (x²)⁶⁴ - ζ⁶⁴, a mod (x²)⁶⁴ + ζ⁶⁴ ) + // ↦ ( a mod (x²)³² - ζ³², a mod (x²)³² + ζ³², + // a mod (x²)³² - ζ⁹⁶, a mod (x²)³² + ζ⁹⁶ ) + // + // et cetera + // + // If N were 8, then this could be pictured in the following diagram: + // + // https://cnx.org/resources/17ee4dfe517a6adda05377b25a00bf6e6c93c334/File0026.png + // + // Each cross is a Cooley-Tukey butterfly: it's the map + // + // (a, b) ↦ (a + ζb, a - ζb) + // + // for the appropriate power ζ for that column and row group. + + k := 0 // Index into Zetas + + // l runs effectively over the columns in the diagram above; it is half the + // height of a row group, i.e. the number of butterflies in each row group. + // In the diagram above it would be 4, 2, 1. + for l := N / 2; l > 1; l >>= 1 { + // On the nᵗʰ iteration of the l-loop, the absolute values of the + // coefficients are bounded by nq.
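The two butterflies used by nttGeneric and invNTTGeneric can be checked in isolation: the Cooley-Tukey map (a, b) ↦ (a+ζb, a−ζb) composed with the Gentleman-Sande map (A, B) ↦ (A+B, ζ⁻¹(A−B)) returns (2a, 2b), and those factors of two are what InvNTT later divides out as 2⁷ across the seven layers. A minimal sketch in plain modular arithmetic (illustrative, not part of the patch):

package main

import "fmt"

const q = 3329

// modpow computes b^e mod m by square-and-multiply.
func modpow(b, e, m int) int {
	r := 1
	for ; e > 0; e >>= 1 {
		if e&1 == 1 {
			r = r * b % m
		}
		b = b * b % m
	}
	return r
}

func main() {
	a, b, zeta := 1000, 2000, 17
	zetaInv := modpow(zeta, q-2, q) // ζ⁻¹ by Fermat's little theorem

	// Cooley-Tukey butterfly (one forward NTT layer).
	A := (a + zeta*b) % q
	B := ((a-zeta*b)%q + q) % q

	// Gentleman-Sande butterfly (one inverse layer), with the ÷2 deferred.
	fmt.Println((A+B)%q == 2*a%q)               // true
	fmt.Println(zetaInv*((A-B+q)%q)%q == 2*b%q) // true
}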
+ + // offset effectively loops over the row groups in this column; it is + // the first row in the row group. + for offset := 0; offset < N-l; offset += 2 * l { + k++ + zeta := int32(Zetas[k]) + + // j loops over each butterfly in the row group. + for j := offset; j < offset+l; j++ { + t := montReduce(zeta * int32(p[j+l])) + p[j+l] = p[j] - t + p[j] += t + } + } + } +} + +// Executes an in-place inverse "NTT" on p and multiplies by the Montgomery +// factor R. +// +// Requires coefficients to be in "tangled" order, see Tangle(). +// Assumes the coefficients are in absolute value ≤q. The resulting +// coefficients are in absolute value ≤q. If the input is in Montgomery +// form, then the result is in Montgomery form and so (by linearity) +// if the input is in regular form, then the result is also in regular form. +func (p *Poly) invNTTGeneric() { + k := 127 // Index into Zetas + r := -1 // Index into InvNTTReductions. + + // We basically do the opposite of NTT, but postpone dividing by 2 in the + // inverse of the Cooley-Tukey butterfly and accumulate that into a big + // division by 2⁷ at the end. See the comments in the NTT() function. + + for l := 2; l < N; l <<= 1 { + for offset := 0; offset < N-l; offset += 2 * l { + // As we're inverting, we need powers of ζ⁻¹ (instead of ζ). + // To be precise, we need ζᵇʳᵛ⁽ᵏ⁾⁻¹²⁸. However, as ζ⁻¹²⁸ = -1, + // we can use the existing Zetas table instead of + // keeping a separate InvZetas table as in Dilithium. + + minZeta := int32(Zetas[k]) + k-- + + for j := offset; j < offset+l; j++ { + // Gentleman-Sande butterfly: (a, b) ↦ (a + b, ζ(a-b)) + t := p[j+l] - p[j] + p[j] += p[j+l] + p[j+l] = montReduce(minZeta * int32(t)) + + // Note that if we had |a| < αq and |b| < βq before the + // butterfly, then now we have |a| < (α+β)q and |b| < q. + } + } + + // We let the InvNTTReductions instruct us which coefficients to + // Barrett reduce. See TestInvNTTReductions, which tests whether + // there is an overflow. + for { + r++ + i := InvNTTReductions[r] + if i < 0 { + break + } + p[i] = barrettReduce(p[i]) + } + } + + for j := 0; j < N; j++ { + // Note 1441 = (128)⁻¹ R². The coefficients are bounded by 9q, so + // as 1441 * 9 ≈ 2¹⁴ < 2¹⁵, we're within the required bounds + // for montReduce(). + p[j] = montReduce(1441 * int32(p[j])) + } +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/params.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/params.go new file mode 100644 index 00000000000..f04d1aaa32c --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/params.go @@ -0,0 +1,22 @@ +package common + +import ( + "github.com/cloudflare/circl/pke/kyber/internal/common/params" +) + +const ( + // Q is the parameter q ≡ 3329 = 2¹¹ + 2¹⁰ + 2⁸ + 1. + Q = params.Q + + // N is the parameter N: the length of the polynomials + N = params.N + + // PolySize is the size of a packed polynomial.
+ PolySize = params.PolySize + + // PlaintextSize is the size of the plaintext + PlaintextSize = params.PlaintextSize + + // Eta2 is the parameter η₂ + Eta2 = params.Eta2 +) diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/params/params.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/params/params.go new file mode 100644 index 00000000000..dee58ee99e6 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/params/params.go @@ -0,0 +1,21 @@ +package params + +// We put these parameters in a separate package so that the Go code, +// such as asm/src.go, that generates assembler can import it. + +const ( + // Q is the parameter q ≡ 3329 = 2¹¹ + 2¹⁰ + 2⁸ + 1. + Q int16 = 3329 + + // N is the parameter N: the length of the polynomials + N = 256 + + // PolySize is the size of a packed polynomial. + PolySize = 384 + + // PlaintextSize is the size of the plaintext + PlaintextSize = 32 + + // Eta2 is the parameter η₂ + Eta2 = 2 +) diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/poly.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/poly.go new file mode 100644 index 00000000000..f6842152bbd --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/poly.go @@ -0,0 +1,324 @@ +package common + +// An element of our base ring R, whose elements are polynomials over ℤ_q +// modulo the equation Xᴺ = -1, where q=3329 and N=256. +// +// This type is also used to store NTT-transformed polynomials, +// see Poly.NTT(). +// +// Coefficients aren't always reduced. See Normalize(). +type Poly [N]int16 + +// Sets p to a + b. Does not normalize coefficients. +func (p *Poly) addGeneric(a, b *Poly) { + for i := 0; i < N; i++ { + p[i] = a[i] + b[i] + } +} + +// Sets p to a - b. Does not normalize coefficients. +func (p *Poly) subGeneric(a, b *Poly) { + for i := 0; i < N; i++ { + p[i] = a[i] - b[i] + } +} + +// Almost normalizes coefficients. +// +// Ensures each coefficient is in {0, …, q}. +func (p *Poly) barrettReduceGeneric() { + for i := 0; i < N; i++ { + p[i] = barrettReduce(p[i]) + } +} + +// Normalizes coefficients. +// +// Ensures each coefficient is in {0, …, q-1}. +func (p *Poly) normalizeGeneric() { + for i := 0; i < N; i++ { + p[i] = csubq(barrettReduce(p[i])) + } +} + +// Multiplies p in-place by the Montgomery factor 2¹⁶. +// +// Coefficients of p can be arbitrary. Resulting coefficients are bounded +// in absolute value by q. +func (p *Poly) ToMont() { + for i := 0; i < N; i++ { + p[i] = toMont(p[i]) + } +} + +// Sets p to the "pointwise" multiplication of a and b. +// +// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in +// Montgomery form. Products between coefficients of a and b must be strictly +// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and +// bounded in absolute value by 2q. +// +// Requires a and b to be in "tangled" order, see Tangle(). p will be in +// tangled order as well. +func (p *Poly) mulHatGeneric(a, b *Poly) { + // Recall from the discussion in NTT(), that a transformed polynomial is + // an element of ℤ_q[x]/(x²-ζ) x … x ℤ_q[x]/(x²+ζ¹²⁷); + // that is: 128 degree-one polynomials instead of simply 256 elements + // from ℤ_q as in the regular NTT. So instead of pointwise multiplication, + // we multiply the 128 pairs of degree-one polynomials modulo the + // right equation: + // + // (a₁ + a₂x)(b₁ + b₂x) = a₁b₁ + a₂b₂ζ' + (a₁b₂ + a₂b₁)x, + // + // where ζ' is the appropriate power of ζ.
+ + k := 64 + for i := 0; i < N; i += 4 { + zeta := int32(Zetas[k]) + k++ + + p0 := montReduce(int32(a[i+1]) * int32(b[i+1])) + p0 = montReduce(int32(p0) * zeta) + p0 += montReduce(int32(a[i]) * int32(b[i])) + + p1 := montReduce(int32(a[i]) * int32(b[i+1])) + p1 += montReduce(int32(a[i+1]) * int32(b[i])) + + p[i] = p0 + p[i+1] = p1 + + p2 := montReduce(int32(a[i+3]) * int32(b[i+3])) + p2 = -montReduce(int32(p2) * zeta) + p2 += montReduce(int32(a[i+2]) * int32(b[i+2])) + + p3 := montReduce(int32(a[i+2]) * int32(b[i+3])) + p3 += montReduce(int32(a[i+3]) * int32(b[i+2])) + + p[i+2] = p2 + p[i+3] = p3 + } +} + +// Packs p into buf. buf should be of length PolySize. +// +// Assumes p is normalized (and not just Barrett reduced) and "tangled", +// see Tangle(). +func (p *Poly) Pack(buf []byte) { + q := *p + q.Detangle() + for i := 0; i < 128; i++ { + t0 := q[2*i] + t1 := q[2*i+1] + buf[3*i] = byte(t0) + buf[3*i+1] = byte(t0>>8) | byte(t1<<4) + buf[3*i+2] = byte(t1 >> 4) + } +} + +// Unpacks p from buf. +// +// buf should be of length PolySize. p will be "tangled", see Detangle(). +// +// p will not be normalized; instead 0 ≤ p[i] < 4096. +func (p *Poly) Unpack(buf []byte) { + for i := 0; i < 128; i++ { + p[2*i] = int16(buf[3*i]) | ((int16(buf[3*i+1]) << 8) & 0xfff) + p[2*i+1] = int16(buf[3*i+1]>>4) | (int16(buf[3*i+2]) << 4) + } + p.Tangle() +} + +// Set p to Decompress_q(m, 1). +// +// p will be normalized. m has to be of PlaintextSize. +func (p *Poly) DecompressMessage(m []byte) { + // Decompress_q(x, 1) = ⌈xq/2⌋ = ⌊xq/2+½⌋ = (xq+1) >> 1 and so + // Decompress_q(0, 1) = 0 and Decompress_q(1, 1) = (q+1)/2. + for i := 0; i < 32; i++ { + for j := 0; j < 8; j++ { + bit := (m[i] >> uint(j)) & 1 + + // Set coefficient to either 0 or (q+1)/2 depending on the bit. + p[8*i+j] = -int16(bit) & ((Q + 1) / 2) + } + } +} + +// Writes Compress_q(p, 1) to m. +// +// Assumes p is normalized. m has to be of length at least PlaintextSize. +func (p *Poly) CompressMessageTo(m []byte) { + // Compress_q(x, 1) is 1 on {833, …, 2496} and zero elsewhere. + for i := 0; i < 32; i++ { + m[i] = 0 + for j := 0; j < 8; j++ { + x := 1664 - p[8*i+j] + // With the previous substitution, we want to return 1 if + // and only if x is in {-832, …, 831}. + x = (x >> 15) ^ x + // Note (x >> 15) ^ x = x if x ≥ 0 and -x-1 otherwise. Thus now + // we want to return 1 iff x ≤ 831, ie. x - 832 < 0. + x -= 832 + m[i] |= ((byte(x >> 15)) & 1) << uint(j) + } + } +} + +// Set p to Decompress_q(m, d). +// +// Assumes d is in {3, 4, 5, 10, 11}. p will be normalized.
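Pack and Unpack above marshal two 12-bit coefficients into three bytes; the bit layout is easiest to see in isolation. A standalone round-trip sketch (illustrative, not part of the patch):

package main

import "fmt"

func main() {
	// Two 12-bit coefficients, packed into 3 bytes exactly as in Pack().
	t0, t1 := uint16(0xabc), uint16(0x123)
	var buf [3]byte
	buf[0] = byte(t0)
	buf[1] = byte(t0>>8) | byte(t1<<4)
	buf[2] = byte(t1 >> 4)

	// And unpacked again, as in Unpack().
	u0 := uint16(buf[0]) | uint16(buf[1])<<8&0xfff
	u1 := uint16(buf[1]>>4) | uint16(buf[2])<<4
	fmt.Printf("%#x %#x\n", u0, u1) // 0xabc 0x123
}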
+func (p *Poly) Decompress(m []byte, d int) { + // Decompress_q(x, d) = ⌈(q/2ᵈ)x⌋ + // = ⌊(q/2ᵈ)x+½⌋ + // = ⌊(qx + 2ᵈ⁻¹)/2ᵈ⌋ + // = (qx + (1<<(d-1))) >> d + switch d { + case 4: + for i := 0; i < N/2; i++ { + p[2*i] = int16(((1 << 3) + + uint32(m[i]&15)*uint32(Q)) >> 4) + p[2*i+1] = int16(((1 << 3) + + uint32(m[i]>>4)*uint32(Q)) >> 4) + } + case 5: + var t [8]uint16 + idx := 0 + for i := 0; i < N/8; i++ { + t[0] = uint16(m[idx]) + t[1] = (uint16(m[idx]) >> 5) | (uint16(m[idx+1] << 3)) + t[2] = uint16(m[idx+1]) >> 2 + t[3] = (uint16(m[idx+1]) >> 7) | (uint16(m[idx+2] << 1)) + t[4] = (uint16(m[idx+2]) >> 4) | (uint16(m[idx+3] << 4)) + t[5] = uint16(m[idx+3]) >> 1 + t[6] = (uint16(m[idx+3]) >> 6) | (uint16(m[idx+4] << 2)) + t[7] = uint16(m[idx+4]) >> 3 + + for j := 0; j < 8; j++ { + p[8*i+j] = int16(((1 << 4) + + uint32(t[j]&((1<<5)-1))*uint32(Q)) >> 5) + } + + idx += 5 + } + + case 10: + var t [4]uint16 + idx := 0 + for i := 0; i < N/4; i++ { + t[0] = uint16(m[idx]) | (uint16(m[idx+1]) << 8) + t[1] = (uint16(m[idx+1]) >> 2) | (uint16(m[idx+2]) << 6) + t[2] = (uint16(m[idx+2]) >> 4) | (uint16(m[idx+3]) << 4) + t[3] = (uint16(m[idx+3]) >> 6) | (uint16(m[idx+4]) << 2) + + for j := 0; j < 4; j++ { + p[4*i+j] = int16(((1 << 9) + + uint32(t[j]&((1<<10)-1))*uint32(Q)) >> 10) + } + + idx += 5 + } + case 11: + var t [8]uint16 + idx := 0 + for i := 0; i < N/8; i++ { + t[0] = uint16(m[idx]) | (uint16(m[idx+1]) << 8) + t[1] = (uint16(m[idx+1]) >> 3) | (uint16(m[idx+2]) << 5) + t[2] = (uint16(m[idx+2]) >> 6) | (uint16(m[idx+3]) << 2) | (uint16(m[idx+4]) << 10) + t[3] = (uint16(m[idx+4]) >> 1) | (uint16(m[idx+5]) << 7) + t[4] = (uint16(m[idx+5]) >> 4) | (uint16(m[idx+6]) << 4) + t[5] = (uint16(m[idx+6]) >> 7) | (uint16(m[idx+7]) << 1) | (uint16(m[idx+8]) << 9) + t[6] = (uint16(m[idx+8]) >> 2) | (uint16(m[idx+9]) << 6) + t[7] = (uint16(m[idx+9]) >> 5) | (uint16(m[idx+10]) << 3) + + for j := 0; j < 8; j++ { + p[8*i+j] = int16(((1 << 10) + + uint32(t[j]&((1<<11)-1))*uint32(Q)) >> 11) + } + + idx += 11 + } + default: + panic("unsupported d") + } +} + +// Writes Compress_q(p, d) to m. +// +// Assumes p is normalized and d is in {3, 4, 5, 10, 11}. 
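Both directions of the d-bit compression implemented by the Decompress cases above and CompressTo below are simple scalar maps, and the worst-case round-trip error can be checked exhaustively. A standalone sketch for d=10 (illustrative, not part of the patch):

package main

import "fmt"

const q = 3329

// Scalar versions of the coefficient-wise maps.
func compress(x, d int) int   { return ((x<<d + q/2) / q) & (1<<d - 1) }
func decompress(y, d int) int { return (y*q + 1<<(d-1)) >> d }

func main() {
	worst := 0
	for x := 0; x < q; x++ {
		e := decompress(compress(x, 10), 10) - x
		if e < 0 {
			e = -e
		}
		if q-e < e { // distance is taken mod q, since compress wraps near q
			e = q - e
		}
		if e > worst {
			worst = e
		}
	}
	fmt.Println(worst) // 2, i.e. roughly q/2¹¹ rounded up
}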
+func (p *Poly) CompressTo(m []byte, d int) { + // Compress_q(x, d) = ⌈(2ᵈ/q)x⌋ mod⁺ 2ᵈ + // = ⌊(2ᵈ/q)x+½⌋ mod⁺ 2ᵈ + // = ⌊((x << d) + q/2) / q⌋ mod⁺ 2ᵈ + // = DIV((x << d) + q/2, q) & ((1<<d) - 1) + switch d { + case 4: + var t [8]uint16 + idx := 0 + for i := 0; i < N/8; i++ { + for j := 0; j < 8; j++ { + t[j] = uint16(((uint32(p[8*i+j])<<4)+uint32(Q)/2)/ + uint32(Q)) & ((1 << 4) - 1) + } + m[idx] = byte(t[0]) | byte(t[1]<<4) + m[idx+1] = byte(t[2]) | byte(t[3]<<4) + m[idx+2] = byte(t[4]) | byte(t[5]<<4) + m[idx+3] = byte(t[6]) | byte(t[7]<<4) + idx += 4 + } + + case 5: + var t [8]uint16 + idx := 0 + for i := 0; i < N/8; i++ { + for j := 0; j < 8; j++ { + t[j] = uint16(((uint32(p[8*i+j])<<5)+uint32(Q)/2)/ + uint32(Q)) & ((1 << 5) - 1) + } + m[idx] = byte(t[0]) | byte(t[1]<<5) + m[idx+1] = byte(t[1]>>3) | byte(t[2]<<2) | byte(t[3]<<7) + m[idx+2] = byte(t[3]>>1) | byte(t[4]<<4) + m[idx+3] = byte(t[4]>>4) | byte(t[5]<<1) | byte(t[6]<<6) + m[idx+4] = byte(t[6]>>2) | byte(t[7]<<3) + idx += 5 + } + + case 10: + var t [4]uint16 + idx := 0 + for i := 0; i < N/4; i++ { + for j := 0; j < 4; j++ { + t[j] = uint16(((uint32(p[4*i+j])<<10)+uint32(Q)/2)/ + uint32(Q)) & ((1 << 10) - 1) + } + m[idx] = byte(t[0]) + m[idx+1] = byte(t[0]>>8) | byte(t[1]<<2) + m[idx+2] = byte(t[1]>>6) | byte(t[2]<<4) + m[idx+3] = byte(t[2]>>4) | byte(t[3]<<6) + m[idx+4] = byte(t[3] >> 2) + idx += 5 + } + case 11: + var t [8]uint16 + idx := 0 + for i := 0; i < N/8; i++ { + for j := 0; j < 8; j++ { + t[j] = uint16(((uint32(p[8*i+j])<<11)+uint32(Q)/2)/ + uint32(Q)) & ((1 << 11) - 1) + } + m[idx] = byte(t[0]) + m[idx+1] = byte(t[0]>>8) | byte(t[1]<<3) + m[idx+2] = byte(t[1]>>5) | byte(t[2]<<6) + m[idx+3] = byte(t[2] >> 2) + m[idx+4] = byte(t[2]>>10) | byte(t[3]<<1) + m[idx+5] = byte(t[3]>>7) | byte(t[4]<<4) + m[idx+6] = byte(t[4]>>4) | byte(t[5]<<7) + m[idx+7] = byte(t[5] >> 1) + m[idx+8] = byte(t[5]>>9) | byte(t[6]<<2) + m[idx+9] = byte(t[6]>>6) | byte(t[7]<<5) + m[idx+10] = byte(t[7] >> 3) + idx += 11 + } + default: + panic("unsupported d") + } +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/sample.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/sample.go new file mode 100644 index 00000000000..1f15f32c8d2 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/sample.go @@ -0,0 +1,236 @@ +package common + +import ( + "encoding/binary" + + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/simd/keccakf1600" +) + +// DeriveX4Available indicates whether the system supports the quick fourway +// sampling variants like PolyDeriveUniformX4. +var DeriveX4Available = keccakf1600.IsEnabledX4() + +// Samples p from a centered binomial distribution with given η. +// +// Essentially CBD_η(PRF(seed, nonce)) from the specification. +func (p *Poly) DeriveNoise(seed []byte, nonce uint8, eta int) { + switch eta { + case 2: + p.DeriveNoise2(seed, nonce) + case 3: + p.DeriveNoise3(seed, nonce) + default: + panic("unsupported eta") + } +} + +// Sample p from a centered binomial distribution with n=6 and p=½ - that is: +// coefficients are in {-3, -2, -1, 0, 1, 2, 3} with probabilities {1/64, 3/32, +// 15/64, 5/16, 15/64, 3/32, 1/64}. +func (p *Poly) DeriveNoise3(seed []byte, nonce uint8) { + keySuffix := [1]byte{nonce} + h := sha3.NewShake256() + _, _ = h.Write(seed[:]) + _, _ = h.Write(keySuffix[:]) + + // The distribution at hand is exactly the same as that + // of (a₁ + a₂ + a₃) - (b₁ + b₂ + b₃) where a_i,b_i~U(1). Thus we need + // 6 bits per coefficient, thus 192 bytes of input entropy. + + // We add two extra zero bytes in the buffer to be able to read 8 bytes + // at the same time (while using only 6.) + var buf [192 + 2]byte + _, _ = h.Read(buf[:192]) + + for i := 0; i < 32; i++ { + // t is interpreted as a₁ + 2a₂ + 4a₃ + 8b₁ + 16b₂ + ….
+ t := binary.LittleEndian.Uint64(buf[6*i:]) + + d := t & 0x249249249249 // a₁ + 8b₁ + … + d += (t >> 1) & 0x249249249249 // a₁ + a₂ + 8(b₁ + b₂) + … + d += (t >> 2) & 0x249249249249 // a₁ + a₂ + a₃ + 8(b₁ + b₂ + b₃) + … + + for j := 0; j < 8; j++ { + a := int16(d) & 0x7 // a₁ + a₂ + a₃ + d >>= 3 + b := int16(d) & 0x7 // b₁ + b₂ + b₃ + d >>= 3 + p[8*i+j] = a - b + } + } +} + +// Sample p from a centered binomial distribution with n=4 and p=½ - that is: +// coefficients are in {-2, -1, 0, 1, 2} with probabilities {1/16, 1/4, +// 3/8, 1/4, 1/16}. +func (p *Poly) DeriveNoise2(seed []byte, nonce uint8) { + keySuffix := [1]byte{nonce} + h := sha3.NewShake256() + _, _ = h.Write(seed[:]) + _, _ = h.Write(keySuffix[:]) + + // The distribution at hand is exactly the same as that + // of (a + a') - (b + b') where a,a',b,b'~U(1). Thus we need 4 bits per + // coefficient, thus 128 bytes of input entropy. + + var buf [128]byte + _, _ = h.Read(buf[:]) + + for i := 0; i < 16; i++ { + // t is interpreted as a + 2a' + 4b + 8b' + …. + t := binary.LittleEndian.Uint64(buf[8*i:]) + + d := t & 0x5555555555555555 // a + 4b + … + d += (t >> 1) & 0x5555555555555555 // a+a' + 4(b + b') + … + + for j := 0; j < 16; j++ { + a := int16(d) & 0x3 + d >>= 2 + b := int16(d) & 0x3 + d >>= 2 + p[16*i+j] = a - b + } + } +} + +// For each i, sample ps[i] uniformly from the given seed for coordinates +// xs[i] and ys[i]. ps[i] may be nil and is ignored in that case. +// +// Can only be called when DeriveX4Available is true. +func PolyDeriveUniformX4(ps [4]*Poly, seed *[32]byte, xs, ys [4]uint8) { + var perm keccakf1600.StateX4 + state := perm.Initialize() + + // Absorb the seed in the four states + for i := 0; i < 4; i++ { + v := binary.LittleEndian.Uint64(seed[8*i : 8*(i+1)]) + for j := 0; j < 4; j++ { + state[i*4+j] = v + } + } + + // Absorb the coordinates, the SHAKE128 domain separator (0b1111), the + // start of the padding (0b…001) and the end of the padding 0b100…. + // Recall that the rate of SHAKE128 is 168; ie. 21 uint64s. + for j := 0; j < 4; j++ { + state[4*4+j] = uint64(xs[j]) | (uint64(ys[j]) << 8) | (0x1f << 16) + state[20*4+j] = 0x80 << 56 + } + + var idx [4]int // indices into ps + for j := 0; j < 4; j++ { + if ps[j] == nil { + idx[j] = N // mark nil polynomials as completed + } + } + + done := false + for !done { + // Applies Keccak-f[1600] to state to get the next 21 uint64s of each of + // the four SHAKE128 streams. + perm.Permute() + + done = true + + PolyLoop: + for j := 0; j < 4; j++ { + if idx[j] == N { + continue + } + for i := 0; i < 7; i++ { + var t [16]uint16 + + v1 := state[i*3*4+j] + v2 := state[(i*3+1)*4+j] + v3 := state[(i*3+2)*4+j] + + t[0] = uint16(v1) & 0xfff + t[1] = uint16(v1>>12) & 0xfff + t[2] = uint16(v1>>24) & 0xfff + t[3] = uint16(v1>>36) & 0xfff + t[4] = uint16(v1>>48) & 0xfff + t[5] = uint16((v1>>60)|(v2<<4)) & 0xfff + + t[6] = uint16(v2>>8) & 0xfff + t[7] = uint16(v2>>20) & 0xfff + t[8] = uint16(v2>>32) & 0xfff + t[9] = uint16(v2>>44) & 0xfff + t[10] = uint16((v2>>56)|(v3<<8)) & 0xfff + + t[11] = uint16(v3>>4) & 0xfff + t[12] = uint16(v3>>16) & 0xfff + t[13] = uint16(v3>>28) & 0xfff + t[14] = uint16(v3>>40) & 0xfff + t[15] = uint16(v3>>52) & 0xfff + + for k := 0; k < 16; k++ { + if t[k] < uint16(Q) { + ps[j][idx[j]] = int16(t[k]) + idx[j]++ + if idx[j] == N { + continue PolyLoop + } + } + } + } + + done = false + } + } + + for i := 0; i < 4; i++ { + if ps[i] != nil { + ps[i].Tangle() + } + } +} + +// Sample p uniformly from the given seed and x and y coordinates.
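The masked-add trick in DeriveNoise2 above sums the two 'a' bits and the two 'b' bits of every 4-bit group in parallel. A standalone check against a direct popcount over random words (illustrative, not part of the patch):

package main

import (
	"fmt"
	"math/bits"
	"math/rand"
)

func main() {
	for n := 0; n < 1000; n++ {
		t := rand.Uint64()

		// The bit-sliced sums, exactly as in DeriveNoise2.
		d := t & 0x5555555555555555
		d += (t >> 1) & 0x5555555555555555

		for j := 0; j < 16; j++ {
			g := t >> (4 * j) & 0xf // the j-th group a a' b b'
			want := bits.OnesCount64(g&3) - bits.OnesCount64(g>>2)

			a := int(d) & 3
			d >>= 2
			b := int(d) & 3
			d >>= 2
			if a-b != want {
				panic("mismatch")
			}
		}
	}
	fmt.Println("ok")
}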
+// +// Coefficients are reduced and will be in "tangled" order. See Tangle(). +func (p *Poly) DeriveUniform(seed *[32]byte, x, y uint8) { + var seedSuffix [2]byte + var buf [168]byte // rate of SHAKE-128 + + seedSuffix[0] = x + seedSuffix[1] = y + + h := sha3.NewShake128() + _, _ = h.Write(seed[:]) + _, _ = h.Write(seedSuffix[:]) + + i := 0 + for { + _, _ = h.Read(buf[:]) + + for j := 0; j < 168; j += 3 { + t1 := (uint16(buf[j]) | (uint16(buf[j+1]) << 8)) & 0xfff + t2 := (uint16(buf[j+1]>>4) | (uint16(buf[j+2]) << 4)) & 0xfff + + if t1 < uint16(Q) { + p[i] = int16(t1) + i++ + + if i == N { + break + } + } + + if t2 < uint16(Q) { + p[i] = int16(t2) + i++ + + if i == N { + break + } + } + } + + if i == N { + break + } + } + + p.Tangle() +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/stubs_amd64.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/stubs_amd64.go new file mode 100644 index 00000000000..dd869993e9d --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/stubs_amd64.go @@ -0,0 +1,33 @@ +// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT. + +//go:build amd64 +// +build amd64 + +package common + +//go:noescape +func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16) + +//go:noescape +func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16) + +//go:noescape +func nttAVX2(p *[256]int16) + +//go:noescape +func invNttAVX2(p *[256]int16) + +//go:noescape +func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16) + +//go:noescape +func detangleAVX2(p *[256]int16) + +//go:noescape +func tangleAVX2(p *[256]int16) + +//go:noescape +func barrettReduceAVX2(p *[256]int16) + +//go:noescape +func normalizeAVX2(p *[256]int16) diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/cpapke.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/cpapke.go new file mode 100644 index 00000000000..01ef88b2f60 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/cpapke.go @@ -0,0 +1,176 @@ +// Code generated from kyber512/internal/cpapke.go by gen.go + +package internal + +import ( + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/pke/kyber/internal/common" +) + +// A Kyber.CPAPKE private key. +type PrivateKey struct { + sh Vec // NTT(s), normalized +} + +// A Kyber.CPAPKE public key. +type PublicKey struct { + rho [32]byte // ρ, the seed for the matrix A + th Vec // NTT(t), normalized + + // cached values + aT Mat // the matrix Aᵀ +} + +// Packs the private key to buf. +func (sk *PrivateKey) Pack(buf []byte) { + sk.sh.Pack(buf) +} + +// Unpacks the private key from buf. +func (sk *PrivateKey) Unpack(buf []byte) { + sk.sh.Unpack(buf) + sk.sh.Normalize() +} + +// Packs the public key to buf. +func (pk *PublicKey) Pack(buf []byte) { + pk.th.Pack(buf) + copy(buf[K*common.PolySize:], pk.rho[:]) +} + +// Unpacks the public key from buf. +func (pk *PublicKey) Unpack(buf []byte) { + pk.th.Unpack(buf) + pk.th.Normalize() + copy(pk.rho[:], buf[K*common.PolySize:]) + pk.aT.Derive(&pk.rho, true) +} + +// Derives a new Kyber.CPAPKE keypair from the given seed. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + var pk PublicKey + var sk PrivateKey + + var expandedSeed [64]byte + + h := sha3.New512() + _, _ = h.Write(seed) + + // This writes hash into expandedSeed. Yes, this is idiomatic Go. 
+ _, _ = h.Read(expandedSeed[:]) + + copy(pk.rho[:], expandedSeed[:32]) + sigma := expandedSeed[32:] // σ, the noise seed + + pk.aT.Derive(&pk.rho, false) // Expand ρ to matrix A; we'll transpose later + + var eh Vec + sk.sh.DeriveNoise(sigma, 0, Eta1) // Sample secret vector s + sk.sh.NTT() + sk.sh.Normalize() + + eh.DeriveNoise(sigma, K, Eta1) // Sample blind e + eh.NTT() + + // Next, we compute t = A s + e. + for i := 0; i < K; i++ { + // Note that coefficients of s are bounded by q and those of A + // are bounded by 4.5q and so their product is bounded by 2¹⁵q + // as required for multiplication. + PolyDotHat(&pk.th[i], &pk.aT[i], &sk.sh) + + // A and s were not in Montgomery form, so the Montgomery + // multiplications in the inner product added a factor R⁻¹ which + // we'll cancel out now. This will also ensure the coefficients of + // t are bounded in absolute value by q. + pk.th[i].ToMont() + } + + pk.th.Add(&pk.th, &eh) // bounded by 8q. + pk.th.Normalize() + pk.aT.Transpose() + + return &pk, &sk +} + +// Decrypts ciphertext ct meant for private key sk to plaintext pt. +func (sk *PrivateKey) DecryptTo(pt, ct []byte) { + var u Vec + var v, m common.Poly + + u.Decompress(ct, DU) + v.Decompress(ct[K*compressedPolySize(DU):], DV) + + // Compute m = v - <s, u> + u.NTT() + PolyDotHat(&m, &sk.sh, &u) + m.BarrettReduce() + m.InvNTT() + m.Sub(&v, &m) + m.Normalize() + + // Compress polynomial m to original message + m.CompressMessageTo(pt) +} + +// Encrypts message pt for the public key to ciphertext ct using randomness +// from seed. +// +// seed has to be of length SeedSize, pt of PlaintextSize and ct of +// CiphertextSize. +func (pk *PublicKey) EncryptTo(ct, pt, seed []byte) { + var rh, e1, u Vec + var e2, v, m common.Poly + + // Sample r, e₁ and e₂ from B_η + rh.DeriveNoise(seed, 0, Eta1) + rh.NTT() + rh.BarrettReduce() + + e1.DeriveNoise(seed, K, common.Eta2) + e2.DeriveNoise(seed, 2*K, common.Eta2) + + // Next we compute u = Aᵀ r + e₁. First Aᵀ r. + for i := 0; i < K; i++ { + // Note that coefficients of r are bounded by q and those of Aᵀ + // are bounded by 4.5q and so their product is bounded by 2¹⁵q + // as required for multiplication. + PolyDotHat(&u[i], &pk.aT[i], &rh) + } + + u.BarrettReduce() + + // Aᵀ and r were not in Montgomery form, so the Montgomery + // multiplications in the inner product added a factor R⁻¹ which + // the InvNTT cancels out. + u.InvNTT() + + u.Add(&u, &e1) // u = Aᵀ r + e₁ + + // Next compute v = <t, r> + e₂ + Decompress_q(m, 1). + PolyDotHat(&v, &pk.th, &rh) + v.BarrettReduce() + v.InvNTT() + + m.DecompressMessage(pt) + v.Add(&v, &m) + v.Add(&v, &e2) // v = <t, r> + e₂ + Decompress_q(m, 1) + + // Pack ciphertext + u.Normalize() + v.Normalize() + + u.CompressTo(ct, DU) + v.CompressTo(ct[K*compressedPolySize(DU):], DV) +} + +// Returns whether sk equals other. +func (sk *PrivateKey) Equal(other *PrivateKey) bool { + ret := int16(0) + for i := 0; i < K; i++ { + for j := 0; j < common.N; j++ { + ret |= sk.sh[i][j] ^ other.sh[i][j] + } + } + return ret == 0 +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/mat.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/mat.go new file mode 100644 index 00000000000..e8a35affa24 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/mat.go @@ -0,0 +1,85 @@ +// Code generated from kyber512/internal/mat.go by gen.go + +package internal + +import ( + "github.com/cloudflare/circl/pke/kyber/internal/common" +) + +// A k by k matrix of polynomials.
+type Mat [K]Vec + +// Expands the given seed to the corresponding matrix A or its transpose Aᵀ. +func (m *Mat) Derive(seed *[32]byte, transpose bool) { + if !common.DeriveX4Available { + if transpose { + for i := 0; i < K; i++ { + for j := 0; j < K; j++ { + m[i][j].DeriveUniform(seed, uint8(i), uint8(j)) + } + } + } else { + for i := 0; i < K; i++ { + for j := 0; j < K; j++ { + m[i][j].DeriveUniform(seed, uint8(j), uint8(i)) + } + } + } + return + } + + var ps [4]*common.Poly + var xs [4]uint8 + var ys [4]uint8 + x := uint8(0) + y := uint8(0) + + for x != K { + idx := 0 + for ; idx < 4; idx++ { + ps[idx] = &m[x][y] + + if transpose { + xs[idx] = x + ys[idx] = y + } else { + xs[idx] = y + ys[idx] = x + } + + y++ + if y == K { + x++ + y = 0 + + if x == K { + if idx == 0 { + // If there is just one left, then a plain DeriveUniform + // is quicker than the X4 variant. + ps[0].DeriveUniform(seed, xs[0], ys[0]) + return + } + + for idx++; idx < 4; idx++ { + ps[idx] = nil + } + + break + } + } + } + + common.PolyDeriveUniformX4(ps, seed, xs, ys) + } +} + +// Transposes A in place. +func (m *Mat) Transpose() { + for i := 0; i < K-1; i++ { + for j := i + 1; j < K; j++ { + t := m[i][j] + m[i][j] = m[j][i] + m[j][i] = t + } + } +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/params.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/params.go new file mode 100644 index 00000000000..669b0edaca2 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/params.go @@ -0,0 +1,21 @@ +// Code generated from params.templ.go. DO NOT EDIT. + +package internal + +import ( + "github.com/cloudflare/circl/pke/kyber/internal/common" +) + +const ( + K = 4 + Eta1 = 2 + DU = 11 + DV = 5 + PublicKeySize = 32 + K*common.PolySize + + PrivateKeySize = K * common.PolySize + + PlaintextSize = common.PlaintextSize + SeedSize = 32 + CiphertextSize = 1568 +) diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/vec.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/vec.go new file mode 100644 index 00000000000..6681895a72e --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/internal/vec.go @@ -0,0 +1,125 @@ +// Code generated from kyber512/internal/vec.go by gen.go + +package internal + +import ( + "github.com/cloudflare/circl/pke/kyber/internal/common" +) + +// A vector of K polynomials +type Vec [K]common.Poly + +// Samples v[i] from a centered binomial distribution with given η, +// seed and nonce+i. +// +// Essentially CBD_η(PRF(seed, nonce+i)) from the specification. +func (v *Vec) DeriveNoise(seed []byte, nonce uint8, eta int) { + for i := 0; i < K; i++ { + v[i].DeriveNoise(seed, nonce+uint8(i), eta) + } +} + +// Sets p to the inner product of a and b using "pointwise" multiplication. +// +// See MulHat() and NTT() for a description of the multiplication. +// Assumes a and b are in Montgomery form. p will be in Montgomery form, +// and its coefficients will be bounded in absolute value by 2kq. +// If a and b are not in Montgomery form, then the action is the same +// as "pointwise" multiplication followed by multiplying by R⁻¹, the inverse +// of the Montgomery factor. +func PolyDotHat(p *common.Poly, a, b *Vec) { + var t common.Poly + *p = common.Poly{} // set p to zero + for i := 0; i < K; i++ { + t.MulHat(&a[i], &b[i]) + p.Add(&t, p) + } +} + +// Almost normalizes coefficients in-place. +// +// Ensures each coefficient is in {0, …, q}.
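The fixed sizes in the kyber1024 params above follow directly from the parameters; a small arithmetic check (illustrative only; the constant names are mine):

package main

import "fmt"

func main() {
	const (
		k        = 4   // K for Kyber1024
		polySize = 384 // 256 coefficients, 12 bits each
		du, dv   = 11, 5
	)
	// 256·d/8 bytes per compressed polynomial, cf. compressedPolySize below.
	compressed := func(d int) int { return 256 * d / 8 }

	fmt.Println(32 + k*polySize)                   // 1568 = PublicKeySize
	fmt.Println(k * polySize)                      // 1536 = PrivateKeySize
	fmt.Println(k*compressed(du) + compressed(dv)) // 1568 = CiphertextSize
}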
+func (v *Vec) BarrettReduce() { + for i := 0; i < K; i++ { + v[i].BarrettReduce() + } +} + +// Normalizes coefficients in-place. +// +// Ensures each coefficient is in {0, …, q-1}. +func (v *Vec) Normalize() { + for i := 0; i < K; i++ { + v[i].Normalize() + } +} + +// Applies in-place inverse NTT(). See Poly.InvNTT() for assumptions. +func (v *Vec) InvNTT() { + for i := 0; i < K; i++ { + v[i].InvNTT() + } +} + +// Applies in-place forward NTT(). See Poly.NTT() for assumptions. +func (v *Vec) NTT() { + for i := 0; i < K; i++ { + v[i].NTT() + } +} + +// Sets v to a + b. +func (v *Vec) Add(a, b *Vec) { + for i := 0; i < K; i++ { + v[i].Add(&a[i], &b[i]) + } +} + +// Packs v into buf, which must be of length K*PolySize. +func (v *Vec) Pack(buf []byte) { + for i := 0; i < K; i++ { + v[i].Pack(buf[common.PolySize*i:]) + } +} + +// Unpacks v from buf which must be of length K*PolySize. +func (v *Vec) Unpack(buf []byte) { + for i := 0; i < K; i++ { + v[i].Unpack(buf[common.PolySize*i:]) + } +} + +// Writes Compress_q(v, d) to m. +// +// Assumes v is normalized and d is in {3, 4, 5, 10, 11}. +func (v *Vec) CompressTo(m []byte, d int) { + size := compressedPolySize(d) + for i := 0; i < K; i++ { + v[i].CompressTo(m[size*i:], d) + } +} + +// Set v to Decompress_q(m, 1). +// +// Assumes d is in {3, 4, 5, 10, 11}. v will be normalized. +func (v *Vec) Decompress(m []byte, d int) { + size := compressedPolySize(d) + for i := 0; i < K; i++ { + v[i].Decompress(m[size*i:], d) + } +} + +// ⌈(256 d)/8⌉ +func compressedPolySize(d int) int { + switch d { + case 4: + return 128 + case 5: + return 160 + case 10: + return 320 + case 11: + return 352 + } + panic("unsupported d") +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/kyber.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/kyber.go new file mode 100644 index 00000000000..fb5911facd4 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber1024/kyber.go @@ -0,0 +1,145 @@ +// Code generated from modePkg.templ.go. DO NOT EDIT. + +// kyber1024 implements the IND-CPA-secure Public Key Encryption +// scheme Kyber1024.CPAPKE as submitted to round 3 of the NIST PQC competition +// and described in +// +// https://pq-crystals.org/kyber/data/kyber-specification-round3.pdf +package kyber1024 + +import ( + cryptoRand "crypto/rand" + "io" + + "github.com/cloudflare/circl/pke/kyber/kyber1024/internal" +) + +const ( + // Size of seed for NewKeyFromSeed + KeySeedSize = internal.SeedSize + + // Size of seed for EncryptTo + EncryptionSeedSize = internal.SeedSize + + // Size of a packed PublicKey + PublicKeySize = internal.PublicKeySize + + // Size of a packed PrivateKey + PrivateKeySize = internal.PrivateKeySize + + // Size of a ciphertext + CiphertextSize = internal.CiphertextSize + + // Size of a plaintext + PlaintextSize = internal.PlaintextSize +) + +// PublicKey is the type of Kyber1024.CPAPKE public key +type PublicKey internal.PublicKey + +// PrivateKey is the type of Kyber1024.CPAPKE private key +type PrivateKey internal.PrivateKey + +// GenerateKey generates a public/private key pair using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. 
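The packed sizes in the kyber1024 package above follow directly from the parameters K = 4, DU = 11, DV = 5. A quick, illustrative sanity check (not part of the patch; it assumes common.PolySize = 384, i.e. 256 coefficients of 12 bits each):

package main

import "fmt"

// Mirrors compressedPolySize above for the supported d:
// 256 coefficients of d bits each is 256*d/8 = 32*d bytes.
func compressedPolySize(d int) int { return 32 * d }

func main() {
	const K, DU, DV = 4, 11, 5 // kyber1024 parameters
	const polySize = 384       // assumed common.PolySize

	// Ciphertext: K compressed polynomials for u, then one for v.
	fmt.Println(K*compressedPolySize(DU) + compressedPolySize(DV)) // 4*352 + 160 = 1568 = CiphertextSize

	// Public key: packed t̂ plus the 32-byte matrix seed ρ.
	fmt.Println(32 + K*polySize) // 1568 = PublicKeySize (equal to the ciphertext size by coincidence)
}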
+func GenerateKey(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [KeySeedSize]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := internal.NewKeyFromSeed(seed[:]) + return (*PublicKey)(pk), (*PrivateKey)(sk), nil +} + +// NewKeyFromSeed derives a public/private key pair using the given seed. +// +// Panics if seed is not of length KeySeedSize. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + if len(seed) != KeySeedSize { + panic("seed must be of length KeySeedSize") + } + pk, sk := internal.NewKeyFromSeed(seed) + return (*PublicKey)(pk), (*PrivateKey)(sk) +} + +// EncryptTo encrypts message pt for the public key and writes the ciphertext +// to ct using randomness from seed. +// +// This function panics if the lengths of pt, seed, and ct are not +// PlaintextSize, EncryptionSeedSize, and CiphertextSize respectively. +func (pk *PublicKey) EncryptTo(ct []byte, pt []byte, seed []byte) { + if len(pt) != PlaintextSize { + panic("pt must be of length PlaintextSize") + } + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + if len(seed) != EncryptionSeedSize { + panic("seed must be of length EncryptionSeedSize") + } + (*internal.PublicKey)(pk).EncryptTo(ct, pt, seed) +} + +// DecryptTo decrypts message ct for the private key and writes the +// plaintext to pt. +// +// This function panics if the lengths of ct and pt are not +// CiphertextSize and PlaintextSize respectively. +func (sk *PrivateKey) DecryptTo(pt []byte, ct []byte) { + if len(pt) != PlaintextSize { + panic("pt must be of length PlaintextSize") + } + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + (*internal.PrivateKey)(sk).DecryptTo(pt, ct) +} + +// Packs pk into the given buffer. +// +// Panics if buf is not of length PublicKeySize. +func (pk *PublicKey) Pack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of size PublicKeySize") + } + (*internal.PublicKey)(pk).Pack(buf) +} + +// Packs sk into the given buffer. +// +// Panics if buf is not of length PrivateKeySize. +func (sk *PrivateKey) Pack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of size PrivateKeySize") + } + (*internal.PrivateKey)(sk).Pack(buf) +} + +// Unpacks pk from the given buffer. +// +// Panics if buf is not of length PublicKeySize. +func (pk *PublicKey) Unpack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of size PublicKeySize") + } + (*internal.PublicKey)(pk).Unpack(buf) +} + +// Unpacks sk from the given buffer. +// +// Panics if buf is not of length PrivateKeySize. +func (sk *PrivateKey) Unpack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of size PrivateKeySize") + } + (*internal.PrivateKey)(sk).Unpack(buf) +} + +// Returns whether the two private keys are equal. 
+func (sk *PrivateKey) Equal(other *PrivateKey) bool {
+	return (*internal.PrivateKey)(sk).Equal((*internal.PrivateKey)(other))
+}
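To make the API concrete, a minimal round trip with the kyber1024 package above (illustrative only, not part of the patch; outside this vendored tree the import path is github.com/cloudflare/circl/pke/kyber/kyber1024, and the all-zero seeds are for demonstration only):

package main

import (
	"bytes"
	"fmt"

	"github.com/cloudflare/circl/pke/kyber/kyber1024"
)

func main() {
	// Deterministic key generation and encryption from fixed seeds.
	keySeed := make([]byte, kyber1024.KeySeedSize)
	encSeed := make([]byte, kyber1024.EncryptionSeedSize)
	pk, sk := kyber1024.NewKeyFromSeed(keySeed)

	pt := make([]byte, kyber1024.PlaintextSize)
	copy(pt, "hello")

	ct := make([]byte, kyber1024.CiphertextSize)
	pk.EncryptTo(ct, pt, encSeed)

	pt2 := make([]byte, kyber1024.PlaintextSize)
	sk.DecryptTo(pt2, ct)

	fmt.Println(bytes.Equal(pt, pt2)) // true
}

Note that this is the IND-CPA PKE layer only; the TLS glue uses the IND-CCA KEM built on top of it.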
diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/cpapke.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/cpapke.go
new file mode 100644
index 00000000000..80ab2501c2f
--- /dev/null
+++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/cpapke.go
@@ -0,0 +1,174 @@
+package internal
+
+import (
+	"github.com/cloudflare/circl/internal/sha3"
+	"github.com/cloudflare/circl/pke/kyber/internal/common"
+)
+
+// A Kyber.CPAPKE private key.
+type PrivateKey struct {
+	sh Vec // NTT(s), normalized
+}
+
+// A Kyber.CPAPKE public key.
+type PublicKey struct {
+	rho [32]byte // ρ, the seed for the matrix A
+	th  Vec      // NTT(t), normalized
+
+	// cached values
+	aT Mat // the matrix Aᵀ
+}
+
+// Packs the private key to buf.
+func (sk *PrivateKey) Pack(buf []byte) {
+	sk.sh.Pack(buf)
+}
+
+// Unpacks the private key from buf.
+func (sk *PrivateKey) Unpack(buf []byte) {
+	sk.sh.Unpack(buf)
+	sk.sh.Normalize()
+}
+
+// Packs the public key to buf.
+func (pk *PublicKey) Pack(buf []byte) {
+	pk.th.Pack(buf)
+	copy(buf[K*common.PolySize:], pk.rho[:])
+}
+
+// Unpacks the public key from buf.
+func (pk *PublicKey) Unpack(buf []byte) {
+	pk.th.Unpack(buf)
+	pk.th.Normalize()
+	copy(pk.rho[:], buf[K*common.PolySize:])
+	pk.aT.Derive(&pk.rho, true)
+}
+
+// Derives a new Kyber.CPAPKE keypair from the given seed.
+func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) {
+	var pk PublicKey
+	var sk PrivateKey
+
+	var expandedSeed [64]byte
+
+	h := sha3.New512()
+	_, _ = h.Write(seed)
+
+	// This writes hash into expandedSeed. Yes, this is idiomatic Go.
+	_, _ = h.Read(expandedSeed[:])
+
+	copy(pk.rho[:], expandedSeed[:32])
+	sigma := expandedSeed[32:] // σ, the noise seed
+
+	pk.aT.Derive(&pk.rho, false) // Expand ρ to matrix A; we'll transpose later
+
+	var eh Vec
+	sk.sh.DeriveNoise(sigma, 0, Eta1) // Sample secret vector s
+	sk.sh.NTT()
+	sk.sh.Normalize()
+
+	eh.DeriveNoise(sigma, K, Eta1) // Sample blind e
+	eh.NTT()
+
+	// Next, we compute t = A s + e.
+	for i := 0; i < K; i++ {
+		// Note that coefficients of s are bounded by q and those of A
+		// are bounded by 4.5q and so their product is bounded by 2¹⁵q
+		// as required for multiplication.
+		PolyDotHat(&pk.th[i], &pk.aT[i], &sk.sh)
+
+		// A and s were not in Montgomery form, so the Montgomery
+		// multiplications in the inner product added a factor R⁻¹ which
+		// we'll cancel out now. This will also ensure the coefficients of
+		// t are bounded in absolute value by q.
+		pk.th[i].ToMont()
+	}
+
+	pk.th.Add(&pk.th, &eh) // bounded by 8q.
+	pk.th.Normalize()
+	pk.aT.Transpose()
+
+	return &pk, &sk
+}
+
+// Decrypts ciphertext ct meant for private key sk to plaintext pt.
+func (sk *PrivateKey) DecryptTo(pt, ct []byte) {
+	var u Vec
+	var v, m common.Poly
+
+	u.Decompress(ct, DU)
+	v.Decompress(ct[K*compressedPolySize(DU):], DV)
+
+	// Compute m = v - ⟨s, u⟩
+	u.NTT()
+	PolyDotHat(&m, &sk.sh, &u)
+	m.BarrettReduce()
+	m.InvNTT()
+	m.Sub(&v, &m)
+	m.Normalize()
+
+	// Compress polynomial m to original message
+	m.CompressMessageTo(pt)
+}
+
+// Encrypts message pt for the public key to ciphertext ct using randomness
+// from seed.
+//
+// seed has to be of length SeedSize, pt of PlaintextSize and ct of
+// CiphertextSize.
+func (pk *PublicKey) EncryptTo(ct, pt, seed []byte) {
+	var rh, e1, u Vec
+	var e2, v, m common.Poly
+
+	// Sample r, e₁ and e₂ from B_η
+	rh.DeriveNoise(seed, 0, Eta1)
+	rh.NTT()
+	rh.BarrettReduce()
+
+	e1.DeriveNoise(seed, K, common.Eta2)
+	e2.DeriveNoise(seed, 2*K, common.Eta2)
+
+	// Next we compute u = Aᵀ r + e₁. First Aᵀ.
+	for i := 0; i < K; i++ {
+		// Note that coefficients of r are bounded by q and those of Aᵀ
+		// are bounded by 4.5q and so their product is bounded by 2¹⁵q
+		// as required for multiplication.
+		PolyDotHat(&u[i], &pk.aT[i], &rh)
+	}
+
+	u.BarrettReduce()
+
+	// Aᵀ and r were not in Montgomery form, so the Montgomery
+	// multiplications in the inner product added a factor R⁻¹ which
+	// the InvNTT cancels out.
+	u.InvNTT()
+
+	u.Add(&u, &e1) // u = Aᵀ r + e₁
+
+	// Next compute v = ⟨t, r⟩ + e₂ + Decompress_q(m, 1).
+	PolyDotHat(&v, &pk.th, &rh)
+	v.BarrettReduce()
+	v.InvNTT()
+
+	m.DecompressMessage(pt)
+	v.Add(&v, &m)
+	v.Add(&v, &e2) // v = ⟨t, r⟩ + e₂ + Decompress_q(m, 1)
+
+	// Pack ciphertext
+	u.Normalize()
+	v.Normalize()
+
+	u.CompressTo(ct, DU)
+	v.CompressTo(ct[K*compressedPolySize(DU):], DV)
+}
+
+// Returns whether sk equals other.
+func (sk *PrivateKey) Equal(other *PrivateKey) bool {
+	ret := int16(0)
+	for i := 0; i < K; i++ {
+		for j := 0; j < common.N; j++ {
+			ret |= sk.sh[i][j] ^ other.sh[i][j]
+		}
+	}
+	return ret == 0
+}
diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/mat.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/mat.go
new file mode 100644
index 00000000000..9871a7741d2
--- /dev/null
+++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/mat.go
@@ -0,0 +1,83 @@
+package internal
+
+import (
+	"github.com/cloudflare/circl/pke/kyber/internal/common"
+)
+
+// A k by k matrix of polynomials.
+type Mat [K]Vec
+
+// Expands the given seed to the corresponding matrix A or its transpose Aᵀ.
+func (m *Mat) Derive(seed *[32]byte, transpose bool) {
+	if !common.DeriveX4Available {
+		if transpose {
+			for i := 0; i < K; i++ {
+				for j := 0; j < K; j++ {
+					m[i][j].DeriveUniform(seed, uint8(i), uint8(j))
+				}
+			}
+		} else {
+			for i := 0; i < K; i++ {
+				for j := 0; j < K; j++ {
+					m[i][j].DeriveUniform(seed, uint8(j), uint8(i))
+				}
+			}
+		}
+		return
+	}
+
+	var ps [4]*common.Poly
+	var xs [4]uint8
+	var ys [4]uint8
+	x := uint8(0)
+	y := uint8(0)
+
+	for x != K {
+		idx := 0
+		for ; idx < 4; idx++ {
+			ps[idx] = &m[x][y]
+
+			if transpose {
+				xs[idx] = x
+				ys[idx] = y
+			} else {
+				xs[idx] = y
+				ys[idx] = x
+			}
+
+			y++
+			if y == K {
+				x++
+				y = 0
+
+				if x == K {
+					if idx == 0 {
+						// If there is just one left, then a plain DeriveUniform
+						// is quicker than the X4 variant.
+						ps[0].DeriveUniform(seed, xs[0], ys[0])
+						return
+					}
+
+					for idx++; idx < 4; idx++ {
+						ps[idx] = nil
+					}
+
+					break
+				}
+			}
+		}
+
+		common.PolyDeriveUniformX4(ps, seed, xs, ys)
+	}
+}
+
+// Transposes A in place.
+func (m *Mat) Transpose() {
+	for i := 0; i < K-1; i++ {
+		for j := i + 1; j < K; j++ {
+			t := m[i][j]
+			m[i][j] = m[j][i]
+			m[j][i] = t
+		}
+	}
+}
diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/params.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/params.go
new file mode 100644
index 00000000000..0e6df77b981
--- /dev/null
+++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/params.go
@@ -0,0 +1,21 @@
+// Code generated from params.templ.go. DO NOT EDIT.
+ +package internal + +import ( + "github.com/cloudflare/circl/pke/kyber/internal/common" +) + +const ( + K = 2 + Eta1 = 3 + DU = 10 + DV = 4 + PublicKeySize = 32 + K*common.PolySize + + PrivateKeySize = K * common.PolySize + + PlaintextSize = common.PlaintextSize + SeedSize = 32 + CiphertextSize = 768 +) diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/vec.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/vec.go new file mode 100644 index 00000000000..222f1ca931f --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/internal/vec.go @@ -0,0 +1,123 @@ +package internal + +import ( + "github.com/cloudflare/circl/pke/kyber/internal/common" +) + +// A vector of K polynomials +type Vec [K]common.Poly + +// Samples v[i] from a centered binomial distribution with given η, +// seed and nonce+i. +// +// Essentially CBD_η(PRF(seed, nonce+i)) from the specification. +func (v *Vec) DeriveNoise(seed []byte, nonce uint8, eta int) { + for i := 0; i < K; i++ { + v[i].DeriveNoise(seed, nonce+uint8(i), eta) + } +} + +// Sets p to the inner product of a and b using "pointwise" multiplication. +// +// See MulHat() and NTT() for a description of the multiplication. +// Assumes a and b are in Montgomery form. p will be in Montgomery form, +// and its coefficients will be bounded in absolute value by 2kq. +// If a and b are not in Montgomery form, then the action is the same +// as "pointwise" multiplication followed by multiplying by R⁻¹, the inverse +// of the Montgomery factor. +func PolyDotHat(p *common.Poly, a, b *Vec) { + var t common.Poly + *p = common.Poly{} // set p to zero + for i := 0; i < K; i++ { + t.MulHat(&a[i], &b[i]) + p.Add(&t, p) + } +} + +// Almost normalizes coefficients in-place. +// +// Ensures each coefficient is in {0, …, q}. +func (v *Vec) BarrettReduce() { + for i := 0; i < K; i++ { + v[i].BarrettReduce() + } +} + +// Normalizes coefficients in-place. +// +// Ensures each coefficient is in {0, …, q-1}. +func (v *Vec) Normalize() { + for i := 0; i < K; i++ { + v[i].Normalize() + } +} + +// Applies in-place inverse NTT(). See Poly.InvNTT() for assumptions. +func (v *Vec) InvNTT() { + for i := 0; i < K; i++ { + v[i].InvNTT() + } +} + +// Applies in-place forward NTT(). See Poly.NTT() for assumptions. +func (v *Vec) NTT() { + for i := 0; i < K; i++ { + v[i].NTT() + } +} + +// Sets v to a + b. +func (v *Vec) Add(a, b *Vec) { + for i := 0; i < K; i++ { + v[i].Add(&a[i], &b[i]) + } +} + +// Packs v into buf, which must be of length K*PolySize. +func (v *Vec) Pack(buf []byte) { + for i := 0; i < K; i++ { + v[i].Pack(buf[common.PolySize*i:]) + } +} + +// Unpacks v from buf which must be of length K*PolySize. +func (v *Vec) Unpack(buf []byte) { + for i := 0; i < K; i++ { + v[i].Unpack(buf[common.PolySize*i:]) + } +} + +// Writes Compress_q(v, d) to m. +// +// Assumes v is normalized and d is in {3, 4, 5, 10, 11}. +func (v *Vec) CompressTo(m []byte, d int) { + size := compressedPolySize(d) + for i := 0; i < K; i++ { + v[i].CompressTo(m[size*i:], d) + } +} + +// Set v to Decompress_q(m, 1). +// +// Assumes d is in {3, 4, 5, 10, 11}. v will be normalized. 
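For reference while reading CompressTo and Decompress (a restatement of the round-3 specification, not text from the patch): with q = 3329,

\[
\mathrm{Compress}_q(x, d) = \left\lceil \frac{2^d}{q} \cdot x \right\rfloor \bmod 2^d,
\qquad
\mathrm{Decompress}_q(y, d) = \left\lceil \frac{q}{2^d} \cdot y \right\rfloor,
\]

so a compressed polynomial stores 256 coefficients of d bits each, that is 256d/8 = 32d bytes. This is exactly the table in compressedPolySize below: d = 4, 5, 10, 11 gives 128, 160, 320 and 352 bytes.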
+func (v *Vec) Decompress(m []byte, d int) { + size := compressedPolySize(d) + for i := 0; i < K; i++ { + v[i].Decompress(m[size*i:], d) + } +} + +// ⌈(256 d)/8⌉ +func compressedPolySize(d int) int { + switch d { + case 4: + return 128 + case 5: + return 160 + case 10: + return 320 + case 11: + return 352 + } + panic("unsupported d") +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/kyber.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/kyber.go new file mode 100644 index 00000000000..ea9248487e1 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber512/kyber.go @@ -0,0 +1,145 @@ +// Code generated from modePkg.templ.go. DO NOT EDIT. + +// kyber512 implements the IND-CPA-secure Public Key Encryption +// scheme Kyber512.CPAPKE as submitted to round 3 of the NIST PQC competition +// and described in +// +// https://pq-crystals.org/kyber/data/kyber-specification-round3.pdf +package kyber512 + +import ( + cryptoRand "crypto/rand" + "io" + + "github.com/cloudflare/circl/pke/kyber/kyber512/internal" +) + +const ( + // Size of seed for NewKeyFromSeed + KeySeedSize = internal.SeedSize + + // Size of seed for EncryptTo + EncryptionSeedSize = internal.SeedSize + + // Size of a packed PublicKey + PublicKeySize = internal.PublicKeySize + + // Size of a packed PrivateKey + PrivateKeySize = internal.PrivateKeySize + + // Size of a ciphertext + CiphertextSize = internal.CiphertextSize + + // Size of a plaintext + PlaintextSize = internal.PlaintextSize +) + +// PublicKey is the type of Kyber512.CPAPKE public key +type PublicKey internal.PublicKey + +// PrivateKey is the type of Kyber512.CPAPKE private key +type PrivateKey internal.PrivateKey + +// GenerateKey generates a public/private key pair using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. +func GenerateKey(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [KeySeedSize]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := internal.NewKeyFromSeed(seed[:]) + return (*PublicKey)(pk), (*PrivateKey)(sk), nil +} + +// NewKeyFromSeed derives a public/private key pair using the given seed. +// +// Panics if seed is not of length KeySeedSize. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + if len(seed) != KeySeedSize { + panic("seed must be of length KeySeedSize") + } + pk, sk := internal.NewKeyFromSeed(seed) + return (*PublicKey)(pk), (*PrivateKey)(sk) +} + +// EncryptTo encrypts message pt for the public key and writes the ciphertext +// to ct using randomness from seed. +// +// This function panics if the lengths of pt, seed, and ct are not +// PlaintextSize, EncryptionSeedSize, and CiphertextSize respectively. +func (pk *PublicKey) EncryptTo(ct []byte, pt []byte, seed []byte) { + if len(pt) != PlaintextSize { + panic("pt must be of length PlaintextSize") + } + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + if len(seed) != EncryptionSeedSize { + panic("seed must be of length EncryptionSeedSize") + } + (*internal.PublicKey)(pk).EncryptTo(ct, pt, seed) +} + +// DecryptTo decrypts message ct for the private key and writes the +// plaintext to pt. +// +// This function panics if the lengths of ct and pt are not +// CiphertextSize and PlaintextSize respectively. 
+func (sk *PrivateKey) DecryptTo(pt []byte, ct []byte) { + if len(pt) != PlaintextSize { + panic("pt must be of length PlaintextSize") + } + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + (*internal.PrivateKey)(sk).DecryptTo(pt, ct) +} + +// Packs pk into the given buffer. +// +// Panics if buf is not of length PublicKeySize. +func (pk *PublicKey) Pack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of size PublicKeySize") + } + (*internal.PublicKey)(pk).Pack(buf) +} + +// Packs sk into the given buffer. +// +// Panics if buf is not of length PrivateKeySize. +func (sk *PrivateKey) Pack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of size PrivateKeySize") + } + (*internal.PrivateKey)(sk).Pack(buf) +} + +// Unpacks pk from the given buffer. +// +// Panics if buf is not of length PublicKeySize. +func (pk *PublicKey) Unpack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of size PublicKeySize") + } + (*internal.PublicKey)(pk).Unpack(buf) +} + +// Unpacks sk from the given buffer. +// +// Panics if buf is not of length PrivateKeySize. +func (sk *PrivateKey) Unpack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of size PrivateKeySize") + } + (*internal.PrivateKey)(sk).Unpack(buf) +} + +// Returns whether the two private keys are equal. +func (sk *PrivateKey) Equal(other *PrivateKey) bool { + return (*internal.PrivateKey)(sk).Equal((*internal.PrivateKey)(other)) +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/cpapke.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/cpapke.go new file mode 100644 index 00000000000..01ef88b2f60 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/cpapke.go @@ -0,0 +1,176 @@ +// Code generated from kyber512/internal/cpapke.go by gen.go + +package internal + +import ( + "github.com/cloudflare/circl/internal/sha3" + "github.com/cloudflare/circl/pke/kyber/internal/common" +) + +// A Kyber.CPAPKE private key. +type PrivateKey struct { + sh Vec // NTT(s), normalized +} + +// A Kyber.CPAPKE public key. +type PublicKey struct { + rho [32]byte // ρ, the seed for the matrix A + th Vec // NTT(t), normalized + + // cached values + aT Mat // the matrix Aᵀ +} + +// Packs the private key to buf. +func (sk *PrivateKey) Pack(buf []byte) { + sk.sh.Pack(buf) +} + +// Unpacks the private key from buf. +func (sk *PrivateKey) Unpack(buf []byte) { + sk.sh.Unpack(buf) + sk.sh.Normalize() +} + +// Packs the public key to buf. +func (pk *PublicKey) Pack(buf []byte) { + pk.th.Pack(buf) + copy(buf[K*common.PolySize:], pk.rho[:]) +} + +// Unpacks the public key from buf. +func (pk *PublicKey) Unpack(buf []byte) { + pk.th.Unpack(buf) + pk.th.Normalize() + copy(pk.rho[:], buf[K*common.PolySize:]) + pk.aT.Derive(&pk.rho, true) +} + +// Derives a new Kyber.CPAPKE keypair from the given seed. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + var pk PublicKey + var sk PrivateKey + + var expandedSeed [64]byte + + h := sha3.New512() + _, _ = h.Write(seed) + + // This writes hash into expandedSeed. Yes, this is idiomatic Go. 
+	_, _ = h.Read(expandedSeed[:])
+
+	copy(pk.rho[:], expandedSeed[:32])
+	sigma := expandedSeed[32:] // σ, the noise seed
+
+	pk.aT.Derive(&pk.rho, false) // Expand ρ to matrix A; we'll transpose later
+
+	var eh Vec
+	sk.sh.DeriveNoise(sigma, 0, Eta1) // Sample secret vector s
+	sk.sh.NTT()
+	sk.sh.Normalize()
+
+	eh.DeriveNoise(sigma, K, Eta1) // Sample blind e
+	eh.NTT()
+
+	// Next, we compute t = A s + e.
+	for i := 0; i < K; i++ {
+		// Note that coefficients of s are bounded by q and those of A
+		// are bounded by 4.5q and so their product is bounded by 2¹⁵q
+		// as required for multiplication.
+		PolyDotHat(&pk.th[i], &pk.aT[i], &sk.sh)
+
+		// A and s were not in Montgomery form, so the Montgomery
+		// multiplications in the inner product added a factor R⁻¹ which
+		// we'll cancel out now. This will also ensure the coefficients of
+		// t are bounded in absolute value by q.
+		pk.th[i].ToMont()
+	}
+
+	pk.th.Add(&pk.th, &eh) // bounded by 8q.
+	pk.th.Normalize()
+	pk.aT.Transpose()
+
+	return &pk, &sk
+}
+
+// Decrypts ciphertext ct meant for private key sk to plaintext pt.
+func (sk *PrivateKey) DecryptTo(pt, ct []byte) {
+	var u Vec
+	var v, m common.Poly
+
+	u.Decompress(ct, DU)
+	v.Decompress(ct[K*compressedPolySize(DU):], DV)
+
+	// Compute m = v - ⟨s, u⟩
+	u.NTT()
+	PolyDotHat(&m, &sk.sh, &u)
+	m.BarrettReduce()
+	m.InvNTT()
+	m.Sub(&v, &m)
+	m.Normalize()
+
+	// Compress polynomial m to original message
+	m.CompressMessageTo(pt)
+}
+
+// Encrypts message pt for the public key to ciphertext ct using randomness
+// from seed.
+//
+// seed has to be of length SeedSize, pt of PlaintextSize and ct of
+// CiphertextSize.
+func (pk *PublicKey) EncryptTo(ct, pt, seed []byte) {
+	var rh, e1, u Vec
+	var e2, v, m common.Poly
+
+	// Sample r, e₁ and e₂ from B_η
+	rh.DeriveNoise(seed, 0, Eta1)
+	rh.NTT()
+	rh.BarrettReduce()
+
+	e1.DeriveNoise(seed, K, common.Eta2)
+	e2.DeriveNoise(seed, 2*K, common.Eta2)
+
+	// Next we compute u = Aᵀ r + e₁. First Aᵀ.
+	for i := 0; i < K; i++ {
+		// Note that coefficients of r are bounded by q and those of Aᵀ
+		// are bounded by 4.5q and so their product is bounded by 2¹⁵q
+		// as required for multiplication.
+		PolyDotHat(&u[i], &pk.aT[i], &rh)
+	}
+
+	u.BarrettReduce()
+
+	// Aᵀ and r were not in Montgomery form, so the Montgomery
+	// multiplications in the inner product added a factor R⁻¹ which
+	// the InvNTT cancels out.
+	u.InvNTT()
+
+	u.Add(&u, &e1) // u = Aᵀ r + e₁
+
+	// Next compute v = ⟨t, r⟩ + e₂ + Decompress_q(m, 1).
+	PolyDotHat(&v, &pk.th, &rh)
+	v.BarrettReduce()
+	v.InvNTT()
+
+	m.DecompressMessage(pt)
+	v.Add(&v, &m)
+	v.Add(&v, &e2) // v = ⟨t, r⟩ + e₂ + Decompress_q(m, 1)
+
+	// Pack ciphertext
+	u.Normalize()
+	v.Normalize()
+
+	u.CompressTo(ct, DU)
+	v.CompressTo(ct[K*compressedPolySize(DU):], DV)
+}
+
+// Returns whether sk equals other.
+func (sk *PrivateKey) Equal(other *PrivateKey) bool {
+	ret := int16(0)
+	for i := 0; i < K; i++ {
+		for j := 0; j < common.N; j++ {
+			ret |= sk.sh[i][j] ^ other.sh[i][j]
+		}
+	}
+	return ret == 0
+}
diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/mat.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/mat.go
new file mode 100644
index 00000000000..e8a35affa24
--- /dev/null
+++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/mat.go
@@ -0,0 +1,85 @@
+// Code generated from kyber512/internal/mat.go by gen.go
+
+package internal
+
+import (
+	"github.com/cloudflare/circl/pke/kyber/internal/common"
+)
+
+// A k by k matrix of polynomials.
+type Mat [K]Vec
+
+// Expands the given seed to the corresponding matrix A or its transpose Aᵀ.
+func (m *Mat) Derive(seed *[32]byte, transpose bool) {
+	if !common.DeriveX4Available {
+		if transpose {
+			for i := 0; i < K; i++ {
+				for j := 0; j < K; j++ {
+					m[i][j].DeriveUniform(seed, uint8(i), uint8(j))
+				}
+			}
+		} else {
+			for i := 0; i < K; i++ {
+				for j := 0; j < K; j++ {
+					m[i][j].DeriveUniform(seed, uint8(j), uint8(i))
+				}
+			}
+		}
+		return
+	}
+
+	var ps [4]*common.Poly
+	var xs [4]uint8
+	var ys [4]uint8
+	x := uint8(0)
+	y := uint8(0)
+
+	for x != K {
+		idx := 0
+		for ; idx < 4; idx++ {
+			ps[idx] = &m[x][y]
+
+			if transpose {
+				xs[idx] = x
+				ys[idx] = y
+			} else {
+				xs[idx] = y
+				ys[idx] = x
+			}
+
+			y++
+			if y == K {
+				x++
+				y = 0
+
+				if x == K {
+					if idx == 0 {
+						// If there is just one left, then a plain DeriveUniform
+						// is quicker than the X4 variant.
+						ps[0].DeriveUniform(seed, xs[0], ys[0])
+						return
+					}
+
+					for idx++; idx < 4; idx++ {
+						ps[idx] = nil
+					}
+
+					break
+				}
+			}
+		}
+
+		common.PolyDeriveUniformX4(ps, seed, xs, ys)
+	}
+}
+
+// Transposes A in place.
+func (m *Mat) Transpose() {
+	for i := 0; i < K-1; i++ {
+		for j := i + 1; j < K; j++ {
+			t := m[i][j]
+			m[i][j] = m[j][i]
+			m[j][i] = t
+		}
+	}
+}
diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/params.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/params.go
new file mode 100644
index 00000000000..27cdb1abfd8
--- /dev/null
+++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/params.go
@@ -0,0 +1,21 @@
+// Code generated from params.templ.go. DO NOT EDIT.
+
+package internal
+
+import (
+	"github.com/cloudflare/circl/pke/kyber/internal/common"
+)
+
+const (
+	K             = 3
+	Eta1          = 2
+	DU            = 10
+	DV            = 4
+	PublicKeySize = 32 + K*common.PolySize
+
+	PrivateKeySize = K * common.PolySize
+
+	PlaintextSize  = common.PlaintextSize
+	SeedSize       = 32
+	CiphertextSize = 1088
+)
diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/vec.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/vec.go
new file mode 100644
index 00000000000..6681895a72e
--- /dev/null
+++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/internal/vec.go
@@ -0,0 +1,125 @@
+// Code generated from kyber512/internal/vec.go by gen.go
+
+package internal
+
+import (
+	"github.com/cloudflare/circl/pke/kyber/internal/common"
+)
+
+// A vector of K polynomials
+type Vec [K]common.Poly
+
+// Samples v[i] from a centered binomial distribution with given η,
+// seed and nonce+i.
+//
+// Essentially CBD_η(PRF(seed, nonce+i)) from the specification.
+func (v *Vec) DeriveNoise(seed []byte, nonce uint8, eta int) {
+	for i := 0; i < K; i++ {
+		v[i].DeriveNoise(seed, nonce+uint8(i), eta)
+	}
+}
+
+// Sets p to the inner product of a and b using "pointwise" multiplication.
+//
+// See MulHat() and NTT() for a description of the multiplication.
+// Assumes a and b are in Montgomery form. p will be in Montgomery form,
+// and its coefficients will be bounded in absolute value by 2kq.
+// If a and b are not in Montgomery form, then the action is the same
+// as "pointwise" multiplication followed by multiplying by R⁻¹, the inverse
+// of the Montgomery factor.
+func PolyDotHat(p *common.Poly, a, b *Vec) {
+	var t common.Poly
+	*p = common.Poly{} // set p to zero
+	for i := 0; i < K; i++ {
+		t.MulHat(&a[i], &b[i])
+		p.Add(&t, p)
+	}
+}
+
+// Almost normalizes coefficients in-place.
+//
+// Ensures each coefficient is in {0, …, q}.
+func (v *Vec) BarrettReduce() { + for i := 0; i < K; i++ { + v[i].BarrettReduce() + } +} + +// Normalizes coefficients in-place. +// +// Ensures each coefficient is in {0, …, q-1}. +func (v *Vec) Normalize() { + for i := 0; i < K; i++ { + v[i].Normalize() + } +} + +// Applies in-place inverse NTT(). See Poly.InvNTT() for assumptions. +func (v *Vec) InvNTT() { + for i := 0; i < K; i++ { + v[i].InvNTT() + } +} + +// Applies in-place forward NTT(). See Poly.NTT() for assumptions. +func (v *Vec) NTT() { + for i := 0; i < K; i++ { + v[i].NTT() + } +} + +// Sets v to a + b. +func (v *Vec) Add(a, b *Vec) { + for i := 0; i < K; i++ { + v[i].Add(&a[i], &b[i]) + } +} + +// Packs v into buf, which must be of length K*PolySize. +func (v *Vec) Pack(buf []byte) { + for i := 0; i < K; i++ { + v[i].Pack(buf[common.PolySize*i:]) + } +} + +// Unpacks v from buf which must be of length K*PolySize. +func (v *Vec) Unpack(buf []byte) { + for i := 0; i < K; i++ { + v[i].Unpack(buf[common.PolySize*i:]) + } +} + +// Writes Compress_q(v, d) to m. +// +// Assumes v is normalized and d is in {3, 4, 5, 10, 11}. +func (v *Vec) CompressTo(m []byte, d int) { + size := compressedPolySize(d) + for i := 0; i < K; i++ { + v[i].CompressTo(m[size*i:], d) + } +} + +// Set v to Decompress_q(m, 1). +// +// Assumes d is in {3, 4, 5, 10, 11}. v will be normalized. +func (v *Vec) Decompress(m []byte, d int) { + size := compressedPolySize(d) + for i := 0; i < K; i++ { + v[i].Decompress(m[size*i:], d) + } +} + +// ⌈(256 d)/8⌉ +func compressedPolySize(d int) int { + switch d { + case 4: + return 128 + case 5: + return 160 + case 10: + return 320 + case 11: + return 352 + } + panic("unsupported d") +} diff --git a/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/kyber.go b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/kyber.go new file mode 100644 index 00000000000..4cecbb1b871 --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/pke/kyber/kyber768/kyber.go @@ -0,0 +1,145 @@ +// Code generated from modePkg.templ.go. DO NOT EDIT. + +// kyber768 implements the IND-CPA-secure Public Key Encryption +// scheme Kyber768.CPAPKE as submitted to round 3 of the NIST PQC competition +// and described in +// +// https://pq-crystals.org/kyber/data/kyber-specification-round3.pdf +package kyber768 + +import ( + cryptoRand "crypto/rand" + "io" + + "github.com/cloudflare/circl/pke/kyber/kyber768/internal" +) + +const ( + // Size of seed for NewKeyFromSeed + KeySeedSize = internal.SeedSize + + // Size of seed for EncryptTo + EncryptionSeedSize = internal.SeedSize + + // Size of a packed PublicKey + PublicKeySize = internal.PublicKeySize + + // Size of a packed PrivateKey + PrivateKeySize = internal.PrivateKeySize + + // Size of a ciphertext + CiphertextSize = internal.CiphertextSize + + // Size of a plaintext + PlaintextSize = internal.PlaintextSize +) + +// PublicKey is the type of Kyber768.CPAPKE public key +type PublicKey internal.PublicKey + +// PrivateKey is the type of Kyber768.CPAPKE private key +type PrivateKey internal.PrivateKey + +// GenerateKey generates a public/private key pair using entropy from rand. +// If rand is nil, crypto/rand.Reader will be used. 
+func GenerateKey(rand io.Reader) (*PublicKey, *PrivateKey, error) { + var seed [KeySeedSize]byte + if rand == nil { + rand = cryptoRand.Reader + } + _, err := io.ReadFull(rand, seed[:]) + if err != nil { + return nil, nil, err + } + pk, sk := internal.NewKeyFromSeed(seed[:]) + return (*PublicKey)(pk), (*PrivateKey)(sk), nil +} + +// NewKeyFromSeed derives a public/private key pair using the given seed. +// +// Panics if seed is not of length KeySeedSize. +func NewKeyFromSeed(seed []byte) (*PublicKey, *PrivateKey) { + if len(seed) != KeySeedSize { + panic("seed must be of length KeySeedSize") + } + pk, sk := internal.NewKeyFromSeed(seed) + return (*PublicKey)(pk), (*PrivateKey)(sk) +} + +// EncryptTo encrypts message pt for the public key and writes the ciphertext +// to ct using randomness from seed. +// +// This function panics if the lengths of pt, seed, and ct are not +// PlaintextSize, EncryptionSeedSize, and CiphertextSize respectively. +func (pk *PublicKey) EncryptTo(ct []byte, pt []byte, seed []byte) { + if len(pt) != PlaintextSize { + panic("pt must be of length PlaintextSize") + } + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + if len(seed) != EncryptionSeedSize { + panic("seed must be of length EncryptionSeedSize") + } + (*internal.PublicKey)(pk).EncryptTo(ct, pt, seed) +} + +// DecryptTo decrypts message ct for the private key and writes the +// plaintext to pt. +// +// This function panics if the lengths of ct and pt are not +// CiphertextSize and PlaintextSize respectively. +func (sk *PrivateKey) DecryptTo(pt []byte, ct []byte) { + if len(pt) != PlaintextSize { + panic("pt must be of length PlaintextSize") + } + if len(ct) != CiphertextSize { + panic("ct must be of length CiphertextSize") + } + (*internal.PrivateKey)(sk).DecryptTo(pt, ct) +} + +// Packs pk into the given buffer. +// +// Panics if buf is not of length PublicKeySize. +func (pk *PublicKey) Pack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of size PublicKeySize") + } + (*internal.PublicKey)(pk).Pack(buf) +} + +// Packs sk into the given buffer. +// +// Panics if buf is not of length PrivateKeySize. +func (sk *PrivateKey) Pack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of size PrivateKeySize") + } + (*internal.PrivateKey)(sk).Pack(buf) +} + +// Unpacks pk from the given buffer. +// +// Panics if buf is not of length PublicKeySize. +func (pk *PublicKey) Unpack(buf []byte) { + if len(buf) != PublicKeySize { + panic("buf must be of size PublicKeySize") + } + (*internal.PublicKey)(pk).Unpack(buf) +} + +// Unpacks sk from the given buffer. +// +// Panics if buf is not of length PrivateKeySize. +func (sk *PrivateKey) Unpack(buf []byte) { + if len(buf) != PrivateKeySize { + panic("buf must be of size PrivateKeySize") + } + (*internal.PrivateKey)(sk).Unpack(buf) +} + +// Returns whether the two private keys are equal. +func (sk *PrivateKey) Equal(other *PrivateKey) bool { + return (*internal.PrivateKey)(sk).Equal((*internal.PrivateKey)(other)) +} diff --git a/src/vendor/github.com/cloudflare/circl/xof/xof.go b/src/vendor/github.com/cloudflare/circl/xof/xof.go new file mode 100644 index 00000000000..7e4ceab8b2e --- /dev/null +++ b/src/vendor/github.com/cloudflare/circl/xof/xof.go @@ -0,0 +1,72 @@ +// Package xof provides an interface for eXtendable-Output Functions. +// +// # Available Functions +// +// SHAKE functions are defined in FIPS-202, see https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf. 
+// BLAKE2Xb and BLAKE2Xs are defined in https://www.blake2.net/blake2x.pdf. +package xof + +import ( + "io" + + "github.com/cloudflare/circl/internal/sha3" + "golang.org/x/crypto/blake2b" + "golang.org/x/crypto/blake2s" +) + +// XOF defines the interface to hash functions that support arbitrary-length output. +type XOF interface { + // Write absorbs more data into the XOF's state. It panics if called + // after Read. + io.Writer + + // Read reads more output from the XOF. It returns io.EOF if the limit + // has been reached. + io.Reader + + // Clone returns a copy of the XOF in its current state. + Clone() XOF + + // Reset restores the XOF to its initial state and discards all data appended by Write. + Reset() +} + +type ID uint + +const ( + SHAKE128 ID = iota + 1 + SHAKE256 + BLAKE2XB + BLAKE2XS +) + +func (x ID) New() XOF { + switch x { + case SHAKE128: + s := sha3.NewShake128() + return shakeBody{&s} + case SHAKE256: + s := sha3.NewShake256() + return shakeBody{&s} + case BLAKE2XB: + x, _ := blake2b.NewXOF(blake2b.OutputLengthUnknown, nil) + return blake2xb{x} + case BLAKE2XS: + x, _ := blake2s.NewXOF(blake2s.OutputLengthUnknown, nil) + return blake2xs{x} + default: + panic("crypto: requested unavailable XOF function") + } +} + +type shakeBody struct{ sha3.ShakeHash } + +func (s shakeBody) Clone() XOF { return shakeBody{s.ShakeHash.Clone()} } + +type blake2xb struct{ blake2b.XOF } + +func (s blake2xb) Clone() XOF { return blake2xb{s.XOF.Clone()} } + +type blake2xs struct{ blake2s.XOF } + +func (s blake2xs) Clone() XOF { return blake2xs{s.XOF.Clone()} } diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2b.go b/src/vendor/golang.org/x/crypto/blake2b/blake2b.go new file mode 100644 index 00000000000..d2e98d4295b --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2b.go @@ -0,0 +1,291 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package blake2b implements the BLAKE2b hash algorithm defined by RFC 7693 +// and the extendable output function (XOF) BLAKE2Xb. +// +// BLAKE2b is optimized for 64-bit platforms—including NEON-enabled ARMs—and +// produces digests of any size between 1 and 64 bytes. +// For a detailed specification of BLAKE2b see https://blake2.net/blake2.pdf +// and for BLAKE2Xb see https://blake2.net/blake2x.pdf +// +// If you aren't sure which function you need, use BLAKE2b (Sum512 or New512). +// If you need a secret-key MAC (message authentication code), use the New512 +// function with a non-nil key. +// +// BLAKE2X is a construction to compute hash values larger than 64 bytes. It +// can produce hash values between 0 and 4 GiB. +package blake2b + +import ( + "encoding/binary" + "errors" + "hash" +) + +const ( + // The blocksize of BLAKE2b in bytes. + BlockSize = 128 + // The hash size of BLAKE2b-512 in bytes. + Size = 64 + // The hash size of BLAKE2b-384 in bytes. + Size384 = 48 + // The hash size of BLAKE2b-256 in bytes. + Size256 = 32 +) + +var ( + useAVX2 bool + useAVX bool + useSSE4 bool +) + +var ( + errKeySize = errors.New("blake2b: invalid key size") + errHashSize = errors.New("blake2b: invalid hash size") +) + +var iv = [8]uint64{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +} + +// Sum512 returns the BLAKE2b-512 checksum of the data. 
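Before the vendored BLAKE2b implementation continues below, a minimal usage sketch for the xof package above (illustrative, not part of the patch):

package main

import (
	"fmt"

	"github.com/cloudflare/circl/xof"
)

func main() {
	// Absorb input into SHAKE128, then squeeze an arbitrary amount of output.
	x := xof.SHAKE128.New()
	_, _ = x.Write([]byte("input"))

	out := make([]byte, 64) // any length works; that is the point of an XOF
	_, _ = x.Read(out)
	fmt.Printf("%x\n", out)
}

Reading again continues the same output stream; Clone forks the state and Reset starts over, as documented in the interface above.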
+func Sum512(data []byte) [Size]byte { + var sum [Size]byte + checkSum(&sum, Size, data) + return sum +} + +// Sum384 returns the BLAKE2b-384 checksum of the data. +func Sum384(data []byte) [Size384]byte { + var sum [Size]byte + var sum384 [Size384]byte + checkSum(&sum, Size384, data) + copy(sum384[:], sum[:Size384]) + return sum384 +} + +// Sum256 returns the BLAKE2b-256 checksum of the data. +func Sum256(data []byte) [Size256]byte { + var sum [Size]byte + var sum256 [Size256]byte + checkSum(&sum, Size256, data) + copy(sum256[:], sum[:Size256]) + return sum256 +} + +// New512 returns a new hash.Hash computing the BLAKE2b-512 checksum. A non-nil +// key turns the hash into a MAC. The key must be between zero and 64 bytes long. +func New512(key []byte) (hash.Hash, error) { return newDigest(Size, key) } + +// New384 returns a new hash.Hash computing the BLAKE2b-384 checksum. A non-nil +// key turns the hash into a MAC. The key must be between zero and 64 bytes long. +func New384(key []byte) (hash.Hash, error) { return newDigest(Size384, key) } + +// New256 returns a new hash.Hash computing the BLAKE2b-256 checksum. A non-nil +// key turns the hash into a MAC. The key must be between zero and 64 bytes long. +func New256(key []byte) (hash.Hash, error) { return newDigest(Size256, key) } + +// New returns a new hash.Hash computing the BLAKE2b checksum with a custom length. +// A non-nil key turns the hash into a MAC. The key must be between zero and 64 bytes long. +// The hash size can be a value between 1 and 64 but it is highly recommended to use +// values equal or greater than: +// - 32 if BLAKE2b is used as a hash function (The key is zero bytes long). +// - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long). +// When the key is nil, the returned hash.Hash implements BinaryMarshaler +// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash. +func New(size int, key []byte) (hash.Hash, error) { return newDigest(size, key) } + +func newDigest(hashSize int, key []byte) (*digest, error) { + if hashSize < 1 || hashSize > Size { + return nil, errHashSize + } + if len(key) > Size { + return nil, errKeySize + } + d := &digest{ + size: hashSize, + keyLen: len(key), + } + copy(d.key[:], key) + d.Reset() + return d, nil +} + +func checkSum(sum *[Size]byte, hashSize int, data []byte) { + h := iv + h[0] ^= uint64(hashSize) | (1 << 16) | (1 << 24) + var c [2]uint64 + + if length := len(data); length > BlockSize { + n := length &^ (BlockSize - 1) + if length == n { + n -= BlockSize + } + hashBlocks(&h, &c, 0, data[:n]) + data = data[n:] + } + + var block [BlockSize]byte + offset := copy(block[:], data) + remaining := uint64(BlockSize - offset) + if c[0] < remaining { + c[1]-- + } + c[0] -= remaining + + hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:]) + + for i, v := range h[:(hashSize+7)/8] { + binary.LittleEndian.PutUint64(sum[8*i:], v) + } +} + +type digest struct { + h [8]uint64 + c [2]uint64 + size int + block [BlockSize]byte + offset int + + key [BlockSize]byte + keyLen int +} + +const ( + magic = "b2b" + marshaledSize = len(magic) + 8*8 + 2*8 + 1 + BlockSize + 1 +) + +func (d *digest) MarshalBinary() ([]byte, error) { + if d.keyLen != 0 { + return nil, errors.New("crypto/blake2b: cannot marshal MACs") + } + b := make([]byte, 0, marshaledSize) + b = append(b, magic...) 
+ for i := 0; i < 8; i++ { + b = appendUint64(b, d.h[i]) + } + b = appendUint64(b, d.c[0]) + b = appendUint64(b, d.c[1]) + // Maximum value for size is 64 + b = append(b, byte(d.size)) + b = append(b, d.block[:]...) + b = append(b, byte(d.offset)) + return b, nil +} + +func (d *digest) UnmarshalBinary(b []byte) error { + if len(b) < len(magic) || string(b[:len(magic)]) != magic { + return errors.New("crypto/blake2b: invalid hash state identifier") + } + if len(b) != marshaledSize { + return errors.New("crypto/blake2b: invalid hash state size") + } + b = b[len(magic):] + for i := 0; i < 8; i++ { + b, d.h[i] = consumeUint64(b) + } + b, d.c[0] = consumeUint64(b) + b, d.c[1] = consumeUint64(b) + d.size = int(b[0]) + b = b[1:] + copy(d.block[:], b[:BlockSize]) + b = b[BlockSize:] + d.offset = int(b[0]) + return nil +} + +func (d *digest) BlockSize() int { return BlockSize } + +func (d *digest) Size() int { return d.size } + +func (d *digest) Reset() { + d.h = iv + d.h[0] ^= uint64(d.size) | (uint64(d.keyLen) << 8) | (1 << 16) | (1 << 24) + d.offset, d.c[0], d.c[1] = 0, 0, 0 + if d.keyLen > 0 { + d.block = d.key + d.offset = BlockSize + } +} + +func (d *digest) Write(p []byte) (n int, err error) { + n = len(p) + + if d.offset > 0 { + remaining := BlockSize - d.offset + if n <= remaining { + d.offset += copy(d.block[d.offset:], p) + return + } + copy(d.block[d.offset:], p[:remaining]) + hashBlocks(&d.h, &d.c, 0, d.block[:]) + d.offset = 0 + p = p[remaining:] + } + + if length := len(p); length > BlockSize { + nn := length &^ (BlockSize - 1) + if length == nn { + nn -= BlockSize + } + hashBlocks(&d.h, &d.c, 0, p[:nn]) + p = p[nn:] + } + + if len(p) > 0 { + d.offset += copy(d.block[:], p) + } + + return +} + +func (d *digest) Sum(sum []byte) []byte { + var hash [Size]byte + d.finalize(&hash) + return append(sum, hash[:d.size]...) +} + +func (d *digest) finalize(hash *[Size]byte) { + var block [BlockSize]byte + copy(block[:], d.block[:d.offset]) + remaining := uint64(BlockSize - d.offset) + + c := d.c + if c[0] < remaining { + c[1]-- + } + c[0] -= remaining + + h := d.h + hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:]) + + for i, v := range h { + binary.LittleEndian.PutUint64(hash[8*i:], v) + } +} + +func appendUint64(b []byte, x uint64) []byte { + var a [8]byte + binary.BigEndian.PutUint64(a[:], x) + return append(b, a[:]...) +} + +func appendUint32(b []byte, x uint32) []byte { + var a [4]byte + binary.BigEndian.PutUint32(a[:], x) + return append(b, a[:]...) +} + +func consumeUint64(b []byte) ([]byte, uint64) { + x := binary.BigEndian.Uint64(b) + return b[8:], x +} + +func consumeUint32(b []byte) ([]byte, uint32) { + x := binary.BigEndian.Uint32(b) + return b[4:], x +} diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go b/src/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go new file mode 100644 index 00000000000..56bfaaa17da --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go @@ -0,0 +1,38 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
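A short usage sketch for the blake2b package above (illustrative, not part of the patch): an unkeyed one-shot digest, and a keyed hash acting as a MAC.

package main

import (
	"fmt"

	"golang.org/x/crypto/blake2b"
)

func main() {
	msg := []byte("message")

	// One-shot 32-byte digest.
	sum := blake2b.Sum256(msg)
	fmt.Printf("%x\n", sum)

	// A non-nil key (up to 64 bytes) turns the hash into a MAC.
	key := make([]byte, 32) // demo key only
	mac, err := blake2b.New512(key)
	if err != nil {
		panic(err)
	}
	_, _ = mac.Write(msg)
	fmt.Printf("%x\n", mac.Sum(nil))
}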
+ +//go:build go1.7 && amd64 && gc && !purego +// +build go1.7,amd64,gc,!purego + +package blake2b + +import "golang.org/x/sys/cpu" + +func init() { + useAVX2 = cpu.X86.HasAVX2 + useAVX = cpu.X86.HasAVX + useSSE4 = cpu.X86.HasSSE41 +} + +//go:noescape +func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) + +//go:noescape +func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) + +//go:noescape +func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) + +func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) { + switch { + case useAVX2: + hashBlocksAVX2(h, c, flag, blocks) + case useAVX: + hashBlocksAVX(h, c, flag, blocks) + case useSSE4: + hashBlocksSSE4(h, c, flag, blocks) + default: + hashBlocksGeneric(h, c, flag, blocks) + } +} diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s b/src/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s new file mode 100644 index 00000000000..4b9daa18d9d --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s @@ -0,0 +1,745 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.7 && amd64 && gc && !purego +// +build go1.7,amd64,gc,!purego + +#include "textflag.h" + +DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b +DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32 + +DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1 +DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f +DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32 + +DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b +DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32 + +DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a +DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32 + +DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b +GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16 + +DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16 + +DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1 +DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16 + +DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16 + +DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403 +DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16 + +DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302 +DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16 + +#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39 +#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93 +#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e +#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE 
$0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93 +#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39 + +#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \ + VPADDQ m0, Y0, Y0; \ + VPADDQ Y1, Y0, Y0; \ + VPXOR Y0, Y3, Y3; \ + VPSHUFD $-79, Y3, Y3; \ + VPADDQ Y3, Y2, Y2; \ + VPXOR Y2, Y1, Y1; \ + VPSHUFB c40, Y1, Y1; \ + VPADDQ m1, Y0, Y0; \ + VPADDQ Y1, Y0, Y0; \ + VPXOR Y0, Y3, Y3; \ + VPSHUFB c48, Y3, Y3; \ + VPADDQ Y3, Y2, Y2; \ + VPXOR Y2, Y1, Y1; \ + VPADDQ Y1, Y1, t; \ + VPSRLQ $63, Y1, Y1; \ + VPXOR t, Y1, Y1; \ + VPERMQ_0x39_Y1_Y1; \ + VPERMQ_0x4E_Y2_Y2; \ + VPERMQ_0x93_Y3_Y3; \ + VPADDQ m2, Y0, Y0; \ + VPADDQ Y1, Y0, Y0; \ + VPXOR Y0, Y3, Y3; \ + VPSHUFD $-79, Y3, Y3; \ + VPADDQ Y3, Y2, Y2; \ + VPXOR Y2, Y1, Y1; \ + VPSHUFB c40, Y1, Y1; \ + VPADDQ m3, Y0, Y0; \ + VPADDQ Y1, Y0, Y0; \ + VPXOR Y0, Y3, Y3; \ + VPSHUFB c48, Y3, Y3; \ + VPADDQ Y3, Y2, Y2; \ + VPXOR Y2, Y1, Y1; \ + VPADDQ Y1, Y1, t; \ + VPSRLQ $63, Y1, Y1; \ + VPXOR t, Y1, Y1; \ + VPERMQ_0x39_Y3_Y3; \ + VPERMQ_0x4E_Y2_Y2; \ + VPERMQ_0x93_Y1_Y1 + +#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E +#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26 +#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E +#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36 +#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E + +#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n +#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n +#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n +#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n +#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n + +#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01 +#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01 +#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01 +#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01 +#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01 + +#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01 +#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01 +#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01 +#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01 +#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01 + +#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8 +#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01 + +// load msg: Y12 = (i0, i1, i2, i3) +// i0, i1, i2, i3 must not be 0 +#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \ + VMOVQ_SI_X12(i0*8); \ + VMOVQ_SI_X11(i2*8); \ + VPINSRQ_1_SI_X12(i1*8); \ + VPINSRQ_1_SI_X11(i3*8); \ + VINSERTI128 $1, X11, Y12, Y12 + +// load msg: Y13 = (i0, i1, i2, i3) +// i0, i1, i2, i3 must not be 0 +#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \ + VMOVQ_SI_X13(i0*8); \ + VMOVQ_SI_X11(i2*8); \ + VPINSRQ_1_SI_X13(i1*8); \ + VPINSRQ_1_SI_X11(i3*8); \ + VINSERTI128 
$1, X11, Y13, Y13 + +// load msg: Y14 = (i0, i1, i2, i3) +// i0, i1, i2, i3 must not be 0 +#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \ + VMOVQ_SI_X14(i0*8); \ + VMOVQ_SI_X11(i2*8); \ + VPINSRQ_1_SI_X14(i1*8); \ + VPINSRQ_1_SI_X11(i3*8); \ + VINSERTI128 $1, X11, Y14, Y14 + +// load msg: Y15 = (i0, i1, i2, i3) +// i0, i1, i2, i3 must not be 0 +#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \ + VMOVQ_SI_X15(i0*8); \ + VMOVQ_SI_X11(i2*8); \ + VPINSRQ_1_SI_X15(i1*8); \ + VPINSRQ_1_SI_X11(i3*8); \ + VINSERTI128 $1, X11, Y15, Y15 + +#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \ + VMOVQ_SI_X12_0; \ + VMOVQ_SI_X11(4*8); \ + VPINSRQ_1_SI_X12(2*8); \ + VPINSRQ_1_SI_X11(6*8); \ + VINSERTI128 $1, X11, Y12, Y12; \ + LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \ + LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \ + LOAD_MSG_AVX2_Y15(9, 11, 13, 15) + +#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \ + LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \ + LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \ + VMOVQ_SI_X11(11*8); \ + VPSHUFD $0x4E, 0*8(SI), X14; \ + VPINSRQ_1_SI_X11(5*8); \ + VINSERTI128 $1, X11, Y14, Y14; \ + LOAD_MSG_AVX2_Y15(12, 2, 7, 3) + +#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \ + VMOVQ_SI_X11(5*8); \ + VMOVDQU 11*8(SI), X12; \ + VPINSRQ_1_SI_X11(15*8); \ + VINSERTI128 $1, X11, Y12, Y12; \ + VMOVQ_SI_X13(8*8); \ + VMOVQ_SI_X11(2*8); \ + VPINSRQ_1_SI_X13_0; \ + VPINSRQ_1_SI_X11(13*8); \ + VINSERTI128 $1, X11, Y13, Y13; \ + LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \ + LOAD_MSG_AVX2_Y15(14, 6, 1, 4) + +#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \ + LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \ + LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \ + LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \ + VMOVQ_SI_X15(6*8); \ + VMOVQ_SI_X11_0; \ + VPINSRQ_1_SI_X15(10*8); \ + VPINSRQ_1_SI_X11(8*8); \ + VINSERTI128 $1, X11, Y15, Y15 + +#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \ + LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \ + VMOVQ_SI_X13_0; \ + VMOVQ_SI_X11(4*8); \ + VPINSRQ_1_SI_X13(7*8); \ + VPINSRQ_1_SI_X11(15*8); \ + VINSERTI128 $1, X11, Y13, Y13; \ + LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \ + LOAD_MSG_AVX2_Y15(1, 12, 8, 13) + +#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \ + VMOVQ_SI_X12(2*8); \ + VMOVQ_SI_X11_0; \ + VPINSRQ_1_SI_X12(6*8); \ + VPINSRQ_1_SI_X11(8*8); \ + VINSERTI128 $1, X11, Y12, Y12; \ + LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \ + LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \ + LOAD_MSG_AVX2_Y15(13, 5, 14, 9) + +#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \ + LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \ + LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \ + VMOVQ_SI_X14_0; \ + VPSHUFD $0x4E, 8*8(SI), X11; \ + VPINSRQ_1_SI_X14(6*8); \ + VINSERTI128 $1, X11, Y14, Y14; \ + LOAD_MSG_AVX2_Y15(7, 3, 2, 11) + +#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \ + LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \ + LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \ + LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \ + VMOVQ_SI_X15_0; \ + VMOVQ_SI_X11(6*8); \ + VPINSRQ_1_SI_X15(4*8); \ + VPINSRQ_1_SI_X11(10*8); \ + VINSERTI128 $1, X11, Y15, Y15 + +#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \ + VMOVQ_SI_X12(6*8); \ + VMOVQ_SI_X11(11*8); \ + VPINSRQ_1_SI_X12(14*8); \ + VPINSRQ_1_SI_X11_0; \ + VINSERTI128 $1, X11, Y12, Y12; \ + LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \ + VMOVQ_SI_X11(1*8); \ + VMOVDQU 12*8(SI), X14; \ + VPINSRQ_1_SI_X11(10*8); \ + VINSERTI128 $1, X11, Y14, Y14; \ + VMOVQ_SI_X15(2*8); \ + VMOVDQU 4*8(SI), X11; \ + VPINSRQ_1_SI_X15(7*8); \ + VINSERTI128 $1, X11, Y15, Y15 + +#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \ + 
LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \ + VMOVQ_SI_X13(2*8); \ + VPSHUFD $0x4E, 5*8(SI), X11; \ + VPINSRQ_1_SI_X13(4*8); \ + VINSERTI128 $1, X11, Y13, Y13; \ + LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \ + VMOVQ_SI_X15(11*8); \ + VMOVQ_SI_X11(12*8); \ + VPINSRQ_1_SI_X15(14*8); \ + VPINSRQ_1_SI_X11_0; \ + VINSERTI128 $1, X11, Y15, Y15 + +// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) +TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + + MOVQ SP, DX + ADDQ $31, DX + ANDQ $~31, DX + + MOVQ CX, 16(DX) + XORQ CX, CX + MOVQ CX, 24(DX) + + VMOVDQU ·AVX2_c40<>(SB), Y4 + VMOVDQU ·AVX2_c48<>(SB), Y5 + + VMOVDQU 0(AX), Y8 + VMOVDQU 32(AX), Y9 + VMOVDQU ·AVX2_iv0<>(SB), Y6 + VMOVDQU ·AVX2_iv1<>(SB), Y7 + + MOVQ 0(BX), R8 + MOVQ 8(BX), R9 + MOVQ R9, 8(DX) + +loop: + ADDQ $128, R8 + MOVQ R8, 0(DX) + CMPQ R8, $128 + JGE noinc + INCQ R9 + MOVQ R9, 8(DX) + +noinc: + VMOVDQA Y8, Y0 + VMOVDQA Y9, Y1 + VMOVDQA Y6, Y2 + VPXOR 0(DX), Y7, Y3 + + LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() + VMOVDQA Y12, 32(DX) + VMOVDQA Y13, 64(DX) + VMOVDQA Y14, 96(DX) + VMOVDQA Y15, 128(DX) + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() + VMOVDQA Y12, 160(DX) + VMOVDQA Y13, 192(DX) + VMOVDQA Y14, 224(DX) + VMOVDQA Y15, 256(DX) + + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() + ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) + + ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5) + ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5) + + VPXOR Y0, Y8, Y8 + VPXOR Y1, Y9, Y9 + VPXOR Y2, Y8, Y8 + VPXOR Y3, Y9, Y9 + + LEAQ 128(SI), SI + SUBQ $128, DI + JNE loop + + MOVQ R8, 0(BX) + MOVQ R9, 8(BX) + + VMOVDQU Y8, 0(AX) + VMOVDQU Y9, 32(AX) + VZEROUPPER + + RET + +#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA +#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB +#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF +#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD +#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE + +#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7 +#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF +#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7 +#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF +#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7 +#define VPUNPCKHQDQ_X15_X7_X6 
BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7 +#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF +#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF + +#define SHUFFLE_AVX() \ + VMOVDQA X6, X13; \ + VMOVDQA X2, X14; \ + VMOVDQA X4, X6; \ + VPUNPCKLQDQ_X13_X13_X15; \ + VMOVDQA X5, X4; \ + VMOVDQA X6, X5; \ + VPUNPCKHQDQ_X15_X7_X6; \ + VPUNPCKLQDQ_X7_X7_X15; \ + VPUNPCKHQDQ_X15_X13_X7; \ + VPUNPCKLQDQ_X3_X3_X15; \ + VPUNPCKHQDQ_X15_X2_X2; \ + VPUNPCKLQDQ_X14_X14_X15; \ + VPUNPCKHQDQ_X15_X3_X3; \ + +#define SHUFFLE_AVX_INV() \ + VMOVDQA X2, X13; \ + VMOVDQA X4, X14; \ + VPUNPCKLQDQ_X2_X2_X15; \ + VMOVDQA X5, X4; \ + VPUNPCKHQDQ_X15_X3_X2; \ + VMOVDQA X14, X5; \ + VPUNPCKLQDQ_X3_X3_X15; \ + VMOVDQA X6, X14; \ + VPUNPCKHQDQ_X15_X13_X3; \ + VPUNPCKLQDQ_X7_X7_X15; \ + VPUNPCKHQDQ_X15_X6_X6; \ + VPUNPCKLQDQ_X14_X14_X15; \ + VPUNPCKHQDQ_X15_X7_X7; \ + +#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ + VPADDQ m0, v0, v0; \ + VPADDQ v2, v0, v0; \ + VPADDQ m1, v1, v1; \ + VPADDQ v3, v1, v1; \ + VPXOR v0, v6, v6; \ + VPXOR v1, v7, v7; \ + VPSHUFD $-79, v6, v6; \ + VPSHUFD $-79, v7, v7; \ + VPADDQ v6, v4, v4; \ + VPADDQ v7, v5, v5; \ + VPXOR v4, v2, v2; \ + VPXOR v5, v3, v3; \ + VPSHUFB c40, v2, v2; \ + VPSHUFB c40, v3, v3; \ + VPADDQ m2, v0, v0; \ + VPADDQ v2, v0, v0; \ + VPADDQ m3, v1, v1; \ + VPADDQ v3, v1, v1; \ + VPXOR v0, v6, v6; \ + VPXOR v1, v7, v7; \ + VPSHUFB c48, v6, v6; \ + VPSHUFB c48, v7, v7; \ + VPADDQ v6, v4, v4; \ + VPADDQ v7, v5, v5; \ + VPXOR v4, v2, v2; \ + VPXOR v5, v3, v3; \ + VPADDQ v2, v2, t0; \ + VPSRLQ $63, v2, v2; \ + VPXOR t0, v2, v2; \ + VPADDQ v3, v3, t0; \ + VPSRLQ $63, v3, v3; \ + VPXOR t0, v3, v3 + +// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) +// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 +#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \ + VMOVQ_SI_X12(i0*8); \ + VMOVQ_SI_X13(i2*8); \ + VMOVQ_SI_X14(i4*8); \ + VMOVQ_SI_X15(i6*8); \ + VPINSRQ_1_SI_X12(i1*8); \ + VPINSRQ_1_SI_X13(i3*8); \ + VPINSRQ_1_SI_X14(i5*8); \ + VPINSRQ_1_SI_X15(i7*8) + +// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) +#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \ + VMOVQ_SI_X12_0; \ + VMOVQ_SI_X13(4*8); \ + VMOVQ_SI_X14(1*8); \ + VMOVQ_SI_X15(5*8); \ + VPINSRQ_1_SI_X12(2*8); \ + VPINSRQ_1_SI_X13(6*8); \ + VPINSRQ_1_SI_X14(3*8); \ + VPINSRQ_1_SI_X15(7*8) + +// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) +#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \ + VPSHUFD $0x4E, 0*8(SI), X12; \ + VMOVQ_SI_X13(11*8); \ + VMOVQ_SI_X14(12*8); \ + VMOVQ_SI_X15(7*8); \ + VPINSRQ_1_SI_X13(5*8); \ + VPINSRQ_1_SI_X14(2*8); \ + VPINSRQ_1_SI_X15(3*8) + +// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) +#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \ + VMOVDQU 11*8(SI), X12; \ + VMOVQ_SI_X13(5*8); \ + VMOVQ_SI_X14(8*8); \ + VMOVQ_SI_X15(2*8); \ + VPINSRQ_1_SI_X13(15*8); \ + VPINSRQ_1_SI_X14_0; \ + VPINSRQ_1_SI_X15(13*8) + +// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) +#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \ + VMOVQ_SI_X12(2*8); \ + VMOVQ_SI_X13(4*8); \ + VMOVQ_SI_X14(6*8); \ + VMOVQ_SI_X15_0; \ + VPINSRQ_1_SI_X12(5*8); \ + VPINSRQ_1_SI_X13(15*8); \ + VPINSRQ_1_SI_X14(10*8); \ + VPINSRQ_1_SI_X15(8*8) + +// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) +#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \ + VMOVQ_SI_X12(9*8); \ + VMOVQ_SI_X13(2*8); \ + 
VMOVQ_SI_X14_0; \ + VMOVQ_SI_X15(4*8); \ + VPINSRQ_1_SI_X12(5*8); \ + VPINSRQ_1_SI_X13(10*8); \ + VPINSRQ_1_SI_X14(7*8); \ + VPINSRQ_1_SI_X15(15*8) + +// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) +#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \ + VMOVQ_SI_X12(2*8); \ + VMOVQ_SI_X13_0; \ + VMOVQ_SI_X14(12*8); \ + VMOVQ_SI_X15(11*8); \ + VPINSRQ_1_SI_X12(6*8); \ + VPINSRQ_1_SI_X13(8*8); \ + VPINSRQ_1_SI_X14(10*8); \ + VPINSRQ_1_SI_X15(3*8) + +// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) +#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \ + MOVQ 0*8(SI), X12; \ + VPSHUFD $0x4E, 8*8(SI), X13; \ + MOVQ 7*8(SI), X14; \ + MOVQ 2*8(SI), X15; \ + VPINSRQ_1_SI_X12(6*8); \ + VPINSRQ_1_SI_X14(3*8); \ + VPINSRQ_1_SI_X15(11*8) + +// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) +#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \ + MOVQ 6*8(SI), X12; \ + MOVQ 11*8(SI), X13; \ + MOVQ 15*8(SI), X14; \ + MOVQ 3*8(SI), X15; \ + VPINSRQ_1_SI_X12(14*8); \ + VPINSRQ_1_SI_X13_0; \ + VPINSRQ_1_SI_X14(9*8); \ + VPINSRQ_1_SI_X15(8*8) + +// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) +#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \ + MOVQ 5*8(SI), X12; \ + MOVQ 8*8(SI), X13; \ + MOVQ 0*8(SI), X14; \ + MOVQ 6*8(SI), X15; \ + VPINSRQ_1_SI_X12(15*8); \ + VPINSRQ_1_SI_X13(2*8); \ + VPINSRQ_1_SI_X14(4*8); \ + VPINSRQ_1_SI_X15(10*8) + +// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) +#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \ + VMOVDQU 12*8(SI), X12; \ + MOVQ 1*8(SI), X13; \ + MOVQ 2*8(SI), X14; \ + VPINSRQ_1_SI_X13(10*8); \ + VPINSRQ_1_SI_X14(7*8); \ + VMOVDQU 4*8(SI), X15 + +// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) +#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \ + MOVQ 15*8(SI), X12; \ + MOVQ 3*8(SI), X13; \ + MOVQ 11*8(SI), X14; \ + MOVQ 12*8(SI), X15; \ + VPINSRQ_1_SI_X12(9*8); \ + VPINSRQ_1_SI_X13(13*8); \ + VPINSRQ_1_SI_X14(14*8); \ + VPINSRQ_1_SI_X15_0 + +// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) +TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + + MOVQ SP, R10 + ADDQ $15, R10 + ANDQ $~15, R10 + + VMOVDQU ·AVX_c40<>(SB), X0 + VMOVDQU ·AVX_c48<>(SB), X1 + VMOVDQA X0, X8 + VMOVDQA X1, X9 + + VMOVDQU ·AVX_iv3<>(SB), X0 + VMOVDQA X0, 0(R10) + XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0) + + VMOVDQU 0(AX), X10 + VMOVDQU 16(AX), X11 + VMOVDQU 32(AX), X2 + VMOVDQU 48(AX), X3 + + MOVQ 0(BX), R8 + MOVQ 8(BX), R9 + +loop: + ADDQ $128, R8 + CMPQ R8, $128 + JGE noinc + INCQ R9 + +noinc: + VMOVQ_R8_X15 + VPINSRQ_1_R9_X15 + + VMOVDQA X10, X0 + VMOVDQA X11, X1 + VMOVDQU ·AVX_iv0<>(SB), X4 + VMOVDQU ·AVX_iv1<>(SB), X5 + VMOVDQU ·AVX_iv2<>(SB), X6 + + VPXOR X15, X6, X6 + VMOVDQA 0(R10), X7 + + LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA X12, 16(R10) + VMOVDQA X13, 32(R10) + VMOVDQA X14, 48(R10) + VMOVDQA X15, 64(R10) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VMOVDQA X12, 80(R10) + VMOVDQA X13, 96(R10) + VMOVDQA X14, 112(R10) + VMOVDQA X15, 128(R10) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VMOVDQA X12, 144(R10) + VMOVDQA X13, 160(R10) + VMOVDQA X14, 176(R10) + VMOVDQA X15, 192(R10) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, 
X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VMOVDQA X12, 208(R10) + VMOVDQA X13, 224(R10) + VMOVDQA X14, 240(R10) + VMOVDQA X15, 256(R10) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_11_12_5_15_8_0_2_13() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_2_5_4_15_6_10_0_8() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_9_5_2_10_0_7_4_15() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_2_6_0_8_12_10_11_3() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_0_6_9_8_7_3_2_11() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_5_15_8_2_0_4_6_10() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX_6_14_11_0_15_9_3_8() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_12_13_1_10_2_7_4_5() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX() + LOAD_MSG_AVX_15_9_3_13_11_14_12_0() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) + SHUFFLE_AVX_INV() + + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9) + SHUFFLE_AVX() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9) + SHUFFLE_AVX_INV() + + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9) + SHUFFLE_AVX() + HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9) + SHUFFLE_AVX_INV() + + VMOVDQU 32(AX), X14 + VMOVDQU 48(AX), X15 + VPXOR X0, X10, X10 + VPXOR X1, X11, X11 + VPXOR X2, X14, X14 + VPXOR X3, X15, X15 + VPXOR X4, X10, X10 + VPXOR X5, X11, X11 + VPXOR X6, X14, X2 + VPXOR X7, X15, X3 + VMOVDQU X2, 32(AX) + VMOVDQU X3, 48(AX) + + LEAQ 128(SI), SI + SUBQ $128, DI + JNE loop + + VMOVDQU X10, 0(AX) + VMOVDQU X11, 16(AX) + + MOVQ R8, 0(BX) + MOVQ R9, 8(BX) + VZEROUPPER + + RET diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go b/src/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go new file mode 100644 
index 00000000000..5fa1b32841d --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go @@ -0,0 +1,25 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !go1.7 && amd64 && gc && !purego +// +build !go1.7,amd64,gc,!purego + +package blake2b + +import "golang.org/x/sys/cpu" + +func init() { + useSSE4 = cpu.X86.HasSSE41 +} + +//go:noescape +func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) + +func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) { + if useSSE4 { + hashBlocksSSE4(h, c, flag, blocks) + } else { + hashBlocksGeneric(h, c, flag, blocks) + } +} diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s b/src/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s new file mode 100644 index 00000000000..ae75eb9afcd --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s @@ -0,0 +1,279 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 && gc && !purego +// +build amd64,gc,!purego + +#include "textflag.h" + +DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 +DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b +GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 + +DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b +DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 + +DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 +DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 + +DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b +DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 +GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 + +DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 +DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 + +DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 +DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 + +#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ + MOVO v4, t1; \ + MOVO v5, v4; \ + MOVO t1, v5; \ + MOVO v6, t1; \ + PUNPCKLQDQ v6, t2; \ + PUNPCKHQDQ v7, v6; \ + PUNPCKHQDQ t2, v6; \ + PUNPCKLQDQ v7, t2; \ + MOVO t1, v7; \ + MOVO v2, t1; \ + PUNPCKHQDQ t2, v7; \ + PUNPCKLQDQ v3, t2; \ + PUNPCKHQDQ t2, v2; \ + PUNPCKLQDQ t1, t2; \ + PUNPCKHQDQ t2, v3 + +#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ + MOVO v4, t1; \ + MOVO v5, v4; \ + MOVO t1, v5; \ + MOVO v2, t1; \ + PUNPCKLQDQ v2, t2; \ + PUNPCKHQDQ v3, v2; \ + PUNPCKHQDQ t2, v2; \ + PUNPCKLQDQ v3, t2; \ + MOVO t1, v3; \ + MOVO v6, t1; \ + PUNPCKHQDQ t2, v3; \ + PUNPCKLQDQ v7, t2; \ + PUNPCKHQDQ t2, v6; \ + PUNPCKLQDQ t1, t2; \ + PUNPCKHQDQ t2, v7 + +#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ + PADDQ m0, v0; \ + PADDQ m1, v1; \ + PADDQ v2, v0; \ + PADDQ v3, v1; \ + PXOR v0, v6; \ + PXOR v1, v7; \ + PSHUFD $0xB1, v6, v6; \ + PSHUFD $0xB1, v7, v7; \ + PADDQ v6, v4; \ + PADDQ v7, v5; \ + PXOR v4, v2; \ + PXOR v5, v3; \ + PSHUFB c40, v2; \ + PSHUFB c40, v3; \ + PADDQ m2, v0; \ + PADDQ m3, v1; \ + PADDQ v2, v0; \ + PADDQ v3, v1; \ + PXOR v0, v6; \ + PXOR v1, v7; \ + PSHUFB c48, v6; \ + PSHUFB c48, v7; \ + PADDQ v6, v4; \ + PADDQ v7, v5; \ + PXOR v4, v2; \ + PXOR v5, v3; \ + MOVOU v2, t0; \ + PADDQ v2, t0; \ + PSRLQ $63, v2; \ + PXOR t0, v2; \ + MOVOU v3, t0; \ + PADDQ v3, t0; \ + PSRLQ $63, v3; \ + PXOR t0, v3 + +#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, 
i5, i6, i7) \ + MOVQ i0*8(src), m0; \ + PINSRQ $1, i1*8(src), m0; \ + MOVQ i2*8(src), m1; \ + PINSRQ $1, i3*8(src), m1; \ + MOVQ i4*8(src), m2; \ + PINSRQ $1, i5*8(src), m2; \ + MOVQ i6*8(src), m3; \ + PINSRQ $1, i7*8(src), m3 + +// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) +TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + + MOVQ SP, R10 + ADDQ $15, R10 + ANDQ $~15, R10 + + MOVOU ·iv3<>(SB), X0 + MOVO X0, 0(R10) + XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0) + + MOVOU ·c40<>(SB), X13 + MOVOU ·c48<>(SB), X14 + + MOVOU 0(AX), X12 + MOVOU 16(AX), X15 + + MOVQ 0(BX), R8 + MOVQ 8(BX), R9 + +loop: + ADDQ $128, R8 + CMPQ R8, $128 + JGE noinc + INCQ R9 + +noinc: + MOVQ R8, X8 + PINSRQ $1, R9, X8 + + MOVO X12, X0 + MOVO X15, X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU ·iv0<>(SB), X4 + MOVOU ·iv1<>(SB), X5 + MOVOU ·iv2<>(SB), X6 + + PXOR X8, X6 + MOVO 0(R10), X7 + + LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) + MOVO X8, 16(R10) + MOVO X9, 32(R10) + MOVO X10, 48(R10) + MOVO X11, 64(R10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) + MOVO X8, 80(R10) + MOVO X9, 96(R10) + MOVO X10, 112(R10) + MOVO X11, 128(R10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) + MOVO X8, 144(R10) + MOVO X9, 160(R10) + MOVO X10, 176(R10) + MOVO X11, 192(R10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) + MOVO X8, 208(R10) + MOVO X9, 224(R10) + MOVO X10, 240(R10) + MOVO X11, 256(R10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, 
X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14) + SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14) + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + PXOR X0, X12 + PXOR X1, X15 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X15 + PXOR X6, X10 + PXOR X7, X11 + MOVOU X10, 32(AX) + MOVOU X11, 48(AX) + + LEAQ 128(SI), SI + SUBQ $128, DI + JNE loop + + MOVOU X12, 0(AX) + MOVOU X15, 16(AX) + + MOVQ R8, 0(BX) + MOVQ R9, 8(BX) + + RET diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2b_generic.go b/src/vendor/golang.org/x/crypto/blake2b/blake2b_generic.go new file mode 100644 index 00000000000..3168a8aa3c8 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2b_generic.go @@ -0,0 +1,182 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blake2b + +import ( + "encoding/binary" + "math/bits" +) + +// the precomputed values for BLAKE2b +// there are 12 16-byte arrays - one for each round +// the entries are calculated from the sigma constants. 
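+// For example, the second row below is derived from sigma[1] =
+// {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3} (RFC 7693): the
+// column step of a round consumes the even-indexed entries (14, 4, 9, 13) and
+// then the odd-indexed ones (10, 8, 15, 6), and the diagonal step does the
+// same for the second half, giving the flattened order in which the mixing
+// loop below reads m[s[0]], m[s[1]], ... each round.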
+var precomputed = [12][16]byte{ + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, + {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4}, + {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, + {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13}, + {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9}, + {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, + {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, + {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5}, + {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0}, + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, // equal to the first + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, // equal to the second +} + +func hashBlocksGeneric(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) { + var m [16]uint64 + c0, c1 := c[0], c[1] + + for i := 0; i < len(blocks); { + c0 += BlockSize + if c0 < BlockSize { + c1++ + } + + v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7] + v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7] + v12 ^= c0 + v13 ^= c1 + v14 ^= flag + + for j := range m { + m[j] = binary.LittleEndian.Uint64(blocks[i:]) + i += 8 + } + + for j := range precomputed { + s := &(precomputed[j]) + + v0 += m[s[0]] + v0 += v4 + v12 ^= v0 + v12 = bits.RotateLeft64(v12, -32) + v8 += v12 + v4 ^= v8 + v4 = bits.RotateLeft64(v4, -24) + v1 += m[s[1]] + v1 += v5 + v13 ^= v1 + v13 = bits.RotateLeft64(v13, -32) + v9 += v13 + v5 ^= v9 + v5 = bits.RotateLeft64(v5, -24) + v2 += m[s[2]] + v2 += v6 + v14 ^= v2 + v14 = bits.RotateLeft64(v14, -32) + v10 += v14 + v6 ^= v10 + v6 = bits.RotateLeft64(v6, -24) + v3 += m[s[3]] + v3 += v7 + v15 ^= v3 + v15 = bits.RotateLeft64(v15, -32) + v11 += v15 + v7 ^= v11 + v7 = bits.RotateLeft64(v7, -24) + + v0 += m[s[4]] + v0 += v4 + v12 ^= v0 + v12 = bits.RotateLeft64(v12, -16) + v8 += v12 + v4 ^= v8 + v4 = bits.RotateLeft64(v4, -63) + v1 += m[s[5]] + v1 += v5 + v13 ^= v1 + v13 = bits.RotateLeft64(v13, -16) + v9 += v13 + v5 ^= v9 + v5 = bits.RotateLeft64(v5, -63) + v2 += m[s[6]] + v2 += v6 + v14 ^= v2 + v14 = bits.RotateLeft64(v14, -16) + v10 += v14 + v6 ^= v10 + v6 = bits.RotateLeft64(v6, -63) + v3 += m[s[7]] + v3 += v7 + v15 ^= v3 + v15 = bits.RotateLeft64(v15, -16) + v11 += v15 + v7 ^= v11 + v7 = bits.RotateLeft64(v7, -63) + + v0 += m[s[8]] + v0 += v5 + v15 ^= v0 + v15 = bits.RotateLeft64(v15, -32) + v10 += v15 + v5 ^= v10 + v5 = bits.RotateLeft64(v5, -24) + v1 += m[s[9]] + v1 += v6 + v12 ^= v1 + v12 = bits.RotateLeft64(v12, -32) + v11 += v12 + v6 ^= v11 + v6 = bits.RotateLeft64(v6, -24) + v2 += m[s[10]] + v2 += v7 + v13 ^= v2 + v13 = bits.RotateLeft64(v13, -32) + v8 += v13 + v7 ^= v8 + v7 = bits.RotateLeft64(v7, -24) + v3 += m[s[11]] + v3 += v4 + v14 ^= v3 + v14 = bits.RotateLeft64(v14, -32) + v9 += v14 + v4 ^= v9 + v4 = bits.RotateLeft64(v4, -24) + + v0 += m[s[12]] + v0 += v5 + v15 ^= v0 + v15 = bits.RotateLeft64(v15, -16) + v10 += v15 + v5 ^= v10 + v5 = bits.RotateLeft64(v5, -63) + v1 += m[s[13]] + v1 += v6 + v12 ^= v1 + v12 = bits.RotateLeft64(v12, -16) + v11 += v12 + v6 ^= v11 + v6 = bits.RotateLeft64(v6, -63) + v2 += m[s[14]] + v2 += v7 + v13 ^= v2 + v13 = bits.RotateLeft64(v13, -16) + v8 += v13 + v7 ^= v8 + v7 = bits.RotateLeft64(v7, -63) + v3 += m[s[15]] + v3 += v4 + v14 ^= v3 + v14 = bits.RotateLeft64(v14, -16) + v9 += v14 + v4 ^= v9 + v4 = bits.RotateLeft64(v4, -63) + + } + + h[0] ^= v0 ^ v8 + h[1] ^= v1 ^ v9 + h[2] ^= v2 ^ 
v10 + h[3] ^= v3 ^ v11 + h[4] ^= v4 ^ v12 + h[5] ^= v5 ^ v13 + h[6] ^= v6 ^ v14 + h[7] ^= v7 ^ v15 + } + c[0], c[1] = c0, c1 +} diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2b_ref.go b/src/vendor/golang.org/x/crypto/blake2b/blake2b_ref.go new file mode 100644 index 00000000000..b0137cdf025 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2b_ref.go @@ -0,0 +1,12 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || purego || !gc +// +build !amd64 purego !gc + +package blake2b + +func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) { + hashBlocksGeneric(h, c, flag, blocks) +} diff --git a/src/vendor/golang.org/x/crypto/blake2b/blake2x.go b/src/vendor/golang.org/x/crypto/blake2b/blake2x.go new file mode 100644 index 00000000000..52c414db0e6 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/blake2x.go @@ -0,0 +1,177 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blake2b + +import ( + "encoding/binary" + "errors" + "io" +) + +// XOF defines the interface to hash functions that +// support arbitrary-length output. +type XOF interface { + // Write absorbs more data into the hash's state. It panics if called + // after Read. + io.Writer + + // Read reads more output from the hash. It returns io.EOF if the limit + // has been reached. + io.Reader + + // Clone returns a copy of the XOF in its current state. + Clone() XOF + + // Reset resets the XOF to its initial state. + Reset() +} + +// OutputLengthUnknown can be used as the size argument to NewXOF to indicate +// the length of the output is not known in advance. +const OutputLengthUnknown = 0 + +// magicUnknownOutputLength is a magic value for the output size that indicates +// an unknown number of output bytes. +const magicUnknownOutputLength = (1 << 32) - 1 + +// maxOutputLength is the absolute maximum number of bytes to produce when the +// number of output bytes is unknown. +const maxOutputLength = (1 << 32) * 64 + +// NewXOF creates a new variable-output-length hash. The hash either produces a +// known number of bytes (1 <= size < 2**32-1), or an unknown number of bytes +// (size == OutputLengthUnknown). In the latter case, an absolute limit of +// 256GiB applies. +// +// A non-nil key turns the hash into a MAC. The key must be between +// zero and 64 bytes long. +func NewXOF(size uint32, key []byte) (XOF, error) { + if len(key) > Size { + return nil, errKeySize + } + if size == magicUnknownOutputLength { + // 2^32-1 indicates an unknown number of bytes and thus isn't a + // valid length. 
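+ // When the caller passes OutputLengthUnknown instead, the output is
+ // capped at maxOutputLength = (1 << 32) * 64 bytes, i.e. 2^32 blocks
+ // of 64 bytes, which is where the 256GiB limit above comes from.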
+ return nil, errors.New("blake2b: XOF length too large") + } + if size == OutputLengthUnknown { + size = magicUnknownOutputLength + } + x := &xof{ + d: digest{ + size: Size, + keyLen: len(key), + }, + length: size, + } + copy(x.d.key[:], key) + x.Reset() + return x, nil +} + +type xof struct { + d digest + length uint32 + remaining uint64 + cfg, root, block [Size]byte + offset int + nodeOffset uint32 + readMode bool +} + +func (x *xof) Write(p []byte) (n int, err error) { + if x.readMode { + panic("blake2b: write to XOF after read") + } + return x.d.Write(p) +} + +func (x *xof) Clone() XOF { + clone := *x + return &clone +} + +func (x *xof) Reset() { + x.cfg[0] = byte(Size) + binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length + binary.LittleEndian.PutUint32(x.cfg[12:], x.length) // XOF length + x.cfg[17] = byte(Size) // inner hash size + + x.d.Reset() + x.d.h[1] ^= uint64(x.length) << 32 + + x.remaining = uint64(x.length) + if x.remaining == magicUnknownOutputLength { + x.remaining = maxOutputLength + } + x.offset, x.nodeOffset = 0, 0 + x.readMode = false +} + +func (x *xof) Read(p []byte) (n int, err error) { + if !x.readMode { + x.d.finalize(&x.root) + x.readMode = true + } + + if x.remaining == 0 { + return 0, io.EOF + } + + n = len(p) + if uint64(n) > x.remaining { + n = int(x.remaining) + p = p[:n] + } + + if x.offset > 0 { + blockRemaining := Size - x.offset + if n < blockRemaining { + x.offset += copy(p, x.block[x.offset:]) + x.remaining -= uint64(n) + return + } + copy(p, x.block[x.offset:]) + p = p[blockRemaining:] + x.offset = 0 + x.remaining -= uint64(blockRemaining) + } + + for len(p) >= Size { + binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset) + x.nodeOffset++ + + x.d.initConfig(&x.cfg) + x.d.Write(x.root[:]) + x.d.finalize(&x.block) + + copy(p, x.block[:]) + p = p[Size:] + x.remaining -= uint64(Size) + } + + if todo := len(p); todo > 0 { + if x.remaining < uint64(Size) { + x.cfg[0] = byte(x.remaining) + } + binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset) + x.nodeOffset++ + + x.d.initConfig(&x.cfg) + x.d.Write(x.root[:]) + x.d.finalize(&x.block) + + x.offset = copy(p, x.block[:todo]) + x.remaining -= uint64(todo) + } + return +} + +func (d *digest) initConfig(cfg *[Size]byte) { + d.offset, d.c[0], d.c[1] = 0, 0, 0 + for i := range d.h { + d.h[i] = iv[i] ^ binary.LittleEndian.Uint64(cfg[i*8:]) + } +} diff --git a/src/vendor/golang.org/x/crypto/blake2b/register.go b/src/vendor/golang.org/x/crypto/blake2b/register.go new file mode 100644 index 00000000000..9d8633963cb --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2b/register.go @@ -0,0 +1,33 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
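+
+// The init function below registers the BLAKE2b-256/384/512 constructors with
+// the standard crypto package. As an illustrative sketch (data stands for any
+// caller-supplied []byte), a registered hash can then be obtained without
+// importing this package directly:
+//
+//	h := crypto.BLAKE2b_512.New()
+//	h.Write(data)
+//	digest := h.Sum(nil) // 64-byte digest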
+ +//go:build go1.9 +// +build go1.9 + +package blake2b + +import ( + "crypto" + "hash" +) + +func init() { + newHash256 := func() hash.Hash { + h, _ := New256(nil) + return h + } + newHash384 := func() hash.Hash { + h, _ := New384(nil) + return h + } + + newHash512 := func() hash.Hash { + h, _ := New512(nil) + return h + } + + crypto.RegisterHash(crypto.BLAKE2b_256, newHash256) + crypto.RegisterHash(crypto.BLAKE2b_384, newHash384) + crypto.RegisterHash(crypto.BLAKE2b_512, newHash512) +} diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2s.go b/src/vendor/golang.org/x/crypto/blake2s/blake2s.go new file mode 100644 index 00000000000..e3f46aab3a1 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2s.go @@ -0,0 +1,246 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package blake2s implements the BLAKE2s hash algorithm defined by RFC 7693 +// and the extendable output function (XOF) BLAKE2Xs. +// +// BLAKE2s is optimized for 8- to 32-bit platforms and produces digests of any +// size between 1 and 32 bytes. +// For a detailed specification of BLAKE2s see https://blake2.net/blake2.pdf +// and for BLAKE2Xs see https://blake2.net/blake2x.pdf +// +// If you aren't sure which function you need, use BLAKE2s (Sum256 or New256). +// If you need a secret-key MAC (message authentication code), use the New256 +// function with a non-nil key. +// +// BLAKE2X is a construction to compute hash values larger than 32 bytes. It +// can produce hash values between 0 and 65535 bytes. +package blake2s // import "golang.org/x/crypto/blake2s" + +import ( + "encoding/binary" + "errors" + "hash" +) + +const ( + // The blocksize of BLAKE2s in bytes. + BlockSize = 64 + + // The hash size of BLAKE2s-256 in bytes. + Size = 32 + + // The hash size of BLAKE2s-128 in bytes. + Size128 = 16 +) + +var errKeySize = errors.New("blake2s: invalid key size") + +var iv = [8]uint32{ + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +} + +// Sum256 returns the BLAKE2s-256 checksum of the data. +func Sum256(data []byte) [Size]byte { + var sum [Size]byte + checkSum(&sum, Size, data) + return sum +} + +// New256 returns a new hash.Hash computing the BLAKE2s-256 checksum. A non-nil +// key turns the hash into a MAC. The key must be between zero and 32 bytes long. +// When the key is nil, the returned hash.Hash implements BinaryMarshaler +// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash. +func New256(key []byte) (hash.Hash, error) { return newDigest(Size, key) } + +// New128 returns a new hash.Hash computing the BLAKE2s-128 checksum given a +// non-empty key. Note that a 128-bit digest is too small to be secure as a +// cryptographic hash and should only be used as a MAC, thus the key argument +// is not optional. 
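+//
+// A minimal usage sketch (key is a caller-supplied secret of 1 to 32 bytes,
+// msg the data to authenticate):
+//
+//	mac, err := New128(key)
+//	if err != nil {
+//		// key was empty or longer than 32 bytes
+//	}
+//	mac.Write(msg)
+//	tag := mac.Sum(nil) // 16-byte authentication tag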
+func New128(key []byte) (hash.Hash, error) { + if len(key) == 0 { + return nil, errors.New("blake2s: a key is required for a 128-bit hash") + } + return newDigest(Size128, key) +} + +func newDigest(hashSize int, key []byte) (*digest, error) { + if len(key) > Size { + return nil, errKeySize + } + d := &digest{ + size: hashSize, + keyLen: len(key), + } + copy(d.key[:], key) + d.Reset() + return d, nil +} + +func checkSum(sum *[Size]byte, hashSize int, data []byte) { + var ( + h [8]uint32 + c [2]uint32 + ) + + h = iv + h[0] ^= uint32(hashSize) | (1 << 16) | (1 << 24) + + if length := len(data); length > BlockSize { + n := length &^ (BlockSize - 1) + if length == n { + n -= BlockSize + } + hashBlocks(&h, &c, 0, data[:n]) + data = data[n:] + } + + var block [BlockSize]byte + offset := copy(block[:], data) + remaining := uint32(BlockSize - offset) + + if c[0] < remaining { + c[1]-- + } + c[0] -= remaining + + hashBlocks(&h, &c, 0xFFFFFFFF, block[:]) + + for i, v := range h { + binary.LittleEndian.PutUint32(sum[4*i:], v) + } +} + +type digest struct { + h [8]uint32 + c [2]uint32 + size int + block [BlockSize]byte + offset int + + key [BlockSize]byte + keyLen int +} + +const ( + magic = "b2s" + marshaledSize = len(magic) + 8*4 + 2*4 + 1 + BlockSize + 1 +) + +func (d *digest) MarshalBinary() ([]byte, error) { + if d.keyLen != 0 { + return nil, errors.New("crypto/blake2s: cannot marshal MACs") + } + b := make([]byte, 0, marshaledSize) + b = append(b, magic...) + for i := 0; i < 8; i++ { + b = appendUint32(b, d.h[i]) + } + b = appendUint32(b, d.c[0]) + b = appendUint32(b, d.c[1]) + // Maximum value for size is 32 + b = append(b, byte(d.size)) + b = append(b, d.block[:]...) + b = append(b, byte(d.offset)) + return b, nil +} + +func (d *digest) UnmarshalBinary(b []byte) error { + if len(b) < len(magic) || string(b[:len(magic)]) != magic { + return errors.New("crypto/blake2s: invalid hash state identifier") + } + if len(b) != marshaledSize { + return errors.New("crypto/blake2s: invalid hash state size") + } + b = b[len(magic):] + for i := 0; i < 8; i++ { + b, d.h[i] = consumeUint32(b) + } + b, d.c[0] = consumeUint32(b) + b, d.c[1] = consumeUint32(b) + d.size = int(b[0]) + b = b[1:] + copy(d.block[:], b[:BlockSize]) + b = b[BlockSize:] + d.offset = int(b[0]) + return nil +} + +func (d *digest) BlockSize() int { return BlockSize } + +func (d *digest) Size() int { return d.size } + +func (d *digest) Reset() { + d.h = iv + d.h[0] ^= uint32(d.size) | (uint32(d.keyLen) << 8) | (1 << 16) | (1 << 24) + d.offset, d.c[0], d.c[1] = 0, 0, 0 + if d.keyLen > 0 { + d.block = d.key + d.offset = BlockSize + } +} + +func (d *digest) Write(p []byte) (n int, err error) { + n = len(p) + + if d.offset > 0 { + remaining := BlockSize - d.offset + if n <= remaining { + d.offset += copy(d.block[d.offset:], p) + return + } + copy(d.block[d.offset:], p[:remaining]) + hashBlocks(&d.h, &d.c, 0, d.block[:]) + d.offset = 0 + p = p[remaining:] + } + + if length := len(p); length > BlockSize { + nn := length &^ (BlockSize - 1) + if length == nn { + nn -= BlockSize + } + hashBlocks(&d.h, &d.c, 0, p[:nn]) + p = p[nn:] + } + + d.offset += copy(d.block[:], p) + return +} + +func (d *digest) Sum(sum []byte) []byte { + var hash [Size]byte + d.finalize(&hash) + return append(sum, hash[:d.size]...) 
+} + +func (d *digest) finalize(hash *[Size]byte) { + var block [BlockSize]byte + h := d.h + c := d.c + + copy(block[:], d.block[:d.offset]) + remaining := uint32(BlockSize - d.offset) + if c[0] < remaining { + c[1]-- + } + c[0] -= remaining + + hashBlocks(&h, &c, 0xFFFFFFFF, block[:]) + for i, v := range h { + binary.LittleEndian.PutUint32(hash[4*i:], v) + } +} + +func appendUint32(b []byte, x uint32) []byte { + var a [4]byte + binary.BigEndian.PutUint32(a[:], x) + return append(b, a[:]...) +} + +func consumeUint32(b []byte) ([]byte, uint32) { + x := binary.BigEndian.Uint32(b) + return b[4:], x +} diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2s_386.go b/src/vendor/golang.org/x/crypto/blake2s/blake2s_386.go new file mode 100644 index 00000000000..b4463fb4dc0 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2s_386.go @@ -0,0 +1,33 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build 386 && gc && !purego +// +build 386,gc,!purego + +package blake2s + +import "golang.org/x/sys/cpu" + +var ( + useSSE4 = false + useSSSE3 = cpu.X86.HasSSSE3 + useSSE2 = cpu.X86.HasSSE2 +) + +//go:noescape +func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) + +//go:noescape +func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) + +func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) { + switch { + case useSSSE3: + hashBlocksSSSE3(h, c, flag, blocks) + case useSSE2: + hashBlocksSSE2(h, c, flag, blocks) + default: + hashBlocksGeneric(h, c, flag, blocks) + } +} diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2s_386.s b/src/vendor/golang.org/x/crypto/blake2s/blake2s_386.s new file mode 100644 index 00000000000..603d00ca320 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2s_386.s @@ -0,0 +1,430 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
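+
+// A note on the rotation macros defined below: SSE2 has no vector rotate, so
+// ROTL_SSE2 synthesizes one from two shifts and an XOR, while ROTL_SSSE3
+// performs the byte-aligned rotations (by 16 and by 8 bits) with a single
+// PSHUFB byte shuffle against the rol16<>/rol8<> masks.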
+ +//go:build 386 && gc && !purego +// +build 386,gc,!purego + +#include "textflag.h" + +DATA iv0<>+0x00(SB)/4, $0x6a09e667 +DATA iv0<>+0x04(SB)/4, $0xbb67ae85 +DATA iv0<>+0x08(SB)/4, $0x3c6ef372 +DATA iv0<>+0x0c(SB)/4, $0xa54ff53a +GLOBL iv0<>(SB), (NOPTR+RODATA), $16 + +DATA iv1<>+0x00(SB)/4, $0x510e527f +DATA iv1<>+0x04(SB)/4, $0x9b05688c +DATA iv1<>+0x08(SB)/4, $0x1f83d9ab +DATA iv1<>+0x0c(SB)/4, $0x5be0cd19 +GLOBL iv1<>(SB), (NOPTR+RODATA), $16 + +DATA rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +GLOBL rol16<>(SB), (NOPTR+RODATA), $16 + +DATA rol8<>+0x00(SB)/8, $0x0407060500030201 +DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 +GLOBL rol8<>(SB), (NOPTR+RODATA), $16 + +DATA counter<>+0x00(SB)/8, $0x40 +DATA counter<>+0x08(SB)/8, $0x0 +GLOBL counter<>(SB), (NOPTR+RODATA), $16 + +#define ROTL_SSE2(n, t, v) \ + MOVO v, t; \ + PSLLL $n, t; \ + PSRLL $(32-n), v; \ + PXOR t, v + +#define ROTL_SSSE3(c, v) \ + PSHUFB c, v + +#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \ + PADDL m0, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(16, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m1, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(24, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v1, v1; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v3, v3; \ + PADDL m2, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(16, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m3, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(24, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v3, v3; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v1, v1 + +#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \ + PADDL m0, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c16, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m1, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c8, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v1, v1; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v3, v3; \ + PADDL m2, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c16, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m3, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c8, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v3, v3; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v1, v1 + +#define PRECOMPUTE(dst, off, src, t) \ + MOVL 0*4(src), t; \ + MOVL t, 0*4+off+0(dst); \ + MOVL t, 9*4+off+64(dst); \ + MOVL t, 5*4+off+128(dst); \ + MOVL t, 14*4+off+192(dst); \ + MOVL t, 4*4+off+256(dst); \ + MOVL t, 2*4+off+320(dst); \ + MOVL t, 8*4+off+384(dst); \ + MOVL t, 12*4+off+448(dst); \ + MOVL t, 3*4+off+512(dst); \ + MOVL t, 15*4+off+576(dst); \ + MOVL 1*4(src), t; \ + MOVL t, 4*4+off+0(dst); \ + MOVL t, 8*4+off+64(dst); \ + MOVL t, 14*4+off+128(dst); \ + MOVL t, 5*4+off+192(dst); \ + MOVL t, 12*4+off+256(dst); \ + MOVL t, 11*4+off+320(dst); \ + MOVL t, 1*4+off+384(dst); \ + MOVL t, 6*4+off+448(dst); \ + MOVL t, 10*4+off+512(dst); \ + MOVL t, 3*4+off+576(dst); \ + MOVL 2*4(src), t; \ + MOVL t, 1*4+off+0(dst); \ + MOVL t, 13*4+off+64(dst); \ + MOVL t, 6*4+off+128(dst); \ + MOVL t, 8*4+off+192(dst); \ + MOVL t, 2*4+off+256(dst); \ + MOVL t, 0*4+off+320(dst); \ + MOVL t, 14*4+off+384(dst); \ + MOVL t, 11*4+off+448(dst); \ + MOVL t, 12*4+off+512(dst); \ + MOVL t, 4*4+off+576(dst); \ + MOVL 3*4(src), t; \ + 
MOVL t, 5*4+off+0(dst); \ + MOVL t, 15*4+off+64(dst); \ + MOVL t, 9*4+off+128(dst); \ + MOVL t, 1*4+off+192(dst); \ + MOVL t, 11*4+off+256(dst); \ + MOVL t, 7*4+off+320(dst); \ + MOVL t, 13*4+off+384(dst); \ + MOVL t, 3*4+off+448(dst); \ + MOVL t, 6*4+off+512(dst); \ + MOVL t, 10*4+off+576(dst); \ + MOVL 4*4(src), t; \ + MOVL t, 2*4+off+0(dst); \ + MOVL t, 1*4+off+64(dst); \ + MOVL t, 15*4+off+128(dst); \ + MOVL t, 10*4+off+192(dst); \ + MOVL t, 6*4+off+256(dst); \ + MOVL t, 8*4+off+320(dst); \ + MOVL t, 3*4+off+384(dst); \ + MOVL t, 13*4+off+448(dst); \ + MOVL t, 14*4+off+512(dst); \ + MOVL t, 5*4+off+576(dst); \ + MOVL 5*4(src), t; \ + MOVL t, 6*4+off+0(dst); \ + MOVL t, 11*4+off+64(dst); \ + MOVL t, 2*4+off+128(dst); \ + MOVL t, 9*4+off+192(dst); \ + MOVL t, 1*4+off+256(dst); \ + MOVL t, 13*4+off+320(dst); \ + MOVL t, 4*4+off+384(dst); \ + MOVL t, 8*4+off+448(dst); \ + MOVL t, 15*4+off+512(dst); \ + MOVL t, 7*4+off+576(dst); \ + MOVL 6*4(src), t; \ + MOVL t, 3*4+off+0(dst); \ + MOVL t, 7*4+off+64(dst); \ + MOVL t, 13*4+off+128(dst); \ + MOVL t, 12*4+off+192(dst); \ + MOVL t, 10*4+off+256(dst); \ + MOVL t, 1*4+off+320(dst); \ + MOVL t, 9*4+off+384(dst); \ + MOVL t, 14*4+off+448(dst); \ + MOVL t, 0*4+off+512(dst); \ + MOVL t, 6*4+off+576(dst); \ + MOVL 7*4(src), t; \ + MOVL t, 7*4+off+0(dst); \ + MOVL t, 14*4+off+64(dst); \ + MOVL t, 10*4+off+128(dst); \ + MOVL t, 0*4+off+192(dst); \ + MOVL t, 5*4+off+256(dst); \ + MOVL t, 9*4+off+320(dst); \ + MOVL t, 12*4+off+384(dst); \ + MOVL t, 1*4+off+448(dst); \ + MOVL t, 13*4+off+512(dst); \ + MOVL t, 2*4+off+576(dst); \ + MOVL 8*4(src), t; \ + MOVL t, 8*4+off+0(dst); \ + MOVL t, 5*4+off+64(dst); \ + MOVL t, 4*4+off+128(dst); \ + MOVL t, 15*4+off+192(dst); \ + MOVL t, 14*4+off+256(dst); \ + MOVL t, 3*4+off+320(dst); \ + MOVL t, 11*4+off+384(dst); \ + MOVL t, 10*4+off+448(dst); \ + MOVL t, 7*4+off+512(dst); \ + MOVL t, 1*4+off+576(dst); \ + MOVL 9*4(src), t; \ + MOVL t, 12*4+off+0(dst); \ + MOVL t, 2*4+off+64(dst); \ + MOVL t, 11*4+off+128(dst); \ + MOVL t, 4*4+off+192(dst); \ + MOVL t, 0*4+off+256(dst); \ + MOVL t, 15*4+off+320(dst); \ + MOVL t, 10*4+off+384(dst); \ + MOVL t, 7*4+off+448(dst); \ + MOVL t, 5*4+off+512(dst); \ + MOVL t, 9*4+off+576(dst); \ + MOVL 10*4(src), t; \ + MOVL t, 9*4+off+0(dst); \ + MOVL t, 4*4+off+64(dst); \ + MOVL t, 8*4+off+128(dst); \ + MOVL t, 13*4+off+192(dst); \ + MOVL t, 3*4+off+256(dst); \ + MOVL t, 5*4+off+320(dst); \ + MOVL t, 7*4+off+384(dst); \ + MOVL t, 15*4+off+448(dst); \ + MOVL t, 11*4+off+512(dst); \ + MOVL t, 0*4+off+576(dst); \ + MOVL 11*4(src), t; \ + MOVL t, 13*4+off+0(dst); \ + MOVL t, 10*4+off+64(dst); \ + MOVL t, 0*4+off+128(dst); \ + MOVL t, 3*4+off+192(dst); \ + MOVL t, 9*4+off+256(dst); \ + MOVL t, 6*4+off+320(dst); \ + MOVL t, 15*4+off+384(dst); \ + MOVL t, 4*4+off+448(dst); \ + MOVL t, 2*4+off+512(dst); \ + MOVL t, 12*4+off+576(dst); \ + MOVL 12*4(src), t; \ + MOVL t, 10*4+off+0(dst); \ + MOVL t, 12*4+off+64(dst); \ + MOVL t, 1*4+off+128(dst); \ + MOVL t, 6*4+off+192(dst); \ + MOVL t, 13*4+off+256(dst); \ + MOVL t, 4*4+off+320(dst); \ + MOVL t, 0*4+off+384(dst); \ + MOVL t, 2*4+off+448(dst); \ + MOVL t, 8*4+off+512(dst); \ + MOVL t, 14*4+off+576(dst); \ + MOVL 13*4(src), t; \ + MOVL t, 14*4+off+0(dst); \ + MOVL t, 3*4+off+64(dst); \ + MOVL t, 7*4+off+128(dst); \ + MOVL t, 2*4+off+192(dst); \ + MOVL t, 15*4+off+256(dst); \ + MOVL t, 12*4+off+320(dst); \ + MOVL t, 6*4+off+384(dst); \ + MOVL t, 0*4+off+448(dst); \ + MOVL t, 9*4+off+512(dst); \ + MOVL t, 11*4+off+576(dst); \ + MOVL 14*4(src), t; \ 
+ MOVL t, 11*4+off+0(dst); \ + MOVL t, 0*4+off+64(dst); \ + MOVL t, 12*4+off+128(dst); \ + MOVL t, 7*4+off+192(dst); \ + MOVL t, 8*4+off+256(dst); \ + MOVL t, 14*4+off+320(dst); \ + MOVL t, 2*4+off+384(dst); \ + MOVL t, 5*4+off+448(dst); \ + MOVL t, 1*4+off+512(dst); \ + MOVL t, 13*4+off+576(dst); \ + MOVL 15*4(src), t; \ + MOVL t, 15*4+off+0(dst); \ + MOVL t, 6*4+off+64(dst); \ + MOVL t, 3*4+off+128(dst); \ + MOVL t, 11*4+off+192(dst); \ + MOVL t, 7*4+off+256(dst); \ + MOVL t, 10*4+off+320(dst); \ + MOVL t, 5*4+off+384(dst); \ + MOVL t, 9*4+off+448(dst); \ + MOVL t, 4*4+off+512(dst); \ + MOVL t, 8*4+off+576(dst) + +// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment + MOVL h+0(FP), AX + MOVL c+4(FP), BX + MOVL flag+8(FP), CX + MOVL blocks_base+12(FP), SI + MOVL blocks_len+16(FP), DX + + MOVL SP, DI + ADDL $15, DI + ANDL $~15, DI + + MOVL CX, 8(DI) + MOVL 0(BX), CX + MOVL CX, 0(DI) + MOVL 4(BX), CX + MOVL CX, 4(DI) + XORL CX, CX + MOVL CX, 12(DI) + + MOVOU 0(AX), X0 + MOVOU 16(AX), X1 + MOVOU counter<>(SB), X2 + +loop: + MOVO X0, X4 + MOVO X1, X5 + MOVOU iv0<>(SB), X6 + MOVOU iv1<>(SB), X7 + + MOVO 0(DI), X3 + PADDQ X2, X3 + PXOR X3, X7 + MOVO X3, 0(DI) + + PRECOMPUTE(DI, 16, SI, CX) + ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3) + ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3) + + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + + LEAL 64(SI), SI + SUBL $64, DX + JNE loop + + MOVL 0(DI), CX + MOVL CX, 0(BX) + MOVL 4(DI), CX + MOVL CX, 4(BX) + + MOVOU X0, 0(AX) + MOVOU X1, 16(AX) + + RET + +// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment + MOVL h+0(FP), AX + MOVL c+4(FP), BX + MOVL flag+8(FP), CX + MOVL blocks_base+12(FP), SI + MOVL blocks_len+16(FP), DX + + MOVL SP, DI + ADDL $15, DI + ANDL $~15, DI + + MOVL CX, 8(DI) + MOVL 0(BX), CX + MOVL CX, 0(DI) + MOVL 4(BX), CX + MOVL CX, 4(DI) + XORL CX, CX + MOVL CX, 12(DI) + + MOVOU 0(AX), X0 + MOVOU 16(AX), X1 + MOVOU counter<>(SB), X2 + +loop: + MOVO X0, 656(DI) + MOVO X1, 672(DI) + MOVO X0, X4 + MOVO X1, X5 + MOVOU iv0<>(SB), X6 + MOVOU iv1<>(SB), X7 + + MOVO 0(DI), X3 + PADDQ X2, X3 + PXOR X3, X7 + MOVO X3, 0(DI) + + MOVOU rol16<>(SB), X0 + MOVOU rol8<>(SB), X1 + + PRECOMPUTE(DI, 16, SI, CX) + ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 
16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1) + ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1) + + MOVO 656(DI), X0 + MOVO 672(DI), X1 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X0 + PXOR X7, X1 + + LEAL 64(SI), SI + SUBL $64, DX + JNE loop + + MOVL 0(DI), CX + MOVL CX, 0(BX) + MOVL 4(DI), CX + MOVL CX, 4(BX) + + MOVOU X0, 0(AX) + MOVOU X1, 16(AX) + + RET diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.go b/src/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.go new file mode 100644 index 00000000000..becdaa120ff --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.go @@ -0,0 +1,38 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 && gc && !purego +// +build amd64,gc,!purego + +package blake2s + +import "golang.org/x/sys/cpu" + +var ( + useSSE4 = cpu.X86.HasSSE41 + useSSSE3 = cpu.X86.HasSSSE3 + useSSE2 = cpu.X86.HasSSE2 +) + +//go:noescape +func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) + +//go:noescape +func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) + +//go:noescape +func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) + +func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) { + switch { + case useSSE4: + hashBlocksSSE4(h, c, flag, blocks) + case useSSSE3: + hashBlocksSSSE3(h, c, flag, blocks) + case useSSE2: + hashBlocksSSE2(h, c, flag, blocks) + default: + hashBlocksGeneric(h, c, flag, blocks) + } +} diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s b/src/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s new file mode 100644 index 00000000000..e9df7a7c219 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s @@ -0,0 +1,433 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
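+
+// In the SSE2 and SSSE3 paths below, the message schedule is hoisted out of
+// the rounds: PRECOMPUTE_MSG scatters each 32-bit message word into its
+// sigma-scheduled slot for all ten rounds once per block, so the ROUND_*
+// macros take their message operands as plain memory loads. The SSE4 path
+// instead gathers the words per round with PINSRD (LOAD_MSG_SSE4).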
+ +//go:build amd64 && gc && !purego +// +build amd64,gc,!purego + +#include "textflag.h" + +DATA iv0<>+0x00(SB)/4, $0x6a09e667 +DATA iv0<>+0x04(SB)/4, $0xbb67ae85 +DATA iv0<>+0x08(SB)/4, $0x3c6ef372 +DATA iv0<>+0x0c(SB)/4, $0xa54ff53a +GLOBL iv0<>(SB), (NOPTR+RODATA), $16 + +DATA iv1<>+0x00(SB)/4, $0x510e527f +DATA iv1<>+0x04(SB)/4, $0x9b05688c +DATA iv1<>+0x08(SB)/4, $0x1f83d9ab +DATA iv1<>+0x0c(SB)/4, $0x5be0cd19 +GLOBL iv1<>(SB), (NOPTR+RODATA), $16 + +DATA rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +GLOBL rol16<>(SB), (NOPTR+RODATA), $16 + +DATA rol8<>+0x00(SB)/8, $0x0407060500030201 +DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09 +GLOBL rol8<>(SB), (NOPTR+RODATA), $16 + +DATA counter<>+0x00(SB)/8, $0x40 +DATA counter<>+0x08(SB)/8, $0x0 +GLOBL counter<>(SB), (NOPTR+RODATA), $16 + +#define ROTL_SSE2(n, t, v) \ + MOVO v, t; \ + PSLLL $n, t; \ + PSRLL $(32-n), v; \ + PXOR t, v + +#define ROTL_SSSE3(c, v) \ + PSHUFB c, v + +#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \ + PADDL m0, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(16, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m1, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(24, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v1, v1; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v3, v3; \ + PADDL m2, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(16, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m3, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE2(24, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v3, v3; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v1, v1 + +#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \ + PADDL m0, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c16, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m1, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c8, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v1, v1; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v3, v3; \ + PADDL m2, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c16, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(20, t, v1); \ + PADDL m3, v0; \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSSE3(c8, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE2(25, t, v1); \ + PSHUFL $0x39, v3, v3; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v1, v1 + + +#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \ + MOVL i0*4(src), m0; \ + PINSRD $1, i1*4(src), m0; \ + PINSRD $2, i2*4(src), m0; \ + PINSRD $3, i3*4(src), m0; \ + MOVL i4*4(src), m1; \ + PINSRD $1, i5*4(src), m1; \ + PINSRD $2, i6*4(src), m1; \ + PINSRD $3, i7*4(src), m1; \ + MOVL i8*4(src), m2; \ + PINSRD $1, i9*4(src), m2; \ + PINSRD $2, i10*4(src), m2; \ + PINSRD $3, i11*4(src), m2; \ + MOVL i12*4(src), m3; \ + PINSRD $1, i13*4(src), m3; \ + PINSRD $2, i14*4(src), m3; \ + PINSRD $3, i15*4(src), m3 + +#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \ + MOVQ 0*4(src), R8; \ + MOVQ 2*4(src), R9; \ + MOVQ 4*4(src), R10; \ + MOVQ 6*4(src), R11; \ + MOVQ 8*4(src), R12; \ + MOVQ 10*4(src), R13; \ + MOVQ 12*4(src), R14; \ + MOVQ 14*4(src), R15; \ + \ + MOVL R8, 0*4+off+0(dst); \ + MOVL R8, 9*4+off+64(dst); \ + MOVL R8, 5*4+off+128(dst); \ + MOVL R8, 14*4+off+192(dst); \ + MOVL R8, 4*4+off+256(dst); \ + MOVL R8, 
2*4+off+320(dst); \ + MOVL R8, 8*4+off+384(dst); \ + MOVL R8, 12*4+off+448(dst); \ + MOVL R8, 3*4+off+512(dst); \ + MOVL R8, 15*4+off+576(dst); \ + SHRQ $32, R8; \ + MOVL R8, 4*4+off+0(dst); \ + MOVL R8, 8*4+off+64(dst); \ + MOVL R8, 14*4+off+128(dst); \ + MOVL R8, 5*4+off+192(dst); \ + MOVL R8, 12*4+off+256(dst); \ + MOVL R8, 11*4+off+320(dst); \ + MOVL R8, 1*4+off+384(dst); \ + MOVL R8, 6*4+off+448(dst); \ + MOVL R8, 10*4+off+512(dst); \ + MOVL R8, 3*4+off+576(dst); \ + \ + MOVL R9, 1*4+off+0(dst); \ + MOVL R9, 13*4+off+64(dst); \ + MOVL R9, 6*4+off+128(dst); \ + MOVL R9, 8*4+off+192(dst); \ + MOVL R9, 2*4+off+256(dst); \ + MOVL R9, 0*4+off+320(dst); \ + MOVL R9, 14*4+off+384(dst); \ + MOVL R9, 11*4+off+448(dst); \ + MOVL R9, 12*4+off+512(dst); \ + MOVL R9, 4*4+off+576(dst); \ + SHRQ $32, R9; \ + MOVL R9, 5*4+off+0(dst); \ + MOVL R9, 15*4+off+64(dst); \ + MOVL R9, 9*4+off+128(dst); \ + MOVL R9, 1*4+off+192(dst); \ + MOVL R9, 11*4+off+256(dst); \ + MOVL R9, 7*4+off+320(dst); \ + MOVL R9, 13*4+off+384(dst); \ + MOVL R9, 3*4+off+448(dst); \ + MOVL R9, 6*4+off+512(dst); \ + MOVL R9, 10*4+off+576(dst); \ + \ + MOVL R10, 2*4+off+0(dst); \ + MOVL R10, 1*4+off+64(dst); \ + MOVL R10, 15*4+off+128(dst); \ + MOVL R10, 10*4+off+192(dst); \ + MOVL R10, 6*4+off+256(dst); \ + MOVL R10, 8*4+off+320(dst); \ + MOVL R10, 3*4+off+384(dst); \ + MOVL R10, 13*4+off+448(dst); \ + MOVL R10, 14*4+off+512(dst); \ + MOVL R10, 5*4+off+576(dst); \ + SHRQ $32, R10; \ + MOVL R10, 6*4+off+0(dst); \ + MOVL R10, 11*4+off+64(dst); \ + MOVL R10, 2*4+off+128(dst); \ + MOVL R10, 9*4+off+192(dst); \ + MOVL R10, 1*4+off+256(dst); \ + MOVL R10, 13*4+off+320(dst); \ + MOVL R10, 4*4+off+384(dst); \ + MOVL R10, 8*4+off+448(dst); \ + MOVL R10, 15*4+off+512(dst); \ + MOVL R10, 7*4+off+576(dst); \ + \ + MOVL R11, 3*4+off+0(dst); \ + MOVL R11, 7*4+off+64(dst); \ + MOVL R11, 13*4+off+128(dst); \ + MOVL R11, 12*4+off+192(dst); \ + MOVL R11, 10*4+off+256(dst); \ + MOVL R11, 1*4+off+320(dst); \ + MOVL R11, 9*4+off+384(dst); \ + MOVL R11, 14*4+off+448(dst); \ + MOVL R11, 0*4+off+512(dst); \ + MOVL R11, 6*4+off+576(dst); \ + SHRQ $32, R11; \ + MOVL R11, 7*4+off+0(dst); \ + MOVL R11, 14*4+off+64(dst); \ + MOVL R11, 10*4+off+128(dst); \ + MOVL R11, 0*4+off+192(dst); \ + MOVL R11, 5*4+off+256(dst); \ + MOVL R11, 9*4+off+320(dst); \ + MOVL R11, 12*4+off+384(dst); \ + MOVL R11, 1*4+off+448(dst); \ + MOVL R11, 13*4+off+512(dst); \ + MOVL R11, 2*4+off+576(dst); \ + \ + MOVL R12, 8*4+off+0(dst); \ + MOVL R12, 5*4+off+64(dst); \ + MOVL R12, 4*4+off+128(dst); \ + MOVL R12, 15*4+off+192(dst); \ + MOVL R12, 14*4+off+256(dst); \ + MOVL R12, 3*4+off+320(dst); \ + MOVL R12, 11*4+off+384(dst); \ + MOVL R12, 10*4+off+448(dst); \ + MOVL R12, 7*4+off+512(dst); \ + MOVL R12, 1*4+off+576(dst); \ + SHRQ $32, R12; \ + MOVL R12, 12*4+off+0(dst); \ + MOVL R12, 2*4+off+64(dst); \ + MOVL R12, 11*4+off+128(dst); \ + MOVL R12, 4*4+off+192(dst); \ + MOVL R12, 0*4+off+256(dst); \ + MOVL R12, 15*4+off+320(dst); \ + MOVL R12, 10*4+off+384(dst); \ + MOVL R12, 7*4+off+448(dst); \ + MOVL R12, 5*4+off+512(dst); \ + MOVL R12, 9*4+off+576(dst); \ + \ + MOVL R13, 9*4+off+0(dst); \ + MOVL R13, 4*4+off+64(dst); \ + MOVL R13, 8*4+off+128(dst); \ + MOVL R13, 13*4+off+192(dst); \ + MOVL R13, 3*4+off+256(dst); \ + MOVL R13, 5*4+off+320(dst); \ + MOVL R13, 7*4+off+384(dst); \ + MOVL R13, 15*4+off+448(dst); \ + MOVL R13, 11*4+off+512(dst); \ + MOVL R13, 0*4+off+576(dst); \ + SHRQ $32, R13; \ + MOVL R13, 13*4+off+0(dst); \ + MOVL R13, 10*4+off+64(dst); \ + MOVL R13, 0*4+off+128(dst); \ + 
MOVL R13, 3*4+off+192(dst); \ + MOVL R13, 9*4+off+256(dst); \ + MOVL R13, 6*4+off+320(dst); \ + MOVL R13, 15*4+off+384(dst); \ + MOVL R13, 4*4+off+448(dst); \ + MOVL R13, 2*4+off+512(dst); \ + MOVL R13, 12*4+off+576(dst); \ + \ + MOVL R14, 10*4+off+0(dst); \ + MOVL R14, 12*4+off+64(dst); \ + MOVL R14, 1*4+off+128(dst); \ + MOVL R14, 6*4+off+192(dst); \ + MOVL R14, 13*4+off+256(dst); \ + MOVL R14, 4*4+off+320(dst); \ + MOVL R14, 0*4+off+384(dst); \ + MOVL R14, 2*4+off+448(dst); \ + MOVL R14, 8*4+off+512(dst); \ + MOVL R14, 14*4+off+576(dst); \ + SHRQ $32, R14; \ + MOVL R14, 14*4+off+0(dst); \ + MOVL R14, 3*4+off+64(dst); \ + MOVL R14, 7*4+off+128(dst); \ + MOVL R14, 2*4+off+192(dst); \ + MOVL R14, 15*4+off+256(dst); \ + MOVL R14, 12*4+off+320(dst); \ + MOVL R14, 6*4+off+384(dst); \ + MOVL R14, 0*4+off+448(dst); \ + MOVL R14, 9*4+off+512(dst); \ + MOVL R14, 11*4+off+576(dst); \ + \ + MOVL R15, 11*4+off+0(dst); \ + MOVL R15, 0*4+off+64(dst); \ + MOVL R15, 12*4+off+128(dst); \ + MOVL R15, 7*4+off+192(dst); \ + MOVL R15, 8*4+off+256(dst); \ + MOVL R15, 14*4+off+320(dst); \ + MOVL R15, 2*4+off+384(dst); \ + MOVL R15, 5*4+off+448(dst); \ + MOVL R15, 1*4+off+512(dst); \ + MOVL R15, 13*4+off+576(dst); \ + SHRQ $32, R15; \ + MOVL R15, 15*4+off+0(dst); \ + MOVL R15, 6*4+off+64(dst); \ + MOVL R15, 3*4+off+128(dst); \ + MOVL R15, 11*4+off+192(dst); \ + MOVL R15, 7*4+off+256(dst); \ + MOVL R15, 10*4+off+320(dst); \ + MOVL R15, 5*4+off+384(dst); \ + MOVL R15, 9*4+off+448(dst); \ + MOVL R15, 4*4+off+512(dst); \ + MOVL R15, 8*4+off+576(dst) + +#define BLAKE2s_SSE2() \ + PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ + ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \ + ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8) + +#define BLAKE2s_SSSE3() \ + PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \ + ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \ + ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14) + +#define BLAKE2s_SSE4() \ + LOAD_MSG_SSE4(X8, X9, X10, X11, 
SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \ + LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \ + ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14) + +#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \ + MOVQ h, AX; \ + MOVQ c, BX; \ + MOVL flag, CX; \ + MOVQ blocks_base, SI; \ + MOVQ blocks_len, DX; \ + \ + MOVQ SP, BP; \ + ADDQ $15, BP; \ + ANDQ $~15, BP; \ + \ + MOVQ 0(BX), R9; \ + MOVQ R9, 0(BP); \ + MOVQ CX, 8(BP); \ + \ + MOVOU 0(AX), X0; \ + MOVOU 16(AX), X1; \ + MOVOU iv0<>(SB), X2; \ + MOVOU iv1<>(SB), X3; \ + \ + MOVOU counter<>(SB), X12; \ + MOVOU rol16<>(SB), X13; \ + MOVOU rol8<>(SB), X14; \ + MOVO 0(BP), X15; \ + \ + loop: \ + MOVO X0, X4; \ + MOVO X1, X5; \ + MOVO X2, X6; \ + MOVO X3, X7; \ + \ + PADDQ X12, X15; \ + PXOR X15, X7; \ + \ + BLAKE2s_FUNC(); \ + \ + PXOR X4, X0; \ + PXOR X5, X1; \ + PXOR X6, X0; \ + PXOR X7, X1; \ + \ + LEAQ 64(SI), SI; \ + SUBQ $64, DX; \ + JNE loop; \ + \ + MOVO X15, 0(BP); \ + MOVQ 0(BP), R9; \ + MOVQ R9, 0(BX); \ + \ + MOVOU X0, 0(AX); \ + MOVOU X1, 16(AX) + +// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment + HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2) + RET + +// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment + HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3) + RET + +// func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) +TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment + HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4) + RET diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2s_generic.go b/src/vendor/golang.org/x/crypto/blake2s/blake2s_generic.go new file mode 100644 index 00000000000..24a1ff22adc --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2s_generic.go @@ -0,0 +1,178 @@ +// Copyright 2016 The Go Authors.
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blake2s + +import ( + "math/bits" +) + +// the precomputed values for BLAKE2s +// there are 10 16-byte arrays - one for each round +// the entries are calculated from the sigma constants. +var precomputed = [10][16]byte{ + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, + {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4}, + {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, + {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13}, + {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9}, + {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, + {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, + {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5}, + {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0}, +} + +func hashBlocksGeneric(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) { + var m [16]uint32 + c0, c1 := c[0], c[1] + + for i := 0; i < len(blocks); { + c0 += BlockSize + if c0 < BlockSize { + c1++ + } + + v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7] + v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7] + v12 ^= c0 + v13 ^= c1 + v14 ^= flag + + for j := range m { + m[j] = uint32(blocks[i]) | uint32(blocks[i+1])<<8 | uint32(blocks[i+2])<<16 | uint32(blocks[i+3])<<24 + i += 4 + } + + for k := range precomputed { + s := &(precomputed[k]) + + v0 += m[s[0]] + v0 += v4 + v12 ^= v0 + v12 = bits.RotateLeft32(v12, -16) + v8 += v12 + v4 ^= v8 + v4 = bits.RotateLeft32(v4, -12) + v1 += m[s[1]] + v1 += v5 + v13 ^= v1 + v13 = bits.RotateLeft32(v13, -16) + v9 += v13 + v5 ^= v9 + v5 = bits.RotateLeft32(v5, -12) + v2 += m[s[2]] + v2 += v6 + v14 ^= v2 + v14 = bits.RotateLeft32(v14, -16) + v10 += v14 + v6 ^= v10 + v6 = bits.RotateLeft32(v6, -12) + v3 += m[s[3]] + v3 += v7 + v15 ^= v3 + v15 = bits.RotateLeft32(v15, -16) + v11 += v15 + v7 ^= v11 + v7 = bits.RotateLeft32(v7, -12) + + v0 += m[s[4]] + v0 += v4 + v12 ^= v0 + v12 = bits.RotateLeft32(v12, -8) + v8 += v12 + v4 ^= v8 + v4 = bits.RotateLeft32(v4, -7) + v1 += m[s[5]] + v1 += v5 + v13 ^= v1 + v13 = bits.RotateLeft32(v13, -8) + v9 += v13 + v5 ^= v9 + v5 = bits.RotateLeft32(v5, -7) + v2 += m[s[6]] + v2 += v6 + v14 ^= v2 + v14 = bits.RotateLeft32(v14, -8) + v10 += v14 + v6 ^= v10 + v6 = bits.RotateLeft32(v6, -7) + v3 += m[s[7]] + v3 += v7 + v15 ^= v3 + v15 = bits.RotateLeft32(v15, -8) + v11 += v15 + v7 ^= v11 + v7 = bits.RotateLeft32(v7, -7) + + v0 += m[s[8]] + v0 += v5 + v15 ^= v0 + v15 = bits.RotateLeft32(v15, -16) + v10 += v15 + v5 ^= v10 + v5 = bits.RotateLeft32(v5, -12) + v1 += m[s[9]] + v1 += v6 + v12 ^= v1 + v12 = bits.RotateLeft32(v12, -16) + v11 += v12 + v6 ^= v11 + v6 = bits.RotateLeft32(v6, -12) + v2 += m[s[10]] + v2 += v7 + v13 ^= v2 + v13 = bits.RotateLeft32(v13, -16) + v8 += v13 + v7 ^= v8 + v7 = bits.RotateLeft32(v7, -12) + v3 += m[s[11]] + v3 += v4 + v14 ^= v3 + v14 = bits.RotateLeft32(v14, -16) + v9 += v14 + v4 ^= v9 + v4 = bits.RotateLeft32(v4, -12) + + v0 += m[s[12]] + v0 += v5 + v15 ^= v0 + v15 = bits.RotateLeft32(v15, -8) + v10 += v15 + v5 ^= v10 + v5 = bits.RotateLeft32(v5, -7) + v1 += m[s[13]] + v1 += v6 + v12 ^= v1 + v12 = bits.RotateLeft32(v12, -8) + v11 += v12 + v6 ^= v11 + v6 = bits.RotateLeft32(v6, -7) + v2 += m[s[14]] + v2 += v7 + v13 ^= v2 + v13 = bits.RotateLeft32(v13, -8) + v8 += v13 + v7 ^= v8 + v7 = 
bits.RotateLeft32(v7, -7) + v3 += m[s[15]] + v3 += v4 + v14 ^= v3 + v14 = bits.RotateLeft32(v14, -8) + v9 += v14 + v4 ^= v9 + v4 = bits.RotateLeft32(v4, -7) + } + + h[0] ^= v0 ^ v8 + h[1] ^= v1 ^ v9 + h[2] ^= v2 ^ v10 + h[3] ^= v3 ^ v11 + h[4] ^= v4 ^ v12 + h[5] ^= v5 ^ v13 + h[6] ^= v6 ^ v14 + h[7] ^= v7 ^ v15 + } + c[0], c[1] = c0, c1 +} diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2s_ref.go b/src/vendor/golang.org/x/crypto/blake2s/blake2s_ref.go new file mode 100644 index 00000000000..799dba0c415 --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2s_ref.go @@ -0,0 +1,18 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build (!amd64 && !386) || !gc || purego +// +build !amd64,!386 !gc purego + +package blake2s + +var ( + useSSE4 = false + useSSSE3 = false + useSSE2 = false +) + +func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) { + hashBlocksGeneric(h, c, flag, blocks) +} diff --git a/src/vendor/golang.org/x/crypto/blake2s/blake2x.go b/src/vendor/golang.org/x/crypto/blake2s/blake2x.go new file mode 100644 index 00000000000..828749ff01d --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/blake2x.go @@ -0,0 +1,178 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package blake2s + +import ( + "encoding/binary" + "errors" + "io" +) + +// XOF defines the interface to hash functions that +// support arbitrary-length output. +type XOF interface { + // Write absorbs more data into the hash's state. It panics if called + // after Read. + io.Writer + + // Read reads more output from the hash. It returns io.EOF if the limit + // has been reached. + io.Reader + + // Clone returns a copy of the XOF in its current state. + Clone() XOF + + // Reset resets the XOF to its initial state. + Reset() +} + +// OutputLengthUnknown can be used as the size argument to NewXOF to indicate +// the length of the output is not known in advance. +const OutputLengthUnknown = 0 + +// magicUnknownOutputLength is a magic value for the output size that indicates +// an unknown number of output bytes. +const magicUnknownOutputLength = 65535 + +// maxOutputLength is the absolute maximum number of bytes to produce when the +// number of output bytes is unknown. +const maxOutputLength = (1 << 32) * 32 + +// NewXOF creates a new variable-output-length hash. The hash either produces a +// known number of bytes (1 <= size < 65535), or an unknown number of bytes +// (size == OutputLengthUnknown). In the latter case, an absolute limit of +// 128GiB applies. +// +// A non-nil key turns the hash into a MAC. The key must be between +// zero and 32 bytes long. +func NewXOF(size uint16, key []byte) (XOF, error) { + if len(key) > Size { + return nil, errKeySize + } + if size == magicUnknownOutputLength { + // 2^16-1 indicates an unknown number of bytes and thus isn't a + // valid length.
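+ // (An unbounded XOF is requested by passing OutputLengthUnknown, i.e. 0, + // which is mapped to this sentinel just below; an explicit 65535 is + // therefore rejected here.)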
+ return nil, errors.New("blake2s: XOF length too large") + } + if size == OutputLengthUnknown { + size = magicUnknownOutputLength + } + x := &xof{ + d: digest{ + size: Size, + keyLen: len(key), + }, + length: size, + } + copy(x.d.key[:], key) + x.Reset() + return x, nil +} + +type xof struct { + d digest + length uint16 + remaining uint64 + cfg, root, block [Size]byte + offset int + nodeOffset uint32 + readMode bool +} + +func (x *xof) Write(p []byte) (n int, err error) { + if x.readMode { + panic("blake2s: write to XOF after read") + } + return x.d.Write(p) +} + +func (x *xof) Clone() XOF { + clone := *x + return &clone +} + +func (x *xof) Reset() { + x.cfg[0] = byte(Size) + binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length + binary.LittleEndian.PutUint16(x.cfg[12:], x.length) // XOF length + x.cfg[15] = byte(Size) // inner hash size + + x.d.Reset() + x.d.h[3] ^= uint32(x.length) + + x.remaining = uint64(x.length) + if x.remaining == magicUnknownOutputLength { + x.remaining = maxOutputLength + } + x.offset, x.nodeOffset = 0, 0 + x.readMode = false +} + +func (x *xof) Read(p []byte) (n int, err error) { + if !x.readMode { + x.d.finalize(&x.root) + x.readMode = true + } + + if x.remaining == 0 { + return 0, io.EOF + } + + n = len(p) + if uint64(n) > x.remaining { + n = int(x.remaining) + p = p[:n] + } + + if x.offset > 0 { + blockRemaining := Size - x.offset + if n < blockRemaining { + x.offset += copy(p, x.block[x.offset:]) + x.remaining -= uint64(n) + return + } + copy(p, x.block[x.offset:]) + p = p[blockRemaining:] + x.offset = 0 + x.remaining -= uint64(blockRemaining) + } + + for len(p) >= Size { + binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset) + x.nodeOffset++ + + x.d.initConfig(&x.cfg) + x.d.Write(x.root[:]) + x.d.finalize(&x.block) + + copy(p, x.block[:]) + p = p[Size:] + x.remaining -= uint64(Size) + } + + if todo := len(p); todo > 0 { + if x.remaining < uint64(Size) { + x.cfg[0] = byte(x.remaining) + } + binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset) + x.nodeOffset++ + + x.d.initConfig(&x.cfg) + x.d.Write(x.root[:]) + x.d.finalize(&x.block) + + x.offset = copy(p, x.block[:todo]) + x.remaining -= uint64(todo) + } + + return +} + +func (d *digest) initConfig(cfg *[Size]byte) { + d.offset, d.c[0], d.c[1] = 0, 0, 0 + for i := range d.h { + d.h[i] = iv[i] ^ binary.LittleEndian.Uint32(cfg[i*4:]) + } +} diff --git a/src/vendor/golang.org/x/crypto/blake2s/register.go b/src/vendor/golang.org/x/crypto/blake2s/register.go new file mode 100644 index 00000000000..ef79ff3c67a --- /dev/null +++ b/src/vendor/golang.org/x/crypto/blake2s/register.go @@ -0,0 +1,22 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
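Before the register.go hunk: blake2x.go above implements BLAKE2Xs, where the root hash is finalized once and output blocks are then derived from it with an incrementing node offset in the config block. A minimal usage sketch of the XOF API it defines (the message and the 64-byte output length are arbitrary example values):

package main

import (
	"fmt"
	"io"

	"golang.org/x/crypto/blake2s"
)

func main() {
	// The requested length is baked into the parameter block, so a
	// 64-byte XOF is a different function from the 32-byte hash.
	x, err := blake2s.NewXOF(64, nil) // nil key: plain hash, not a MAC
	if err != nil {
		panic(err)
	}
	x.Write([]byte("hello world")) // absorb; must precede any Read

	out := make([]byte, 64)
	if _, err := io.ReadFull(x, out); err != nil { // squeeze; io.EOF past 64 bytes
		panic(err)
	}
	fmt.Printf("%x\n", out)
}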
+ +//go:build go1.9 +// +build go1.9 + +package blake2s + +import ( + "crypto" + "hash" +) + +func init() { + newHash256 := func() hash.Hash { + h, _ := New256(nil) + return h + } + + crypto.RegisterHash(crypto.BLAKE2s_256, newHash256) +} diff --git a/src/vendor/modules.txt b/src/vendor/modules.txt index c3d9ae6176a..d54470c3802 100644 --- a/src/vendor/modules.txt +++ b/src/vendor/modules.txt @@ -8,10 +8,22 @@ github.com/cloudflare/circl/hpke github.com/cloudflare/circl/internal/conv github.com/cloudflare/circl/internal/sha3 github.com/cloudflare/circl/kem +github.com/cloudflare/circl/kem/hybrid +github.com/cloudflare/circl/kem/kyber/kyber1024 +github.com/cloudflare/circl/kem/kyber/kyber512 +github.com/cloudflare/circl/kem/kyber/kyber768 github.com/cloudflare/circl/math github.com/cloudflare/circl/math/fp25519 github.com/cloudflare/circl/math/fp448 github.com/cloudflare/circl/math/mlsbset +github.com/cloudflare/circl/pke/kyber/internal/common +github.com/cloudflare/circl/pke/kyber/internal/common/params +github.com/cloudflare/circl/pke/kyber/kyber1024 +github.com/cloudflare/circl/pke/kyber/kyber1024/internal +github.com/cloudflare/circl/pke/kyber/kyber512 +github.com/cloudflare/circl/pke/kyber/kyber512/internal +github.com/cloudflare/circl/pke/kyber/kyber768 +github.com/cloudflare/circl/pke/kyber/kyber768/internal github.com/cloudflare/circl/pki github.com/cloudflare/circl/sign github.com/cloudflare/circl/sign/dilithium/internal/common @@ -26,8 +38,11 @@ github.com/cloudflare/circl/sign/eddilithium2 github.com/cloudflare/circl/sign/eddilithium3 github.com/cloudflare/circl/sign/schemes github.com/cloudflare/circl/simd/keccakf1600 +github.com/cloudflare/circl/xof # golang.org/x/crypto v0.6.0 ## explicit; go 1.17 +golang.org/x/crypto/blake2b +golang.org/x/crypto/blake2s golang.org/x/crypto/chacha20 golang.org/x/crypto/chacha20poly1305 golang.org/x/crypto/cryptobyte
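Finally, register.go above hooks BLAKE2s-256 into the standard library's crypto.Hash registry from an init function. A short sketch of what that registration enables for callers, assuming the usual non-vendored import path:

package main

import (
	"crypto"
	"fmt"

	// Imported for its side effect: init() registers crypto.BLAKE2s_256.
	_ "golang.org/x/crypto/blake2s"
)

func main() {
	if !crypto.BLAKE2s_256.Available() {
		panic("BLAKE2s-256 not registered")
	}
	h := crypto.BLAKE2s_256.New() // a generic hash.Hash, no direct package dependency
	h.Write([]byte("abc"))
	fmt.Printf("%x\n", h.Sum(nil))
}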