[EC] Unify point doubling for P-256/384/521 (#1567)

Implement and use a single version of point doubling for implementations of NIST curves P-384, P-521, and Fiat-crypto based implementation of P-256. The change does not affect performance. Point addition will be unified in a subsequent change. I verified the performance was not affected on Graviton 3, Intel, and M1 CPUs. Example for M1: ``` Before Did 2882750 EC POINT P-384 dbl operations in 1000082us (2882513.6 ops/sec) Did 1600000 EC POINT P-384 add operations in 1000497us (1599205.2 ops/sec) Did 7051 EC POINT P-384 mul operations in 1078289us (6539.1 ops/sec) Did 28000 EC POINT P-384 mul base operations in 1000115us (27996.8 ops/sec) Did 5632 EC POINT P-384 mul public operations in 1062456us (5300.9 ops/sec) Did 2685500 EC POINT P-521 dbl operations in 1000037us (2685400.6 ops/sec) Did 1435000 EC POINT P-521 add operations in 1000129us (1434814.9 ops/sec) Did 4928 EC POINT P-521 mul operations in 1055318us (4669.7 ops/sec) Did 19000 EC POINT P-521 mul base operations in 1022199us (18587.4 ops/sec) Did 3850 EC POINT P-521 mul public operations in 1036809us (3713.3 ops/sec) After: Did 2888250 EC POINT P-384 dbl operations in 1000028us (2888169.1 ops/sec) Did 1593000 EC POINT P-384 add operations in 1000405us (1592355.1 ops/sec) Did 6875 EC POINT P-384 mul operations in 1054301us (6520.9 ops/sec) Did 28000 EC POINT P-384 mul base operations in 1000818us (27977.1 ops/sec) Did 5555 EC POINT P-384 mul public operations in 1056370us (5258.6 ops/sec) Did 2775250 EC POINT P-521 dbl operations in 1000021us (2775191.7 ops/sec) Did 1435000 EC POINT P-521 add operations in 1000085us (1434878.0 ops/sec) Did 4840 EC POINT P-521 mul operations in 1044164us (4635.3 ops/sec) Did 19000 EC POINT P-521 mul base operations in 1027887us (18484.5 ops/sec) Did 3883 EC POINT P-521 mul public operations in 1051447us (3693.0 ops/sec) ```
aws · May 20, 2024 · fc06ecb · fc06ecb
1 parent a83bcb5
commit fc06ecb
Show file tree

Hide file tree

Showing 9 changed files with 244 additions and 253 deletions.
diff --git a/crypto/fipsmodule/bcm.c b/crypto/fipsmodule/bcm.c
@@ -28,7 +28,6 @@
 // to control the order. $b section will place bcm in between the start/end markers
 // which are in $a and $z.
 #if defined(BORINGSSL_FIPS) && defined(OPENSSL_WINDOWS)
-
 #pragma code_seg(".fipstx$b")
 #pragma data_seg(".fipsda$b")
 #pragma const_seg(".fipsco$b")
@@ -93,6 +92,7 @@
 #include "ec/ec.c"
 #include "ec/ec_key.c"
 #include "ec/ec_montgomery.c"
+#include "ec/ec_nistp.c"
 #include "ec/felem.c"
 #include "ec/oct.c"
 #include "ec/p224-64.c"

diff --git a/crypto/fipsmodule/ec/ec_nistp.c b/crypto/fipsmodule/ec/ec_nistp.c
@@ -0,0 +1,112 @@
+// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// In this file we will implement elliptic curve point operations for
+// NIST curves P-256, P-384, and P-521. The idea is to implement the operations
+// in a generic way such that the code can be reused instead of having
+// a separate implementation for each of the curves. We implement:
+//   1. point addition,
+//   2. point doubling,
+//   3. scalar multiplication of a base point,
+//   4. scalar multiplication of an arbitrary point,
+//   5. scalar multiplication of a base and an arbitrary point.
+//
+// Matrix of what has been done so far:
+// 
+// | op | P-521 | P-384 | P-256 |
+// |----------------------------|
+// | 1. |       |       |       |
+// | 2. |   x   |   x   |   x*  |
+// | 3. |       |       |       |
+// | 4. |       |       |       |
+// | 5. |       |       |       |
+//  * For P-256, only the Fiat-crypto implementation in p256.c is replaced. 
+
+#include "ec_nistp.h"
+
+// Some of the functions below need temporary field element variables.
+// To avoid dynamic allocation we define the ec_nistp_felem type to have the
+// maximum size possible (currently that of the P-521 curve). The values are
+// hard-coded for the moment; this will be fixed when we migrate the whole
+// P-521 implementation to ec_nistp.c.
+#if defined(EC_NISTP_USE_64BIT_LIMB)
+#define NISTP_FELEM_MAX_NUM_OF_LIMBS (9)  // limb count used with 64-bit limbs
+#else
+#define NISTP_FELEM_MAX_NUM_OF_LIMBS (19)  // limb count used with 32-bit limbs
+#endif
+typedef ec_nistp_felem_limb ec_nistp_felem[NISTP_FELEM_MAX_NUM_OF_LIMBS];
+
+// Group operations
+// ----------------
+//
+// Building on top of the field operations we have the operations on the
+// elliptic curve group itself. Points on the curve are represented in Jacobian
+// coordinates.
+//
+// ec_nistp_point_double calculates 2*(x_in, y_in, z_in)
+//
+// The method is based on:
+//   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+// for which there is a Coq transcription and correctness proof:
+//   <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
+//   <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
+//
+// However, we slightly changed the computation for efficiency (see the full
+// explanation within the function body), which makes the Coq proof above
+// not applicable to our implementation.
+// TODO(awslc): Write a Coq correctness proof for our version of the algorithm.
+//
+// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed;
+// while x_out == y_in is not (maybe this works, but it's not tested).
+void ec_nistp_point_double(const ec_nistp_felem_meth *ctx,
+                           ec_nistp_felem_limb *x_out,
+                           ec_nistp_felem_limb *y_out,
+                           ec_nistp_felem_limb *z_out,
+                           const ec_nistp_felem_limb *x_in,
+                           const ec_nistp_felem_limb *y_in,
+                           const ec_nistp_felem_limb *z_in) {
+  ec_nistp_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
+  // delta = z^2 (all arithmetic goes through the field methods in ctx)
+  ctx->sqr(delta, z_in);
+  // gamma = y^2
+  ctx->sqr(gamma, y_in);
+  // beta = x*gamma
+  ctx->mul(beta, x_in, gamma);
+
+  // alpha = 3*(x-delta)*(x+delta)
+  ctx->sub(ftmp, x_in, delta);   // ftmp  = x - delta
+  ctx->add(ftmp2, x_in, delta);  // ftmp2 = x + delta; last read of x_in
+
+  ctx->add(tmptmp, ftmp2, ftmp2);  // tmptmp = 2*(x + delta)
+  ctx->add(ftmp2, ftmp2, tmptmp);  // ftmp2  = 3*(x + delta)
+  ctx->mul(alpha, ftmp, ftmp2);    // alpha  = (x - delta) * 3*(x + delta)
+
+  // x' = alpha^2 - 8*beta
+  ctx->sqr(x_out, alpha);  // first write to x_out; x_in is no longer needed
+  ctx->add(fourbeta, beta, beta);          // fourbeta = 2*beta (intermediate)
+  ctx->add(fourbeta, fourbeta, fourbeta);  // fourbeta = 4*beta
+  ctx->add(tmptmp, fourbeta, fourbeta);    // tmptmp   = 8*beta
+  ctx->sub(x_out, x_out, tmptmp);
+
+  // z' = (y + z)^2 - gamma - delta
+  // The following calculation differs from the Coq proof cited above.
+  // The proof is for:
+  //   add(delta, gamma, delta);
+  //   add(ftmp, y_in, z_in);
+  //   square(z_out, ftmp);
+  //   sub(z_out, z_out, delta);
+  // Our operations sequence is a bit more efficient because it saves us
+  // a certain number of conditional moves.
+  ctx->add(ftmp, y_in, z_in);  // last read of y_in and z_in
+  ctx->sqr(z_out, ftmp);       // first write to z_out
+  ctx->sub(z_out, z_out, gamma);  // gamma still holds y^2 here
+  ctx->sub(z_out, z_out, delta);
+
+  // y' = alpha*(4*beta - x') - 8*gamma^2
+  ctx->sub(y_out, fourbeta, x_out);  // y_out = 4*beta - x'; first write to y_out
+  ctx->add(gamma, gamma, gamma);     // gamma = 2*y^2
+  ctx->sqr(gamma, gamma);            // gamma = 4*y^4
+  ctx->mul(y_out, alpha, y_out);     // y_out = alpha*(4*beta - x')
+  ctx->add(gamma, gamma, gamma);     // gamma = 8*y^4, i.e. 8*(y^2)^2
+  ctx->sub(y_out, y_out, gamma);
+}
diff --git a/crypto/fipsmodule/ec/ec_nistp.h b/crypto/fipsmodule/ec/ec_nistp.h
@@ -0,0 +1,65 @@
+// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+#ifndef EC_NISTP_H
+#define EC_NISTP_H
+
+#include <openssl/target.h>
+
+#include <stdint.h>
+
+// We have two implementations of the field arithmetic for NIST curves:
+//   - Fiat-crypto
+//   - s2n-bignum
+// Both Fiat-crypto and s2n-bignum implementations are formally verified.
+// Fiat-crypto implementation is fully portable C code, while s2n-bignum
+// implements the operations in assembly for x86_64 and aarch64 platforms.
+// If (1) x86_64 (without MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) or aarch64, (2) linux
+// or apple, and (3) OPENSSL_NO_ASM is not set, the s2n-bignum path is used.
+#if !defined(OPENSSL_NO_ASM) &&                                                \
+    (defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) &&                      \
+    ((defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)) || \
+     defined(OPENSSL_AARCH64))
+#  define EC_NISTP_USE_S2N_BIGNUM
+#  define EC_NISTP_USE_64BIT_LIMB
+#else
+// Fiat-crypto has 64-bit and 32-bit versions; BORINGSSL_HAS_UINT128 picks 64.
+#  if defined(BORINGSSL_HAS_UINT128)
+#    define EC_NISTP_USE_64BIT_LIMB
+#  endif
+#endif
+
+#if defined(EC_NISTP_USE_64BIT_LIMB)
+typedef uint64_t ec_nistp_felem_limb;
+#else
+typedef uint32_t ec_nistp_felem_limb;
+#endif
+
+// ec_nistp_felem_meth holds pointers to the field arithmetic functions of a
+// specific curve: add (c = a + b), sub (c = a - b), mul (c = a * b) and
+// sqr (c = a^2). It is meant to be used in higher level functions like this:
+//   void point_double(const ec_nistp_felem_meth *ctx, ...) {
+//     ctx->add(...);
+//     ctx->mul(...);
+//   }
+// This makes the functions reusable between different curves by simply
+// providing an appropriate methods object.
+typedef struct {
+  void (*add)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a, const ec_nistp_felem_limb *b);
+  void (*sub)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a, const ec_nistp_felem_limb *b);
+  void (*mul)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a, const ec_nistp_felem_limb *b);
+  void (*sqr)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a);
+} ec_nistp_felem_meth;
+
+const ec_nistp_felem_meth *p256_felem_methods(void);
+const ec_nistp_felem_meth *p384_felem_methods(void);
+const ec_nistp_felem_meth *p521_felem_methods(void);
+
+void ec_nistp_point_double(const ec_nistp_felem_meth *ctx,
+                           ec_nistp_felem_limb *x_out,
+                           ec_nistp_felem_limb *y_out,
+                           ec_nistp_felem_limb *z_out,
+                           const ec_nistp_felem_limb *x_in,
+                           const ec_nistp_felem_limb *y_in,
+                           const ec_nistp_felem_limb *z_in);
+#endif // EC_NISTP_H
+
diff --git a/crypto/fipsmodule/ec/make_tables.go b/crypto/fipsmodule/ec/make_tables.go
@@ -392,7 +392,7 @@ func writeP384Table(path string) error {
 // is based on the generation method in:
 // https://gitlab.com/nisec/ecckiila/-/blob/master/main.py#L296
 
-#if defined(P384_USE_64BIT_LIMBS_FELEM)`
+#if defined(EC_NISTP_USE_64BIT_LIMB)`
 
 	table_def_str := fmt.Sprintf("static const p384_felem p384_g_pre_comp[%d][%d][2] = ", num_subtables, pts_per_subtable)
 
@@ -462,7 +462,7 @@ func writeP521Table(path string) error {
 // is based on the generation method in:
 // https://gitlab.com/nisec/ecckiila/-/blob/master/main.py#L296
 
-#if defined(P521_USE_S2N_BIGNUM_FIELD_ARITH)`
+#if defined(EC_NISTP_USE_S2N_BIGNUM)`
 
 	table_def_str := fmt.Sprintf("static const p521_felem p521_g_pre_comp[%d][%d][2] = ", num_subtables, pts_per_subtable)
 
@@ -472,7 +472,7 @@ func writeP521Table(path string) error {
 	if err := writeTables(w, curve, tables, writeU64, nil); err != nil {
 		return err
 	}
-	if _, err := io.WriteString(w, ";\n#else\n#if defined(P521_USE_64BIT_LIMBS_FELEM)\n" + table_def_str); err != nil {
+	if _, err := io.WriteString(w, ";\n#else\n#if defined(EC_NISTP_USE_64BIT_LIMB)\n" + table_def_str); err != nil {
 		return err
 	}
 	// P-521 Fiat-crypto implementation for 64-bit systems represents a field

diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
@@ -30,6 +30,7 @@
 #include "../../internal.h"
 #include "../delocate.h"
 #include "./internal.h"
+#include "ec_nistp.h"
 
 #if defined(BORINGSSL_HAS_UINT128)
 #define BORINGSSL_NISTP256_64BIT 1
@@ -166,73 +167,20 @@ static void fiat_p256_inv_square(fiat_p256_felem out,
   fiat_p256_square(out, ret);  // 2^256 - 2^224 + 2^192 + 2^96 - 2^2
 }
 
-// Group operations
-// ----------------
-//
-// Building on top of the field operations we have the operations on the
-// elliptic curve group itself. Points on the curve are represented in Jacobian
-// coordinates.
-//
-// Both operations were transcribed to Coq and proven to correspond to naive
-// implementations using Affine coordinates, for all suitable fields.  In the
-// Coq proofs, issues of constant-time execution and memory layout (aliasing)
-// conventions were not considered. Specification of affine coordinates:
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Spec/WeierstrassCurve.v#L28>
-// As a sanity check, a proof that these points form a commutative group:
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/AffineProofs.v#L33>
-
-// fiat_p256_point_double calculates 2*(x_in, y_in, z_in)
-//
-// The method is taken from:
-//   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
-//
-// Coq transcription and correctness proof:
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
-// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
-//
-// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
-// while x_out == y_in is not (maybe this works, but it's not tested).
-static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out,
+DEFINE_METHOD_FUNCTION(ec_nistp_felem_meth, p256_felem_methods) {  // `out` is presumably supplied by the macro
+    out->add = fiat_p256_add;     // field addition
+    out->sub = fiat_p256_sub;     // field subtraction
+    out->mul = fiat_p256_mul;     // field multiplication
+    out->sqr = fiat_p256_square;  // field squaring
+}
+
+static void fiat_p256_point_double(fiat_p256_felem x_out,
+                                   fiat_p256_felem y_out,
                                    fiat_p256_felem z_out,
                                    const fiat_p256_felem x_in,
                                    const fiat_p256_felem y_in,
                                    const fiat_p256_felem z_in) {
-  fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
-  // delta = z^2
-  fiat_p256_square(delta, z_in);
-  // gamma = y^2
-  fiat_p256_square(gamma, y_in);
-  // beta = x*gamma
-  fiat_p256_mul(beta, x_in, gamma);
-
-  // alpha = 3*(x-delta)*(x+delta)
-  fiat_p256_sub(ftmp, x_in, delta);
-  fiat_p256_add(ftmp2, x_in, delta);
-
-  fiat_p256_add(tmptmp, ftmp2, ftmp2);
-  fiat_p256_add(ftmp2, ftmp2, tmptmp);
-  fiat_p256_mul(alpha, ftmp, ftmp2);
-
-  // x' = alpha^2 - 8*beta
-  fiat_p256_square(x_out, alpha);
-  fiat_p256_add(fourbeta, beta, beta);
-  fiat_p256_add(fourbeta, fourbeta, fourbeta);
-  fiat_p256_add(tmptmp, fourbeta, fourbeta);
-  fiat_p256_sub(x_out, x_out, tmptmp);
-
-  // z' = (y + z)^2 - gamma - delta
-  fiat_p256_add(delta, gamma, delta);
-  fiat_p256_add(ftmp, y_in, z_in);
-  fiat_p256_square(z_out, ftmp);
-  fiat_p256_sub(z_out, z_out, delta);
-
-  // y' = alpha*(4*beta - x') - 8*gamma^2
-  fiat_p256_sub(y_out, fourbeta, x_out);
-  fiat_p256_add(gamma, gamma, gamma);
-  fiat_p256_square(gamma, gamma);
-  fiat_p256_mul(y_out, alpha, y_out);
-  fiat_p256_add(gamma, gamma, gamma);
-  fiat_p256_sub(y_out, y_out, gamma);
+  ec_nistp_point_double(p256_felem_methods(), x_out, y_out, z_out, x_in, y_in, z_in);
 }
 
 // fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2)