main
Raw Download raw file
  1//go:build amd64 && !purego
  2// +build amd64,!purego
  3
  4#include "textflag.h"
  5
  6// Depends on circl/math/fp448 package
  7#include "../../math/fp448/fp_amd64.h"
  8#include "curve_amd64.h"
  9
 10// CTE_A24 is (A+2)/4 from Curve448
   // With the Curve448 coefficient A = 156326 (RFC 7748) this is
   // (156326 + 2) / 4 = 39082. It is the multiplier used by the
   // multiplyA24* macros in this file.
 11#define CTE_A24 39082
 12
   // Size is the byte length of one field element as addressed in this file:
   // seven 64-bit limbs (the multiply macros touch offsets 0..48).
 13#define Size 56
 14
 15// multiplyA24Leg multiplies x times CTE_A24 and stores in z
   //
   // x and z are memory operands that each address seven consecutive 64-bit
   // limbs (offsets 0..48). All seven loads from x happen before the first
   // store to z, so the two operands may refer to the same element.
   //
   // Steps:
   //  1. R15 = CTE_A24. Each limb of x is multiplied by R15 with MULQ
   //     (DX:AX = AX * R15); the low half is added into the running limb and
   //     the high half, plus the carry, seeds the next limb. The product ends
   //     up in R8..R14 with the overflow word (weight 2^448) in DX.
   //  2. The overflow word is folded back into the result: it is added to
   //     limb 0 (R8) and, shifted left by 32 bits, to limb 3 (R11), i.e. at
   //     weights 2^0 and 2^224. This is consistent with reduction modulo the
   //     Curve448 prime p = 2^448 - 2^224 - 1, where 2^448 = 2^224 + 1 (mod p).
   //     DX is cleared with MOVQ rather than XORQ because MOVQ leaves the
   //     carry flag produced by the preceding ADDQ intact.
   //  3. The carry out of step 2 is collected in DX and folded back once more
   //     in the same way.
   //  4. R8..R14 are written to z.
   //
   // The macro body contains no branches.
 16// Uses: AX, DX, R8-R15, FLAGS
 17// Instr: x86_64
 18#define multiplyA24Leg(z,x) \
 19    MOVQ $CTE_A24, R15; \
 20    MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;;  MOVQ DX,  R9; \
 21    MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX;  MOVQ DX, R10; \
 22    MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX;  MOVQ DX, R11; \
 23    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX;  MOVQ DX, R12; \
 24    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX;  MOVQ DX, R13; \
 25    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX;  MOVQ DX, R14; \
 26    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
 27    MOVQ DX,  AX; \
 28    SHLQ $32, AX; \
 29    ADDQ DX,  R8; MOVQ $0, DX; \
 30    ADCQ $0,  R9; \
 31    ADCQ $0, R10; \
 32    ADCQ AX, R11; \
 33    ADCQ $0, R12; \
 34    ADCQ $0, R13; \
 35    ADCQ $0, R14; \
 36    ADCQ $0,  DX; \
 37    MOVQ DX,  AX; \
 38    SHLQ $32, AX; \
 39    ADDQ DX,  R8; \
 40    ADCQ $0,  R9; \
 41    ADCQ $0, R10; \
 42    ADCQ AX, R11; \
 43    ADCQ $0, R12; \
 44    ADCQ $0, R13; \
 45    ADCQ $0, R14; \
 46    MOVQ  R8,  0+z; \
 47    MOVQ  R9,  8+z; \
 48    MOVQ R10, 16+z; \
 49    MOVQ R11, 24+z; \
 50    MOVQ R12, 32+z; \
 51    MOVQ R13, 40+z; \
 52    MOVQ R14, 48+z;
 53
 54// multiplyA24Adx multiplies x times CTE_A24 and stores in z
   //
   // Same contract as the legacy variant: x and z are memory operands that
   // each address seven 64-bit limbs (offsets 0..48), and every load from x
   // precedes the first store to z.
   //
   // Steps:
   //  1. DX = CTE_A24 is the implicit multiplicand of MULXQ. Because MULXQ
   //     does not modify the flags, a single ADDQ/ADCQ carry chain runs across
   //     all seven multiplications. The last MULXQ overwrites DX with the high
   //     half of its product, which is safe because no multiplication follows;
   //     after the trailing ADCQ, DX holds the overflow word (weight 2^448).
   //  2. The overflow word is added to limb 0 (R8) and, shifted left by
   //     32 bits, to limb 3 (R11), i.e. at weights 2^0 and 2^224. This is
   //     consistent with reduction modulo the Curve448 prime
   //     p = 2^448 - 2^224 - 1. DX is cleared with MOVQ so that the carry
   //     flag from the preceding ADDQ survives.
   //  3. The carry out of step 2 is collected in DX and folded back once more.
   //  4. R8..R14 are written to z.
   //
   // Despite the macro name, only MULX (BMI2) is used here; no ADCX/ADOX
   // instruction appears in the body.
 55// Uses: AX, DX, R8-R14, FLAGS
 56// Instr: x86_64, bmi2
 57#define multiplyA24Adx(z,x) \
 58    MOVQ $CTE_A24, DX; \
 59    MULXQ  0+x, R8,  R9; \
 60    MULXQ  8+x, AX, R10;  ADDQ AX,  R9; \
 61    MULXQ 16+x, AX, R11;  ADCQ AX, R10; \
 62    MULXQ 24+x, AX, R12;  ADCQ AX, R11; \
 63    MULXQ 32+x, AX, R13;  ADCQ AX, R12; \
 64    MULXQ 40+x, AX, R14;  ADCQ AX, R13; \
 65    MULXQ 48+x, AX,  DX;  ADCQ AX, R14; \
 66    ;;;;;;;;;;;;;;;;;;;;  ADCQ $0,  DX; \
 67    MOVQ DX,  AX; \
 68    SHLQ $32, AX; \
 69    ADDQ DX,  R8; MOVQ $0, DX; \
 70    ADCQ $0,  R9; \
 71    ADCQ $0, R10; \
 72    ADCQ AX, R11; \
 73    ADCQ $0, R12; \
 74    ADCQ $0, R13; \
 75    ADCQ $0, R14; \
 76    ADCQ $0,  DX; \
 77    MOVQ DX,  AX; \
 78    SHLQ $32, AX; \
 79    ADDQ DX,  R8; \
 80    ADCQ $0,  R9; \
 81    ADCQ $0, R10; \
 82    ADCQ AX, R11; \
 83    ADCQ $0, R12; \
 84    ADCQ $0, R13; \
 85    ADCQ $0, R14; \
 86    MOVQ  R8,  0+z; \
 87    MOVQ  R9,  8+z; \
 88    MOVQ R10, 16+z; \
 89    MOVQ R11, 24+z; \
 90    MOVQ R12, 32+z; \
 91    MOVQ R13, 40+z; \
 92    MOVQ R14, 48+z;
 93
 94#define mulA24Legacy \
 95    multiplyA24Leg(0(DI),0(SI))
 96#define mulA24Bmi2Adx \
 97    multiplyA24Adx(0(DI),0(SI))
 98
 99// func mulA24Amd64(z, x *fp448.Elt)
   // mulA24Amd64 stores x * CTE_A24 in z.
   //
   // The frame is $0-16: no local stack space, and 16 bytes of arguments
   // (the two pointers). z is loaded into DI and x into SI, which are the
   // registers hard-wired into the argument-less wrappers mulA24Legacy and
   // mulA24Bmi2Adx defined just above.
   //
   // NOTE(review): CHECK_BMI2ADX comes from the included fp448 header and is
   // not visible here. It presumably selects the third argument when the CPU
   // supports BMI2/ADX and the second otherwise, uses the first argument as a
   // label prefix, and emits the RET for both paths — confirm in the header.
100TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
101    MOVQ z+0(FP), DI
102    MOVQ x+8(FP), SI
103    CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
104
105// func ladderStepAmd64(w *[5]fp448.Elt, b uint)
106// ladderStepAmd64 calculates a point addition and doubling as follows:
107// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
108//    w    = {x1,x2,z2,x3,z3} are five fp448.Elt of 56 bytes.
109//  stack  = (t0,t1) are two fp.Elt of fp.Size bytes, and
110//           (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
   // Frame: 2*56 bytes for t0,t1 plus 2*112 bytes for b0,b1 = 336 bytes of
   // locals, and 16 bytes of arguments (pointer w and word b).
   //
   // NOTE(review): the arithmetic is not in this file. ladderStepLeg and
   // ladderStepBmi2Adx are presumably defined in curve_amd64.h and refer to
   // the names #defined below (regWork, regMove, x1..z3, t0, t1, b0, b1), so
   // those names must not be changed here without checking the header.
   // How b is consumed (regMove) is likewise decided by those macros.
111TEXT ·ladderStepAmd64(SB),NOSPLIT,$336-16
112    // Parameters
       // regWork holds w; the five elements are laid out back to back,
       // Size bytes apart. regMove holds b.
113    #define regWork DI
114    #define regMove SI
115    #define x1 0*Size(regWork)
116    #define x2 1*Size(regWork)
117    #define z2 2*Size(regWork)
118    #define x3 3*Size(regWork)
119    #define z3 4*Size(regWork)
120    // Local variables
       // t0 and t1 occupy one Size slot each; b0 and b1 occupy two slots each,
       // hence the offsets 0, 1, 2 and 4 (in units of Size).
121    #define t0 0*Size(SP)
122    #define t1 1*Size(SP)
123    #define b0 2*Size(SP)
124    #define b1 4*Size(SP)
125    MOVQ w+0(FP), regWork
126    MOVQ b+8(FP), regMove
127    CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
       // Release the local names so the following functions can redefine
       // them with different meanings.
128    #undef regWork
129    #undef regMove
130    #undef x1
131    #undef x2
132    #undef z2
133    #undef x3
134    #undef z3
135    #undef t0
136    #undef t1
137    #undef b0
138    #undef b1
139
140// func diffAddAmd64(w *[5]fp.Elt, b uint)
   // (Parameter names follow the w+0(FP) and b+8(FP) references in the body:
   // w is the work array, b is the swap word. NOTE(review): confirm they
   // match the Go declaration, which is not visible here.)
141// diffAddAmd64 calculates a differential point addition using a precomputed point.
142// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
143//    w    = {mu,x1,z1,x2,z2} are five fp448.Elt of 56 bytes, and
144//   stack = (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
145// This is Equation 7 at https://eprint.iacr.org/2017/264.
   // Frame: 2*112 bytes for b0,b1 = 224 bytes of locals, and 16 bytes of
   // arguments.
   //
   // NOTE(review): cswap, difAddLeg and difAddBmi2Adx are not defined in this
   // file (presumably curve_amd64.h). cswap is presumably a conditional swap
   // of its first two operands controlled by the third; the dif* macros
   // presumably refer to the names #defined below — confirm in the header
   // before renaming anything.
146TEXT ·diffAddAmd64(SB),NOSPLIT,$224-16
147    // Parameters
       // ui is the element called mu in the header comment (index 0 of w).
148    #define regWork DI
149    #define regSwap SI
150    #define ui 0*Size(regWork)
151    #define x1 1*Size(regWork)
152    #define z1 2*Size(regWork)
153    #define x2 3*Size(regWork)
154    #define z2 4*Size(regWork)
155    // Local variables
       // b0 and b1 each span two Size slots.
156    #define b0 0*Size(SP)
157    #define b1 2*Size(SP)
158    MOVQ w+0(FP), regWork
159    MOVQ b+8(FP), regSwap
       // Both coordinate pairs are passed through cswap with the same
       // control word before the addition is performed.
160    cswap(x1,x2,regSwap)
161    cswap(z1,z2,regSwap)
162    CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
       // Release the local names so the following function can redefine them.
163    #undef regWork
164    #undef regSwap
165    #undef ui
166    #undef x1
167    #undef z1
168    #undef x2
169    #undef z2
170    #undef b0
171    #undef b1
172
173// func doubleAmd64(x, z *fp448.Elt)
174// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
175//  stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
176//          (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
   // Frame: 2*56 bytes for t0,t1 plus 2*112 bytes for b0,b1 = 336 bytes of
   // locals, and 16 bytes of arguments (the two pointers).
   //
   // NOTE(review): doubleLeg and doubleBmi2Adx are not defined in this file
   // (presumably curve_amd64.h) and presumably refer to the names #defined
   // below — confirm in the header before renaming anything.
177TEXT ·doubleAmd64(SB),NOSPLIT,$336-16
178    // Parameters
       // x is addressed through DI and z through SI.
179    #define x1 0(DI)
180    #define z1 0(SI)
181    // Local variables
       // t0 and t1 occupy one Size slot each; b0 and b1 occupy two slots each.
182    #define t0 0*Size(SP)
183    #define t1 1*Size(SP)
184    #define b0 2*Size(SP)
185    #define b1 4*Size(SP)
186    MOVQ x+0(FP), DI
187    MOVQ z+8(FP), SI
188    CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
       // Release the local names defined above.
189    #undef x1
190    #undef z1
191    #undef t0
192    #undef t1
193    #undef b0
194    #undef b1