main
1//go:build amd64 && !purego
2// +build amd64,!purego
3
4#include "textflag.h"
5
6// Depends on circl/math/fp448 package
7#include "../../math/fp448/fp_amd64.h"
8#include "curve_amd64.h"
9
10// CTE_A24 is (A+2)/4 from Curve448
11#define CTE_A24 39082
12
13#define Size 56
14
15// multiplyA24Leg multiplies x times CTE_A24 and stores the folded result in z.
// z and x are memory operands; the macro reads seven 64-bit words at offsets
// 0..48 of x and writes seven 64-bit words at offsets 0..48 of z.
//
// Phase 1 (the seven MULQ lines): schoolbook multiplication by the one-word
// constant held in R15. The low seven words of the product end up in R8-R14
// and the eighth (overflow) word is left in DX.
//
// Phase 2 (first fold): the overflow word is added back into word 0 (R8) and,
// shifted left by 32 bits, into word 3 (R11). That is the fold pattern for a
// modulus of the form 2^448 - 2^224 - 1, since 448 = 7*64 and 224 = 3*64 + 32.
// NOTE(review): the modulus is inferred from this pattern and from the fp448
// include; confirm against the fp448 package.
// DX is cleared with MOVQ between the ADDQ and the ADCQ chain; a move leaves
// the carry flag intact, so the chain keeps propagating and the final
// ADCQ $0, DX captures the carry out of word 6 as 0 or 1.
//
// Phase 3 (second fold): that single carry is folded in the same way. No
// further carry is captured after this pass.
16// Uses: AX, DX, R8-R15, FLAGS
17// Instr: x86_64 (only MOVQ, MULQ, ADDQ, ADCQ and SHLQ appear; no cmov or adx instructions are used)
18#define multiplyA24Leg(z,x) \
19 MOVQ $CTE_A24, R15; \
20 MOVQ 0+x, AX; MULQ R15; MOVQ AX, R8; ;;;;;;;;;;;; MOVQ DX, R9; \
21 MOVQ 8+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
22 MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
23 MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
24 MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
25 MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
26 MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
27 MOVQ DX, AX; \
28 SHLQ $32, AX; \
29 ADDQ DX, R8; MOVQ $0, DX; \
30 ADCQ $0, R9; \
31 ADCQ $0, R10; \
32 ADCQ AX, R11; \
33 ADCQ $0, R12; \
34 ADCQ $0, R13; \
35 ADCQ $0, R14; \
36 ADCQ $0, DX; \
37 MOVQ DX, AX; \
38 SHLQ $32, AX; \
39 ADDQ DX, R8; \
40 ADCQ $0, R9; \
41 ADCQ $0, R10; \
42 ADCQ AX, R11; \
43 ADCQ $0, R12; \
44 ADCQ $0, R13; \
45 ADCQ $0, R14; \
46 MOVQ R8, 0+z; \
47 MOVQ R9, 8+z; \
48 MOVQ R10, 16+z; \
49 MOVQ R11, 24+z; \
50 MOVQ R12, 32+z; \
51 MOVQ R13, 40+z; \
52 MOVQ R14, 48+z;
53
54// multiplyA24Adx multiplies x times CTE_A24 and stores the folded result in z.
// z and x are memory operands; the macro reads seven 64-bit words at offsets
// 0..48 of x and writes seven 64-bit words at offsets 0..48 of z.
//
// Despite the "Adx" suffix, no ADCX/ADOX instruction appears in this macro;
// the only extension instruction is MULXQ (BMI2), as the Instr line states.
//
// Phase 1 (the MULXQ lines): DX holds the constant multiplier. Each MULXQ
// produces a low word in AX (R8 for the first) and a high word in the next
// accumulator register; the low words are chained into R9-R14 with one
// ADDQ/ADCQ sequence, which works because MULX does not modify the flags.
// The last MULXQ overwrites DX with the top high word, and ADCQ $0, DX adds
// the pending carry, leaving the overflow word in DX.
//
// Phase 2 and 3 (the two fold passes): the overflow word is added into word 0
// (R8) and, shifted left by 32 bits, into word 3 (R11); the carry out of word
// 6 is captured in DX (cleared with a flag-preserving MOVQ) and folded once
// more in the same way. This matches a modulus of the form
// 2^448 - 2^224 - 1.
// NOTE(review): the modulus is inferred from the fold pattern and the fp448
// include; confirm against the fp448 package.
55// Uses: AX, DX, R8-R14, FLAGS
56// Instr: x86_64, bmi2
57#define multiplyA24Adx(z,x) \
58 MOVQ $CTE_A24, DX; \
59 MULXQ 0+x, R8, R9; \
60 MULXQ 8+x, AX, R10; ADDQ AX, R9; \
61 MULXQ 16+x, AX, R11; ADCQ AX, R10; \
62 MULXQ 24+x, AX, R12; ADCQ AX, R11; \
63 MULXQ 32+x, AX, R13; ADCQ AX, R12; \
64 MULXQ 40+x, AX, R14; ADCQ AX, R13; \
65 MULXQ 48+x, AX, DX; ADCQ AX, R14; \
66 ;;;;;;;;;;;;;;;;;;;; ADCQ $0, DX; \
67 MOVQ DX, AX; \
68 SHLQ $32, AX; \
69 ADDQ DX, R8; MOVQ $0, DX; \
70 ADCQ $0, R9; \
71 ADCQ $0, R10; \
72 ADCQ AX, R11; \
73 ADCQ $0, R12; \
74 ADCQ $0, R13; \
75 ADCQ $0, R14; \
76 ADCQ $0, DX; \
77 MOVQ DX, AX; \
78 SHLQ $32, AX; \
79 ADDQ DX, R8; \
80 ADCQ $0, R9; \
81 ADCQ $0, R10; \
82 ADCQ AX, R11; \
83 ADCQ $0, R12; \
84 ADCQ $0, R13; \
85 ADCQ $0, R14; \
86 MOVQ R8, 0+z; \
87 MOVQ R9, 8+z; \
88 MOVQ R10, 16+z; \
89 MOVQ R11, 24+z; \
90 MOVQ R12, 32+z; \
91 MOVQ R13, 40+z; \
92 MOVQ R14, 48+z;
93
// mulA24Legacy expands multiplyA24Leg with the destination at 0(DI) and the
// source at 0(SI); both registers must already hold the element pointers.
94#define mulA24Legacy \
95 multiplyA24Leg(0(DI),0(SI))
// mulA24Bmi2Adx is the same operand binding (z at 0(DI), x at 0(SI)) for the
// MULXQ-based multiplyA24Adx variant.
96#define mulA24Bmi2Adx \
97 multiplyA24Adx(0(DI),0(SI))
98
99// func mulA24Amd64(z, x *fp448.Elt)
// mulA24Amd64 stores x times CTE_A24 in z, using either the legacy or the
// MULXQ-based macro body.
// Frame: no local stack ($0) and 16 bytes of arguments (two pointers).
100TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
 // Load the destination and source pointers into the registers that
 // mulA24Legacy and mulA24Bmi2Adx address through.
101 MOVQ z+0(FP), DI
102 MOVQ x+8(FP), SI
 // CHECK_BMI2ADX is not defined in this file (presumably it comes from the
 // included fp_amd64.h). It is given a label and the two alternative bodies.
 // NOTE(review): no RET is visible in this TEXT block, so the macro is
 // assumed to emit the return for both paths — confirm.
103 CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
104
105// func ladderStepAmd64(w *[5]fp448.Elt, b uint)
106// ladderStepAmd64 calculates a point addition and doubling as follows:
107// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
108// w = {x1,x2,z2,x3,z3} are five fp448.Elt of 56 bytes.
109// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
110// (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
// Frame: 336 bytes of locals = 2*Size (t0,t1) + 2*(2*Size) (b0,b1), and
// 16 bytes of arguments (w and b).
111TEXT ·ladderStepAmd64(SB),NOSPLIT,$336-16
112 // Parameters
 // The five elements of w are addressed at consecutive Size-byte offsets
 // from regWork; b is held in regMove.
113 #define regWork DI
114 #define regMove SI
115 #define x1 0*Size(regWork)
116 #define x2 1*Size(regWork)
117 #define z2 2*Size(regWork)
118 #define x3 3*Size(regWork)
119 #define z3 4*Size(regWork)
120 // Local variables
 // t0 and t1 occupy one Size-byte slot each; b0 and b1 start at 2*Size and
 // 4*Size, leaving 2*Size bytes for each of them.
121 #define t0 0*Size(SP)
122 #define t1 1*Size(SP)
123 #define b0 2*Size(SP)
124 #define b1 4*Size(SP)
125 MOVQ w+0(FP), regWork
126 MOVQ b+8(FP), regMove
 // ladderStepLeg and ladderStepBmi2Adx are not defined in this file
 // (presumably in curve_amd64.h) and are expected to use the names defined
 // above. NOTE(review): CHECK_BMI2ADX is assumed to pick one body at run time
 // and to emit the RET — confirm in fp_amd64.h.
127 CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
 // Drop the per-function names so that later TEXT blocks can redefine them.
128 #undef regWork
129 #undef regMove
130 #undef x1
131 #undef x2
132 #undef z2
133 #undef x3
134 #undef z3
135 #undef t0
136 #undef t1
137 #undef b0
138 #undef b1
139
140// func diffAddAmd64(work *[5]fp.Elt, swap uint)
// NOTE(review): the body below reads its arguments as w+0(FP) and b+8(FP),
// so the Go declaration presumably names them w and b rather than work and
// swap; confirm against the Go prototype (go vet's asmdecl check compares
// these names).
141// diffAddAmd64 calculates a differential point addition using a precomputed point.
142// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
143// work = {mu,x1,z1,x2,z2} are five fp448.Elt of 56 bytes, and
144// stack = (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
145// This is Equation 7 at https://eprint.iacr.org/2017/264.
// Frame: 224 bytes of locals = 2*(2*Size) for b0 and b1, and 16 bytes of
// arguments.
146TEXT ·diffAddAmd64(SB),NOSPLIT,$224-16
147 // Parameters
 // The first element of the work array (called mu in the header comment)
 // is addressed through the name ui.
148 #define regWork DI
149 #define regSwap SI
150 #define ui 0*Size(regWork)
151 #define x1 1*Size(regWork)
152 #define z1 2*Size(regWork)
153 #define x2 3*Size(regWork)
154 #define z2 4*Size(regWork)
155 // Local variables
156 #define b0 0*Size(SP)
157 #define b1 2*Size(SP)
158 MOVQ w+0(FP), regWork
159 MOVQ b+8(FP), regSwap
 // cswap is defined outside this file (presumably curve_amd64.h); it is
 // applied to the pairs (x1,x2) and (z1,z2) with regSwap as selector.
 // NOTE(review): presumably a constant-time conditional swap — confirm.
160 cswap(x1,x2,regSwap)
161 cswap(z1,z2,regSwap)
 // difAddLeg and difAddBmi2Adx are defined elsewhere and are expected to use
 // the names defined above. NOTE(review): CHECK_BMI2ADX is assumed to emit
 // the RET for both paths — confirm in fp_amd64.h.
162 CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
 // Drop the per-function names so that later TEXT blocks can redefine them.
163 #undef regWork
164 #undef regSwap
165 #undef ui
166 #undef x1
167 #undef z1
168 #undef x2
169 #undef z2
170 #undef b0
171 #undef b1
172
172// func doubleAmd64(x, z *fp448.Elt)
173// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
174// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
175// (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
// Frame: 336 bytes of locals = 2*Size (t0,t1) + 2*(2*Size) (b0,b1), and
// 16 bytes of arguments (the x and z pointers).
176TEXT ·doubleAmd64(SB),NOSPLIT,$336-16
177 // Parameters
 // x1 and z1 are the elements pointed to by DI and SI, which are loaded
 // from the x and z arguments below.
178 #define x1 0(DI)
179 #define z1 0(SI)
180 // Local variables
 // t0 and t1 occupy one Size-byte slot each; b0 and b1 start at 2*Size and
 // 4*Size, leaving 2*Size bytes for each of them.
181 #define t0 0*Size(SP)
182 #define t1 1*Size(SP)
183 #define b0 2*Size(SP)
184 #define b1 4*Size(SP)
185 MOVQ x+0(FP), DI
186 MOVQ z+8(FP), SI
 // doubleLeg and doubleBmi2Adx are defined elsewhere and are expected to use
 // the names defined above. NOTE(review): CHECK_BMI2ADX is assumed to pick
 // one body at run time and to emit the RET — confirm in fp_amd64.h.
187 CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
 // Drop the per-function names once the body has been expanded.
188 #undef x1
189 #undef z1
190 #undef t0
191 #undef t1
192 #undef b0
193 #undef b1