// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// CHECK_BMI2ADX triggers bmi2adx if supported,
// otherwise it falls back to legacy code.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
    CMPB ·hasBmi2Adx(SB), $0  \
    JE label                  \
    bmi2adx                   \
    RET                       \
    label:                    \
    legacy                    \
    RET
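
// The ·hasBmi2Adx(SB) flag is set from Go. A minimal sketch of how it
// could be populated, assuming the golang.org/x/sys/cpu package (where
// this variable is declared is an assumption, not this file's contract):
//
//   import "golang.org/x/sys/cpu"
//
//   var hasBmi2Adx = cpu.X86.HasBMI2 && cpu.X86.HasADX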

// cselect is a conditional move:
// if b=1: y is copied into x;
// if b=0: x keeps its value;
// if b is neither 0 nor 1: the result is undefined.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ  0+y, DX; CMOVQNE DX, AX; MOVQ AX,  0+x; \
    MOVQ  8+x, AX; MOVQ  8+y, DX; CMOVQNE DX, AX; MOVQ AX,  8+x; \
    MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
    MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x; \
    MOVQ 32+x, AX; MOVQ 32+y, DX; CMOVQNE DX, AX; MOVQ AX, 32+x; \
    MOVQ 40+x, AX; MOVQ 40+y, DX; CMOVQNE DX, AX; MOVQ AX, 40+x; \
    MOVQ 48+x, AX; MOVQ 48+y, DX; CMOVQNE DX, AX; MOVQ AX, 48+x;
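
// For reference, the constant-time semantics of cselect as a Go sketch.
// The [7]uint64 layout mirrors the seven 64-bit words moved above; the
// function name and signature are illustrative, not the package's API:
//
//   func cselect(x, y *[7]uint64, b uint64) {
//       mask := -b // all ones if b==1, zero if b==0
//       for i := range x {
//           x[i] = (x[i] &^ mask) | (y[i] & mask)
//       }
//   }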

// cswap is a conditional swap:
// if b=1: x,y <- y,x;
// if b=0: x,y keep their values;
// if b is neither 0 nor 1: the result is undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ AX, R8; MOVQ  0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  0+x; MOVQ DX,  0+y; \
    MOVQ  8+x, AX; MOVQ AX, R8; MOVQ  8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  8+x; MOVQ DX,  8+y; \
    MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
    MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y; \
    MOVQ 32+x, AX; MOVQ AX, R8; MOVQ 32+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 32+x; MOVQ DX, 32+y; \
    MOVQ 40+x, AX; MOVQ AX, R8; MOVQ 40+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 40+x; MOVQ DX, 40+y; \
    MOVQ 48+x, AX; MOVQ AX, R8; MOVQ 48+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 48+x; MOVQ DX, 48+y;
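
// The same mask idiom expresses cswap branch-free in Go: XOR-swapping
// under a mask touches both operands uniformly whatever b is (a sketch
// under the same illustrative limb layout as above):
//
//   func cswap(x, y *[7]uint64, b uint64) {
//       mask := -b
//       for i := range x {
//           t := mask & (x[i] ^ y[i])
//           x[i] ^= t
//           y[i] ^= t
//       }
//   }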

// additionLeg adds x and y and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define additionLeg(z,x,y) \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCQ 48+y, R14; \
    MOVQ   $0,  AX;  ADCQ   $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8; MOVQ  $0, AX; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8;  MOVQ  R8,  0+z; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ DX, R11;  MOVQ R11, 24+z; \
    ADCQ $0, R12;  MOVQ R12, 32+z; \
    ADCQ $0, R13;  MOVQ R13, 40+z; \
    ADCQ $0, R14;  MOVQ R14, 48+z;
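
// The two carry-folding rounds above rely on the Goldilocks prime
// p = 2^448 - 2^224 - 1: since 2^448 == 2^224 + 1 (mod p), a carry c out
// of the seventh limb is re-added at bit 0 and at bit 224, i.e. as c and
// as c<<32 into the fourth limb (the DX = AX<<32 trick). A hedged Go
// sketch of one folding round with math/bits, illustrative limb layout:
//
//   import "math/bits"
//
//   // foldCarry adds c*(2^224 + 1) to z and returns the outgoing carry.
//   func foldCarry(z *[7]uint64, c uint64) uint64 {
//       hi := c << 32 // c at bit 224 = bit 32 of limb 3
//       var k uint64
//       z[0], k = bits.Add64(z[0], c, 0)
//       z[1], k = bits.Add64(z[1], 0, k)
//       z[2], k = bits.Add64(z[2], 0, k)
//       z[3], k = bits.Add64(z[3], hi, k)
//       z[4], k = bits.Add64(z[4], 0, k)
//       z[5], k = bits.Add64(z[5], 0, k)
//       z[6], k = bits.Add64(z[6], 0, k)
//       return k // 0 or 1; the second round absorbs it completely
//   }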


// additionAdx adds x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, adx
#define additionAdx(z,x,y) \
    MOVL $32, R15; \
    XORL DX, DX; \
    MOVQ  0+x,  R8;  ADCXQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCXQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCXQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCXQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCXQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCXQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCXQ 48+y, R14; \
    ;;;;;;;;;;;;;;;  ADCXQ   DX,  DX; \
    XORL AX, AX; \
    ADCXQ DX,  R8; SHLXQ R15, DX, DX; \
    ADCXQ AX,  R9; \
    ADCXQ AX, R10; \
    ADCXQ DX, R11; \
    ADCXQ AX, R12; \
    ADCXQ AX, R13; \
    ADCXQ AX, R14; \
    ADCXQ AX,  AX; \
    XORL  DX,  DX; \
    ADCXQ AX,  R8;  MOVQ  R8,  0+z; SHLXQ R15, AX, AX; \
    ADCXQ DX,  R9;  MOVQ  R9,  8+z; \
    ADCXQ DX, R10;  MOVQ R10, 16+z; \
    ADCXQ AX, R11;  MOVQ R11, 24+z; \
    ADCXQ DX, R12;  MOVQ R12, 32+z; \
    ADCXQ DX, R13;  MOVQ R13, 40+z; \
    ADCXQ DX, R14;  MOVQ R14, 48+z;

// subtraction subtracts y from x and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64
#define subtraction(z,x,y) \
    MOVQ  0+x,  R8;  SUBQ  0+y,  R8; \
    MOVQ  8+x,  R9;  SBBQ  8+y,  R9; \
    MOVQ 16+x, R10;  SBBQ 16+y, R10; \
    MOVQ 24+x, R11;  SBBQ 24+y, R11; \
    MOVQ 32+x, R12;  SBBQ 32+y, R12; \
    MOVQ 40+x, R13;  SBBQ 40+y, R13; \
    MOVQ 48+x, R14;  SBBQ 48+y, R14; \
    MOVQ   $0,  AX;  SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8; MOVQ  $0, AX; \
    SBBQ $0,  R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8;  MOVQ  R8,  0+z; \
    SBBQ $0,  R9;  MOVQ  R9,  8+z; \
    SBBQ $0, R10;  MOVQ R10, 16+z; \
    SBBQ DX, R11;  MOVQ R11, 24+z; \
    SBBQ $0, R12;  MOVQ R12, 32+z; \
    SBBQ $0, R13;  MOVQ R13, 40+z; \
    SBBQ $0, R14;  MOVQ R14, 48+z;
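
// Subtraction mirrors the addition: a borrow b captured by SETCS is
// repaid by subtracting b*(2^224 + 1), again because 2^448 == 2^224 + 1
// (mod p). One of the two folding rounds, as a hedged Go sketch with
// the same illustrative limb layout:
//
//   func foldBorrow(z *[7]uint64, b uint64) uint64 {
//       hi := b << 32
//       var k uint64
//       z[0], k = bits.Sub64(z[0], b, 0)
//       z[1], k = bits.Sub64(z[1], 0, k)
//       z[2], k = bits.Sub64(z[2], 0, k)
//       z[3], k = bits.Sub64(z[3], hi, k)
//       z[4], k = bits.Sub64(z[4], 0, k)
//       z[5], k = bits.Sub64(z[5], 0, k)
//       z[6], k = bits.Sub64(z[6], 0, k)
//       return k
//   }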

// maddBmi2Adx multiplies x and y and accumulates in z
// Uses: AX, DX, R15, FLAGS
// Instr: x86_64, bmi2, adx
#define maddBmi2Adx(z,x,y,i,r0,r1,r2,r3,r4,r5,r6) \
    MOVQ   i+y, DX; XORL AX, AX; \
    MULXQ  0+x, AX, R8;  ADOXQ AX, r0;  ADCXQ R8, r1; MOVQ r0,i+z; \
    MULXQ  8+x, AX, r0;  ADOXQ AX, r1;  ADCXQ r0, r2; MOVQ $0, R8; \
    MULXQ 16+x, AX, r0;  ADOXQ AX, r2;  ADCXQ r0, r3; \
    MULXQ 24+x, AX, r0;  ADOXQ AX, r3;  ADCXQ r0, r4; \
    MULXQ 32+x, AX, r0;  ADOXQ AX, r4;  ADCXQ r0, r5; \
    MULXQ 40+x, AX, r0;  ADOXQ AX, r5;  ADCXQ r0, r6; \
    MULXQ 48+x, AX, r0;  ADOXQ AX, r6;  ADCXQ R8, r0; \
    ;;;;;;;;;;;;;;;;;;;  ADOXQ R8, r0;
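
// maddBmi2Adx interleaves two independent carry chains: MULXQ forms the
// 64x64->128 products without touching flags, ADCXQ propagates carries
// through CF only, and ADOXQ through OF only, so the chains never stall
// each other. One accumulation row in plain Go would look like this
// sketch (math/bits; slice-based and illustrative, not the package API):
//
//   func mulAddRow(acc []uint64, x []uint64, yi uint64) uint64 {
//       var carry uint64
//       for j := range x {
//           hi, lo := bits.Mul64(x[j], yi)
//           var c1, c2 uint64
//           lo, c1 = bits.Add64(lo, carry, 0)
//           acc[j], c2 = bits.Add64(acc[j], lo, 0)
//           carry = hi + c1 + c2 // cannot wrap: the row sum is < 2^128
//       }
//       return carry
//   }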

// integerMulAdx multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
    MOVL    $0,R15; \
    MOVQ   0+y, DX;  XORL AX, AX;  MOVQ $0, R8; \
    MULXQ  0+x, AX,  R9;  MOVQ  AX, 0+z; \
    MULXQ  8+x, AX, R10;  ADCXQ AX,  R9; \
    MULXQ 16+x, AX, R11;  ADCXQ AX, R10; \
    MULXQ 24+x, AX, R12;  ADCXQ AX, R11; \
    MULXQ 32+x, AX, R13;  ADCXQ AX, R12; \
    MULXQ 40+x, AX, R14;  ADCXQ AX, R13; \
    MULXQ 48+x, AX, R15;  ADCXQ AX, R14; \
    ;;;;;;;;;;;;;;;;;;;;  ADCXQ R8, R15; \
    maddBmi2Adx(z,x,y, 8, R9,R10,R11,R12,R13,R14,R15) \
    maddBmi2Adx(z,x,y,16,R10,R11,R12,R13,R14,R15, R9) \
    maddBmi2Adx(z,x,y,24,R11,R12,R13,R14,R15, R9,R10) \
    maddBmi2Adx(z,x,y,32,R12,R13,R14,R15, R9,R10,R11) \
    maddBmi2Adx(z,x,y,40,R13,R14,R15, R9,R10,R11,R12) \
    maddBmi2Adx(z,x,y,48,R14,R15, R9,R10,R11,R12,R13) \
    MOVQ R15,  56+z; \
    MOVQ  R9,  64+z; \
    MOVQ R10,  72+z; \
    MOVQ R11,  80+z; \
    MOVQ R12,  88+z; \
    MOVQ R13,  96+z; \
    MOVQ R14, 104+z;

// maddLegacy multiplies x and y and accumulates in z
// Uses: AX, DX, R15, FLAGS
// Instr: x86_64
#define maddLegacy(z,x,y,i) \
    MOVQ  i+y, R15; \
    MOVQ  0+x, AX; MULQ R15; MOVQ AX,  R8; ;;;;;;;;;;;; MOVQ DX,  R9; \
    MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
    ADDQ  0+i+z,  R8; MOVQ  R8,  0+i+z; \
    ADCQ  8+i+z,  R9; MOVQ  R9,  8+i+z; \
    ADCQ 16+i+z, R10; MOVQ R10, 16+i+z; \
    ADCQ 24+i+z, R11; MOVQ R11, 24+i+z; \
    ADCQ 32+i+z, R12; MOVQ R12, 32+i+z; \
    ADCQ 40+i+z, R13; MOVQ R13, 40+i+z; \
    ADCQ 48+i+z, R14; MOVQ R14, 48+i+z; \
    ADCQ     $0,  DX; MOVQ  DX, 56+i+z;

// integerMulLeg multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
    MOVQ  0+y, R15; \
    MOVQ  0+x, AX; MULQ R15; MOVQ AX, 0+z; ;;;;;;;;;;;; MOVQ DX,  R8; \
    MOVQ  8+x, AX; MULQ R15; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ  R8,  8+z; \
    MOVQ 16+x, AX; MULQ R15; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; MOVQ  R9, 16+z; \
    MOVQ 24+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; MOVQ R10, 24+z; \
    MOVQ 32+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; MOVQ R11, 32+z; \
    MOVQ 40+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; MOVQ R12, 40+z; \
    MOVQ 48+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX,56+z; MOVQ R13, 48+z; \
    maddLegacy(z,x,y, 8) \
    maddLegacy(z,x,y,16) \
    maddLegacy(z,x,y,24) \
    maddLegacy(z,x,y,32) \
    maddLegacy(z,x,y,40) \
    maddLegacy(z,x,y,48)
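
// Both multipliers compute the full 14-limb schoolbook product; the
// reduction happens separately in reduceFromDouble*. As a whole-product
// reference, a hedged Go sketch built on the row idea above (types are
// illustrative; z holds all 14 limbs of the 896-bit product):
//
//   func integerMul(z *[14]uint64, x, y *[7]uint64) {
//       var acc [14]uint64
//       for i := 0; i < 7; i++ {
//           var carry uint64
//           for j := 0; j < 7; j++ {
//               hi, lo := bits.Mul64(x[j], y[i])
//               var c1, c2 uint64
//               lo, c1 = bits.Add64(lo, carry, 0)
//               acc[i+j], c2 = bits.Add64(acc[i+j], lo, 0)
//               carry = hi + c1 + c2
//           }
//           acc[i+7] = carry
//       }
//       *z = acc
//   }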

// integerSqrLeg squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
    XORL R15, R15; \
    MOVQ  0+x, CX; \
    MOVQ   CX, AX; MULQ CX; MOVQ AX, 0+z; MOVQ DX, R8; \
    ADDQ   CX, CX; ADCQ $0, R15; \
    MOVQ  8+x, AX; MULQ CX; ADDQ AX,  R8; ADCQ $0, DX; MOVQ DX,  R9; MOVQ R8, 8+z; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ DX, R10; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
    \
    MOVQ  8+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ  AX, R9; ADCQ $0, DX; MOVQ R9,16+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 8+x, AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ  8+x, CX; ADCQ $0, R15; \
    MOVQ 16+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 24+z; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R9; \
    \
    MOVQ 16+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 32+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 16+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 16+x, CX; ADCQ $0, R15; \
    MOVQ 24+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 40+z; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; ADDQ R8, R13; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX,R10; \
    \
    MOVQ 24+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 48+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 24+x,AX; ADDQ AX, DX; ADCQ $0,  R9; MOVQ DX, R8; \
    ADDQ 24+x, CX; ADCQ $0, R15; \
    MOVQ 32+x, AX; MULQ CX; ADDQ AX, R14; ADCQ $0, DX; ADDQ R8, R14; ADCQ $0, DX; MOVQ DX, R8; MOVQ R14, 56+z; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; ADDQ R8,  R9; ADCQ $0, DX; MOVQ DX, R8; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX,R11; \
    \
    MOVQ 32+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX,  R9; ADCQ $0, DX; MOVQ R9, 64+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 32+x,AX; ADDQ AX, DX; ADCQ $0, R11; MOVQ DX, R8; \
    ADDQ 32+x, CX; ADCQ $0, R15; \
    MOVQ 40+x, AX; MULQ CX; ADDQ AX, R10; ADCQ $0, DX; ADDQ R8, R10; ADCQ $0, DX; MOVQ DX, R8; MOVQ R10, 72+z; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; ADDQ R8, R11; ADCQ $0, DX; MOVQ DX,R12; \
    \
    XORL R13, R13; \
    XORL R14, R14; \
    MOVQ 40+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R11; ADCQ $0, DX; MOVQ R11, 80+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 40+x,AX; ADDQ AX, DX; ADCQ $0, R13; MOVQ DX, R8; \
    ADDQ 40+x, CX; ADCQ $0, R15; \
    MOVQ 48+x, AX; MULQ CX; ADDQ AX, R12; ADCQ $0, DX; ADDQ R8, R12; ADCQ $0, DX; MOVQ DX, R8; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8, R13; ADCQ $0,R14; \
    \
    XORL   R9, R9; \
    MOVQ 48+x, CX; \
    MOVQ   CX, AX; ADDQ R15, CX; MOVQ $0, R15; ADCQ $0, R15; \
    ;;;;;;;;;;;;;; MULQ CX; ADDQ AX, R13; ADCQ $0, DX; MOVQ R13, 96+z; \
    MOVQ  R15, AX; NEGQ AX; ANDQ 48+x,AX; ADDQ AX, DX; ADCQ $0, R9; MOVQ DX, R8; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ADDQ R8,R14; ADCQ $0, R9; MOVQ R14, 104+z;
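
// The squaring macros halve the multiplication count by doubling each
// off-diagonal product x[i]*x[j] (i < j) instead of computing it twice:
// CX holds the doubled operand, the bit shifted out by the doubling is
// tracked in R15, and the NEGQ/ANDQ mask re-adds the operand into the
// product's high word when that bit was lost. A naive Go restatement of
// what is computed (sketch only, with an illustrative 14-limb accumulator):
//
//   func integerSqr(z *[14]uint64, x *[7]uint64) {
//       var acc [14]uint64
//       add128 := func(pos int, hi, lo uint64) {
//           var c uint64
//           acc[pos], c = bits.Add64(acc[pos], lo, 0)
//           acc[pos+1], c = bits.Add64(acc[pos+1], hi, c)
//           for k := pos + 2; c != 0; k++ {
//               acc[k], c = bits.Add64(acc[k], 0, c)
//           }
//       }
//       for i := 0; i < 7; i++ {
//           hi, lo := bits.Mul64(x[i], x[i])
//           add128(2*i, hi, lo) // diagonal term, once
//           for j := i + 1; j < 7; j++ {
//               hi, lo := bits.Mul64(x[i], x[j])
//               add128(i+j, hi, lo) // off-diagonal terms, twice
//               add128(i+j, hi, lo)
//           }
//       }
//       *z = acc
//   }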


// integerSqrAdx squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
    XORL R15, R15; \
    MOVQ  0+x, DX; \
    ;;;;;;;;;;;;;; MULXQ DX, AX, R8; MOVQ AX, 0+z; \
    ADDQ   DX, DX; ADCQ $0, R15; CLC; \
    MULXQ  8+x, AX,  R9; ADCXQ AX,  R8; MOVQ R8, 8+z; \
    MULXQ 16+x, AX, R10; ADCXQ AX,  R9; MOVQ $0, R8;\
    MULXQ 24+x, AX, R11; ADCXQ AX, R10; \
    MULXQ 32+x, AX, R12; ADCXQ AX, R11; \
    MULXQ 40+x, AX, R13; ADCXQ AX, R12; \
    MULXQ 48+x, AX, R14; ADCXQ AX, R13; \
    ;;;;;;;;;;;;;;;;;;;; ADCXQ R8, R14; \
    \
    MOVQ  8+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 8+x, R8; \
    ADDQ AX,  R9; MOVQ R9, 16+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R11; \
    ADDQ  8+x,  DX; \
    ADCQ   $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 16+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 24+z; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; MOVQ  $0, R10; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R10, R9; \
    \
    MOVQ 16+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 16+x, R8; \
    ADDQ AX, R11; MOVQ R11, 32+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R13; \
    ADDQ 16+x,  DX; \
    ADCQ   $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 24+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 40+z; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R13; ADOXQ CX, R14; MOVQ  $0, R12; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; \
    MULXQ 48+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R11,R10; \
    \
    MOVQ 24+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 24+x, R8; \
    ADDQ AX, R13; MOVQ R13, 48+z; \
    ADCQ CX,  R8; \
    ADCQ $0,  R9; \
    ADDQ 24+x,  DX; \
    ADCQ   $0, R15; \
    XORL R13, R13; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R14; \
    MULXQ 32+x, AX, CX; ADCXQ AX, R14; ADOXQ CX,  R9; MOVQ R14, 56+z; \
    MULXQ 40+x, AX, CX; ADCXQ AX,  R9; ADOXQ CX, R10; MOVQ  $0, R14; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R12,R11; \
    \
    MOVQ 32+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 32+x, R8; \
    ADDQ AX,  R9; MOVQ R9, 64+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R11; \
    ADDQ 32+x,  DX; \
    ADCQ   $0, R15; \
    XORL R9, R9; ;;;;;;;;;;;;;;;;;;;;; ADOXQ R8, R10; \
    MULXQ 40+x, AX, CX; ADCXQ AX, R10; ADOXQ CX, R11; MOVQ R10, 72+z; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R11; ADOXQ CX, R12; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R13,R12; \
    \
    MOVQ 40+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 40+x, R8; \
    ADDQ AX, R11; MOVQ R11, 80+z; \
    ADCQ CX,  R8; \
    ADCQ $0, R13; \
    ADDQ 40+x,  DX; \
    ADCQ   $0, R15; \
    XORL R11, R11; ;;;;;;;;;;;;;;;;;;; ADOXQ R8, R12; \
    MULXQ 48+x, AX, CX; ADCXQ AX, R12; ADOXQ CX, R13; MOVQ R12, 88+z; \
    ;;;;;;;;;;;;;;;;;;; ADCXQ R14,R13; \
    \
    MOVQ 48+x, DX; \
    MOVQ   DX, AX; ADDQ R15, DX; MOVQ $0, R15; ADCQ  $0, R15; \
    MULXQ AX,  AX, CX; \
    MOVQ R15,  R8; NEGQ R8; ANDQ 48+x, R8; \
    XORL R10, R10; ;;;;;;;;;;;;;; ADOXQ CX, R14; \
    ;;;;;;;;;;;;;; ADCXQ AX, R13; ;;;;;;;;;;;;;; MOVQ R13, 96+z; \
    ;;;;;;;;;;;;;; ADCXQ R8, R14; MOVQ R14, 104+z;

// reduceFromDoubleLeg computes z = x modulo p, with z < 2^448, and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
    /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
    MOVQ 80+x,AX; MOVQ AX,R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8,R10; \
    \
    MOVQ $0,R14; \
    MOVQ 104+x,R13; SHLQ $1,R13,R14; \
    MOVQ  96+x,R12; SHLQ $1,R12,R13; \
    MOVQ  88+x,R11; SHLQ $1,R11,R12; \
    MOVQ  72+x, R9; SHLQ $1,R10,R11; \
    MOVQ  64+x, R8; SHLQ $1,R10; \
    MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
    MOVQ  56+x,R15; \
    \
    ADDQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
    ADCQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
    ADCQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
    ADCQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
    ADCQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
    ADCQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
    ADCQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
    ADCQ   $0,R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
    MOVQ R10, AX; \
    SHRQ $32,R11,R10; \
    SHRQ $32,R12,R11; \
    SHRQ $32,R13,R12; \
    SHRQ $32,R15,R13; \
    SHRQ $32, R8,R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    ADDQ  0+z,R10; \
    ADCQ  8+z,R11; \
    ADCQ 16+z,R12; \
    ADCQ 24+z,R13; \
    ADCQ 32+z,R15; \
    ADCQ 40+z, R8; \
    ADCQ 48+z, R9; \
    ADCQ   $0,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    ADDQ R14,R10; MOVQ  $0,R14; \
    ADCQ  $0,R11; \
    ADCQ  $0,R12; \
    ADCQ  AX,R13; \
    ADCQ  $0,R15; \
    ADCQ  $0, R8; \
    ADCQ  $0, R9; \
    ADCQ  $0,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32,AX; \
    ADDQ R14,R10; MOVQ R10, 0+z; \
    ADCQ  $0,R11; MOVQ R11, 8+z; \
    ADCQ  $0,R12; MOVQ R12,16+z; \
    ADCQ  AX,R13; MOVQ R13,24+z; \
    ADCQ  $0,R15; MOVQ R15,32+z; \
    ADCQ  $0, R8; MOVQ  R8,40+z; \
    ADCQ  $0, R9; MOVQ  R9,48+z;
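
// reduceFromDouble* folds the high half C = x >> 448 of the 14-limb
// product back into the low half using 2^448 == 2^224 + 1 (mod p), so
// x == low + C + C*2^224; the C*2^224 contribution is what the SHLQ/SHRQ
// $32 limb-pair sequences assemble. The identity itself can be checked
// with math/big (a verification sketch, not production code):
//
//   p := new(big.Int).Lsh(big.NewInt(1), 448)
//   p.Sub(p, new(big.Int).Lsh(big.NewInt(1), 224))
//   p.Sub(p, big.NewInt(1)) // p = 2^448 - 2^224 - 1
//   r := new(big.Int).Lsh(big.NewInt(1), 448)
//   r.Mod(r, p) // r == 2^224 + 1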

// reduceFromDoubleAdx computes z = x modulo p, with z < 2^448, and stores it in z
// Uses: AX, R8-R15, FLAGS
// Instr: x86_64, adx
#define reduceFromDoubleAdx(z,x) \
    /* (   ,2C13,2C12,2C11,2C10|C10,C9,C8, C7) + (C6,...,C0) */ \
    /* (r14, r13, r12, r11,     r10,r9,r8,r15) */ \
    MOVQ 80+x,AX; MOVQ AX,R10; \
    MOVQ $0xFFFFFFFF00000000, R8; \
    ANDQ R8,R10; \
    \
    MOVQ $0,R14; \
    MOVQ 104+x,R13; SHLQ $1,R13,R14; \
    MOVQ  96+x,R12; SHLQ $1,R12,R13; \
    MOVQ  88+x,R11; SHLQ $1,R11,R12; \
    MOVQ  72+x, R9; SHLQ $1,R10,R11; \
    MOVQ  64+x, R8; SHLQ $1,R10; \
    MOVQ $0xFFFFFFFF,R15; ANDQ R15,AX; ORQ AX,R10; \
    MOVQ  56+x,R15; \
    \
    XORL AX,AX; \
    ADCXQ  0+x,R15; MOVQ R15, 0+z; MOVQ  56+x,R15; \
    ADCXQ  8+x, R8; MOVQ  R8, 8+z; MOVQ  64+x, R8; \
    ADCXQ 16+x, R9; MOVQ  R9,16+z; MOVQ  72+x, R9; \
    ADCXQ 24+x,R10; MOVQ R10,24+z; MOVQ  80+x,R10; \
    ADCXQ 32+x,R11; MOVQ R11,32+z; MOVQ  88+x,R11; \
    ADCXQ 40+x,R12; MOVQ R12,40+z; MOVQ  96+x,R12; \
    ADCXQ 48+x,R13; MOVQ R13,48+z; MOVQ 104+x,R13; \
    ADCXQ   AX,R14; \
    /* (c10c9,c9c8,c8c7,c7c13,c13c12,c12c11,c11c10) + (c6,...,c0) */ \
    /* (   r9,  r8, r15,  r13,   r12,   r11,   r10) */ \
    MOVQ R10, AX; \
    SHRQ $32,R11,R10; \
    SHRQ $32,R12,R11; \
    SHRQ $32,R13,R12; \
    SHRQ $32,R15,R13; \
    SHRQ $32, R8,R15; \
    SHRQ $32, R9, R8; \
    SHRQ $32, AX, R9; \
    \
    XORL AX,AX; \
    ADCXQ  0+z,R10; \
    ADCXQ  8+z,R11; \
    ADCXQ 16+z,R12; \
    ADCXQ 24+z,R13; \
    ADCXQ 32+z,R15; \
    ADCXQ 40+z, R8; \
    ADCXQ 48+z, R9; \
    ADCXQ   AX,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14,R10; MOVQ $0,R14; \
    ADCXQ R14,R11; \
    ADCXQ R14,R12; \
    ADCXQ  AX,R13; \
    ADCXQ R14,R15; \
    ADCXQ R14, R8; \
    ADCXQ R14, R9; \
    ADCXQ R14,R14; \
    /* ( c7) + (c6,...,c0) */ \
    /* (r14) */ \
    MOVQ R14, AX; SHLQ $32, AX; \
    CLC; \
    ADCXQ R14,R10; MOVQ R10, 0+z; MOVQ $0,R14; \
    ADCXQ R14,R11; MOVQ R11, 8+z; \
    ADCXQ R14,R12; MOVQ R12,16+z; \
    ADCXQ  AX,R13; MOVQ R13,24+z; \
    ADCXQ R14,R15; MOVQ R15,32+z; \
    ADCXQ R14, R8; MOVQ  R8,40+z; \
    ADCXQ R14, R9; MOVQ  R9,48+z;

// addSub computes two operations at once: x,y = x+y, x-y
// Uses: AX, DX, R8-R15, FLAGS
#define addSub(x,y) \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    MOVQ 32+x, R12;  ADCQ 32+y, R12; \
    MOVQ 40+x, R13;  ADCQ 40+y, R13; \
    MOVQ 48+x, R14;  ADCQ 48+y, R14; \
    MOVQ   $0,  AX;  ADCQ   $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8; MOVQ  $0, AX; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ DX, R11; \
    ADCQ $0, R12; \
    ADCQ $0, R13; \
    ADCQ $0, R14; \
    ADCQ $0,  AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    ADDQ AX,  R8;  MOVQ  0+x,AX; MOVQ  R8,  0+x; MOVQ AX,  R8; \
    ADCQ $0,  R9;  MOVQ  8+x,AX; MOVQ  R9,  8+x; MOVQ AX,  R9; \
    ADCQ $0, R10;  MOVQ 16+x,AX; MOVQ R10, 16+x; MOVQ AX, R10; \
    ADCQ DX, R11;  MOVQ 24+x,AX; MOVQ R11, 24+x; MOVQ AX, R11; \
    ADCQ $0, R12;  MOVQ 32+x,AX; MOVQ R12, 32+x; MOVQ AX, R12; \
    ADCQ $0, R13;  MOVQ 40+x,AX; MOVQ R13, 40+x; MOVQ AX, R13; \
    ADCQ $0, R14;  MOVQ 48+x,AX; MOVQ R14, 48+x; MOVQ AX, R14; \
    SUBQ  0+y,  R8; \
    SBBQ  8+y,  R9; \
    SBBQ 16+y, R10; \
    SBBQ 24+y, R11; \
    SBBQ 32+y, R12; \
    SBBQ 40+y, R13; \
    SBBQ 48+y, R14; \
    MOVQ   $0,  AX;  SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8; MOVQ  $0, AX; \
    SBBQ $0,  R9; \
    SBBQ $0, R10; \
    SBBQ DX, R11; \
    SBBQ $0, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SETCS AX; \
    MOVQ AX,  DX; \
    SHLQ $32, DX; \
    SUBQ AX,  R8;  MOVQ  R8,  0+y; \
    SBBQ $0,  R9;  MOVQ  R9,  8+y; \
    SBBQ $0, R10;  MOVQ R10, 16+y; \
    SBBQ DX, R11;  MOVQ R11, 24+y; \
    SBBQ $0, R12;  MOVQ R12, 32+y; \
    SBBQ $0, R13;  MOVQ R13, 40+y; \
    SBBQ $0, R14;  MOVQ R14, 48+y;
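
// addSub fuses the ladder's butterfly step. The MOV shuffle in the
// middle stores the reduced sum into x while rescuing the original x
// into registers, so the subtraction still sees the old value. In Go
// terms, assuming Add and Sub with the semantics of the macros above
// (hypothetical helpers, named here only for illustration):
//
//   func addSub(x, y *[7]uint64) {
//       t := *x
//       Add(x, &t, y) // x = old_x + y (weakly reduced mod p)
//       Sub(y, &t, y) // y = old_x - y (weakly reduced mod p)
//   }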