main
1//go:build amd64 && !purego
2// +build amd64,!purego
3
4#include "textflag.h"
5
6// Depends on circl/math/fp25519 package
7#include "../../math/fp25519/fp_amd64.h"
8#include "curve_amd64.h"
9
10// CTE_A24 is the curve constant (A+2)/4 of Curve25519.
// With the value below this corresponds to A = 4*121666 - 2 = 486662.
11#define CTE_A24 121666
12
// Size is the byte length of one field element (four 64-bit limbs). It is
// used below as the stride between elements of the work array and between
// the temporaries kept on the stack.
13#define Size 32
14
15// multiplyA24Leg multiplies x by CTE_A24 and stores the result in z.
// z and x are memory operands; each addresses four 64-bit limbs at byte
// offsets 0, 8, 16 and 24, least-significant limb first.
//
// Steps:
//  1. Each limb of x is multiplied by CTE_A24 with MULQ (AX is reloaded
//     every time because MULQ overwrites AX and DX). The high half of each
//     product is added into the next limb with an ADDQ/ADCQ chain, leaving
//     a five-word product in R8, R9, R10, R11, DX.
//  2. The top word (DX) is multiplied by 38 and added to the low limb,
//     and the carry is propagated through R9, R10, R11.
//  3. If that propagation carries out of the top limb, 38 is added to the
//     low limb once more. CMOVQCS picks 38 or 0 without a branch, and
//     MOVQ $0 is used to clear DX because it leaves the carry flag intact.
//
// All four limbs of x are loaded before the first store to z.
// NOTE(review): no comparison against 2^255-19 is visible here, so the
// stored value is presumably only partially reduced -- confirm with callers.
16// Uses: AX, DX, R8-R13, FLAGS
17// Instr: x86_64, cmov
18#define multiplyA24Leg(z,x) \
19 MOVL $CTE_A24, AX; MULQ 0+x; MOVQ AX, R8; MOVQ DX, R9; \
20 MOVL $CTE_A24, AX; MULQ 8+x; MOVQ AX, R12; MOVQ DX, R10; \
21 MOVL $CTE_A24, AX; MULQ 16+x; MOVQ AX, R13; MOVQ DX, R11; \
22 MOVL $CTE_A24, AX; MULQ 24+x; \
23 ADDQ R12, R9; \
24 ADCQ R13, R10; \
25 ADCQ AX, R11; \
26 ADCQ $0, DX; \
27 MOVL $38, AX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
28 IMULQ AX, DX; \
29 ADDQ DX, R8; \
30 ADCQ $0, R9; MOVQ R9, 8+z; \
31 ADCQ $0, R10; MOVQ R10, 16+z; \
32 ADCQ $0, R11; MOVQ R11, 24+z; \
33 MOVQ $0, DX; \
34 CMOVQCS AX, DX; \
35 ADDQ DX, R8; MOVQ R8, 0+z;
36
37// multiplyA24Adx multiplies x by CTE_A24 and stores the result in z.
// z and x are memory operands; each addresses four 64-bit limbs at byte
// offsets 0, 8, 16 and 24, least-significant limb first.
//
// Steps:
//  1. CTE_A24 is loaded once into DX, the implicit multiplicand of MULXQ.
//     MULXQ does not modify the flags, so the ADDQ/ADCQ carry chain that
//     adds each high half into the next limb is interleaved with the
//     multiplications. The five-word product ends up in R8, R9, R10, R11,
//     R12.
//  2. The top word (R12) is multiplied by 38 and added to the low limb,
//     and the carry is propagated through R9, R10, R11.
//  3. If that propagation carries out of the top limb, 38 is added to the
//     low limb once more. CMOVQCS picks 38 or 0 without a branch, and
//     MOVQ $0 is used to clear R12 because it leaves the carry flag intact.
//
// Despite the name, this macro uses only MULX (BMI2); no ADCX/ADOX
// instruction appears in it. The run of semicolons is a sequence of empty
// statements kept for column alignment.
// All four limbs of x are loaded before the first store to z.
38// Uses: AX, DX, R8-R12, FLAGS
39// Instr: x86_64, cmov, bmi2
40#define multiplyA24Adx(z,x) \
41 MOVQ $CTE_A24, DX; \
42 MULXQ 0+x, R8, R10; \
43 MULXQ 8+x, R9, R11; ADDQ R10, R9; \
44 MULXQ 16+x, R10, AX; ADCQ R11, R10; \
45 MULXQ 24+x, R11, R12; ADCQ AX, R11; \
46 ;;;;;;;;;;;;;;;;;;;;; ADCQ $0, R12; \
47 MOVL $38, DX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
48 IMULQ DX, R12; \
49 ADDQ R12, R8; \
50 ADCQ $0, R9; MOVQ R9, 8+z; \
51 ADCQ $0, R10; MOVQ R10, 16+z; \
52 ADCQ $0, R11; MOVQ R11, 24+z; \
53 MOVQ $0, R12; \
54 CMOVQCS DX, R12; \
55 ADDQ R12, R8; MOVQ R8, 0+z;
56
// mulA24Legacy and mulA24Bmi2Adx instantiate the two multiply-by-A24 macros
// with the destination at 0(DI) and the source at 0(SI). They take no
// arguments so that they can be passed by name to CHECK_BMI2ADX.
57#define mulA24Legacy \
58 multiplyA24Leg(0(DI),0(SI))
59#define mulA24Bmi2Adx \
60 multiplyA24Adx(0(DI),0(SI))
61
62// func mulA24Amd64(z, x *fp255.Elt)
// mulA24Amd64 stores in z the product of x and CTE_A24.
// It uses no local stack frame ($0) and takes 16 bytes of arguments:
// the pointer z at offset 0 and the pointer x at offset 8.
63TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
 // DI = destination pointer, SI = source pointer, as expected by
 // mulA24Legacy and mulA24Bmi2Adx.
64 MOVQ z+0(FP), DI
65 MOVQ x+8(FP), SI
 // CHECK_BMI2ADX is defined in an included header (not visible here).
 // From its arguments it takes a label and the two alternative code paths.
 // NOTE(review): presumably it selects between them at run time according
 // to CPU support and also emits the RET, since none appears in this
 // block -- confirm against the header.
66 CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
67
68
69// func ladderStepAmd64(w *[5]fp255.Elt, b uint)
70// ladderStepAmd64 calculates a point addition and doubling as follows:
71// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
72// w = (x1,x2,z2,x3,z3) are five fp255.Elt of 32 bytes each.
73// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
74// (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
// The 192-byte frame is laid out as t0 at 0, t1 at 32, b0 at 64 and b1 at
// 128, i.e. 2*32 + 2*64 bytes.
// b is loaded into SI for the ladder-step macros, which are defined outside
// this file. NOTE(review): presumably it is the bit that drives a
// conditional move/swap inside those macros -- confirm against their
// definition.
75TEXT ·ladderStepAmd64(SB),NOSPLIT,$192-16
76 // Parameters
77 #define regWork DI
78 #define regMove SI
 // The five field elements are consecutive in w, Size bytes apart.
79 #define x1 0*Size(regWork)
80 #define x2 1*Size(regWork)
81 #define z2 2*Size(regWork)
82 #define x3 3*Size(regWork)
83 #define z3 4*Size(regWork)
84 // Local variables
 // t0 and t1 occupy Size bytes each; b0 and b1 occupy 2*Size bytes each.
85 #define t0 0*Size(SP)
86 #define t1 1*Size(SP)
87 #define b0 2*Size(SP)
88 #define b1 4*Size(SP)
89 MOVQ w+0(FP), regWork
90 MOVQ b+8(FP), regMove
 // ladderStepLeg and ladderStepBmi2Adx are defined outside this file and
 // refer to the names defined above. NOTE(review): CHECK_BMI2ADX presumably
 // dispatches between them at run time and emits the RET -- confirm.
91 CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
 // Release the names so the following routines can redefine them with
 // different offsets.
92 #undef regWork
93 #undef regMove
94 #undef x1
95 #undef x2
96 #undef z2
97 #undef x3
98 #undef z3
99 #undef t0
100 #undef t1
101 #undef b0
102 #undef b1
103
104// func diffAddAmd64(w *[5]fp255.Elt, b uint)
105// diffAddAmd64 calculates a differential point addition using a precomputed point.
106// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2).
107// w = (mu,x1,z1,x2,z2) are five fp.Elt (mu is named ui in the defines below), and
108// stack = (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
// The 128-byte frame is laid out as b0 at 0 and b1 at 64.
// b is loaded into SI and passed to cswap for the pairs (x1,x2) and (z1,z2)
// before the addition itself runs.
109TEXT ·diffAddAmd64(SB),NOSPLIT,$128-16
110 // Parameters
111 #define regWork DI
112 #define regSwap SI
 // The five field elements are consecutive in w, Size bytes apart.
113 #define ui 0*Size(regWork)
114 #define x1 1*Size(regWork)
115 #define z1 2*Size(regWork)
116 #define x2 3*Size(regWork)
117 #define z2 4*Size(regWork)
118 // Local variables
 // b0 and b1 occupy 2*Size bytes each.
119 #define b0 0*Size(SP)
120 #define b1 2*Size(SP)
121 MOVQ w+0(FP), regWork
122 MOVQ b+8(FP), regSwap
 // cswap is defined outside this file. NOTE(review): presumably it
 // exchanges its first two operands when regSwap is set, without
 // branching -- confirm against its definition.
123 cswap(x1,x2,regSwap)
124 cswap(z1,z2,regSwap)
 // difAddLeg and difAddBmi2Adx are defined outside this file and refer to
 // the names defined above. NOTE(review): CHECK_BMI2ADX presumably
 // dispatches between them at run time and emits the RET -- confirm.
125 CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
 // Release the names so the following routine can redefine them.
126 #undef regWork
127 #undef regSwap
128 #undef ui
129 #undef x1
130 #undef z1
131 #undef x2
132 #undef z2
133 #undef b0
134 #undef b1
135
136// func doubleAmd64(x, z *fp255.Elt)
137// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
138// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
139// (b0,b1) are two double-precision fp.Elt of 2*fp.Size bytes.
// The 192-byte frame is laid out as t0 at 0, t1 at 32, b0 at 64 and b1 at
// 128, i.e. 2*32 + 2*64 bytes.
140TEXT ·doubleAmd64(SB),NOSPLIT,$192-16
141 // Parameters
 // Unlike the routines above, the two coordinates arrive as separate
 // pointers: x is addressed through DI and z through SI.
142 #define x1 0(DI)
143 #define z1 0(SI)
144 // Local variables
 // t0 and t1 occupy Size bytes each; b0 and b1 occupy 2*Size bytes each.
145 #define t0 0*Size(SP)
146 #define t1 1*Size(SP)
147 #define b0 2*Size(SP)
148 #define b1 4*Size(SP)
149 MOVQ x+0(FP), DI
150 MOVQ z+8(FP), SI
 // doubleLeg and doubleBmi2Adx are defined outside this file and refer to
 // the names defined above. NOTE(review): CHECK_BMI2ADX presumably
 // dispatches between them at run time and emits the RET -- confirm.
151 CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
 // Release the names defined for this routine.
152 #undef x1
153 #undef z1
154 #undef t0
155 #undef t1
156 #undef b0
157 #undef b1
157 #undef b1