main
1//go:build amd64 && !purego
2// +build amd64,!purego
3
4#include "textflag.h"
5#include "fp_amd64.h"
6
// func cmovAmd64(x, y *Elt, n uint)
//
// Thin wrapper: loads x, y and n into DI, SI and BX and expands the cselect
// macro on them. cselect is not defined in this file (presumably it lives in
// fp_amd64.h); its exact selection rule and clobbered registers should be
// confirmed there.
// Frame: no locals, 24 bytes of arguments.
TEXT ·cmovAmd64(SB), NOSPLIT, $0-24
	MOVQ n+16(FP), BX // BX = n, passed to cselect by value
	MOVQ y+8(FP), SI  // SI = y
	MOVQ x+0(FP), DI  // DI = x
	cselect(0(DI),0(SI),BX)
	RET
14
// func cswapAmd64(x, y *Elt, n uint)
//
// Thin wrapper: loads x, y and n into DI, SI and BX and expands the cswap
// macro on them. cswap is not defined in this file (presumably it lives in
// fp_amd64.h); how n drives the exchange should be confirmed there.
// Frame: no locals, 24 bytes of arguments.
TEXT ·cswapAmd64(SB), NOSPLIT, $0-24
	MOVQ n+16(FP), BX // BX = n, passed to cswap by value
	MOVQ y+8(FP), SI  // SI = y
	MOVQ x+0(FP), DI  // DI = x
	cswap(0(DI),0(SI),BX)
	RET
22
// func subAmd64(z, x, y *Elt)
//
// Thin wrapper: loads z, x and y into DI, SI and BX and expands the
// subtraction macro with z as its first operand. The macro is not defined in
// this file; presumably it stores x - y into z -- confirm in fp_amd64.h.
// Frame: no locals, 24 bytes of arguments.
TEXT ·subAmd64(SB), NOSPLIT, $0-24
	MOVQ y+16(FP), BX // BX = y
	MOVQ x+8(FP), SI  // SI = x
	MOVQ z+0(FP), DI  // DI = z
	subtraction(0(DI),0(SI),0(BX))
	RET
30
// func addsubAmd64(x, y *Elt)
//
// Thin wrapper: loads x and y into DI and SI and expands the addSub macro on
// them. addSub is not defined in this file; which operand receives which
// result should be confirmed in fp_amd64.h.
// Frame: no locals, 16 bytes of arguments.
TEXT ·addsubAmd64(SB), NOSPLIT, $0-16
	MOVQ y+8(FP), SI // SI = y
	MOVQ x+0(FP), DI // DI = x
	addSub(0(DI),0(SI))
	RET
37
// Addition bodies handed to CHECK_BMI2ADX by addAmd64.
// Both expect DI, SI and BX to already hold the three operand pointers and
// differ only in which helper macro (additionAdx or additionLeg) they expand.
#define addBmi2Adx \
	additionAdx(0(DI),0(SI),0(BX))
#define addLegacy \
	additionLeg(0(DI),0(SI),0(BX))
42
// Multiplication bodies handed to CHECK_BMI2ADX by mulAmd64.
// Each runs in two steps: an integerMul* macro taking 0(SP), 0(SI) and 0(BX),
// then a reduceFromDouble* macro taking 0(DI) and 0(SP). The stack slot at
// 0(SP) is therefore the intermediate passed between the two steps, so the
// enclosing TEXT must reserve a local frame for it.
#define mulBmi2Adx \
	integerMulAdx(0(SP),0(SI),0(BX)) \
	reduceFromDoubleAdx(0(DI),0(SP))
#define mulLegacy \
	integerMulLeg(0(SP),0(SI),0(BX)) \
	reduceFromDoubleLeg(0(DI),0(SP))
49
// Squaring bodies handed to CHECK_BMI2ADX by sqrAmd64.
// Each runs in two steps: an integerSqr* macro taking 0(SP) and 0(SI), then a
// reduceFromDouble* macro taking 0(DI) and 0(SP). The stack slot at 0(SP) is
// the intermediate passed between the two steps, so the enclosing TEXT must
// reserve a local frame for it.
#define sqrBmi2Adx \
	integerSqrAdx(0(SP),0(SI)) \
	reduceFromDoubleAdx(0(DI),0(SP))
#define sqrLegacy \
	integerSqrLeg(0(SP),0(SI)) \
	reduceFromDoubleLeg(0(DI),0(SP))
56
// func addAmd64(z, x, y *Elt)
//
// Loads z, x and y into DI, SI and BX, then lets CHECK_BMI2ADX choose between
// the addLegacy and addBmi2Adx bodies, using LADD as the label it is given.
// No RET is written here: the CHECK_BMI2ADX expansion is expected to return
// on every path -- confirm in fp_amd64.h.
// Frame: no locals, 24 bytes of arguments.
TEXT ·addAmd64(SB), NOSPLIT, $0-24
	MOVQ y+16(FP), BX // BX = y
	MOVQ x+8(FP), SI  // SI = x
	MOVQ z+0(FP), DI  // DI = z
	CHECK_BMI2ADX(LADD, addLegacy, addBmi2Adx)
63
// func mulAmd64(z, x, y *Elt)
//
// Loads z, x and y into DI, SI and BX, then lets CHECK_BMI2ADX choose between
// the mulLegacy and mulBmi2Adx bodies, using LMUL as the label it is given.
// The 64-byte local frame backs the 0(SP) scratch area that both bodies pass
// from their integerMul* step to their reduceFromDouble* step.
// No RET is written here: the CHECK_BMI2ADX expansion is expected to return
// on every path -- confirm in fp_amd64.h.
TEXT ·mulAmd64(SB), NOSPLIT, $64-24
	MOVQ y+16(FP), BX // BX = y
	MOVQ x+8(FP), SI  // SI = x
	MOVQ z+0(FP), DI  // DI = z
	CHECK_BMI2ADX(LMUL, mulLegacy, mulBmi2Adx)
70
// func sqrAmd64(z, x *Elt)
//
// Loads z and x into DI and SI, then lets CHECK_BMI2ADX choose between the
// sqrLegacy and sqrBmi2Adx bodies, using LSQR as the label it is given.
// The 64-byte local frame backs the 0(SP) scratch area that both bodies pass
// from their integerSqr* step to their reduceFromDouble* step.
// No RET is written here: the CHECK_BMI2ADX expansion is expected to return
// on every path -- confirm in fp_amd64.h.
TEXT ·sqrAmd64(SB), NOSPLIT, $64-16
	MOVQ x+8(FP), SI // SI = x
	MOVQ z+0(FP), DI // DI = z
	CHECK_BMI2ADX(LSQR, sqrLegacy, sqrBmi2Adx)
76
// func modpAmd64(z *Elt)
//
// Rewrites, in place, the four 64-bit limbs stored at z (called C below,
// least-significant limb first) with a reduction modulo 2^255-19. The
// routine is straight-line code: every decision is made with CMOV, never
// with a jump.
//
// Registers: DI = z, R8..R11 = limbs of C, AX = 19, CX = correction term.
// Frame: no locals, 8 bytes of arguments.
TEXT ·modpAmd64(SB), NOSPLIT, $0-8
	MOVQ z+0(FP), DI

	// Constants: AX stays 19 for the whole routine, CX starts out as 38.
	MOVL $38, CX
	MOVL $19, AX

	// Fetch C.
	MOVQ 0(DI), R8
	MOVQ 8(DI), R9
	MOVQ 16(DI), R10
	MOVQ 24(DI), R11

	// Copy bit 255 of C into CF and clear it in C.
	BTRQ $63, R11
	CMOVLCC AX, CX // CX = (bit 255 was set) ? 38 : 19

	// C += CX, rippling the carry through all four limbs.
	ADDQ CX, R8
	ADCQ $0, R9
	ADCQ $0, R10
	ADCQ $0, R11

	// After the last ADCQ, SF equals bit 255 of the sum; it is set only when
	// the addition overflowed modulo 2^255-19. CX is zeroed with MOVL rather
	// than XORL because the flags must reach CMOVLPL unchanged.
	MOVL $0, CX
	CMOVLPL AX, CX // CX = (bit 255 set) ? 0 : 19
	BTRQ $63, R11  // drop bit 255 from the sum

	// C -= CX, rippling the borrow through all four limbs.
	SUBQ CX, R8
	SBBQ $0, R9
	SBBQ $0, R10
	SBBQ $0, R11

	// Store the result back over z.
	MOVQ R8, 0(DI)
	MOVQ R9, 8(DI)
	MOVQ R10, 16(DI)
	MOVQ R11, 24(DI)
	RET