3 * Copyright (C) 2006-2014 wolfSSL Inc.
5 * This file is part of CyaSSL.
7 * CyaSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * CyaSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
26 #include <cyassl/ctaocrypt/settings.h>
29 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
30 * http://math.libtomcrypt.com
34 /******************************************************************/
35 /* fp_montgomery_reduce.c asm or generic */
36 #if defined(TFM_X86) && !defined(TFM_SSE2)
47 "movl %5,%%eax \n\t" \
49 "addl %1,%%eax \n\t" \
50 "adcl $0,%%edx \n\t" \
51 "addl %%eax,%0 \n\t" \
52 "adcl $0,%%edx \n\t" \
53 "movl %%edx,%1 \n\t" \
54 :"=g"(_c[LO]), "=r"(cy) \
55 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
56 : "%eax", "%edx", "cc")
62 "movzbl %%al,%1 \n\t" \
63 :"=g"(_c[LO]), "=r"(cy) \
64 :"0"(_c[LO]), "1"(cy) \
67 /******************************************************************/
68 #elif defined(TFM_X86_64)
79 "movq %5,%%rax \n\t" \
81 "addq %1,%%rax \n\t" \
82 "adcq $0,%%rdx \n\t" \
83 "addq %%rax,%0 \n\t" \
84 "adcq $0,%%rdx \n\t" \
85 "movq %%rdx,%1 \n\t" \
86 :"=g"(_c[LO]), "=r"(cy) \
87 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
88 : "%rax", "%rdx", "cc")
92 "movq 0(%5),%%rax \n\t" \
93 "movq 0(%2),%%r10 \n\t" \
94 "movq 0x8(%5),%%r11 \n\t" \
96 "addq %%r10,%%rax \n\t" \
97 "adcq $0,%%rdx \n\t" \
98 "movq 0x8(%2),%%r10 \n\t" \
99 "addq %3,%%rax \n\t" \
100 "adcq $0,%%rdx \n\t" \
101 "movq %%rax,0(%0) \n\t" \
102 "movq %%rdx,%1 \n\t" \
104 "movq %%r11,%%rax \n\t" \
105 "movq 0x10(%5),%%r11 \n\t" \
107 "addq %%r10,%%rax \n\t" \
108 "adcq $0,%%rdx \n\t" \
109 "movq 0x10(%2),%%r10 \n\t" \
110 "addq %3,%%rax \n\t" \
111 "adcq $0,%%rdx \n\t" \
112 "movq %%rax,0x8(%0) \n\t" \
113 "movq %%rdx,%1 \n\t" \
115 "movq %%r11,%%rax \n\t" \
116 "movq 0x18(%5),%%r11 \n\t" \
118 "addq %%r10,%%rax \n\t" \
119 "adcq $0,%%rdx \n\t" \
120 "movq 0x18(%2),%%r10 \n\t" \
121 "addq %3,%%rax \n\t" \
122 "adcq $0,%%rdx \n\t" \
123 "movq %%rax,0x10(%0) \n\t" \
124 "movq %%rdx,%1 \n\t" \
126 "movq %%r11,%%rax \n\t" \
127 "movq 0x20(%5),%%r11 \n\t" \
129 "addq %%r10,%%rax \n\t" \
130 "adcq $0,%%rdx \n\t" \
131 "movq 0x20(%2),%%r10 \n\t" \
132 "addq %3,%%rax \n\t" \
133 "adcq $0,%%rdx \n\t" \
134 "movq %%rax,0x18(%0) \n\t" \
135 "movq %%rdx,%1 \n\t" \
137 "movq %%r11,%%rax \n\t" \
138 "movq 0x28(%5),%%r11 \n\t" \
140 "addq %%r10,%%rax \n\t" \
141 "adcq $0,%%rdx \n\t" \
142 "movq 0x28(%2),%%r10 \n\t" \
143 "addq %3,%%rax \n\t" \
144 "adcq $0,%%rdx \n\t" \
145 "movq %%rax,0x20(%0) \n\t" \
146 "movq %%rdx,%1 \n\t" \
148 "movq %%r11,%%rax \n\t" \
149 "movq 0x30(%5),%%r11 \n\t" \
151 "addq %%r10,%%rax \n\t" \
152 "adcq $0,%%rdx \n\t" \
153 "movq 0x30(%2),%%r10 \n\t" \
154 "addq %3,%%rax \n\t" \
155 "adcq $0,%%rdx \n\t" \
156 "movq %%rax,0x28(%0) \n\t" \
157 "movq %%rdx,%1 \n\t" \
159 "movq %%r11,%%rax \n\t" \
160 "movq 0x38(%5),%%r11 \n\t" \
162 "addq %%r10,%%rax \n\t" \
163 "adcq $0,%%rdx \n\t" \
164 "movq 0x38(%2),%%r10 \n\t" \
165 "addq %3,%%rax \n\t" \
166 "adcq $0,%%rdx \n\t" \
167 "movq %%rax,0x30(%0) \n\t" \
168 "movq %%rdx,%1 \n\t" \
170 "movq %%r11,%%rax \n\t" \
172 "addq %%r10,%%rax \n\t" \
173 "adcq $0,%%rdx \n\t" \
174 "addq %3,%%rax \n\t" \
175 "adcq $0,%%rdx \n\t" \
176 "movq %%rax,0x38(%0) \n\t" \
177 "movq %%rdx,%1 \n\t" \
179 :"=r"(_c), "=r"(cy) \
180 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
181 : "%rax", "%rdx", "%r10", "%r11", "cc")
188 "movzbq %%al,%1 \n\t" \
189 :"=g"(_c[LO]), "=r"(cy) \
190 :"0"(_c[LO]), "1"(cy) \
193 /******************************************************************/
194 #elif defined(TFM_SSE2)
195 /* SSE2 code (assumes 32-bit fp_digits) */
196 /* MMX register assignments (the SSE2 integer ops below operate on the
197 * 64-bit MMX registers): mm0 *tmpm++, then Mu * (*tmpm++)
205 __asm__("movd %0,%%mm2"::"g"(mp))
212 "movd %0,%%mm1 \n\t" \
213 "pxor %%mm3,%%mm3 \n\t" \
214 "pmuludq %%mm2,%%mm1 \n\t" \
217 /* pmuludq on mmx registers does a 32x32->64 multiply. */
220 "movd %1,%%mm4 \n\t" \
221 "movd %2,%%mm0 \n\t" \
222 "paddq %%mm4,%%mm3 \n\t" \
223 "pmuludq %%mm1,%%mm0 \n\t" \
224 "paddq %%mm0,%%mm3 \n\t" \
225 "movd %%mm3,%0 \n\t" \
226 "psrlq $32, %%mm3 \n\t" \
227 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
231 "movd 0(%1),%%mm4 \n\t" \
232 "movd 0(%2),%%mm0 \n\t" \
233 "paddq %%mm4,%%mm3 \n\t" \
234 "pmuludq %%mm1,%%mm0 \n\t" \
235 "movd 4(%2),%%mm5 \n\t" \
236 "paddq %%mm0,%%mm3 \n\t" \
237 "movd 4(%1),%%mm6 \n\t" \
238 "movd %%mm3,0(%0) \n\t" \
239 "psrlq $32, %%mm3 \n\t" \
241 "paddq %%mm6,%%mm3 \n\t" \
242 "pmuludq %%mm1,%%mm5 \n\t" \
243 "movd 8(%2),%%mm6 \n\t" \
244 "paddq %%mm5,%%mm3 \n\t" \
245 "movd 8(%1),%%mm7 \n\t" \
246 "movd %%mm3,4(%0) \n\t" \
247 "psrlq $32, %%mm3 \n\t" \
249 "paddq %%mm7,%%mm3 \n\t" \
250 "pmuludq %%mm1,%%mm6 \n\t" \
251 "movd 12(%2),%%mm7 \n\t" \
252 "paddq %%mm6,%%mm3 \n\t" \
253 "movd 12(%1),%%mm5 \n\t" \
254 "movd %%mm3,8(%0) \n\t" \
255 "psrlq $32, %%mm3 \n\t" \
257 "paddq %%mm5,%%mm3 \n\t" \
258 "pmuludq %%mm1,%%mm7 \n\t" \
259 "movd 16(%2),%%mm5 \n\t" \
260 "paddq %%mm7,%%mm3 \n\t" \
261 "movd 16(%1),%%mm6 \n\t" \
262 "movd %%mm3,12(%0) \n\t" \
263 "psrlq $32, %%mm3 \n\t" \
265 "paddq %%mm6,%%mm3 \n\t" \
266 "pmuludq %%mm1,%%mm5 \n\t" \
267 "movd 20(%2),%%mm6 \n\t" \
268 "paddq %%mm5,%%mm3 \n\t" \
269 "movd 20(%1),%%mm7 \n\t" \
270 "movd %%mm3,16(%0) \n\t" \
271 "psrlq $32, %%mm3 \n\t" \
273 "paddq %%mm7,%%mm3 \n\t" \
274 "pmuludq %%mm1,%%mm6 \n\t" \
275 "movd 24(%2),%%mm7 \n\t" \
276 "paddq %%mm6,%%mm3 \n\t" \
277 "movd 24(%1),%%mm5 \n\t" \
278 "movd %%mm3,20(%0) \n\t" \
279 "psrlq $32, %%mm3 \n\t" \
281 "paddq %%mm5,%%mm3 \n\t" \
282 "pmuludq %%mm1,%%mm7 \n\t" \
283 "movd 28(%2),%%mm5 \n\t" \
284 "paddq %%mm7,%%mm3 \n\t" \
285 "movd 28(%1),%%mm6 \n\t" \
286 "movd %%mm3,24(%0) \n\t" \
287 "psrlq $32, %%mm3 \n\t" \
289 "paddq %%mm6,%%mm3 \n\t" \
290 "pmuludq %%mm1,%%mm5 \n\t" \
291 "paddq %%mm5,%%mm3 \n\t" \
292 "movd %%mm3,28(%0) \n\t" \
293 "psrlq $32, %%mm3 \n\t" \
294 :"=r"(_c) : "0"(_c), "r"(tmpm) );
296 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
300 __asm__( "movd %%mm3,%0 \n" :"=r"(cy))
306 "movzbl %%al,%1 \n\t" \
307 :"=g"(_c[LO]), "=r"(cy) \
308 :"0"(_c[LO]), "1"(cy) \
311 /******************************************************************/
312 #elif defined(TFM_ARM)
327 " ADDS r0,r0,%0 \n\t" \
329 " MOVCS %0,#1 \n\t" \
330 " MOVCC %0,#0 \n\t" \
331 " UMLAL r0,%0,%3,%4 \n\t" \
333 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
338 " ADDS r0,r0,%0 \n\t" \
341 " MOVCS %0,#1 \n\t" \
342 " MOVCC %0,#0 \n\t" \
343 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
346 /* TAO thumb mode uses ite (if then else) to detect carry directly
347 * fixed unmatched constraint warning by changing 1 to m */
349 #else /* __thumb__ */
354 " ADDS r0,r0,%0 \n\t" \
355 " MOVCS %0,#1 \n\t" \
356 " MOVCC %0,#0 \n\t" \
357 " UMLAL r0,%0,%3,%4 \n\t" \
359 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
364 " ADDS r0,r0,%0 \n\t" \
366 " MOVCS %0,#1 \n\t" \
367 " MOVCC %0,#0 \n\t" \
368 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
370 #endif /* __thumb__ */
372 #elif defined(TFM_PPC32)
383 " mullw 16,%3,%4 \n\t" \
384 " mulhwu 17,%3,%4 \n\t" \
385 " addc 16,16,%0 \n\t" \
386 " addze 17,17 \n\t" \
388 " addc 16,16,18 \n\t" \
389 " addze %0,17 \n\t" \
391 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
396 " addc 16,16,%0 \n\t" \
398 " xor %0,%0,%0 \n\t" \
399 " addze %0,%0 \n\t" \
400 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
402 #elif defined(TFM_PPC64)
413 " mulld 16,%3,%4 \n\t" \
414 " mulhdu 17,%3,%4 \n\t" \
415 " addc 16,16,%0 \n\t" \
416 " addze 17,17 \n\t" \
417 " ldx 18,0,%1 \n\t" \
418 " addc 16,16,18 \n\t" \
419 " addze %0,17 \n\t" \
/* NOTE(review): original read "sdx", which is not a PowerPC opcode; the
 * 64-bit store-doubleword-indexed instruction is "stdx" (matches the ldx
 * load above). Without this fix the asm fails to assemble on PPC64. */ \
420 " stdx 16,0,%1 \n\t" \
421 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
425 " ldx 16,0,%1 \n\t" \
426 " addc 16,16,%0 \n\t" \
/* NOTE(review): "sdx" is not a valid PowerPC mnemonic; corrected to
 * "stdx" (store doubleword indexed), the store paired with ldx above. */ \
427 " stdx 16,0,%1 \n\t" \
428 " xor %0,%0,%0 \n\t" \
429 " addze %0,%0 \n\t" \
430 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
432 /******************************************************************/
434 #elif defined(TFM_AVR32)
449 " macu.d r2,%3,%4 \n\t" \
452 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
461 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
474 t = ((fp_word)_c[0] + (fp_word)cy) + \
475 (((fp_word)mu) * ((fp_word)*tmpm++)); \
476 _c[0] = (fp_digit)t; \
477 cy = (fp_digit)(t >> DIGIT_BIT); \
481 do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
484 /******************************************************************/
488 /* end fp_montgomery_reduce.c asm */
491 /* start fp_sqr_comba.c asm */
494 /* x86-32 optimized */
498 #define CLEAR_CARRY \
501 #define COMBA_STORE(x) \
504 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
507 #define CARRY_FORWARD \
508 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
512 #define SQRADD(i, j) \
514 "movl %6,%%eax \n\t" \
516 "addl %%eax,%0 \n\t" \
517 "adcl %%edx,%1 \n\t" \
519 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
521 #define SQRADD2(i, j) \
523 "movl %6,%%eax \n\t" \
525 "addl %%eax,%0 \n\t" \
526 "adcl %%edx,%1 \n\t" \
528 "addl %%eax,%0 \n\t" \
529 "adcl %%edx,%1 \n\t" \
531 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc");
533 #define SQRADDSC(i, j) \
535 "movl %3,%%eax \n\t" \
537 "movl %%eax,%0 \n\t" \
538 "movl %%edx,%1 \n\t" \
540 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
542 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
544 #define SQRADDAC(i, j) \
546 "movl %6,%%eax \n\t" \
548 "addl %%eax,%0 \n\t" \
549 "adcl %%edx,%1 \n\t" \
551 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
561 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
563 #elif defined(TFM_X86_64)
564 /* x86-64 optimized */
568 #define CLEAR_CARRY \
571 #define COMBA_STORE(x) \
574 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
577 #define CARRY_FORWARD \
578 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
582 #define SQRADD(i, j) \
584 "movq %6,%%rax \n\t" \
586 "addq %%rax,%0 \n\t" \
587 "adcq %%rdx,%1 \n\t" \
589 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
591 #define SQRADD2(i, j) \
593 "movq %6,%%rax \n\t" \
595 "addq %%rax,%0 \n\t" \
596 "adcq %%rdx,%1 \n\t" \
598 "addq %%rax,%0 \n\t" \
599 "adcq %%rdx,%1 \n\t" \
601 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
603 #define SQRADDSC(i, j) \
605 "movq %3,%%rax \n\t" \
607 "movq %%rax,%0 \n\t" \
608 "movq %%rdx,%1 \n\t" \
610 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
612 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
614 #define SQRADDAC(i, j) \
616 "movq %6,%%rax \n\t" \
618 "addq %%rax,%0 \n\t" \
619 "adcq %%rdx,%1 \n\t" \
621 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
631 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
633 #elif defined(TFM_SSE2)
638 #define CLEAR_CARRY \
641 #define COMBA_STORE(x) \
644 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
647 #define CARRY_FORWARD \
648 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
653 #define SQRADD(i, j) \
655 "movd %6,%%mm0 \n\t" \
656 "pmuludq %%mm0,%%mm0\n\t" \
657 "movd %%mm0,%%eax \n\t" \
658 "psrlq $32,%%mm0 \n\t" \
659 "addl %%eax,%0 \n\t" \
660 "movd %%mm0,%%eax \n\t" \
661 "adcl %%eax,%1 \n\t" \
663 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
665 #define SQRADD2(i, j) \
667 "movd %6,%%mm0 \n\t" \
668 "movd %7,%%mm1 \n\t" \
669 "pmuludq %%mm1,%%mm0\n\t" \
670 "movd %%mm0,%%eax \n\t" \
671 "psrlq $32,%%mm0 \n\t" \
672 "movd %%mm0,%%edx \n\t" \
673 "addl %%eax,%0 \n\t" \
674 "adcl %%edx,%1 \n\t" \
676 "addl %%eax,%0 \n\t" \
677 "adcl %%edx,%1 \n\t" \
679 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
681 #define SQRADDSC(i, j) \
683 "movd %3,%%mm0 \n\t" \
684 "movd %4,%%mm1 \n\t" \
685 "pmuludq %%mm1,%%mm0\n\t" \
686 "movd %%mm0,%0 \n\t" \
687 "psrlq $32,%%mm0 \n\t" \
688 "movd %%mm0,%1 \n\t" \
690 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
692 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
694 #define SQRADDAC(i, j) \
696 "movd %6,%%mm0 \n\t" \
697 "movd %7,%%mm1 \n\t" \
698 "pmuludq %%mm1,%%mm0\n\t" \
699 "movd %%mm0,%%eax \n\t" \
700 "psrlq $32,%%mm0 \n\t" \
701 "movd %%mm0,%%edx \n\t" \
702 "addl %%eax,%0 \n\t" \
703 "adcl %%edx,%1 \n\t" \
705 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");
715 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
717 #elif defined(TFM_ARM)
723 #define CLEAR_CARRY \
726 #define COMBA_STORE(x) \
729 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
732 #define CARRY_FORWARD \
733 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
737 /* multiplies point i and j, updates carry "c1" and digit c2 */
738 #define SQRADD(i, j) \
740 " UMULL r0,r1,%6,%6 \n\t" \
741 " ADDS %0,%0,r0 \n\t" \
742 " ADCS %1,%1,r1 \n\t" \
743 " ADC %2,%2,#0 \n\t" \
744 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
746 /* for squaring some of the terms are doubled... */
747 #define SQRADD2(i, j) \
749 " UMULL r0,r1,%6,%7 \n\t" \
750 " ADDS %0,%0,r0 \n\t" \
751 " ADCS %1,%1,r1 \n\t" \
752 " ADC %2,%2,#0 \n\t" \
753 " ADDS %0,%0,r0 \n\t" \
754 " ADCS %1,%1,r1 \n\t" \
755 " ADC %2,%2,#0 \n\t" \
756 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
758 #define SQRADDSC(i, j) \
760 " UMULL %0,%1,%3,%4 \n\t" \
761 " SUB %2,%2,%2 \n\t" \
762 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");
764 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
766 #define SQRADDAC(i, j) \
768 " UMULL r0,r1,%6,%7 \n\t" \
769 " ADDS %0,%0,r0 \n\t" \
770 " ADCS %1,%1,r1 \n\t" \
771 " ADC %2,%2,#0 \n\t" \
772 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
776 " ADDS %0,%0,%3 \n\t" \
777 " ADCS %1,%1,%4 \n\t" \
778 " ADC %2,%2,%5 \n\t" \
779 " ADDS %0,%0,%3 \n\t" \
780 " ADCS %1,%1,%4 \n\t" \
781 " ADC %2,%2,%5 \n\t" \
782 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
784 #elif defined(TFM_PPC32)
790 #define CLEAR_CARRY \
793 #define COMBA_STORE(x) \
796 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
799 #define CARRY_FORWARD \
800 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
804 /* multiplies point i and j, updates carry "c1" and digit c2 */
805 #define SQRADD(i, j) \
807 " mullw 16,%6,%6 \n\t" \
808 " addc %0,%0,16 \n\t" \
809 " mulhwu 16,%6,%6 \n\t" \
810 " adde %1,%1,16 \n\t" \
811 " addze %2,%2 \n\t" \
812 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
814 /* for squaring some of the terms are doubled... */
815 #define SQRADD2(i, j) \
817 " mullw 16,%6,%7 \n\t" \
818 " mulhwu 17,%6,%7 \n\t" \
819 " addc %0,%0,16 \n\t" \
820 " adde %1,%1,17 \n\t" \
821 " addze %2,%2 \n\t" \
822 " addc %0,%0,16 \n\t" \
823 " adde %1,%1,17 \n\t" \
824 " addze %2,%2 \n\t" \
825 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
827 #define SQRADDSC(i, j) \
829 " mullw %0,%6,%7 \n\t" \
830 " mulhwu %1,%6,%7 \n\t" \
831 " xor %2,%2,%2 \n\t" \
832 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
834 #define SQRADDAC(i, j) \
836 " mullw 16,%6,%7 \n\t" \
837 " addc %0,%0,16 \n\t" \
838 " mulhwu 16,%6,%7 \n\t" \
839 " adde %1,%1,16 \n\t" \
840 " addze %2,%2 \n\t" \
841 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
845 " addc %0,%0,%3 \n\t" \
846 " adde %1,%1,%4 \n\t" \
847 " adde %2,%2,%5 \n\t" \
848 " addc %0,%0,%3 \n\t" \
849 " adde %1,%1,%4 \n\t" \
850 " adde %2,%2,%5 \n\t" \
851 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
853 #elif defined(TFM_PPC64)
858 #define CLEAR_CARRY \
861 #define COMBA_STORE(x) \
864 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
867 #define CARRY_FORWARD \
868 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
872 /* multiplies point i and j, updates carry "c1" and digit c2 */
873 #define SQRADD(i, j) \
875 " mulld 16,%6,%6 \n\t" \
876 " addc %0,%0,16 \n\t" \
877 " mulhdu 16,%6,%6 \n\t" \
878 " adde %1,%1,16 \n\t" \
879 " addze %2,%2 \n\t" \
880 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
882 /* for squaring some of the terms are doubled... */
883 #define SQRADD2(i, j) \
885 " mulld 16,%6,%7 \n\t" \
886 " mulhdu 17,%6,%7 \n\t" \
887 " addc %0,%0,16 \n\t" \
888 " adde %1,%1,17 \n\t" \
889 " addze %2,%2 \n\t" \
890 " addc %0,%0,16 \n\t" \
891 " adde %1,%1,17 \n\t" \
892 " addze %2,%2 \n\t" \
893 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
895 #define SQRADDSC(i, j) \
897 " mulld %0,%6,%7 \n\t" \
898 " mulhdu %1,%6,%7 \n\t" \
899 " xor %2,%2,%2 \n\t" \
900 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
902 #define SQRADDAC(i, j) \
904 " mulld 16,%6,%7 \n\t" \
905 " addc %0,%0,16 \n\t" \
906 " mulhdu 16,%6,%7 \n\t" \
907 " adde %1,%1,16 \n\t" \
908 " addze %2,%2 \n\t" \
909 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
913 " addc %0,%0,%3 \n\t" \
914 " adde %1,%1,%4 \n\t" \
915 " adde %2,%2,%5 \n\t" \
916 " addc %0,%0,%3 \n\t" \
917 " adde %1,%1,%4 \n\t" \
918 " adde %2,%2,%5 \n\t" \
919 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
922 #elif defined(TFM_AVR32)
928 #define CLEAR_CARRY \
931 #define COMBA_STORE(x) \
934 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
937 #define CARRY_FORWARD \
938 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
942 /* multiplies point i and j, updates carry "c1" and digit c2 */
943 #define SQRADD(i, j) \
945 " mulu.d r2,%6,%6 \n\t" \
946 " add %0,%0,r2 \n\t" \
947 " adc %1,%1,r3 \n\t" \
949 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");
951 /* for squaring some of the terms are doubled... */
952 #define SQRADD2(i, j) \
954 " mulu.d r2,%6,%7 \n\t" \
955 " add %0,%0,r2 \n\t" \
956 " adc %1,%1,r3 \n\t" \
958 " add %0,%0,r2 \n\t" \
959 " adc %1,%1,r3 \n\t" \
961 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");
963 #define SQRADDSC(i, j) \
965 " mulu.d r2,%6,%7 \n\t" \
969 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");
971 #define SQRADDAC(i, j) \
973 " mulu.d r2,%6,%7 \n\t" \
974 " add %0,%0,r2 \n\t" \
975 " adc %1,%1,r3 \n\t" \
977 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");
981 " add %0,%0,%3 \n\t" \
982 " adc %1,%1,%4 \n\t" \
983 " adc %2,%2,%5 \n\t" \
984 " add %0,%0,%3 \n\t" \
985 " adc %1,%1,%4 \n\t" \
986 " adc %2,%2,%5 \n\t" \
987 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
994 /* ISO C portable code */
998 #define CLEAR_CARRY \
1001 #define COMBA_STORE(x) \
1004 #define COMBA_STORE2(x) \
/* slide the 3-digit comba accumulator down one column: c0 <- c1, c1 <- c2, c2 <- 0 */
1007 #define CARRY_FORWARD \
1008 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1012 /* multiplies point i and j, updates carry "c1" and digit c2 */
1013 #define SQRADD(i, j) \
1015 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
1016 t = c1 + (t >> DIGIT_BIT); c1 = (fp_digit)t; \
1017 c2 +=(fp_digit) (t >> DIGIT_BIT); \
1021 /* for squaring some of the terms are doubled... */
1022 #define SQRADD2(i, j) \
1024 t = ((fp_word)i) * ((fp_word)j); \
1025 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
1026 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
1027 c2 +=(fp_digit)( tt >> DIGIT_BIT); \
1028 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
1029 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
1030 c2 +=(fp_digit) (tt >> DIGIT_BIT); \
1033 #define SQRADDSC(i, j) \
1035 t = ((fp_word)i) * ((fp_word)j); \
1036 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
1039 #define SQRADDAC(i, j) \
1041 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = (fp_digit)t; \
1042 t = sc1 + (t >> DIGIT_BIT); sc1 = (fp_digit)t; \
1043 sc2 += (fp_digit)(t >> DIGIT_BIT); \
1048 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \
1049 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \
1051 c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \
1056 #ifdef TFM_SMALL_SET
1057 #include "fp_sqr_comba_small_set.i"
1060 #if defined(TFM_SQR3)
1061 #include "fp_sqr_comba_3.i"
1063 #if defined(TFM_SQR4)
1064 #include "fp_sqr_comba_4.i"
1066 #if defined(TFM_SQR6)
1067 #include "fp_sqr_comba_6.i"
1069 #if defined(TFM_SQR7)
1070 #include "fp_sqr_comba_7.i"
1072 #if defined(TFM_SQR8)
1073 #include "fp_sqr_comba_8.i"
1075 #if defined(TFM_SQR9)
1076 #include "fp_sqr_comba_9.i"
1078 #if defined(TFM_SQR12)
1079 #include "fp_sqr_comba_12.i"
1081 #if defined(TFM_SQR17)
1082 #include "fp_sqr_comba_17.i"
1084 #if defined(TFM_SQR20)
1085 #include "fp_sqr_comba_20.i"
1087 #if defined(TFM_SQR24)
1088 #include "fp_sqr_comba_24.i"
1090 #if defined(TFM_SQR28)
1091 #include "fp_sqr_comba_28.i"
1093 #if defined(TFM_SQR32)
1094 #include "fp_sqr_comba_32.i"
1096 #if defined(TFM_SQR48)
1097 #include "fp_sqr_comba_48.i"
1099 #if defined(TFM_SQR64)
1100 #include "fp_sqr_comba_64.i"
1102 /* end fp_sqr_comba.c asm */
1104 /* start fp_mul_comba.c asm */
1105 /* these are the combas. Worship them. */
1106 #if defined(TFM_X86)
1107 /* Generic x86 optimized code */
1109 /* anything you need at the start */
1112 /* clear the chaining variables */
1113 #define COMBA_CLEAR \
1116 /* forward the carry to the next digit */
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1117 #define COMBA_FORWARD \
1118 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1120 /* store the first sum */
1121 #define COMBA_STORE(x) \
1124 /* store the second sum [carry] */
1125 #define COMBA_STORE2(x) \
1128 /* anything you need at the end */
1131 /* this should multiply i and j */
1132 #define MULADD(i, j) \
1134 "movl %6,%%eax \n\t" \
1136 "addl %%eax,%0 \n\t" \
1137 "adcl %%edx,%1 \n\t" \
1139 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
1141 #elif defined(TFM_X86_64)
1142 /* x86-64 optimized */
1144 /* anything you need at the start */
1147 /* clear the chaining variables */
1148 #define COMBA_CLEAR \
1151 /* forward the carry to the next digit */
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1152 #define COMBA_FORWARD \
1153 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1155 /* store the first sum */
1156 #define COMBA_STORE(x) \
1159 /* store the second sum [carry] */
1160 #define COMBA_STORE2(x) \
1163 /* anything you need at the end */
1166 /* this should multiply i and j */
1167 #define MULADD(i, j) \
1169 "movq %6,%%rax \n\t" \
1171 "addq %%rax,%0 \n\t" \
1172 "adcq %%rdx,%1 \n\t" \
1174 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
1176 #elif defined(TFM_SSE2)
1177 /* use SSE2 optimizations */
1179 /* anything you need at the start */
1182 /* clear the chaining variables */
1183 #define COMBA_CLEAR \
1186 /* forward the carry to the next digit */
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1187 #define COMBA_FORWARD \
1188 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1190 /* store the first sum */
1191 #define COMBA_STORE(x) \
1194 /* store the second sum [carry] */
1195 #define COMBA_STORE2(x) \
1198 /* anything you need at the end */
1199 #define COMBA_FINI \
1202 /* this should multiply i and j */
1203 #define MULADD(i, j) \
1205 "movd %6,%%mm0 \n\t" \
1206 "movd %7,%%mm1 \n\t" \
1207 "pmuludq %%mm1,%%mm0\n\t" \
1208 "movd %%mm0,%%eax \n\t" \
1209 "psrlq $32,%%mm0 \n\t" \
1210 "addl %%eax,%0 \n\t" \
1211 "movd %%mm0,%%eax \n\t" \
1212 "adcl %%eax,%1 \n\t" \
1214 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
1216 #elif defined(TFM_ARM)
1221 #define COMBA_CLEAR \
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1224 #define COMBA_FORWARD \
1225 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1227 #define COMBA_STORE(x) \
1230 #define COMBA_STORE2(x) \
1235 #define MULADD(i, j) \
1237 " UMULL r0,r1,%6,%7 \n\t" \
1238 " ADDS %0,%0,r0 \n\t" \
1239 " ADCS %1,%1,r1 \n\t" \
1240 " ADC %2,%2,#0 \n\t" \
1241 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
1243 #elif defined(TFM_PPC32)
1244 /* For 32-bit PPC */
1248 #define COMBA_CLEAR \
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1251 #define COMBA_FORWARD \
1252 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1254 #define COMBA_STORE(x) \
1257 #define COMBA_STORE2(x) \
1262 /* untested: will mulhwu change the flags? Docs say no */
1263 #define MULADD(i, j) \
1265 " mullw 16,%6,%7 \n\t" \
1266 " addc %0,%0,16 \n\t" \
1267 " mulhwu 16,%6,%7 \n\t" \
1268 " adde %1,%1,16 \n\t" \
1269 " addze %2,%2 \n\t" \
1270 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
1272 #elif defined(TFM_PPC64)
1273 /* For 64-bit PPC */
1277 #define COMBA_CLEAR \
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1280 #define COMBA_FORWARD \
1281 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1283 #define COMBA_STORE(x) \
1286 #define COMBA_STORE2(x) \
1291 /* untested: will mulhdu change the flags? Docs say no */
1292 #define MULADD(i, j) \
1294 " mulld 16,%6,%7 \n\t" \
1295 " addc %0,%0,16 \n\t" \
1296 " mulhdu 16,%6,%7 \n\t" \
1297 " adde %1,%1,16 \n\t" \
1298 " addze %2,%2 \n\t" \
1299 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
1301 #elif defined(TFM_AVR32)
1307 #define COMBA_CLEAR \
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1310 #define COMBA_FORWARD \
1311 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1313 #define COMBA_STORE(x) \
1316 #define COMBA_STORE2(x) \
1321 #define MULADD(i, j) \
1323 " mulu.d r2,%6,%7 \n\t"\
1325 " adc %1,%1,r3 \n\t"\
1327 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
1334 #define COMBA_CLEAR \
/* advance to the next output digit: c0 <- c1, c1 <- c2, c2 <- 0 */
1337 #define COMBA_FORWARD \
1338 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
1340 #define COMBA_STORE(x) \
1343 #define COMBA_STORE2(x) \
1348 #define MULADD(i, j) \
1350 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
1351 t = (fp_word)c1 + (t >> DIGIT_BIT); \
1352 c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT); \
1358 #ifdef TFM_SMALL_SET
1359 #include "fp_mul_comba_small_set.i"
1362 #if defined(TFM_MUL3)
1363 #include "fp_mul_comba_3.i"
1365 #if defined(TFM_MUL4)
1366 #include "fp_mul_comba_4.i"
1368 #if defined(TFM_MUL6)
1369 #include "fp_mul_comba_6.i"
1371 #if defined(TFM_MUL7)
1372 #include "fp_mul_comba_7.i"
1374 #if defined(TFM_MUL8)
1375 #include "fp_mul_comba_8.i"
1377 #if defined(TFM_MUL9)
1378 #include "fp_mul_comba_9.i"
1380 #if defined(TFM_MUL12)
1381 #include "fp_mul_comba_12.i"
1383 #if defined(TFM_MUL17)
1384 #include "fp_mul_comba_17.i"
1386 #if defined(TFM_MUL20)
1387 #include "fp_mul_comba_20.i"
1389 #if defined(TFM_MUL24)
1390 #include "fp_mul_comba_24.i"
1392 #if defined(TFM_MUL28)
1393 #include "fp_mul_comba_28.i"
1395 #if defined(TFM_MUL32)
1396 #include "fp_mul_comba_32.i"
1398 #if defined(TFM_MUL48)
1399 #include "fp_mul_comba_48.i"
1401 #if defined(TFM_MUL64)
1402 #include "fp_mul_comba_64.i"
1405 /* end fp_mul_comba.c asm */