git.sur5r.net Git - freertos/blob - FreeRTOS-Plus/Source/CyaSSL/ctaocrypt/src/tfm.c

   1 /* tfm.c
   2  *
   3  * Copyright (C) 2006-2012 Sawtooth Consulting Ltd.
   4  *
   5  * This file is part of CyaSSL.
   6  *
   7  * CyaSSL is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * CyaSSL is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  20  */
  21
  22
  23 /*
  24  * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
  25  * http://math.libtomcrypt.com
  26  */
  27
  28 /**
  29  *  Edited by Moisés Guimarães (moises.guimaraes@phoebus.com.br)
  30  *  to fit CyaSSL's needs.
  31  */
  32
  33 #ifdef HAVE_CONFIG_H
  34     #include <config.h>
  35 #endif
  36
  37 /* in case user set USE_FAST_MATH there */
  38 #include <cyassl/ctaocrypt/settings.h>
  39
  40 #ifdef USE_FAST_MATH
  41
  42 #include <cyassl/ctaocrypt/tfm.h>
  43 #include <ctaocrypt/src/asm.c>  /* will define asm MACROS or C ones */
  44
  45
  46 /* math settings check */
  47 word32 CheckRunTimeSettings(void)
  48 {
  49     return CTC_SETTINGS;
  50 }
  51
  52
  53 /* math settings size check */
  54 word32 CheckRunTimeFastMath(void)
  55 {
  56     return FP_SIZE;
  57 }
  58
  59
  60 /* Functions */
  61
  62 void fp_add(fp_int *a, fp_int *b, fp_int *c)
  63 {
  64   int     sa, sb;
  65
  66   /* get sign of both inputs */
  67   sa = a->sign;
  68   sb = b->sign;
  69
  70   /* handle two cases, not four */
  71   if (sa == sb) {
  72     /* both positive or both negative */
  73     /* add their magnitudes, copy the sign */
  74     c->sign = sa;
  75     s_fp_add (a, b, c);
  76   } else {
  77     /* one positive, the other negative */
  78     /* subtract the one with the greater magnitude from */
  79     /* the one of the lesser magnitude.  The result gets */
  80     /* the sign of the one with the greater magnitude. */
  81     if (fp_cmp_mag (a, b) == FP_LT) {
  82       c->sign = sb;
  83       s_fp_sub (b, a, c);
  84     } else {
  85       c->sign = sa;
  86       s_fp_sub (a, b, c);
  87     }
  88   }
  89 }
  90
  91 /* unsigned addition */
  92 void s_fp_add(fp_int *a, fp_int *b, fp_int *c)
  93 {
  94   int      x, y, oldused;
  95   register fp_word  t;
  96
  97   y       = MAX(a->used, b->used);
  98   oldused = c->used;
  99   c->used = y;
 100
 101   t = 0;
 102   for (x = 0; x < y; x++) {
 103       t         += ((fp_word)a->dp[x]) + ((fp_word)b->dp[x]);
 104       c->dp[x]   = (fp_digit)t;
 105       t        >>= DIGIT_BIT;
 106   }
 107   if (t != 0 && x < FP_SIZE) {
 108      c->dp[c->used++] = (fp_digit)t;
 109      ++x;
 110   }
 111
 112   c->used = x;
 113   for (; x < oldused; x++) {
 114      c->dp[x] = 0;
 115   }
 116   fp_clamp(c);
 117 }
 118
 119 /* c = a - b */
 120 void fp_sub(fp_int *a, fp_int *b, fp_int *c)
 121 {
 122   int     sa, sb;
 123
 124   sa = a->sign;
 125   sb = b->sign;
 126
 127   if (sa != sb) {
 128     /* subtract a negative from a positive, OR */
 129     /* subtract a positive from a negative. */
 130     /* In either case, ADD their magnitudes, */
 131     /* and use the sign of the first number. */
 132     c->sign = sa;
 133     s_fp_add (a, b, c);
 134   } else {
 135     /* subtract a positive from a positive, OR */
 136     /* subtract a negative from a negative. */
 137     /* First, take the difference between their */
 138     /* magnitudes, then... */
 139     if (fp_cmp_mag (a, b) != FP_LT) {
 140       /* Copy the sign from the first */
 141       c->sign = sa;
 142       /* The first has a larger or equal magnitude */
 143       s_fp_sub (a, b, c);
 144     } else {
 145       /* The result has the *opposite* sign from */
 146       /* the first number. */
 147       c->sign = (sa == FP_ZPOS) ? FP_NEG : FP_ZPOS;
 148       /* The second has a larger magnitude */
 149       s_fp_sub (b, a, c);
 150     }
 151   }
 152 }
 153
 154 /* unsigned subtraction ||a|| >= ||b|| ALWAYS! */
 155 void s_fp_sub(fp_int *a, fp_int *b, fp_int *c)
 156 {
 157   int      x, oldbused, oldused;
 158   fp_word  t;
 159
 160   oldused  = c->used;
 161   oldbused = b->used;
 162   c->used  = a->used;
 163   t       = 0;
 164   for (x = 0; x < oldbused; x++) {
 165      t         = ((fp_word)a->dp[x]) - (((fp_word)b->dp[x]) + t);
 166      c->dp[x]  = (fp_digit)t;
 167      t         = (t >> DIGIT_BIT)&1;
 168   }
 169   for (; x < a->used; x++) {
 170      t         = ((fp_word)a->dp[x]) - t;
 171      c->dp[x]  = (fp_digit)t;
 172      t         = (t >> DIGIT_BIT);
 173    }
 174   for (; x < oldused; x++) {
 175      c->dp[x] = 0;
 176   }
 177   fp_clamp(c);
 178 }
 179
 180 /* c = a * b */
 181 void fp_mul(fp_int *A, fp_int *B, fp_int *C)
 182 {
 183     int   y, yy;
 184
 185     y  = MAX(A->used, B->used);
 186     yy = MIN(A->used, B->used);
 187
 188     /* call generic if we're out of range */
 189     if (y + yy > FP_SIZE) {
 190        fp_mul_comba(A, B, C);
 191        return ;
 192     }
 193
 194     /* pick a comba (unrolled 4/8/16/32 x or rolled) based on the size
 195        of the largest input.  We also want to avoid doing excess mults if the
 196        inputs are not close to the next power of two.  That is, for example,
 197        if say y=17 then we would do (32-17)^2 = 225 unneeded multiplications
 198     */
 199
 200 #ifdef TFM_MUL3
 201         if (y <= 3) {
 202            fp_mul_comba3(A,B,C);
 203            return;
 204         }
 205 #endif
 206 #ifdef TFM_MUL4
 207         if (y == 4) {
 208            fp_mul_comba4(A,B,C);
 209            return;
 210         }
 211 #endif
 212 #ifdef TFM_MUL6
 213         if (y <= 6) {
 214            fp_mul_comba6(A,B,C);
 215            return;
 216         }
 217 #endif
 218 #ifdef TFM_MUL7
 219         if (y == 7) {
 220            fp_mul_comba7(A,B,C);
 221            return;
 222         }
 223 #endif
 224 #ifdef TFM_MUL8
 225         if (y == 8) {
 226            fp_mul_comba8(A,B,C);
 227            return;
 228         }
 229 #endif
 230 #ifdef TFM_MUL9
 231         if (y == 9) {
 232            fp_mul_comba9(A,B,C);
 233            return;
 234         }
 235 #endif
 236 #ifdef TFM_MUL12
 237         if (y <= 12) {
 238            fp_mul_comba12(A,B,C);
 239            return;
 240         }
 241 #endif
 242 #ifdef TFM_MUL17
 243         if (y <= 17) {
 244            fp_mul_comba17(A,B,C);
 245            return;
 246         }
 247 #endif
 248
 249 #ifdef TFM_SMALL_SET
 250         if (y <= 16) {
 251            fp_mul_comba_small(A,B,C);
 252            return;
 253         }
 254 #endif
 255 #if defined(TFM_MUL20)
 256         if (y <= 20) {
 257            fp_mul_comba20(A,B,C);
 258            return;
 259         }
 260 #endif
 261 #if defined(TFM_MUL24)
 262         if (yy >= 16 && y <= 24) {
 263            fp_mul_comba24(A,B,C);
 264            return;
 265         }
 266 #endif
 267 #if defined(TFM_MUL28)
 268         if (yy >= 20 && y <= 28) {
 269            fp_mul_comba28(A,B,C);
 270            return;
 271         }
 272 #endif
 273 #if defined(TFM_MUL32)
 274         if (yy >= 24 && y <= 32) {
 275            fp_mul_comba32(A,B,C);
 276            return;
 277         }
 278 #endif
 279 #if defined(TFM_MUL48)
 280         if (yy >= 40 && y <= 48) {
 281            fp_mul_comba48(A,B,C);
 282            return;
 283         }
 284 #endif
 285 #if defined(TFM_MUL64)
 286         if (yy >= 56 && y <= 64) {
 287            fp_mul_comba64(A,B,C);
 288            return;
 289         }
 290 #endif
 291         fp_mul_comba(A,B,C);
 292 }
 293
 294 void fp_mul_2(fp_int * a, fp_int * b)
 295 {
 296   int     x, oldused;
 297
 298   oldused = b->used;
 299   b->used = a->used;
 300
 301   {
 302     register fp_digit r, rr, *tmpa, *tmpb;
 303
 304     /* alias for source */
 305     tmpa = a->dp;
 306
 307     /* alias for dest */
 308     tmpb = b->dp;
 309
 310     /* carry */
 311     r = 0;
 312     for (x = 0; x < a->used; x++) {
 313
 314       /* get what will be the *next* carry bit from the
 315        * MSB of the current digit
 316        */
 317       rr = *tmpa >> ((fp_digit)(DIGIT_BIT - 1));
 318
 319       /* now shift up this digit, add in the carry [from the previous] */
 320       *tmpb++ = ((*tmpa++ << ((fp_digit)1)) | r);
 321
 322       /* copy the carry that would be from the source
 323        * digit into the next iteration
 324        */
 325       r = rr;
 326     }
 327
 328     /* new leading digit? */
 329     if (r != 0 && b->used != (FP_SIZE-1)) {
 330       /* add a MSB which is always 1 at this point */
 331       *tmpb = 1;
 332       ++(b->used);
 333     }
 334
 335     /* now zero any excess digits on the destination
 336      * that we didn't write to
 337      */
 338     tmpb = b->dp + b->used;
 339     for (x = b->used; x < oldused; x++) {
 340       *tmpb++ = 0;
 341     }
 342   }
 343   b->sign = a->sign;
 344 }
 345
 346 /* c = a * b */
 347 void fp_mul_d(fp_int *a, fp_digit b, fp_int *c)
 348 {
 349    fp_word  w;
 350    int      x, oldused;
 351
 352    oldused = c->used;
 353    c->used = a->used;
 354    c->sign = a->sign;
 355    w       = 0;
 356    for (x = 0; x < a->used; x++) {
 357        w         = ((fp_word)a->dp[x]) * ((fp_word)b) + w;
 358        c->dp[x]  = (fp_digit)w;
 359        w         = w >> DIGIT_BIT;
 360    }
 361    if (w != 0 && (a->used != FP_SIZE)) {
 362       c->dp[c->used++] = (fp_digit) w;
 363       ++x;
 364    }
 365    for (; x < oldused; x++) {
 366       c->dp[x] = 0;
 367    }
 368    fp_clamp(c);
 369 }
 370
 371 /* c = a * 2**d */
 372 void fp_mul_2d(fp_int *a, int b, fp_int *c)
 373 {
 374    fp_digit carry, carrytmp, shift;
 375    int x;
 376
 377    /* copy it */
 378    fp_copy(a, c);
 379
 380    /* handle whole digits */
 381    if (b >= DIGIT_BIT) {
 382       fp_lshd(c, b/DIGIT_BIT);
 383    }
 384    b %= DIGIT_BIT;
 385
 386    /* shift the digits */
 387    if (b != 0) {
 388       carry = 0;
 389       shift = DIGIT_BIT - b;
 390       for (x = 0; x < c->used; x++) {
 391           carrytmp = c->dp[x] >> shift;
 392           c->dp[x] = (c->dp[x] << b) + carry;
 393           carry = carrytmp;
 394       }
 395       /* store last carry if room */
 396       if (carry && x < FP_SIZE) {
 397          c->dp[c->used++] = carry;
 398       }
 399    }
 400    fp_clamp(c);
 401 }
 402
 403 /* generic PxQ multiplier */
 404 void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
 405 {
 406    int       ix, iy, iz, tx, ty, pa;
 407    fp_digit  c0, c1, c2, *tmpx, *tmpy;
 408    fp_int    tmp, *dst;
 409
 410    COMBA_START;
 411    COMBA_CLEAR;
 412
 413    /* get size of output and trim */
 414    pa = A->used + B->used;
 415    if (pa >= FP_SIZE) {
 416       pa = FP_SIZE-1;
 417    }
 418
 419    if (A == C || B == C) {
 420       fp_zero(&tmp);
 421       dst = &tmp;
 422    } else {
 423       fp_zero(C);
 424       dst = C;
 425    }
 426
 427    for (ix = 0; ix < pa; ix++) {
 428       /* get offsets into the two bignums */
 429       ty = MIN(ix, B->used-1);
 430       tx = ix - ty;
 431
 432       /* setup temp aliases */
 433       tmpx = A->dp + tx;
 434       tmpy = B->dp + ty;
 435
 436       /* this is the number of times the loop will iterrate, essentially its
 437          while (tx++ < a->used && ty-- >= 0) { ... }
 438        */
 439       iy = MIN(A->used-tx, ty+1);
 440
 441       /* execute loop */
 442       COMBA_FORWARD;
 443       for (iz = 0; iz < iy; ++iz) {
 444           /* TAO change COMBA_ADD back to MULADD */
 445           MULADD(*tmpx++, *tmpy--);
 446       }
 447
 448       /* store term */
 449       COMBA_STORE(dst->dp[ix]);
 450   }
 451   COMBA_FINI;
 452
 453   dst->used = pa;
 454   dst->sign = A->sign ^ B->sign;
 455   fp_clamp(dst);
 456   fp_copy(dst, C);
 457 }
 458
 459 /* a/b => cb + d == a */
 460 int fp_div(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 461 {
 462   fp_int  q, x, y, t1, t2;
 463   int     n, t, i, norm, neg;
 464
 465   /* is divisor zero ? */
 466   if (fp_iszero (b) == 1) {
 467     return FP_VAL;
 468   }
 469
 470   /* if a < b then q=0, r = a */
 471   if (fp_cmp_mag (a, b) == FP_LT) {
 472     if (d != NULL) {
 473       fp_copy (a, d);
 474     }
 475     if (c != NULL) {
 476       fp_zero (c);
 477     }
 478     return FP_OKAY;
 479   }
 480
 481   fp_init(&q);
 482   q.used = a->used + 2;
 483
 484   fp_init(&t1);
 485   fp_init(&t2);
 486   fp_init_copy(&x, a);
 487   fp_init_copy(&y, b);
 488
 489   /* fix the sign */
 490   neg = (a->sign == b->sign) ? FP_ZPOS : FP_NEG;
 491   x.sign = y.sign = FP_ZPOS;
 492
 493   /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
 494   norm = fp_count_bits(&y) % DIGIT_BIT;
 495   if (norm < (int)(DIGIT_BIT-1)) {
 496      norm = (DIGIT_BIT-1) - norm;
 497      fp_mul_2d (&x, norm, &x);
 498      fp_mul_2d (&y, norm, &y);
 499   } else {
 500      norm = 0;
 501   }
 502
 503   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
 504   n = x.used - 1;
 505   t = y.used - 1;
 506
 507   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
 508   fp_lshd (&y, n - t);                                             /* y = y*b**{n-t} */
 509
 510   while (fp_cmp (&x, &y) != FP_LT) {
 511     ++(q.dp[n - t]);
 512     fp_sub (&x, &y, &x);
 513   }
 514
 515   /* reset y by shifting it back down */
 516   fp_rshd (&y, n - t);
 517
 518   /* step 3. for i from n down to (t + 1) */
 519   for (i = n; i >= (t + 1); i--) {
 520     if (i > x.used) {
 521       continue;
 522     }
 523
 524     /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
 525      * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
 526     if (x.dp[i] == y.dp[t]) {
 527       q.dp[i - t - 1] = ((((fp_word)1) << DIGIT_BIT) - 1);
 528     } else {
 529       fp_word tmp;
 530       tmp = ((fp_word) x.dp[i]) << ((fp_word) DIGIT_BIT);
 531       tmp |= ((fp_word) x.dp[i - 1]);
 532       tmp /= ((fp_word)y.dp[t]);
 533       q.dp[i - t - 1] = (fp_digit) (tmp);
 534     }
 535
 536     /* while (q{i-t-1} * (yt * b + y{t-1})) >
 537              xi * b**2 + xi-1 * b + xi-2
 538
 539        do q{i-t-1} -= 1;
 540     */
 541     q.dp[i - t - 1] = (q.dp[i - t - 1] + 1);
 542     do {
 543       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1);
 544
 545       /* find left hand */
 546       fp_zero (&t1);
 547       t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
 548       t1.dp[1] = y.dp[t];
 549       t1.used = 2;
 550       fp_mul_d (&t1, q.dp[i - t - 1], &t1);
 551
 552       /* find right hand */
 553       t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
 554       t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
 555       t2.dp[2] = x.dp[i];
 556       t2.used = 3;
 557     } while (fp_cmp_mag(&t1, &t2) == FP_GT);
 558
 559     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
 560     fp_mul_d (&y, q.dp[i - t - 1], &t1);
 561     fp_lshd  (&t1, i - t - 1);
 562     fp_sub   (&x, &t1, &x);
 563
 564     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
 565     if (x.sign == FP_NEG) {
 566       fp_copy (&y, &t1);
 567       fp_lshd (&t1, i - t - 1);
 568       fp_add (&x, &t1, &x);
 569       q.dp[i - t - 1] = q.dp[i - t - 1] - 1;
 570     }
 571   }
 572
 573   /* now q is the quotient and x is the remainder
 574    * [which we have to normalize]
 575    */
 576
 577   /* get sign before writing to c */
 578   x.sign = x.used == 0 ? FP_ZPOS : a->sign;
 579
 580   if (c != NULL) {
 581     fp_clamp (&q);
 582     fp_copy (&q, c);
 583     c->sign = neg;
 584   }
 585
 586   if (d != NULL) {
 587     fp_div_2d (&x, norm, &x, NULL);
 588
 589 /* the following is a kludge, essentially we were seeing the right remainder but
 590    with excess digits that should have been zero
 591  */
 592     for (i = b->used; i < x.used; i++) {
 593         x.dp[i] = 0;
 594     }
 595     fp_clamp(&x);
 596     fp_copy (&x, d);
 597   }
 598
 599   return FP_OKAY;
 600 }
 601
 602 /* b = a/2 */
 603 void fp_div_2(fp_int * a, fp_int * b)
 604 {
 605   int     x, oldused;
 606
 607   oldused = b->used;
 608   b->used = a->used;
 609   {
 610     register fp_digit r, rr, *tmpa, *tmpb;
 611
 612     /* source alias */
 613     tmpa = a->dp + b->used - 1;
 614
 615     /* dest alias */
 616     tmpb = b->dp + b->used - 1;
 617
 618     /* carry */
 619     r = 0;
 620     for (x = b->used - 1; x >= 0; x--) {
 621       /* get the carry for the next iteration */
 622       rr = *tmpa & 1;
 623
 624       /* shift the current digit, add in carry and store */
 625       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
 626
 627       /* forward carry to next iteration */
 628       r = rr;
 629     }
 630
 631     /* zero excess digits */
 632     tmpb = b->dp + b->used;
 633     for (x = b->used; x < oldused; x++) {
 634       *tmpb++ = 0;
 635     }
 636   }
 637   b->sign = a->sign;
 638   fp_clamp (b);
 639 }
 640
 641 /* c = a / 2**b */
 642 void fp_div_2d(fp_int *a, int b, fp_int *c, fp_int *d)
 643 {
 644   fp_digit D, r, rr;
 645   int      x;
 646   fp_int   t;
 647
 648   /* if the shift count is <= 0 then we do no work */
 649   if (b <= 0) {
 650     fp_copy (a, c);
 651     if (d != NULL) {
 652       fp_zero (d);
 653     }
 654     return;
 655   }
 656
 657   fp_init(&t);
 658
 659   /* get the remainder */
 660   if (d != NULL) {
 661     fp_mod_2d (a, b, &t);
 662   }
 663
 664   /* copy */
 665   fp_copy(a, c);
 666
 667   /* shift by as many digits in the bit count */
 668   if (b >= (int)DIGIT_BIT) {
 669     fp_rshd (c, b / DIGIT_BIT);
 670   }
 671
 672   /* shift any bit count < DIGIT_BIT */
 673   D = (fp_digit) (b % DIGIT_BIT);
 674   if (D != 0) {
 675     register fp_digit *tmpc, mask, shift;
 676
 677     /* mask */
 678     mask = (((fp_digit)1) << D) - 1;
 679
 680     /* shift for lsb */
 681     shift = DIGIT_BIT - D;
 682
 683     /* alias */
 684     tmpc = c->dp + (c->used - 1);
 685
 686     /* carry */
 687     r = 0;
 688     for (x = c->used - 1; x >= 0; x--) {
 689       /* get the lower  bits of this word in a temp */
 690       rr = *tmpc & mask;
 691
 692       /* shift the current word and mix in the carry bits from the previous word */
 693       *tmpc = (*tmpc >> D) | (r << shift);
 694       --tmpc;
 695
 696       /* set the carry to the carry bits of the current word found above */
 697       r = rr;
 698     }
 699   }
 700   fp_clamp (c);
 701   if (d != NULL) {
 702     fp_copy (&t, d);
 703   }
 704 }
 705
 706 /* c = a mod b, 0 <= c < b  */
 707 int fp_mod(fp_int *a, fp_int *b, fp_int *c)
 708 {
 709    fp_int t;
 710    int    err;
 711
 712    fp_zero(&t);
 713    if ((err = fp_div(a, b, NULL, &t)) != FP_OKAY) {
 714       return err;
 715    }
 716    if (t.sign != b->sign) {
 717       fp_add(&t, b, c);
 718    } else {
 719       fp_copy(&t, c);
 720   }
 721   return FP_OKAY;
 722 }
 723
 724 /* c = a mod 2**d */
 725 void fp_mod_2d(fp_int *a, int b, fp_int *c)
 726 {
 727    int x;
 728
 729    /* zero if count less than or equal to zero */
 730    if (b <= 0) {
 731       fp_zero(c);
 732       return;
 733    }
 734
 735    /* get copy of input */
 736    fp_copy(a, c);
 737
 738    /* if 2**d is larger than we just return */
 739    if (b >= (DIGIT_BIT * a->used)) {
 740       return;
 741    }
 742
 743   /* zero digits above the last digit of the modulus */
 744   for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
 745     c->dp[x] = 0;
 746   }
 747   /* clear the digit that is not completely outside/inside the modulus */
 748   c->dp[b / DIGIT_BIT] &= ~((fp_digit)0) >> (DIGIT_BIT - b);
 749   fp_clamp (c);
 750 }
 751
 752 static int fp_invmod_slow (fp_int * a, fp_int * b, fp_int * c)
 753 {
 754   fp_int  x, y, u, v, A, B, C, D;
 755   int     res;
 756
 757   /* b cannot be negative */
 758   if (b->sign == FP_NEG || fp_iszero(b) == 1) {
 759     return FP_VAL;
 760   }
 761
 762   /* init temps */
 763   fp_init(&x);    fp_init(&y);
 764   fp_init(&u);    fp_init(&v);
 765   fp_init(&A);    fp_init(&B);
 766   fp_init(&C);    fp_init(&D);
 767
 768   /* x = a, y = b */
 769   if ((res = fp_mod(a, b, &x)) != FP_OKAY) {
 770       return res;
 771   }
 772   fp_copy(b, &y);
 773
 774   /* 2. [modified] if x,y are both even then return an error! */
 775   if (fp_iseven (&x) == 1 && fp_iseven (&y) == 1) {
 776     return FP_VAL;
 777   }
 778
 779   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
 780   fp_copy (&x, &u);
 781   fp_copy (&y, &v);
 782   fp_set (&A, 1);
 783   fp_set (&D, 1);
 784
 785 top:
 786   /* 4.  while u is even do */
 787   while (fp_iseven (&u) == 1) {
 788     /* 4.1 u = u/2 */
 789     fp_div_2 (&u, &u);
 790
 791     /* 4.2 if A or B is odd then */
 792     if (fp_isodd (&A) == 1 || fp_isodd (&B) == 1) {
 793       /* A = (A+y)/2, B = (B-x)/2 */
 794       fp_add (&A, &y, &A);
 795       fp_sub (&B, &x, &B);
 796     }
 797     /* A = A/2, B = B/2 */
 798     fp_div_2 (&A, &A);
 799     fp_div_2 (&B, &B);
 800   }
 801
 802   /* 5.  while v is even do */
 803   while (fp_iseven (&v) == 1) {
 804     /* 5.1 v = v/2 */
 805     fp_div_2 (&v, &v);
 806
 807     /* 5.2 if C or D is odd then */
 808     if (fp_isodd (&C) == 1 || fp_isodd (&D) == 1) {
 809       /* C = (C+y)/2, D = (D-x)/2 */
 810       fp_add (&C, &y, &C);
 811       fp_sub (&D, &x, &D);
 812     }
 813     /* C = C/2, D = D/2 */
 814     fp_div_2 (&C, &C);
 815     fp_div_2 (&D, &D);
 816   }
 817
 818   /* 6.  if u >= v then */
 819   if (fp_cmp (&u, &v) != FP_LT) {
 820     /* u = u - v, A = A - C, B = B - D */
 821     fp_sub (&u, &v, &u);
 822     fp_sub (&A, &C, &A);
 823     fp_sub (&B, &D, &B);
 824   } else {
 825     /* v - v - u, C = C - A, D = D - B */
 826     fp_sub (&v, &u, &v);
 827     fp_sub (&C, &A, &C);
 828     fp_sub (&D, &B, &D);
 829   }
 830
 831   /* if not zero goto step 4 */
 832   if (fp_iszero (&u) == 0)
 833     goto top;
 834
 835   /* now a = C, b = D, gcd == g*v */
 836
 837   /* if v != 1 then there is no inverse */
 838   if (fp_cmp_d (&v, 1) != FP_EQ) {
 839     return FP_VAL;
 840   }
 841
 842   /* if its too low */
 843   while (fp_cmp_d(&C, 0) == FP_LT) {
 844       fp_add(&C, b, &C);
 845   }
 846
 847   /* too big */
 848   while (fp_cmp_mag(&C, b) != FP_LT) {
 849       fp_sub(&C, b, &C);
 850   }
 851
 852   /* C is now the inverse */
 853   fp_copy(&C, c);
 854   return FP_OKAY;
 855 }
 856
 857 /* c = 1/a (mod b) for odd b only */
 858 int fp_invmod(fp_int *a, fp_int *b, fp_int *c)
 859 {
 860   fp_int  x, y, u, v, B, D;
 861   int     neg;
 862
 863   /* 2. [modified] b must be odd   */
 864   if (fp_iseven (b) == FP_YES) {
 865     return fp_invmod_slow(a,b,c);
 866   }
 867
 868   /* init all our temps */
 869   fp_init(&x);  fp_init(&y);
 870   fp_init(&u);  fp_init(&v);
 871   fp_init(&B);  fp_init(&D);
 872
 873   /* x == modulus, y == value to invert */
 874   fp_copy(b, &x);
 875
 876   /* we need y = |a| */
 877   fp_abs(a, &y);
 878
 879   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
 880   fp_copy(&x, &u);
 881   fp_copy(&y, &v);
 882   fp_set (&D, 1);
 883
 884 top:
 885   /* 4.  while u is even do */
 886   while (fp_iseven (&u) == FP_YES) {
 887     /* 4.1 u = u/2 */
 888     fp_div_2 (&u, &u);
 889
 890     /* 4.2 if B is odd then */
 891     if (fp_isodd (&B) == FP_YES) {
 892       fp_sub (&B, &x, &B);
 893     }
 894     /* B = B/2 */
 895     fp_div_2 (&B, &B);
 896   }
 897
 898   /* 5.  while v is even do */
 899   while (fp_iseven (&v) == FP_YES) {
 900     /* 5.1 v = v/2 */
 901     fp_div_2 (&v, &v);
 902
 903     /* 5.2 if D is odd then */
 904     if (fp_isodd (&D) == FP_YES) {
 905       /* D = (D-x)/2 */
 906       fp_sub (&D, &x, &D);
 907     }
 908     /* D = D/2 */
 909     fp_div_2 (&D, &D);
 910   }
 911
 912   /* 6.  if u >= v then */
 913   if (fp_cmp (&u, &v) != FP_LT) {
 914     /* u = u - v, B = B - D */
 915     fp_sub (&u, &v, &u);
 916     fp_sub (&B, &D, &B);
 917   } else {
 918     /* v - v - u, D = D - B */
 919     fp_sub (&v, &u, &v);
 920     fp_sub (&D, &B, &D);
 921   }
 922
 923   /* if not zero goto step 4 */
 924   if (fp_iszero (&u) == FP_NO) {
 925     goto top;
 926   }
 927
 928   /* now a = C, b = D, gcd == g*v */
 929
 930   /* if v != 1 then there is no inverse */
 931   if (fp_cmp_d (&v, 1) != FP_EQ) {
 932     return FP_VAL;
 933   }
 934
 935   /* b is now the inverse */
 936   neg = a->sign;
 937   while (D.sign == FP_NEG) {
 938     fp_add (&D, b, &D);
 939   }
 940   fp_copy (&D, c);
 941   c->sign = neg;
 942   return FP_OKAY;
 943 }
 944
 945 /* d = a * b (mod c) */
 946 int fp_mulmod(fp_int *a, fp_int *b, fp_int *c, fp_int *d)
 947 {
 948   fp_int tmp;
 949   fp_zero(&tmp);
 950   fp_mul(a, b, &tmp);
 951   return fp_mod(&tmp, c, d);
 952 }
 953
 954 #ifdef TFM_TIMING_RESISTANT
 955
 956 /* timing resistant montgomery ladder based exptmod
 957
 958    Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder", Cryptographic Hardware and Embedded Systems, CHES 2002
 959 */
 960 static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
 961 {
 962   fp_int   R[2];
 963   fp_digit buf, mp;
 964   int      err, bitcnt, digidx, y;
 965
 966   /* now setup montgomery  */
 967   if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
 968      return err;
 969   }
 970
 971   fp_init(&R[0]);
 972   fp_init(&R[1]);
 973
 974   /* now we need R mod m */
 975   fp_montgomery_calc_normalization (&R[0], P);
 976
 977   /* now set R[0][1] to G * R mod m */
 978   if (fp_cmp_mag(P, G) != FP_GT) {
 979      /* G > P so we reduce it first */
 980      fp_mod(G, P, &R[1]);
 981   } else {
 982      fp_copy(G, &R[1]);
 983   }
 984   fp_mulmod (&R[1], &R[0], P, &R[1]);
 985
 986   /* for j = t-1 downto 0 do
 987         r_!k = R0*R1; r_k = r_k^2
 988   */
 989
 990   /* set initial mode and bit cnt */
 991   bitcnt = 1;
 992   buf    = 0;
 993   digidx = X->used - 1;
 994
 995   for (;;) {
 996     /* grab next digit as required */
 997     if (--bitcnt == 0) {
 998       /* if digidx == -1 we are out of digits so break */
 999       if (digidx == -1) {
1000         break;
1001       }
1002       /* read next digit and reset bitcnt */
1003       buf    = X->dp[digidx--];
1004       bitcnt = (int)DIGIT_BIT;
1005     }
1006
1007     /* grab the next msb from the exponent */
1008     y     = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
1009     buf <<= (fp_digit)1;
1010
1011     /* do ops */
1012     fp_mul(&R[0], &R[1], &R[y^1]); fp_montgomery_reduce(&R[y^1], P, mp);
1013     fp_sqr(&R[y], &R[y]);          fp_montgomery_reduce(&R[y], P, mp);
1014   }
1015
1016    fp_montgomery_reduce(&R[0], P, mp);
1017    fp_copy(&R[0], Y);
1018    return FP_OKAY;
1019 }
1020
1021 #else
1022
1023 /* y = g**x (mod b)
1024  * Some restrictions... x must be positive and < b
1025  */
1026 static int _fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
1027 {
1028   fp_int   M[64], res;
1029   fp_digit buf, mp;
1030   int      err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1031
1032   /* find window size */
1033   x = fp_count_bits (X);
1034   if (x <= 21) {
1035     winsize = 1;
1036   } else if (x <= 36) {
1037     winsize = 3;
1038   } else if (x <= 140) {
1039     winsize = 4;
1040   } else if (x <= 450) {
1041     winsize = 5;
1042   } else {
1043     winsize = 6;
1044   }
1045
1046   /* init M array */
1047   XMEMSET(M, 0, sizeof(M));
1048
1049   /* now setup montgomery  */
1050   if ((err = fp_montgomery_setup (P, &mp)) != FP_OKAY) {
1051      return err;
1052   }
1053
1054   /* setup result */
1055   fp_init(&res);
1056
1057   /* create M table
1058    *
1059    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
1060    *
1061    * The first half of the table is not computed though accept for M[0] and M[1]
1062    */
1063
1064    /* now we need R mod m */
1065    fp_montgomery_calc_normalization (&res, P);
1066
1067    /* now set M[1] to G * R mod m */
1068    if (fp_cmp_mag(P, G) != FP_GT) {
1069       /* G > P so we reduce it first */
1070       fp_mod(G, P, &M[1]);
1071    } else {
1072       fp_copy(G, &M[1]);
1073    }
1074    fp_mulmod (&M[1], &res, P, &M[1]);
1075
1076   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
1077   fp_copy (&M[1], &M[1 << (winsize - 1)]);
1078   for (x = 0; x < (winsize - 1); x++) {
1079     fp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)]);
1080     fp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp);
1081   }
1082
1083   /* create upper table */
1084   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1085     fp_mul(&M[x - 1], &M[1], &M[x]);
1086     fp_montgomery_reduce(&M[x], P, mp);
1087   }
1088
1089   /* set initial mode and bit cnt */
1090   mode   = 0;
1091   bitcnt = 1;
1092   buf    = 0;
1093   digidx = X->used - 1;
1094   bitcpy = 0;
1095   bitbuf = 0;
1096
1097   for (;;) {
1098     /* grab next digit as required */
1099     if (--bitcnt == 0) {
1100       /* if digidx == -1 we are out of digits so break */
1101       if (digidx == -1) {
1102         break;
1103       }
1104       /* read next digit and reset bitcnt */
1105       buf    = X->dp[digidx--];
1106       bitcnt = (int)DIGIT_BIT;
1107     }
1108
1109     /* grab the next msb from the exponent */
1110     y     = (fp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
1111     buf <<= (fp_digit)1;
1112
1113     /* if the bit is zero and mode == 0 then we ignore it
1114      * These represent the leading zero bits before the first 1 bit
1115      * in the exponent.  Technically this opt is not required but it
1116      * does lower the # of trivial squaring/reductions used
1117      */
1118     if (mode == 0 && y == 0) {
1119       continue;
1120     }
1121
1122     /* if the bit is zero and mode == 1 then we square */
1123     if (mode == 1 && y == 0) {
1124       fp_sqr(&res, &res);
1125       fp_montgomery_reduce(&res, P, mp);
1126       continue;
1127     }
1128
1129     /* else we add it to the window */
1130     bitbuf |= (y << (winsize - ++bitcpy));
1131     mode    = 2;
1132
1133     if (bitcpy == winsize) {
1134       /* ok window is filled so square as required and multiply  */
1135       /* square first */
1136       for (x = 0; x < winsize; x++) {
1137         fp_sqr(&res, &res);
1138         fp_montgomery_reduce(&res, P, mp);
1139       }
1140
1141       /* then multiply */
1142       fp_mul(&res, &M[bitbuf], &res);
1143       fp_montgomery_reduce(&res, P, mp);
1144
1145       /* empty window and reset */
1146       bitcpy = 0;
1147       bitbuf = 0;
1148       mode   = 1;
1149     }
1150   }
1151
1152   /* if bits remain then square/multiply */
1153   if (mode == 2 && bitcpy > 0) {
1154     /* square then multiply if the bit is set */
1155     for (x = 0; x < bitcpy; x++) {
1156       fp_sqr(&res, &res);
1157       fp_montgomery_reduce(&res, P, mp);
1158
1159       /* get next bit of the window */
1160       bitbuf <<= 1;
1161       if ((bitbuf & (1 << winsize)) != 0) {
1162         /* then multiply */
1163         fp_mul(&res, &M[1], &res);
1164         fp_montgomery_reduce(&res, P, mp);
1165       }
1166     }
1167   }
1168
1169   /* fixup result if Montgomery reduction is used
1170    * recall that any value in a Montgomery system is
1171    * actually multiplied by R mod n.  So we have
1172    * to reduce one more time to cancel out the factor
1173    * of R.
1174    */
1175   fp_montgomery_reduce(&res, P, mp);
1176
1177   /* swap res with Y */
1178   fp_copy (&res, Y);
1179   return FP_OKAY;
1180 }
1181
1182 #endif
1183
1184 int fp_exptmod(fp_int * G, fp_int * X, fp_int * P, fp_int * Y)
1185 {
1186    fp_int tmp;
1187    int    err;
1188
1189    /* prevent overflows */
1190    if (P->used > (FP_SIZE/2)) {
1191       return FP_VAL;
1192    }
1193
1194    /* is X negative?  */
1195    if (X->sign == FP_NEG) {
1196       /* yes, copy G and invmod it */
1197       fp_copy(G, &tmp);
1198       if ((err = fp_invmod(&tmp, P, &tmp)) != FP_OKAY) {
1199          return err;
1200       }
1201       X->sign = FP_ZPOS;
1202       err =  _fp_exptmod(&tmp, X, P, Y);
1203       if (X != Y) {
1204          X->sign = FP_NEG;
1205       }
1206       return err;
1207    } else {
1208       /* Positive exponent so just exptmod */
1209       return _fp_exptmod(G, X, P, Y);
1210    }
1211 }
1212
1213 /* computes a = 2**b */
1214 void fp_2expt(fp_int *a, int b)
1215 {
1216    int     z;
1217
1218    /* zero a as per default */
1219    fp_zero (a);
1220
1221    if (b < 0) {
1222       return;
1223    }
1224
1225    z = b / DIGIT_BIT;
1226    if (z >= FP_SIZE) {
1227       return;
1228    }
1229
1230   /* set the used count of where the bit will go */
1231   a->used = z + 1;
1232
1233   /* put the single bit in its place */
1234   a->dp[z] = ((fp_digit)1) << (b % DIGIT_BIT);
1235 }
1236
1237 /* b = a*a  */
1238 void fp_sqr(fp_int *A, fp_int *B)
1239 {
1240     int y = A->used;
1241
1242     /* call generic if we're out of range */
1243     if (y + y > FP_SIZE) {
1244        fp_sqr_comba(A, B);
1245        return ;
1246     }
1247
1248 #if defined(TFM_SQR3)
1249         if (y <= 3) {
1250            fp_sqr_comba3(A,B);
1251            return;
1252         }
1253 #endif
1254 #if defined(TFM_SQR4)
1255         if (y == 4) {
1256            fp_sqr_comba4(A,B);
1257            return;
1258         }
1259 #endif
1260 #if defined(TFM_SQR6)
1261         if (y <= 6) {
1262            fp_sqr_comba6(A,B);
1263            return;
1264         }
1265 #endif
1266 #if defined(TFM_SQR7)
1267         if (y == 7) {
1268            fp_sqr_comba7(A,B);
1269            return;
1270         }
1271 #endif
1272 #if defined(TFM_SQR8)
1273         if (y == 8) {
1274            fp_sqr_comba8(A,B);
1275            return;
1276         }
1277 #endif
1278 #if defined(TFM_SQR9)
1279         if (y == 9) {
1280            fp_sqr_comba9(A,B);
1281            return;
1282         }
1283 #endif
1284 #if defined(TFM_SQR12)
1285         if (y <= 12) {
1286            fp_sqr_comba12(A,B);
1287            return;
1288         }
1289 #endif
1290 #if defined(TFM_SQR17)
1291         if (y <= 17) {
1292            fp_sqr_comba17(A,B);
1293            return;
1294         }
1295 #endif
1296 #if defined(TFM_SMALL_SET)
1297         if (y <= 16) {
1298            fp_sqr_comba_small(A,B);
1299            return;
1300         }
1301 #endif
1302 #if defined(TFM_SQR20)
1303         if (y <= 20) {
1304            fp_sqr_comba20(A,B);
1305            return;
1306         }
1307 #endif
1308 #if defined(TFM_SQR24)
1309         if (y <= 24) {
1310            fp_sqr_comba24(A,B);
1311            return;
1312         }
1313 #endif
1314 #if defined(TFM_SQR28)
1315         if (y <= 28) {
1316            fp_sqr_comba28(A,B);
1317            return;
1318         }
1319 #endif
1320 #if defined(TFM_SQR32)
1321         if (y <= 32) {
1322            fp_sqr_comba32(A,B);
1323            return;
1324         }
1325 #endif
1326 #if defined(TFM_SQR48)
1327         if (y <= 48) {
1328            fp_sqr_comba48(A,B);
1329            return;
1330         }
1331 #endif
1332 #if defined(TFM_SQR64)
1333         if (y <= 64) {
1334            fp_sqr_comba64(A,B);
1335            return;
1336         }
1337 #endif
1338        fp_sqr_comba(A, B);
1339 }
1340
1341 /* generic comba squarer */
1342 void fp_sqr_comba(fp_int *A, fp_int *B)
1343 {
1344   int       pa, ix, iz;
1345   fp_digit  c0, c1, c2;
1346   fp_int    tmp, *dst;
1347 #ifdef TFM_ISO
1348   fp_word   tt;
1349 #endif
1350
1351   /* get size of output and trim */
1352   pa = A->used + A->used;
1353   if (pa >= FP_SIZE) {
1354      pa = FP_SIZE-1;
1355   }
1356
1357   /* number of output digits to produce */
1358   COMBA_START;
1359   COMBA_CLEAR;
1360
1361   if (A == B) {
1362      fp_zero(&tmp);
1363      dst = &tmp;
1364   } else {
1365      fp_zero(B);
1366      dst = B;
1367   }
1368
1369   for (ix = 0; ix < pa; ix++) {
1370       int      tx, ty, iy;
1371       fp_digit *tmpy, *tmpx;
1372
1373       /* get offsets into the two bignums */
1374       ty = MIN(A->used-1, ix);
1375       tx = ix - ty;
1376
1377       /* setup temp aliases */
1378       tmpx = A->dp + tx;
1379       tmpy = A->dp + ty;
1380
1381       /* this is the number of times the loop will iterrate,
1382          while (tx++ < a->used && ty-- >= 0) { ... }
1383        */
1384       iy = MIN(A->used-tx, ty+1);
1385
1386       /* now for squaring tx can never equal ty
1387        * we halve the distance since they approach
1388        * at a rate of 2x and we have to round because
1389        * odd cases need to be executed
1390        */
1391       iy = MIN(iy, (ty-tx+1)>>1);
1392
1393       /* forward carries */
1394       COMBA_FORWARD;
1395
1396       /* execute loop */
1397       for (iz = 0; iz < iy; iz++) {
1398           SQRADD2(*tmpx++, *tmpy--);
1399       }
1400
1401       /* even columns have the square term in them */
1402       if ((ix&1) == 0) {
1403           /* TAO change COMBA_ADD back to SQRADD */
1404           SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
1405       }
1406
1407       /* store it */
1408       COMBA_STORE(dst->dp[ix]);
1409   }
1410
1411   COMBA_FINI;
1412
1413   /* setup dest */
1414   dst->used = pa;
1415   fp_clamp (dst);
1416   if (dst != B) {
1417      fp_copy(dst, B);
1418   }
1419 }
1420
1421 int fp_cmp(fp_int *a, fp_int *b)
1422 {
1423    if (a->sign == FP_NEG && b->sign == FP_ZPOS) {
1424       return FP_LT;
1425    } else if (a->sign == FP_ZPOS && b->sign == FP_NEG) {
1426       return FP_GT;
1427    } else {
1428       /* compare digits */
1429       if (a->sign == FP_NEG) {
1430          /* if negative compare opposite direction */
1431          return fp_cmp_mag(b, a);
1432       } else {
1433          return fp_cmp_mag(a, b);
1434       }
1435    }
1436 }
1437
1438 /* compare against a single digit */
1439 int fp_cmp_d(fp_int *a, fp_digit b)
1440 {
1441   /* compare based on sign */
1442   if ((b && a->used == 0) || a->sign == FP_NEG) {
1443     return FP_LT;
1444   }
1445
1446   /* compare based on magnitude */
1447   if (a->used > 1) {
1448     return FP_GT;
1449   }
1450
1451   /* compare the only digit of a to b */
1452   if (a->dp[0] > b) {
1453     return FP_GT;
1454   } else if (a->dp[0] < b) {
1455     return FP_LT;
1456   } else {
1457     return FP_EQ;
1458   }
1459
1460 }
1461
1462 int fp_cmp_mag(fp_int *a, fp_int *b)
1463 {
1464    int x;
1465
1466    if (a->used > b->used) {
1467       return FP_GT;
1468    } else if (a->used < b->used) {
1469       return FP_LT;
1470    } else {
1471       for (x = a->used - 1; x >= 0; x--) {
1472           if (a->dp[x] > b->dp[x]) {
1473              return FP_GT;
1474           } else if (a->dp[x] < b->dp[x]) {
1475              return FP_LT;
1476           }
1477       }
1478    }
1479    return FP_EQ;
1480 }
1481
1482 /* setups the montgomery reduction */
1483 int fp_montgomery_setup(fp_int *a, fp_digit *rho)
1484 {
1485   fp_digit x, b;
1486
1487 /* fast inversion mod 2**k
1488  *
1489  * Based on the fact that
1490  *
1491  * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
1492  *                    =>  2*X*A - X*X*A*A = 1
1493  *                    =>  2*(1) - (1)     = 1
1494  */
1495   b = a->dp[0];
1496
1497   if ((b & 1) == 0) {
1498     return FP_VAL;
1499   }
1500
1501   x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
1502   x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
1503   x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
1504   x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
1505 #ifdef FP_64BIT
1506   x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
1507 #endif
1508
1509   /* rho = -1/m mod b */
1510   *rho = (fp_digit) (((fp_word) 1 << ((fp_word) DIGIT_BIT)) - ((fp_word)x));
1511
1512   return FP_OKAY;
1513 }
1514
1515 /* computes a = B**n mod b without division or multiplication useful for
1516  * normalizing numbers in a Montgomery system.
1517  */
1518 void fp_montgomery_calc_normalization(fp_int *a, fp_int *b)
1519 {
1520   int     x, bits;
1521
1522   /* how many bits of last digit does b use */
1523   bits = fp_count_bits (b) % DIGIT_BIT;
1524   if (!bits) bits = DIGIT_BIT;
1525
1526   /* compute A = B^(n-1) * 2^(bits-1) */
1527   if (b->used > 1) {
1528      fp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1);
1529   } else {
1530      fp_set(a, 1);
1531      bits = 1;
1532   }
1533
1534   /* now compute C = A * B mod b */
1535   for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
1536     fp_mul_2 (a, a);
1537     if (fp_cmp_mag (a, b) != FP_LT) {
1538       s_fp_sub (a, b, a);
1539     }
1540   }
1541 }
1542
1543
1544 #ifdef TFM_SMALL_MONT_SET
1545     #include "fp_mont_small.i"
1546 #endif
1547
1548 /* computes x/R == x (mod N) via Montgomery Reduction */
1549 void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
1550 {
1551    fp_digit c[FP_SIZE], *_c, *tmpm, mu;
1552    int      oldused, x, y, pa;
1553
1554    /* bail if too large */
1555    if (m->used > (FP_SIZE/2)) {
1556       (void)mu;                     /* shut up compiler */
1557       return;
1558    }
1559
1560 #ifdef TFM_SMALL_MONT_SET
1561    if (m->used <= 16) {
1562       fp_montgomery_reduce_small(a, m, mp);
1563       return;
1564    }
1565 #endif
1566
1567
1568 #if defined(USE_MEMSET)
1569    /* now zero the buff */
1570    XMEMSET(c, 0, sizeof c);
1571 #endif
1572    pa = m->used;
1573
1574    /* copy the input */
1575    oldused = a->used;
1576    for (x = 0; x < oldused; x++) {
1577        c[x] = a->dp[x];
1578    }
1579 #if !defined(USE_MEMSET)
1580    for (; x < 2*pa+1; x++) {
1581        c[x] = 0;
1582    }
1583 #endif
1584    MONT_START;
1585
1586    for (x = 0; x < pa; x++) {
1587        fp_digit cy = 0;
1588        /* get Mu for this round */
1589        LOOP_START;
1590        _c   = c + x;
1591        tmpm = m->dp;
1592        y = 0;
1593        #if (defined(TFM_SSE2) || defined(TFM_X86_64))
1594         for (; y < (pa & ~7); y += 8) {
1595               INNERMUL8;
1596               _c   += 8;
1597               tmpm += 8;
1598            }
1599        #endif
1600
1601        for (; y < pa; y++) {
1602           INNERMUL;
1603           ++_c;
1604        }
1605        LOOP_END;
1606        while (cy) {
1607            PROPCARRY;
1608            ++_c;
1609        }
1610   }
1611
1612   /* now copy out */
1613   _c   = c + pa;
1614   tmpm = a->dp;
1615   for (x = 0; x < pa+1; x++) {
1616      *tmpm++ = *_c++;
1617   }
1618
1619   for (; x < oldused; x++)   {
1620      *tmpm++ = 0;
1621   }
1622
1623   MONT_FINI;
1624
1625   a->used = pa+1;
1626   fp_clamp(a);
1627
1628   /* if A >= m then A = A - m */
1629   if (fp_cmp_mag (a, m) != FP_LT) {
1630     s_fp_sub (a, m, a);
1631   }
1632 }
1633
1634 void fp_read_unsigned_bin(fp_int *a, unsigned char *b, int c)
1635 {
1636   /* zero the int */
1637   fp_zero (a);
1638
1639   /* If we know the endianness of this architecture, and we're using
1640      32-bit fp_digits, we can optimize this */
1641 #if (defined(ENDIAN_LITTLE) || defined(ENDIAN_BIG)) && !defined(FP_64BIT)
1642   /* But not for both simultaneously */
1643 #if defined(ENDIAN_LITTLE) && defined(ENDIAN_BIG)
1644 #error Both ENDIAN_LITTLE and ENDIAN_BIG defined.
1645 #endif
1646   {
1647      unsigned char *pd = (unsigned char *)a->dp;
1648
1649      if ((unsigned)c > (FP_SIZE * sizeof(fp_digit))) {
1650         int excess = c - (FP_SIZE * sizeof(fp_digit));
1651         c -= excess;
1652         b += excess;
1653      }
1654      a->used = (c + sizeof(fp_digit) - 1)/sizeof(fp_digit);
1655      /* read the bytes in */
1656 #ifdef ENDIAN_BIG
1657      {
1658        /* Use Duff's device to unroll the loop. */
1659        int idx = (c - 1) & ~3;
1660        switch (c % 4) {
1661        case 0:  do { pd[idx+0] = *b++;
1662        case 3:       pd[idx+1] = *b++;
1663        case 2:       pd[idx+2] = *b++;
1664        case 1:       pd[idx+3] = *b++;
1665                      idx -= 4;
1666                         } while ((c -= 4) > 0);
1667        }
1668      }
1669 #else
1670      for (c -= 1; c >= 0; c -= 1) {
1671        pd[c] = *b++;
1672      }
1673 #endif
1674   }
1675 #else
1676   /* read the bytes in */
1677   for (; c > 0; c--) {
1678      fp_mul_2d (a, 8, a);
1679      a->dp[0] |= *b++;
1680      a->used += 1;
1681   }
1682 #endif
1683   fp_clamp (a);
1684 }
1685
1686 void fp_to_unsigned_bin(fp_int *a, unsigned char *b)
1687 {
1688   int     x;
1689   fp_int  t;
1690
1691   fp_init_copy(&t, a);
1692
1693   x = 0;
1694   while (fp_iszero (&t) == FP_NO) {
1695       b[x++] = (unsigned char) (t.dp[0] & 255);
1696       fp_div_2d (&t, 8, &t, NULL);
1697   }
1698   fp_reverse (b, x);
1699 }
1700
1701 int fp_unsigned_bin_size(fp_int *a)
1702 {
1703   int     size = fp_count_bits (a);
1704   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
1705 }
1706
1707 void fp_set(fp_int *a, fp_digit b)
1708 {
1709    fp_zero(a);
1710    a->dp[0] = b;
1711    a->used  = a->dp[0] ? 1 : 0;
1712 }
1713
1714 int fp_count_bits (fp_int * a)
1715 {
1716   int     r;
1717   fp_digit q;
1718
1719   /* shortcut */
1720   if (a->used == 0) {
1721     return 0;
1722   }
1723
1724   /* get number of digits and add that */
1725   r = (a->used - 1) * DIGIT_BIT;
1726
1727   /* take the last digit and count the bits in it */
1728   q = a->dp[a->used - 1];
1729   while (q > ((fp_digit) 0)) {
1730     ++r;
1731     q >>= ((fp_digit) 1);
1732   }
1733   return r;
1734 }
1735
1736 void fp_lshd(fp_int *a, int x)
1737 {
1738    int y;
1739
1740    /* move up and truncate as required */
1741    y = MIN(a->used + x - 1, (int)(FP_SIZE-1));
1742
1743    /* store new size */
1744    a->used = y + 1;
1745
1746    /* move digits */
1747    for (; y >= x; y--) {
1748        a->dp[y] = a->dp[y-x];
1749    }
1750
1751    /* zero lower digits */
1752    for (; y >= 0; y--) {
1753        a->dp[y] = 0;
1754    }
1755
1756    /* clamp digits */
1757    fp_clamp(a);
1758 }
1759
1760 void fp_rshd(fp_int *a, int x)
1761 {
1762   int y;
1763
1764   /* too many digits just zero and return */
1765   if (x >= a->used) {
1766      fp_zero(a);
1767      return;
1768   }
1769
1770    /* shift */
1771    for (y = 0; y < a->used - x; y++) {
1772       a->dp[y] = a->dp[y+x];
1773    }
1774
1775    /* zero rest */
1776    for (; y < a->used; y++) {
1777       a->dp[y] = 0;
1778    }
1779
1780    /* decrement count */
1781    a->used -= x;
1782    fp_clamp(a);
1783 }
1784
1785 /* reverse an array, used for radix code */
1786 void fp_reverse (unsigned char *s, int len)
1787 {
1788   int     ix, iy;
1789   unsigned char t;
1790
1791   ix = 0;
1792   iy = len - 1;
1793   while (ix < iy) {
1794     t     = s[ix];
1795     s[ix] = s[iy];
1796     s[iy] = t;
1797     ++ix;
1798     --iy;
1799   }
1800 }
1801
1802
1803 /* c = a - b */
1804 void fp_sub_d(fp_int *a, fp_digit b, fp_int *c)
1805 {
1806    fp_int tmp;
1807    fp_set(&tmp, b);
1808    fp_sub(a, &tmp, c);
1809 }
1810
1811
1812 /* CyaSSL callers from normal lib */
1813
1814 /* init a new mp_int */
1815 int mp_init (mp_int * a)
1816 {
1817   if (a)
1818     fp_init(a);
1819   return MP_OKAY;
1820 }
1821
1822 /* clear one (frees)  */
1823 void mp_clear (mp_int * a)
1824 {
1825   fp_zero(a);
1826 }
1827
1828 /* handle up to 6 inits */
1829 int mp_init_multi(mp_int* a, mp_int* b, mp_int* c, mp_int* d, mp_int* e, mp_int* f)
1830 {
1831     if (a)
1832         fp_init(a);
1833     if (b)
1834         fp_init(b);
1835     if (c)
1836         fp_init(c);
1837     if (d)
1838         fp_init(d);
1839     if (e)
1840         fp_init(e);
1841     if (f)
1842         fp_init(f);
1843
1844     return MP_OKAY;
1845 }
1846
1847 /* high level addition (handles signs) */
1848 int mp_add (mp_int * a, mp_int * b, mp_int * c)
1849 {
1850   fp_add(a, b, c);
1851   return MP_OKAY;
1852 }
1853
1854 /* high level subtraction (handles signs) */
1855 int mp_sub (mp_int * a, mp_int * b, mp_int * c)
1856 {
1857   fp_sub(a, b, c);
1858   return MP_OKAY;
1859 }
1860
1861 /* high level multiplication (handles sign) */
1862 int mp_mul (mp_int * a, mp_int * b, mp_int * c)
1863 {
1864   fp_mul(a, b, c);
1865   return MP_OKAY;
1866 }
1867
1868 /* d = a * b (mod c) */
1869 int mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1870 {
1871   return fp_mulmod(a, b, c, d);
1872 }
1873
1874 /* c = a mod b, 0 <= c < b */
1875 int mp_mod (mp_int * a, mp_int * b, mp_int * c)
1876 {
1877   return fp_mod (a, b, c);
1878 }
1879
1880 /* hac 14.61, pp608 */
1881 int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
1882 {
1883   return fp_invmod(a, b, c);
1884 }
1885
1886 /* this is a shell function that calls either the normal or Montgomery
1887  * exptmod functions.  Originally the call to the montgomery code was
1888  * embedded in the normal function but that wasted alot of stack space
1889  * for nothing (since 99% of the time the Montgomery code would be called)
1890  */
1891 int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
1892 {
1893   return fp_exptmod(G, X, P, Y);
1894 }
1895
1896 /* compare two ints (signed)*/
1897 int mp_cmp (mp_int * a, mp_int * b)
1898 {
1899   return fp_cmp(a, b);
1900 }
1901
1902 /* compare a digit */
1903 int mp_cmp_d(mp_int * a, mp_digit b)
1904 {
1905   return fp_cmp_d(a, b);
1906 }
1907
1908 /* get the size for an unsigned equivalent */
1909 int mp_unsigned_bin_size (mp_int * a)
1910 {
1911   return fp_unsigned_bin_size(a);
1912 }
1913
1914 /* store in unsigned [big endian] format */
1915 int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
1916 {
1917   fp_to_unsigned_bin(a,b);
1918   return MP_OKAY;
1919 }
1920
1921 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
1922 int mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
1923 {
1924   fp_read_unsigned_bin(a, (unsigned char *)b, c);
1925   return MP_OKAY;
1926 }
1927
1928
1929 int mp_sub_d(fp_int *a, fp_digit b, fp_int *c)
1930 {
1931     fp_sub_d(a, b, c);
1932     return MP_OKAY;
1933 }
1934
1935
1936 /* fast math conversion */
1937 int mp_copy(fp_int* a, fp_int* b)
1938 {
1939     fp_copy(a, b);
1940     return MP_OKAY;
1941 }
1942
1943
1944 /* fast math conversion */
1945 int mp_isodd(mp_int* a)
1946 {
1947     return fp_isodd(a);
1948 }
1949
1950
1951 /* fast math conversion */
1952 int mp_iszero(mp_int* a)
1953 {
1954     return fp_iszero(a);
1955 }
1956
1957
1958 /* fast math conversion */
1959 int mp_count_bits (mp_int* a)
1960 {
1961     return fp_count_bits(a);
1962 }
1963
1964
1965 /* fast math wrappers */
1966 int mp_set_int(fp_int *a, fp_digit b)
1967 {
1968     fp_set(a, b);
1969     return MP_OKAY;
1970 }
1971
1972
1973 #if defined(CYASSL_KEY_GEN) || defined (HAVE_ECC)
1974
1975 /* c = a * a (mod b) */
1976 int fp_sqrmod(fp_int *a, fp_int *b, fp_int *c)
1977 {
1978   fp_int tmp;
1979   fp_zero(&tmp);
1980   fp_sqr(a, &tmp);
1981   return fp_mod(&tmp, b, c);
1982 }
1983
1984 /* fast math conversion */
1985 int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c)
1986 {
1987     return fp_sqrmod(a, b, c);
1988 }
1989
1990 /* fast math conversion */
1991 int mp_montgomery_calc_normalization(mp_int *a, mp_int *b)
1992 {
1993     fp_montgomery_calc_normalization(a, b);
1994     return MP_OKAY;
1995 }
1996
1997 #endif /* CYASSL_KEYGEN || HAVE_ECC */
1998
1999
2000 #ifdef CYASSL_KEY_GEN
2001
2002 void fp_gcd(fp_int *a, fp_int *b, fp_int *c);
2003 void fp_lcm(fp_int *a, fp_int *b, fp_int *c);
2004 int  fp_isprime(fp_int *a);
2005 int  fp_cnt_lsb(fp_int *a);
2006
2007 int mp_gcd(fp_int *a, fp_int *b, fp_int *c)
2008 {
2009     fp_gcd(a, b, c);
2010     return MP_OKAY;
2011 }
2012
2013
2014 int mp_lcm(fp_int *a, fp_int *b, fp_int *c)
2015 {
2016     fp_lcm(a, b, c);
2017     return MP_OKAY;
2018 }
2019
2020
2021 int mp_prime_is_prime(mp_int* a, int t, int* result)
2022 {
2023     (void)t;
2024     *result = fp_isprime(a);
2025     return MP_OKAY;
2026 }
2027
2028
2029
2030 static int s_is_power_of_two(fp_digit b, int *p)
2031 {
2032    int x;
2033
2034    /* fast return if no power of two */
2035    if ((b==0) || (b & (b-1))) {
2036       return 0;
2037    }
2038
2039    for (x = 0; x < DIGIT_BIT; x++) {
2040       if (b == (((fp_digit)1)<<x)) {
2041          *p = x;
2042          return 1;
2043       }
2044    }
2045    return 0;
2046 }
2047
2048 /* a/b => cb + d == a */
2049 static int fp_div_d(fp_int *a, fp_digit b, fp_int *c, fp_digit *d)
2050 {
2051   fp_int   q;
2052   fp_word  w;
2053   fp_digit t;
2054   int      ix;
2055
2056   /* cannot divide by zero */
2057   if (b == 0) {
2058      return FP_VAL;
2059   }
2060
2061   /* quick outs */
2062   if (b == 1 || fp_iszero(a) == 1) {
2063      if (d != NULL) {
2064         *d = 0;
2065      }
2066      if (c != NULL) {
2067         fp_copy(a, c);
2068      }
2069      return FP_OKAY;
2070   }
2071
2072   /* power of two ? */
2073   if (s_is_power_of_two(b, &ix) == 1) {
2074      if (d != NULL) {
2075         *d = a->dp[0] & ((((fp_digit)1)<<ix) - 1);
2076      }
2077      if (c != NULL) {
2078         fp_div_2d(a, ix, c, NULL);
2079      }
2080      return FP_OKAY;
2081   }
2082
2083   /* no easy answer [c'est la vie].  Just division */
2084   fp_init(&q);
2085
2086   q.used = a->used;
2087   q.sign = a->sign;
2088   w = 0;
2089   for (ix = a->used - 1; ix >= 0; ix--) {
2090      w = (w << ((fp_word)DIGIT_BIT)) | ((fp_word)a->dp[ix]);
2091
2092      if (w >= b) {
2093         t = (fp_digit)(w / b);
2094         w -= ((fp_word)t) * ((fp_word)b);
2095       } else {
2096         t = 0;
2097       }
2098       q.dp[ix] = (fp_digit)t;
2099   }
2100
2101   if (d != NULL) {
2102      *d = (fp_digit)w;
2103   }
2104
2105   if (c != NULL) {
2106      fp_clamp(&q);
2107      fp_copy(&q, c);
2108   }
2109
2110   return FP_OKAY;
2111 }
2112
2113
2114 /* c = a mod b, 0 <= c < b  */
2115 static int fp_mod_d(fp_int *a, fp_digit b, fp_digit *c)
2116 {
2117    return fp_div_d(a, b, NULL, c);
2118 }
2119
2120
2121 /* Miller-Rabin test of "a" to the base of "b" as described in
2122  * HAC pp. 139 Algorithm 4.24
2123  *
2124  * Sets result to 0 if definitely composite or 1 if probably prime.
2125  * Randomly the chance of error is no more than 1/4 and often
2126  * very much lower.
2127  */
2128 static void fp_prime_miller_rabin (fp_int * a, fp_int * b, int *result)
2129 {
2130   fp_int  n1, y, r;
2131   int     s, j;
2132
2133   /* default */
2134   *result = FP_NO;
2135
2136   /* ensure b > 1 */
2137   if (fp_cmp_d(b, 1) != FP_GT) {
2138      return;
2139   }
2140
2141   /* get n1 = a - 1 */
2142   fp_init_copy(&n1, a);
2143   fp_sub_d(&n1, 1, &n1);
2144
2145   /* set 2**s * r = n1 */
2146   fp_init_copy(&r, &n1);
2147
2148   /* count the number of least significant bits
2149    * which are zero
2150    */
2151   s = fp_cnt_lsb(&r);
2152
2153   /* now divide n - 1 by 2**s */
2154   fp_div_2d (&r, s, &r, NULL);
2155
2156   /* compute y = b**r mod a */
2157   fp_init(&y);
2158   fp_exptmod(b, &r, a, &y);
2159
2160   /* if y != 1 and y != n1 do */
2161   if (fp_cmp_d (&y, 1) != FP_EQ && fp_cmp (&y, &n1) != FP_EQ) {
2162     j = 1;
2163     /* while j <= s-1 and y != n1 */
2164     while ((j <= (s - 1)) && fp_cmp (&y, &n1) != FP_EQ) {
2165       fp_sqrmod (&y, a, &y);
2166
2167       /* if y == 1 then composite */
2168       if (fp_cmp_d (&y, 1) == FP_EQ) {
2169          return;
2170       }
2171       ++j;
2172     }
2173
2174     /* if y != n1 then composite */
2175     if (fp_cmp (&y, &n1) != FP_EQ) {
2176        return;
2177     }
2178   }
2179
2180   /* probably prime now */
2181   *result = FP_YES;
2182 }
2183
2184
2185 /* a few primes */
2186 static const fp_digit primes[256] = {
2187   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
2188   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
2189   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
2190   0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
2191   0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
2192   0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
2193   0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
2194   0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
2195
2196   0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
2197   0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
2198   0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
2199   0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
2200   0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
2201   0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
2202   0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
2203   0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
2204
2205   0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
2206   0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
2207   0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
2208   0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
2209   0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
2210   0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
2211   0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
2212   0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
2213
2214   0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
2215   0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
2216   0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
2217   0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
2218   0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
2219   0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
2220   0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
2221   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
2222 };
2223
2224 int fp_isprime(fp_int *a)
2225 {
2226    fp_int   b;
2227    fp_digit d = 0;
2228    int      r, res;
2229
2230    /* do trial division */
2231    for (r = 0; r < 256; r++) {
2232        fp_mod_d(a, primes[r], &d);
2233        if (d == 0) {
2234           return FP_NO;
2235        }
2236    }
2237
2238    /* now do 8 miller rabins */
2239    fp_init(&b);
2240    for (r = 0; r < 8; r++) {
2241        fp_set(&b, primes[r]);
2242        fp_prime_miller_rabin(a, &b, &res);
2243        if (res == FP_NO) {
2244           return FP_NO;
2245        }
2246    }
2247    return FP_YES;
2248 }
2249
2250
2251 /* c = [a, b] */
2252 void fp_lcm(fp_int *a, fp_int *b, fp_int *c)
2253 {
2254    fp_int  t1, t2;
2255
2256    fp_init(&t1);
2257    fp_init(&t2);
2258    fp_gcd(a, b, &t1);
2259    if (fp_cmp_mag(a, b) == FP_GT) {
2260       fp_div(a, &t1, &t2, NULL);
2261       fp_mul(b, &t2, c);
2262    } else {
2263       fp_div(b, &t1, &t2, NULL);
2264       fp_mul(a, &t2, c);
2265    }
2266 }
2267
2268
2269 static const int lnz[16] = {
2270    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
2271 };
2272
2273 /* Counts the number of lsbs which are zero before the first zero bit */
2274 int fp_cnt_lsb(fp_int *a)
2275 {
2276    int x;
2277    fp_digit q, qq;
2278
2279    /* easy out */
2280    if (fp_iszero(a) == 1) {
2281       return 0;
2282    }
2283
2284    /* scan lower digits until non-zero */
2285    for (x = 0; x < a->used && a->dp[x] == 0; x++);
2286    q = a->dp[x];
2287    x *= DIGIT_BIT;
2288
2289    /* now scan this digit until a 1 is found */
2290    if ((q & 1) == 0) {
2291       do {
2292          qq  = q & 15;
2293          x  += lnz[qq];
2294          q >>= 4;
2295       } while (qq == 0);
2296    }
2297    return x;
2298 }
2299
2300
2301 /* c = (a, b) */
2302 void fp_gcd(fp_int *a, fp_int *b, fp_int *c)
2303 {
2304    fp_int u, v, r;
2305
2306    /* either zero than gcd is the largest */
2307    if (fp_iszero (a) == 1 && fp_iszero (b) == 0) {
2308      fp_abs (b, c);
2309      return;
2310    }
2311    if (fp_iszero (a) == 0 && fp_iszero (b) == 1) {
2312      fp_abs (a, c);
2313      return;
2314    }
2315
2316    /* optimized.  At this point if a == 0 then
2317     * b must equal zero too
2318     */
2319    if (fp_iszero (a) == 1) {
2320      fp_zero(c);
2321      return;
2322    }
2323
2324    /* sort inputs */
2325    if (fp_cmp_mag(a, b) != FP_LT) {
2326       fp_init_copy(&u, a);
2327       fp_init_copy(&v, b);
2328    } else {
2329       fp_init_copy(&u, b);
2330       fp_init_copy(&v, a);
2331    }
2332
2333    fp_zero(&r);
2334    while (fp_iszero(&v) == FP_NO) {
2335       fp_mod(&u, &v, &r);
2336       fp_copy(&v, &u);
2337       fp_copy(&r, &v);
2338    }
2339    fp_copy(&u, c);
2340 }
2341
2342 #endif /* CYASSL_KEY_GEN */
2343
2344
2345 #if defined(HAVE_ECC) || !defined(NO_PWDBASED)
2346 /* c = a + b */
2347 void fp_add_d(fp_int *a, fp_digit b, fp_int *c)
2348 {
2349    fp_int tmp;
2350    fp_set(&tmp, b);
2351    fp_add(a,&tmp,c);
2352 }
2353
2354 /* external compatibility */
2355 int mp_add_d(fp_int *a, fp_digit b, fp_int *c)
2356 {
2357     fp_add_d(a, b, c);
2358     return MP_OKAY;
2359 }
2360
2361 #endif  /* HAVE_ECC || !NO_PWDBASED */
2362
2363
2364 #ifdef HAVE_ECC
2365
2366 /* chars used in radix conversions */
2367 const char *fp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
2368
2369 static int fp_read_radix(fp_int *a, const char *str, int radix)
2370 {
2371   int     y, neg;
2372   char    ch;
2373
2374   /* make sure the radix is ok */
2375   if (radix < 2 || radix > 64) {
2376     return FP_VAL;
2377   }
2378
2379   /* if the leading digit is a
2380    * minus set the sign to negative.
2381    */
2382   if (*str == '-') {
2383     ++str;
2384     neg = FP_NEG;
2385   } else {
2386     neg = FP_ZPOS;
2387   }
2388
2389   /* set the integer to the default of zero */
2390   fp_zero (a);
2391
2392   /* process each digit of the string */
2393   while (*str) {
2394     /* if the radix < 36 the conversion is case insensitive
2395      * this allows numbers like 1AB and 1ab to represent the same  value
2396      * [e.g. in hex]
2397      */
2398     ch = (char) ((radix < 36) ? XTOUPPER(*str) : *str);
2399     for (y = 0; y < 64; y++) {
2400       if (ch == fp_s_rmap[y]) {
2401          break;
2402       }
2403     }
2404
2405     /* if the char was found in the map
2406      * and is less than the given radix add it
2407      * to the number, otherwise exit the loop.
2408      */
2409     if (y < radix) {
2410       fp_mul_d (a, (fp_digit) radix, a);
2411       fp_add_d (a, (fp_digit) y, a);
2412     } else {
2413       break;
2414     }
2415     ++str;
2416   }
2417
2418   /* set the sign only if a != 0 */
2419   if (fp_iszero(a) != FP_YES) {
2420      a->sign = neg;
2421   }
2422   return FP_OKAY;
2423 }
2424
2425 /* fast math conversion */
2426 int mp_read_radix(mp_int *a, const char *str, int radix)
2427 {
2428     return fp_read_radix(a, str, radix);
2429 }
2430
2431 /* fast math conversion */
2432 int mp_set(fp_int *a, fp_digit b)
2433 {
2434     fp_set(a,b);
2435     return MP_OKAY;
2436 }
2437
2438 /* fast math conversion */
2439 int mp_sqr(fp_int *A, fp_int *B)
2440 {
2441     fp_sqr(A, B);
2442     return MP_OKAY;
2443 }
2444
2445 /* fast math conversion */
2446 int mp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
2447 {
2448     fp_montgomery_reduce(a, m, mp);
2449     return MP_OKAY;
2450 }
2451
2452
2453 /* fast math conversion */
2454 int mp_montgomery_setup(fp_int *a, fp_digit *rho)
2455 {
2456     return fp_montgomery_setup(a, rho);
2457 }
2458
2459 int mp_div_2(fp_int * a, fp_int * b)
2460 {
2461     fp_div_2(a, b);
2462     return MP_OKAY;
2463 }
2464
2465
2466 int mp_init_copy(fp_int * a, fp_int * b)
2467 {
2468     fp_init_copy(a, b);
2469     return MP_OKAY;
2470 }
2471
2472
2473
2474 #endif /* HAVE_ECC */
2475
2476 #endif /* USE_FAST_MATH */