📄 lbn16.c
字号:
}#endif/* * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^16. * * This just performs Newton's iteration until it gets the * inverse. The initial estimate is always correct to 3 bits, and * sometimes 4. The number of valid bits doubles each iteration. * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow * the iteration through.) */#ifndef lbnMontInv1_16BNWORD16lbnMontInv1_16(BNWORD16 const x){ BNWORD16 y = x, z; assert(x & 1); while ((z = x*y) != 1) y *= 2 - z; return -y;}#endif /* !lbnMontInv1_16 */#if defined(BNWORD32) && PRODUCT_SCAN/* * Test code for product-scanning Montgomery reduction. * This seems to slow the C code down rather than speed it up. * * The first loop computes the Montgomery multipliers, storing them over * the low half of the number n. * * The second half multiplies the upper half, adding in the modulus * times the Montgomery multipliers. The results of this multiply * are stored. */voidlbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned mlen, BNWORD16 inv){ BNWORD32 x, y; BNWORD16 const *pm; BNWORD16 *pn; BNWORD16 t; unsigned carry; unsigned i, j; /* Special case of zero */ if (!mlen) return; /* Pass 1 - compute Montgomery multipliers */ /* First iteration can have certain simplifications. */ t = BIGLITTLE(n[-1],n[0]); x = t; t *= inv; BIGLITTLE(n[-1], n[0]) = t; x += (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */ assert((BNWORD16)x == 0); x = x >> 16; for (i = 1; i < mlen; i++) { carry = 0; pn = n; pm = BIGLITTLE(mod-i-1,mod+i+1); for (j = 0; j < i; j++) { y = (BNWORD32)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm); x += y; carry += (x < y); } assert(BIGLITTLE(pn == n-i, pn == n+i)); y = t = BIGLITTLE(pn[-1], pn[0]); x += y; carry += (x < y); BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD16)x; assert(BIGLITTLE(pm == mod-1, pm == mod+1)); y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]); x += y; carry += (x < y); assert((BNWORD16)x == 0); x = x >> 16 | (BNWORD32)carry << 16; } BIGLITTLE(n -= mlen, n += mlen); /* Pass 2 - compute upper words and add to n */ for (i = 1; i < mlen; i++) { carry = 0; pm = BIGLITTLE(mod-i,mod+i); pn = n; for (j = i; j < mlen; j++) { y = (BNWORD32)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn); x += y; carry += (x < y); } assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen)); assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i)); y = t = BIGLITTLE(*(n-i),*(n+i-1)); x += y; carry += (x < y); BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD16)x; x = (x >> 16) | (BNWORD32)carry << 16; } /* Last round of second half, simplified. */ t = BIGLITTLE(*(n-mlen),*(n+mlen-1)); x += t; BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD16)x; carry = (unsigned)(x >> 16); while (carry) carry -= lbnSubN_16(n, mod, mlen); while (lbnCmp_16(n, mod, mlen) >= 0) (void)lbnSubN_16(n, mod, mlen);}#define lbnMontReduce_16 lbnMontReduce_16#endif/* * Montgomery reduce n, modulo mod. This reduces modulo mod and divides * by 2^(16*mlen). I'll fill in an explanation of Montgomery representation * later. Returns the result in the *top* n words of the argument n. * This is ready for another multiplication using mulnn. * * TODO: Change to a full inverse and use Karatsuba's multiplication * rather than this word-at-a-time. */#ifndef lbnMontReduce_16voidlbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen, BNWORD16 inv){ BNWORD16 t; BNWORD16 c = 0; unsigned len = mlen; /* inv must be the negative inverse of mod's least significant word */ assert((BNWORD16)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD16)-1); assert(len); do { t = lbnMulAdd1_16(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0])); c += lbnAdd1_16(BIGLITTLE(n-mlen,n+mlen), len, t); BIGLITTLE(--n,++n); } while (--len); /* * All that adding can cause an overflow past the modulus size, * but it's unusual, and never by much, so a subtraction loop * is the right way to deal with it. * This subtraction happens infrequently - I've only ever seen it * invoked once per reduction, and then just under 22.5% of the time. */ while (c) c -= lbnSubN_16(n, mod, mlen); while (lbnCmp_16(n, mod, mlen) >= 0) (void)lbnSubN_16(n, mod, mlen);}#endif /* !lbnMontReduce_16 *//* * A couple of helpers that you might want to implement atomically * in asm sometime. */#ifndef lbnMontMul_16/* * Multiply "num1" by "num2", modulo "mod", all of length "len", and * place the result in the high half of "prod". "inv" is the inverse * of the least-significant word of the modulus, modulo 2^16. * This uses numbers in Montgomery form. Reduce using "len" and "inv". * * This is implemented as a macro to win on compilers that don't do * inlining, since it's so trivial. */#define lbnMontMul_16(prod, n1, n2, mod, len, inv) \ (lbnMulX_16(prod, n1, n2, len), lbnMontReduce_16(prod, mod, len, inv))#endif /* !lbnMontMul_16 */#ifndef lbnMontSquare_16/* * Square "num", modulo "mod", both of length "len", and place the result * in the high half of "prod". "inv" is the inverse of the least-significant * word of the modulus, modulo 2^16. * This uses numbers in Montgomery form. Reduce using "len" and "inv". * * This is implemented as a macro to win on compilers that don't do * inlining, since it's so trivial. */#define lbnMontSquare_16(prod, n, mod, len, inv) \ (lbnSquare_16(prod, n, len), lbnMontReduce_16(prod, mod, len, inv)) #endif /* !lbnMontSquare_16 *//* * Convert a number to Montgomery form - requires mlen + nlen + 1 words * of memory in "n". */voidlbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen){ /* Move n up "mlen" words */ lbnCopy_16(BIGLITTLE(n-mlen,n+mlen), n, nlen); lbnZero_16(n, mlen); /* Do the division - dump the quotient in the high-order words */ (void)lbnDiv_16(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);}/* * Convert from Montgomery form. Montgomery reduction is all that is * needed. */voidlbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len){ /* Zero the high words of n */ lbnZero_16(BIGLITTLE(n-len,n+len), len); lbnMontReduce_16(n, mod, len, lbnMontInv1_16(BIGLITTLE(mod[-1],mod[0]))); /* Move n down len words */ lbnCopy_16(n, BIGLITTLE(n-len,n+len), len);}/* * The windowed exponentiation algorithm, precomputes a table of odd * powers of n up to 2^k. It takes 2^(k-1)-1 multiplies to compute * the table, and (e-1)/(k+1) multiplies (on average) to perform the * exponentiation. To minimize the sum, k must vary with e. * The optimal window sizes vary with the exponent length. Here are * some selected values and the boundary cases. * (An underscore _ has been inserted into some of the numbers to ensure * that magic strings like 16 do not appear in this table. It should be * ignored.) * * At e = 1 bits, k= 1 (0.000000) is best. * At e = 2 bits, k= 1 (0.500000) is best. * At e = 4 bits, k= 1 (1.500000) is best. * At e = 8 bits, k= 2 (3.333333) < k = 1 (3.500000) * At e = 1_6 bits, k= 2 (6.000000) is best. * At e = 26 bits, k= 3 (9.250000) < k = 2 (9.333333) * At e = 3_2 bits, k= 3 (10.750000) is best. * At e = 6_4 bits, k= 3 (18.750000) is best. * At e = 82 bits, k= 4 (23.200000) < k = 3 (23.250000) * At e = 128 bits, k= 4 (3_2.400000) is best. * At e = 242 bits, k= 5 (55.1_66667) < k = 4 (55.200000) * At e = 256 bits, k= 5 (57.500000) is best. * At e = 512 bits, k= 5 (100.1_66667) is best. * At e = 674 bits, k= 6 (127.142857) < k = 5 (127.1_66667) * At e = 1024 bits, k= 6 (177.142857) is best. * At e = 1794 bits, k= 7 (287.125000) < k = 6 (287.142857) * At e = 2048 bits, k= 7 (318.875000) is best. * At e = 4096 bits, k= 7 (574.875000) is best. * * The numbers in parentheses are the expected number of multiplications * needed to do the computation. The normal russian-peasant modular * exponentiation technique always uses (e-1)/2. For exponents as * small as 192 bits (below the range of current factoring algorithms), * half of the multiplies are eliminated, 45.2 as opposed to the naive * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring * proper is just over half of multiplying, but the Montgomery * reduction in each case is also a multiply), that's 143.25 * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings. * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a * 24.3% savings. It asymptotically approaches 25%. * * Given that exponents for which k>7 are useful are uncommon, * a fixed size table for k <= 7 is used for simplicity. * k = 8 is uzeful at 4610 bits, k = 9 at 11522 bits. * * The basic number of squarings needed is e-1, although a k-bit * window (for k > 1) can save, on average, k-2 of those, too. * That savings currently isn't counted here. It would drive the * crossover points slightly lower. * (Actually, this win is also reduced in the DoubleExpMod case, * meaning we'd have to split the tables. Except for that, the * multiples by powers of the two bases are independent, so * the same logic applies to each as the single case.) * * Table entry i is the largest number of bits in an exponent to * process with a window size of i+1. So the window never goes above 7 * bits, requiring 2^(7-1) = 0x40 precomputed multiples. */#define BNEXPMOD_MAX_WINDOW 7static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = { 7, 25, 81, 241, 673, 1793, (unsigned)-1};/* * Perform modular exponentiation, as fast as possible! This uses * Montgomery reduction, optimized squaring, and windowed exponentiation. * The modulus "mod" MUST be odd! * * This returns 0 on success, -1 on out of memory. * * The window algorithm: * The idea is to keep a running product of b1 = n^(high-order bits of exp), * and then keep appending exponent bits to it. The following patterns * apply to a 3-bit window (k = 3): * To append 0: square * To append 1: square, multiply by n^1 * To append 10: square, multiply by n^1, square * To append 11: square, square, multiply by n^3 * To append 100: square, multiply by n^1, square, square * To append 101: square, square, square, multiply by n^5 * To append 110: square, square, multiply by n^3, square * To append 111: square, square, square, multiply by n^7 * * Since each pattern involves only one multiply, the longer the pattern * the better, except that a 0 (no multiplies) can be appended directly. * We precompute a table of odd powers of n, up to 2^k, and can then * multiply k bits of exponent at a time. Actually, assuming random * exponents, there is on average one zero bit between needs to * multiply (1/2 of the time there's none, 1/4 of the time there's 1, * 1/8 of the time, there's 2, 1/16 of the time, there's 3, etc.), so * you have to do one multiply per k+1 bits of exponent. * * The loop walks down the exponent, squaring the result buffer as * it goes. There is a wbits+1 bit lookahead buffer, buf, that is * filled with the upcoming exponent bits. (What is read after the * end of the exponent is unimportant, but it is filled with zero here.) * When the most-significant bit of this buffer becomes set, i.e. * (buf & tblmask) != 0, we have to decide what pattern to multiply * by, and when to do it. We decide, remember to do it in future * after a suitable number of squarings have passed (e.g. a pattern * of "100" in the buffer requires that we multiply by n^1 immediately; * a pattern of "110" calls for multiplying by n^3 after one more * squaring), clear the buffer, and continue. * * When we start, there is one more optimization: the result buffer * is implcitly one, so squaring it or multiplying by it can be * optimized away. Further, if we start with a pattern like "100" * in the lookahead window, rather than placing n into the buffer * and then starting to square it, we have already computed n^2 * to compute the odd-powers table, so we can place that into * the buffer and save a squaring. * * This means that if you have a k-bit window, to compute n^z, * where z is the high k bits of the exponent, 1/2 of the time * it requires no squarings. 1/4 of the time, it requires 1 * squaring, ... 1/2^(k-1) of the time, it reqires k-2 squarings. * And the remaining 1/2^(k-1) of the time, the top k bits are a * 1 followed by k-1 0 bits, so it again only requires k-2 * squarings, not k-1. The average of these is 1. Add that * to the one squaring we have to do to compute the table, * and you'll see that a k-bit window saves k-2 squarings * as well as reducing the multiplies. (It actually doesn't * hurt in the case k = 1, either.) * * n must have mlen words allocated. Although fewer may be in use * when n is passed in, all are in use on exit. */intlbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen, BNWORD16 const *e, unsigned elen, BNWORD16 *mod, unsigned mlen){ BNWORD16 *table[1 << (BNEXPMOD_MAX_WINDOW-1)]; /* Table of odd powers of n */ unsigned ebits; /* Exponent bits */ unsigned wbits; /* Window size */ unsigned tblmask; /* Mask of exponentiation window */ BNWORD16 bitpos; /* Mask of current look-ahead bit */ unsigned buf; /* Buffer of exponent bits */ unsigned multpos; /* Where to do pending multiply */ BNWORD16 const *mult; /* What to multiply by */ unsigned i; /* Loop counter */ int isone; /* Flag: accum. is implicitly one */ BNWORD16 *a, *b; /* Working buffers/accumulators */ BNWORD16 *t; /* Pointer into the working buffers */ BNWORD16 inv; /* mod^-1 modulo 2^16 */ assert(mlen); assert(nlen <= mlen); /* First, a couple of trivial cases. */ elen = lbnNorm_16(e, elen); if (!elen) { /* x ^ 0 == 1 */ lbnZero_16(result, mlen); BIGLITTLE(result[-1],result[0]) = 1; return 0; } ebits = lbnBits_16(e, elen); if (ebits == 1) { /* x ^ 1 == x */ if (n != result) lbnCopy_16(result, n, nlen); if (mlen > nlen) lbnZero_16(BIGLITTLE(result-nlen,result+nlen), mlen-nlen); return 0; } /* Okay, now move the exponent pointer to the most-significant word */ e = BIGLITTLE(e-elen, e+elen-1); /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */ wbits = 0; while (ebits > bnExpModThreshTable[wbits]) wbits++; /* Allocate working storage: two product buffers and the tables. */ LBNALLOC(a, 2*mlen); if (!a) return -1; LBNALLOC(b, 2*mlen); if (!b) { LBNFREE(a, 2*mlen); return -1; } /* Convert to the appropriate table size: tblmask = 1<<(k-1) */ tblmask = 1u << wbits; LBNALLOC(t, mlen); if (!t) { LBNFREE(b, 2*mlen); LBNFREE(a, 2*mlen); return -1; } /* We have the result buffer available, so use it. */ table[0] = result; /* * Okay, we now have a minimal-sized table - expand it. * This is allowed to fail! If so, scale back the table size * and proceed. */ for (i = 1; i < tblmask; i++) { LBNALLOC(t, mlen); if (!t) /* Out of memory! Quit the loop. */ break; table[i] = t; } /* If we stopped, with i < tblmask, shrink the tables appropriately */ while (tblmask > i) { wbits--; tblmask >>= 1; } /* Fre
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -