📄 bni64.c
字号:
/* Decrement qhat and adjust comparison parameters */
qhat--;
if ((r += dh) >= dh) {
t2high -= (t2low < dl);
t2low -= dl;
if (t2high > r || (t2high == r && t2low > nl))
qhat--;
}
}
#endif
/* Do the multiply and subtract */
r = bniMulSub1_64(n, d, dlen, qhat);
/* If there was a borrow, add back once. */
if (r > nh) { /* Borrow? */
(void)bniAddN_64(n, d, dlen);
qhat--;
}
/* Remember the first quotient digit. */
qhigh = qhat;
/* Now, the main division loop: */
divloop:
while (qlen--) {
/* Advance n */
nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
BIGLITTLE(++n,--n);
nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
if (nh == dh) {
qhat = ~(BNWORD64)0;
/* Optimized computation of r = (nh,nm) - qhat * dh */
r = nh + nm;
if (r < nh)
goto subtract;
} else {
pgpAssert(nh < dh);
r = bniDiv21_64(&qhat, nh, nm, dh);
}
nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
#ifdef mul64_ppmm
mul64_ppmm(nm, t64, qhat, dl);
if (nm > r || (nm == r && t64 > nl)) {
/* Decrement qhat and adjust comparison parameters */
qhat--;
if ((r += dh) >= dh) {
nm -= (t64 < dl);
t64 -= dl;
if (nm > r || (nm == r && t64 > nl))
qhat--;
}
}
#elif defined(BNWORD128)
t128 = (BNWORD128)qhat * dl;
if (t128 > ((BNWORD128)r<<64) + nl) {
/* Decrement qhat and adjust comparison parameters */
qhat--;
if ((r += dh) >= dh) {
t128 -= dl;
if (t128 > ((BNWORD128)r << 64) + nl)
qhat--;
}
}
#else /* Use bniMulN1_64 */
bniMulN1_64(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
if (t2high > r || (t2high == r && t2low > nl)) {
/* Decrement qhat and adjust comparison parameters */
qhat--;
if ((r += dh) >= dh) {
t2high -= (t2low < dl);
t2low -= dl;
if (t2high > r || (t2high == r && t2low > nl))
qhat--;
}
}
#endif
/*
* As a point of interest, note that it is not worth checking
* for qhat of 0 or 1 and installing special-case code. These
* occur with probability 2^-64, so spending 1 cycle to check
* for them is only worth it if we save more than 2^15 cycles,
* and a multiply-and-subtract for numbers in the 1024-bit
* range just doesn't take that long.
*/
subtract:
/*
* n points to the least significant end of the substring
* of n to be subtracted from. qhat is either exact or
* one too large. If the subtract gets a borrow, it was
* one too large and the divisor is added back in. It's
* a dlen+1 word add which is guaranteed to produce a
* carry out, so it can be done very simply.
*/
r = bniMulSub1_64(n, d, dlen, qhat);
if (r > nh) { /* Borrow? */
(void)bniAddN_64(n, d, dlen);
qhat--;
}
/* Store the quotient digit */
BIGLITTLE(*q++,*--q) = qhat;
}
/* Tah dah! */
if (shift) {
bniRshift_64(d, dlen, shift);
bniRshift_64(n, dlen, shift);
}
return qhigh;
}
#endif
/*
* Find the negative multiplicative inverse of x (x must be odd!) modulo 2^64.
*
* This just performs Newton's iteration until it gets the
* inverse. The initial estimate is always correct to 3 bits, and
* sometimes 4. The number of valid bits doubles each iteration.
* (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
* for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
* the iteration through.)
*/
#ifndef bniMontInv1_64
BNWORD64
bniMontInv1_64(BNWORD64 const x)
{
BNWORD64 y = x, z;
pgpAssert(x & 1);
while ((z = x*y) != 1)
y *= 2 - z;
return -y;
}
#endif /* !bniMontInv1_64 */
#if defined(BNWORD128) && PRODUCT_SCAN
/*
* Test code for product-scanning Montgomery reduction.
* This seems to slow the C code down rather than speed it up.
*
* The first loop computes the Montgomery multipliers, storing them over
* the low half of the number n.
*
* The second half multiplies the upper half, adding in the modulus
* times the Montgomery multipliers. The results of this multiply
* are stored.
*/
void
bniMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned mlen, BNWORD64 inv)
{
BNWORD128 x, y;
BNWORD64 const *pm;
BNWORD64 *pn;
BNWORD64 t;
unsigned carry;
unsigned i, j;
/* Special case of zero */
if (!mlen)
return;
/* Pass 1 - compute Montgomery multipliers */
/* First iteration can have certain simplifications. */
t = BIGLITTLE(n[-1],n[0]);
x = t;
t *= inv;
BIGLITTLE(n[-1], n[0]) = t;
x += (BNWORD128)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
pgpAssert((BNWORD64)x == 0);
x = x >> 64;
for (i = 1; i < mlen; i++) {
carry = 0;
pn = n;
pm = BIGLITTLE(mod-i-1,mod+i+1);
for (j = 0; j < i; j++) {
y = (BNWORD128)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
x += y;
carry += (x < y);
}
pgpAssert(BIGLITTLE(pn == n-i, pn == n+i));
y = t = BIGLITTLE(pn[-1], pn[0]);
x += y;
carry += (x < y);
BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD64)x;
pgpAssert(BIGLITTLE(pm == mod-1, pm == mod+1));
y = (BNWORD128)t * BIGLITTLE(pm[0],pm[-1]);
x += y;
carry += (x < y);
pgpAssert((BNWORD64)x == 0);
x = x >> 64 | (BNWORD128)carry << 64;
}
BIGLITTLE(n -= mlen, n += mlen);
/* Pass 2 - compute upper words and add to n */
for (i = 1; i < mlen; i++) {
carry = 0;
pm = BIGLITTLE(mod-i,mod+i);
pn = n;
for (j = i; j < mlen; j++) {
y = (BNWORD128)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
x += y;
carry += (x < y);
}
pgpAssert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
pgpAssert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
y = t = BIGLITTLE(*(n-i),*(n+i-1));
x += y;
carry += (x < y);
BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD64)x;
x = (x >> 64) | (BNWORD128)carry << 64;
}
/* Last round of second half, simplified. */
t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
x += t;
BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD64)x;
carry = (unsigned)(x >> 64);
while (carry)
carry -= bniSubN_64(n, mod, mlen);
while (bniCmp_64(n, mod, mlen) >= 0)
(void)bniSubN_64(n, mod, mlen);
}
#define bniMontReduce_64 bniMontReduce_64
#endif
/*
* Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
* 2^(64*mlen). Returns the result in the *top* mlen words of the argument n.
* This is ready for another multiplication using bniMul_64.
*
* Montgomery representation is a very useful way to encode numbers when
* you're doing lots of modular reduction. What you do is pick a multiplier
* R which is relatively prime to the modulus and very easy to divide by.
* Since the modulus is odd, R is closen as a power of 2, so the division
* is a shift. In fact, it's a shift of an integral number of words,
* so the shift can be implicit - just drop the low-order words.
*
* Now, choose R *larger* than the modulus m, 2^(64*mlen). Then convert
* all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
* relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
* - The Montgomery form of a number depends on the modulus m.
* A fixed modulus m is assumed throughout this discussion.
* - Since R is relaitvely prime to m, multiplication by R is invertible;
* no information about the numbers is lost, they're just scrambled.
* - Adding (and subtracting) numbers in this form works just as usual.
* M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
* - Multiplying numbers in this form produces a*b*R*R. The problem
* is to divide out the excess factor of R, modulo m as well as to
* reduce to the given length mlen. It turns out that this can be
* done *faster* than a normal divide, which is where the speedup
* in Montgomery division comes from.
*
* Normal reduction chooses a most-significant quotient digit q and then
* subtracts q*m from the number to be reduced. Choosing q is tricky
* and involved (just look at bniDiv_64 to see!) and is usually
* imperfect, requiring a check for correction after the subtraction.
*
* Montgomery reduction *adds* a multiple of m to the *low-order* part
* of the number to be reduced. This multiple is chosen to make the
* low-order part of the number come out to zero. This can be done
* with no trickery or error using a precomputed inverse of the modulus.
* In this code, the "part" is one word, but any width can be used.
*
* Repeating this step sufficiently often results in a value which
* is a multiple of R (a power of two, remember) but is still (since
* the additions were to the low-order part and thus did not increase
* the value of the number being reduced very much) still not much
* larger than m*R. Then implicitly divide by R and subtract off
* m until the result is in the correct range.
*
* Since the low-order part being cancelled is less than R, the
* multiple of m added must have a multiplier which is at most R-1.
* Assuming that the input is at most m*R-1, the final number is
* at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
* the high-order part, equivalent to subtracting m*R from the
* while number, produces a result which is at most m*R - m - 1,
* which divided by R is at most m-1.
*
* To convert *to* Montgomery form, you need a regular remainder
* routine, although you can just compute R*R (mod m) and do the
* conversion using Montgomery multiplication. To convert *from*
* Montgomery form, just Montgomery reduce the number to
* remove the extra factor of R.
*
* TODO: Change to a full inverse and use Karatsuba's multiplication
* rather than this word-at-a-time.
*/
#ifndef bniMontReduce_64
void
bniMontReduce_64(BNWORD64 *n, BNWORD64 const *mod, unsigned const mlen,
BNWORD64 inv)
{
BNWORD64 t;
BNWORD64 c = 0;
unsigned len = mlen;
/* inv must be the negative inverse of mod's least significant word */
pgpAssert((BNWORD64)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD64)-1);
pgpAssert(len);
do {
t = bniMulAdd1_64(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
c += bniAdd1_64(BIGLITTLE(n-mlen,n+mlen), len, t);
BIGLITTLE(--n,++n);
} while (--len);
/*
* All that adding can cause an overflow past the modulus size,
* but it's unusual, and never by much, so a subtraction loop
* is the right way to deal with it.
* This subtraction happens infrequently - I've only ever seen it
* invoked once per reduction, and then just under 22.5% of the time.
*/
while (c)
c -= bniSubN_64(n, mod, mlen);
while (bniCmp_64(n, mod, mlen) >= 0)
(void)bniSubN_64(n, mod, mlen);
}
#endif /* !bniMontReduce_64 */
/*
* A couple of helpers that you might want to implement atomically
* in asm sometime.
*/
#ifndef bniMontMul_64
/*
* Multiply "num1" by "num2", modulo "mod", all of length "len", and
* place the result in the high half of "prod". "inv" is the inverse
* of the least-significant word of the modulus, modulo 2^64.
* This uses numbers in Montgomery form. Reduce using "len" and "inv".
*
* This is implemented as a macro to win on compilers that don't do
* inlining, since it's so trivial.
*/
#define bniMontMul_64(prod, n1, n2, mod, len, inv) \
(bniMulX_64(prod, n1, n2, len), bniMontReduce_64(prod, mod, len, inv))
#endif /* !bniMontMul_64 */
#ifndef bniMontSquare_64
/*
* Square "num", modulo "mod", both of length "len", and place the result
* in the high half of "prod". "inv" is the inverse of the least-significant
* word of the modulus, modulo 2^64.
* This uses numbers in Montgomery form. Reduce using "len" and "inv".
*
* This is implemented as a macro to win on compilers that don't do
* inlining, since it's so trivial.
*/
#define bniMontSquare_64(prod, n, mod, len, inv) \
(bniSquare_64(prod, n, len), bniMontReduce_64(prod, mod, len, inv))
#endif /* !bniMontSquare_64 */
/*
* Convert a number to Montgomery form - requires mlen + nlen words
* of memory in "n".
*/
void
bniToMont_64(BNWORD64 *n, unsigned nlen, BNWORD64 *mod, unsigned mlen)
{
/* Move n up "mlen" words */
bniCopy_64(BIGLITTLE(n-mlen,n+mlen), n, nlen);
bniZero_64(n, mlen);
/* Do the division - dump the quotient in the high-order words */
(void)bniDiv_64(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
}
/*
* Convert from Montgomery form. Montgomery reduction is all that is
* needed.
*/
void
bniFromMont_64(BNWORD64 *n, BNWORD64 *mod, unsigned len)
{
/* Zero the high words of n */
bniZero_64(BIGLITTLE(n-len,n+len), len);
bniMontReduce_64(n, mod, len, bniMontInv1_64(mod[BIGLITTLE(-1,0)]));
/* Move n down len words */
bniCopy_64(n, BIGLITTLE(n-len,n+len), len);
}
/*
* The windowed exponentiation algorithm, precomputes a table of odd
* powers of n up to 2^k. See the comment in bnExpMod_64 below for
* an explanation of how it actually works works.
*
* It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
* multiplies (on average) to perform the exponentiation. To minimize
* the sum, k must vary with e. The optimal window sizes vary with the
* exponent length. Here are some selected values and the boundary cases.
* (An underscore _ has been inserted into some of the numbers to ensure
* that magic strings like 64 do not appear in this table. It should be
* ignored.)
*
* At e = 1 bits, k=1 (0.000000) is best
* At e = 2 bits, k=1 (0.500000) is best
* At e = 4 bits, k=1 (1.500000) is best
* At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
* At e = 1_6 bits, k=2 (6.000000) is best
* At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
* At e = 3_2 bits, k=3 (10.750000) is best
* At e = 6_4 bits, k=3 (18.750000) is best
* At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
* At e = 128 bits, k=4 (3_2.400000) is best
* At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
* At e = 256 bits, k=5 (57.500000) is best
* At e = 512 bits, k=5 (100.1_66667) is best
* At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
* At e = 1024 bits, k=6 (177.142857) is best
* At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
* At e = 2048 bits, k=7 (318.875000) is best
* At e = 4096 bits, k=7 (574.875000) is best
*
* The numbers in parentheses are the expected number of multiplications
* needed to do the computation. The normal russian-peasant modular
* exponentiation technique always uses (e-1)/2. For exponents as
* small as 192 bits (
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -