📄 lbn16.c
字号:
x = x >> 16 | (BNWORD32)carry << 16; } /* Pass 2 - compute reduced product and store */ for (i = 1; i < len; i++) { carry = 0; p1 = BIGLITTLE(num1-i,num1+i); p2 = BIGLITTLE(num2-len,num2+len); pm = BIGLITTLE(mod-i,mod+i); pp = BIGLITTLE(prod-len,prod+len); for (j = i; j < len; j++) { y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2); x += y; carry += (x < y); y = (BNWORD32)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp); x += y; carry += (x < y); } assert(BIGLITTLE(pm == mod-len, pm == mod+len)); assert(BIGLITTLE(pp == prod-i, pp == prod+i)); BIGLITTLE(pp[0],pp[-1]) = (BNWORD16)x; x = (x >> 16) | (BNWORD32)carry << 16; } /* Last round of second half, simplified. */ BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD16)x; carry = (x >> 16); while (carry) carry -= lbnSubN_16(prod, mod, len); while (lbnCmp_16(prod, mod, len) >= 0) (void)lbnSubN_16(prod, mod, len);}#define lbnMontMul_16 lbnMontMul_16#endif#if defined(BNWORD32) && PRODUCT_SCAN/* * Trial code for product-scanning squaring. This seems to slow the C * code down rather than speed it up. */voidlbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len){ BNWORD32 x, y, z; BNWORD16 const *p1, *p2; unsigned carry; unsigned i, j; /* Special case of zero */ if (!len) return; /* Word 0 of product */ x = (BNWORD32)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]); BIGLITTLE(*--prod, *prod++) = (BNWORD16)x; x >>= 16; /* Words 1 through len-1 */ for (i = 1; i < len; i++) { carry = 0; y = 0; p1 = num; p2 = BIGLITTLE(num-i-1,num+i+1); for (j = 0; j < (i+1)/2; j++) { BIG(z = (BNWORD32)*--p1 * *p2++;) LITTLE(z = (BNWORD32)*p1++ * *--p2;) y += z; carry += (y < z); } y += z = y; carry += carry + (y < z); if ((i & 1) == 0) { assert(BIGLITTLE(--p1 == p2, p1 == --p2)); BIG(z = (BNWORD32)*p2 * *p2;) LITTLE(z = (BNWORD32)*p1 * *p1;) y += z; carry += (y < z); } x += y; carry += (x < y); BIGLITTLE(*--prod,*prod++) = (BNWORD16)x; x = (x >> 16) | (BNWORD32)carry << 16; } /* Words len through 2*len-2 */ for (i = 1; i < len; i++) { carry = 0; y = 0; p1 = BIGLITTLE(num-i,num+i); p2 = BIGLITTLE(num-len,num+len); for (j = 0; j < (len-i)/2; j++) { BIG(z = (BNWORD32)*--p1 * *p2++;) LITTLE(z = (BNWORD32)*p1++ * *--p2;) y += z; carry += (y < z); } y += z = y; carry += carry + (y < z); if ((len-i) & 1) { assert(BIGLITTLE(--p1 == p2, p1 == --p2)); BIG(z = (BNWORD32)*p2 * *p2;) LITTLE(z = (BNWORD32)*p1 * *p1;) y += z; carry += (y < z); } x += y; carry += (x < y); BIGLITTLE(*--prod,*prod++) = (BNWORD16)x; x = (x >> 16) | (BNWORD32)carry << 16; } /* Word 2*len-1 */ BIGLITTLE(*--prod,*prod) = (BNWORD16)x;}#define lbnSquare_16 lbnSquare_16#endif/* * Square a number, using optimized squaring to reduce the number of * primitive multiples that are executed. There may not be any * overlap of the input and output. * * Technique: Consider the partial products in the multiplication * of "abcde" by itself: * * a b c d e * * a b c d e * ================== * ae be ce de ee * ad bd cd dd de * ac bc cc cd ce * ab bb bc bd be * aa ab ac ad ae * * Note that everything above the main diagonal: * ae be ce de = (abcd) * e * ad bd cd = (abc) * d * ac bc = (ab) * c * ab = (a) * b * * is a copy of everything below the main diagonal: * de * cd ce * bc bd be * ab ac ad ae * * Thus, the sum is 2 * (off the diagonal) + diagonal. * * This is accumulated beginning with the diagonal (which * consist of the squares of the digits of the input), which is then * divided by two, the off-diagonal added, and multiplied by two * again. The low bit is simply a copy of the low bit of the * input, so it doesn't need special care. * * TODO: Merge the shift by 1 with the squaring loop. * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W. */#ifndef lbnSquare_16voidlbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len){ BNWORD16 t; BNWORD16 *prodx = prod; /* Working copy of the argument */ BNWORD16 const *numx = num; /* Working copy of the argument */ unsigned lenx = len; /* Working copy of the argument */ if (!len) return; /* First, store all the squares */ while (lenx--) {#ifdef mul16_ppmm BNWORD16 ph, pl; t = BIGLITTLE(*--numx,*numx++); mul16_ppmm(ph,pl,t,t); BIGLITTLE(*--prodx,*prodx++) = pl; BIGLITTLE(*--prodx,*prodx++) = ph;#elif defined(BNWORD32) /* use BNWORD32 */ BNWORD32 p; t = BIGLITTLE(*--numx,*numx++); p = (BNWORD32)t * t; BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)p; BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)(p>>16);#else /* Use lbnMulN1_16 */ t = BIGLITTLE(numx[-1],*numx); lbnMulN1_16(prodx, numx, 1, t); BIGLITTLE(--numx,numx++); BIGLITTLE(prodx -= 2, prodx += 2);#endif } /* Then, shift right 1 bit */ (void)lbnRshift_16(prod, 2*len, 1); /* Then, add in the off-diagonal sums */ lenx = len; numx = num; prodx = prod; while (--lenx) { t = BIGLITTLE(*--numx,*numx++); BIGLITTLE(--prodx,prodx++); t = lbnMulAdd1_16(prodx, numx, lenx, t); lbnAdd1_16(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t); BIGLITTLE(--prodx,prodx++); } /* Shift it back up */ lbnDouble_16(prod, 2*len); /* And set the low bit appropriately */ BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;}#endif /* !lbnSquare_16 *//* * lbnNorm_16 - given a number, return a modified length such that the * most significant digit is non-zero. Zero-length input is okay. */#ifndef lbnNorm_16unsignedlbnNorm_16(BNWORD16 const *num, unsigned len){ BIGLITTLE(num -= len,num += len); while (len && BIGLITTLE(*num++,*--num) == 0) --len; return len;}#endif /* lbnNorm_16 *//* * lbnBits_16 - return the number of significant bits in the array. * It starts by normalizing the array. Zero-length input is okay. * Then assuming there's anything to it, it fetches the high word, * generates a bit length by multiplying the word length by 16, and * subtracts off 16/2, 16/4, 16/8, ... bits if the high bits are clear. */#ifndef lbnBits_16unsignedlbnBits_16(BNWORD16 const *num, unsigned len){ BNWORD16 t; unsigned i; len = lbnNorm_16(num, len); if (len) { t = BIGLITTLE(*(num-len),*(num+(len-1))); assert(t); len *= 16; i = 16; while (i /= 2) { if (t >> i) t >>= i; else len -= i; } } return len;}#endif /* lbnBits_16 *//* * If defined, use hand-rolled divide rather than compiler's native. * If the machine doesn't do it in line, the manual code is probably * faster, since it can assume normalization. */#ifndef BN_SLOW_DIVIDE_32#define BN_SLOW_DIVIDE_32 1#endif/* * Return (nh<<16|nl) % d, and place the quotient digit into *q. * It is guaranteed that nh < d, and that d is normalized (with its high * bit set). If we have a double-width type, it's easy. If not, ooh, * yuk! */#ifndef lbnDiv21_16#if defined(BNWORD32) && !BN_SLOW_DIVIDE_32BNWORD16lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d){ BNWORD32 n = (BNWORD32)nh << 16 | nl; /* Divisor must be normalized */ assert(d >> (16-1) == 1); *q = n / d; return n % d;}#else/* * This is where it gets ugly. * * Do the division in two halves, using Algorithm D from section 4.3.1 * of Knuth. Note Theorem B from that section, that the quotient estimate * is never more than the true quotient, and is never more than two * too low. * * The mapping onto conventional long division is (everything a half word): * _____________qh___ql_ * dh dl ) nh.h nh.l nl.h nl.l * - (implicit) * ----------- * rrrrrrrrr nl.l * - (implicit) * ----------- * rrrrrrrrr * * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way: * First, estimate a q digit so that nh/dh works. Subtracting q*dh from * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the * low part of the subtractor, q * dl. This also needs to be subtracted * from (nh.h nh.l nl.h) to get the final remainder. So we take the * remainder, which is (nh.h nh.l) - q*dl, shift it and add in nl.h, and * try to subtract q * dl from that. * It is possible that the remainder we're working with, r, is less than * the product q * d1, due to an error in estimating q. The estimation * technique can produce a q that is too large (never too small), leading * to r which is too small. In that case, decrement the digit q, add * shifted dh to r (to correct for that error), and subtract dl from the * product we're comparing r with. That's the "correct" way to do it, but * just adding dl to r instead of subtracting it from the product is * equivalent and a lot simpler. You just have to watch out for overflow. * * The process is repeated with (rrrr rrrr nl.l) for the low digit of the * quotient. * * The various uses of 16/2 for shifts are because of the note about * automatic editing of this file at the very top of the file. */#define highhalf(x) ( (x) >> 16/2 )#define lowhalf(x) ( (x) & (((BNWORD16)1 << 16/2)-1) )BNWORD16lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d){ BNWORD16 dh = highhalf(d), dl = lowhalf(d); BNWORD16 qh, ql, prod, r; /* Divisor must be normalized */ assert(d >> (16-1) == 1); /* Do first half-word of division */ qh = nh / dh; r = nh % dh; prod = qh * dl; /* * Add next half-word of numerator to remainder and correct. * qh may be up to two too large. * */ r = r << (16/2) | highhalf(nl); if (r < prod) { --qh; r += d; if (r >= d && r < prod) { --qh; r += d; } } r -= prod; /* Do second half-word of division */ ql = r / dh; r = r % dh; prod = ql * dl; r = r << (16/2) | lowhalf(nl); if (r < prod) { --ql; r += d; if (r >= d && r < prod) { --ql; r += d; } } r -= prod; *q = qh << (16/2) | ql; return r;}#endif#endif /* lbnDiv21_16 *//* * In the division functions, the dividend and divisor are referred to * as "n" and "d", which stand for "numerator" and "denominator". * * The quotient is (nlen-dlen+1) digits long. It may be overlapped with * the high (nlen-dlen) words of the dividend, but one extra word is needed * on top to hold the top word. *//* * Divide an n-word number by a 1-word number, storing the remainder * and n-1 words of the n-word quotient. The high word is returned. * It IS legal for rem to point to the same address as n, and for * q to point one word higher. * * TODO: If BN_SLOW_DIVIDE_32, add a divnhalf_16 which uses 16-bit * dividends if the divisor is half that long. * TODO: Shift the dividend on the fly to avoid the last division and * instead have a remainder that needs shifting. * TODO: Use reciprocals rather than dividing. */#ifndef lbnDiv1_16BNWORD16lbnDiv1_16(BNWORD16 *q, BNWORD16 *rem, BNWORD16 const *n, unsigned len, BNWORD16 d){ unsigned shift; unsigned xlen; BNWORD16 r; BNWORD16 qhigh; assert(len > 0); assert(d); if (len == 1) { r = *n; *rem = r%d; return r/d; } shift = 0; r = d; xlen = 16; while (xlen /= 2) { if (r >> xlen) r >>= xlen; else shift += xlen; } assert(d >> (16-1-shift) == 1); d <<= shift; BIGLITTLE(q -= len-1,q += len-1); BIGLITTLE(n -= len,n += len); r = BIGLITTLE(*n++,*--n); if (r < d) { qhigh = 0; } else { qhigh = r/d; r %= d; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -