📄 lbn16.c
字号:
#if !defined(mul16_ppmm) && defined(mul16_ppmma)#define mul16_ppmm(ph,pl,x,y) mul16_ppmma(ph,pl,x,y,0)#endif/* * Use this definition to test the mul16_ppmm-based operations on machines * that do not provide mul16_ppmm. Change the final "0" to a "1" to * enable it. */#if !defined(mul16_ppmm) && defined(BNWORD32) && 0 /* Debugging */#define mul16_ppmm(ph,pl,x,y) \ ({BNWORD32 _ = (BNWORD32)(x)*(y); (pl) = _; (ph) = _>>16;})#endif#if defined(mul16_ppmm) && !defined(mul16_ppmma)#define mul16_ppmma(ph,pl,x,y,a) \ (mul16_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))#endif#if defined(mul16_ppmma) && !defined(mul16_ppmmaa)#define mul16_ppmmaa(ph,pl,x,y,a,b) \ (mul16_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))#endif/* * lbnMulN1_16: Multiply an n-word input by a 1-word input and store the * n+1-word product. This uses either the umul16_ppmm and umul16_ppmma * macros, or C multiplication with the BNWORD32 type. This uses mul16_ppmma * if available, assuming you won't bother defining it unless you can do * better than the normal multiplication. */#ifndef lbnMulN1_16#ifdef lbnMulAdd1_16 /* If we have this asm primitive, use it. */voidlbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k){ lbnZero_16(out, len); BIGLITTLE(*(out-len),*(out+len)) = lbnMulAdd1_16(out, in, len, k);}#elif defined(umul16_ppmm)voidlbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k){ BNWORD16 prod, carry, carryin; assert(len > 0); BIG(--out;--in;); mul16_ppmm(carry, *out, *in, k); LITTLE(out++;in++;) while (--len) { BIG(--out;--in;) carryin = carry; mul16_ppmma(carry, *out, *in, k, carryin); LITTLE(out++;in++;) } BIGLITTLE(*--out,*out) = carry;}#elif defined(BNWORD32)voidlbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k){ BNWORD32 p; assert(len > 0); p = (BNWORD32)BIGLITTLE(*--in,*in++) * k; BIGLITTLE(*--out,*out++) = (BNWORD16)p; while (--len) { p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + (BNWORD16)(p >> 16); BIGLITTLE(*--out,*out++) = (BNWORD16)p; } BIGLITTLE(*--out,*out) = (BNWORD16)(p >> 16);}#else#error No 16x16 -> 32 multiply available for 16-bit bignum package#endif#endif /* lbnMulN1_16 *//* * lbnMulAdd1_16: Multiply an n-word input by a 1-word input and add the * low n words of the product to the destination. *Returns the n+1st word * of the product.* (That turns out to be more convenient than adding * it into the destination and dealing with a possible unit carry out * of *that*.) This uses either the umul16_ppmma and umul16_ppmmaa macros, * or C multiplication with the BNWORD32 type. * * If you're going to write assembly primitives, this is the one to * start with. It is by far the most commonly called function. */#ifndef lbnMulAdd1_16#if defined(mul16_ppmm)BNWORD16lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k){ BNWORD16 prod, carry, carryin; assert(len > 0); BIG(--out;--in;); carryin = *out; mul16_ppmma(carry, *out, *in, k, carryin); LITTLE(out++;in++;) while (--len) { BIG(--out;--in;); carryin = carry; mul16_ppmmaa(carry, prod, *in, k, carryin, *out); *out = prod; LITTLE(out++;in++;) } return carry;}#elif defined(BNWORD32)BNWORD16lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k){ BNWORD32 p; assert(len > 0); p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out); BIGLITTLE(*out,*out++) = (BNWORD16)p; while (--len) { p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + (BNWORD16)(p >> 16) + BIGLITTLE(*--out,*out); BIGLITTLE(*out,*out++) = (BNWORD16)p; } return (BNWORD16)(p >> 16);}#else#error No 16x16 -> 32 multiply available for 16-bit bignum package#endif#endif /* lbnMulAdd1_16 *//* * lbnMulSub1_16: Multiply an n-word input by a 1-word input and subtract the * n-word product from the destination. Returns the n+1st word of the product. * This uses either the umul16_ppmm and umul16_ppmma macros, or * C multiplication with the BNWORD32 type. * * This is rather uglier than adding, but fortunately it's only used in * division which is not used too heavily. */#ifndef lbnMulN1_16#if defined(mul16_ppmm)BNWORD16lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k){ BNWORD16 prod, carry, carryin; assert(len > 0); BIG(--in;) mul16_ppmm(carry, prod, *in, k); LITTLE(in++;) carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod; while (--len) { BIG(--in;); carryin = carry; mul16_ppmma(carry, prod, *in, k, carryin); LITTLE(in++;) carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod; } return carry;}#elif defined(BNWORD32)BNWORD16lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k){ BNWORD32 p; BNWORD16 carry, t; assert(len > 0); p = (BNWORD32)BIGLITTLE(*--in,*in++) * k; t = BIGLITTLE(*--out,*out); carry = (BNWORD16)(p>>16) + ((BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t); while (--len) { p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + carry; t = BIGLITTLE(*--out,*out); carry = (BNWORD16)(p>>16) + ( (BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t ); } return carry;}#else#error No 16x16 -> 32 multiply available for 16-bit bignum package#endif#endif /* !lbnMulSub1_16 *//* * Shift n words left "shift" bits. 0 < shift < 16. Returns the * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift). */#ifndef lbnLshift_16BNWORD16lbnLshift_16(BNWORD16 *num, unsigned len, unsigned shift){ BNWORD16 x, carry; assert(shift > 0); assert(shift < 16); carry = 0; while (len--) { BIG(--num;) x = *num; *num = x<<shift | carry; LITTLE(num++;) carry = x >> (16-shift); } return carry;}#endif /* !lbnLshift_16 *//* * An optimized version of the above, for shifts of 1. * Some machines can use add-with-carry tricks for this. */#ifndef lbnDouble_16BNWORD16lbnDouble_16(BNWORD16 *num, unsigned len){ BNWORD16 x, carry; carry = 0; while (len--) { BIG(--num;) x = *num; *num = x<<1 | carry; LITTLE(num++;) carry = x >> (16-1); } return carry;}#endif /* !lbnDouble_16 *//* * Shift n words right "shift" bits. 0 < shift < 16. Returns the * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift). */#ifndef lbnRshift_16BNWORD16lbnRshift_16(BNWORD16 *num, unsigned len, unsigned shift){ BNWORD16 x, carry = 0; assert(shift > 0); assert(shift < 16); BIGLITTLE(num -= len, num += len); while (len--) { LITTLE(--num;) x = *num; *num = x>>shift | carry; BIG(num++;) carry = x << (16-shift); } return carry >> (16-shift);}#endif /* !lbnRshift_16 *//* * Multiply two numbers of the given lengths. prod and num2 may overlap, * provided that the low len1 bits of prod are free. (This corresponds * nicely to the place the result is returned from lbnMontReduce_16.) * * TODO: Use Karatsuba multiply. The overlap constraints may have * to get rewhacked. */#ifndef lbnMul_16voidlbnMul_16(BNWORD16 *prod, BNWORD16 const *num1, unsigned len1, BNWORD16 const *num2, unsigned len2){ /* Special case of zero */ if (!len1 || !len2) { lbnZero_16(prod, len1+len2); return; } /* Multiply first word */ lbnMulN1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++)); /* * Add in subsequent words, storing the most significant word, * which is new each time. */ while (--len2) { BIGLITTLE(--prod,prod++); BIGLITTLE(*(prod-len1-1),*(prod+len1)) = lbnMulAdd1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++)); }}#endif /* !lbnMul_16 *//* * lbnMulX_16 is a square multiply - both inputs are the same length. * It's normally just a macro wrapper around the general multiply, * but might be implementable in assembly more efficiently (such as * in product scanning). */#if defined(BNWORD32) && PRODUCT_SCAN/* * Test code to see whether product scanning is any faster. It seems * to make the C code slower, so PRODUCT_SCAN is not defined. */static voidlbnMulX_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2, unsigned len){ BNWORD32 x, y; BNWORD16 const *p1, *p2; unsigned carry; unsigned i, j; /* Special case of zero */ if (!len) return; x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]); BIGLITTLE(*--prod, *prod++) = (BNWORD16)x; x >>= 16; for (i = 1; i < len; i++) { carry = 0; p1 = num1; p2 = BIGLITTLE(num2-i-1,num2+i+1); for (j = 0; j <= i; j++) { BIG(y = (BNWORD32)*--p1 * *p2++;) LITTLE(y = (BNWORD32)*p1++ * *--p2;) x += y; carry += (x < y); } BIGLITTLE(*--prod,*prod++) = (BNWORD16)x; x = (x >> 16) | (BNWORD32)carry << 16; } for (i = 1; i < len; i++) { carry = 0; p1 = BIGLITTLE(num1-i,num1+i); p2 = BIGLITTLE(num2-len,num2+len); for (j = i; j < len; j++) { BIG(y = (BNWORD32)*--p1 * *p2++;) LITTLE(y = (BNWORD32)*p1++ * *--p2;) x += y; carry += (x < y); } BIGLITTLE(*--prod,*prod++) = (BNWORD16)x; x = (x >> 16) | (BNWORD32)carry << 16; } BIGLITTLE(*--prod,*prod) = (BNWORD16)x;}#else/* Default trivial macro definition */#define lbnMulX_16(prod, num1, num2, len) lbnMul_16(prod, num1, len, num2, len)#endif#if !defined(lbnMontMul) && defined(BNWORD32) && PRODUCT_SCAN/* * Test code for product-scanning multiply. This seems to slow the C * code down rather than speed it up. * This does a multiply and Montgomery reduction together, using the * same loops. The outer loop scans across the product, twice. * The first pass computes the low half of the product and the * Montgomery multipliers. These are stored in the product array, * which contains no data as of yet. x and carry add up the columns * and propagate carries forward. * * The second half multiplies the upper half, adding in the modulus * times the Montgomery multipliers. The results of this multiply * are stored. */static voidlbnMontMul_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2, BNWORD16 const *mod, unsigned len, BNWORD16 inv){ BNWORD32 x, y; BNWORD16 const *p1, *p2, *pm; BNWORD16 *pp; BNWORD16 t; unsigned carry; unsigned i, j; /* Special case of zero */ if (!len) return; /* * This computes directly into the high half of prod, so just * shift the pointer and consider prod only "len" elements long * for the rest of the code. */ BIGLITTLE(prod -= len, prod += len); /* Pass 1 - compute Montgomery multipliers */ /* First iteration can have certain simplifications. */ x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]); BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD16)x; y = (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]); x += y; /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */ carry = (x < y); assert((BNWORD16)x == 0); x = x >> 16 | (BNWORD32)carry << 16; for (i = 1; i < len; i++) { carry = 0; p1 = num1; p2 = BIGLITTLE(num2-i-1,num2+i+1); pp = prod; pm = BIGLITTLE(mod-i-1,mod+i+1); for (j = 0; j < i; j++) { y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2); x += y; carry += (x < y); y = (BNWORD32)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm); x += y; carry += (x < y); } y = (BNWORD32)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]); x += y; carry += (x < y); assert(BIGLITTLE(pp == prod-i, pp == prod+i)); BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD16)x; assert(BIGLITTLE(pm == mod-1, pm == mod+1)); y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]); x += y; carry += (x < y); assert((BNWORD16)x == 0);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -