lbn16.c

来自「包含哈希,对称以及非对称的经典算法包含经典事例」· C语言代码 · 共 2,209 行 · 第 1/5 页
2,209 行
}#endif/* * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^16. * * This just performs Newton's iteration until it gets the * inverse.  The initial estimate is always correct to 3 bits, and * sometimes 4.  The number of valid bits doubles each iteration. * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable * for the error mod 2^2n.  x * y == 1 + k*2^n (mod 2^2n) and follow * the iteration through.) */#ifndef lbnMontInv1_16BNWORD16lbnMontInv1_16(BNWORD16 const x){        BNWORD16 y = x, z;	assert(x & 1);         while ((z = x*y) != 1)                y *= 2 - z;        return -y;}#endif /* !lbnMontInv1_16 */#if defined(BNWORD32) && PRODUCT_SCAN/* * Test code for product-scanning Montgomery reduction. * This seems to slow the C code down rather than speed it up. * * The first loop computes the Montgomery multipliers, storing them over * the low half of the number n. * * The second half multiplies the upper half, adding in the modulus * times the Montgomery multipliers.  The results of this multiply * are stored. */voidlbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned mlen, BNWORD16 inv){	BNWORD32 x, y;	BNWORD16 const *pm;	BNWORD16 *pn;	BNWORD16 t;	unsigned carry;	unsigned i, j;	/* Special case of zero */	if (!mlen)		return;	/* Pass 1 - compute Montgomery multipliers */	/* First iteration can have certain simplifications. */	t = BIGLITTLE(n[-1],n[0]);	x = t;	t *= inv;	BIGLITTLE(n[-1], n[0]) = t;	x += (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */	assert((BNWORD16)x == 0);	x = x >> 16;	for (i = 1; i < mlen; i++) {		carry = 0;		pn = n;		pm = BIGLITTLE(mod-i-1,mod+i+1);		for (j = 0; j < i; j++) {			y = (BNWORD32)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);			x += y;			carry += (x < y);		}		assert(BIGLITTLE(pn == n-i, pn == n+i));		y = t = BIGLITTLE(pn[-1], pn[0]);		x += y;		carry += (x < y);		BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD16)x;		assert(BIGLITTLE(pm == mod-1, pm == mod+1));		y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);		x += y;		carry += (x < y);		assert((BNWORD16)x == 0);		x = x >> 16 | (BNWORD32)carry << 16;	}	BIGLITTLE(n -= mlen, n += mlen);	/* Pass 2 - compute upper words and add to n */	for (i = 1; i < mlen; i++) {		carry = 0;		pm = BIGLITTLE(mod-i,mod+i);		pn = n;		for (j = i; j < mlen; j++) {			y = (BNWORD32)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);			x += y;			carry += (x < y);		}		assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));		assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));		y = t = BIGLITTLE(*(n-i),*(n+i-1));		x += y;		carry += (x < y);		BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD16)x;		x = (x >> 16) | (BNWORD32)carry << 16;	}	/* Last round of second half, simplified. */	t = BIGLITTLE(*(n-mlen),*(n+mlen-1));	x += t;	BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD16)x;	carry = (unsigned)(x >> 16);	while (carry)		carry -= lbnSubN_16(n, mod, mlen);	while (lbnCmp_16(n, mod, mlen) >= 0)		(void)lbnSubN_16(n, mod, mlen);}#define lbnMontReduce_16 lbnMontReduce_16#endif/* * Montgomery reduce n, modulo mod.  This reduces modulo mod and divides * by 2^(16*mlen).  I'll fill in an explanation of Montgomery representation * later.  Returns the result in the *top* n words of the argument n. * This is ready for another multiplication using mulnn. *  * TODO: Change to a full inverse and use Karatsuba's multiplication * rather than this word-at-a-time. */#ifndef lbnMontReduce_16voidlbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen,                BNWORD16 inv){	BNWORD16 t;	BNWORD16 c = 0;	unsigned len = mlen;	/* inv must be the negative inverse of mod's least significant word */	assert((BNWORD16)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD16)-1);	assert(len);	do {		t = lbnMulAdd1_16(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));		c += lbnAdd1_16(BIGLITTLE(n-mlen,n+mlen), len, t);		BIGLITTLE(--n,++n);	} while (--len);	/*	 * All that adding can cause an overflow past the modulus size,	 * but it's unusual, and never by much, so a subtraction loop	 * is the right way to deal with it.	 * This subtraction happens infrequently - I've only ever seen it	 * invoked once per reduction, and then just under 22.5% of the time.	 */	while (c)		c -= lbnSubN_16(n, mod, mlen);	while (lbnCmp_16(n, mod, mlen) >= 0)		(void)lbnSubN_16(n, mod, mlen);}#endif /* !lbnMontReduce_16 *//* * A couple of helpers that you might want to implement atomically * in asm sometime. */#ifndef lbnMontMul_16/* * Multiply "num1" by "num2", modulo "mod", all of length "len", and * place the result in the high half of "prod".  "inv" is the inverse * of the least-significant word of the modulus, modulo 2^16. * This uses numbers in Montgomery form.  Reduce using "len" and "inv". * * This is implemented as a macro to win on compilers that don't do * inlining, since it's so trivial. */#define lbnMontMul_16(prod, n1, n2, mod, len, inv) \	(lbnMulX_16(prod, n1, n2, len), lbnMontReduce_16(prod, mod, len, inv))#endif /* !lbnMontMul_16 */#ifndef lbnMontSquare_16/* * Square "num", modulo "mod", both of length "len", and place the result * in the high half of "prod".  "inv" is the inverse of the least-significant * word of the modulus, modulo 2^16. * This uses numbers in Montgomery form.  Reduce using "len" and "inv". * * This is implemented as a macro to win on compilers that don't do * inlining, since it's so trivial. */#define lbnMontSquare_16(prod, n, mod, len, inv) \	(lbnSquare_16(prod, n, len), lbnMontReduce_16(prod, mod, len, inv))	#endif /* !lbnMontSquare_16 *//* * Convert a number to Montgomery form - requires mlen + nlen + 1 words * of memory in "n". */voidlbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen){	/* Move n up "mlen" words */	lbnCopy_16(BIGLITTLE(n-mlen,n+mlen), n, nlen);	lbnZero_16(n, mlen);	/* Do the division - dump the quotient in the high-order words */	(void)lbnDiv_16(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);}/* * Convert from Montgomery form.  Montgomery reduction is all that is * needed. */voidlbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len){	/* Zero the high words of n */	lbnZero_16(BIGLITTLE(n-len,n+len), len);	lbnMontReduce_16(n, mod, len, lbnMontInv1_16(BIGLITTLE(mod[-1],mod[0])));	/* Move n down len words */	lbnCopy_16(n, BIGLITTLE(n-len,n+len), len);}/* * The windowed exponentiation algorithm, precomputes a table of odd * powers of n up to 2^k.  It takes 2^(k-1)-1 multiplies to compute * the table, and (e-1)/(k+1) multiplies (on average) to perform the * exponentiation.  To minimize the sum, k must vary with e. * The optimal window sizes vary with the exponent length.  Here are * some selected values and the boundary cases. * (An underscore _ has been inserted into some of the numbers to ensure * that magic strings like 16 do not appear in this table.  It should be * ignored.) * * At e =    1 bits, k= 1 (0.000000) is best. * At e =    2 bits, k= 1 (0.500000) is best. * At e =    4 bits, k= 1 (1.500000) is best. * At e =    8 bits, k= 2 (3.333333) < k =  1 (3.500000) * At e =  1_6 bits, k= 2 (6.000000) is best. * At e =   26 bits, k= 3 (9.250000) < k =  2 (9.333333) * At e =  3_2 bits, k= 3 (10.750000) is best. * At e =  6_4 bits, k= 3 (18.750000) is best. * At e =   82 bits, k= 4 (23.200000) < k =  3 (23.250000) * At e =  128 bits, k= 4 (3_2.400000) is best. * At e =  242 bits, k= 5 (55.1_66667) < k =  4 (55.200000) * At e =  256 bits, k= 5 (57.500000) is best. * At e =  512 bits, k= 5 (100.1_66667) is best. * At e =  674 bits, k= 6 (127.142857) < k =  5 (127.1_66667) * At e = 1024 bits, k= 6 (177.142857) is best. * At e = 1794 bits, k= 7 (287.125000) < k =  6 (287.142857) * At e = 2048 bits, k= 7 (318.875000) is best. * At e = 4096 bits, k= 7 (574.875000) is best. * * The numbers in parentheses are the expected number of multiplications * needed to do the computation.  The normal russian-peasant modular * exponentiation technique always uses (e-1)/2.  For exponents as * small as 192 bits (below the range of current factoring algorithms), * half of the multiplies are eliminated, 45.2 as opposed to the naive * 95.5.  Counting the 191 squarings as 3/4 a multiply each (squaring * proper is just over half of multiplying, but the Montgomery * reduction in each case is also a multiply), that's 143.25 * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings. * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a * 24.3% savings.  It asymptotically approaches 25%. * * Given that exponents for which k>7 are useful are uncommon, * a fixed size table for k <= 7 is used for simplicity. * k = 8 is uzeful at 4610 bits, k = 9 at 11522 bits. * * The basic number of squarings needed is e-1, although a k-bit * window (for k > 1) can save, on average, k-2 of those, too. * That savings currently isn't counted here.  It would drive the * crossover points slightly lower. * (Actually, this win is also reduced in the DoubleExpMod case, * meaning we'd have to split the tables.  Except for that, the * multiples by powers of the two bases are independent, so * the same logic applies to each as the single case.) * * Table entry i is the largest number of bits in an exponent to * process with a window size of i+1.  So the window never goes above 7 * bits, requiring 2^(7-1) = 0x40 precomputed multiples. */#define BNEXPMOD_MAX_WINDOW	7static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {	7, 25, 81, 241, 673, 1793, (unsigned)-1};/* * Perform modular exponentiation, as fast as possible!  This uses * Montgomery reduction, optimized squaring, and windowed exponentiation. * The modulus "mod" MUST be odd! * * This returns 0 on success, -1 on out of memory. * * The window algorithm: * The idea is to keep a running product of b1 = n^(high-order bits of exp), * and then keep appending exponent bits to it.  The following patterns * apply to a 3-bit window (k = 3): * To append   0: square * To append   1: square, multiply by n^1 * To append  10: square, multiply by n^1, square * To append  11: square, square, multiply by n^3 * To append 100: square, multiply by n^1, square, square * To append 101: square, square, square, multiply by n^5 * To append 110: square, square, multiply by n^3, square * To append 111: square, square, square, multiply by n^7 * * Since each pattern involves only one multiply, the longer the pattern * the better, except that a 0 (no multiplies) can be appended directly. * We precompute a table of odd powers of n, up to 2^k, and can then * multiply k bits of exponent at a time.  Actually, assuming random * exponents, there is on average one zero bit between needs to * multiply (1/2 of the time there's none, 1/4 of the time there's 1, * 1/8 of the time, there's 2, 1/16 of the time, there's 3, etc.), so * you have to do one multiply per k+1 bits of exponent. * * The loop walks down the exponent, squaring the result buffer as * it goes.  There is a wbits+1 bit lookahead buffer, buf, that is * filled with the upcoming exponent bits.  (What is read after the * end of the exponent is unimportant, but it is filled with zero here.) * When the most-significant bit of this buffer becomes set, i.e. * (buf & tblmask) != 0, we have to decide what pattern to multiply * by, and when to do it.  We decide, remember to do it in future * after a suitable number of squarings have passed (e.g. a pattern * of "100" in the buffer requires that we multiply by n^1 immediately; * a pattern of "110" calls for multiplying by n^3 after one more * squaring), clear the buffer, and continue. * * When we start, there is one more optimization: the result buffer * is implcitly one, so squaring it or multiplying by it can be * optimized away.  Further, if we start with a pattern like "100" * in the lookahead window, rather than placing n into the buffer * and then starting to square it, we have already computed n^2 * to compute the odd-powers table, so we can place that into * the buffer and save a squaring. * * This means that if you have a k-bit window, to compute n^z, * where z is the high k bits of the exponent, 1/2 of the time * it requires no squarings.  1/4 of the time, it requires 1 * squaring, ... 1/2^(k-1) of the time, it reqires k-2 squarings. * And the remaining 1/2^(k-1) of the time, the top k bits are a * 1 followed by k-1 0 bits, so it again only requires k-2 * squarings, not k-1.  The average of these is 1.  Add that * to the one squaring we have to do to compute the table, * and you'll see that a k-bit window saves k-2 squarings * as well as reducing the multiplies.  (It actually doesn't * hurt in the case k = 1, either.) * * n must have mlen words allocated.  Although fewer may be in use * when n is passed in, all are in use on exit. */intlbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen,	BNWORD16 const *e, unsigned elen, BNWORD16 *mod, unsigned mlen){	BNWORD16 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];				/* Table of odd powers of n */	unsigned ebits;		/* Exponent bits */	unsigned wbits;		/* Window size */	unsigned tblmask;	/* Mask of exponentiation window */	BNWORD16 bitpos;	/* Mask of current look-ahead bit */	unsigned buf;		/* Buffer of exponent bits */	unsigned multpos;	/* Where to do pending multiply */	BNWORD16 const *mult;	/* What to multiply by */	unsigned i;		/* Loop counter */	int isone;		/* Flag: accum. is implicitly one */	BNWORD16 *a, *b;	/* Working buffers/accumulators */	BNWORD16 *t;		/* Pointer into the working buffers */	BNWORD16 inv;		/* mod^-1 modulo 2^16 */	assert(mlen);	assert(nlen <= mlen);	/* First, a couple of trivial cases. */	elen = lbnNorm_16(e, elen);	if (!elen) {		/* x ^ 0 == 1 */		lbnZero_16(result, mlen);		BIGLITTLE(result[-1],result[0]) = 1;		return 0;	}	ebits = lbnBits_16(e, elen);	if (ebits == 1) {		/* x ^ 1 == x */		if (n != result)			lbnCopy_16(result, n, nlen);		if (mlen > nlen)			lbnZero_16(BIGLITTLE(result-nlen,result+nlen),			           mlen-nlen);		return 0;	}	/* Okay, now move the exponent pointer to the most-significant word */	e = BIGLITTLE(e-elen, e+elen-1);	/* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */	wbits = 0;	while (ebits > bnExpModThreshTable[wbits])		wbits++;	/* Allocate working storage: two product buffers and the tables. */	LBNALLOC(a, 2*mlen);	if (!a)		return -1;	LBNALLOC(b, 2*mlen);	if (!b) {		LBNFREE(a, 2*mlen);		return -1;	}	/* Convert to the appropriate table size: tblmask = 1<<(k-1) */	tblmask = 1u << wbits;	LBNALLOC(t, mlen);	if (!t) {		LBNFREE(b, 2*mlen);		LBNFREE(a, 2*mlen);		return -1;	}	/* We have the result buffer available, so use it. */	table[0] = result;	/*	 * Okay, we now have a minimal-sized table - expand it.	 * This is allowed to fail!  If so, scale back the table size	 * and proceed.	 */	for (i = 1; i < tblmask; i++) {		LBNALLOC(t, mlen);		if (!t)	/* Out of memory!  Quit the loop. */			break;		table[i] = t;	}	/* If we stopped, with i < tblmask, shrink the tables appropriately */	while (tblmask > i) {		wbits--;		tblmask >>= 1;	}	/* Fre
lbn16.c - 源码说明

本页面展示了「包含哈希,对称以及非对称的经典算法包含经典事例」中的 lbn16.c 源码文件，采用 C语言编程语言编写，共 2,209 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与对称相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?