📄 mul_n.c
字号:
ASSERT (t2 < 7); *pth = th; *pt1 = t1; *pt2 = t2;}#endif/*-- interpolate3 ----------------------------------------------------------*//* Interpolates B, C, D (in-place) from: * 16*A+8*B+4*C+2*D+E * A+B+C+D+E * A+2*B+4*C+8*D+16*E * where: * A[], B[], C[] and D[] all have length l, * E[] has length ls with l-ls = 0, 2 or 4. * * Reads top words (from earlier overflow) from ptb, ptc and ptd, * and returns new top words there. */#if USE_MORE_MPNstatic voidinterpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len,mp_size_t len2){ mp_ptr ws; mp_limb_t t, tb,tc,td; TMP_DECL (marker); TMP_MARK (marker); ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); /* Let x1, x2, x3 be the values to interpolate. We have: * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e * c = a + x1 + x2 + x3 + e * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e */ ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); tb = *ptb; tc = *ptc; td = *ptd; /* b := b - 16*a - e * c := c - a - e * d := d - a - 16*e */ t = mpn_lshift (ws, A, len, 4); tb -= t + mpn_sub_n (B, B, ws, len); t = mpn_sub_n (B, B, E, len2); if (len2 == len) tb -= t; else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); tc -= mpn_sub_n (C, C, A, len); t = mpn_sub_n (C, C, E, len2); if (len2 == len) tc -= t; else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); t = mpn_lshift (ws, E, len2, 4); t += mpn_add_n (ws, ws, A, len2);#if 1 if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); td -= t + mpn_sub_n (D, D, ws, len);#else t += mpn_sub_n (D, D, ws, len2); if (len2 != len) { t = mpn_sub_1 (D+len2, D+len2, len-len2, t); t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); } td -= t;#endif /* b, d := b + d, b - d */#ifdef HAVE_MPN_ADD_SUB_N /* #error TO DO ... */#else t = tb + td + mpn_add_n (ws, B, D, len); td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; tb = t; MPN_COPY (B, ws, len);#endif /* b := b-8*c */ t = 8 * tc + mpn_lshift (ws, C, len, 3); tb -= t + mpn_sub_n (B, B, ws, len); /* c := 2*c - b */ tc = 2 * tc + mpn_lshift (C, C, len, 1); tc -= tb + mpn_sub_n (C, C, B, len); /* d := d/3 */ td = ((td - mpn_divexact_by3 (D, D, len)) * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; /* b, d := b + d, b - d */#ifdef HAVE_MPN_ADD_SUB_N /* #error TO DO ... */#else t = (tb + td + mpn_add_n (ws, B, D, len)) & GMP_NUMB_MASK; td = (tb - td - mpn_sub_n (D, B, D, len)) & GMP_NUMB_MASK; tb = t; MPN_COPY (B, ws, len);#endif /* Now: * b = 4*x1 * c = 2*x2 * d = 4*x3 */ ASSERT(!(*B & 3)); mpn_rshift (B, B, len, 2); B[len-1] |= (tb << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; ASSERT((mp_limb_signed_t)tb >= 0); tb >>= 2; ASSERT(!(*C & 1)); mpn_rshift (C, C, len, 1); C[len-1] |= (tc << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; ASSERT((mp_limb_signed_t)tc >= 0); tc >>= 1; ASSERT(!(*D & 3)); mpn_rshift (D, D, len, 2); D[len-1] |= (td << (GMP_NUMB_BITS - 2)) & GMP_NUMB_MASK; ASSERT((mp_limb_signed_t)td >= 0); td >>= 2;#if WANT_ASSERT ASSERT (tb < 2); if (len == len2) { ASSERT (tc < 3); ASSERT (td < 2); } else { ASSERT (tc < 2); ASSERT (!td); }#endif *ptb = tb; *ptc = tc; *ptd = td; TMP_FREE (marker);}#elsestatic voidinterpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls){ mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);#if WANT_ASSERT t = l - ls; ASSERT (t == 0 || t == 2 || t == 4);#endif sb = sc = sd = 0; for (i = 0; i < l; ++i) { mp_limb_t tb, tc, td, tt; a = *A; b = *B; c = *C; d = *D; e = i < ls ? *E : 0; /* Let x1, x2, x3 be the values to interpolate. We have: * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e * c = a + x1 + x2 + x3 + e * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e */ /* b := b - 16*a - e * c := c - a - e * d := d - a - 16*e */ t = a << 4; tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t); b -= t; tb -= b < e; b -= e; tc = -(c < a); c -= a; tc -= c < e; c -= e; td = -(d < a); d -= a; t = e << 4; td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t); d -= t; /* b, d := b + d, b - d */ t = b + d; tt = tb + td + (t < b); td = tb - td - (b < d); d = b - d; b = t; tb = tt; /* b := b-8*c */ t = c << 3; tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t); b -= t; /* c := 2*c - b */ t = c << 1; tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b); c = t - b; /* d := d/3 */ d *= MODLIMB_INVERSE_3; td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); td *= MODLIMB_INVERSE_3; /* b, d := b + d, b - d */ t = b + d; tt = tb + td + (t < b); td = tb - td - (b < d); d = b - d; b = t; tb = tt; /* Now: * b = 4*x1 * c = 2*x2 * d = 4*x3 */ /* sb has period 2. */ b += sb; tb += b < sb; sb &= maskOffHalf; sb |= sb >> (BITS_PER_MP_LIMB >> 1); sb += tb; /* sc has period 1. */ c += sc; tc += c < sc; /* TO DO: choose one of the following alternatives. */#if 1 sc = (mp_limb_signed_t) sc >> (BITS_PER_MP_LIMB - 1); sc += tc;#else sc = tc - ((mp_limb_signed_t) sc < 0L);#endif /* sd has period 2. */ d += sd; td += d < sd; sd &= maskOffHalf; sd |= sd >> (BITS_PER_MP_LIMB >> 1); sd += td; if (i != 0) { B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); D[-1] = od | d << (BITS_PER_MP_LIMB - 2); } ob = b >> 2; oc = c >> 1; od = d >> 2; ++A; ++B; ++C; ++D; ++E; } /* Handle top words. */ b = *ptb; c = *ptc; d = *ptd; t = b + d; d = b - d; b = t; b -= c << 3; c = (c << 1) - b; d *= MODLIMB_INVERSE_3; t = b + d; d = b - d; b = t; b += sb; c += sc; d += sd; B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); D[-1] = od | d << (BITS_PER_MP_LIMB - 2); b >>= 2; c >>= 1; d >>= 2;#if WANT_ASSERT ASSERT (b < 2); if (l == ls) { ASSERT (c < 3); ASSERT (d < 2); } else { ASSERT (c < 2); ASSERT (!d); }#endif *ptb = b; *ptc = c; *ptd = d;}#endif/*-- mpn_toom3_mul_n --------------------------------------------------------------*//* Multiplies using 5 mults of one third size and so on recursively. * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. * No overlap of p[...] with a[...] or b[...]. * ws is workspace. *//* TO DO: If MUL_TOOM3_THRESHOLD is much bigger than MUL_KARATSUBA_THRESHOLD then the * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() * because the "n < MUL_KARATSUBA_THRESHOLD" test here will always be false. */#define TOOM3_MUL_REC(p, a, b, n, ws) \ do { \ if (n < MUL_KARATSUBA_THRESHOLD) \ mpn_mul_basecase (p, a, n, b, n); \ else if (n < MUL_TOOM3_THRESHOLD) \ mpn_kara_mul_n (p, a, b, n, ws); \ else \ mpn_toom3_mul_n (p, a, b, n, ws); \ } while (0)voidmpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws){ mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; mp_limb_t *A,*B,*C,*D,*E, *W; mp_size_t l,l2,l3,l4,l5,ls; /* Break n words into chunks of size l, l and ls. * n = 3*k => l = k, ls = k * n = 3*k+1 => l = k+1, ls = k-1 * n = 3*k+2 => l = k+1, ls = k */ { mp_limb_t m; /* this is probably unnecessarily strict */ ASSERT (n >= MUL_TOOM3_THRESHOLD); l = ls = n / 3; m = n - l * 3; if (m != 0) ++l; if (m == 1) --ls; l2 = l * 2; l3 = l * 3; l4 = l * 4; l5 = l * 5; A = p; B = ws; C = p + l2; D = ws + l2; E = p + l4; W = ws + l4; } ASSERT (l >= 1); ASSERT (ls >= 1); /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); /** Second stage: pointwise multiplies. **/ TOOM3_MUL_REC(D, C, C + l, l, W); tD = cD*dD; if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD); if (dD) tD += mpn_addmul_1 (D + l, C, l, dD); ASSERT (tD < 49); TOOM3_MUL_REC(C, B, B + l, l, W); tC = cC*dC; /* TO DO: choose one of the following alternatives. */#if 0 if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC); if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);#else if (cC) { if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l); else tC += add2Times (C + l, C + l, B + l, l); } if (dC) { if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l); else tC += add2Times (C + l, C + l, B, l); }#endif ASSERT (tC < 9); TOOM3_MUL_REC(B, A, A + l, l, W); tB = cB*dB; if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB); if (dB) tB += mpn_addmul_1 (B + l, A, l, dB); ASSERT (tB < 49); TOOM3_MUL_REC(A, a, b, l, W); TOOM3_MUL_REC(E, a + l2, b + l2, ls, W); /** Third stage: interpolation. **/ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); /** Final stage: add up the coefficients. **/ tB += mpn_add_n (p + l, p + l, B, l2); tD += mpn_add_n (p + l3, p + l3, D, l2); MPN_INCR_U (p + l3, 2 * n - l3, tB); MPN_INCR_U (p + l4, 2 * n - l4, tC); MPN_INCR_U (p + l5, 2 * n - l5, tD);}/*-- mpn_toom3_sqr_n --------------------------------------------------------------*//* Like previous function but for squaring *//* FIXME: If SQR_TOOM3_THRESHOLD is big enough it might never get into the basecase range. Try to arrange those conditonals go dead. */#define TOOM3_SQR_REC(p, a, n, ws) \ do { \ if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) \ mpn_mul_basecase (p, a, n, a, n); \ else if (BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \ mpn_sqr_basecase (p, a, n); \ else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ mpn_kara_sqr_n (p, a, n, ws); \ else \ mpn_toom3_sqr_n (p, a, n, ws); \ } while (0)voidmpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws){ mp_limb_t cB,cC,cD, tB,tC,tD; mp_limb_t *A,*B,*C,*D,*E, *W; mp_size_t l,l2,l3,l4,l5,ls; /* Break n words into chunks of size l, l and ls. * n = 3*k => l = k, ls = k * n = 3*k+1 => l = k+1, ls = k-1 * n = 3*k+2 => l = k+1, ls = k */ { mp_limb_t m; /* this is probably unnecessarily strict */ ASSERT (n >= SQR_TOOM3_THRESHOLD); l = ls = n / 3; m = n - l * 3; if (m != 0) ++l; if (m == 1) --ls; l2 = l * 2; l3 = l * 3; l4 = l * 4; l5 = l * 5; A = p; B = ws; C = p + l2; D = ws + l2; E = p + l4; W = ws + l4; } ASSERT (l >= 1); ASSERT (ls >= 1); /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); /** Second stage: pointwise multiplies. **/ TOOM3_SQR_REC(D, C, l, W); tD = cD*cD; if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD); ASSERT (tD < 49); TOOM3_SQR_REC(C, B, l, W); tC = cC*cC; /* TO DO: choose one of the following alternatives. */#if 0 if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);#else if (cC >= 1) { tC += add2Times (C + l, C + l, B, l); if (cC == 2) tC += add2Times (C + l, C + l, B, l); }#endif ASSERT (tC < 9); TOOM3_SQR_REC(B, A, l, W); tB = cB*cB; if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB); ASSERT (tB < 49); TOOM3_SQR_REC(A, a, l, W); TOOM3_SQR_REC(E, a + l2, ls, W); /** Third stage: interpolation. **/ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); /** Final stage: add up the coefficients. **/ tB += mpn_add_n (p + l, p + l, B, l2); tD += mpn_add_n (p + l3, p + l3, D, l2); MPN_INCR_U (p + l3, 2 * n - l3, tB); MPN_INCR_U (p + l4, 2 * n - l4, tC); MPN_INCR_U (p + l5, 2 * n - l5, tD);}voidmpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n){ ASSERT (n >= 1); ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); if (n < MUL_KARATSUBA_THRESHOLD) mpn_mul_basecase (p, a, n, b, n); else if (n < MUL_TOOM3_THRESHOLD) { /* Allocate workspace of fixed size on stack: fast! */#if TUNE_PROGRAM_BUILD mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD_LIMIT-1)];#else mp_limb_t ws[MPN_KARA_MUL_N_TSIZE (MUL_TOOM3_THRESHOLD-1)];#endif mpn_kara_mul_n (p, a, b, n, ws); }#if WANT_FFT || TUNE_PROGRAM_BUILD else if (n < MUL_FFT_THRESHOLD)#else else#endif { /* Use workspace of unknown size in heap, as stack space may * be limited. Since n is at least MUL_TOOM3_THRESHOLD, the * multiplication will take much longer than malloc()/free(). */ mp_limb_t wsLen, *ws; wsLen = MPN_TOOM3_MUL_N_TSIZE (n); ws = __GMP_ALLOCATE_FUNC_LIMBS ((size_t) wsLen); mpn_toom3_mul_n (p, a, b, n, ws); __GMP_FREE_FUNC_LIMBS (ws, (size_t) wsLen); }#if WANT_FFT || TUNE_PROGRAM_BUILD else { mpn_mul_fft_full (p, a, n, b, n); }#endif}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -