📄 hb2_32.c
字号:
E T5F, T5S, T6a, T6g; T5F = FMA(KP980785280, T5E, T5x); T5S = FNMS(KP980785280, T5E, T5x); T6a = T60 * T69; cr[WS(rs, 15)] = FNMS(T66, T69, T64); T6g = T6b * T6f; cr[WS(rs, 31)] = FNMS(T6e, T6f, T6d); { E T5W, T5T, T5Q, T5G; T5W = T5U * T5S; T5T = T5R * T5S; T5Q = T5I * T5F; T5G = T5u * T5F; ci[WS(rs, 15)] = FMA(T66, T63, T6a); ci[WS(rs, 31)] = FMA(T6e, T6c, T6g); ci[WS(rs, 7)] = FMA(T5R, T5V, T5W); cr[WS(rs, 7)] = FNMS(T5U, T5V, T5T); ci[WS(rs, 23)] = FMA(T5u, T5P, T5Q); cr[WS(rs, 23)] = FNMS(T5I, T5P, T5G); } } } } }}static const tw_instr twinstr[] = { /* Twiddle instruction table referenced by desc; this copy sits before the "#else" that is tagged HAVE_FMA. It holds four TW_CEXP entries whose last fields are 1, 3, 9 and 27, closed by one TW_NEXT entry. NOTE(review): presumably these select the four complex twiddle factors consumed per iteration (the hb2_32 variant after the #else reads W[0]..W[7] and steps W by 8) -- confirm against the tw_instr definition. */{TW_CEXP, 1, 1}, {TW_CEXP, 1, 3}, {TW_CEXP, 1, 9}, {TW_CEXP, 1, 27}, {TW_NEXT, 1, 0}};static const hc2hc_desc desc = { /* Descriptor whose address is handed to X(khc2hc_register) in X(codelet_hb2_32): the value 32, the name string "hb2_32", the twinstr table, &GENUS, and the initializer {236, 98, 252, 0}. NOTE(review): 32 is presumably the transform radix and the four numbers are presumably operation counts (adds, muls, fused multiply-adds, other) -- confirm against the hc2hc_desc definition. */32, "hb2_32", twinstr, &GENUS, {236, 98, 252, 0} };void X(codelet_hb2_32) (planner *p) { /* Sole statement of this entry point: passes planner p, the hb2_32 kernel and &desc to X(khc2hc_register). */X(khc2hc_register) (p, hb2_32, &desc);}#else /* HAVE_FMA *//* Generated by: ../../../genfft/gen_hc2hc -compact -variables 4 -pipeline-latency 4 -sign 1 -twiddle-log3 -precompute-twiddles -n 32 -dif -name hb2_32 -include hb.h *//* * This function contains 488 FP additions, 280 FP multiplications, * (or, 376 additions, 168 multiplications, 112 fused multiply/add), * 160 stack variables, 7 constants, and 128 memory accesses */#include "hb.h"static void hb2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms){ DK(KP555570233, +0.555570233019602224742830813948532874374937191); DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP195090322, +0.195090322016128267848284868477022240927691618); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP382683432, +0.382683432365089771728459984030398866761344562); DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(rs)) { E T11, T14, T12, T15, T17, T2z, T2B, T1c, T18, T1d, T1g, T1k, T2F, T2L, T3t; E T4H, T3h, T3V, T3b, T4v, T4T, 
T4X, T6t, T71, T6z, T75, T81, T8x, T8f, T8z; E T2R, T2V, T8p, T8t, T4r, T4t, T53, T69, T3n, T3r, T7P, T7T, T4P, T4R, T6F; E T6R, T1f, T2X, T1j, T2Y, T1l, T31, T2d, T2Z, T49, T4h, T4c, T4i, T4d, T4n; E T4f, T4j; { E T2P, T3q, T2U, T3l, T2Q, T3p, T2T, T3m, T2D, T3g, T2K, T39, T2E, T3f, T2J; E T3a; { E T13, T1b, T16, T1a; T11 = W[0]; T14 = W[1]; T12 = W[2]; T15 = W[3]; T13 = T11 * T12; T1b = T14 * T12; T16 = T14 * T15; T1a = T11 * T15; T17 = T13 + T16; T2z = T13 - T16; T2B = T1a + T1b; T1c = T1a - T1b; T18 = W[4]; T2P = T12 * T18; T3q = T14 * T18; T2U = T15 * T18; T3l = T11 * T18; T1d = W[5]; T2Q = T15 * T1d; T3p = T11 * T1d; T2T = T12 * T1d; T3m = T14 * T1d; T1g = W[6]; T2D = T11 * T1g; T3g = T15 * T1g; T2K = T14 * T1g; T39 = T12 * T1g; T1k = W[7]; T2E = T14 * T1k; T3f = T12 * T1k; T2J = T11 * T1k; T3a = T15 * T1k; } T2F = T2D - T2E; T2L = T2J + T2K; T3t = T39 - T3a; T4H = T2J - T2K; T3h = T3f - T3g; T3V = T3f + T3g; T3b = T39 + T3a; T4v = T2D + T2E; T4T = FMA(T18, T1g, T1d * T1k); T4X = FNMS(T1d, T1g, T18 * T1k); { E T6r, T6s, T6x, T6y; T6r = T17 * T1g; T6s = T1c * T1k; T6t = T6r - T6s; T71 = T6r + T6s; T6x = T17 * T1k; T6y = T1c * T1g; T6z = T6x + T6y; T75 = T6x - T6y; } { E T7Z, T80, T8d, T8e; T7Z = T2z * T1g; T80 = T2B * T1k; T81 = T7Z + T80; T8x = T7Z - T80; T8d = T2z * T1k; T8e = T2B * T1g; T8f = T8d - T8e; T8z = T8d + T8e; T2R = T2P - T2Q; T2V = T2T + T2U; T8p = FMA(T2R, T1g, T2V * T1k); T8t = FNMS(T2V, T1g, T2R * T1k); } T4r = T2P + T2Q; T4t = T2T - T2U; T53 = FMA(T4r, T1g, T4t * T1k); T69 = FNMS(T4t, T1g, T4r * T1k); T3n = T3l + T3m; T3r = T3p - T3q; T7P = FMA(T3n, T1g, T3r * T1k); T7T = FNMS(T3r, T1g, T3n * T1k); T4P = T3l - T3m; T4R = T3p + T3q; T6F = FMA(T4P, T1g, T4R * T1k); T6R = FNMS(T4R, T1g, T4P * T1k); { E T19, T1e, T1h, T1i; T19 = T17 * T18; T1e = T1c * T1d; T1f = T19 + T1e; T2X = T19 - T1e; T1h = T17 * T1d; T1i = T1c * T18; T1j = T1h - T1i; T2Y = T1h + T1i; } T1l = FMA(T1f, T1g, T1j * T1k); T31 = FNMS(T2Y, T1g, T2X * T1k); T2d = FNMS(T1j, T1g, T1f 
* T1k); T2Z = FMA(T2X, T1g, T2Y * T1k); { E T47, T48, T4a, T4b; T47 = T2z * T18; T48 = T2B * T1d; T49 = T47 - T48; T4h = T47 + T48; T4a = T2z * T1d; T4b = T2B * T18; T4c = T4a + T4b; T4i = T4a - T4b; } T4d = FMA(T49, T1g, T4c * T1k); T4n = FNMS(T4i, T1g, T4h * T1k); T4f = FNMS(T4c, T1g, T49 * T1k); T4j = FMA(T4h, T1g, T4i * T1k); } { E T56, T7b, T7C, T6c, Tf, T1m, T6f, T7c, T3Y, T4I, T2t, T32, T5d, T7D, T3w; E T4w, Tu, T2e, T7g, T7F, T7j, T7G, T1B, T33, T3z, T40, T5l, T6i, T5s, T6h; E T3C, T3Z, TK, T1D, T7v, T86, T7y, T85, T1S, T35, T3O, T4C, T5F, T6J, T5M; E T6K, T3R, T4D, TZ, T1U, T7o, T89, T7r, T88, T29, T36, T3H, T4z, T5Y, T6M; E T65, T6N, T3K, T4A; { E T3, T54, T2o, T58, T2r, T5b, T6, T6a, Ta, T57, T2h, T6b, T2k, T55, Td; E T5a; { E T1, T2, T2m, T2n; T1 = cr[0]; T2 = ci[WS(rs, 15)]; T3 = T1 + T2; T54 = T1 - T2; T2m = ci[WS(rs, 27)]; T2n = cr[WS(rs, 20)]; T2o = T2m - T2n; T58 = T2m + T2n; } { E T2p, T2q, T4, T5; T2p = ci[WS(rs, 19)]; T2q = cr[WS(rs, 28)]; T2r = T2p - T2q; T5b = T2p + T2q; T4 = cr[WS(rs, 8)]; T5 = ci[WS(rs, 7)]; T6 = T4 + T5; T6a = T4 - T5; } { E T8, T9, T2f, T2g; T8 = cr[WS(rs, 4)]; T9 = ci[WS(rs, 11)]; Ta = T8 + T9; T57 = T8 - T9; T2f = ci[WS(rs, 31)]; T2g = cr[WS(rs, 16)]; T2h = T2f - T2g; T6b = T2f + T2g; } { E T2i, T2j, Tb, Tc; T2i = ci[WS(rs, 23)]; T2j = cr[WS(rs, 24)]; T2k = T2i - T2j; T55 = T2i + T2j; Tb = ci[WS(rs, 3)]; Tc = cr[WS(rs, 12)]; Td = Tb + Tc; T5a = Tb - Tc; } { E T7, Te, T2l, T2s; T56 = T54 - T55; T7b = T54 + T55; T7C = T6b - T6a; T6c = T6a + T6b; T7 = T3 + T6; Te = Ta + Td; Tf = T7 + Te; T1m = T7 - Te; { E T6d, T6e, T3W, T3X; T6d = T57 + T58; T6e = T5a + T5b; T6f = KP707106781 * (T6d - T6e); T7c = KP707106781 * (T6d + T6e); T3W = T2h - T2k; T3X = Ta - Td; T3Y = T3W - T3X; T4I = T3X + T3W; } T2l = T2h + T2k; T2s = T2o + T2r; T2t = T2l - T2s; T32 = T2l + T2s; { E T59, T5c, T3u, T3v; T59 = T57 - T58; T5c = T5a - T5b; T5d = KP707106781 * (T59 + T5c); T7D = KP707106781 * (T59 - T5c); T3u = T3 - T6; T3v = T2r - T2o; T3w = T3u - 
T3v; T4w = T3u + T3v; } } } { E Ti, T5p, T1w, T5n, T1z, T5q, Tl, T5m, Tp, T5i, T1p, T5g, T1s, T5j, Ts; E T5f; { E Tg, Th, T1u, T1v; Tg = cr[WS(rs, 2)]; Th = ci[WS(rs, 13)]; Ti = Tg + Th; T5p = Tg - Th; T1u = ci[WS(rs, 29)]; T1v = cr[WS(rs, 18)]; T1w = T1u - T1v; T5n = T1u + T1v; } { E T1x, T1y, Tj, Tk; T1x = ci[WS(rs, 21)]; T1y = cr[WS(rs, 26)]; T1z = T1x - T1y; T5q = T1x + T1y; Tj = cr[WS(rs, 10)]; Tk = ci[WS(rs, 5)]; Tl = Tj + Tk; T5m = Tj - Tk; } { E Tn, To, T1n, T1o; Tn = ci[WS(rs, 1)]; To = cr[WS(rs, 14)]; Tp = Tn + To; T5i = Tn - To; T1n = ci[WS(rs, 17)]; T1o = cr[WS(rs, 30)]; T1p = T1n - T1o; T5g = T1n + T1o; } { E T1q, T1r, Tq, Tr; T1q = ci[WS(rs, 25)]; T1r = cr[WS(rs, 22)]; T1s = T1q - T1r; T5j = T1q + T1r; Tq = cr[WS(rs, 6)]; Tr = ci[WS(rs, 9)]; Ts = Tq + Tr; T5f = Tq - Tr; } { E Tm, Tt, T7e, T7f; Tm = Ti + Tl; Tt = Tp + Ts; Tu = Tm + Tt; T2e = Tm - Tt; T7e = T5p + T5q; T7f = T5n - T5m; T7g = FNMS(KP923879532, T7f, KP382683432 * T7e); T7F = FMA(KP382683432, T7f, KP923879532 * T7e); } { E T7h, T7i, T1t, T1A; T7h = T5i + T5j; T7i = T5f + T5g; T7j = FNMS(KP923879532, T7i, KP382683432 * T7h); T7G = FMA(KP382683432, T7i, KP923879532 * T7h); T1t = T1p + T1s; T1A = T1w + T1z; T1B = T1t - T1A; T33 = T1A + T1t; } { E T3x, T3y, T5h, T5k; T3x = T1p - T1s; T3y = Tp - Ts; T3z = T3x - T3y; T40 = T3y + T3x; T5h = T5f - T5g; T5k = T5i - T5j; T5l = FNMS(KP382683432, T5k, KP923879532 * T5h); T6i = FMA(KP382683432, T5h, KP923879532 * T5k); } { E T5o, T5r, T3A, T3B; T5o = T5m + T5n; T5r = T5p - T5q; T5s = FMA(KP923879532, T5o, KP382683432 * T5r); T6h = FNMS(KP382683432, T5o, KP923879532 * T5r); T3A = Ti - Tl; T3B = T1w - T1z; T3C = T3A + T3B; T3Z = T3A - T3B; } } { E Ty, T5v, TB, T5G, T1J, T5w, T1G, T5H, TI, T5K, T1Q, T5D, TF, T5J, T1N; E T5A; { E Tw, Tx, T1E, T1F; Tw = cr[WS(rs, 1)]; Tx = ci[WS(rs, 14)]; Ty = Tw + Tx; T5v = Tw - Tx; { E Tz, TA, T1H, T1I; Tz = cr[WS(rs, 9)]; TA = ci[WS(rs, 6)]; TB = Tz + TA; T5G = Tz - TA; T1H = ci[WS(rs, 22)]; T1I = cr[WS(rs, 25)]; T1J = 
T1H - T1I; T5w = T1H + T1I; } T1E = ci[WS(rs, 30)]; T1F = cr[WS(rs, 17)]; T1G = T1E - T1F; T5H = T1E + T1F; { E TG, TH, T5B, T1O, T1P, T5C; TG = ci[WS(rs, 2)]; TH = cr[WS(rs, 13)]; T5B = TG - TH; T1O = ci[WS(rs, 18)]; T1P = cr[WS(rs, 29)]; T5C = T1O + T1P; TI = TG + TH; T5K = T5B + T5C; T1Q = T1O - T1P; T5D = T5B - T5C; } { E TD, TE, T5y, T1L, T1M, T5z; TD = cr[WS(rs, 5)]; TE = ci[WS(rs, 10)]; T5y = TD - TE; T1L = ci[WS(rs, 26)]; T1M = cr[WS(rs, 21)]; T5z = T1L + T1M; TF = TD + TE; T5J = T5y + T5z; T1N = T1L - T1M; T5A = T5y - T5z; } } { E TC, TJ, T7t, T7u; TC = Ty + TB; TJ = TF + TI; TK = TC + TJ; T1D = TC - TJ; T7t = T5H - T5G; T7u = KP707106781 * (T5A - T5D); T7v = T7t + T7u; T86 = T7t - T7u; } { E T7w, T7x, T1K, T1R; T7w = T5v + T5w; T7x = KP707106781 * (T5J + T5K); T7y = T7w - T7x; T85 = T7w + T7x; T1K = T1G + T1J; T1R = T1N + T1Q; T1S = T1K - T1R; T35 = T1K + T1R; } { E T3M, T3N, T5x, T5E; T3M = T1G - T1J; T3N = TF - TI; T3O = T3M - T3N; T4C = T3N + T3M; T5x = T5v - T5w; T5E = KP707106781 * (T5A + T5D); T5F = T5x - T5E; T6J = T5x + T5E; } { E T5I, T5L, T3P, T3Q; T5I = T5G + T5H; T5L = KP707106781 * (T5J - T5K); T5M = T5I - T5L; T6K = T5I + T5L; T3P = Ty - TB; T3Q = T1Q - T1N; T3R = T3P - T3Q; T4D = T3P + T3Q; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -