📄 hf_32.c
字号:
} } } } cr[WS(rs, 5)] = FMA(KP831469612, T6D, T6A); ci[WS(rs, 10)] = FNMS(KP831469612, T6D, T6A); }}static const tw_instr twinstr[] = { {TW_FULL, 1, 32}, {TW_NEXT, 1, 0}};static const hc2hc_desc desc = { 32, "hf_32", twinstr, &GENUS, {236, 62, 198, 0} };void X(codelet_hf_32) (planner *p) { X(khc2hc_register) (p, hf_32, &desc);}#else /* HAVE_FMA *//* Generated by: ../../../genfft/gen_hc2hc -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hf_32 -include hf.h *//* * This function contains 434 FP additions, 208 FP multiplications, * (or, 340 additions, 114 multiplications, 94 fused multiply/add), * 96 stack variables, 7 constants, and 128 memory accesses */#include "hf.h"static void hf_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms){ DK(KP555570233, +0.555570233019602224742830813948532874374937191); DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP195090322, +0.195090322016128267848284868477022240927691618); DK(KP382683432, +0.382683432365089771728459984030398866761344562); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 62, MAKE_VOLATILE_STRIDE(rs)) { E Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T56, T41; E T59, T2B, T67, T6e, T6O, T4b, T5g, T4s, T5d, TG, T7l, T5I, T73, T3a, T4U; E T3f, T4V, T14, T5K, T5N, T6F, T3m, T4Z, T3r, T4Y, T1r, T5P, T5S, T6E, T3x; E T52, T3C, T51, T2d, T5Z, T64, T6K, T3V, T5a, T44, T57, T2Y, T6f, T6a, T6P; E T4m, T5e, T4v, T5h; { E T1, T76, T6, T75, Tc, T32, Th, T33; T1 = cr[0]; T76 = ci[0]; { E T3, T5, T2, T4; T3 = cr[WS(rs, 16)]; T5 = ci[WS(rs, 16)]; T2 = W[30]; T4 = W[31]; T6 = FMA(T2, T3, T4 * T5); T75 = FNMS(T4, T3, T2 * T5); } { E T9, Tb, T8, Ta; T9 = cr[WS(rs, 8)]; Tb = ci[WS(rs, 8)]; T8 = W[14]; Ta = W[15]; Tc = FMA(T8, T9, Ta * Tb); T32 = FNMS(Ta, T9, T8 * Tb); } { E Te, Tg, Td, Tf; Te = cr[WS(rs, 24)]; Tg = ci[WS(rs, 24)]; Td = W[46]; Tf = W[47]; Th = FMA(Td, Te, Tf * Tg); T33 = FNMS(Tf, Te, Td * Tg); } { E T7, Ti, T7A, T7B; T7 = T1 + T6; Ti = Tc + Th; Tj = T7 + Ti; T5F = T7 - Ti; T7A = Tc - Th; T7B = T76 - T75; T7C = T7A + T7B; T7Q = T7B - T7A; } { E T31, T34, T74, T77; T31 = T1 - T6; T34 = T32 - T33; T35 = T31 + T34; T4T = T31 - T34; T74 = T32 + T33; T77 = T75 + T76; T78 = T74 + T77; T7m = T77 - T74; } } { E T1y, T3X, T1O, T3I, T1D, T3Y, T1J, T3H; { E T1v, T1x, T1u, T1w; T1v = cr[WS(rs, 1)]; T1x = ci[WS(rs, 1)]; T1u = W[0]; T1w = W[1]; T1y = FMA(T1u, T1v, T1w * T1x); T3X = FNMS(T1w, T1v, T1u * T1x); } { E T1L, T1N, T1K, T1M; T1L = cr[WS(rs, 25)]; T1N = ci[WS(rs, 25)]; T1K = W[48]; T1M = W[49]; T1O = FMA(T1K, T1L, T1M * T1N); T3I = FNMS(T1M, T1L, T1K * T1N); } { E T1A, T1C, T1z, T1B; T1A = cr[WS(rs, 17)]; T1C = ci[WS(rs, 17)]; T1z = W[32]; T1B = W[33]; T1D = FMA(T1z, T1A, T1B * T1C); T3Y = FNMS(T1B, T1A, T1z * T1C); } { E T1G, T1I, T1F, T1H; T1G = cr[WS(rs, 9)]; T1I = ci[WS(rs, 9)]; T1F = W[16]; T1H = W[17]; T1J = FMA(T1F, T1G, T1H * T1I); T3H = FNMS(T1H, T1G, T1F * T1I); } { E T1E, T1P, T5W, T5X; T1E = T1y + T1D; T1P = T1J + T1O; T1Q = T1E + T1P; T61 = T1E - T1P; T5W = T3X + T3Y; T5X = T3H + T3I; T5Y = T5W - T5X; T6J = T5W + T5X; } { E T3G, T3J, T3Z, T40; T3G = T1y - T1D; T3J = T3H - T3I; T3K = T3G + T3J; T56 = T3G - T3J; T3Z = T3X - T3Y; T40 = T1J - T1O; T41 = T3Z - T40; T59 = T3Z + T40; } } { E T2j, T47, T2z, T4q, T2o, T48, T2u, T4p; { E T2g, T2i, T2f, T2h; T2g = cr[WS(rs, 31)]; T2i = ci[WS(rs, 31)]; T2f = W[60]; T2h = W[61]; T2j = FMA(T2f, T2g, T2h * T2i); T47 = FNMS(T2h, T2g, T2f * T2i); } { E T2w, T2y, T2v, T2x; T2w = cr[WS(rs, 23)]; T2y = ci[WS(rs, 23)]; T2v = W[44]; T2x = W[45]; T2z = FMA(T2v, T2w, T2x * T2y); T4q = FNMS(T2x, T2w, T2v * T2y); } { E T2l, T2n, T2k, T2m; T2l = cr[WS(rs, 15)]; T2n = ci[WS(rs, 15)]; T2k = W[28]; T2m = W[29]; T2o = FMA(T2k, T2l, T2m * T2n); T48 = FNMS(T2m, T2l, T2k * T2n); } { E T2r, T2t, T2q, T2s; T2r = cr[WS(rs, 7)]; T2t = ci[WS(rs, 7)]; T2q = W[12]; T2s = W[13]; T2u = FMA(T2q, T2r, T2s * T2t); T4p = FNMS(T2s, T2r, T2q * T2t); } { E T2p, T2A, T6c, T6d; T2p = T2j + T2o; T2A = T2u + T2z; T2B = T2p + T2A; T67 = T2p - T2A; T6c = T47 + T48; T6d = T4p + T4q; T6e = T6c - T6d; T6O = T6c + T6d; } { E T49, T4a, T4o, T4r; T49 = T47 - T48; T4a = T2u - T2z; T4b = T49 - T4a; T5g = T49 + T4a; T4o = T2j - T2o; T4r = T4p - T4q; T4s = T4o + T4r; T5d = T4o - T4r; } } { E To, T37, TE, T3d, Tt, T38, Tz, T3c; { E Tl, Tn, Tk, Tm; Tl = cr[WS(rs, 4)]; Tn = ci[WS(rs, 4)]; Tk = W[6]; Tm = W[7]; To = FMA(Tk, Tl, Tm * Tn); T37 = FNMS(Tm, Tl, Tk * Tn); } { E TB, TD, TA, TC; TB = cr[WS(rs, 12)]; TD = ci[WS(rs, 12)]; TA = W[22]; TC = W[23]; TE = FMA(TA, TB, TC * TD); T3d = FNMS(TC, TB, TA * TD); } { E Tq, Ts, Tp, Tr; Tq = cr[WS(rs, 20)]; Ts = ci[WS(rs, 20)]; Tp = W[38]; Tr = W[39]; Tt = FMA(Tp, Tq, Tr * Ts); T38 = FNMS(Tr, Tq, Tp * Ts); } { E Tw, Ty, Tv, Tx; Tw = cr[WS(rs, 28)]; Ty = ci[WS(rs, 28)]; Tv = W[54]; Tx = W[55]; Tz = FMA(Tv, Tw, Tx * Ty); T3c = FNMS(Tx, Tw, Tv * Ty); } { E Tu, TF, T5G, T5H; Tu = To + Tt; TF = Tz + TE; TG = Tu + TF; T7l = Tu - TF; T5G = T3c + T3d; T5H = T37 + T38; T5I = T5G - T5H; T73 = T5H + T5G; } { E T36, T39, T3b, T3e; T36 = To - Tt; T39 = T37 - T38; T3a = T36 + T39; T4U = T36 - T39; T3b = Tz - TE; T3e = T3c - T3d; T3f = T3b - T3e; T4V = T3b + T3e; } } { E TM, T3n, T12, T3k, TR, T3o, TX, T3j; { E TJ, TL, TI, TK; TJ = cr[WS(rs, 2)]; TL = ci[WS(rs, 2)]; TI = W[2]; TK = W[3]; TM = FMA(TI, TJ, TK * TL); T3n = FNMS(TK, TJ, TI * TL); } { E TZ, T11, TY, T10; TZ = cr[WS(rs, 26)]; T11 = ci[WS(rs, 26)]; TY = W[50]; T10 = W[51]; T12 = FMA(TY, TZ, T10 * T11); T3k = FNMS(T10, TZ, TY * T11); } { E TO, TQ, TN, TP; TO = cr[WS(rs, 18)]; TQ = ci[WS(rs, 18)]; TN = W[34]; TP = W[35]; TR = FMA(TN, TO, TP * TQ); T3o = FNMS(TP, TO, TN * TQ); } { E TU, TW, TT, TV; TU = cr[WS(rs, 10)]; TW = ci[WS(rs, 10)]; TT = W[18]; TV = W[19]; TX = FMA(TT, TU, TV * TW); T3j = FNMS(TV, TU, TT * TW); } { E TS, T13, T5L, T5M; TS = TM + TR; T13 = TX + T12; T14 = TS + T13; T5K = TS - T13; T5L = T3n + T3o; T5M = T3j + T3k; T5N = T5L - T5M; T6F = T5L + T5M; } { E T3i, T3l, T3p, T3q; T3i = TM - TR; T3l = T3j - T3k; T3m = T3i + T3l; T4Z = T3i - T3l; T3p = T3n - T3o; T3q = TX - T12; T3r = T3p - T3q; T4Y = T3p + T3q; } } { E T19, T3t, T1p, T3A, T1e, T3u, T1k, T3z; { E T16, T18, T15, T17; T16 = cr[WS(rs, 30)]; T18 = ci[WS(rs, 30)]; T15 = W[58]; T17 = W[59]; T19 = FMA(T15, T16, T17 * T18); T3t = FNMS(T17, T16, T15 * T18); } { E T1m, T1o, T1l, T1n; T1m = cr[WS(rs, 22)]; T1o = ci[WS(rs, 22)]; T1l = W[42]; T1n = W[43]; T1p = FMA(T1l, T1m, T1n * T1o); T3A = FNMS(T1n, T1m, T1l * T1o); } { E T1b, T1d, T1a, T1c; T1b = cr[WS(rs, 14)]; T1d = ci[WS(rs, 14)]; T1a = W[26]; T1c = W[27]; T1e = FMA(T1a, T1b, T1c * T1d); T3u = FNMS(T1c, T1b, T1a * T1d); } { E T1h, T1j, T1g, T1i; T1h = cr[WS(rs, 6)]; T1j = ci[WS(rs, 6)]; T1g = W[10]; T1i = W[11]; T1k = FMA(T1g, T1h, T1i * T1j); T3z = FNMS(T1i, T1h, T1g * T1j); } { E T1f, T1q, T5Q, T5R; T1f = T19 + T1e; T1q = T1k + T1p; T1r = T1f + T1q; T5P = T1f - T1q; T5Q = T3t + T3u; T5R = T3z + T3A; T5S = T5Q - T5R; T6E = T5Q + T5R; } { E T3v, T3w, T3y, T3B; T3v = T3t - T3u; T3w = T1k - T1p; T3x = T3v - T3w; T52 = T3v + T3w; T3y = T19 - T1e; T3B = T3z - T3A; T3C = T3y + T3B; T51 = T3y - T3B; } } { E T1V, T3M, T20, T3N, T3L, T3O, T26, T3Q, T2b, T3R, T3S, T3T; { E T1S, T1U, T1R, T1T; T1S = cr[WS(rs, 5)]; T1U = ci[WS(rs, 5)]; T1R = W[8]; T1T = W[9]; T1V = FMA(T1R, T1S, T1T * T1U); T3M = FNMS(T1T, T1S, T1R * T1U); } { E T1X, T1Z, T1W, T1Y; T1X = cr[WS(rs, 21)]; T1Z = ci[WS(rs, 21)]; T1W = W[40]; T1Y = W[41]; T20 = FMA(T1W, T1X, T1Y * T1Z); T3N = FNMS(T1Y, T1X, T1W * T1Z); } T3L = T1V - T20; T3O = T3M - T3N; { E T23, T25, T22, T24; T23 = cr[WS(rs, 29)]; T25 = ci[WS(rs, 29)]; T22 = W[56]; T24 = W[57]; T26 = FMA(T22, T23, T24 * T25); T3Q = FNMS(T24, T23, T22 * T25); } { E T28, T2a, T27, T29; T28 = cr[WS(rs, 13)]; T2a = ci[WS(rs, 13)]; T27 = W[24]; T29 = W[25];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -