📄 hc2cfdft_20.c
字号:
T3V = T36 + T34; T37 = T34 - T36; T51 = T3R + T3T; T3U = T3R - T3T; T38 = T1c + T1m; T1n = T1c - T1m; T5w = T51 - T52; T53 = T51 + T52; T2Q = T1n + T12; T1o = T12 - T1n; T3A = T38 + T37; T39 = T37 - T38; } } } } } } { E T4l, T4m, T4n, T4w, T4u; { E T4L, T2O, T3W, T4K, T4I, T4G, T4S, T4U, T4J, T4z, T4H; { E T4C, T2N, T4R, T1p, T4E, T2q, T4Q; T4L = T4A + T4B; T4C = T4A - T4B; T2N = T2E + T2M; T2O = T2M - T2E; T4R = T1o - TH; T1p = TH + T1o; T4E = T3U - T3V; T3W = T3U + T3V; T2q = T1Y + T2p; T4Q = T2p - T1Y; { E T4y, T4x, T4F, T2r; T4F = T4D - T4E; T4K = T4D + T4E; T4y = T1p - T2q; T2r = T1p + T2q; T4I = FMA(KP618033988, T4C, T4F); T4G = FNMS(KP618033988, T4F, T4C); T4S = FNMS(KP618033988, T4R, T4Q); T4U = FMA(KP618033988, T4Q, T4R); Im[WS(rs, 4)] = KP500000000 * (T2r - T2N); T4x = FMA(KP250000000, T2r, T2N); T4J = T4j - T4k; T4l = T4j + T4k; T4z = FMA(KP559016994, T4y, T4x); T4H = FNMS(KP559016994, T4y, T4x); } } { E T2R, T4s, T4d, T4f, T4t, T2U, T4P, T4T; { E T3X, T4O, T4M, T4c, T4N; T4m = T3P + T3W; T3X = T3P - T3W; Ip[WS(rs, 7)] = KP500000000 * (FMA(KP951056516, T4G, T4z)); Ip[WS(rs, 3)] = KP500000000 * (FNMS(KP951056516, T4G, T4z)); Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP951056516, T4I, T4H))); Im[0] = -(KP500000000 * (FMA(KP951056516, T4I, T4H))); T4O = T4K - T4L; T4M = T4K + T4L; T4c = T44 - T4b; T4n = T44 + T4b; T2R = T2P + T2Q; T4s = T2P - T2Q; Rm[WS(rs, 4)] = KP500000000 * (T4J + T4M); T4N = FNMS(KP250000000, T4M, T4J); T4d = FMA(KP618033988, T4c, T3X); T4f = FNMS(KP618033988, T3X, T4c); T4t = T2S - T2T; T2U = T2S + T2T; T4P = FNMS(KP559016994, T4O, T4N); T4T = FMA(KP559016994, T4O, T4N); } { E T3H, T3G, T2V, T3I, T4e; T2V = T2R + T2U; T3H = T2R - T2U; Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T4S, T4P)); Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T4S, T4P)); Rm[0] = KP500000000 * (FNMS(KP951056516, T4U, T4T)); Rm[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T4U, T4T)); Ip[WS(rs, 5)] = KP500000000 * (T2O + T2V); T3G = FNMS(KP250000000, T2V, 
T2O); T3I = FMA(KP559016994, T3H, T3G); T4e = FNMS(KP559016994, T3H, T3G); T4w = FNMS(KP618033988, T4s, T4t); T4u = FMA(KP618033988, T4t, T4s); Ip[WS(rs, 9)] = KP500000000 * (FMA(KP951056516, T4d, T3I)); Ip[WS(rs, 1)] = KP500000000 * (FNMS(KP951056516, T4d, T3I)); Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP951056516, T4f, T4e))); Im[WS(rs, 2)] = -(KP500000000 * (FMA(KP951056516, T4f, T4e))); } } } { E T3y, T5O, T5Q, T5F, T5K, T5I; { E T5G, T5H, T3x, T4q, T5E, T5C, T3a, T5N, T4p, T5M, T3p, T5y, T5B, T4o; T5G = T5x + T5w; T5y = T5w - T5x; T5B = T5z - T5A; T5H = T5z + T5A; T3y = T3w - T3v; T3x = T3v + T3w; T4q = T4m - T4n; T4o = T4m + T4n; T5E = FMA(KP618033988, T5y, T5B); T5C = FNMS(KP618033988, T5B, T5y); T3a = T32 + T39; T5N = T39 - T32; Rp[WS(rs, 5)] = KP500000000 * (T4l + T4o); T4p = FNMS(KP250000000, T4o, T4l); T5M = T3o - T3h; T3p = T3h + T3o; { E T5u, T5t, T4r, T4v, T3q, T5D, T5v; T4r = FMA(KP559016994, T4q, T4p); T4v = FNMS(KP559016994, T4q, T4p); T5u = T3p - T3a; T3q = T3a + T3p; Rp[WS(rs, 9)] = KP500000000 * (FNMS(KP951056516, T4u, T4r)); Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T4u, T4r)); Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T4w, T4v)); Rm[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T4w, T4v)); Im[WS(rs, 9)] = KP500000000 * (T3q - T3x); T5t = FMA(KP250000000, T3q, T3x); T5O = FNMS(KP618033988, T5N, T5M); T5Q = FMA(KP618033988, T5M, T5N); T5F = T4V - T4W; T4X = T4V + T4W; T5D = FNMS(KP559016994, T5u, T5t); T5v = FMA(KP559016994, T5u, T5t); Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP951056516, T5C, T5v))); Ip[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5C, T5v)); Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T5E, T5D))); Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T5E, T5D)); T5K = T5G - T5H; T5I = T5G + T5H; } } { E T54, T5b, T5s, T5q, T5g, T5h, T3F, T5m, T5o, T5p, T5J, T5l, T5r, T5n; T54 = T50 + T53; T5o = T50 - T53; T5p = T5a - T57; T5b = T57 + T5a; Rm[WS(rs, 9)] = KP500000000 * (T5F + T5I); T5J = FNMS(KP250000000, T5I, T5F); T5s 
= FMA(KP618033988, T5o, T5p); T5q = FNMS(KP618033988, T5p, T5o); { E T5L, T5P, T3B, T3E; T5L = FNMS(KP559016994, T5K, T5J); T5P = FMA(KP559016994, T5K, T5J); T3B = T3z + T3A; T5g = T3z - T3A; T5h = T3C - T3D; T3E = T3C + T3D; Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T5O, T5L)); Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T5O, T5L)); Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP951056516, T5Q, T5P)); Rp[WS(rs, 6)] = KP500000000 * (FMA(KP951056516, T5Q, T5P)); T3F = T3B + T3E; T5m = T3B - T3E; } Ip[0] = KP500000000 * (T3y + T3F); T5l = FNMS(KP250000000, T3F, T3y); T5i = FMA(KP618033988, T5h, T5g); T5k = FNMS(KP618033988, T5g, T5h); T5r = FNMS(KP559016994, T5m, T5l); T5n = FMA(KP559016994, T5m, T5l); Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T5q, T5n))); Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T5q, T5n)); Im[WS(rs, 7)] = -(KP500000000 * (FNMS(KP951056516, T5s, T5r))); Ip[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5s, T5r)); T5e = T54 - T5b; T5c = T54 + T5b; } } } } Rp[0] = KP500000000 * (T4X + T5c); T5d = FNMS(KP250000000, T5c, T4X); T5j = FNMS(KP559016994, T5e, T5d); T5f = FMA(KP559016994, T5e, T5d); Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T5i, T5f)); Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T5i, T5f)); Rm[WS(rs, 7)] = KP500000000 * (FNMS(KP951056516, T5k, T5j)); Rp[WS(rs, 8)] = KP500000000 * (FMA(KP951056516, T5k, T5j)); }}
/* Twiddle-instruction table referenced by the codelet descriptor `desc`: a TW_FULL entry with arguments (1, 20) followed by a TW_NEXT entry with arguments (1, 0). NOTE(review): tw_instr and the TW_* opcodes are declared outside this file; presumably TW_NEXT terminates the table -- confirm against their definition. */
static const tw_instr twinstr[] = { {TW_FULL, 1, 20}, {TW_NEXT, 1, 0}};
/* Descriptor for this codelet. Fields, in order: the value 20, the name string "hc2cfdft_20", the twinstr table, the address of GENUS (defined elsewhere), and the quadruple {176, 78, 110, 0}. NOTE(review): the 20 is presumably the transform size/radix and the quadruple presumably the operation counts (additions, multiplications, fused multiply-adds, other) of this HAVE_FMA variant -- confirm against the hc2c_desc declaration. */
static const hc2c_desc desc = { 20, "hc2cfdft_20", twinstr, &GENUS, {176, 78, 110, 0} };
/* Registration entry point: hands the hc2cfdft_20 kernel and the address of its descriptor to X(khc2c_register) for planner `p`, with the HC2C_VIA_DFT tag. The #else that follows opens the alternative implementation used when HAVE_FMA is not selected. */
void X(codelet_hc2cfdft_20) (planner *p) { X(khc2c_register) (p, hc2cfdft_20, &desc, HC2C_VIA_DFT);}#else /* HAVE_FMA *//* Generated by: ../../../genfft/gen_hc2cdft -compact -variables 4 -pipeline-latency 4 -n 20 -dit -name hc2cfdft_20 -include hc2cf.h *//* * This function contains 286 FP additions, 140 FP multiplications, * (or, 224 additions, 78 multiplications, 62 fused multiply/add), * 98 stack variables, 5 constants, and 80 memory accesses 
*/#include "hc2cf.h"static void hc2cfdft_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms){ DK(KP125000000, +0.125000000000000000000000000000000000000000000); DK(KP500000000, +0.500000000000000000000000000000000000000000000); DK(KP279508497, +0.279508497187473712051146708591409529430077295); DK(KP293892626, +0.293892626146236564584352977319536384298826219); DK(KP475528258, +0.475528258147576786058219666689691071702849317); INT m; for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 38, MAKE_VOLATILE_STRIDE(rs)) { E T12, T2w, T4o, T4V, T2H, T3a, T4y, T4Y, T1z, T2v, T25, T2y, T2s, T2z, T4v; E T4X, T4r, T4U, T3A, T3Z, T2X, T37, T3k, T41, T2M, T39, T3v, T3Y, T2S, T36; E T3p, T42, Td, T4G, T33, T3N, Tw, T4H, T32, T3O; { E T3, T3L, T1x, T2V, Th, Tl, TC, T3g, Tq, Tu, TH, T3h, T7, Tb, T1q; E T2U, TR, T2P, T1F, T3r, T23, T2K, T2f, T3y, T1k, T3m, T2q, T2E, T10, T2Q; E T1K, T3s, T1U, T2J, T2a, T3x, T1b, T3l, T2l, T2D; { E T1, T2, T1s, T1u, T1v, T1w, T1r, T1t; T1 = Ip[0]; T2 = Im[0]; T1s = T1 + T2; T1u = Rp[0]; T1v = Rm[0]; T1w = T1u - T1v; T3 = T1 - T2; T3L = T1u + T1v; T1r = W[0]; T1t = W[1]; T1x = FNMS(T1t, T1w, T1r * T1s); T2V = FMA(T1r, T1w, T1t * T1s); } { E Tf, Tg, Tz, Tj, Tk, TB, Ty, TA; Tf = Ip[WS(rs, 2)]; Tg = Im[WS(rs, 2)]; Tz = Tf - Tg; Tj = Rp[WS(rs, 2)]; Tk = Rm[WS(rs, 2)]; TB = Tj + Tk; Th = Tf + Tg; Tl = Tj - Tk; Ty = W[6]; TA = W[7]; TC = FNMS(TA, TB, Ty * Tz); T3g = FMA(TA, Tz, Ty * TB); } { E To, Tp, TE, Ts, Tt, TG, TD, TF; To = Ip[WS(rs, 7)]; Tp = Im[WS(rs, 7)]; TE = To - Tp; Ts = Rp[WS(rs, 7)]; Tt = Rm[WS(rs, 7)]; TG = Ts + Tt; Tq = To + Tp; Tu = Ts - Tt; TD = W[26]; TF = W[27]; TH = FNMS(TF, TG, TD * TE); T3h = FMA(TF, TE, TD * TG); } { E T5, T6, T1n, T9, Ta, T1p, T1m, T1o; T5 = Ip[WS(rs, 5)]; T6 = Im[WS(rs, 5)]; T1n = T5 + T6; T9 = Rp[WS(rs, 5)]; Ta = Rm[WS(rs, 5)]; T1p = T9 - Ta; T7 = T5 - T6; Tb = T9 + Ta; T1m = W[20]; T1o = W[21]; T1q = FNMS(T1o, T1p, T1m * 
T1n); T2U = FMA(T1m, T1p, T1o * T1n); } { E TM, T1C, TQ, T1E; { E TK, TL, TO, TP; TK = Ip[WS(rs, 4)]; TL = Im[WS(rs, 4)]; TM = TK + TL; T1C = TK - TL; TO = Rp[WS(rs, 4)]; TP = Rm[WS(rs, 4)]; TQ = TO - TP; T1E = TO + TP; } { E TJ, TN, T1B, T1D; TJ = W[16]; TN = W[17]; TR = FNMS(TN, TQ, TJ * TM); T2P = FMA(TN, TM, TJ * TQ); T1B = W[14]; T1D = W[15]; T1F = FNMS(T1D, T1E, T1B * T1C); T3r = FMA(T1D, T1C, T1B * T1E); } } { E T1Y, T2c, T22, T2e; { E T1W, T1X, T20, T21; T1W = Ip[WS(rs, 1)]; T1X = Im[WS(rs, 1)]; T1Y = T1W + T1X; T2c = T1W - T1X; T20 = Rp[WS(rs, 1)]; T21 = Rm[WS(rs, 1)]; T22 = T20 - T21; T2e = T20 + T21; } { E T1V, T1Z, T2b, T2d; T1V = W[4]; T1Z = W[5]; T23 = FNMS(T1Z, T22, T1V * T1Y); T2K = FMA(T1Z, T1Y, T1V * T22); T2b = W[2]; T2d = W[3]; T2f = FNMS(T2d, T2e, T2b * T2c); T3y = FMA(T2d, T2c, T2b * T2e); } } { E T1f, T2n, T1j, T2p; { E T1d, T1e, T1h, T1i; T1d = Ip[WS(rs, 3)]; T1e = Im[WS(rs, 3)]; T1f = T1d - T1e; T2n = T1d + T1e; T1h = Rp[WS(rs, 3)]; T1i = Rm[WS(rs, 3)]; T1j = T1h + T1i; T2p = T1h - T1i; } { E T1c, T1g, T2m, T2o; T1c = W[10]; T1g = W[11]; T1k = FNMS(T1g, T1j, T1c * T1f); T3m = FMA(T1c, T1j, T1g * T1f); T2m = W[12]; T2o = W[13]; T2q = FNMS(T2o, T2p, T2m * T2n); T2E = FMA(T2m, T2p, T2o * T2n); } } { E TV, T1H, TZ, T1J; { E TT, TU, TX, TY; TT = Ip[WS(rs, 9)]; TU = Im[WS(rs, 9)]; TV = TT + TU; T1H = TT - TU; TX = Rp[WS(rs, 9)]; TY = Rm[WS(rs, 9)]; TZ = TX - TY; T1J = TX + TY;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -