📄 hc2cfdft_32.c
字号:
Im[WS(rs, 8)] = -(KP500000000 * (FNMS(KP980785280, Ta0, T9X))); Ip[WS(rs, 7)] = KP500000000 * (FMA(KP980785280, Ta0, T9X)); }}static const tw_instr twinstr[] = { {TW_FULL, 1, 32}, {TW_NEXT, 1, 0}};static const hc2c_desc desc = { 32, "hc2cfdft_32", twinstr, &GENUS, {300, 126, 198, 0} };void X(codelet_hc2cfdft_32) (planner *p) { X(khc2c_register) (p, hc2cfdft_32, &desc, HC2C_VIA_DFT);}#else /* HAVE_FMA *//* Generated by: ../../../genfft/gen_hc2cdft -compact -variables 4 -pipeline-latency 4 -n 32 -dit -name hc2cfdft_32 -include hc2cf.h *//* * This function contains 498 FP additions, 228 FP multiplications, * (or, 404 additions, 134 multiplications, 94 fused multiply/add), * 106 stack variables, 9 constants, and 128 memory accesses */#include "hc2cf.h"static void hc2cfdft_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms){ DK(KP277785116, +0.277785116509801112371415406974266437187468595); DK(KP415734806, +0.415734806151272618539394188808952878369280406); DK(KP097545161, +0.097545161008064133924142434238511120463845809); DK(KP490392640, +0.490392640201615224563091118067119518486966865); DK(KP707106781, +0.707106781186547524400844362104849039284835938); DK(KP191341716, +0.191341716182544885864229992015199433380672281); DK(KP461939766, +0.461939766255643378064091594698394143411208313); DK(KP353553390, +0.353553390593273762200422181052424519642417969); DK(KP500000000, +0.500000000000000000000000000000000000000000000); INT m; for (m = mb, W = W + ((mb - 1) * 62); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 62, MAKE_VOLATILE_STRIDE(rs)) { E T2S, T5K, T52, T5N, T7p, T8r, T7i, T8o, T2q, T7t, T45, T6L, T2d, T7u, T48; E T6M, T1A, T4c, T4f, T1T, T3f, T5M, T7e, T7l, T6J, T7x, T4V, T5J, T7b, T7k; E T6G, T7w, Tj, TC, T5r, T4k, T4n, T5s, T3D, T5C, T6V, T72, T4G, T5F, T6u; E T86, T6S, T71, T6r, T85, TW, T1f, T5v, T4r, T4u, T5u, T40, T5G, T76, T8k; E T4N, T5D, T6B, T89, T6Z, T8h, T6y, T88; { E T1Y, T22, T2L, T4W, T2p, T43, T2A, T50, T27, T2b, T2Q, T4X, T2h, T2l, T2F; E T4Z; { E T1W, T1X, T2K, T20, T21, T2I, T2H, T2J; T1W = Ip[WS(rs, 4)]; T1X = Im[WS(rs, 4)]; T2K = T1W + T1X; T20 = Rp[WS(rs, 4)]; T21 = Rm[WS(rs, 4)]; T2I = T20 - T21; T1Y = T1W - T1X; T22 = T20 + T21; T2H = W[16]; T2J = W[17]; T2L = FMA(T2H, T2I, T2J * T2K); T4W = FNMS(T2J, T2I, T2H * T2K); } { E T2n, T2o, T2z, T2v, T2w, T2x, T2u, T2y; T2n = Ip[0]; T2o = Im[0]; T2z = T2n + T2o; T2v = Rm[0]; T2w = Rp[0]; T2x = T2v - T2w; T2p = T2n - T2o; T43 = T2w + T2v; T2u = W[0]; T2y = W[1]; T2A = FNMS(T2y, T2z, T2u * T2x); T50 = FMA(T2y, T2x, T2u * T2z); } { E T25, T26, T2P, T29, T2a, T2N, T2M, T2O; T25 = Ip[WS(rs, 12)]; T26 = Im[WS(rs, 12)]; T2P = T25 + T26; T29 = Rp[WS(rs, 12)]; T2a = Rm[WS(rs, 12)]; T2N = T29 - T2a; T27 = T25 - T26; T2b = T29 + T2a; T2M = W[48]; T2O = W[49]; T2Q = FMA(T2M, T2N, T2O * T2P); T4X = FNMS(T2O, T2N, T2M * T2P); } { E T2f, T2g, T2E, T2j, T2k, T2C, T2B, T2D; T2f = Ip[WS(rs, 8)]; T2g = Im[WS(rs, 8)]; T2E = T2f + T2g; T2j = Rp[WS(rs, 8)]; T2k = Rm[WS(rs, 8)]; T2C = T2j - T2k; T2h = T2f - T2g; T2l = T2j + T2k; T2B = W[32]; T2D = W[33]; T2F = FMA(T2B, T2C, T2D * T2E); T4Z = FNMS(T2D, T2C, T2B * T2E); } { E T2G, T2R, T7g, T7h; T2G = T2A - T2F; T2R = T2L + T2Q; T2S = T2G - T2R; T5K = T2R + T2G; { E T4Y, T51, T7n, T7o; T4Y = T4W + T4X; T51 = T4Z + T50; T52 = T4Y + T51; T5N = T51 - T4Y; T7n = T2Q - T2L; T7o = T50 - T4Z; T7p = T7n + T7o; T8r = T7o - T7n; } T7g = T2F + T2A; T7h = T4W - T4X; T7i = T7g - T7h; T8o = T7h + T7g; { E T2m, T44, T2e, T2i; T2e = W[30]; T2i = W[31]; T2m = FNMS(T2i, T2l, T2e * T2h); T44 = FMA(T2e, T2l, T2i * T2h); T2q = T2m + T2p; T7t = T43 - T44; T45 = T43 + T44; T6L = T2p - T2m; } { E T23, T46, T2c, T47; { E T1V, T1Z, T24, T28; T1V = W[14]; T1Z = W[15]; T23 = FNMS(T1Z, T22, T1V * T1Y); T46 = FMA(T1V, T22, T1Z * T1Y); T24 = W[46]; T28 = W[47]; T2c = FNMS(T28, T2b, T24 * T27); T47 = FMA(T24, T2b, T28 * T27); } T2d = T23 + T2c; T7u = T23 - T2c; T48 = T46 + T47; T6M = T46 - T47; } } } { E T1q, T4a, T2X, T4P, T1S, T4e, T3d, T4T, T1z, T4b, T32, T4Q, T1J, T4d, T38; E T4S; { E T1l, T2W, T1p, T2U; { E T1j, T1k, T1n, T1o; T1j = Ip[WS(rs, 2)]; T1k = Im[WS(rs, 2)]; T1l = T1j - T1k; T2W = T1j + T1k; T1n = Rp[WS(rs, 2)]; T1o = Rm[WS(rs, 2)]; T1p = T1n + T1o; T2U = T1n - T1o; } { E T1i, T1m, T2T, T2V; T1i = W[6]; T1m = W[7]; T1q = FNMS(T1m, T1p, T1i * T1l); T4a = FMA(T1i, T1p, T1m * T1l); T2T = W[8]; T2V = W[9]; T2X = FMA(T2T, T2U, T2V * T2W); T4P = FNMS(T2V, T2U, T2T * T2W); } } { E T1N, T3c, T1R, T3a; { E T1L, T1M, T1P, T1Q; T1L = Ip[WS(rs, 6)]; T1M = Im[WS(rs, 6)]; T1N = T1L - T1M; T3c = T1L + T1M; T1P = Rp[WS(rs, 6)]; T1Q = Rm[WS(rs, 6)]; T1R = T1P + T1Q; T3a = T1P - T1Q; } { E T1K, T1O, T39, T3b; T1K = W[22]; T1O = W[23]; T1S = FNMS(T1O, T1R, T1K * T1N); T4e = FMA(T1K, T1R, T1O * T1N); T39 = W[24]; T3b = W[25]; T3d = FMA(T39, T3a, T3b * T3c); T4T = FNMS(T3b, T3a, T39 * T3c); } } { E T1u, T31, T1y, T2Z; { E T1s, T1t, T1w, T1x; T1s = Ip[WS(rs, 10)]; T1t = Im[WS(rs, 10)]; T1u = T1s - T1t; T31 = T1s + T1t; T1w = Rp[WS(rs, 10)]; T1x = Rm[WS(rs, 10)]; T1y = T1w + T1x; T2Z = T1w - T1x; } { E T1r, T1v, T2Y, T30; T1r = W[38]; T1v = W[39]; T1z = FNMS(T1v, T1y, T1r * T1u); T4b = FMA(T1r, T1y, T1v * T1u); T2Y = W[40]; T30 = W[41]; T32 = FMA(T2Y, T2Z, T30 * T31); T4Q = FNMS(T30, T2Z, T2Y * T31); } } { E T1E, T37, T1I, T35; { E T1C, T1D, T1G, T1H; T1C = Ip[WS(rs, 14)]; T1D = Im[WS(rs, 14)]; T1E = T1C - T1D; T37 = T1C + T1D; T1G = Rp[WS(rs, 14)]; T1H = Rm[WS(rs, 14)]; T1I = T1G + T1H; T35 = T1G - T1H; } { E T1B, T1F, T34, T36; T1B = W[54]; T1F = W[55]; T1J = FNMS(T1F, T1I, T1B * T1E); T4d = FMA(T1B, T1I, T1F * T1E); T34 = W[56]; T36 = W[57]; T38 = FMA(T34, T35, T36 * T37); T4S = FNMS(T36, T35, T34 * T37); } } { E T33, T3e, T4R, T4U; T1A = T1q + T1z; T4c = T4a + T4b; T4f = T4d + T4e; T1T = T1J + T1S; T33 = T2X + T32; T3e = T38 + T3d; T3f = T33 + T3e; T5M = T3e - T33; { E T7c, T7d, T6H, T6I; T7c = T4S - T4T; T7d = T3d - T38; T7e = T7c + T7d; T7l = T7c - T7d; T6H = T4d - T4e; T6I = T1J - T1S; T6J = T6H + T6I; T7x = T6H - T6I; } T4R = T4P + T4Q; T4U = T4S + T4T; T4V = T4R + T4U; T5J = T4U - T4R; { E T79, T7a, T6E, T6F; T79 = T32 - T2X; T7a = T4P - T4Q; T7b = T79 - T7a; T7k = T7a + T79; T6E = T1q - T1z; T6F = T4a - T4b; T6G = T6E - T6F; T7w = T6F + T6E; } } } { E T9, T4i, T3l, T4A, TB, T4m, T3B, T4E, Ti, T4j, T3q, T4B, Ts, T4l, T3w; E T4D; { E T4, T3k, T8, T3i; { E T2, T3, T6, T7; T2 = Ip[WS(rs, 1)]; T3 = Im[WS(rs, 1)]; T4 = T2 - T3; T3k = T2 + T3; T6 = Rp[WS(rs, 1)]; T7 = Rm[WS(rs, 1)]; T8 = T6 + T7; T3i = T6 - T7; } { E T1, T5, T3h, T3j; T1 = W[2]; T5 = W[3]; T9 = FNMS(T5, T8, T1 * T4); T4i = FMA(T1, T8, T5 * T4); T3h = W[4]; T3j = W[5]; T3l = FMA(T3h, T3i, T3j * T3k); T4A = FNMS(T3j, T3i, T3h * T3k); } } { E Tw, T3A, TA, T3y; { E Tu, Tv, Ty, Tz; Tu = Ip[WS(rs, 13)]; Tv = Im[WS(rs, 13)]; Tw = Tu - Tv; T3A = Tu + Tv; Ty = Rp[WS(rs, 13)]; Tz = Rm[WS(rs, 13)]; TA = Ty + Tz; T3y = Ty - Tz; } { E Tt, Tx, T3x, T3z; Tt = W[50]; Tx = W[51]; TB = FNMS(Tx, TA, Tt * Tw); T4m = FMA(Tt, TA, Tx * Tw); T3x = W[52]; T3z = W[53]; T3B = FMA(T3x, T3y, T3z * T3A); T4E = FNMS(T3z, T3y, T3x * T3A); } } { E Td, T3p, Th, T3n; { E Tb, Tc, Tf, Tg; Tb = Ip[WS(rs, 9)]; Tc = Im[WS(rs, 9)]; Td = Tb - Tc; T3p = Tb + Tc; Tf = Rp[WS(rs, 9)]; Tg = Rm[WS(rs, 9)]; Th = Tf + Tg; T3n = Tf - Tg; } { E Ta, Te, T3m, T3o; Ta = W[34]; Te = W[35]; Ti = FNMS(Te, Th, Ta * Td); T4j = FMA(Ta, Th, Te * Td); T3m = W[36]; T3o = W[37]; T3q = FMA(T3m, T3n, T3o * T3p); T4B = FNMS(T3o, T3n, T3m * T3p); } } { E Tn, T3v, Tr, T3t; { E Tl, Tm, Tp, Tq; Tl = Ip[WS(rs, 5)]; Tm = Im[WS(rs, 5)]; Tn = Tl - Tm; T3v = Tl + Tm; Tp = Rp[WS(rs, 5)]; Tq = Rm[WS(rs, 5)]; Tr = Tp + Tq; T3t = Tp - Tq; } { E Tk, To, T3s, T3u; Tk = W[18]; To = W[19]; Ts = FNMS(To, Tr, Tk * Tn); T4l = FMA(Tk, Tr, To * Tn); T3s = W[20]; T3u = W[21]; T3w = FMA(T3s, T3t, T3u * T3v); T4D = FNMS(T3u, T3t, T3s * T3v); } } Tj = T9 + Ti; TC = Ts + TB; T5r = Tj - TC; T4k = T4i + T4j; T4n = T4l + T4m; T5s = T4k - T4n; { E T3r, T3C, T6T, T6U; T3r = T3l + T3q; T3C = T3w + T3B; T3D = T3r + T3C; T5C = T3C - T3r; T6T = T4E - T4D; T6U = T3w - T3B; T6V = T6T + T6U; T72 = T6T - T6U; } { E T4C, T4F, T6s, T6t; T4C = T4A + T4B; T4F = T4D + T4E; T4G = T4C + T4F; T5F = T4F - T4C; T6s = T4i - T4j; T6t = Ts - TB; T6u = T6s + T6t; T86 = T6s - T6t; } { E T6Q, T6R, T6p, T6q; T6Q = T3q - T3l; T6R = T4A - T4B; T6S = T6Q - T6R; T71 = T6R + T6Q; T6p = T9 - Ti; T6q = T4l - T4m; T6r = T6p - T6q; T85 = T6p + T6q; } } { E TM, T4p, T3I, T4H, T1e, T4t, T3Y, T4L, TV, T4q, T3N, T4I, T15, T4s, T3T; E T4K; { E TH, T3H, TL, T3F; { E TF, TG, TJ, TK; TF = Ip[WS(rs, 15)]; TG = Im[WS(rs, 15)]; TH = TF - TG; T3H = TF + TG; TJ = Rp[WS(rs, 15)]; TK = Rm[WS(rs, 15)]; TL = TJ + TK; T3F = TJ - TK; } { E TE, TI, T3E, T3G; TE = W[58]; TI = W[59]; TM = FNMS(TI, TL, TE * TH); T4p = FMA(TE, TL, TI * TH); T3E = W[60]; T3G = W[61]; T3I = FMA(T3E, T3F, T3G * T3H); T4H = FNMS(T3G, T3F, T3E * T3H); } } { E T19, T3X, T1d, T3V; { E T17, T18, T1b, T1c; T17 = Ip[WS(rs, 11)]; T18 = Im[WS(rs, 11)]; T19 = T17 - T18; T3X = T17 + T18; T1b = Rp[WS(rs, 11)]; T1c = Rm[WS(rs, 11)]; T1d = T1b + T1c; T3V = T1b - T1c; } {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -