📄 hf2_32.c
字号:
/* * Copyright (c) 2003, 2007-8 Matteo Frigo * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *//* This file was automatically generated --- DO NOT EDIT *//* Generated on Sat Nov 15 20:57:54 EST 2008 */#include "codelet-rdft.h"#ifdef HAVE_FMA/* Generated by: ../../../genfft/gen_hc2hc -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hf2_32 -include hf.h *//* * This function contains 488 FP additions, 350 FP multiplications, * (or, 236 additions, 98 multiplications, 252 fused multiply/add), * 181 stack variables, 7 constants, and 128 memory accesses */#include "hf.h"static void hf2_32(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms){ DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP668178637, +0.668178637919298919997757686523080761552472251); DK(KP198912367, +0.198912367379658006911597622644676228597850501); DK(KP414213562, +0.414213562373095048801688724209698078569671875); DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(rs)) { E T7d, T7a; { E T2, T8, T3, T6, Te, Tr, T18, T4, Ta, Tz, T1n, T10, Ti, T5, Tc; T2 = W[0]; T8 = W[4]; T3 = W[2]; T6 = W[3]; Te = W[6]; Tr = T2 * T8; T18 = T3 * T8; T4 = T2 * T3; Ta = T2 * T6; Tz = T3 * Te; T1n = T8 * Te; T10 = T2 * Te; Ti = W[7]; T5 = W[1]; Tc = W[5]; { E T34, T31, T2X, T2T, Tq, T46, T8H, T98, TH, T97, T4b, T8D, TZ, T7g, T4j; E T6t, T1g, T7f, T4q, T6u, T4z, T6y, T1J, T7j, T7m, T8e, T6x, T4G, T2k, T7o; E T7r, T8d, T6B, T4O, T6A, T4V, T6P, T61, T7G, T3G, T6M, T5E, T8n, T7N, T6I; E T5s, T7v, T2N, T6F, T55, T8i, T7C, T5L, T63, T43, T7O, T5S, T62, T7J, T8o; E T2U, T2R, T2V, T58, T3a, T5h, T2Y, T32, T35; { E T1K, T23, T1N, T26, T2b, T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J; E T2F, Td, TP, T3r, T3n, T2w, T2s, T3Q, T3M, T1Z, T1V, T2g, T2c; { E T11, T1C, TM, Tb, TJ, T7, T1o, T19, T1w, T1F, T15, T1s, T1d, T1z, TW; E TS, Ty, T48, TG, T4a; { E T1, TA, Ts, TE, Tw, Tn, Tj, T8G, Tk, To, T14; T1 = cr[0]; TA = FMA(T6, Ti, Tz); T1K = FNMS(T6, Ti, Tz); T14 = T2 * Ti; { E T1r, TD, T1c, Tv; T1r = T8 * Ti; TD = T3 * Ti; T11 = FNMS(T5, Ti, T10); T1C = FMA(T5, Ti, T10); TM = FMA(T5, T3, Ta); Tb = FNMS(T5, T3, Ta); TJ = FNMS(T5, T6, T4); T7 = FMA(T5, T6, T4); T1o = FMA(Tc, Ti, T1n); T23 = FMA(T6, Tc, T18); T19 = FNMS(T6, Tc, T18); T1w = FNMS(T5, Tc, Tr); Ts = FMA(T5, Tc, Tr); T1c = T3 * Tc; Tv = T2 * Tc; T1F = FNMS(T5, Te, T14); T15 = FMA(T5, Te, T14); T1s = FNMS(Tc, Te, T1r); T1N = FMA(T6, Te, TD); TE = FNMS(T6, Te, TD); { E T1T, T3i, T3e, T1Q; T1T = TJ * Tc; T3i = TJ * Ti; T3e = TJ * Te; T1Q = TJ * T8; { E Tg, T2I, T2E, T9; Tg = T7 * Tc; T2I = T7 * Ti; T2E = T7 * Te; T9 = T7 * T8; { E T3q, T3m, T2v, T2r; T3q = T19 * Ti; T3m = T19 * Te; T2v = T1w * Ti; T2r = T1w * Te; { E T2W, T2S, T3P, T3L; T2W = T23 * Ti; T2S = T23 * Te; T3P = Ts * Ti; T3L = Ts * Te; T26 = FNMS(T6, T8, T1c); T1d = FMA(T6, T8, T1c); T1z = FMA(T5, T8, Tv); Tw = FNMS(T5, T8, Tv); T2b = FNMS(TM, T8, T1T); T1U = FMA(TM, T8, T1T); T3C = FNMS(TM, Te, T3i); T3j = FMA(TM, Te, T3i); T3z = FMA(TM, Ti, T3e); T3f = FNMS(TM, Ti, T3e); T1R = FNMS(TM, Tc, T1Q); T29 = FMA(TM, Tc, T1Q); TR = FNMS(Tb, T8, Tg); Th = FMA(Tb, T8, Tg); T34 = FMA(Tb, Te, T2I); T2J = FNMS(Tb, Te, T2I); T31 = FNMS(Tb, Ti, T2E); T2F = FMA(Tb, Ti, T2E); Td = FNMS(Tb, Tc, T9); TP = FMA(Tb, Tc, T9); T2X = FNMS(T26, Te, T2W); T2T = FMA(T26, Ti, T2S); T3r = FNMS(T1d, Te, T3q); T3n = FMA(T1d, Ti, T3m); T2w = FNMS(T1z, Te, T2v); T2s = FMA(T1z, Ti, T2r); T3Q = FNMS(Tw, Te, T3P); T3M = FMA(Tw, Ti, T3L); { E T1Y, T1S, T2f, T2a; T1Y = T1R * Ti; T1S = T1R * Te; T2f = T29 * Ti; T2a = T29 * Te; { E Tm, Tf, TV, TQ; Tm = Td * Ti; Tf = Td * Te; TV = TP * Ti; TQ = TP * Te; T1Z = FNMS(T1U, Te, T1Y); T1V = FMA(T1U, Ti, T1S); T2g = FNMS(T2b, Te, T2f); T2c = FMA(T2b, Ti, T2a); Tn = FNMS(Th, Te, Tm); Tj = FMA(Th, Ti, Tf); TW = FNMS(TR, Te, TV); TS = FMA(TR, Ti, TQ); T8G = ci[0]; } } } } } } } Tk = cr[WS(rs, 16)]; To = ci[WS(rs, 16)]; { E Tt, Tx, Tu, T47, TB, TF, TC, T49; { E Tl, T8E, Tp, T8F; Tt = cr[WS(rs, 8)]; Tx = ci[WS(rs, 8)]; Tl = Tj * Tk; T8E = Tj * To; Tu = Ts * Tt; T47 = Ts * Tx; Tp = FMA(Tn, To, Tl); T8F = FNMS(Tn, Tk, T8E); TB = cr[WS(rs, 24)]; TF = ci[WS(rs, 24)]; Tq = T1 + Tp; T46 = T1 - Tp; T8H = T8F + T8G; T98 = T8G - T8F; TC = TA * TB; T49 = TA * TF; } Ty = FMA(Tw, Tx, Tu); T48 = FNMS(Tw, Tt, T47); TG = FMA(TE, TF, TC); T4a = FNMS(TE, TB, T49); } } { E TT, TX, TO, T4f, TU, T4g; { E TK, TN, TL, T4e; TK = cr[WS(rs, 4)]; TN = ci[WS(rs, 4)]; TH = Ty + TG; T97 = Ty - TG; T4b = T48 - T4a; T8D = T48 + T4a; TL = TJ * TK; T4e = TJ * TN; TT = cr[WS(rs, 20)]; TX = ci[WS(rs, 20)]; TO = FMA(TM, TN, TL); T4f = FNMS(TM, TK, T4e); TU = TS * TT; T4g = TS * TX; } { E T17, T4m, T1a, T1e, T4d, T4i; { E T12, T16, TY, T4h, T13, T4l; T12 = cr[WS(rs, 28)]; T16 = ci[WS(rs, 28)]; TY = FMA(TW, TX, TU); T4h = FNMS(TW, TT, T4g); T13 = T11 * T12; T4l = T11 * T16; TZ = TO + TY; T4d = TO - TY; T7g = T4f + T4h; T4i = T4f - T4h; T17 = FMA(T15, T16, T13); T4m = FNMS(T15, T12, T4l); } T4j = T4d - T4i; T6t = T4d + T4i; T1a = cr[WS(rs, 12)]; T1e = ci[WS(rs, 12)]; { E T1m, T4u, T1H, T4E, T1x, T1A, T1u, T4w, T1y, T4B; { E T1D, T1G, T1E, T4D; { E T1f, T4o, T4k, T4p; { E T1j, T1l, T1b, T4n, T1k, T4t; T1j = cr[WS(rs, 2)]; T1l = ci[WS(rs, 2)]; T1b = T19 * T1a; T4n = T19 * T1e; T1k = T7 * T1j; T4t = T7 * T1l; T1f = FMA(T1d, T1e, T1b); T4o = FNMS(T1d, T1a, T4n); T1m = FMA(Tb, T1l, T1k); T4u = FNMS(Tb, T1j, T4t); } T1g = T17 + T1f; T4k = T17 - T1f; T7f = T4m + T4o; T4p = T4m - T4o; T1D = cr[WS(rs, 26)]; T1G = ci[WS(rs, 26)]; T4q = T4k + T4p; T6u = T4k - T4p; T1E = T1C * T1D; T4D = T1C * T1G; } { E T1p, T1t, T1q, T4v; T1p = cr[WS(rs, 18)]; T1t = ci[WS(rs, 18)]; T1H = FMA(T1F, T1G, T1E); T4E = FNMS(T1F, T1D, T4D); T1q = T1o * T1p; T4v = T1o * T1t; T1x = cr[WS(rs, 10)]; T1A = ci[WS(rs, 10)]; T1u = FMA(T1s, T1t, T1q); T4w = FNMS(T1s, T1p, T4v); T1y = T1w * T1x; T4B = T1w * T1A; } } { E T4A, T1v, T7k, T4x, T1B, T4C; T4A = T1m - T1u; T1v = T1m + T1u; T7k = T4u + T4w; T4x = T4u - T4w; T1B = FMA(T1z, T1A, T1y); T4C = FNMS(T1z, T1x, T4B); { E T1I, T4y, T4F, T7l; T1I = T1B + T1H; T4y = T1B - T1H; T4F = T4C - T4E; T7l = T4C + T4E; T4z = T4x + T4y; T6y = T4x - T4y; T1J = T1v + T1I; T7j = T1v - T1I; T7m = T7k - T7l; T8e = T7k + T7l; T6x = T4A + T4F; T4G = T4A - T4F; } } } } } } { E T5C, T3u, T5y, T7L, T60, T5V, T3F, T5A, T4P, T4U; { E T1P, T4J, T2i, T4T, T21, T4L, T28, T4R; { E T1L, T1O, T1W, T20; T1L = cr[WS(rs, 30)]; T1O = ci[WS(rs, 30)]; { E T2d, T2h, T1M, T4I, T2e, T4S; T2d = cr[WS(rs, 22)]; T2h = ci[WS(rs, 22)]; T1M = T1K * T1L; T4I = T1K * T1O; T2e = T2c * T2d; T4S = T2c * T2h; T1P = FMA(T1N, T1O, T1M); T4J = FNMS(T1N, T1L, T4I); T2i = FMA(T2g, T2h, T2e); T4T = FNMS(T2g, T2d, T4S); } T1W = cr[WS(rs, 14)]; T20 = ci[WS(rs, 14)]; { E T24, T27, T1X, T4K, T25, T4Q; T24 = cr[WS(rs, 6)]; T27 = ci[WS(rs, 6)]; T1X = T1V * T1W; T4K = T1V * T20; T25 = T23 * T24; T4Q = T23 * T27; T21 = FMA(T1Z, T20, T1X); T4L = FNMS(T1Z, T1W, T4K); T28 = FMA(T26, T27, T25); T4R = FNMS(T26, T24, T4Q); } } { E T22, T7p, T4M, T4N, T2j, T7q; T4P = T1P - T21; T22 = T1P + T21; T7p = T4J + T4L; T4M = T4J - T4L; T4N = T28 - T2i; T2j = T28 + T2i; T7q = T4R + T4T; T4U = T4R - T4T; T2k = T22 + T2j; T7o = T22 - T2j; T7r = T7p - T7q; T8d = T7p + T7q; T6B = T4M - T4N; T4O = T4M + T4N; } } { E T3l, T5X, T3E, T3v, T3t, T3w, T3x, T5Z, T3A, T3B, T3D, T3y, T5z; { E T3g, T3k, T3h, T5W; T3g = cr[WS(rs, 31)]; T3k = ci[WS(rs, 31)]; T3A = cr[WS(rs, 23)]; T6A = T4P + T4U; T4V = T4P - T4U; T3h = T3f * T3g; T5W = T3f * T3k; T3B = T3z * T3A; T3D = ci[WS(rs, 23)]; T3l = FMA(T3j, T3k, T3h); T5X = FNMS(T3j, T3g, T5W); } { E T3o, T5B, T3s, T3p, T5Y; T3o = cr[WS(rs, 15)]; T3E = FMA(T3C, T3D, T3B); T5B = T3z * T3D; T3s = ci[WS(rs, 15)]; T3p = T3n * T3o; T3v = cr[WS(rs, 7)]; T5C = FNMS(T3C, T3A, T5B); T5Y = T3n * T3s; T3t = FMA(T3r, T3s, T3p); T3w = TP * T3v; T3x = ci[WS(rs, 7)]; T5Z = FNMS(T3r, T3o, T5Y); } T3u = T3l + T3t; T5y = T3l - T3t; T3y = FMA(TR, T3x, T3w); T5z = TP * T3x; T7L = T5X + T5Z; T60 = T5X - T5Z; T5V = T3E - T3y; T3F = T3y + T3E; T5A = FNMS(TR, T3v, T5z); } { E T2L, T53, T4Z, T2z, T7A, T5q, T2D, T51; { E T2q, T5n, T2y, T2A, T2C, T5p, T2B, T50; { E T2G, T2K, T2n, T5m, T2t, T5o; { E T2o, T2p, T5D, T7M; T2n = cr[WS(rs, 1)]; T6P = T60 + T5V; T61 = T5V - T60; T7G = T3u - T3F; T3G = T3u + T3F; T5D = T5A - T5C; T7M = T5A + T5C; T2o = T2 * T2n; T2p = ci[WS(rs, 1)]; T6M = T5y + T5D; T5E = T5y - T5D; T8n = T7L + T7M; T7N = T7L - T7M; T5m = T2 * T2p; T2q = FMA(T5, T2p, T2o); } T2G = cr[WS(rs, 25)]; T2K = ci[WS(rs, 25)]; T5n = FNMS(T5, T2n, T5m); { E T2x, T2u, T2H, T52; T2t = cr[WS(rs, 17)]; T2H = T2F * T2G; T52 = T2F * T2K; T2x = ci[WS(rs, 17)]; T2u = T2s * T2t; T2L = FMA(T2J, T2K, T2H); T53 = FNMS(T2J, T2G, T52);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -