📄 hf2_8.c
字号:
/*
 * Copyright (c) 2003, 2006 Matteo Frigo
 * Copyright (c) 2003, 2006 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Fri Jan 27 20:29:35 EST 2006 */

/*
 * Overview (review notes, derived from the code below):
 *
 * The file provides two alternative bodies for the same size-8 codelet
 * `hf2_8`, selected at compile time by HAVE_FMA.  Both were produced by
 * gen_hc2hc with `-n 8 -dit -twiddle-log3`; the HAVE_FMA body was
 * additionally generated with `-fma -reorder-insns -schedule-for-pipeline`.
 * Each variant is followed by its own twiddle table, descriptor and
 * registration function.
 *
 * NOTE(review): E, R, INT, stride, DK, WS, FMA, FNMS, FMS, X, GENUS and
 * MAKE_VOLATILE_STRIDE are not defined in this file; they presumably come
 * from "codelet-rdft.h" / "hf.h".  The comments below assume
 * FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c
 * -- confirm against those headers.
 */

#include "codelet-rdft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_hc2hc -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include hf.h */

/*
 * This function contains 74 FP additions, 50 FP multiplications,
 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
 * 68 stack variables, and 32 memory accesses
 */
/*
 * Generator Id's :
 * $Id: algsimp.ml,v 1.8 2006-01-05 03:04:27 stevenj Exp $
 * $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
 * $Id: gen_hc2hc.ml,v 1.15 2006-01-05 03:04:27 stevenj Exp $
 */

#include "hf.h"

/*
 * hf2_8 (fused multiply-add variant).
 *
 * Parameters:
 *   rio, iio  in/out arrays; every element read is also overwritten.
 *             Elements are addressed as rio[WS(ios, k)] and
 *             iio[-WS(ios, k)] for k = 0..7.
 *   W         coefficient array; W[0]..W[5] are read in each iteration.
 *   ios       stride passed to WS() to locate element k.
 *   m         loop bound: the loop index starts at m - 2, decreases by 2
 *             and the body runs while it is > 0.
 *   dist      per-iteration pointer step: rio advances by +dist,
 *             iio by -dist.
 *
 * Returns W after it has been advanced by 6 for each iteration executed.
 *
 * Per iteration, the input rio[WS(ios, k)] is paired with
 * iio[-WS(ios, 7 - k)]; 16 values are loaded and 16 are stored.
 */
static const R *hf2_8(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
{
     /* Numerically 1/sqrt(2); presumably DK declares the constant KP707106781. */
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     INT i;
     for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 6, MAKE_VOLATILE_STRIDE(ios)) {
          /* Values that must survive until the final store group below. */
          E T1m, TS, T1l, TJ, T1k, Tw, T1w, T1u;
          {
               E T2, T6, T3, Tl, T5, Tr, Tm, Ti, Tb, T7, Tf, TF, TB, Tn, T4;
               E Ta;
               /* Load the stored coefficients; W[5] is loaded further down. */
               T2 = W[2];
               T6 = W[1];
               T3 = W[0];
               Tl = W[4];
               T5 = W[3];
               /*
                * Derive additional coefficient pairs from products of the
                * stored ones instead of reading them from W:
                *   (T7, Tb) = (W2*W0 - W3*W1, W2*W1 + W3*W0)
                *   (Tf, Ti) = (W2*W0 + W3*W1, W2*W1 - W3*W0)
                */
               Ta = T2 * T6;
               T4 = T2 * T3;
               Tr = Tl * T6;
               Tm = Tl * T3;
               Ti = FNMS(T5, T3, Ta);
               Tb = FMA(T5, T3, Ta);
               T7 = FNMS(T5, T6, T4);
               Tf = FMA(T5, T6, T4);
               TF = Tl * Ti;
               TB = Tl * Tf;
               Tn = W[5];
               {
                    E T1, T1s, TC, TG, TY, Tu, TW, Tk, Td, T1r, Tx, TH, Ty, T1d, T14;
                    E Tz, TD, T18;
                    {
                         E Tt, Ts, To, Tp;
                         /* Pair k = 0 is used as loaded, without a coefficient. */
                         T1 = rio[0];
                         T1s = iio[-WS(ios, 7)];
                         Tt = iio[-WS(ios, 1)];
                         /*
                          * Remaining derived pairs, combining (W4, W5) with
                          * (Tf, Ti) and with (W0, W1):
                          *   (TC, TG) is applied to rio[5] / iio[-2],
                          *   (To, Ts) is applied to rio[6] / iio[-1].
                          */
                         TC = FNMS(Tn, Ti, TB);
                         Ts = FNMS(Tn, T3, Tr);
                         TG = FMA(Tn, Tf, TF);
                         To = FMA(Tn, T6, Tm);
                         Tp = rio[WS(ios, 6)];
                         {
                              E T8, T9, Tc, Tj, Tg;
                              Tj = iio[-WS(ios, 5)];
                              Tg = rio[WS(ios, 2)];
                              T8 = rio[WS(ios, 4)];
                              {
                                   E TX, Tq, TV, Th;
                                   TX = Ts * Tp;
                                   Tq = To * Tp;
                                   TV = Ti * Tg;
                                   Th = Tf * Tg;
                                   /* (Tu, TY): coefficient-scaled rio[6] / iio[-1]. */
                                   TY = FMA(To, Tt, TX);
                                   Tu = FNMS(Ts, Tt, Tq);
                                   /* (Tk, TW): coefficient-scaled rio[2] / iio[-5]. */
                                   TW = FMA(Tf, Tj, TV);
                                   Tk = FNMS(Ti, Tj, Th);
                                   T9 = T7 * T8;
                              }
                              Tc = iio[-WS(ios, 3)];
                              {
                                   E TQ, T1b, TO, TN, T1a, TR, T1c, TP;
                                   {
                                        E TK, TM, TL, T19, T1q;
                                        TK = rio[WS(ios, 7)];
                                        TQ = iio[-WS(ios, 4)];
                                        TM = iio[0];
                                        /* (Td, T1r): coefficient-scaled rio[4] / iio[-3]. */
                                        Td = FMA(Tb, Tc, T9);
                                        T1q = T7 * Tc;
                                        TL = Tl * TK;
                                        T1b = T2 * TQ;
                                        T19 = Tl * TM;
                                        T1r = FNMS(Tb, T8, T1q);
                                        TO = rio[WS(ios, 3)];
                                        /* (TN, T1a): coefficient-scaled rio[7] / iio[0]. */
                                        TN = FMA(Tn, TM, TL);
                                        T1a = FNMS(Tn, TK, T19);
                                   }
                                   Tx = rio[WS(ios, 1)];
                                   /* (TR, T1c): coefficient-scaled rio[3] / iio[-4]. */
                                   T1c = FNMS(T5, TO, T1b);
                                   TP = T2 * TO;
                                   TH = iio[-WS(ios, 2)];
                                   Ty = T3 * Tx;
                                   T1m = T1a + T1c;
                                   T1d = T1a - T1c;
                                   TR = FMA(T5, TQ, TP);
                                   T14 = TC * TH;
                                   Tz = iio[-WS(ios, 6)];
                                   TD = rio[WS(ios, 5)];
                                   /* Sum / difference of the k = 7 and k = 3 terms. */
                                   T18 = TN - TR;
                                   TS = TN + TR;
                              }
                         }
                    }
                    {
                         E Te, T1p, T1t, Tv;
                         {
                              E T1g, T10, T1z, T1B, T1A, T1j, T1C, T1f;
                              {
                                   E T1x, T16, T11, T1y;
                                   {
                                        E TU, TA, T15, TE, TZ, T13, T12, TI;
                                        /* Sum / difference of the k = 0 and k = 4 terms. */
                                        Te = T1 + Td;
                                        TU = T1 - Td;
                                        /* (TA, T13): coefficient-scaled rio[1] / iio[-6]. */
                                        TA = FMA(T6, Tz, Ty);
                                        T12 = T3 * Tz;
                                        /* (TI, T15): coefficient-scaled rio[5] / iio[-2]. */
                                        T15 = FNMS(TG, TD, T14);
                                        TE = TC * TD;
                                        TZ = TW - TY;
                                        T1p = TW + TY;
                                        T13 = FNMS(T6, Tx, T12);
                                        T1x = T1s - T1r;
                                        T1t = T1r + T1s;
                                        TI = FMA(TG, TH, TE);
                                        T1g = TU - TZ;
                                        T10 = TU + TZ;
                                        T16 = T13 - T15;
                                        T1l = T13 + T15;
                                        TJ = TA + TI;
                                        T11 = TA - TI;
                                        T1y = Tk - Tu;
                                        Tv = Tk + Tu;
                                   }
                                   {
                                        E T1i, T1e, T17, T1h;
                                        /* Operands of the KP707106781-scaled terms. */
                                        T1i = T18 + T1d;
                                        T1e = T18 - T1d;
                                        T17 = T11 + T16;
                                        T1h = T16 - T11;
                                        T1z = T1x - T1y;
                                        T1B = T1y + T1x;
                                        T1A = T1h + T1i;
                                        T1j = T1h - T1i;
                                        T1C = T1e - T17;
                                        T1f = T17 + T1e;
                                   }
                              }
                              /* First store group: outputs that carry the KP707106781 factor. */
                              iio[-WS(ios, 7)] = FNMS(KP707106781, T1j, T1g);
                              rio[WS(ios, 7)] = FMS(KP707106781, T1C, T1B);
                              rio[WS(ios, 1)] = FMA(KP707106781, T1f, T10);
                              iio[-WS(ios, 5)] = FNMS(KP707106781, T1f, T10);
                              iio[-WS(ios, 1)] = FMA(KP707106781, T1A, T1z);
                              rio[WS(ios, 5)] = FMS(KP707106781, T1A, T1z);
                              rio[WS(ios, 3)] = FMA(KP707106781, T1j, T1g);
                              iio[-WS(ios, 3)] = FMA(KP707106781, T1C, T1B);
                         }
                         T1k = Te - Tv;
                         Tw = Te + Tv;
                         T1w = T1t - T1p;
                         T1u = T1p + T1t;
                    }
               }
          }
          {
               E TT, T1v, T1n, T1o;
               TT = TJ + TS;
               T1v = TS - TJ;
               T1n = T1l - T1m;
               T1o = T1l + T1m;
               /* Second store group: outputs formed by plain sums / differences. */
               iio[-WS(ios, 2)] = T1v + T1w;
               rio[WS(ios, 6)] = T1v - T1w;
               rio[0] = Tw + TT;
               iio[-WS(ios, 4)] = Tw - TT;
               iio[0] = T1o + T1u;
               rio[WS(ios, 4)] = T1o - T1u;
               rio[WS(ios, 2)] = T1k + T1n;
               iio[-WS(ios, 6)] = T1k - T1n;
          }
     }
     return W;
}

/*
 * Twiddle instruction table for this codelet: three TW_CEXP entries with
 * last fields 1, 3 and 7, closed by a TW_NEXT entry.
 * NOTE(review): presumably this requests three complex exponentials, which
 * would match the six W values consumed per loop iteration -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
     {TW_CEXP, 0, 1},
     {TW_CEXP, 0, 3},
     {TW_CEXP, 0, 7},
     {TW_NEXT, 1, 0}
};

/*
 * Codelet descriptor.  The leading 8 and the name match the generator
 * options (-n 8 -name hf2_8); {44, 20, 30, 0} matches the add / mul / fma
 * counts quoted in the generator comment above.  The meaning of the
 * trailing zeros is defined by hc2hc_desc, which is not visible here.
 */
static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, {44, 20, 30, 0}, 0, 0, 0 };

/* Hands hf2_8 and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hf2_8) (planner *p) {
     X(khc2hc_register) (p, hf2_8, &desc);
}

#else				/* HAVE_FMA */

/* Generated by: ../../../genfft/gen_hc2hc -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include hf.h */

/*
 * This function contains 74 FP additions, 44 FP multiplications,
 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
 * 42 stack variables, and 32 memory accesses
 */
/*
 * Generator Id's :
 * $Id: algsimp.ml,v 1.8 2006-01-05 03:04:27 stevenj Exp $
 * $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
 * $Id: gen_hc2hc.ml,v 1.15 2006-01-05 03:04:27 stevenj Exp $
 */

#include "hf.h"

/*
 * hf2_8 (variant generated without -fma).
 *
 * Same interface as the HAVE_FMA variant: rio / iio are updated in place
 * at rio[WS(ios, k)] and iio[-WS(ios, k)] for k = 0..7, W[0]..W[5] are read
 * in each iteration, the loop index runs from m - 2 down in steps of 2
 * while > 0, rio / iio move by +dist / -dist and W by +6 per iteration,
 * and the advanced W is returned.
 *
 * Every non-trivial input pair (re, im) = (rio[WS(ios, k)],
 * iio[-WS(ios, 7 - k)]) is combined with a coefficient pair (a, b) as
 *   re' = a*re + b*im,   im' = a*im - b*re
 * (under the FMA / FNMS reading stated at the top of the file).
 */
static const R *hf2_8(R *rio, R *iio, const R *W, stride ios, INT m, INT dist)
{
     /* Numerically 1/sqrt(2); presumably DK declares the constant KP707106781. */
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     INT i;
     for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 6, MAKE_VOLATILE_STRIDE(ios)) {
          /* Stored and derived coefficients, shared by all stages below. */
          E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
          {
               E T4, Tb, T7, Ta;
               T2 = W[2];
               T5 = W[3];
               T3 = W[0];
               T6 = W[1];
               T4 = T2 * T3;
               Tb = T5 * T3;
               T7 = T5 * T6;
               Ta = T2 * T6;
               /*
                * Pairs derived from (W0, W1) and (W2, W3):
                *   (T8, Tc) = (W2*W0 - W3*W1, W2*W1 + W3*W0)   used for k = 4
                *   (Tg, Ti) = (W2*W0 + W3*W1, W2*W1 - W3*W0)   used for k = 2
                */
               T8 = T4 - T7;
               Tc = Ta + Tb;
               Tg = T4 + T7;
               Ti = Ta - Tb;
               Tl = W[4];
               Tm = W[5];
               /*
                * Pairs derived from (W4, W5):
                *   (Tn, Tp) with (W0, W1)   used for k = 6
                *   (Tx, Tz) with (Tg, Ti)   used for k = 5
                */
               Tn = FMA(Tl, T3, Tm * T6);
               Tz = FMA(Tl, Ti, Tm * Tg);
               Tp = FNMS(Tm, T3, Tl * T6);
               Tx = FNMS(Tm, Ti, Tl * Tg);
          }
          {
               E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
               E TT;
               {
                    /* k = 0 (used as loaded) combined with scaled k = 4. */
                    E T1, T1c, Te, T1b, T9, Td;
                    T1 = rio[0];
                    T1c = iio[-WS(ios, 7)];
                    T9 = rio[WS(ios, 4)];
                    Td = iio[-WS(ios, 3)];
                    Te = FMA(T8, T9, Tc * Td);
                    T1b = FNMS(Tc, T9, T8 * Td);
                    Tf = T1 + Te;
                    T1i = T1c - T1b;
                    TL = T1 - Te;
                    T1d = T1b + T1c;
               }
               {
                    /* Scaled k = 7 combined with scaled k = 3. */
                    E TF, TW, TI, TX;
                    {
                         E TD, TE, TG, TH;
                         TD = rio[WS(ios, 7)];
                         TE = iio[0];
                         TF = FMA(Tl, TD, Tm * TE);
                         TW = FNMS(Tm, TD, Tl * TE);
                         TG = rio[WS(ios, 3)];
                         TH = iio[-WS(ios, 4)];
                         TI = FMA(T2, TG, T5 * TH);
                         TX = FNMS(T5, TG, T2 * TH);
                    }
                    TJ = TF + TI;
                    T17 = TW + TX;
                    TV = TF - TI;
                    TY = TW - TX;
               }
               {
                    /* Scaled k = 2 combined with scaled k = 6. */
                    E Tk, TM, Tr, TN;
                    {
                         E Th, Tj, To, Tq;
                         Th = rio[WS(ios, 2)];
                         Tj = iio[-WS(ios, 5)];
                         Tk = FNMS(Ti, Tj, Tg * Th);
                         TM = FMA(Ti, Th, Tg * Tj);
                         To = rio[WS(ios, 6)];
                         Tq = iio[-WS(ios, 1)];
                         Tr = FNMS(Tp, Tq, Tn * To);
                         TN = FMA(Tp, To, Tn * Tq);
                    }
                    Ts = Tk + Tr;
                    T1j = Tk - Tr;
                    TO = TM - TN;
                    T1a = TM + TN;
               }
               {
                    /* Scaled k = 1 combined with scaled k = 5. */
                    E Tw, TR, TB, TS;
                    {
                         E Tu, Tv, Ty, TA;
                         Tu = rio[WS(ios, 1)];
                         Tv = iio[-WS(ios, 6)];
                         Tw = FMA(T3, Tu, T6 * Tv);
                         TR = FNMS(T6, Tu, T3 * Tv);
                         Ty = rio[WS(ios, 5)];
                         TA = iio[-WS(ios, 2)];
                         TB = FMA(Tx, Ty, Tz * TA);
                         TS = FNMS(Tz, Ty, Tx * TA);
                    }
                    TC = Tw + TB;
                    T16 = TR + TS;
                    TQ = Tw - TB;
                    TT = TR - TS;
               }
               {
                    /* Final combination and stores; all loads are complete by now. */
                    E Tt, TK, T1f, T1g;
                    Tt = Tf + Ts;
                    TK = TC + TJ;
                    iio[-WS(ios, 4)] = Tt - TK;
                    rio[0] = Tt + TK;
                    {
                         E T19, T1e, T15, T18;
                         T19 = T16 + T17;
                         T1e = T1a + T1d;
                         rio[WS(ios, 4)] = T19 - T1e;
                         iio[0] = T19 + T1e;
                         T15 = Tf - Ts;
                         T18 = T16 - T17;
                         iio[-WS(ios, 6)] = T15 - T18;
                         rio[WS(ios, 2)] = T15 + T18;
                    }
                    T1f = TJ - TC;
                    T1g = T1d - T1a;
                    rio[WS(ios, 6)] = T1f - T1g;
                    iio[-WS(ios, 2)] = T1f + T1g;
                    {
                         /* Outputs 3 / 5 and their iio partners carry the KP707106781 factor. */
                         E T11, T1k, T14, T1h, T12, T13;
                         T11 = TL - TO;
                         T1k = T1i - T1j;
                         T12 = TT - TQ;
                         T13 = TV + TY;
                         T14 = KP707106781 * (T12 - T13);
                         T1h = KP707106781 * (T12 + T13);
                         iio[-WS(ios, 7)] = T11 - T14;
                         iio[-WS(ios, 1)] = T1h + T1k;
                         rio[WS(ios, 3)] = T11 + T14;
                         rio[WS(ios, 5)] = T1h - T1k;
                    }
                    {
                         /* Outputs 1 / 7 and their iio partners carry the KP707106781 factor. */
                         E TP, T1m, T10, T1l, TU, TZ;
                         TP = TL + TO;
                         T1m = T1j + T1i;
                         TU = TQ + TT;
                         TZ = TV - TY;
                         T10 = KP707106781 * (TU + TZ);
                         T1l = KP707106781 * (TZ - TU);
                         iio[-WS(ios, 5)] = TP - T10;
                         iio[-WS(ios, 3)] = T1l + T1m;
                         rio[WS(ios, 1)] = TP + T10;
                         rio[WS(ios, 7)] = T1l - T1m;
                    }
               }
          }
     }
     return W;
}

/*
 * Twiddle instruction table for this codelet: three TW_CEXP entries with
 * last fields 1, 3 and 7, closed by a TW_NEXT entry.
 * NOTE(review): presumably this requests three complex exponentials, which
 * would match the six W values consumed per loop iteration -- confirm
 * against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
     {TW_CEXP, 0, 1},
     {TW_CEXP, 0, 3},
     {TW_CEXP, 0, 7},
     {TW_NEXT, 1, 0}
};

/*
 * Codelet descriptor.  The leading 8 and the name match the generator
 * options (-n 8 -name hf2_8); {56, 26, 18, 0} matches the add / mul / fma
 * counts quoted in the generator comment above.  The meaning of the
 * trailing zeros is defined by hc2hc_desc, which is not visible here.
 */
static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, {56, 26, 18, 0}, 0, 0, 0 };

/* Hands hf2_8 and its descriptor to X(khc2hc_register) for planner p. */
void X(codelet_hf2_8) (planner *p) {
     X(khc2hc_register) (p, hf2_8, &desc);
}

#endif				/* HAVE_FMA */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -