📄 t2sv_32.c
字号:
/* * Copyright (c) 2003, 2007-8 Matteo Frigo * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *//* This file was automatically generated --- DO NOT EDIT *//* Generated on Sat Nov 15 20:54:08 EST 2008 */#include "codelet-dft.h"#ifdef HAVE_FMA/* Generated by: ../../../genfft/gen_twiddle -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include ts.h *//* * This function contains 488 FP additions, 350 FP multiplications, * (or, 236 additions, 98 multiplications, 252 fused multiply/add), * 204 stack variables, 7 constants, and 128 memory accesses */#include "ts.h"static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms){ DVK(KP831469612, +0.831469612302545237078788377617905756738560812); DVK(KP980785280, +0.980785280403230449126182236134239036973933731); DVK(KP668178637, +0.668178637919298919997757686523080761552472251); DVK(KP198912367, +0.198912367379658006911597622644676228597850501); DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP414213562, +0.414213562373095048801688724209698078569671875); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(rs)) { V T6H, T74, T6U, T6E, T9r, T9t, T78, T7c, T6W, T6S, T73, T6K, T7a, T72, T9x; V T9z; { V T2, T8, T3, T6, Te, Ti, T5, Tc; T2 = LDW(&(W[0])); T8 = LDW(&(W[TWVL * 4])); T3 = LDW(&(W[TWVL * 2])); T6 = LDW(&(W[TWVL * 3])); Te = LDW(&(W[TWVL * 6])); Ti = LDW(&(W[TWVL * 7])); T5 = LDW(&(W[TWVL * 1])); Tc = LDW(&(W[TWVL * 5])); { V T2X, T2T, T34, T31, Tq, T46, T97, T8H, TH, T98, T4b, T8D, TZ, T7f, T1g; V T7g, T4j, T6t, T4q, T6u, T6x, T4z, T7m, T1J, T4G, T6y, T8d, T7l, T4O, T6A; V T2k, T7o, T6B, T4V, T7r, T8e, T5E, T6P, T3G, T7L, T6M, T61, T8n, T7I, T55; V T6I, T2N, T7A, T5s, T6F, T7x, T8i, T2R, T2U, T57, T3a, T5h, T62, T5L, T7J; V T43, T63, T5S, T8o, T7O, T2V, T2Y, T32, T35; { V T1w, T23, T1K, T1F, T1s, T1N, T26, T1z, T2w, T2s, T3Q, T3M, T3r, T3n, T2b; V T1U, T3C, T3j, T3z, T3f, T1R, T29, TR, Th, T2J, T2F, Td, TP, T1Z, T1V; V T2g, T2c, T1m, T4u, T1D, T1G, T1p, T1t, T1E, T4D, T1x, T1A, T1q, T4v; { V T1, Ts, T19, TJ, T7, TM, Tb, T11, T1C, T1o, TA, T15, TE, T1d, Tw; V T8G, Tk, Tn, Tj, TW, TS, To, Tt, Tx, TB, TF, Tl; { V T1Y, T1S, T2f, T2a; T1 = LD(&(ri[0]), ms, &(ri[0])); { V Tr, T18, T4, Ta; Tr = VMUL(T2, T8); T18 = VMUL(T3, T8); T4 = VMUL(T2, T3); Ta = VMUL(T2, T6); { V T10, T1n, Tz, T14; T10 = VMUL(T2, Te); T1n = VMUL(T8, Te); Tz = VMUL(T3, Te); T14 = VMUL(T2, Ti); { V T1r, TD, T1c, Tv; T1r = VMUL(T8, Ti); TD = VMUL(T3, Ti); T1c = VMUL(T3, Tc); Tv = VMUL(T2, Tc); T1w = VFNMS(T5, Tc, Tr); Ts = VFMA(T5, Tc, Tr); T19 = VFNMS(T6, Tc, T18); T23 = VFMA(T6, Tc, T18); TJ = VFNMS(T5, T6, T4); T7 = VFMA(T5, T6, T4); TM = VFMA(T5, T3, Ta); Tb = VFNMS(T5, T3, Ta); T11 = VFNMS(T5, Ti, T10); T1C = VFMA(T5, Ti, T10); T1o = VFMA(Tc, Ti, T1n); TA = VFMA(T6, Ti, Tz); T1K = VFNMS(T6, Ti, Tz); T1F = VFNMS(T5, Te, T14); T15 = VFMA(T5, Te, T14); T1s = VFNMS(Tc, Te, T1r); T1N = VFMA(T6, Te, TD); TE = VFNMS(T6, Te, TD); T26 = VFNMS(T6, T8, T1c); T1d = VFMA(T6, T8, T1c); T1z = VFMA(T5, T8, Tv); Tw = VFNMS(T5, T8, Tv); { V T2v, T2r, T3P, T3L; T2v = VMUL(T1w, Ti); T2r = VMUL(T1w, Te); T3P = VMUL(Ts, Ti); T3L = VMUL(Ts, Te); { V T3q, T3m, T2W, T2S; T3q = VMUL(T19, Ti); T3m = VMUL(T19, Te); T2W = VMUL(T23, Ti); T2S = VMUL(T23, Te); { V T1T, T3i, T3e, T1Q; T1T = VMUL(TJ, Tc); T3i = VMUL(TJ, Ti); T3e = VMUL(TJ, Te); T1Q = VMUL(TJ, T8); { V Tg, T2I, T2E, T9; Tg = VMUL(T7, Tc); T2I = VMUL(T7, Ti); T2E = VMUL(T7, Te); T9 = VMUL(T7, T8); T2w = VFNMS(T1z, Te, T2v); T2s = VFMA(T1z, Ti, T2r); T3Q = VFNMS(Tw, Te, T3P); T3M = VFMA(Tw, Ti, T3L); T3r = VFNMS(T1d, Te, T3q); T3n = VFMA(T1d, Ti, T3m); T2X = VFNMS(T26, Te, T2W); T2T = VFMA(T26, Ti, T2S); T2b = VFNMS(TM, T8, T1T); T1U = VFMA(TM, T8, T1T); T3C = VFNMS(TM, Te, T3i); T3j = VFMA(TM, Te, T3i); T3z = VFMA(TM, Ti, T3e); T3f = VFNMS(TM, Ti, T3e); T1R = VFNMS(TM, Tc, T1Q); T29 = VFMA(TM, Tc, T1Q); TR = VFNMS(Tb, T8, Tg); Th = VFMA(Tb, T8, Tg); T34 = VFMA(Tb, Te, T2I); T2J = VFNMS(Tb, Te, T2I); T31 = VFNMS(Tb, Ti, T2E); T2F = VFMA(Tb, Ti, T2E); Td = VFNMS(Tb, Tc, T9); TP = VFMA(Tb, Tc, T9); T1Y = VMUL(T1R, Ti); T1S = VMUL(T1R, Te); T2f = VMUL(T29, Ti); T2a = VMUL(T29, Te); T8G = LD(&(ii[0]), ms, &(ii[0])); } } } } } } } Tk = LD(&(ri[WS(rs, 16)]), ms, &(ri[0])); { V Tm, Tf, TV, TQ; Tm = VMUL(Td, Ti); Tf = VMUL(Td, Te); TV = VMUL(TP, Ti); TQ = VMUL(TP, Te); T1Z = VFNMS(T1U, Te, T1Y); T1V = VFMA(T1U, Ti, T1S); T2g = VFNMS(T2b, Te, T2f); T2c = VFMA(T2b, Ti, T2a); Tn = VFNMS(Th, Te, Tm); Tj = VFMA(Th, Ti, Tf); TW = VFNMS(TR, Te, TV); TS = VFMA(TR, Ti, TQ); } To = LD(&(ii[WS(rs, 16)]), ms, &(ii[0])); } Tt = LD(&(ri[WS(rs, 8)]), ms, &(ri[0])); Tx = LD(&(ii[WS(rs, 8)]), ms, &(ii[0])); TB = LD(&(ri[WS(rs, 24)]), ms, &(ri[0])); TF = LD(&(ii[WS(rs, 24)]), ms, &(ii[0])); Tl = VMUL(Tj, Tk); { V TO, T4f, TT, TX; { V Ty, T48, TG, T4a; { V TK, TN, T8E, Tu, T47, TC, T49, Tp, TL, T4e, T8F; TK = LD(&(ri[WS(rs, 4)]), ms, &(ri[0])); TN = LD(&(ii[WS(rs, 4)]), ms, &(ii[0])); T8E = VMUL(Tj, To); Tu = VMUL(Ts, Tt); T47 = VMUL(Ts, Tx); TC = VMUL(TA, TB); T49 = VMUL(TA, TF); Tp = VFMA(Tn, To, Tl); TL = VMUL(TJ, TK); T4e = VMUL(TJ, TN); T8F = VFNMS(Tn, Tk, T8E); Ty = VFMA(Tw, Tx, Tu); T48 = VFNMS(Tw, Tt, T47); TG = VFMA(TE, TF, TC); T4a = VFNMS(TE, TB, T49); Tq = VADD(T1, Tp); T46 = VSUB(T1, Tp); TO = VFMA(TM, TN, TL); T97 = VSUB(T8G, T8F); T8H = VADD(T8F, T8G); T4f = VFNMS(TM, TK, T4e); } TH = VADD(Ty, TG); T98 = VSUB(Ty, TG); T4b = VSUB(T48, T4a); T8D = VADD(T48, T4a); TT = LD(&(ri[WS(rs, 20)]), ms, &(ri[0])); TX = LD(&(ii[WS(rs, 20)]), ms, &(ii[0])); } { V T12, T16, T1a, T1e, T4k, T4p; T12 = LD(&(ri[WS(rs, 28)]), ms, &(ri[0])); T16 = LD(&(ii[WS(rs, 28)]), ms, &(ii[0])); T1a = LD(&(ri[WS(rs, 12)]), ms, &(ri[0])); T1e = LD(&(ii[WS(rs, 12)]), ms, &(ii[0])); { V TY, T4h, T17, T4m, T1f, T4o, T4d, T4i; { V T1j, T1l, TU, T4g, T13, T4l, T1b, T4n, T1k, T4t; T1j = LD(&(ri[WS(rs, 2)]), ms, &(ri[0])); T1l = LD(&(ii[WS(rs, 2)]), ms, &(ii[0])); TU = VMUL(TS, TT); T4g = VMUL(TS, TX); T13 = VMUL(T11, T12); T4l = VMUL(T11, T16); T1b = VMUL(T19, T1a); T4n = VMUL(T19, T1e); T1k = VMUL(T7, T1j); T4t = VMUL(T7, T1l); TY = VFMA(TW, TX, TU); T4h = VFNMS(TW, TT, T4g); T17 = VFMA(T15, T16, T13); T4m = VFNMS(T15, T12, T4l); T1f = VFMA(T1d, T1e, T1b); T4o = VFNMS(T1d, T1a, T4n); T1m = VFMA(Tb, T1l, T1k); T4u = VFNMS(Tb, T1j, T4t); } TZ = VADD(TO, TY); T4d = VSUB(TO, TY); T7f = VADD(T4f, T4h); T4i = VSUB(T4f, T4h); T1g = VADD(T17, T1f); T4k = VSUB(T17, T1f); T7g = VADD(T4m, T4o); T4p = VSUB(T4m, T4o); T1D = LD(&(ri[WS(rs, 26)]), ms, &(ri[0])); T1G = LD(&(ii[WS(rs, 26)]), ms, &(ii[0])); T4j = VADD(T4d, T4i); T6t = VSUB(T4i, T4d); } T1p = LD(&(ri[WS(rs, 18)]), ms, &(ri[0])); T1t = LD(&(ii[WS(rs, 18)]), ms, &(ii[0])); T4q = VSUB(T4k, T4p); T6u = VADD(T4k, T4p); T1E = VMUL(T1C, T1D); T4D = VMUL(T1C, T1G); T1x = LD(&(ri[WS(rs, 10)]), ms, &(ri[0])); T1A = LD(&(ii[WS(rs, 10)]), ms, &(ii[0])); T1q = VMUL(T1o, T1p); T4v = VMUL(T1o, T1t); } } } { V T3l, T5z, T3E, T5Z, T3v, T3x, T3w, T3t, T5B, T5W; { V T1P, T4J, T1W, T20, T2i, T4T, T1X, T4K, T24, T27; { V T2d, T2h, T1v, T4A, T7j, T4x, T2e, T4y, T1I, T4F, T7k, T4S; { V T1L, T1O, T1H, T4E, T1y, T4B, T1u, T4w, T1M, T4I, T1B, T4C; T1L = LD(&(ri[WS(rs, 30)]), ms, &(ri[0])); T1O = LD(&(ii[WS(rs, 30)]), ms, &(ii[0])); T1H = VFMA(T1F, T1G, T1E); T4E = VFNMS(T1F, T1D, T4D); T1y = VMUL(T1w, T1x); T4B = VMUL(T1w, T1A); T1u = VFMA(T1s, T1t, T1q); T4w = VFNMS(T1s, T1p, T4v); T1M = VMUL(T1K, T1L); T4I = VMUL(T1K, T1O); T2d = LD(&(ri[WS(rs, 22)]), ms, &(ri[0])); T2h = LD(&(ii[WS(rs, 22)]), ms, &(ii[0])); T1B = VFMA(T1z, T1A, T1y); T4C = VFNMS(T1z, T1x, T4B); T1v = VADD(T1m, T1u); T4A = VSUB(T1m, T1u); T7j = VADD(T4u, T4w); T4x = VSUB(T4u, T4w); T1P = VFMA(T1N, T1O, T1M); T4J = VFNMS(T1N, T1L, T4I); T2e = VMUL(T2c, T2d); T4y = VSUB(T1B, T1H); T1I = VADD(T1B, T1H); T4F = VSUB(T4C, T4E); T7k = VADD(T4C, T4E); T4S = VMUL(T2c, T2h); } T1W = LD(&(ri[WS(rs, 14)]), ms, &(ri[0])); T20 = LD(&(ii[WS(rs, 14)]), ms, &(ii[0])); T2i = VFMA(T2g, T2h, T2e); T6x = VADD(T4x, T4y); T4z = VSUB(T4x, T4y); T7m = VSUB(T1v, T1I); T1J = VADD(T1v, T1I); T4G = VADD(T4A, T4F); T6y = VSUB(T4A, T4F); T8d = VADD(T7j, T7k); T7l = VSUB(T7j, T7k); T4T = VFNMS(T2g, T2d, T4S); T1X = VMUL(T1V, T1W); T4K = VMUL(T1V, T20); T24 = LD(&(ri[WS(rs, 6)]), ms, &(ri[0])); T27 = LD(&(ii[WS(rs, 6)]), ms, &(ii[0])); } { V T22, T4P, T7p, T4M, T28, T4R, T3g, T3k; T3g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)])); T3k = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)])); { V T3A, T3D, T21, T4L, T25, T4Q, T3h, T5y, T3B, T5Y; T3A = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)])); T3D = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)])); T21 = VFMA(T1Z, T20, T1X); T4L = VFNMS(T1Z, T1W, T4K); T25 = VMUL(T23, T24); T4Q = VMUL(T23, T27); T3h = VMUL(T3f, T3g); T5y = VMUL(T3f, T3k); T3B = VMUL(T3z, T3A); T5Y = VMUL(T3z, T3D); T22 = VADD(T1P, T21); T4P = VSUB(T1P, T21); T7p = VADD(T4J, T4L);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -