📄 t2fv_64.c
字号:
/* * Copyright (c) 2003, 2007-8 Matteo Frigo * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *//* This file was automatically generated --- DO NOT EDIT *//* Generated on Sat Nov 15 20:48:56 EST 2008 */#include "codelet-dft.h"#ifdef HAVE_FMA/* Generated by: ../../../genfft/gen_twiddle_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2fv_64 -include t2f.h *//* * This function contains 519 FP additions, 384 FP multiplications, * (or, 261 additions, 126 multiplications, 258 fused multiply/add), * 187 stack variables, 15 constants, and 128 memory accesses */#include "t2f.h"static void t2fv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms){ DVK(KP773010453, +0.773010453362736960810906609758469800971041293); DVK(KP995184726, +0.995184726672196886244836953109479921575474869); DVK(KP820678790, +0.820678790828660330972281985331011598767386482); DVK(KP098491403, +0.098491403357164253077197521291327432293052451); DVK(KP956940335, +0.956940335732208864935797886980269969482849206); DVK(KP881921264, +0.881921264348355029712756863660388349508442621); DVK(KP303346683, +0.303346683607342391675883946941299872384187453); DVK(KP534511135, +0.534511135950791641089685961295362908582039528); DVK(KP831469612, +0.831469612302545237078788377617905756738560812); DVK(KP980785280, +0.980785280403230449126182236134239036973933731); DVK(KP668178637, +0.668178637919298919997757686523080761552472251); DVK(KP198912367, +0.198912367379658006911597622644676228597850501); DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP414213562, +0.414213562373095048801688724209698078569671875); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; R *x; x = ri; for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(rs)) { V T6L, T6M, T6O, T6P, T75, T6V, T5A, T6A, T72, T6K, T6t, T6D, T6w, T6B, T6h; V T6E; { V Ta, T3U, T3V, T37, T7a, T58, T7B, T6l, T1v, T24, T5Q, T7o, T5F, T7l, T43; V T4F, T2i, T2R, T6b, T7v, T60, T7s, T4a, T4I, T5u, T7h, T5x, T7g, T1i, T3a; V T4j, T4C, T7e, T5l, T7d, T5o, T3b, TV, T4B, T4m, T3X, T3Y, T6o, T7b, T5f; V T7C, Tx, T38, T2p, T61, T2n, T65, T2D, T7p, T5M, T7m, T5T, T4G, T46, T25; V T1S, T2q, T2u, T2w; { V T5q, T10, T5v, T15, T1b, T5s, T1c, T1e; { V T1V, T1p, T5B, T5O, T1u, T1X, T20, T21; { V T1, T2, T7, T5, T32, T34, T2X, T2Z; T1 = LD(&(x[0]), ms, &(x[0])); T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0])); T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0])); T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0])); T32 = LD(&(x[WS(rs, 8)]), ms, &(x[0])); T34 = LD(&(x[WS(rs, 40)]), ms, &(x[0])); T2X = LD(&(x[WS(rs, 56)]), ms, &(x[0])); T2Z = LD(&(x[WS(rs, 24)]), ms, &(x[0])); { V T1m, T54, T6j, T36, T55, T31, T56, T1n, T1q, T1s, T4, T9; { V T3, T8, T6, T33, T35, T2Y, T30, T1l; T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)])); T3 = BYTWJ(&(W[TWVL * 62]), T2); T8 = BYTWJ(&(W[TWVL * 94]), T7); T6 = BYTWJ(&(W[TWVL * 30]), T5); T33 = BYTWJ(&(W[TWVL * 14]), T32); T35 = BYTWJ(&(W[TWVL * 78]), T34); T2Y = BYTWJ(&(W[TWVL * 110]), T2X); T30 = BYTWJ(&(W[TWVL * 46]), T2Z); T1m = BYTWJ(&(W[0]), T1l); T54 = VSUB(T1, T3); T4 = VADD(T1, T3); T6j = VSUB(T6, T8); T9 = VADD(T6, T8); T36 = VADD(T33, T35); T55 = VSUB(T33, T35); T31 = VADD(T2Y, T30); T56 = VSUB(T2Y, T30); T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)])); } T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)])); T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)])); Ta = VSUB(T4, T9); T3U = VADD(T4, T9); { V T57, T6k, T1o, T1r, T1t, T1W, T1U, T1Z; T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)])); T3V = VADD(T36, T31); T37 = VSUB(T31, T36); T57 = VADD(T55, T56); T6k = VSUB(T56, T55); T1o = BYTWJ(&(W[TWVL * 64]), T1n); T1r = BYTWJ(&(W[TWVL * 32]), T1q); T1t = BYTWJ(&(W[TWVL * 96]), T1s); T1V = BYTWJ(&(W[TWVL * 16]), T1U); T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)])); T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)])); T7a = VFNMS(LDK(KP707106781), T57, T54); T58 = VFMA(LDK(KP707106781), T57, T54); T7B = VFMA(LDK(KP707106781), T6k, T6j); T6l = VFNMS(LDK(KP707106781), T6k, T6j); T1p = VADD(T1m, T1o); T5B = VSUB(T1m, T1o); T5O = VSUB(T1r, T1t); T1u = VADD(T1r, T1t); T1X = BYTWJ(&(W[TWVL * 80]), T1W); T20 = BYTWJ(&(W[TWVL * 112]), T1Z); T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)])); } } } { V T5W, T2N, T69, T2L, T5Y, T2P, T48, T2c, T2h; { V T41, T1Y, T5C, T22, T2d, T29, T2b, T2f, T28, T2a, T2H, T2J; T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)])); T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)])); T1v = VSUB(T1p, T1u); T41 = VADD(T1p, T1u); T1Y = VADD(T1V, T1X); T5C = VSUB(T1V, T1X); T22 = BYTWJ(&(W[TWVL * 48]), T21); T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)])); T29 = BYTWJ(&(W[TWVL * 124]), T28); T2b = BYTWJ(&(W[TWVL * 60]), T2a); T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)])); T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)])); T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)])); { V T23, T5D, T2e, T2g, T2I, T2K, T2M; T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)])); T23 = VADD(T20, T22); T5D = VSUB(T20, T22); T2e = BYTWJ(&(W[TWVL * 28]), T2d); T2c = VADD(T29, T2b); T5W = VSUB(T29, T2b); T2g = BYTWJ(&(W[TWVL * 92]), T2f); T2I = BYTWJ(&(W[TWVL * 108]), T2H); T2K = BYTWJ(&(W[TWVL * 44]), T2J); T2N = BYTWJ(&(W[TWVL * 12]), T2M); { V T5E, T5P, T42, T2O; T5E = VADD(T5C, T5D); T5P = VSUB(T5C, T5D); T24 = VSUB(T1Y, T23); T42 = VADD(T1Y, T23); T69 = VSUB(T2g, T2e); T2h = VADD(T2e, T2g); T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)])); T2L = VADD(T2I, T2K); T5Y = VSUB(T2I, T2K); T5Q = VFMA(LDK(KP707106781), T5P, T5O); T7o = VFNMS(LDK(KP707106781), T5P, T5O); T5F = VFMA(LDK(KP707106781), T5E, T5B); T7l = VFNMS(LDK(KP707106781), T5E, T5B); T43 = VADD(T41, T42); T4F = VSUB(T41, T42); T2P = BYTWJ(&(W[TWVL * 76]), T2O); } } } T2i = VSUB(T2c, T2h); T48 = VADD(T2c, T2h); { V TW, TY, T11, T2Q, T5X, T13; TW = LD(&(x[WS(rs, 62)]), ms, &(x[0])); TY = LD(&(x[WS(rs, 30)]), ms, &(x[0])); T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0])); T2Q = VADD(T2N, T2P); T5X = VSUB(T2N, T2P); T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0])); { V T12, T5Z, T6a, T49, T14, T18, T1a; { V T17, T19, TX, TZ; T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0])); T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0])); TX = BYTWJ(&(W[TWVL * 122]), TW); TZ = BYTWJ(&(W[TWVL * 58]), TY); T12 = BYTWJ(&(W[TWVL * 26]), T11); T5Z = VADD(T5X, T5Y); T6a = VSUB(T5Y, T5X); T2R = VSUB(T2L, T2Q); T49 = VADD(T2Q, T2L); T14 = BYTWJ(&(W[TWVL * 90]), T13); T18 = BYTWJ(&(W[TWVL * 106]), T17); T5q = VSUB(TX, TZ); T10 = VADD(TX, TZ); T1a = BYTWJ(&(W[TWVL * 42]), T19); } T6b = VFMA(LDK(KP707106781), T6a, T69); T7v = VFNMS(LDK(KP707106781), T6a, T69); T60 = VFMA(LDK(KP707106781), T5Z, T5W); T7s = VFNMS(LDK(KP707106781), T5Z, T5W); T4a = VADD(T48, T49); T4I = VSUB(T48, T49); T5v = VSUB(T14, T12); T15 = VADD(T12, T14); T1b = VADD(T18, T1a); T5s = VSUB(T18, T1a); } T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0])); T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0])); } } } { V Th, T59, Tf, Tv, T5d, Tj, Tm, To; { V T5h, TQ, T5m, T5i, TO, TS, TJ, T4k, TD, TI; { V T4h, T16, TB, T1d, T1f, TE, TG, TA, Tz, TK, TM, TC; Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0])); T4h = VADD(T10, T15); T16 = VSUB(T10, T15); TB = LD(&(x[WS(rs, 34)]), ms, &(x[0])); T1d = BYTWJ(&(W[TWVL * 10]), T1c); T1f = BYTWJ(&(W[TWVL * 74]), T1e); TE = LD(&(x[WS(rs, 18)]), ms, &(x[0])); TG = LD(&(x[WS(rs, 50)]), ms, &(x[0])); TA = BYTWJ(&(W[TWVL * 2]), Tz); TK = LD(&(x[WS(rs, 10)]), ms, &(x[0])); TM = LD(&(x[WS(rs, 42)]), ms, &(x[0])); TC = BYTWJ(&(W[TWVL * 66]), TB); { V T1g, T5r, TF, TH, TL, TN, TP; TP = LD(&(x[WS(rs, 58)]), ms, &(x[0])); T1g = VADD(T1d, T1f); T5r = VSUB(T1d, T1f); TF = BYTWJ(&(W[TWVL * 34]), TE); TH = BYTWJ(&(W[TWVL * 98]), TG); TL = BYTWJ(&(W[TWVL * 18]), TK); TN = BYTWJ(&(W[TWVL * 82]), TM); T5h = VSUB(TA, TC); TD = VADD(TA, TC); TQ = BYTWJ(&(W[TWVL * 114]), TP); { V T5w, T5t, T4i, T1h, TR; T5w = VSUB(T5s, T5r); T5t = VADD(T5r, T5s); T4i = VADD(T1g, T1b); T1h = VSUB(T1b, T1g); T5m = VSUB(TF, TH); TI = VADD(TF, TH); T5i = VSUB(TL, TN); TO = VADD(TL, TN); TR = LD(&(x[WS(rs, 26)]), ms, &(x[0])); T5u = VFMA(LDK(KP707106781), T5t, T5q); T7h = VFNMS(LDK(KP707106781), T5t, T5q); T5x = VFMA(LDK(KP707106781), T5w, T5v); T7g = VFNMS(LDK(KP707106781), T5w, T5v); T1i = VFNMS(LDK(KP414213562), T1h, T16); T3a = VFMA(LDK(KP414213562), T16, T1h); T4j = VADD(T4h, T4i); T4C = VSUB(T4h, T4i); TS = BYTWJ(&(W[TWVL * 50]), TR); } } } TJ = VSUB(TD, TI); T4k = VADD(TD, TI); { V Tb, Td, Tr, T5j, TT, Tt, Tg; Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0])); Td = LD(&(x[WS(rs, 36)]), ms, &(x[0])); Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0])); T5j = VSUB(TQ, TS); TT = VADD(TQ, TS); Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0])); Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0])); { V Ti, Tc, Te, Ts; Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0])); Tc = BYTWJ(&(W[TWVL * 6]), Tb); Te = BYTWJ(&(W[TWVL * 70]), Td); Ts = BYTWJ(&(W[TWVL * 22]), Tr); { V T5k, T5n, TU, T4l, Tu; T5k = VADD(T5i, T5j); T5n = VSUB(T5i, T5j); TU = VSUB(TO, TT); T4l = VADD(TO, TT); Tu = BYTWJ(&(W[TWVL * 86]), Tt); Th = BYTWJ(&(W[TWVL * 38]), Tg); T59 = VSUB(Tc, Te); Tf = VADD(Tc, Te); T7e = VFNMS(LDK(KP707106781), T5k, T5h); T5l = VFMA(LDK(KP707106781), T5k, T5h); T7d = VFNMS(LDK(KP707106781), T5n, T5m); T5o = VFMA(LDK(KP707106781), T5n, T5m); T3b = VFMA(LDK(KP414213562), TJ, TU); TV = VFNMS(LDK(KP414213562), TU, TJ); T4B = VSUB(T4k, T4l); T4m = VADD(T4k, T4l); Tv = VADD(Ts, Tu); T5d = VSUB(Tu, Ts); Tj = BYTWJ(&(W[TWVL * 102]), Ti); } } Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0])); To = LD(&(x[WS(rs, 28)]), ms, &(x[0])); } } { V T5b, T6m, Tl, T1A, T5G, T1Q, T5K, T1C, T1D, T5e, T6n, Tw, T1H, T1J; { V T1w, T1y, T1M, T1O, Tq, T5c, T1B; T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)])); T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)])); T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)])); T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)])); T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)])); { V Tk, T5a, Tn, Tp; Tk = VADD(Th, Tj); T5a = VSUB(Th, Tj); Tn = BYTWJ(&(W[TWVL * 118]), Tm); Tp = BYTWJ(&(W[TWVL * 54]), To); { V T1x, T1z, T1N, T1P; T1x = BYTWJ(&(W[TWVL * 8]), T1w); T1z = BYTWJ(&(W[TWVL * 72]), T1y); T1N = BYTWJ(&(W[TWVL * 24]), T1M); T1P = BYTWJ(&(W[TWVL * 88]), T1O); T5b = VFNMS(LDK(KP414213562), T5a, T59); T6m = VFMA(LDK(KP414213562), T59, T5a); T3X = VADD(Tf, Tk); Tl = VSUB(Tf, Tk); Tq = VADD(Tn, Tp); T5c = VSUB(Tn, Tp); T1A = VADD(T1x, T1z); T5G = VSUB(T1x, T1z); T1Q = VADD(T1N, T1P); T5K = VSUB(T1N, T1P); T1C = BYTWJ(&(W[TWVL * 40]), T1B); } } T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)])); T5e = VFNMS(LDK(KP414213562), T5d, T5c); T6n = VFMA(LDK(KP414213562), T5c, T5d); T3Y = VADD(Tq, Tv); Tw = VSUB(Tq, Tv); T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)])); T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)])); } { V T1I, T1K, T1F, T5H, T2k, T2l, T2z, T2B, T2j, T1E; T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)])); T1E = BYTWJ(&(W[TWVL * 104]), T1D); T6o = VSUB(T6m, T6n); T7b = VADD(T6m, T6n); T5f = VADD(T5b, T5e); T7C = VSUB(T5e, T5b);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -