📄 r2cf_64.c
字号:
/* * Copyright (c) 2003, 2007-8 Matteo Frigo * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *//* This file was automatically generated --- DO NOT EDIT *//* Generated on Sat Nov 15 20:56:00 EST 2008 */#include "codelet-rdft.h"#ifdef HAVE_FMA/* Generated by: ../../../genfft/gen_r2cf -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 64 -name r2cf_64 -include r2cf.h *//* * This function contains 394 FP additions, 196 FP multiplications, * (or, 198 additions, 0 multiplications, 196 fused multiply/add), * 133 stack variables, 15 constants, and 128 memory accesses */#include "r2cf.h"static void r2cf_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs){ DK(KP773010453, +0.773010453362736960810906609758469800971041293); DK(KP995184726, +0.995184726672196886244836953109479921575474869); DK(KP098491403, +0.098491403357164253077197521291327432293052451); DK(KP820678790, +0.820678790828660330972281985331011598767386482); DK(KP956940335, +0.956940335732208864935797886980269969482849206); DK(KP881921264, +0.881921264348355029712756863660388349508442621); DK(KP534511135, +0.534511135950791641089685961295362908582039528); DK(KP303346683, +0.303346683607342391675883946941299872384187453); DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP198912367, +0.198912367379658006911597622644676228597850501); DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP668178637, +0.668178637919298919997757686523080761552472251); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP414213562, +0.414213562373095048801688724209698078569671875); DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) { E T5n, T5o; { E T11, T2j, T4P, T5P, T3D, T5p, T3d, Tf, T1k, T1H, T5D, T4l, T5A, T4a, T3i; E T2U, T1R, T2e, T5K, T4G, T5H, T4v, T3l, T31, T5s, T42, T5t, T3Z, T2n, T1b; E T3f, TZ, T5v, T3T, T5w, T3Q, T2m, T18, T3e, TK, T3K, T5Q, T4S, T5q, T14; E T2k, T3p, Tu, T4w, T1U, T5E, T4h, T5B, T4o, T3j, T2X, T1I, T1z, T1Z, T4A; E T24, T4x, T1X, T20; { E TN, T3V, TS, TX, T3X, TQ, T40, TT; { E T1g, T46, T1B, T1G, T47, T1j, T4j, T1C; { E T4, T3z, T3, T3B, Td, T5, T8, T9; { E T1, T2, Tb, Tc; T1 = R0[0]; T2 = R0[WS(rs, 16)]; Tb = R0[WS(rs, 28)]; Tc = R0[WS(rs, 12)]; T4 = R0[WS(rs, 8)]; T3z = T1 - T2; T3 = T1 + T2; T3B = Tb - Tc; Td = Tb + Tc; T5 = R0[WS(rs, 24)]; T8 = R0[WS(rs, 4)]; T9 = R0[WS(rs, 20)]; } { E T1E, T1F, T1h, T1i; { E T1e, T4N, T6, T3A, Ta, T1f; T1e = R1[0]; T4N = T4 - T5; T6 = T4 + T5; T3A = T8 - T9; Ta = T8 + T9; T1f = R1[WS(rs, 16)]; { E T7, T3C, T4O, Te; T11 = T3 - T6; T7 = T3 + T6; T3C = T3A + T3B; T4O = T3B - T3A; T2j = Td - Ta; Te = Ta + Td; T4P = FNMS(KP707106781, T4O, T4N); T5P = FMA(KP707106781, T4O, T4N); T3D = FMA(KP707106781, T3C, T3z); T5p = FNMS(KP707106781, T3C, T3z); T3d = T7 - Te; Tf = T7 + Te; T1g = T1e + T1f; T46 = T1e - T1f; } } T1E = R1[WS(rs, 4)]; T1F = R1[WS(rs, 20)]; T1h = R1[WS(rs, 8)]; T1i = R1[WS(rs, 24)]; T1B = R1[WS(rs, 28)]; T1G = T1E + T1F; T47 = T1E - T1F; T1j = T1h + T1i; T4j = T1h - T1i; T1C = R1[WS(rs, 12)]; } } { E T1N, T4r, T28, T2d, T4s, T1Q, T4E, T29; { E T2b, T2c, T1O, T1P; { E T2S, T48, T1D, T1L, T1M, T4k, T49, T2T; T1L = R1[WS(rs, 31)]; T1M = R1[WS(rs, 15)]; T2S = T1g + T1j; T1k = T1g - T1j; T48 = T1B - T1C; T1D = T1B + T1C; T1N = T1L + T1M; T4r = T1L - T1M; T4k = T47 - T48; T49 = T47 + T48; T2T = T1G + T1D; T1H = T1D - T1G; T5D = FNMS(KP707106781, T4k, T4j); T4l = FMA(KP707106781, T4k, T4j); T5A = FNMS(KP707106781, T49, T46); T4a = FMA(KP707106781, T49, T46); T3i = T2S - T2T; T2U = T2S + T2T; T2b = R1[WS(rs, 3)]; T2c = R1[WS(rs, 19)]; } T1O = R1[WS(rs, 7)]; T1P = R1[WS(rs, 23)]; T28 = R1[WS(rs, 27)]; T2d = T2b + T2c; T4s = T2b - T2c; T1Q = T1O + T1P; T4E = T1P - T1O; T29 = R1[WS(rs, 11)]; } { E TV, TW, TO, TP; { E T2Z, T4t, T2a, TL, TM, T4F, T4u, T30; TL = R0[WS(rs, 31)]; TM = R0[WS(rs, 15)]; T2Z = T1N + T1Q; T1R = T1N - T1Q; T4t = T28 - T29; T2a = T28 + T29; TN = TL + TM; T3V = TL - TM; T4F = T4t - T4s; T4u = T4s + T4t; T30 = T2d + T2a; T2e = T2a - T2d; T5K = FNMS(KP707106781, T4F, T4E); T4G = FMA(KP707106781, T4F, T4E); T5H = FNMS(KP707106781, T4u, T4r); T4v = FMA(KP707106781, T4u, T4r); T3l = T2Z - T30; T31 = T2Z + T30; TV = R0[WS(rs, 27)]; TW = R0[WS(rs, 11)]; } TO = R0[WS(rs, 7)]; TP = R0[WS(rs, 23)]; TS = R0[WS(rs, 3)]; TX = TV + TW; T3X = TV - TW; TQ = TO + TP; T40 = TO - TP; TT = R0[WS(rs, 19)]; } } } { E Ti, T3E, Tn, Ts, T3I, Tl, T3F, To; { E Ty, T3M, TD, TI, T3O, TB, T3R, TE; { E TG, TH, Tz, TA; { E T19, TR, T3W, TU, Tw, Tx; Tw = R0[WS(rs, 1)]; Tx = R0[WS(rs, 17)]; T19 = TN - TQ; TR = TN + TQ; T3W = TS - TT; TU = TS + TT; Ty = Tw + Tx; T3M = Tw - Tx; { E T41, T3Y, T1a, TY; T41 = T3W - T3X; T3Y = T3W + T3X; T1a = TX - TU; TY = TU + TX; T5s = FNMS(KP707106781, T41, T40); T42 = FMA(KP707106781, T41, T40); T5t = FNMS(KP707106781, T3Y, T3V); T3Z = FMA(KP707106781, T3Y, T3V); T2n = FMA(KP414213562, T19, T1a); T1b = FNMS(KP414213562, T1a, T19); T3f = TR - TY; TZ = TR + TY; TG = R0[WS(rs, 29)]; TH = R0[WS(rs, 13)]; } } Tz = R0[WS(rs, 9)]; TA = R0[WS(rs, 25)]; TD = R0[WS(rs, 5)]; TI = TG + TH; T3O = TG - TH; TB = Tz + TA; T3R = Tz - TA; TE = R0[WS(rs, 21)]; } { E Tq, Tr, Tj, Tk; { E T16, TC, T3N, TF, Tg, Th; Tg = R0[WS(rs, 2)]; Th = R0[WS(rs, 18)]; T16 = Ty - TB; TC = Ty + TB; T3N = TD - TE; TF = TD + TE; Ti = Tg + Th; T3E = Tg - Th; { E T3S, T3P, T17, TJ; T3S = T3N - T3O; T3P = T3N + T3O; T17 = TI - TF; TJ = TF + TI; T5v = FNMS(KP707106781, T3S, T3R); T3T = FMA(KP707106781, T3S, T3R); T5w = FNMS(KP707106781, T3P, T3M); T3Q = FMA(KP707106781, T3P, T3M); T2m = FNMS(KP414213562, T16, T17); T18 = FMA(KP414213562, T17, T16); T3e = TC - TJ; TK = TC + TJ; Tq = R0[WS(rs, 6)]; Tr = R0[WS(rs, 22)]; } } Tj = R0[WS(rs, 10)]; Tk = R0[WS(rs, 26)]; Tn = R0[WS(rs, 30)]; Ts = Tq + Tr; T3I = Tq - Tr; Tl = Tj + Tk; T3F = Tj - Tk; To = R0[WS(rs, 14)]; } } { E T1n, T4b, T1s, T4f, T1x, T4c, T1q, T1t; { E T1v, T1w, T1o, T1p; { E T1l, T4Q, T3G, Tm, T12, Tp, T3H, T1m; T1l = R1[WS(rs, 2)]; T4Q = FMA(KP414213562, T3E, T3F); T3G = FNMS(KP414213562, T3F, T3E); Tm = Ti + Tl; T12 = Ti - Tl; Tp = Tn + To; T3H = Tn - To; T1m = R1[WS(rs, 18)]; T1v = R1[WS(rs, 6)]; { E T4R, T3J, Tt, T13; T4R = FNMS(KP414213562, T3H, T3I); T3J = FMA(KP414213562, T3I, T3H); Tt = Tp + Ts; T13 = Tp - Ts; T1n = T1l + T1m; T4b = T1l - T1m; T3K = T3G + T3J; T5Q = T3J - T3G; T4S = T4Q + T4R; T5q = T4Q - T4R; T14 = T12 + T13; T2k = T13 - T12; T3p = Tt - Tm; Tu = Tm + Tt; T1w = R1[WS(rs, 22)]; } } T1o = R1[WS(rs, 10)]; T1p = R1[WS(rs, 26)]; T1s = R1[WS(rs, 30)]; T4f = T1v - T1w; T1x = T1v + T1w; T4c = T1o - T1p; T1q = T1o + T1p; T1t = R1[WS(rs, 14)]; } { E T22, T23, T1V, T1W; { E T1S, T4d, T4m, T2V, T1r, T4e, T1u, T1T; T1S = R1[WS(rs, 1)]; T4d = FNMS(KP414213562, T4c, T4b); T4m = FMA(KP414213562, T4b, T4c); T2V = T1n + T1q; T1r = T1n - T1q; T4e = T1s - T1t; T1u = T1s + T1t; T1T = R1[WS(rs, 17)]; T22 = R1[WS(rs, 5)]; { E T4g, T4n, T2W, T1y; T4g = FMA(KP414213562, T4f, T4e); T4n = FNMS(KP414213562, T4e, T4f); T2W = T1u + T1x; T1y = T1u - T1x; T4w = T1S - T1T; T1U = T1S + T1T; T5E = T4g - T4d; T4h = T4d + T4g; T5B = T4m - T4n; T4o = T4m + T4n; T3j = T2W - T2V; T2X = T2V + T2W; T1I = T1y - T1r; T1z = T1r + T1y; T23 = R1[WS(rs, 21)]; } } T1V = R1[WS(rs, 9)]; T1W = R1[WS(rs, 25)]; T1Z = R1[WS(rs, 29)]; T4A = T23 - T22; T24 = T22 + T23; T4x = T1W - T1V; T1X = T1V + T1W; T20 = R1[WS(rs, 13)]; } } } } { E T4C, T5L, T4J, T5I, T26, T2f, T3q, T3h, T3w, T3s, T3o, T3r, T3t; { E T2R, T37, T2Y, T3a, T39, T3m, T3b, T35, Tv, T10, T34, T3c, T3x, T3y; { E T4y, T4H, T32, T1Y, T4z, T21; T2R = Tf - Tu; Tv = Tf + Tu; T4y = FMA(KP414213562, T4x, T4w); T4H = FNMS(KP414213562, T4w, T4x); T32 = T1U + T1X; T1Y = T1U - T1X; T4z = T1Z - T20; T21 = T1Z + T20; T10 = TK + TZ; T37 = TZ - TK; T2Y = T2U - T2X; T3a = T2U + T2X; { E T4B, T4I, T33, T25; T4B = FNMS(KP414213562, T4A, T4z); T4I = FMA(KP414213562, T4z, T4A); T33 = T21 + T24; T25 = T21 - T24; T39 = Tv + T10; T4C = T4y + T4B; T5L = T4B - T4y; T4J = T4H + T4I; T5I = T4I - T4H; T34 = T32 + T33; T3m = T33 - T32; T26 = T1Y + T25; T2f = T25 - T1Y; } } Cr[WS(csr, 16)] = Tv - T10; T3b = T31 + T34; T35 = T31 - T34; Ci[WS(csi, 16)] = T3b - T3a; T3c = T3a + T3b; { E T3k, T3u, T3v, T3n, T36, T38, T3g; T3g = T3e + T3f; T3q = T3f - T3e; Cr[0] = T39 + T3c; Cr[WS(csr, 32)] = T39 - T3c; T36 = T2Y + T35; T38 = T35 - T2Y; T3x = FNMS(KP707106781, T3g, T3d); T3h = FMA(KP707106781, T3g, T3d); Ci[WS(csi, 8)] = FMA(KP707106781, T38, T37); Ci[WS(csi, 24)] = FMS(KP707106781, T38, T37); Cr[WS(csr, 8)] = FMA(KP707106781, T36, T2R); Cr[WS(csr, 24)] = FNMS(KP707106781, T36, T2R); T3k = FMA(KP414213562, T3j, T3i); T3u = FNMS(KP414213562, T3i, T3j); T3v = FMA(KP414213562, T3l, T3m); T3n = FNMS(KP414213562, T3m, T3l); T3y = T3v - T3u; T3w = T3u + T3v; T3s = T3n - T3k; T3o = T3k + T3n; } Cr[WS(csr, 12)] = FMA(KP923879532, T3y, T3x); Cr[WS(csr, 20)] = FNMS(KP923879532, T3y, T3x); } Cr[WS(csr, 4)] = FMA(KP923879532, T3o, T3h); Cr[WS(csr, 28)] = FNMS(KP923879532, T3o, T3h); T3r = FNMS(KP707106781, T3q, T3p); T3t = FMA(KP707106781, T3q, T3p); { E T27, T2g, T2v, T1d, T2r, T2p, T2s, T1K, T6l, T6m; { E T15, T2o, T2P, T2z, T2l, T1c, T1A, T1J, T2D, T2L, T2J, T2M, T2C, T2E, T2N; E T2F; { E T2H, T2I, T2x, T2y, T2A, T2B; T15 = FMA(KP707106781, T14, T11); T2x = FNMS(KP707106781, T14, T11); T2y = T2n - T2m; T2o = T2m + T2n; Ci[WS(csi, 4)] = FMA(KP923879532, T3w, T3t); Ci[WS(csi, 28)] = FMS(KP923879532, T3w, T3t); Ci[WS(csi, 20)] = FMA(KP923879532, T3s, T3r); Ci[WS(csi, 12)] = FMS(KP923879532, T3s, T3r); T2P = FNMS(KP923879532, T2y, T2x); T2z = FMA(KP923879532, T2y, T2x); T2l = FMA(KP707106781, T2k, T2j); T2H = FNMS(KP707106781, T2k, T2j); T2I = T1b - T18; T1c = T18 + T1b; T1A = FMA(KP707106781, T1z, T1k); T2A = FNMS(KP707106781, T1z, T1k);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -