📄 hc2cfdft2_32.c
字号:
/* * Copyright (c) 2003, 2007-8 Matteo Frigo * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *//* This file was automatically generated --- DO NOT EDIT *//* Generated on Sat Nov 15 21:04:03 EST 2008 */#include "codelet-rdft.h"#ifdef HAVE_FMA/* Generated by: ../../../genfft/gen_hc2cdft -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 32 -dit -name hc2cfdft2_32 -include hc2cf.h *//* * This function contains 552 FP additions, 414 FP multiplications, * (or, 300 additions, 162 multiplications, 252 fused multiply/add), * 196 stack variables, 8 constants, and 128 memory accesses */#include "hc2cf.h"static void hc2cfdft2_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms){ DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP668178637, +0.668178637919298919997757686523080761552472251); DK(KP198912367, +0.198912367379658006911597622644676228597850501); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP414213562, +0.414213562373095048801688724209698078569671875); DK(KP707106781, +0.707106781186547524400844362104849039284835938); DK(KP500000000, +0.500000000000000000000000000000000000000000000); INT m; for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(rs)) { E Tax, TaA; { E T1, Th, T2, T5, Ti, Ty, T1t, T3, Tb, Tj, TY, TK, Tl, T4, Tk; T1 = W[0]; Th = W[4]; T2 = W[2]; T5 = W[3]; Ti = W[6]; Ty = T1 * Th; T1t = T2 * Th; T3 = T1 * T2; Tb = T1 * T5; Tj = Th * Ti; TY = T2 * Ti; TK = T1 * Ti; Tl = W[7]; T4 = W[1]; Tk = W[5]; { E T3j, T7Z, T5b, T93, T6B, T8V, T4d, T8J, T8r, T6e, T8l, T1T, T8C, T54, T8i; E T5O, T94, T31, T8K, T6w, T8U, T3Y, T80, T5g, T8B, T69, T8h, T1s, T8q, T4T; E T8k, T5J, Tx, T8a, T5y, T8d, T4s, T5Y, T8v, T8E, T2k, T82, T6l, T3z, T83; E T5m, T8X, T8O, T2F, T86, T6q, T3M, T85, T5r, T8Y, T8R, TW, T8e, T8x, T4B; E T5D, T8b, T63, T8w; { E TL, T2l, T1c, Tc, T1a, T6, Tm, T2v, Tz, T2q, TR, Ts, T2A, TF, T1H; E T1g, T1d, T1F, T34, T3F, T3B, T32, T3w, T3s, T4p, T4l, T2f, T29, T4K, T4S; E T5G, T5I; { E TZ, T2R, T2H, T15, T2W, T2M, T4I, T4E, T3V, T3S, T4Q, T4M, T1n, T1h, T4X; E T53, T5L, T5N, T5d, T5f; { E T1u, T1A, T51, T4Y, T28, T25, T44, T40, T1O, T1I, T3b, T35, T4b, T3i, T45; E T38, T39, T58, T49, T3e, T41; { E T3g, T3h, T36, T37, TQ; T3g = Ip[0]; TZ = FNMS(T5, Tl, TY); T2R = FMA(T5, Tl, TY); TQ = T1 * Tl; { E T14, Tr, T1z, TE; T14 = T2 * Tl; Tr = Th * Tl; TL = FMA(T4, Tl, TK); T2l = FNMS(T4, Tl, TK); T1c = FMA(T4, T2, Tb); Tc = FNMS(T4, T2, Tb); T1a = FNMS(T4, T5, T3); T6 = FMA(T4, T5, T3); Tm = FMA(Tk, Tl, Tj); T2v = FNMS(T5, Tk, T1t); T1u = FMA(T5, Tk, T1t); Tz = FNMS(T4, Tk, Ty); T2H = FMA(T4, Tk, Ty); T1z = T2 * Tk; TE = T1 * Tk; T2q = FMA(T4, Ti, TQ); TR = FNMS(T4, Ti, TQ); T15 = FMA(T5, Ti, T14); T2W = FNMS(T5, Ti, T14); Ts = FNMS(Tk, Ti, Tr); { E T1f, T4H, T4D, T1b; T1f = T1a * Tk; T4H = T1a * Tl; T4D = T1a * Ti; T1b = T1a * Th; { E T27, T3E, T3A, T24; T27 = T6 * Tk; T3E = T6 * Tl; T3A = T6 * Ti; T24 = T6 * Th; { E T3v, T3r, T4P, T4L; T3v = T1u * Tl; T3r = T1u * Ti; T4P = T2v * Tl; T4L = T2v * Ti; { E T4o, T4k, T43, T3Z; T4o = T2H * Tl; T4k = T2H * Ti; T43 = Tz * Tl; T3Z = Tz * Ti; T1A = FNMS(T5, Th, T1z); T2A = FMA(T5, Th, T1z); T2M = FNMS(T4, Th, TE); TF = FMA(T4, Th, TE); T1H = FNMS(T1c, Th, T1f); T1g = FMA(T1c, Th, T1f); T51 = FNMS(T1c, Ti, T4H); T4I = FMA(T1c, Ti, T4H); T4Y = FMA(T1c, Tl, T4D); T4E = FNMS(T1c, Tl, T4D); T1d = FNMS(T1c, Tk, T1b); T1F = FMA(T1c, Tk, T1b); T34 = FMA(Tc, Th, T27); T28 = FNMS(Tc, Th, T27); T3V = FNMS(Tc, Ti, T3E); T3F = FMA(Tc, Ti, T3E); T3S = FMA(Tc, Tl, T3A); T3B = FNMS(Tc, Tl, T3A); T25 = FMA(Tc, Tk, T24); T32 = FNMS(Tc, Tk, T24); T3w = FNMS(T1A, Ti, T3v); T3s = FMA(T1A, Tl, T3r); T4Q = FNMS(T2A, Ti, T4P); T4M = FMA(T2A, Tl, T4L); T4p = FNMS(T2M, Ti, T4o); T4l = FMA(T2M, Tl, T4k); T44 = FNMS(TF, Ti, T43); T40 = FMA(TF, Tl, T3Z); { E T1m, T1e, T1N, T1G; T1m = T1d * Tl; T1e = T1d * Ti; T1N = T1F * Tl; T1G = T1F * Ti; { E T2e, T26, T3a, T33; T2e = T25 * Tl; T26 = T25 * Ti; T3a = T32 * Tl; T33 = T32 * Ti; T1n = FNMS(T1g, Ti, T1m); T1h = FMA(T1g, Tl, T1e); T1O = FNMS(T1H, Ti, T1N); T1I = FMA(T1H, Tl, T1G); T2f = FNMS(T28, Ti, T2e); T29 = FMA(T28, Tl, T26); T3b = FNMS(T34, Ti, T3a); T35 = FMA(T34, Tl, T33); T3h = Im[0]; } } } } } } } T36 = Ip[WS(rs, 8)]; T37 = Im[WS(rs, 8)]; { E T47, T48, T3c, T3d; T47 = Rm[0]; T4b = T3g + T3h; T3i = T3g - T3h; T45 = T36 + T37; T38 = T36 - T37; T48 = Rp[0]; T3c = Rp[WS(rs, 8)]; T3d = Rm[WS(rs, 8)]; T39 = T35 * T38; T58 = T48 + T47; T49 = T47 - T48; T3e = T3c + T3d; T41 = T3d - T3c; } } { E T4W, T1x, T1y, T6a, T4U, T1D, T1P, T4V, T5K, T52, T1L, T1Q; { E T1B, T1C, T1J, T1K; { E T1v, T6A, T4c, T5a, T6y, T46, T1w, T6z, T4a; T1v = Ip[WS(rs, 3)]; T6z = T4 * T49; T4a = T1 * T49; { E T3f, T59, T6x, T42; T3f = FNMS(T3b, T3e, T39); T59 = T35 * T3e; T6x = T44 * T41; T42 = T40 * T41; T6A = FMA(T1, T4b, T6z); T4c = FNMS(T4, T4b, T4a); T3j = T3f + T3i; T7Z = T3i - T3f; T5a = FMA(T3b, T38, T59); T6y = FMA(T40, T45, T6x); T46 = FNMS(T44, T45, T42); T1w = Im[WS(rs, 3)]; } T5b = T58 + T5a; T93 = T58 - T5a; T6B = T6y + T6A; T8V = T6A - T6y; T4d = T46 + T4c; T8J = T4c - T46; T4W = T1v + T1w; T1x = T1v - T1w; } T1B = Rp[WS(rs, 3)]; T1C = Rm[WS(rs, 3)]; T1y = T1u * T1x; T6a = T25 * T4W; T1J = Ip[WS(rs, 11)]; T4U = T1B - T1C; T1D = T1B + T1C; T1K = Im[WS(rs, 11)]; T1P = Rp[WS(rs, 11)]; T4V = T25 * T4U; T5K = T1u * T1D; T52 = T1J + T1K; T1L = T1J - T1K; T1Q = Rm[WS(rs, 11)]; } { E T1E, T6c, T1M, T4Z, T1R, T6b; T1E = FNMS(T1A, T1D, T1y); T6c = T4Y * T52; T1M = T1I * T1L; T4Z = T1P - T1Q; T1R = T1P + T1Q; T6b = FNMS(T28, T4U, T6a); { E T5M, T6d, T50, T1S; T4X = FMA(T28, T4W, T4V); T6d = FNMS(T51, T4Z, T6c); T50 = T4Y * T4Z; T1S = FNMS(T1O, T1R, T1M); T5M = T1I * T1R; T8r = T6d - T6b; T6e = T6b + T6d; T8l = T1E - T1S; T1T = T1E + T1S; T53 = FMA(T51, T52, T50); T5L = FMA(T1A, T1x, T5K); T5N = FMA(T1O, T1L, T5M); } } } } { E T3Q, T2K, T2P, T2L, T6s, T3P, T5c, T3W, T2U, T2X, T2Y, T2V; { E T2I, T2J, T2N, T2O, T2S, T3O, T2T; T2I = Ip[WS(rs, 4)]; T8C = T53 - T4X; T54 = T4X + T53; T8i = T5L - T5N; T5O = T5L + T5N; T2J = Im[WS(rs, 4)]; T2N = Rp[WS(rs, 4)]; T2O = Rm[WS(rs, 4)]; T2S = Ip[WS(rs, 12)]; T3Q = T2I + T2J; T2K = T2I - T2J; T3O = T2O - T2N; T2P = T2N + T2O; T2T = Im[WS(rs, 12)]; T2L = T2H * T2K; T6s = Tk * T3O; T3P = Th * T3O; T5c = T2H * T2P; T3W = T2S + T2T; T2U = T2S - T2T; T2X = Rp[WS(rs, 12)]; T2Y = Rm[WS(rs, 12)]; T2V = T2R * T2U; } { E T2Q, T6t, T3T, T2Z, T3R, T6u, T3U; T2Q = FNMS(T2M, T2P, T2L); T6t = FMA(Th, T3Q, T6s); T3T = T2Y - T2X; T2Z = T2X + T2Y; T3R = FNMS(Tk, T3Q, T3P); T5d = FMA(T2M, T2K, T5c); T6u = T3V * T3T; T3U = T3S * T3T; { E T30, T5e, T6v, T3X; T30 = FNMS(T2W, T2Z, T2V); T5e = T2R * T2Z; T6v = FMA(T3S, T3W, T6u); T3X = FNMS(T3V, T3W, T3U); T94 = T2Q - T30; T31 = T2Q + T30; T8K = T6t - T6v; T6w = T6t + T6v; T8U = T3R - T3X; T3Y = T3R + T3X; T5f = FMA(T2W, T2U, T5e); } } } { E T4J, T12, T65, T13, T4F, T18, T1o, T4G, T5F, T4R, T1k, T1p; { E T16, T17, T10, T11, T1i, T1j; T10 = Ip[WS(rs, 15)]; T11 = Im[WS(rs, 15)]; T16 = Rp[WS(rs, 15)]; T80 = T5d - T5f; T5g = T5d + T5f; T4J = T10 + T11; T12 = T10 - T11; T17 = Rm[WS(rs, 15)]; T1i = Ip[WS(rs, 7)]; T65 = T4E * T4J; T13 = TZ * T12; T4F = T16 - T17; T18 = T16 + T17; T1j = Im[WS(rs, 7)]; T1o = Rp[WS(rs, 7)]; T4G = T4E * T4F; T5F = TZ * T18; T4R = T1i + T1j; T1k = T1i - T1j; T1p = Rm[WS(rs, 7)]; } { E T19, T67, T1l, T4N, T1q, T66; T19 = FNMS(T15, T18, T13); T67 = T4M * T4R; T1l = T1h * T1k; T4N = T1o - T1p; T1q = T1o + T1p; T66 = FNMS(T4I, T4F, T65); { E T5H, T68, T4O, T1r; T4K = FMA(T4I, T4J, T4G); T68 = FNMS(T4Q, T4N, T67); T4O = T4M * T4N; T1r = FNMS(T1n, T1q, T1l); T5H = T1h * T1q; T8B = T66 - T68; T69 = T66 + T68; T8h = T19 - T1r; T1s = T19 + T1r; T4S = FMA(T4Q, T4R, T4O); T5G = FMA(T15, T12, T5F); T5I = FMA(T1n, T1k, T5H); } } } } { E T2c, T3x, T2d, T23, T5j, T3q, T2i, T3t, T6i, T8t, T5V, T5X; { E Tn, T4i, T9, T4g, Tf, T5U, Ta, T4h, T5u, To, Tt, Tu; { E T7, T8, Td, Te; T7 = Ip[WS(rs, 1)]; T8q = T4S - T4K; T4T = T4K + T4S; T8k = T5G - T5I; T5J = T5G + T5I; T8 = Im[WS(rs, 1)]; Td = Rp[WS(rs, 1)]; Te = Rm[WS(rs, 1)]; Tn = Ip[WS(rs, 9)]; T4i = T7 + T8; T9 = T7 - T8; T4g = Td - Te; Tf = Td + Te; T5U = T2 * T4i; Ta = T6 * T9; T4h = T2 * T4g; T5u = T6 * Tf; To = Im[WS(rs, 9)]; Tt = Rp[WS(rs, 9)]; Tu = Rm[WS(rs, 9)]; } { E Tg, T4q, Tp, T4m, Tv, T5W, Tq, T4n, T5w; Tg = FNMS(Tc, Tf, Ta); T4q = Tn + To; Tp = Tn - To; T4m = Tt - Tu; Tv = Tt + Tu; T5W = T4l * T4q; Tq = Tm * Tp; T4n = T4l * T4m; T5w = Tm * Tv; { E T5v, Tw, T4j, T5x, T4r; T5v = FMA(Tc, T9, T5u); Tw = FNMS(Ts, Tv, Tq); T4j = FMA(T5, T4i, T4h); T5x = FMA(Ts, Tp, T5w); T4r = FMA(T4p, T4q, T4n); Tx = Tg + Tw; T8a = Tg - Tw; T5y = T5v + T5x; T8d = T5v - T5x; T4s = T4j + T4r; T8t = T4r - T4j; T5V = FNMS(T5, T4g, T5U); T5X = FNMS(T4p, T4m, T5W); } } } { E T3p, T1Y, T1Z, T22, T2g, T6h, T3o, T5i, T2h; { E T20, T21, T1W, T1X, T8u, T2a, T2b, T3n; T1W = Ip[WS(rs, 2)]; T1X = Im[WS(rs, 2)]; T8u = T5V - T5X; T5Y = T5V + T5X; T20 = Rp[WS(rs, 2)]; T3p = T1W + T1X; T1Y = T1W - T1X; T8v = T8t - T8u; T8E = T8u + T8t; T21 = Rm[WS(rs, 2)]; T1Z = T1a * T1Y; T2a = Ip[WS(rs, 10)]; T2b = Im[WS(rs, 10)]; T3n = T21 - T20; T22 = T20 + T21; T2g = Rp[WS(rs, 10)]; T2c = T2a - T2b; T3x = T2a + T2b; T6h = T1H * T3n; T3o = T1F * T3n; T5i = T1a * T22; T2d = T29 * T2c; T2h = Rm[WS(rs, 10)]; } T23 = FNMS(T1c, T22, T1Z); T5j = FMA(T1c, T1Y, T5i); T3q = FNMS(T1H, T3p, T3o); T2i = T2g + T2h; T3t = T2h - T2g; T6i = FMA(T1F, T3p, T6h); } { E T2y, T3K, T2z, T2u, T5o, T3H, T2D, T3I, T6n; { E T3G, T2o, T2p, T2t, T6m, T3D, T5n, T2B, T2C; { E T2r, T2s, T2m, T2n, T3C, T2w, T2x; { E T8N, T8M, T6j, T3u, T2j; T2m = Ip[WS(rs, 14)]; T6j = T3w * T3t; T3u = T3s * T3t; T2j = FNMS(T2f, T2i, T2d); { E T5k, T6k, T3y, T5l; T5k = T29 * T2i; T6k = FMA(T3s, T3x, T6j); T3y = FNMS(T3w, T3x, T3u); T2k = T23 + T2j; T82 = T23 - T2j; T5l = FMA(T2f, T2c, T5k); T6l = T6i + T6k; T8N = T6i - T6k; T3z = T3q + T3y; T8M = T3q - T3y;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -