📄 t2sv_32.c
字号:
} } } { V T6s, T9o, T9n, T6v, T6N, T6Q, T6G, T6J, T68, T4Y, T9f, T9d, T9l, T9j, T6g; V T6o, T6q, T6m, T66, T6a, T6p, T6j, T5x, T69; { V T6d, T6e, T6c, T4s, T9c, T4X, T9h, T9b, T5T, T64, T5k, T5v, T9i, T6f; { V T4c, T4r, T4H, T4W, T99, T9a; T6s = VSUB(T46, T4b); T4c = VADD(T46, T4b); T4r = VADD(T4j, T4q); T9o = VSUB(T4q, T4j); T6d = VFMA(LDK(KP414213562), T4z, T4G); T4H = VFNMS(LDK(KP414213562), T4G, T4z); T4W = VFMA(LDK(KP414213562), T4V, T4O); T6e = VFNMS(LDK(KP414213562), T4O, T4V); T9n = VADD(T98, T97); T99 = VSUB(T97, T98); T9a = VADD(T6t, T6u); T6v = VSUB(T6t, T6u); ST(&(ii[WS(rs, 26)]), VFNMS(LDK(KP923879532), T90, T8Z), ms, &(ii[0])); ST(&(ii[WS(rs, 10)]), VFMA(LDK(KP923879532), T90, T8Z), ms, &(ii[0])); T6c = VFMA(LDK(KP707106781), T4r, T4c); T4s = VFNMS(LDK(KP707106781), T4r, T4c); T9c = VADD(T4H, T4W); T4X = VSUB(T4H, T4W); T9h = VFNMS(LDK(KP707106781), T9a, T99); T9b = VFMA(LDK(KP707106781), T9a, T99); T6N = VSUB(T5S, T5L); T5T = VADD(T5L, T5S); T64 = VADD(T62, T63); T6Q = VSUB(T62, T63); T6G = VSUB(T5j, T5c); T5k = VADD(T5c, T5j); T5v = VADD(T5t, T5u); T6J = VSUB(T5t, T5u); } T68 = VFNMS(LDK(KP923879532), T4X, T4s); T4Y = VFMA(LDK(KP923879532), T4X, T4s); T9f = VFNMS(LDK(KP923879532), T9c, T9b); T9d = VFMA(LDK(KP923879532), T9c, T9b); T9i = VSUB(T6e, T6d); T6f = VADD(T6d, T6e); { V T6l, T5U, T6k, T65; T6l = VFMA(LDK(KP707106781), T5T, T5E); T5U = VFNMS(LDK(KP707106781), T5T, T5E); T6k = VFMA(LDK(KP707106781), T64, T61); T65 = VFNMS(LDK(KP707106781), T64, T61); { V T6i, T5l, T6h, T5w; T6i = VFMA(LDK(KP707106781), T5k, T55); T5l = VFNMS(LDK(KP707106781), T5k, T55); T6h = VFMA(LDK(KP707106781), T5v, T5s); T5w = VFNMS(LDK(KP707106781), T5v, T5s); T9l = VFNMS(LDK(KP923879532), T9i, T9h); T9j = VFMA(LDK(KP923879532), T9i, T9h); T6g = VFMA(LDK(KP923879532), T6f, T6c); T6o = VFNMS(LDK(KP923879532), T6f, T6c); T6q = VFMA(LDK(KP198912367), T6k, T6l); T6m = VFNMS(LDK(KP198912367), T6l, T6k); T66 = VFNMS(LDK(KP668178637), T65, T5U); T6a = VFMA(LDK(KP668178637), 
T5U, T65); T6p = VFNMS(LDK(KP198912367), T6h, T6i); T6j = VFMA(LDK(KP198912367), T6i, T6h); T5x = VFMA(LDK(KP668178637), T5w, T5l); T69 = VFNMS(LDK(KP668178637), T5l, T5w); } } } { V T6Y, T6w, T9w, T6D, T9v, T9p, T9q, T71, T77, T6O, T76, T6R; { V T6Z, T6z, T6C, T70; { V T6n, T9g, T9e, T6r; T6n = VADD(T6j, T6m); T9g = VSUB(T6m, T6j); T9e = VADD(T6p, T6q); T6r = VSUB(T6p, T6q); { V T9k, T6b, T67, T9m; T9k = VSUB(T6a, T69); T6b = VADD(T69, T6a); T67 = VSUB(T5x, T66); T9m = VADD(T5x, T66); ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T9g, T9f), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T6n, T6g), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T6r, T6o), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T9e, T9d), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T6b, T68), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T9k, T9j), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T9m, T9l), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T67, T4Y), ms, &(ri[WS(rs, 1)])); T6Y = VFNMS(LDK(KP707106781), T6v, T6s); T6w = VFMA(LDK(KP707106781), T6v, T6s); } } T6Z = VFNMS(LDK(KP414213562), T6x, T6y); T6z = VFMA(LDK(KP414213562), T6y, T6x); T6C = VFNMS(LDK(KP414213562), T6B, T6A); T70 = VFMA(LDK(KP414213562), T6A, 
T6B); T9w = VADD(T6z, T6C); T6D = VSUB(T6z, T6C); T9v = VFNMS(LDK(KP707106781), T9o, T9n); T9p = VFMA(LDK(KP707106781), T9o, T9n); T9q = VSUB(T70, T6Z); T71 = VADD(T6Z, T70); T77 = VFMA(LDK(KP707106781), T6N, T6M); T6O = VFNMS(LDK(KP707106781), T6N, T6M); T76 = VFMA(LDK(KP707106781), T6Q, T6P); T6R = VFNMS(LDK(KP707106781), T6Q, T6P); T6H = VFNMS(LDK(KP707106781), T6G, T6F); T74 = VFMA(LDK(KP707106781), T6G, T6F); } T6U = VFNMS(LDK(KP923879532), T6D, T6w); T6E = VFMA(LDK(KP923879532), T6D, T6w); T9r = VFMA(LDK(KP923879532), T9q, T9p); T9t = VFNMS(LDK(KP923879532), T9q, T9p); T78 = VFNMS(LDK(KP198912367), T77, T76); T7c = VFMA(LDK(KP198912367), T76, T77); T6W = VFMA(LDK(KP668178637), T6O, T6R); T6S = VFNMS(LDK(KP668178637), T6R, T6O); T73 = VFMA(LDK(KP707106781), T6J, T6I); T6K = VFNMS(LDK(KP707106781), T6J, T6I); T7a = VFMA(LDK(KP923879532), T71, T6Y); T72 = VFNMS(LDK(KP923879532), T71, T6Y); T9x = VFNMS(LDK(KP923879532), T9w, T9v); T9z = VFMA(LDK(KP923879532), T9w, T9v); } } } } } { V T7b, T75, T6L, T6V; T7b = VFNMS(LDK(KP198912367), T73, T74); T75 = VFMA(LDK(KP198912367), T74, T73); T6L = VFMA(LDK(KP668178637), T6K, T6H); T6V = VFNMS(LDK(KP668178637), T6H, T6K); { V T79, T9A, T9y, T7d; T79 = VSUB(T75, T78); T9A = VADD(T75, T78); T9y = VSUB(T7c, T7b); T7d = VADD(T7b, T7c); { V T9s, T6X, T6T, T9u; T9s = VADD(T6V, T6W); T6X = VSUB(T6V, T6W); T6T = VADD(T6L, T6S); T9u = VSUB(T6S, T6L); ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T9A, T9z), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T79, T72), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T7d, T7a), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 7)]), 
VFMA(LDK(KP980785280), T9y, T9x), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6X, T6U), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T9s, T9r), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T9u, T9t), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6T, T6E), ms, &(ri[WS(rs, 1)])); } } } }}/* Twiddle instruction table for this codelet: four VTW entries whose second arguments are 1, 3, 9 and 27, followed by a TW_NEXT entry carrying (2 * VL). NOTE(review): presumably these are the only twiddle powers stored in W, with the remaining ones derived by multiplication inside the kernel -- confirm against the VTW / tw_instr definitions. */static const tw_instr twinstr[] = { VTW(0, 1), VTW(0, 3), VTW(0, 9), VTW(0, 27), {TW_NEXT, (2 * VL), 0}};/* Descriptor handed to the registration call below: first field 32 (presumably the transform radix -- confirm), the codelet name "t2sv_32", the twinstr table and &GENUS. NOTE(review): {236, 98, 252, 0} looks like the operation counts (additions, multiplications, fused multiply/adds, other) of the FMA variant -- confirm against the ct_desc definition; the three trailing zeros are not explained by anything visible here. */static const ct_desc desc = { 32, "t2sv_32", twinstr, &GENUS, {236, 98, 252, 0}, 0, 0, 0 };/* Registration entry point: passes the t2sv_32 kernel and its descriptor to X(kdft_dit_register) for planner p. */void X(codelet_t2sv_32) (planner *p) { X(kdft_dit_register) (p, t2sv_32, &desc);}#else /* HAVE_FMA *//* Generated by: ../../../genfft/gen_twiddle -simd -compact -variables 4 -pipeline-latency 8 -twiddle-log3 -precompute-twiddles -n 32 -name t2sv_32 -include ts.h *//* * This function contains 488 FP additions, 280 FP multiplications, * (or, 376 additions, 168 multiplications, 112 fused multiply/add), * 158 stack variables, 7 constants, and 128 memory accesses */#include "ts.h"static void t2sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms){ DVK(KP195090322, +0.195090322016128267848284868477022240927691618); DVK(KP980785280, +0.980785280403230449126182236134239036973933731); DVK(KP555570233, +0.555570233019602224742830813948532874374937191); DVK(KP831469612, +0.831469612302545237078788377617905756738560812); DVK(KP382683432, +0.382683432365089771728459984030398866761344562); DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + (mb * 8); m < me; m = m + (2 * VL), ri 
= ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 8), MAKE_VOLATILE_STRIDE(rs)) { V T2, T5, T3, T6, T8, TM, TO, Td, T9, Te, Th, Tl, TD, TH, T1y; V T1H, T15, T1A, T11, T1F, T1n, T1p, T2q, T2I, T2u, T2K, T2V, T3b, T2Z, T3d; V Tu, Ty, T3l, T3n, T1t, T1v, T2f, T2h, T1a, T1e, T32, T34, T1W, T1Y, T2C; V T2E, Tg, TR, Tk, TS, Tm, TV, To, TT, T1M, T21, T1P, T22, T1Q, T25; V T1S, T23; { V Ts, T1d, Tx, T18, Tt, T1c, Tw, T19, TB, T14, TG, TZ, TC, T13, TF; V T10; { V T4, Tc, T7, Tb; T2 = LDW(&(W[0])); T5 = LDW(&(W[TWVL * 1])); T3 = LDW(&(W[TWVL * 2])); T6 = LDW(&(W[TWVL * 3])); T4 = VMUL(T2, T3); Tc = VMUL(T5, T3); T7 = VMUL(T5, T6); Tb = VMUL(T2, T6); T8 = VADD(T4, T7); TM = VSUB(T4, T7); TO = VADD(Tb, Tc); Td = VSUB(Tb, Tc); T9 = LDW(&(W[TWVL * 4])); Ts = VMUL(T2, T9); T1d = VMUL(T6, T9); Tx = VMUL(T5, T9); T18 = VMUL(T3, T9); Te = LDW(&(W[TWVL * 5])); Tt = VMUL(T5, Te); T1c = VMUL(T3, Te); Tw = VMUL(T2, Te); T19 = VMUL(T6, Te); Th = LDW(&(W[TWVL * 6])); TB = VMUL(T3, Th); T14 = VMUL(T5, Th); TG = VMUL(T6, Th); TZ = VMUL(T2, Th); Tl = LDW(&(W[TWVL * 7])); TC = VMUL(T6, Tl); T13 = VMUL(T2, Tl); TF = VMUL(T3, Tl); T10 = VMUL(T5, Tl); } TD = VADD(TB, TC); TH = VSUB(TF, TG); T1y = VADD(TZ, T10); T1H = VADD(TF, TG); T15 = VADD(T13, T14); T1A = VSUB(T13, T14); T11 = VSUB(TZ, T10); T1F = VSUB(TB, TC); T1n = VFMA(T9, Th, VMUL(Te, Tl)); T1p = VFNMS(Te, Th, VMUL(T9, Tl)); { V T2o, T2p, T2s, T2t; T2o = VMUL(T8, Th); T2p = VMUL(Td, Tl); T2q = VADD(T2o, T2p); T2I = VSUB(T2o, T2p); T2s = VMUL(T8, Tl); T2t = VMUL(Td, Th); T2u = VSUB(T2s, T2t); T2K = VADD(T2s, T2t); } { V T2T, T2U, T2X, T2Y; T2T = VMUL(TM, Th); T2U = VMUL(TO, Tl); T2V = VSUB(T2T, T2U); T3b = VADD(T2T, T2U); T2X = VMUL(TM, Tl); T2Y = VMUL(TO, Th); T2Z = VADD(T2X, T2Y); T3d = VSUB(T2X, T2Y); Tu = VADD(Ts, Tt); Ty = VSUB(Tw, Tx); T3l = VFMA(Tu, Th, VMUL(Ty, Tl)); T3n = VFNMS(Ty, Th, VMUL(Tu, Tl)); } T1t = VSUB(Ts, Tt); T1v = VADD(Tw, Tx); T2f = VFMA(T1t, Th, VMUL(T1v, Tl)); T2h = VFNMS(T1v, Th, VMUL(T1t, 
Tl)); T1a = VSUB(T18, T19); T1e = VADD(T1c, T1d); T32 = VFMA(T1a, Th, VMUL(T1e, Tl)); T34 = VFNMS(T1e, Th, VMUL(T1a, Tl)); T1W = VADD(T18, T19); T1Y = VSUB(T1c, T1d); T2C = VFMA(T1W, Th, VMUL(T1Y, Tl)); T2E = VFNMS(T1Y, Th, VMUL(T1W, Tl)); { V Ta, Tf, Ti, Tj; Ta = VMUL(T8, T9); Tf = VMUL(Td, Te); Tg = VSUB(Ta, Tf); TR = VADD(Ta, Tf); Ti = VMUL(T8, Te); Tj = VMUL(Td, T9); Tk = VADD(Ti, Tj); TS = VSUB(Ti, Tj); } Tm = VFMA(Tg, Th, VMUL(Tk, Tl)); TV = VFNMS(TS, Th, VMUL(TR, Tl)); To = VFNMS(Tk, Th, VMUL(Tg, Tl)); TT = VFMA(TR, Th, VMUL(TS, Tl)); { V T1K, T1L, T1N, T1O; T1K = VMUL(TM, T9); T1L = VMUL(TO, Te); T1M = VSUB(T1K, T1L); T21 = VADD(T1K, T1L); T1N = VMUL(TM, Te); T1O = VMUL(TO, T9); T1P = VADD(T1N, T1O); T22 = VSUB(T1N, T1O); } T1Q = VFMA(T1M, Th, VMUL(T1P, Tl)); T25 = VFNMS(T22, Th, VMUL(T21, Tl)); T1S = VFNMS(T1P, Th, VMUL(T1M, Tl)); T23 = VFMA(T21, Th, VMUL(T22, Tl)); } { V TL, T6f, T8c, T8q, T3F, T5t, T7I, T7W, T2y, T6B, T6y, T7j, T4k, T5J, T4B; V T5G, T3h, T6H, T6O, T7o, T4L, T5N, T52, T5Q, T1i, T7V, T6i, T7D, T3K, T5u; V T3P, T5v, T1E, T6n, T6m, T7e, T3W, T5y, T41, T5z, T29, T6p, T6s, T7f, T47; V T5B, T4c, T5C, T2R, T6z, T6E, T7k, T4v, T5H, T4E, T5K, T3y, T6P, T6K, T7p; V T4W, T5R, T55, T5O; { V T1, T7G, Tq, T7F, TA, T3C, TJ, T3D, Tn, Tp; T1 = LD(&(ri[0]), ms, &(ri[0])); T7G = LD(&(ii[0]), ms, &(ii[0])); Tn = LD(&(ri[WS(rs, 16)]), ms, &(ri[0])); Tp = LD(&(ii[WS(rs, 16)]), ms, &(ii[0])); Tq = VFMA(Tm, Tn, VMUL(To, Tp)); T7F = VFNMS(To, Tn, VMUL(Tm, Tp)); { V Tv, Tz, TE, TI; Tv = LD(&(ri[WS(rs, 8)]), ms, &(ri[0])); Tz = LD(&(ii[WS(rs, 8)]), ms, &(ii[0]));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -