📄 t1sv_32.c
字号:
T4V = VADD(T4T, T4U); T69 = VSUB(T4T, T4U); } T5y = VFNMS(LDK(KP923879532), T4n, T3S); T4o = VFMA(LDK(KP923879532), T4n, T3S); T8J = VFMA(LDK(KP923879532), T8I, T8H); T8L = VFNMS(LDK(KP923879532), T8I, T8H); T5M = VFNMS(LDK(KP198912367), T5L, T5K); T5Q = VFMA(LDK(KP198912367), T5K, T5L); T5A = VFMA(LDK(KP668178637), T5k, T5v); T5w = VFNMS(LDK(KP668178637), T5v, T5k); T5H = VFMA(LDK(KP707106781), T4V, T4S); T4W = VFNMS(LDK(KP707106781), T4V, T4S); T5O = VFNMS(LDK(KP923879532), T5F, T5C); T5G = VFMA(LDK(KP923879532), T5F, T5C); T8D = VFMA(LDK(KP923879532), T8C, T8B); T8F = VFNMS(LDK(KP923879532), T8C, T8B); } { V T6p, T6q, T6o, T5W, T8W, T63; { V T5J, T5P, T5z, T4X, T5Z, T62; T5J = VFMA(LDK(KP198912367), T5I, T5H); T5P = VFNMS(LDK(KP198912367), T5H, T5I); T5z = VFNMS(LDK(KP668178637), T4L, T4W); T4X = VFMA(LDK(KP668178637), T4W, T4L); T6p = VFNMS(LDK(KP414213562), T5X, T5Y); T5Z = VFMA(LDK(KP414213562), T5Y, T5X); T62 = VFNMS(LDK(KP414213562), T61, T60); T6q = VFMA(LDK(KP414213562), T60, T61); { V T8G, T5N, T5R, T8E; T8G = VSUB(T5M, T5J); T5N = VADD(T5J, T5M); T5R = VSUB(T5P, T5Q); T8E = VADD(T5P, T5Q); { V T5B, T8K, T8M, T5x; T5B = VADD(T5z, T5A); T8K = VSUB(T5A, T5z); T8M = VADD(T4X, T5w); T5x = VSUB(T4X, T5w); T6o = VFNMS(LDK(KP707106781), T5V, T5S); T5W = VFMA(LDK(KP707106781), T5V, T5S); T8W = VADD(T5Z, T62); T63 = VSUB(T5Z, T62); ST(&(ii[WS(rs, 25)]), VFNMS(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 9)]), VFMA(LDK(KP980785280), T8G, T8F), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 1)]), VFMA(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 17)]), VFNMS(LDK(KP980785280), T5N, T5G), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 9)]), VFMA(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 25)]), VFNMS(LDK(KP980785280), T5R, T5O), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 17)]), VFNMS(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 1)]), VFMA(LDK(KP980785280), T8E, T8D), ms, &(ii[WS(rs, 1)])); 
ST(&(ri[WS(rs, 29)]), VFMA(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 13)]), VFNMS(LDK(KP831469612), T5B, T5y), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 21)]), VFNMS(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 5)]), VFMA(LDK(KP831469612), T8K, T8J), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 29)]), VFMA(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 13)]), VFNMS(LDK(KP831469612), T8M, T8L), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 5)]), VFMA(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 21)]), VFNMS(LDK(KP831469612), T5x, T4o), ms, &(ri[WS(rs, 1)])); } } } { V T6k, T64, T8V, T6r, T8R, T8T, T6y, T6C, T6m, T6i, T6v, T6B, T6l, T6b, T6A; V T6s, T8X; { V T6x, T6e, T6w, T6h, T6u, T67, T6t, T6a, T8P, T8Q; T6k = VFNMS(LDK(KP923879532), T63, T5W); T64 = VFMA(LDK(KP923879532), T63, T5W); T8V = VFNMS(LDK(KP707106781), T8O, T8N); T8P = VFMA(LDK(KP707106781), T8O, T8N); T8Q = VSUB(T6q, T6p); T6r = VADD(T6p, T6q); T6x = VFMA(LDK(KP707106781), T6d, T6c); T6e = VFNMS(LDK(KP707106781), T6d, T6c); T6w = VFMA(LDK(KP707106781), T6g, T6f); T6h = VFNMS(LDK(KP707106781), T6g, T6f); T6u = VFMA(LDK(KP707106781), T66, T65); T67 = VFNMS(LDK(KP707106781), T66, T65); T6t = VFMA(LDK(KP707106781), T69, T68); T6a = VFNMS(LDK(KP707106781), T69, T68); T8R = VFMA(LDK(KP923879532), T8Q, T8P); T8T = VFNMS(LDK(KP923879532), T8Q, T8P); T6y = VFNMS(LDK(KP198912367), T6x, T6w); T6C = VFMA(LDK(KP198912367), T6w, T6x); T6m = VFMA(LDK(KP668178637), T6e, T6h); T6i = VFNMS(LDK(KP668178637), T6h, T6e); T6v = VFMA(LDK(KP198912367), T6u, T6t); T6B = VFNMS(LDK(KP198912367), T6t, T6u); T6l = VFNMS(LDK(KP668178637), T67, T6a); T6b = VFMA(LDK(KP668178637), T6a, T67); } T6A = VFMA(LDK(KP923879532), T6r, T6o); T6s = VFNMS(LDK(KP923879532), T6r, T6o); T8X = VFNMS(LDK(KP923879532), T8W, T8V); T8Z = VFMA(LDK(KP923879532), T8W, T8V); { V T6z, T6D, T8Y, T6n, T8S, T8U, T6j; T6z = VSUB(T6v, T6y); T90 = VADD(T6v, T6y); T6D = VADD(T6B, T6C); T8Y = 
VSUB(T6C, T6B); T6n = VSUB(T6l, T6m); T8S = VADD(T6l, T6m); T8U = VSUB(T6i, T6b); T6j = VADD(T6b, T6i); ST(&(ri[WS(rs, 7)]), VFMA(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 23)]), VFNMS(LDK(KP980785280), T6z, T6s), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 23)]), VFNMS(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 7)]), VFMA(LDK(KP980785280), T8Y, T8X), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 11)]), VFMA(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 27)]), VFNMS(LDK(KP831469612), T6n, T6k), ms, &(ri[WS(rs, 1)])); ST(&(ii[WS(rs, 19)]), VFNMS(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 3)]), VFMA(LDK(KP831469612), T8S, T8R), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 27)]), VFNMS(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 11)]), VFMA(LDK(KP831469612), T8U, T8T), ms, &(ii[WS(rs, 1)])); ST(&(ri[WS(rs, 3)]), VFMA(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 19)]), VFNMS(LDK(KP831469612), T6j, T64), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 31)]), VFMA(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)])); ST(&(ri[WS(rs, 15)]), VFNMS(LDK(KP980785280), T6D, T6A), ms, &(ri[WS(rs, 1)])); } } } } } } ST(&(ii[WS(rs, 31)]), VFMA(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)])); ST(&(ii[WS(rs, 15)]), VFNMS(LDK(KP980785280), T90, T8Z), ms, &(ii[WS(rs, 1)])); }}
/*
 * Twiddle instruction table referenced by `desc` below: one VTW(0, k) entry
 * for each k = 1..31, closed by a {TW_NEXT, (2 * VL), 0} entry.
 * NOTE(review): VTW and TW_NEXT are defined outside this chunk; presumably
 * each VTW entry describes the twiddle factors for one of the 31 inputs that
 * get multiplied by a twiddle -- confirm against the SIMD support header.
 */
static const tw_instr twinstr[] = { VTW(0, 1), VTW(0, 2), VTW(0, 3), VTW(0, 4), VTW(0, 5), VTW(0, 6), VTW(0, 7), VTW(0, 8), VTW(0, 9), VTW(0, 10), VTW(0, 11), VTW(0, 12), VTW(0, 13), VTW(0, 14), VTW(0, 15), VTW(0, 16), VTW(0, 17), VTW(0, 18), VTW(0, 19), VTW(0, 20), VTW(0, 21), VTW(0, 22), VTW(0, 23), VTW(0, 24), VTW(0, 25), VTW(0, 26), VTW(0, 27), VTW(0, 28), VTW(0, 29), VTW(0, 30), VTW(0, 31), {TW_NEXT, (2 * VL), 0}};
/*
 * Descriptor handed to the registration call below. It carries, in order:
 * the value 32, the name string "t1sv_32", the twiddle table above, &GENUS,
 * a four-number group {236, 62, 198, 0}, and three trailing zeros.
 * NOTE(review): the four-number group looks like operation counts
 * (additions, multiplications, fused multiply-adds, other) -- confirm
 * against the ct_desc declaration, which is not visible in this chunk.
 */
static const ct_desc desc = { 32, "t1sv_32", twinstr, &GENUS, {236, 62, 198, 0}, 0, 0, 0 };
/*
 * Registration entry point: passes the t1sv_32 kernel together with its
 * descriptor to X(kdft_dit_register) for planner `p`. It does nothing else.
 */
void X(codelet_t1sv_32) (planner *p) { X(kdft_dit_register) (p, t1sv_32, &desc);}
#else /* HAVE_FMA 
*//* Generated by: ../../../genfft/gen_twiddle -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include ts.h *//* * This function contains 434 FP additions, 208 FP multiplications, * (or, 340 additions, 114 multiplications, 94 fused multiply/add), * 96 stack variables, 7 constants, and 128 memory accesses */#include "ts.h"static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms){ DVK(KP195090322, +0.195090322016128267848284868477022240927691618); DVK(KP980785280, +0.980785280403230449126182236134239036973933731); DVK(KP555570233, +0.555570233019602224742830813948532874374937191); DVK(KP831469612, +0.831469612302545237078788377617905756738560812); DVK(KP382683432, +0.382683432365089771728459984030398866761344562); DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii = ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(rs)) { V Tj, T5F, T7C, T7Q, T35, T4T, T78, T7m, T1Q, T61, T5Y, T6J, T3K, T59, T41; V T56, T2B, T67, T6e, T6O, T4b, T5d, T4s, T5g, TG, T7l, T5I, T73, T3a, T4U; V T3f, T4V, T14, T5N, T5M, T6E, T3m, T4Y, T3r, T4Z, T1r, T5P, T5S, T6F, T3x; V T51, T3C, T52, T2d, T5Z, T64, T6K, T3V, T57, T44, T5a, T2Y, T6f, T6a, T6P; V T4m, T5h, T4v, T5e; { V T1, T76, T6, T75, Tc, T32, Th, T33; T1 = LD(&(ri[0]), ms, &(ri[0])); T76 = LD(&(ii[0]), ms, &(ii[0])); { V T3, T5, T2, T4; T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0])); T5 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0])); T2 = LDW(&(W[TWVL * 30])); T4 = LDW(&(W[TWVL * 31])); T6 = VFMA(T2, T3, VMUL(T4, T5)); T75 = VFNMS(T4, T3, VMUL(T2, T5)); } { V T9, Tb, T8, Ta; T9 = LD(&(ri[WS(rs, 8)]), ms, &(ri[0])); Tb = LD(&(ii[WS(rs, 8)]), ms, &(ii[0])); T8 = LDW(&(W[TWVL * 14])); Ta = LDW(&(W[TWVL * 15])); Tc = VFMA(T8, T9, VMUL(Ta, Tb)); T32 = VFNMS(Ta, T9, VMUL(T8, Tb)); } { V Te, Tg, Td, Tf; Te = 
LD(&(ri[WS(rs, 24)]), ms, &(ri[0])); Tg = LD(&(ii[WS(rs, 24)]), ms, &(ii[0])); Td = LDW(&(W[TWVL * 46])); Tf = LDW(&(W[TWVL * 47])); Th = VFMA(Td, Te, VMUL(Tf, Tg)); T33 = VFNMS(Tf, Te, VMUL(Td, Tg)); } { V T7, Ti, T7A, T7B; T7 = VADD(T1, T6); Ti = VADD(Tc, Th); Tj = VADD(T7, Ti); T5F = VSUB(T7, Ti); T7A = VSUB(T76, T75); T7B = VSUB(Tc, Th); T7C = VSUB(T7A, T7B); T7Q = VADD(T7B, T7A); } { V T31, T34, T74, T77; T31 = VSUB(T1, T6); T34 = VSUB(T32, T33); T35 = VSUB(T31, T34); T4T = VADD(T31, T34); T74 = VADD(T32, T33); T77 = VADD(T75, T76); T78 = VADD(T74, T77); T7m = VSUB(T77, T74); } } { V T1y, T3G, T1O, T3Z, T1D, T3H, T1J, T3Y; { V T1v, T1x, T1u, T1w; T1v = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)])); T1x = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)])); T1u = LDW(&(W[0])); T1w = LDW(&(W[TWVL * 1])); T1y = VFMA(T1u, T1v, VMUL(T1w, T1x)); T3G = VFNMS(T1w, T1v, VMUL(T1u, T1x)); } { V T1L, T1N, T1K, T1M; T1L = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)])); T1N = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)])); T1K = LDW(&(W[TWVL * 48])); T1M = LDW(&(W[TWVL * 49])); T1O = VFMA(T1K, T1L, VMUL(T1M, T1N)); T3Z = VFNMS(T1M, T1L, VMUL(T1K, T1N)); } { V T1A, T1C, T1z, T1B; T1A = LD(&(ri[WS(rs, 17)]), ms, &(ri[WS(rs, 1)])); T1C = LD(&(ii[WS(rs, 17)]), ms, &(ii[WS(rs, 1)])); T1z = LDW(&(W[TWVL * 32])); T1B = LDW(&(W[TWVL * 33])); T1D = VFMA(T1z, T1A, VMUL(T1B, T1C)); T3H = VFNMS(T1B, T1A, VMUL(T1z, T1C)); } { V T1G, T1I, T1F, T1H; T1G = LD(&(ri[WS(rs, 9)]), ms, &(ri[WS(rs, 1)])); T1I = LD(&(ii[WS(rs, 9)]), ms, &(ii[WS(rs, 1)])); T1F = LDW(&(W[TWVL * 16])); T1H = LDW(&(W[TWVL * 17])); T1J = VFMA(T1F, T1G, VMUL(T1H, T1I)); T3Y = VFNMS(T1H, T1G, VMUL(T1F, T1I)); } { V T1E, T1P, T5W, T5X; T1E = VADD(T1y, T1D); T1P = VADD(T1J, T1O); T1Q = VADD(T1E, T1P); T61 = VSUB(T1E, T1P); T5W = VADD(T3G, T3H); T5X = VADD(T3Y, T3Z); T5Y = VSUB(T5W, T5X); T6J = VADD(T5W, T5X); } { V T3I, T3J, T3X, T40; T3I = VSUB(T3G, T3H); T3J = VSUB(T1J, T1O); T3K = VADD(T3I, T3J); T59 = VSUB(T3I, T3J); T3X = VSUB(T1y, 
T1D); T40 = VSUB(T3Y, T3Z); T41 = VSUB(T3X, T40); T56 = VADD(T3X, T40); } } { V T2j, T4o, T2z, T49, T2o, T4p, T2u, T48; { V T2g, T2i, T2f, T2h; T2g = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)])); T2i = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)])); T2f = LDW(&(W[TWVL * 60])); T2h = LDW(&(W[TWVL * 61])); T2j = VFMA(T2f, T2g, VMUL(T2h, T2i)); T4o = VFNMS(T2h, T2g, VMUL(T2f, T2i)); } { V T2w, T2y, T2v, T2x; T2w = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)])); T2y = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)])); T2v = LDW(&(W[TWVL * 44])); T2x = LDW(&(W[TWVL * 45])); T2z = VFMA(T2v, T2w, VMUL(T2x, T2y)); T49 = VFNMS(T2x, T2w, VMUL(T2v, T2y)); } { V T2l, T2n, T2k, T2m; T2l = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)])); T2n = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)])); T2k = LDW(&(W[TWVL * 28])); T2m = LDW(&(W[TWVL * 29])); T2o = VFMA(T2k, T2l, VMUL(T2m, T2n)); T4p = VFNMS(T2m, T2l, VMUL(T2k, T2n)); } { V T2r, T2t, T2q, T2s; T2r = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)])); T2t = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)])); T2q = LDW(&(W[TWVL * 12])); T2s = LDW(&(W[TWVL * 13])); T2u = VFMA(T2q, T2r, VMUL(T2s, T2t)); T48 = VFNMS(T2s, T2r, VMUL(T2q, T2t)); } { V T2p, T2A, T6c, T6d;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -