📄 hc2cfdftv_32.c
字号:
T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u)); T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o))); T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o)); ST(&(Rp[0]), T4i, ms, &(Rp[0])); ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)])); ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0])); ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0])); ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0])); ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0])); ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0])); ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)])); { V T2A, T2W, T2L, T2Z, T2D, T2N, T2M, T2G, T3T, T3X, T16, T2p, T1v, T35, T31; V T2I, T2S, T34, T2Y, T2P, T2T, T1Y, T2H, T30, T3Z, T3Y, T3U, T3V, T2O, T2X; V T32, T33, T36, T37, T2U, T2V, T2Q, T2R, T1Z, T2q; T2A = VFNMS(LDK(KP923879532), T2z, T2y); T2W = VFMA(LDK(KP923879532), T2z, T2y); T2L = VFNMS(LDK(KP923879532), T2K, T2J); T2Z = VFMA(LDK(KP923879532), T2K, T2J); T2D = VFMA(LDK(KP198912367), T2C, T2B); T2N = VFNMS(LDK(KP198912367), T2B, T2C); T2M = VFMA(LDK(KP198912367), T2E, T2F); T2G = VFNMS(LDK(KP198912367), T2F, T2E); T3T = VFMA(LDK(KP923879532), T3S, T3R); T3X = VFNMS(LDK(KP923879532), T3S, T3R); T16 = VFNMS(LDK(KP923879532), T15, Ts); T2m = VFMA(LDK(KP923879532), T15, Ts); T2H = VSUB(T2D, T2G); T30 = VADD(T2D, T2G); T2b = VFNMS(LDK(KP923879532), T2a, T27); T2p = VFMA(LDK(KP923879532), T2a, T27); T1v = VFMA(LDK(KP668178637), T1u, T1n); T2c = VFNMS(LDK(KP668178637), T1n, T1u); T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W))); T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W)); T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q))); T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q)); T2O = VSUB(T2M, T2N); T2X = VADD(T2N, T2M); T35 = VFNMS(LDK(KP980785280), T30, T2Z); T31 = VFMA(LDK(KP980785280), T30, T2Z); T2I = VFMA(LDK(KP980785280), T2H, T2A); T2S = VFNMS(LDK(KP980785280), T2H, T2A); 
ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0])); ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0])); ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)])); T34 = VFNMS(LDK(KP980785280), T2X, T2W); T2Y = VFMA(LDK(KP980785280), T2X, T2W); T2P = VFMA(LDK(KP980785280), T2O, T2L); T2T = VFNMS(LDK(KP980785280), T2O, T2L); T2d = VFMA(LDK(KP668178637), T1Q, T1X); T1Y = VFNMS(LDK(KP668178637), T1X, T1Q); T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y)); T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y))); T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34))); T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34)); T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S)); T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S))); T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I))); T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I)); T1Z = VSUB(T1v, T1Y); T2q = VADD(T1Y, T1v); ST(&(Rm[0]), T33, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)])); ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0])); ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)])); ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0])); T2v = VFNMS(LDK(KP831469612), T2q, T2p); T2r = VFMA(LDK(KP831469612), T2q, T2p); T20 = VFMA(LDK(KP831469612), T1Z, T16); T2i = VFNMS(LDK(KP831469612), T1Z, T16); } } } T2n = VADD(T2d, T2c); T2e = VSUB(T2c, T2d); T2o = VFMA(LDK(KP831469612), T2n, T2m); T2u = VFNMS(LDK(KP831469612), T2n, T2m); T2j = VFMA(LDK(KP831469612), T2e, T2b); T2f = VFNMS(LDK(KP831469612), T2e, T2b); T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o))); T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o)); T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u))); T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u)); T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i))); T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i)); T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20))); T2g = VMUL(LDK(KP500000000), 
VFNMSI(T2f, T20)); ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)])); }}
/*
 * Twiddle-instruction table for the hc2cfdftv_32 codelet: one VTW(1, k)
 * entry for each k = 1..31, followed by a single {TW_NEXT, VL, 0} entry.
 * NOTE(review): VTW, TW_NEXT and VL are defined outside this view; the
 * final entry presumably terminates the list -- confirm against the
 * twiddle/SIMD headers.
 */
static const tw_instr twinstr[] = { VTW(1, 1), VTW(1, 2), VTW(1, 3), VTW(1, 4), VTW(1, 5), VTW(1, 6), VTW(1, 7), VTW(1, 8), VTW(1, 9), VTW(1, 10), VTW(1, 11), VTW(1, 12), VTW(1, 13), VTW(1, 14), VTW(1, 15), VTW(1, 16), VTW(1, 17), VTW(1, 18), VTW(1, 19), VTW(1, 20), VTW(1, 21), VTW(1, 22), VTW(1, 23), VTW(1, 24), VTW(1, 25), VTW(1, 26), VTW(1, 27), VTW(1, 28), VTW(1, 29), VTW(1, 30), VTW(1, 31), {TW_NEXT, VL, 0}};
/*
 * Codelet descriptor handed to the registration call below.  Fields, in
 * order: the integer 32, the codelet name string, the twiddle table
 * above, a pointer to GENUS, and the four numbers {119, 94, 130, 0}.
 * NOTE(review): the leading 32 presumably is the transform size/radix and
 * the four numbers presumably are operation counts (adds, muls, fmas,
 * other) for this branch's kernel -- confirm against the hc2c_desc
 * definition, which is not visible here.
 */
static const hc2c_desc desc = { 32, "hc2cfdftv_32", twinstr, &GENUS, {119, 94, 130, 0} };
/*
 * Registration entry point for this codelet.  Passes the planner `p`, the
 * kernel function hc2cfdftv_32, the address of `desc`, and the constant
 * HC2C_VIA_DFT to X(khc2c_register).  Returns nothing.
 */
void X(codelet_hc2cfdftv_32) (planner *p) { X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);}
/*
 * Everything above belongs to the preprocessor branch that precedes this
 * #else; the matching #if is outside this view (the trailing comment
 * names HAVE_FMA).  The alternative kernel follows.
 */
#else /* HAVE_FMA */
/* Generated by: ../../../genfft/gen_hc2cdft_c -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include hc2cfv.h */
/*
 * This function contains 249 FP additions, 133 FP multiplications,
 * (or, 233 additions, 117 multiplications, 16 fused multiply/add),
 * 130 stack variables, 9 constants, and 64 memory accesses
 */
#include "hc2cfv.h"
static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms){ DVK(KP555570233, +0.555570233019602224742830813948532874374937191); DVK(KP831469612, +0.831469612302545237078788377617905756738560812); DVK(KP195090322, +0.195090322016128267848284868477022240927691618); DVK(KP980785280, +0.980785280403230449126182236134239036973933731); DVK(KP382683432, +0.382683432365089771728459984030398866761344562); DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP707106781, 
+0.707106781186547524400844362104849039284835938); DVK(KP353553390, +0.353553390593273762200422181052424519642417969); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); INT m; for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(rs)) { V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O; V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V; V T2d, T2U; { V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk; V T1t, Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C; V T1u, T1z; { V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6; V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts; V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s; T1 = LD(&(Rp[0]), ms, &(Rp[0])); T2 = LD(&(Rm[0]), -ms, &(Rm[0])); T3 = VCONJ(T2); T4 = VADD(T1, T3); T1l = LDW(&(W[0])); T1m = VZMULIJ(T1l, VSUB(T3, T1)); T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0])); T1F = VCONJ(T1E); T1D = LDW(&(W[TWVL * 16])); T1H = VZMULIJ(T1D, VSUB(T1F, T1G)); T2i = LDW(&(W[TWVL * 14])); T2j = VZMULJ(T2i, VADD(T1G, T1F)); T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0])); T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0])); T1K = VCONJ(T1J); T1I = LDW(&(W[TWVL * 48])); T1M = VZMULIJ(T1I, VSUB(T1K, T1L)); T2k = LDW(&(W[TWVL * 46])); T2l = VZMULJ(T2k, VADD(T1L, T1K)); T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0])); T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0])); T8 = VCONJ(T7); T5 = LDW(&(W[TWVL * 30])); T9 = VZMULJ(T5, VADD(T6, T8)); T1n = LDW(&(W[TWVL * 32])); T1o = VZMULIJ(T1n, VSUB(T8, T6)); Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); Te = VCONJ(Td); Tb = LDW(&(W[TWVL * 6])); Tf = VZMULJ(Tb, VADD(Tc, Te)); T1q = LDW(&(W[TWVL * 8])); T1r = VZMULIJ(T1q, VSUB(Te, Tc)); Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0])); 
To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0])); Tp = VCONJ(To); Tm = LDW(&(W[TWVL * 54])); Tq = VZMULJ(Tm, VADD(Tn, Tp)); T1v = LDW(&(W[TWVL * 56])); T1w = VZMULIJ(T1v, VSUB(Tp, Tn)); Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0])); Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0])); Tu = VCONJ(Tt); Tr = LDW(&(W[TWVL * 22])); Tv = VZMULJ(Tr, VADD(Ts, Tu)); T1x = LDW(&(W[TWVL * 24])); T1y = VZMULIJ(T1x, VSUB(Tu, Ts)); Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0])); Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0])); Tj = VCONJ(Ti); Tg = LDW(&(W[TWVL * 38])); Tk = VZMULJ(Tg, VADD(Th, Tj)); T1s = LDW(&(W[TWVL * 40])); T1t = VZMULIJ(T1s, VSUB(Tj, Th)); } Ta = VMUL(LDK(KP500000000), VSUB(T4, T9)); T2m = VSUB(T2j, T2l); Tl = VSUB(Tf, Tk); Tw = VSUB(Tq, Tv); Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw)); T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl)); T3P = VADD(Tq, Tv); T3Q = VADD(Tf, Tk); T3R = VSUB(T3P, T3Q); T4h = VADD(T3Q, T3P); T3o = VADD(T4, T9); T3p = VADD(T2j, T2l); T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p)); T4g = VADD(T3o, T3p); T3z = VADD(T1m, T1o); T3A = VADD(T1H, T1M); T3B = VSUB(T3z, T3A); T4n = VADD(T3z, T3A); T3C = VADD(T1w, T1y); T3D = VADD(T1r, T1t); T3E = VSUB(T3C, T3D); T4o = VADD(T3D, T3C); T1p = VSUB(T1m, T1o); T1N = VSUB(T1H, T1M); T1u = VSUB(T1r, T1t); T1z = VSUB(T1w, T1y); T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z)); T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u)); T1B = VADD(T1p, T1A); T2S = VADD(T1N, T1C); T1O = VSUB(T1C, T1N); T2R = VSUB(T1p, T1A); } { V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT; V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c; V T20, T27, T1U, T1Z; { V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a; V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21; V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V; TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); TC = VCONJ(TB); Tz = LDW(&(W[TWVL * 2])); TD = VZMULJ(Tz, VADD(TA, TC)); T1Q = 
LDW(&(W[TWVL * 4])); T1R = VZMULIJ(T1Q, VSUB(TC, TA)); T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); T1a = VCONJ(T19); T17 = LDW(&(W[TWVL * 10])); T1b = VZMULJ(T17, VADD(T18, T1a)); T28 = LDW(&(W[TWVL * 12]));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -