📄 hc2cbdftv_32.c
字号:
T3J = VZMULI(T3I, VFNMSI(T1f, T12)); T1g = VZMULI(T1, VFMAI(T1f, T12)); T2Q = VFNMS(LDK(KP980785280), T2v, T2o); T2w = VFMA(LDK(KP980785280), T2v, T2o); T2R = VFMA(LDK(KP980785280), T2C, T2z); T2D = VFNMS(LDK(KP980785280), T2C, T2z); T1R = LDW(&(W[TWVL * 40])); T4s = LDW(&(W[TWVL * 52])); T23 = LDW(&(W[TWVL * 8])); T42 = LDW(&(W[TWVL * 20])); T4f = VZMULI(T4e, VFNMSI(T2R, T2Q)); T2S = VZMULI(T2P, VFMAI(T2R, T2Q)); T4l = VZMULI(T4k, VFNMSI(T2D, T2w)); T2E = VZMULI(T2l, VFMAI(T2D, T2w)); T24 = VFMA(LDK(KP831469612), T1T, T1S); T1U = VFNMS(LDK(KP831469612), T1T, T1S); T25 = VFMA(LDK(KP831469612), T1W, T1V); T1X = VFNMS(LDK(KP831469612), T1W, T1V); T2X = LDW(&(W[TWVL * 32])); T3O = LDW(&(W[TWVL * 60])); T3b = LDW(&(W[0])); T3i = LDW(&(W[TWVL * 28])); T26 = VZMULI(T23, VFMAI(T25, T24)); T4t = VZMULI(T4s, VFNMSI(T25, T24)); T43 = VZMULI(T42, VFNMSI(T1X, T1U)); T1Y = VZMULI(T1R, VFMAI(T1X, T1U)); T3c = VFMA(LDK(KP980785280), T2Z, T2Y); T30 = VFNMS(LDK(KP980785280), T2Z, T2Y); T3d = VFMA(LDK(KP980785280), T32, T31); T33 = VFNMS(LDK(KP980785280), T32, T31); } } { V T3e, T3P, T3j, T34, T2c, T4j, T2k, T4d, T1P, T1Q, T4x, T4w, T2j, T4c, T21; V T22, T4r, T4q, T2b, T4i, T3h, T3H, T2N, T2O, T41, T40, T3g, T3G, T2V, T2W; V T3V, T3U, T39, T3M; T1P = VADD(T1g, T1O); T1Q = VCONJ(VSUB(T1O, T1g)); T4x = VCONJ(VSUB(T4v, T4t)); T4w = VADD(T4t, T4v); T2j = VADD(T2g, T2i); T2k = VCONJ(VSUB(T2i, T2g)); T4d = VCONJ(VSUB(T4b, T43)); T4c = VADD(T43, T4b); T3e = VZMULI(T3b, VFMAI(T3d, T3c)); T3P = VZMULI(T3O, VFNMSI(T3d, T3c)); T3j = VZMULI(T3i, VFNMSI(T33, T30)); T34 = VZMULI(T2X, VFMAI(T33, T30)); ST(&(Rp[WS(rs, 6)]), T1P, ms, &(Rp[0])); ST(&(Rp[WS(rs, 13)]), T4w, ms, &(Rp[WS(rs, 1)])); ST(&(Rp[WS(rs, 14)]), T2j, ms, &(Rp[0])); ST(&(Rp[WS(rs, 5)]), T4c, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 13)]), T4x, -ms, &(Rm[WS(rs, 1)])); ST(&(Rm[WS(rs, 6)]), T1Q, -ms, &(Rm[0])); T21 = VADD(T1Y, T20); T22 = VCONJ(VSUB(T20, T1Y)); T4r = VCONJ(VSUB(T4p, T4l)); T4q = VADD(T4l, T4p); T2b = VADD(T26, T2a); T2c 
= VCONJ(VSUB(T2a, T26)); /* completes the assignment to T2c begun on the preceding line */
                         T4j = VCONJ(VSUB(T4h, T4f));
                         T4i = VADD(T4f, T4h);
                         /*
                          * Output pattern used throughout this tail: for each pair
                          * of intermediates, the VADD result is stored through Rp
                          * with stride ms and the VCONJ(VSUB(...)) result is stored
                          * through Rm with stride -ms, at the same WS(rs, k) offset.
                          */
                         ST(&(Rm[WS(rs, 5)]), T4d, -ms, &(Rm[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 14)]), T2k, -ms, &(Rm[0]));
                         ST(&(Rp[WS(rs, 10)]), T21, ms, &(Rp[0]));
                         ST(&(Rp[WS(rs, 3)]), T4q, ms, &(Rp[WS(rs, 1)]));
                         ST(&(Rp[WS(rs, 2)]), T2b, ms, &(Rp[0]));
                         ST(&(Rp[WS(rs, 11)]), T4i, ms, &(Rp[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 3)]), T4r, -ms, &(Rm[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 10)]), T22, -ms, &(Rm[0]));
                         T2N = VADD(T2E, T2M);
                         T2O = VCONJ(VSUB(T2M, T2E));
                         T41 = VCONJ(VSUB(T3Z, T3X));
                         T40 = VADD(T3X, T3Z);
                         T3g = VADD(T3e, T3f);
                         T3h = VCONJ(VSUB(T3f, T3e));
                         T3H = VCONJ(VSUB(T3F, T3j));
                         T3G = VADD(T3j, T3F);
                         ST(&(Rm[WS(rs, 11)]), T4j, -ms, &(Rm[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 2)]), T2c, -ms, &(Rm[0]));
                         ST(&(Rp[WS(rs, 12)]), T2N, ms, &(Rp[0]));
                         ST(&(Rp[WS(rs, 1)]), T40, ms, &(Rp[WS(rs, 1)]));
                         ST(&(Rp[0]), T3g, ms, &(Rp[0]));
                         ST(&(Rp[WS(rs, 7)]), T3G, ms, &(Rp[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 1)]), T41, -ms, &(Rm[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 12)]), T2O, -ms, &(Rm[0]));
                         T2V = VADD(T2S, T2U);
                         T2W = VCONJ(VSUB(T2U, T2S));
                         T3V = VCONJ(VSUB(T3T, T3P));
                         T3U = VADD(T3P, T3T);
                         T39 = VADD(T34, T38);
                         /*
                          * T3a and T3N are not in this block's V declaration list;
                          * they are assigned here and stored only after the nested
                          * blocks close, so they are presumably declared in an
                          * enclosing scope outside this view.
                          */
                         T3a = VCONJ(VSUB(T38, T34));
                         T3N = VCONJ(VSUB(T3L, T3J));
                         T3M = VADD(T3J, T3L);
                         ST(&(Rm[WS(rs, 7)]), T3H, -ms, &(Rm[WS(rs, 1)]));
                         ST(&(Rm[0]), T3h, -ms, &(Rm[0]));
                         ST(&(Rp[WS(rs, 4)]), T2V, ms, &(Rp[0]));
                         ST(&(Rp[WS(rs, 15)]), T3U, ms, &(Rp[WS(rs, 1)]));
                         ST(&(Rp[WS(rs, 8)]), T39, ms, &(Rp[0]));
                         ST(&(Rp[WS(rs, 9)]), T3M, ms, &(Rp[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 15)]), T3V, -ms, &(Rm[WS(rs, 1)]));
                         ST(&(Rm[WS(rs, 4)]), T2W, -ms, &(Rm[0]));
                    }
               }
          }
          /* Final two Rm stores, issued after the nested blocks have closed. */
          ST(&(Rm[WS(rs, 9)]), T3N, -ms, &(Rm[WS(rs, 1)]));
          ST(&(Rm[WS(rs, 8)]), T3a, -ms, &(Rm[0]));
     }
}

/*
 * Twiddle instruction table for this codelet: one VTW(1, k) entry for each
 * k = 1..31, terminated by a {TW_NEXT, VL, 0} record.
 */
static const tw_instr twinstr[] = {
     VTW(1, 1),
     VTW(1, 2),
     VTW(1, 3),
     VTW(1, 4),
     VTW(1, 5),
     VTW(1, 6),
     VTW(1, 7),
     VTW(1, 8),
     VTW(1, 9),
     VTW(1, 10),
     VTW(1, 11),
     VTW(1, 12),
     VTW(1, 13),
     VTW(1, 14),
     VTW(1, 15),
     VTW(1, 16),
     VTW(1, 17),
     VTW(1, 18),
     VTW(1, 19),
     VTW(1, 20),
     VTW(1, 21),
     VTW(1, 22),
     VTW(1, 23),
     VTW(1, 24),
     VTW(1, 25),
     VTW(1, 26),
     VTW(1, 27),
     VTW(1, 28),
     VTW(1, 29),
     VTW(1, 30),
     VTW(1, 31),
     {TW_NEXT, VL, 0}
};

/*
 * Descriptor handed to the planner together with the codelet: the value 32,
 * the codelet name string, the twiddle table above, &GENUS, and a
 * four-number tuple.
 * NOTE(review): the tuple {119, 62, 130, 0} is presumably the operation count
 * (adds, muls, fmas, other) of the HAVE_FMA variant -- confirm against the
 * declaration of hc2c_desc, which is not visible in this file.
 */
static const hc2c_desc desc = { 32, "hc2cbdftv_32", twinstr, &GENUS, {119, 62, 130, 0} };

/*
 * Registration entry point: passes hc2cbdftv_32 and its descriptor to
 * X(khc2c_register) on planner p, with the HC2C_VIA_DFT flag.
 */
void X(codelet_hc2cbdftv_32) (planner *p) {
     X(khc2c_register) (p, hc2cbdftv_32, &desc, HC2C_VIA_DFT);
}
#else				/* HAVE_FMA */

/* Generated by: ../../../genfft/gen_hc2cdft_c -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dif -sign 1 -name hc2cbdftv_32 -include hc2cbv.h */

/*
 * This function contains 249 FP additions, 104 FP multiplications,
 * (or, 233 additions, 88 multiplications, 16 fused multiply/add),
 * 161 stack variables, 7 constants, and 64 memory accesses
 */
#include "hc2cbv.h"

/*
 * Non-FMA variant of the codelet (selected by the #else branch above).
 *
 * In the portion visible here, Rp is read with stride ms and Rm with stride
 * -ms; Ip and Im are only advanced by the loop header.  W is the twiddle
 * pointer, rs is the stride fed to WS(rs, k), and [mb, me) is the range of
 * the loop counter m.
 */
static void hc2cbdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /*
      * Vector constants.  Numerically these equal sin(pi/16), cos(pi/16),
      * sin(3*pi/16), cos(3*pi/16), cos(pi/8), sin(pi/8) and cos(pi/4),
      * in that order.
      */
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     INT m;
     /*
      * Each iteration handles VL values of m.  W is first offset to the
      * twiddles for m = mb (62 per step, scaled by TWVL / VL); afterwards
      * Rp/Ip move forward and Rm/Im move backward by VL * ms, and W moves
      * forward by TWVL * 62.
      */
     for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(rs)) {
          V T1W, T21, Tf, T2c, T1t, T2r, T3T, T4m, Ty, T2q, T3P, T4n, T1n, T2d, T1T;
          V T22, T1E, T24, T3I, T4p, TU, T2n, T1i, T2h, T1L, T25, T3L, T4q, T1f, T2o;
          V T1j, T2k;
          {
               V T2, T4, T1Z, T1p, T1r, T20, T9, T1U, Td, T1V, T3, T1q, T6, T8, T7;
               V Tc, Tb, Ta, T5, Te, T1o, T1s, T3R, T3S, Tj, T1N, Tw, T1Q, Tn, T1O;
               V Ts, T1R, Tg, Ti, Th, Tv, Tu, Tt, Tk, Tm, Tl, Tp, Tr, Tq, To;
               V Tx, T3N, T3O, T1l, T1m, T1P, T1S;
               T2 = LD(&(Rp[0]), ms, &(Rp[0]));
               T3 = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
               T4 =
VCONJ(T3); T1Z = VADD(T2, T4); T1p = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0])); T1q = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)])); T1r = VCONJ(T1q); T20 = VADD(T1p, T1r); T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); T7 = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)])); T8 = VCONJ(T7); T9 = VSUB(T6, T8); T1U = VADD(T6, T8); Tc = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0])); Ta = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); Tb = VCONJ(Ta); Td = VSUB(Tb, Tc); T1V = VADD(Tb, Tc); T1W = VSUB(T1U, T1V); T21 = VSUB(T1Z, T20); T5 = VSUB(T2, T4); Te = VMUL(LDK(KP707106781), VADD(T9, Td)); Tf = VSUB(T5, Te); T2c = VADD(T5, Te); T1o = VMUL(LDK(KP707106781), VSUB(T9, Td)); T1s = VSUB(T1p, T1r); T1t = VSUB(T1o, T1s); T2r = VADD(T1s, T1o); T3R = VADD(T1Z, T20); T3S = VADD(T1U, T1V); T3T = VSUB(T3R, T3S); T4m = VADD(T3R, T3S); Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); Th = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)])); Ti = VCONJ(Th); Tj = VSUB(Tg, Ti); T1N = VADD(Tg, Ti); Tv = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0])); Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); Tu = VCONJ(Tt); Tw = VSUB(Tu, Tv); T1Q = VADD(Tu, Tv); Tk = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0])); Tl = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)])); Tm = VCONJ(Tl); Tn = VSUB(Tk, Tm); T1O = VADD(Tk, Tm); Tp = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0])); Tq = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)])); Tr = VCONJ(Tq); Ts = VSUB(Tp, Tr); T1R = VADD(Tp, Tr); To = VFMA(LDK(KP382683432), Tj, VMUL(LDK(KP923879532), Tn)); Tx = VFNMS(LDK(KP382683432), Tw, VMUL(LDK(KP923879532), Ts)); Ty = VSUB(To, Tx); T2q = VADD(To, Tx); T3N = VADD(T1N, T1O); T3O = VADD(T1Q, T1R); T3P = VSUB(T3N, T3O); T4n = VADD(T3N, T3O); T1l = VFNMS(LDK(KP382683432), Tn, VMUL(LDK(KP923879532), Tj)); T1m = VFMA(LDK(KP923879532), Tw, VMUL(LDK(KP382683432), Ts)); T1n = VSUB(T1l, T1m); T2d = VADD(T1l, T1m); T1P = VSUB(T1N, T1O); T1S = VSUB(T1Q, T1R); T1T = VMUL(LDK(KP707106781), VSUB(T1P, T1S)); T22 = VMUL(LDK(KP707106781), VADD(T1P, T1S)); } { V TD, T1B, TR, T1y, TH, T1C, TM, T1z, TA, TC, TB, 
TO, TQ, TP, TG; V TF, TE, TJ, TL, TK, T1A, T1D, T3G, T3H, TN, T2f, TT, T2g, TI, TS; V TY, T1I, T1c, T1F, T12, T1J, T17, T1G, TV, TX, TW, T1b, T1a, T19, T11; V T10, TZ, T14, T16, T15, T1H, T1K, T3J, T3K, T18, T2i, T1e, T2j, T13, T1d; TA = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)])); TB = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0])); TC = VCONJ(TB); TD = VSUB(TA, TC); T1B = VADD(TA, TC); TO = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); TP = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0])); TQ = VCONJ(TP); TR = VSUB(TO, TQ); T1y = VADD(TO, TQ); TG = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)])); TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); TF = VCONJ(TE); TH = VSUB(TF, TG); T1C = VADD(TF, TG); TJ = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)])); TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0])); TL = VCONJ(TK); TM = VSUB(TJ, TL); T1z = VADD(TJ, TL); T1A = VSUB(T1y, T1z); T1D = VSUB(T1B, T1C); T1E = VFNMS(LDK(KP382683432), T1D, VMUL(LDK(KP923879532), T1A)); T24 = VFMA(LDK(KP382683432), T1A, VMUL(LDK(KP923879532), T1D)); T3G = VADD(T1y, T1z); T3H = VADD(T1B, T1C); T3I = VSUB(T3G, T3H); T4p = VADD(T3G, T3H); TI = VMUL(LDK(KP707106781), VSUB(TD, TH)); TN = VSUB(TI, TM); T2f = VADD(TM, TI); TS = VMUL(LDK(KP707106781), VADD(TD, TH)); TT = VSUB(TR, TS); T2g = VADD(TR, TS); TU = VFMA(LDK(KP831469612), TN, VMUL(LDK(KP555570233), TT)); T2n = VFNMS(LDK(KP195090322), T2f, VMUL(LDK(KP980785280), T2g)); T1i = VFNMS(LDK(KP555570233), TN, VMUL(LDK(KP831469612), TT));
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -