📄 hb_64.c
字号:
cr[WS(rs, 5)] = FNMS(T7C, T7D, T7B); T8u = T84 + T81; T85 = T81 - T84; T8b = FNMS(KP831469612, T8a, T89); T8t = FMA(KP831469612, T8a, T89); T7Y = FNMS(KP831469612, T7X, T7W); T8o = FMA(KP831469612, T7X, T7W); T8p = T8c + T8d; T8e = T8c - T8d; T86 = FNMS(KP956940335, T85, T7Y); T8i = FMA(KP956940335, T85, T7Y); T8y = FMA(KP956940335, T8p, T8o); T8q = FNMS(KP956940335, T8p, T8o); T8l = FMA(KP956940335, T8e, T8b); T8f = FNMS(KP956940335, T8e, T8b); } { E T8k, T8j, T7V, T88, T8v, T8s, T8n; T7V = W[88]; T88 = W[89]; { E T8h, T8g, T87, T8m; T8h = W[24]; T8k = W[25]; T8g = T7V * T8f; T87 = T7V * T86; T8m = T8h * T8l; T8j = T8h * T8i; ci[WS(rs, 45)] = FMA(T88, T86, T8g); cr[WS(rs, 45)] = FNMS(T88, T8f, T87); ci[WS(rs, 13)] = FMA(T8k, T8i, T8m); } cr[WS(rs, 13)] = FNMS(T8k, T8l, T8j); T8B = FMA(KP956940335, T8u, T8t); T8v = FNMS(KP956940335, T8u, T8t); T8s = W[57]; T8n = W[56]; { E T8x, T8C, T8w, T8r; T8A = W[121]; T8w = T8s * T8q; T8r = T8n * T8q; T8x = W[120]; T8C = T8A * T8y; ci[WS(rs, 29)] = FMA(T8n, T8v, T8w); cr[WS(rs, 29)] = FNMS(T8s, T8v, T8r); T8z = T8x * T8y; ci[WS(rs, 61)] = FMA(T8x, T8B, T8C); } } } } { E Ta5, Ta4, Ta3, TeN, TeM, TeL; { E T9V, T9Y, Tai, Taa, Tal, Taf, Ta2, T9I; { E T9n, T9G, Tad, Ta9, T94, Ta8, T9W, T9X, Tae, T9H; cr[WS(rs, 61)] = FNMS(T8A, T8B, T8z); T9n = FNMS(KP534511135, T9m, T9f); T9W = FMA(KP534511135, T9f, T9m); T9X = FMA(KP534511135, T9y, T9F); T9G = FNMS(KP534511135, T9F, T9y); T9V = FMA(KP831469612, T9U, T9R); Tad = FNMS(KP831469612, T9U, T9R); Ta9 = T9W + T9X; T9Y = T9W - T9X; T94 = FNMS(KP831469612, T93, T8O); Ta8 = FMA(KP831469612, T93, T8O); Tae = T9G - T9n; T9H = T9n + T9G; Tai = FMA(KP881921264, Ta9, Ta8); Taa = FNMS(KP881921264, Ta9, Ta8); Tal = FNMS(KP881921264, Tae, Tad); Taf = FMA(KP881921264, Tae, Tad); Ta2 = FNMS(KP881921264, T9H, T94); T9I = FMA(KP881921264, T9H, T94); } { E Tak, Taj, Ta7, Tac, T9Z, T9K, T8D; Ta7 = W[52]; Tac = W[53]; { E Tah, Tag, Tab, Tam; Tah = W[116]; Tak = W[117]; Tag = Ta7 * Taf; Tab = Ta7 * Taa; Tam 
= Tah * Tal; Taj = Tah * Tai; ci[WS(rs, 27)] = FMA(Tac, Taa, Tag); cr[WS(rs, 27)] = FNMS(Tac, Taf, Tab); ci[WS(rs, 59)] = FMA(Tak, Tai, Tam); } cr[WS(rs, 59)] = FNMS(Tak, Tal, Taj); Ta5 = FMA(KP881921264, T9Y, T9V); T9Z = FNMS(KP881921264, T9Y, T9V); T9K = W[85]; T8D = W[84]; { E Ta1, Ta6, Ta0, T9J; Ta4 = W[21]; Ta0 = T9K * T9I; T9J = T8D * T9I; Ta1 = W[20]; Ta6 = Ta4 * Ta2; ci[WS(rs, 43)] = FMA(T8D, T9Z, Ta0); cr[WS(rs, 43)] = FNMS(T9K, T9Z, T9J); Ta3 = Ta1 * Ta2; ci[WS(rs, 11)] = FMA(Ta1, Ta5, Ta6); } } } { E TeD, TeG, Tf0, TeS, Tf3, TeX, TeK, Teo; { E Tem, TdV, TeV, TeR, Tdu, TeQ, TeE, TeF, TeW, Ten; cr[WS(rs, 11)] = FNMS(Ta4, Ta5, Ta3); Tem = FMA(KP668178637, Tel, Tec); TeE = FNMS(KP668178637, Tec, Tel); TeF = FMA(KP668178637, TdL, TdU); TdV = FNMS(KP668178637, TdU, TdL); TeD = FNMS(KP923879532, TeC, Tez); TeV = FMA(KP923879532, TeC, Tez); TeR = TeE + TeF; TeG = TeE - TeF; Tdu = FNMS(KP923879532, Tdt, Td6); TeQ = FMA(KP923879532, Tdt, Td6); TeW = Tem + TdV; Ten = TdV - Tem; Tf0 = FMA(KP831469612, TeR, TeQ); TeS = FNMS(KP831469612, TeR, TeQ); Tf3 = FMA(KP831469612, TeW, TeV); TeX = FNMS(KP831469612, TeW, TeV); TeK = FMA(KP831469612, Ten, Tdu); Teo = FNMS(KP831469612, Ten, Tdu); } { E Tf2, Tf1, TeP, TeU, TeH, Teq, TcP; TeP = W[74]; TeU = W[75]; { E TeZ, TeY, TeT, Tf4; TeZ = W[10]; Tf2 = W[11]; TeY = TeP * TeX; TeT = TeP * TeS; Tf4 = TeZ * Tf3; Tf1 = TeZ * Tf0; ci[WS(rs, 38)] = FMA(TeU, TeS, TeY); cr[WS(rs, 38)] = FNMS(TeU, TeX, TeT); ci[WS(rs, 6)] = FMA(Tf2, Tf0, Tf4); } cr[WS(rs, 6)] = FNMS(Tf2, Tf3, Tf1); TeN = FMA(KP831469612, TeG, TeD); TeH = FNMS(KP831469612, TeG, TeD); Teq = W[107]; TcP = W[106]; { E TeJ, TeO, TeI, Tep; TeM = W[43]; TeI = Teq * Teo; Tep = TcP * Teo; TeJ = W[42]; TeO = TeM * TeK; ci[WS(rs, 54)] = FMA(TcP, TeH, TeI); cr[WS(rs, 54)] = FNMS(Teq, TeH, Tep); TeL = TeJ * TeK; ci[WS(rs, 22)] = FMA(TeJ, TeN, TeO); } } } { E Tcn, Tcq, TcK, TcC, TcN, TcH, Tcu, Tci; { E Tcd, Tcg, TcF, TcB, Tca, TcA, Tco, Tcp, TcG, Tch; cr[WS(rs, 22)] = FNMS(TeM, TeN, 
TeL); Tcd = FNMS(KP098491403, Tcc, Tcb); Tco = FMA(KP098491403, Tcb, Tcc); Tcp = FMA(KP098491403, Tce, Tcf); Tcg = FNMS(KP098491403, Tcf, Tce); Tcn = FMA(KP980785280, Tcm, Tcl); TcF = FNMS(KP980785280, Tcm, Tcl); TcB = Tco + Tcp; Tcq = Tco - Tcp; Tca = FNMS(KP980785280, Tc9, Tc8); TcA = FMA(KP980785280, Tc9, Tc8); TcG = Tcg - Tcd; Tch = Tcd + Tcg; TcK = FMA(KP995184726, TcB, TcA); TcC = FNMS(KP995184726, TcB, TcA); TcN = FNMS(KP995184726, TcG, TcF); TcH = FMA(KP995184726, TcG, TcF); Tcu = FNMS(KP995184726, Tch, Tca); Tci = FMA(KP995184726, Tch, Tca); } { E TcM, TcL, Tcz, TcE, Tcr, Tck, Tc7; Tcz = W[60]; TcE = W[61]; { E TcJ, TcI, TcD, TcO; TcJ = W[124]; TcM = W[125]; TcI = Tcz * TcH; TcD = Tcz * TcC; TcO = TcJ * TcN; TcL = TcJ * TcK; ci[WS(rs, 31)] = FMA(TcE, TcC, TcI); cr[WS(rs, 31)] = FNMS(TcE, TcH, TcD); ci[WS(rs, 63)] = FMA(TcM, TcK, TcO); } cr[WS(rs, 63)] = FNMS(TcM, TcN, TcL); Tcx = FMA(KP995184726, Tcq, Tcn); Tcr = FNMS(KP995184726, Tcq, Tcn); Tck = W[93]; Tc7 = W[92]; { E Tct, Tcy, Tcs, Tcj; Tcw = W[29]; Tcs = Tck * Tci; Tcj = Tc7 * Tci; Tct = W[28]; Tcy = Tcw * Tcu; ci[WS(rs, 47)] = FMA(Tc7, Tcr, Tcs); cr[WS(rs, 47)] = FNMS(Tck, Tcr, Tcj); Tcv = Tct * Tcu; ci[WS(rs, 15)] = FMA(Tct, Tcx, Tcy); } } } } } } } cr[WS(rs, 15)] = FNMS(Tcw, Tcx, Tcv); }}static const tw_instr twinstr[] = { /* Twiddle instruction table referenced by desc: one TW_FULL entry with arguments (1, 64), then a TW_NEXT entry with (1, 0). NOTE(review): the meaning of TW_FULL/TW_NEXT is defined outside this file view - confirm before editing. */{TW_FULL, 1, 64}, {TW_NEXT, 1, 0}};static const hc2hc_desc desc = { /* Descriptor passed to the registration call: leading field 64 (same value as the -n 64 generator option), the name string "hb_64", the twiddle table, &GENUS, and a four-number tuple. NOTE(review): {520, 126, 518, 0} presumably holds operation counts for the FMA variant - confirm against the hc2hc_desc declaration. */64, "hb_64", twinstr, &GENUS, {520, 126, 518, 0} };void X(codelet_hb_64) (planner *p) { /* Registers hb_64 together with its descriptor on planner p through X(khc2hc_register). */X(khc2hc_register) (p, hb_64, &desc);}#else /* HAVE_FMA *//* Generated by: ../../../genfft/gen_hc2hc -compact -variables 4 -pipeline-latency 4 -sign 1 -n 64 -dif -name hb_64 -include hb.h *//* This function contains 1038 FP additions, 500 FP multiplications (or, 808 additions, 270 multiplications, 230 fused multiply/add), 196 stack variables, 15 constants, and 256 memory accesses. */#include "hb.h"static void hb_64(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms){ DK(KP098017140, 
+0.098017140329560601994195563888641845861136673); DK(KP995184726, +0.995184726672196886244836953109479921575474869); DK(KP773010453, +0.773010453362736960810906609758469800971041293); DK(KP634393284, +0.634393284163645498215171613225493370675687095); DK(KP471396736, +0.471396736825997648556387625905254377657460319); DK(KP881921264, +0.881921264348355029712756863660388349508442621); DK(KP956940335, +0.956940335732208864935797886980269969482849206); DK(KP290284677, +0.290284677254462367636192375817395274691476278); DK(KP195090322, +0.195090322016128267848284868477022240927691618); DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP555570233, +0.555570233019602224742830813948532874374937191); DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP382683432, +0.382683432365089771728459984030398866761344562); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + ((mb - 1) * 126); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 126, MAKE_VOLATILE_STRIDE(rs)) { E Tf, T8C, Tfa, Thk, Tgg, ThM, T2c, T5O, T4K, T6g, Tag, TdE, TcA, Te6, T7P; E T94, TK, T7o, T38, T4P, Tfv, Thn, T5W, T6j, Tb0, TdK, Tfs, Tho, T8K, T97; E Tb7, TdL, TZ, T7l, T2P, T4Q, Tfo, Thq, T5T, T6k, TaH, TdH, Tfl, Thr, T8H; E T98, TaO, TdI, Tu, T95, Tfh, ThN, Tgj, Thl, T2v, T6h, T4N, T5P, Tav, Te7; E TcD, TdF, T7S, T8D, T1L, T20, T7A, T7D, T7G, T7H, T40, T62, Tg1, Thv, Tg8; E Thz, Tg5, Thw, T4t, T5Z, T4j, T60, T4w, T63, TbY, TdS, Tcd, TdQ, TfU, Thy; E T8P, T9z, T8S, T9A, Tcl, TdP, Tco, TdT, T1g, T1v, T7r, T7u, T7x, T7y, T3j; E T69, TfI, ThD, TfP, ThG, TfM, ThC, T3M, T66, T3C, T67, T3P, T6a, Tbl, TdZ; E TbA, TdX, TfB, ThF, T8W, T9C, T8Z, T9D, TbI, TdW, TbL, Te0; { E T3, Ta6, T6, Tcu, T4I, Ta7, T4F, Tcv, Td, Tcy, T27, Tae, Ta, Tcx, T2a; E Tab; { E T1, T2, T4D, T4E; T1 = cr[0]; T2 = ci[WS(rs, 31)]; T3 = T1 + T2; Ta6 = T1 - T2; { E T4, T5, T4G, T4H; T4 = cr[WS(rs, 
16)]; T5 = ci[WS(rs, 15)]; T6 = T4 + T5; Tcu = T4 - T5; T4G = ci[WS(rs, 47)]; T4H = cr[WS(rs, 48)]; T4I = T4G - T4H; Ta7 = T4G + T4H; } T4D = ci[WS(rs, 63)]; T4E = cr[WS(rs, 32)]; T4F = T4D - T4E; Tcv = T4D + T4E; { E Tb, Tc, Tac, T25, T26, Tad; Tb = ci[WS(rs, 7)]; Tc = cr[WS(rs, 24)]; Tac = Tb - Tc; T25 = ci[WS(rs, 39)]; T26 = cr[WS(rs, 56)]; Tad = T25 + T26; Td = Tb + Tc; Tcy = Tac + Tad; T27 = T25 - T26; Tae = Tac - Tad; } { E T8, T9, Ta9, T28, T29, Taa; T8 = cr[WS(rs, 8)]; T9 = ci[WS(rs, 23)]; Ta9 = T8 - T9; T28 = ci[WS(rs, 55)]; T29 = cr[WS(rs, 40)]; Taa = T28 + T29; Ta = T8 + T9; Tcx = Ta9 + Taa; T2a = T28 - T29; Tab = Ta9 - Taa; } } { E T7, Te, Tf8, Tf9; T7 = T3 + T6; Te = Ta + Td; Tf = T7 + Te; T8C = T7 - Te; Tf8 = Ta6 + Ta7; Tf9 = KP707106781 * (Tcx + Tcy); Tfa = Tf8 - Tf9; Thk = Tf8 + Tf9; } { E Tge, Tgf, T24, T2b; Tge = Tcv - Tcu; Tgf = KP707106781 * (Tab - Tae); Tgg = Tge + Tgf; ThM = Tge - Tgf; T24 = T3 - T6; T2b = T27 - T2a; T2c = T24 + T2b; T5O = T24 - T2b; } { E T4C, T4J, Ta8, Taf; T4C = Ta - Td; T4J = T4F - T4I; T4K = T4C + T4J; T6g = T4J - T4C; Ta8 = Ta6 - Ta7; Taf = KP707106781 * (Tab + Tae); Tag = Ta8 - Taf; TdE = Ta8 + Taf; } { E Tcw, Tcz, T7N, T7O; Tcw = Tcu + Tcv; Tcz = KP707106781 * (Tcx - Tcy); TcA = Tcw - Tcz; Te6 = Tcw + Tcz; T7N = T4F + T4I; T7O = T2a + T27; T7P = T7N + T7O; T94 = T7N - T7O; } } { E TC, Tb1, T2Z, TaQ, T2X, Tb2, T7m, TaR, TJ, Tb4, Tb5, T2Q, T36, TaV, TaY; E T7n, Tfq, Tfr; { E Tw, Tx, Ty, Tz, TA, TB; Tw = cr[WS(rs, 2)]; Tx = ci[WS(rs, 29)]; Ty = Tw + Tx; Tz = cr[WS(rs, 18)]; TA = ci[WS(rs, 13)]; TB = Tz + TA; TC = Ty + TB; Tb1 = Tz - TA; T2Z = Ty - TB; TaQ = Tw - Tx; } { E T2R, T2S, T2T, T2U, T2V, T2W; T2R = ci[WS(rs, 61)]; T2S = cr[WS(rs, 34)]; T2T = T2R - T2S; T2U = ci[WS(rs, 45)]; T2V = cr[WS(rs, 50)]; T2W = T2U - T2V; T2X = T2T - T2W; Tb2 = T2R + T2S; T7m = T2T + T2W; TaR = T2U + T2V; } { E TF, TaT, T35, TaU, TI, TaW, T32, TaX; { E TD, TE, T33, T34; TD = cr[WS(rs, 10)]; TE = ci[WS(rs, 21)]; TF = TD + TE; TaT = TD - TE; 
T33 = ci[WS(rs, 53)]; T34 = cr[WS(rs, 42)]; T35 = T33 - T34; TaU = T33 + T34; } { E TG, TH, T30, T31; TG = ci[WS(rs, 5)]; TH = cr[WS(rs, 26)]; TI = TG + TH; TaW = TG - TH; T30 = ci[WS(rs, 37)]; T31 = cr[WS(rs, 58)]; T32 = T30 - T31; TaX = T30 + T31; } TJ = TF + TI; Tb4 = TaT + TaU; Tb5 = TaW + TaX; T2Q = TF
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -