📄 q1_8.c
字号:
E T5V, T5I, T5R, T5U, T5T, T5W; { E T2W, T31, T2V, T2Y, T5S, T32, T2X; T2W = T1D - T1K; T31 = T2Z - T30; iio[WS(vs, 2) + WS(rs, 5)] = FNMS(T8W, T8X, T8V); rio[WS(vs, 2) + WS(rs, 5)] = FMA(T8T, T8X, T8Y); T2V = W[6]; T2Y = W[7]; T5P = T5L - T5O; T5V = T5L + T5O; T5S = T5H + T5G; T5I = T5G - T5H; T32 = T2V * T31; T2X = T2V * T2W; T5R = W[2]; T5U = W[3]; iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T2Y, T2W, T32); rio[WS(vs, 4) + WS(rs, 1)] = FMA(T2Y, T31, T2X); T5T = T5R * T5S; T5W = T5U * T5S; } { E T3R, T3W, T40, T3V; iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T5U, T5V, T5T); rio[WS(vs, 2) + WS(rs, 3)] = FMA(T5R, T5V, T5W); T3R = W[8]; T3W = W[9]; T40 = T3R * T3Z; T3V = T3R * T3U; T5D = W[10]; T5K = W[11]; iio[WS(vs, 5) + WS(rs, 2)] = FNMS(T3W, T3U, T40); rio[WS(vs, 5) + WS(rs, 2)] = FMA(T3W, T3Z, T3V); T5J = T5D * T5I; T5Q = T5K * T5I; } } { E T73, T76, T78, T75, T9V; iio[WS(vs, 6) + WS(rs, 3)] = FNMS(T5K, T5P, T5J); rio[WS(vs, 6) + WS(rs, 3)] = FMA(T5D, T5P, T5Q); T73 = W[0]; T76 = W[1]; T78 = T73 * T77; T75 = T73 * T74; T9V = W[8]; Ta0 = W[9]; iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T76, T74, T78); rio[WS(vs, 1) + WS(rs, 4)] = FMA(T76, T77, T75); Ta4 = T9V * Ta3; T9Z = T9V * T9Y; } { E T79, T7g, T7f, T7m, T8Z; iio[WS(vs, 5) + WS(rs, 6)] = FNMS(Ta0, T9Y, Ta4); rio[WS(vs, 5) + WS(rs, 6)] = FMA(Ta0, Ta3, T9Z); T79 = W[10]; T7g = W[11]; T90 = T7H - T7O; T95 = T93 - T94; T7f = T79 * T7e; T7m = T7g * T7e; T8Z = W[6]; T92 = W[7]; iio[WS(vs, 6) + WS(rs, 4)] = FNMS(T7g, T7l, T7f); rio[WS(vs, 6) + WS(rs, 4)] = FMA(T79, T7l, T7m); T96 = T8Z * T95; T91 = T8Z * T90; } } } { E T8A, T8D, T8C, T8E, T8B; { E T4s, T4x, T4u, T4y, T4t; { E T4p, T4m, T5s, T5w, T5r; { E T4j, T4c, T47, T4e, T4d, T4k, T5n; T4p = T4f + T4i; T4j = T4f - T4i; T4c = T4a - T4b; T4m = T4b + T4a; iio[WS(vs, 4) + WS(rs, 5)] = FNMS(T92, T90, T96); rio[WS(vs, 4) + WS(rs, 5)] = FMA(T92, T95, T91); T47 = W[10]; T4e = W[11]; T4d = T47 * T4c; T4k = T4e * T4c; T5n = W[8]; T5s = W[9]; iio[WS(vs, 6) + WS(rs, 2)] = FNMS(T4e, T4j, T4d); 
rio[WS(vs, 6) + WS(rs, 2)] = FMA(T47, T4j, T4k); T5w = T5n * T5v; T5r = T5n * T5q; } { E T4l, T4o, T4n, T4q, T4r; iio[WS(vs, 5) + WS(rs, 3)] = FNMS(T5s, T5q, T5w); rio[WS(vs, 5) + WS(rs, 3)] = FMA(T5s, T5v, T5r); T4l = W[2]; T4o = W[3]; T4s = T39 - T3g; T4x = T4v - T4w; T4n = T4l * T4m; T4q = T4o * T4m; T4r = W[6]; T4u = W[7]; iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T4o, T4p, T4n); rio[WS(vs, 2) + WS(rs, 2)] = FMA(T4l, T4p, T4q); T4y = T4r * T4x; T4t = T4r * T4s; } } { E T8F, T8M, T8L, T8S; { E T7u, T7z, T7t, T7w, T7A, T7v; T7u = T6b - T6i; T7z = T7x - T7y; iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T4u, T4s, T4y); rio[WS(vs, 4) + WS(rs, 2)] = FMA(T4u, T4x, T4t); T7t = W[6]; T7w = W[7]; T7A = T7t * T7z; T7v = T7t * T7u; T8F = W[10]; T8M = W[11]; iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T7w, T7u, T7A); rio[WS(vs, 4) + WS(rs, 4)] = FMA(T7w, T7z, T7v); T8L = T8F * T8K; T8S = T8M * T8K; } { E T8s, T8x, T8p, T8u, T8y, T8t, T8z; T8A = FMA(KP707106781, T8r, T8q); T8s = FNMS(KP707106781, T8r, T8q); T8x = FNMS(KP707106781, T8w, T8v); T8D = FMA(KP707106781, T8w, T8v); iio[WS(vs, 6) + WS(rs, 5)] = FNMS(T8M, T8R, T8L); rio[WS(vs, 6) + WS(rs, 5)] = FMA(T8F, T8R, T8S); T8p = W[8]; T8u = W[9]; T8y = T8p * T8x; T8t = T8p * T8s; T8z = W[0]; T8C = W[1]; iio[WS(vs, 5) + WS(rs, 5)] = FNMS(T8u, T8s, T8y); rio[WS(vs, 5) + WS(rs, 5)] = FMA(T8u, T8x, T8t); T8E = T8z * T8D; T8B = T8z * T8A; } } } { E T3y, T3J, T3h, T3A, T3z, T3K; { E T54, T5f, T4N, T56, T55, T5g; { E Tw, TH, Tf, Ty, Tx, TI; { E TN, TJ, TM, TL, TO, TK; TK = FMA(KP707106781, Tv, Tk); Tw = FNMS(KP707106781, Tv, Tk); iio[WS(vs, 1) + WS(rs, 5)] = FNMS(T8C, T8A, T8E); rio[WS(vs, 1) + WS(rs, 5)] = FMA(T8C, T8D, T8B); TH = FNMS(KP707106781, TG, TD); TN = FMA(KP707106781, TG, TD); TJ = W[4]; TM = W[5]; Tf = W[12]; TL = TJ * TK; TO = TM * TK; Ty = W[13]; Tx = Tf * Tw; iio[WS(vs, 3)] = FNMS(TM, TN, TL); rio[WS(vs, 3)] = FMA(TJ, TN, TO); } TI = Ty * Tw; iio[WS(vs, 7)] = FNMS(Ty, TH, Tx); { E T5h, T5l, T5k, T5j, T5m, T5i; T5i = FMA(KP707106781, T53, T4S); 
T54 = FNMS(KP707106781, T53, T4S); rio[WS(vs, 7)] = FMA(Tf, TH, TI); T5h = W[4]; T5f = FNMS(KP707106781, T5e, T5b); T5l = FMA(KP707106781, T5e, T5b); T5k = W[5]; T5j = T5h * T5i; T4N = W[12]; T5m = T5k * T5i; T56 = W[13]; iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T5k, T5l, T5j); T55 = T4N * T54; rio[WS(vs, 3) + WS(rs, 3)] = FMA(T5h, T5l, T5m); } } T5g = T56 * T54; { E T22, T2d, T1L, T24, T23, T2e; { E T2j, T2f, T2i, T2h, T2k, T2g; iio[WS(vs, 7) + WS(rs, 3)] = FNMS(T56, T5f, T55); T22 = FNMS(KP707106781, T21, T1Q); T2g = FMA(KP707106781, T21, T1Q); rio[WS(vs, 7) + WS(rs, 3)] = FMA(T4N, T5f, T5g); T2d = FNMS(KP707106781, T2c, T29); T2j = FMA(KP707106781, T2c, T29); T2f = W[4]; T2i = W[5]; T1L = W[12]; T2h = T2f * T2g; T2k = T2i * T2g; T24 = W[13]; T23 = T1L * T22; iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T2i, T2j, T2h); rio[WS(vs, 3) + WS(rs, 1)] = FMA(T2f, T2j, T2k); } T2e = T24 * T22; iio[WS(vs, 7) + WS(rs, 1)] = FNMS(T24, T2d, T23); { E T3L, T3P, T3O, T3N, T3Q, T3M; T3M = FMA(KP707106781, T3x, T3m); T3y = FNMS(KP707106781, T3x, T3m); rio[WS(vs, 7) + WS(rs, 1)] = FMA(T1L, T2d, T2e); T3L = W[4]; T3J = FNMS(KP707106781, T3I, T3F); T3P = FMA(KP707106781, T3I, T3F); T3O = W[5]; T3N = T3L * T3M; T3h = W[12]; T3Q = T3O * T3M; T3A = W[13]; iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T3O, T3P, T3N); T3z = T3h * T3y; rio[WS(vs, 3) + WS(rs, 2)] = FMA(T3L, T3P, T3Q); } } } T3K = T3A * T3y; { E Tb8, Tbj, TaR, Tba, Tb9, Tbk; { E T6A, T6L, T6j, T6C, T6B, T6M; { E T6R, T6N, T6Q, T6P, T6S, T6O; iio[WS(vs, 7) + WS(rs, 2)] = FNMS(T3A, T3J, T3z); T6A = FNMS(KP707106781, T6z, T6o); T6O = FMA(KP707106781, T6z, T6o); rio[WS(vs, 7) + WS(rs, 2)] = FMA(T3h, T3J, T3K); T6L = FNMS(KP707106781, T6K, T6H); T6R = FMA(KP707106781, T6K, T6H); T6N = W[4]; T6Q = W[5]; T6j = W[12]; T6P = T6N * T6O; T6S = T6Q * T6O; T6C = W[13]; T6B = T6j * T6A; iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T6Q, T6R, T6P); rio[WS(vs, 3) + WS(rs, 4)] = FMA(T6N, T6R, T6S); } T6M = T6C * T6A; iio[WS(vs, 7) + WS(rs, 4)] = FNMS(T6C, T6L, T6B); { E Tbl, Tbp, 
Tbo, Tbn, Tbq, Tbm; Tbm = FMA(KP707106781, Tb7, TaW); Tb8 = FNMS(KP707106781, Tb7, TaW); rio[WS(vs, 7) + WS(rs, 4)] = FMA(T6j, T6L, T6M); Tbl = W[4]; Tbj = FNMS(KP707106781, Tbi, Tbf); Tbp = FMA(KP707106781, Tbi, Tbf); Tbo = W[5]; Tbn = Tbl * Tbm; TaR = W[12]; Tbq = Tbo * Tbm; Tba = W[13]; iio[WS(vs, 3) + WS(rs, 7)] = FNMS(Tbo, Tbp, Tbn); Tb9 = TaR * Tb8; rio[WS(vs, 3) + WS(rs, 7)] = FMA(Tbl, Tbp, Tbq); } } Tbk = Tba * Tb8; { E T86, T8h, T7P, T88, T87, T8i; { E T8n, T8j, T8m, T8l, T8o, T8k; iio[WS(vs, 7) + WS(rs, 7)] = FNMS(Tba, Tbj, Tb9); T86 = FNMS(KP707106781, T85, T7U); T8k = FMA(KP707106781, T85, T7U); rio[WS(vs, 7) + WS(rs, 7)] = FMA(TaR, Tbj, Tbk); T8h = FNMS(KP707106781, T8g, T8d); T8n = FMA(KP707106781, T8g, T8d); T8j = W[4]; T8m = W[5]; T7P = W[12]; T8l = T8j * T8k; T8o = T8m * T8k; T88 = W[13]; T87 = T7P * T86; iio[WS(vs, 3) + WS(rs, 5)] = FNMS(T8m, T8n, T8l); rio[WS(vs, 3) + WS(rs, 5)] = FMA(T8j, T8n, T8o); } T8i = T88 * T86; iio[WS(vs, 7) + WS(rs, 5)] = FNMS(T88, T8h, T87); { E T9P, T9T, T9S, T9R, T9U, T9Q; T9Q = FMA(KP707106781, T9B, T9q); T9C = FNMS(KP707106781, T9B, T9q); rio[WS(vs, 7) + WS(rs, 5)] = FMA(T7P, T8h, T8i); T9P = W[4]; T9N = FNMS(KP707106781, T9M, T9J); T9T = FMA(KP707106781, T9M, T9J); T9S = W[5]; T9R = T9P * T9Q; T9l = W[12]; T9U = T9S * T9Q; T9E = W[13]; iio[WS(vs, 3) + WS(rs, 6)] = FNMS(T9S, T9T, T9R); T9D = T9l * T9C; rio[WS(vs, 3) + WS(rs, 6)] = FMA(T9P, T9T, T9U); } } } } } } } } T9O = T9E * T9C; iio[WS(vs, 7) + WS(rs, 6)] = FNMS(T9E, T9N, T9D); rio[WS(vs, 7) + WS(rs, 6)] = FMA(T9l, T9N, T9O); }}/* Twiddle instruction table referenced by the descriptor that follows: one {TW_FULL, 0, 8} entry and one {TW_NEXT, 1, 0} entry. NOTE(review): presumably TW_FULL asks for the complete twiddle set of a size-8 transform and TW_NEXT ends the list -- confirm against the tw_instr definition, which is not visible here. */static const tw_instr twinstr[] = { {TW_FULL, 0, 8}, {TW_NEXT, 1, 0}};/* Descriptor for this codelet: leading field 8, the name string "q1_8", the twinstr table, &GENUS, a four-value group {352, 112, 176, 0}, and three trailing zeros. NOTE(review): the four-value group looks like an operation-count record (adds, muls, fused multiply-adds, other) for the FMA build, and the leading 8 looks like the radix -- confirm against the ct_desc definition. */static const ct_desc desc = { 8, "q1_8", twinstr, &GENUS, {352, 112, 176, 0}, 0, 0, 0 };/* Registration entry point: forwards the planner p, the q1_8 kernel and the address of desc to X(kdft_difsq_register). Returns nothing; any further effect depends on that callee, which is defined outside this view. */void X(codelet_q1_8) (planner *p) { X(kdft_difsq_register) (p, q1_8, &desc);}#else /* HAVE_FMA *//* Generated by: ../../../genfft/gen_twidsq -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 8 -name q1_8 -include q.h *//* * This function contains 528 FP additions, 256 
FP multiplications, * (or, 416 additions, 144 multiplications, 112 fused multiply/add), * 142 stack variables, 1 constants, and 256 memory accesses */#include "q.h"static void q1_8(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms){ DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + (mb * 14); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 14, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(vs)) { E T7, T14, T1g, Tk, TC, TQ, T10, TM, T1w, T2p, T2z, T1H, T1M, T1W, T2j; E T1V, T7R, T8O, T90, T84, T8m, T8A, T8K, T8w, T9g, Ta9, Taj, T9r, T9w, T9G; E Ta3, T9F, Te, T17, T1h, Tp, Tu, TE, T11, TD, T1p, T2m, T2y, T1C, T1U; E T28, T2i, T24, T7Y, T8R, T91, T89, T8e, T8o, T8L, T8n, T99, Ta6, Tai, T9m; E T9E, T9S, Ta2, T9O, T2H, T3E, T3Q, T2U, T3c, T3q, T3A, T3m, T46, T4Z, T59; E T4h, T4m, T4w, T4T, T4v, T5h, T6e, T6q, T5u, T5M, T60, T6a, T5W, T6G, T7z; E T7J, T6R, T6W, T76, T7t, T75, T2O, T3H, T3R, T2Z, T34, T3e, T3B, T3d, T3Z; E T4W, T58, T4c, T4u, T4I, T4S, T4E, T5o, T6h, T6r, T5z, T5E, T5O, T6b, T5N; E T6z, T7w, T7I, T6M, T74, T7i, T7s, T7e; { E T3, Ty, Tj, TY, T6, Tg, TB, TZ; { E T1, T2, Th, Ti; T1 = rio[0]; T2 = rio[WS(rs, 4)]; T3 = T1 + T2; Ty = T1 - T2; Th = iio[0]; Ti = iio[WS(rs, 4)]; Tj = Th - Ti; TY = Th + Ti; } { E T4, T5, Tz, TA; T4 = rio[WS(rs, 2)]; T5 = rio[WS(rs, 6)]; T6 = T4 + T5; Tg = T4 - T5; Tz = iio[WS(rs, 2)]; TA = iio[WS(rs, 6)]; TB = Tz - TA; TZ = Tz + TA; } T7 = T3 + T6; T14 = T3 - T6; T1g = TY + TZ; Tk = Tg + Tj; TC = Ty - TB; TQ = Tj - Tg; T10 = TY - TZ; TM = Ty + TB; } { E T1s, T1I, T1L, T2n, T1v, T1D, T1G, T2o; { E T1q, T1r, T1J, T1K; T1q = rio[WS(vs, 1) + WS(rs, 1)]; T1r = rio[WS(vs, 1) + WS(rs, 5)]; T1s = T1q + T1r; T1I = T1q - T1r; T1J = iio[WS(vs, 1) + WS(rs, 1)]; T1K = iio[WS(vs, 1) + WS(rs, 5)]; T1L = T1J - T1K; T2n = T1J + T1K; } { E T1t, T1u, T1E, T1F; T1t = rio[WS(vs, 1) + WS(rs, 7)]; T1u = rio[WS(vs, 1) + WS(rs, 3)]; T1v = T1t + T1u; T1D = T1t - T1u; T1E = 
iio[WS(vs, 1) + WS(rs, 7)]; T1F = iio[WS(vs, 1) + WS(rs, 3)]; T1G = T1E - T1F; T2o = T1E + T1F; } T1w = T1s + T1v; T2p = T2n - T2o; T2z = T2n + T2o; T1H = T1D - T1G; T1M = T1I + T1L; T1W = T1D + T1G; T2j = T1v - T1s; T1V = T1L - T1I; } { E T7N, T8i, T83, T8I, T7Q, T80, T8l, T8J; { E T7L, T7M, T81, T82; T7L = rio[WS(vs, 6)]; T7M = rio[WS(vs, 6) + WS(rs, 4)]; T7N = T7L + T7M; T8i = T7L - T7M; T81 = iio[WS(vs, 6)]; T82 = iio[WS(vs, 6) + WS(rs, 4)]; T83 = T81 - T82; T8I = T81 + T82; } { E T7O, T7P, T8j, T8k; T7O = rio[WS(vs, 6) + WS(rs, 2)]; T7P = rio[WS(vs, 6) + WS(rs, 6)]; T7Q = T7O + T7P; T80 = T7O - T7P; T8j = iio[WS(vs, 6) + WS(rs, 2)]; T8k = iio[WS(vs, 6) + WS(rs, 6)]; T8l = T8j - T8k; T8J = T8j + T8k; } T7R = T7N + T7Q;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -