📄 gemm_kernel.s
字号:
#endif#endif tbit.z p6, p7 = M, 1 (p6) br.cond.dptk .L040 } ;;#if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0#ifndef TRMMKERNEL adds L = 1, K#else adds L = 1, L#endif }#else { .mmf shladd BOFFSET = KK8, 3, B shladd AOFFSET = KK8, 1, AOFFSET mov f65 = f0 } ;; { .mfi LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0#ifndef TRMMKERNEL adds L = 1, K#else adds L = 1, L#endif }#endif ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f81 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f89 = f0 shr L = L, 1 } ;; { .mfi LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f97 = f0 adds L = -1, L } { .mfi nop __LINE__ mov f105 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET mov f113 = f0 mov ar.lc = L } { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f121 = f0 cmp.eq p3, p0 = r0, r0 } ;; .align 32.L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1], SIZE#else nop __LINE__#endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2], SIZE#else nop __LINE__#endif (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1], -1 * SIZE#else nop __LINE__#endif (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2], -1 * SIZE#else nop __LINE__#endif (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3], SIZE#else nop __LINE__#endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4], SIZE#else nop __LINE__#endif (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3], -1 * SIZE#else nop __LINE__#endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4], -1 * SIZE#else nop __LINE__#endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;;.L038:#if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f100 = [C5], SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f108 = [C6], SIZE FMA f65 = ALPHA, f65, f69 nop __LINE__ } ;; { .mfb LDFD f101 = [C5], -1 * SIZE FMA f72 = ALPHA, f72, f76 nop __LINE__ } { .mfb LDFD f109 = [C6], -1 * SIZE FMA f73 = ALPHA, f73, f77 nop __LINE__ } ;; { .mfb LDFD f116 = [C7], SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb LDFD f124 = [C8], SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } ;; { .mfb LDFD f117 = [C7], -1 * SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb LDFD f125 = [C8], -1 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f96 = ALPHA, f96, f100 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE FMA f104 = ALPHA, f104, f108 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMA f97 = ALPHA, f97, f101 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE FMA f105 = ALPHA, f105, f109 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f112 = ALPHA, f112, f116 nop __LINE__ } { .mfb STFD [C4 ] = f88, SIZE FMA f120 = ALPHA, f120, f124 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, SIZE FMA f113 = ALPHA, f113, f117 nop __LINE__ } { .mfb STFD [C4 ] = f89, SIZE FMA f121 = ALPHA, f121, f125 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C6 ] = f104, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C6 ] = f105, SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f120, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C8 ] = f121, SIZE mov f120 = f0 nop __LINE__ } ;;#else { .mfb nop __LINE__ FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb nop __LINE__ FMPY f65 = ALPHA, f65 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb nop __LINE__ FMPY f73 = ALPHA, f73 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb nop __LINE__ FMPY f81 = ALPHA, f81 nop __LINE__ } ;; { .mfb nop __LINE__ FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb nop __LINE__ FMPY f89 = ALPHA, f89 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C2 ] = f72, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C2 ] = f73, SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } ;; { .mfi STFD [C3 ] = f80, SIZE FMPY f112 = ALPHA, f112#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK#else nop __LINE__#endif } { .mfb STFD [C4 ] = f88, SIZE FMPY f120 = ALPHA, f120 nop __LINE__ } ;; { .mfi STFD [C3 ] = f81, SIZE FMPY f113 = ALPHA, f113#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -2, L#else nop __LINE__#endif } { .mfi STFD [C4 ] = f89, SIZE FMPY f121 = ALPHA, f121#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L#else nop __LINE__#endif } ;; { .mfi STFD [C5 ] = f96, SIZE mov f64 = f0#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0#else nop __LINE__#endif } { .mfb STFD [C6 ] = f104, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C5 ] = f97, SIZE mov f80 = f0#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 1, AOFFSET#else nop __LINE__#endif } { .mfi STFD [C6 ] = f105, SIZE mov f88 = f0#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET#else nop __LINE__#endif } ;; { .mfi STFD [C7 ] = f112, SIZE mov f96 = f0#if defined(TRMMKERNEL) && defined(LEFT) adds KK = 2, KK#else nop __LINE__#endif } { .mfb STFD [C8 ] = f120, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f113, SIZE mov f112 = f0#ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0#else nop __LINE__#endif } { .mfb STFD [C8 ] = f121, SIZE mov f120 = f0 nop __LINE__ } ;;#endif .align 32.L040: { .mib#ifndef TRMMKERNEL nop __LINE__#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK#elif defined(LEFT) adds L = 1, KK#else adds L = 8, KK#endif#endif tbit.z p6, p7 = M, 0 (p6) br.cond.dptk .L049 } ;;#if !defined(TRMMKERNEL) || \ defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) { .mmi LDFPD f48, f49 = [B] adds BOFFSET = 2 * SIZE, B#ifndef TRMMKERNEL adds L = 1, K#else adds L = 1, L#endif }#else { .mmi shladd BOFFSET = KK8, 3, B add AOFFSET = KK8, AOFFSET nop __LINE__ } ;; { .mmi LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__#ifndef TRMMKERNEL adds L = 1, K#else adds L = 1, L#endif }#endif ;; { .mii LDFPD f50, f51 = [BOFFSET], 2 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 } ;; { .mmi LDFPD f52, f53 = [BOFFSET], 2 * SIZE LDFD f32 = [AOFFSET], 1 * SIZE adds L = -1, L } ;; { .mmi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET cmp.eq p3, p0 = r0, r0 mov ar.lc = L } { .mmi LDFPD f54, f55 = [BOFFSET], 2 * SIZE adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET nop __LINE__ } ;; .align 32.L042: { .mfb lfetch.nt1 [PREB], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA f72 = f32, f49, f72 // A1 * B2 nop __LINE__ } ;; { .mfi (p3) LDFD f40 = [AOFFSET], 1 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1]#else nop __LINE__#endif FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2]#else nop __LINE__#endif FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3]#else nop __LINE__#endif (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4]#else nop __LINE__#endif (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C5] (p5) LDFD f108 = [C6]#else nop __LINE__ nop __LINE__#endif nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f116 = [C7] (p5) LDFD f124 = [C8]#else nop __LINE__ nop __LINE__#endif br.cloop.sptk.few .L042 } ;;#if! defined(TRMMKERNEL) && !defined(BETAZERO) FMA f64 = ALPHA, f64, f68 FMA f72 = ALPHA, f72, f76 FMA f80 = ALPHA, f80, f84 FMA f88 = ALPHA, f88, f92 FMA f96 = ALPHA, f96, f100 FMA f104 = ALPHA, f104, f108 FMA f112 = ALPHA, f112, f116 FMA f120 = ALPHA, f120, f124 ;; STFD [C1 ] = f64, SIZE mov f64 = f0 STFD [C2 ] = f72, SIZE mov f72 = f0 ;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -