📄 gemm_kernel.s
字号:
{ .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f68 = [C1 ], SIZE#else nop __LINE__#endif (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f70 = [C9 ], SIZE#else nop __LINE__#endif (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f69 = [C1 ], -1 * SIZE#else nop __LINE__#endif (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f71 = [C9 ], -1 * SIZE#else nop __LINE__#endif (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f76 = [C2 ], SIZE#else nop __LINE__#endif (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f78 = [C10], SIZE#else nop __LINE__#endif (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f77 = [C2 ], -1 * SIZE#else nop __LINE__#endif (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f79 = [C10], -1 * SIZE#else nop __LINE__#endif (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f84 = [C3 ], SIZE#else nop __LINE__#endif (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f86 = [C11], SIZE#else nop __LINE__#endif (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f85 = [C3 ], -1 * SIZE#else nop __LINE__#endif (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f87 = [C11], -1 * SIZE#else nop __LINE__#endif (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f92 = [C4 ], SIZE#else nop __LINE__#endif (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f94 = [C12], SIZE#else nop __LINE__#endif (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f93 = [C4 ], -1 * SIZE#else nop __LINE__#endif (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f95 = [C12], -1 * SIZE#else nop __LINE__#endif (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f100 = [C5 ], SIZE#else nop __LINE__#endif (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f102 = [C13], SIZE#else nop __LINE__#endif (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f101 = [C5 ], -1 * SIZE#else nop __LINE__#endif (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f103 = [C13], -1 * SIZE#else nop __LINE__#endif (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f108 = [C6 ], SIZE#else nop __LINE__#endif (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f110 = [C14], SIZE#else nop __LINE__#endif (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f109 = [C6 ], -1 * SIZE#else nop __LINE__#endif (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO) (p5) LDFD f111 = [C14], -1 * SIZE#else nop __LINE__#endif (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;;.L028:#if! defined(TRMMKERNEL) && !defined(BETAZERO) { .mfb LDFD f116 = [C7 ], SIZE FMA f64 = ALPHA, f64, f68 nop __LINE__ } { .mfb LDFD f118 = [C15], SIZE FMA f66 = ALPHA, f66, f70 nop __LINE__ } ;; { .mfb LDFD f117 = [C7 ], -1 * SIZE FMA f65 = ALPHA, f65, f69 nop __LINE__ } { .mfb LDFD f119 = [C15], -1 * SIZE FMA f67 = ALPHA, f67, f71 nop __LINE__ } ;; { .mfb LDFD f124 = [C8], SIZE FMA f72 = ALPHA, f72, f76 nop __LINE__ } { .mfb LDFD f126 = [C16], SIZE FMA f74 = ALPHA, f74, f78 nop __LINE__ } ;; { .mfb LDFD f125 = [C8], -1 * SIZE FMA f73 = ALPHA, f73, f77 nop __LINE__ } { .mfb LDFD f127 = [C16], -1 * SIZE FMA f75 = ALPHA, f75, f79 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMA f80 = ALPHA, f80, f84 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMA f82 = ALPHA, f82, f86 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMA f81 = ALPHA, f81, f85 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMA f83 = ALPHA, f83, f87 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMA f88 = ALPHA, f88, f92 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMA f90 = ALPHA, f90, f94 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMA f89 = ALPHA, f89, f93 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMA f91 = ALPHA, f91, f95 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMA f96 = ALPHA, f96, f100 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE FMA f98 = ALPHA, f98, f102 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE FMA f97 = ALPHA, f97, f101 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE FMA f99 = ALPHA, f99, f103 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMA f104 = ALPHA, f104, f108 nop __LINE__ } { .mfb STFD [C12] = f90, SIZE FMA f106 = ALPHA, f106, f110 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, 3 * SIZE FMA f105 = ALPHA, f105, f109 nop __LINE__ } { .mfb STFD [C12] = f91, 3 * SIZE FMA f107 = ALPHA, f107, f111 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMA f112 = ALPHA, f112, f116 nop __LINE__ } { .mfb STFD [C13] = f98, SIZE FMA f114 = ALPHA, f114, f118 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, 3 * SIZE FMA f113 = ALPHA, f113, f117 nop __LINE__ } { .mfb STFD [C13] = f99, 3 * SIZE FMA f115 = ALPHA, f115, f119 nop __LINE__ } ;; { .mfb STFD [C6 ] = f104, SIZE FMA f120 = ALPHA, f120, f124 nop __LINE__ } { .mfb STFD [C14] = f106, SIZE FMA f122 = ALPHA, f122, f126 nop __LINE__ } ;; { .mfb STFD [C6 ] = f105, 3 * SIZE FMA f121 = ALPHA, f121, f125 nop __LINE__ } { .mfb STFD [C14] = f107, 3 * SIZE FMA f123 = ALPHA, f123, f127 nop __LINE__ } ;; { .mfb STFD [C7 ] = f112, SIZE mov f64 = f0 nop __LINE__ } { .mfb STFD [C15] = f114, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfb STFD [C7 ] = f113, 3 * SIZE mov f80 = f0 nop __LINE__ } { .mfb STFD [C15] = f115, 3 * SIZE mov f88 = f0 nop __LINE__ } ;; { .mfb STFD [C8 ] = f120, SIZE mov f96 = f0 nop __LINE__ } { .mfb STFD [C16] = f122, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfb STFD [C8 ] = f121, 3 * SIZE mov f112 = f0 nop __LINE__ } { .mfb STFD [C16] = f123, 3 * SIZE mov f120 = f0 nop __LINE__ } ;;#else { .mfb FMPY f64 = ALPHA, f64 nop __LINE__ } { .mfb FMPY f66 = ALPHA, f66 nop __LINE__ } ;; { .mfb FMPY f65 = ALPHA, f65 nop __LINE__ } { .mfb FMPY f67 = ALPHA, f67 nop __LINE__ } ;; { .mfb FMPY f72 = ALPHA, f72 nop __LINE__ } { .mfb FMPY f74 = ALPHA, f74 nop __LINE__ } ;; { .mfb FMPY f73 = ALPHA, f73 nop __LINE__ } { .mfb FMPY f75 = ALPHA, f75 nop __LINE__ } ;; { .mfb STFD [C1 ] = f64, SIZE FMPY f80 = ALPHA, f80 nop __LINE__ } { .mfb STFD [C9 ] = f66, SIZE FMPY f82 = ALPHA, f82 nop __LINE__ } ;; { .mfb STFD [C1 ] = f65, 3 * SIZE FMPY f81 = ALPHA, f81 nop __LINE__ } { .mfb STFD [C9 ] = f67, 3 * SIZE FMPY f83 = ALPHA, f83 nop __LINE__ } ;; { .mfb STFD [C2 ] = f72, SIZE FMPY f88 = ALPHA, f88 nop __LINE__ } { .mfb STFD [C10] = f74, SIZE FMPY f90 = ALPHA, f90 nop __LINE__ } ;; { .mfb STFD [C2 ] = f73, 3 * SIZE FMPY f89 = ALPHA, f89 nop __LINE__ } { .mfb STFD [C10] = f75, 3 * SIZE FMPY f91 = ALPHA, f91 nop __LINE__ } ;; { .mfb STFD [C3 ] = f80, SIZE FMPY f96 = ALPHA, f96 nop __LINE__ } { .mfb STFD [C11] = f82, SIZE FMPY f98 = ALPHA, f98 nop __LINE__ } ;; { .mfb STFD [C3 ] = f81, 3 * SIZE FMPY f97 = ALPHA, f97 nop __LINE__ } { .mfb STFD [C11] = f83, 3 * SIZE FMPY f99 = ALPHA, f99 nop __LINE__ } ;; { .mfb STFD [C4 ] = f88, SIZE FMPY f104 = ALPHA, f104 nop __LINE__ } { .mfb STFD [C12] = f90, SIZE FMPY f106 = ALPHA, f106 nop __LINE__ } ;; { .mfb STFD [C4 ] = f89, 3 * SIZE FMPY f105 = ALPHA, f105 nop __LINE__ } { .mfb STFD [C12] = f91, 3 * SIZE FMPY f107 = ALPHA, f107 nop __LINE__ } ;; { .mfb STFD [C5 ] = f96, SIZE FMPY f112 = ALPHA, f112 nop __LINE__ } { .mfb STFD [C13] = f98, SIZE FMPY f114 = ALPHA, f114 nop __LINE__ } ;; { .mfb STFD [C5 ] = f97, 3 * SIZE FMPY f113 = ALPHA, f113 nop __LINE__ } { .mfb STFD [C13] = f99, 3 * SIZE FMPY f115 = ALPHA, f115 nop __LINE__ } ;; { .mfi STFD [C6 ] = f104, SIZE FMPY f120 = ALPHA, f120#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) sub L = K, KK#else nop __LINE__#endif } { .mfb STFD [C14] = f106, SIZE FMPY f122 = ALPHA, f122 nop __LINE__ } ;; { .mfi STFD [C6 ] = f105, 3 * SIZE FMPY f121 = ALPHA, f121#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) adds L = -4, L#else nop __LINE__#endif } { .mfi STFD [C14] = f107, 3 * SIZE FMPY f123 = ALPHA, f123#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) adds L = -8, L#else nop __LINE__#endif } ;; { .mfi STFD [C7 ] = f112, SIZE mov f64 = f0#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd KK8 = L, BASE_SHIFT, r0#else nop __LINE__#endif } { .mfb STFD [C15] = f114, SIZE mov f72 = f0 nop __LINE__ } ;; { .mfi STFD [C7 ] = f113, 3 * SIZE mov f80 = f0#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd AOFFSET = KK8, 2, AOFFSET#else nop __LINE__#endif } { .mfi STFD [C15] = f115, 3 * SIZE mov f88 = f0#if defined(TRMMKERNEL) && \ ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) shladd BOFFSET = KK8, 3, BOFFSET#else nop __LINE__#endif } ;; { .mfi STFD [C8 ] = f120, SIZE mov f96 = f0#if defined(TRMMKERNEL) && defined(LEFT) adds KK = 4, KK#else nop __LINE__#endif } { .mfb STFD [C16] = f122, SIZE mov f104 = f0 nop __LINE__ } ;; { .mfi STFD [C8 ] = f121, 3 * SIZE mov f112 = f0#ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0#else nop __LINE__#endif } { .mfb STFD [C16] = f123, 3 * SIZE mov f120 = f0 nop __LINE__ } ;;#endif .align 32.L030: { .mib#ifndef TRMMKERNEL nop __LINE__#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) sub L = K, KK#elif defined(LEFT) adds L = 2, KK#else adds L = 8, KK
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -