📄 trsm_kernel_ln.s
字号:
} ;; { .mfi FNMA f81 = f80, f33, f81 } { .mfi nop __LINE__ FNMA f113 = f112, f33, f113 nop __LINE__ } ;; { .mfi FNMA f89 = f88, f33, f89 nop __LINE__ } { .mfi nop __LINE__ FNMA f121 = f120, f33, f121 nop __LINE__ } ;; FMPY f65 = f65, f34 FMPY f97 = f97, f34 FMPY f73 = f73, f34 FMPY f105 = f105, f34 FMPY f81 = f81, f34 FMPY f113 = f113, f34 FMPY f89 = f89, f34 FMPY f121 = f121, f34 ;; { .mfi STFD [BOFFSET] = f64, SIZE } { .mfi STFD [BOFFSET2] = f96, SIZE } ;; { .mfi STFD [BOFFSET] = f72, SIZE } { .mfi STFD [BOFFSET2] = f104, SIZE } ;; { .mfi STFD [BOFFSET] = f80, SIZE } { .mfi STFD [BOFFSET2] = f112, SIZE } ;; { .mfi STFD [BOFFSET] = f88, 5 * SIZE } { .mfi STFD [BOFFSET2] = f120, 5 * SIZE } ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD [BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, -11 * SIZE } { .mfi STFD [BOFFSET2] = f121, -11 * SIZE }#endif#ifdef RN LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [BOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [BOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f47, f48 = [BOFFSET], 2 * SIZE ;; LDFPD f49, f50 = [BOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [BOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f58, f59 = [BOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [BOFFSET] adds BOFFSET = 7 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f19, f20 = [BOFFSET] adds BOFFSET = 9 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 ;; FNMA f80 = f64, f34, f80 FNMA f81 = f65, f34, f81 ;; FNMA f88 = f64, f35, f88 FNMA f89 = f65, f35, f89 ;; FNMA f96 = f64, f36, f96 FNMA f97 = f65, f36, f97 ;; FNMA f104 = f64, f37, f104 FNMA f105 = f65, f37, f105 ;; FNMA f112 = f64, f38, f112 FNMA f113 = f65, f38, f113 ;; FNMA f120 = f64, f39, f120 FNMA f121 = f65, f39, f121 ;; FMPY f72 = f72, f40 FMPY f73 = f73, f40 ;; FNMA f80 = f72, f41, f80 FNMA f81 = f73, f41, f81 ;; FNMA f88 = f72, f42, f88 FNMA f89 = f73, f42, f89 ;; FNMA f96 = f72, f43, f96 FNMA f97 = f73, f43, f97 ;; FNMA f104 = f72, f44, f104 FNMA f105 = f73, f44, f105 ;; FNMA f112 = f72, f45, f112 FNMA f113 = f73, f45, f113 ;; FNMA f120 = f72, f46, f120 FNMA f121 = f73, f46, f121 ;; FMPY f80 = f80, f47 FMPY f81 = f81, f47 ;; FNMA f88 = f80, f48, f88 FNMA f89 = f81, f48, f89 ;; FNMA f96 = f80, f49, f96 FNMA f97 = f81, f49, f97 ;; FNMA f104 = f80, f50, f104 FNMA f105 = f81, f50, f105 ;; FNMA f112 = f80, f51, f112 FNMA f113 = f81, f51, f113 ;; FNMA f120 = f80, f52, f120 FNMA f121 = f81, f52, f121 ;; FMPY f88 = f88, f53 FMPY f89 = f89, f53 ;; FNMA f96 = f88, f54, f96 FNMA f97 = f89, f54, f97 ;; FNMA f104 = f88, f55, f104 FNMA f105 = f89, f55, f105 ;; FNMA f112 = f88, f56, f112 FNMA f113 = f89, f56, f113 ;; FNMA f120 = f88, f57, f120 FNMA f121 = f89, f57, f121 ;; FMPY f96 = f96, f58 FMPY f97 = f97, f58 ;; FNMA f104 = f96, f59, f104 FNMA f105 = f97, f59, f105 ;; FNMA f112 = f96, f60, f112 FNMA f113 = f97, f60, f113 ;; FNMA f120 = f96, f61, f120 FNMA f121 = f97, f61, f121 ;; FMPY f104 = f104, f16 FMPY f105 = f105, f16 ;; FNMA f112 = f104, f17, f112 FNMA f113 = f105, f17, f113 ;; FNMA f120 = f104, f18, f120 FNMA f121 = f105, f18, f121 ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 ;; FNMA f120 = f112, f20, f120 FNMA f121 = f113, f20, f121 ;; FMPY f120 = f120, f21 FMPY f121 = f121, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, 5 * SIZE STFD [AOFFSET2] = f89, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f104, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f105, -11 * SIZE STFD [AOFFSET2] = f121, - 11 * SIZE ;;#endif#ifdef RT adds BOFFSET = 62 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f35, f34 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f37, f36 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f39, f38 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f40 = [BOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f44, f43 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f46, f45 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f48, f47 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f50, f49 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f52, f51 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFD f53 = [BOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f57, f56 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f59, f58 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f61, f60 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFD f16 = [BOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f20, f19 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFD f21 = [BOFFSET] ;; FMPY f120 = f120, f32 FMPY f121 = f121, f32 ;; FNMA f112 = f120, f33, f112 FNMA f113 = f121, f33, f113 ;; FNMA f104 = f120, f34, f104 FNMA f105 = f121, f34, f105 ;; FNMA f96 = f120, f35, f96 FNMA f97 = f121, f35, f97 ;; FNMA f88 = f120, f36, f88 FNMA f89 = f121, f36, f89 ;; FNMA f80 = f120, f37, f80 FNMA f81 = f121, f37, f81 ;; FNMA f72 = f120, f38, f72 FNMA f73 = f121, f38, f73 ;; FNMA f64 = f120, f39, f64 FNMA f65 = f121, f39, f65 ;; FMPY f112 = f112, f40 FMPY f113 = f113, f40 ;; FNMA f104 = f112, f41, f104 FNMA f105 = f113, f41, f105 ;; FNMA f96 = f112, f42, f96 FNMA f97 = f113, f42, f97 ;; FNMA f88 = f112, f43, f88 FNMA f89 = f113, f43, f89 ;; FNMA f80 = f112, f44, f80 FNMA f81 = f113, f44, f81 ;; FNMA f72 = f112, f45, f72 FNMA f73 = f113, f45, f73 ;; FNMA f64 = f112, f46, f64 FNMA f65 = f113, f46, f65 ;; FMPY f104 = f104, f47 FMPY f105 = f105, f47 ;; FNMA f96 = f104, f48, f96 FNMA f97 = f105, f48, f97 ;; FNMA f88 = f104, f49, f88 FNMA f89 = f105, f49, f89 ;; FNMA f80 = f104, f50, f80 FNMA f81 = f105, f50, f81 ;; FNMA f72 = f104, f51, f72 FNMA f73 = f105, f51, f73 ;; FNMA f64 = f104, f52, f64 FNMA f65 = f105, f52, f65 ;; FMPY f96 = f96, f53 FMPY f97 = f97, f53 ;; FNMA f88 = f96, f54, f88 FNMA f89 = f97, f54, f89 ;; FNMA f80 = f96, f55, f80 FNMA f81 = f97, f55, f81 ;; FNMA f72 = f96, f56, f72 FNMA f73 = f97, f56, f73 ;; FNMA f64 = f96, f57, f64 FNMA f65 = f97, f57, f65 ;; FMPY f88 = f88, f58 FMPY f89 = f89, f58 ;; FNMA f80 = f88, f59, f80 FNMA f81 = f89, f59, f81 ;; FNMA f72 = f88, f60, f72 FNMA f73 = f89, f60, f73 ;; FNMA f64 = f88, f61, f64 FNMA f65 = f89, f61, f65 ;; FMPY f80 = f80, f16 FMPY f81 = f81, f16 ;; FNMA f72 = f80, f17, f72 FNMA f73 = f81, f17, f73 ;; FNMA f64 = f80, f18, f64 FNMA f65 = f81, f18, f65 ;; FMPY f72 = f72, f19 FMPY f73 = f73, f19 ;; FNMA f64 = f72, f20, f64 FNMA f65 = f73, f20, f65 ;; FMPY f64 = f64, f21 FMPY f65 = f65, f21 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f104, SIZE STFD [AOFFSET2] = f120, SIZE ;; STFD [AOFFSET] = f105, - 11 * SIZE STFD [AOFFSET2] = f121, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f88, SIZE ;; STFD [AOFFSET] = f73, - 3 * SIZE STFD [AOFFSET2] = f89, - 3 * SIZE ;;#endif STFD [C1 ] = f64, SIZE mov f64 = f0 ;;#ifndef LN STFD [C1 ] = f65, SIZE#else STFD [C1 ] = f65, -SIZE#endif ;; STFD [C2 ] = f72, SIZE mov f72 = f0 ;;#ifndef LN STFD [C2 ] = f73, SIZE#else STFD [C2 ] = f73, -SIZE#endif ;; STFD [C3 ] = f80, SIZE mov f80 = f0 ;;#ifndef LN STFD [C3 ] = f81, SIZE#else STFD [C3 ] = f81, - SIZE#endif ;; STFD [C4 ] = f88, SIZE mov f88 = f0 ;;#ifndef LN STFD [C4 ] = f89, SIZE#else STFD [C4 ] = f89, -SIZE#endif ;; STFD [C5 ] = f96, SIZE mov f96 = f0 ;;#ifndef LN STFD [C5 ] = f97, SIZE#else STFD [C5 ] = f97, -SIZE#endif ;; STFD [C6 ] = f104, SIZE mov f104 = f0 ;;#ifndef LN STFD [C6 ] = f105, SIZE#else STFD [C6 ] = f105, -SIZE#endif ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;;#ifdef RT shladd AORIG = r2, 1, AORIG#else nop __LINE__#endif ;; STFD [C7 ] = f112, SIZE mov f112 = f0 ;; { .mmi#ifndef LN STFD [C7 ] = f113, SIZE#else STFD [C7 ] = f113, -SIZE#endif#if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0#else nop __LINE__#endif } ;; { .mmi#if defined(LT) || defined(RN) shladd AOFFSET = L, 1, AOFFSET#else nop __LINE__#endif } ;; { .mmi#if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET#else nop __LINE__#endif } ;; { .mmf STFD [C8 ] = f120, SIZE mov f120 = f0 } ;; { .mmi#ifndef LN STFD [C8 ] = f121, SIZE#else STFD [C8 ] = f121, -SIZE#endif#ifdef LT adds KK = 2, KK#elif defined LN adds KK = -2, KK#else nop __LINE__#endif } ;; { .mmi#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif } ;; .align 8.L020: { .mib sub L = K, KK tbit.z p6, p0 = M, 2 (p6) br.cond.dptk .L010 } ;; ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;; #if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;; #else { .mfi shladd BOFFSET = r3, 3, B mov f65 = f0 #ifdef LN sub AORIG = AORIG, r2 #else nop __LINE__ #endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 2, AORIG } ;; #endif { .mfi setf.d f105 = r0 mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f67 = f0 } { .mfi setf.d f74 = r0 mov f75 = f0 adds L = -1, L } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE setf.d f82 = r0 mov f83 = f0 } { .mfi setf.d f90 = r0 mov f91 = f0 cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f98 = r0 mov f99 = f0 } { .mfi setf.d f106 = r0 mov f107 = f0 mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE setf.d f114 = r0 mov f115 = f0 } { .mfb setf.d f122 = r0 mov f123 = f0 (p6) br.cond.dpnt .L028 } ;; .L022: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 2 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 (p5) adds C11 = 2 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 2 * SIZE, C4
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -