📄 trsm_kernel_ln.s
字号:
;; FNMA f72 = f120, f38, f72 ;; FNMA f64 = f120, f39, f64 ;; FMPY f112 = f112, f40 ;; FNMA f104 = f112, f41, f104 ;; FNMA f96 = f112, f42, f96 ;; FNMA f88 = f112, f43, f88 ;; FNMA f80 = f112, f44, f80 ;; FNMA f72 = f112, f45, f72 ;; FNMA f64 = f112, f46, f64 ;; FMPY f104 = f104, f47 ;; FNMA f96 = f104, f48, f96 ;; FNMA f88 = f104, f49, f88 ;; FNMA f80 = f104, f50, f80 ;; FNMA f72 = f104, f51, f72 ;; FNMA f64 = f104, f52, f64 ;; FMPY f96 = f96, f53 ;; FNMA f88 = f96, f54, f88 ;; FNMA f80 = f96, f55, f80 ;; FNMA f72 = f96, f56, f72 ;; FNMA f64 = f96, f57, f64 ;; FMPY f88 = f88, f58 ;; FNMA f80 = f88, f59, f80 ;; FNMA f72 = f88, f60, f72 ;; FNMA f64 = f88, f61, f64 ;; FMPY f80 = f80, f16 ;; FNMA f72 = f80, f17, f72 ;; FNMA f64 = f80, f18, f64 ;; FMPY f72 = f72, f19 ;; FNMA f64 = f72, f20, f64 ;; FMPY f64 = f64, f21 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f96, SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f104, SIZE ;; STFD [AOFFSET] = f80, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f88, - 3 * SIZE STFD [AOFFSET2] = f120, - 3 * SIZE ;;#endif#ifndef LN STFD [C1 ] = f64, SIZE#else STFD [C1 ] = f64#endif#ifndef LN STFD [C2 ] = f72, SIZE#else STFD [C2 ] = f72#endif#ifndef LN STFD [C3 ] = f80, SIZE#else STFD [C3 ] = f80#endif#ifndef LN STFD [C4 ] = f88, SIZE#else STFD [C4 ] = f88#endif#ifndef LN STFD [C5 ] = f96, SIZE#else STFD [C5 ] = f96#endif#ifndef LN STFD [C6 ] = f104, SIZE#else STFD [C6 ] = f104#endif#ifndef LN STFD [C7 ] = f112, SIZE#else STFD [C7 ] = f112#endif#ifndef LN STFD [C8 ] = f120, SIZE#else STFD [C8 ] = f120#endif ;; mov f64 = f0 mov f72 = f0 mov f80 = f0 mov f88 = f0 mov f96 = f0 mov f104 = f0 mov f112 = f0 mov f120 = f0 ;; shladd r2 = K, BASE_SHIFT, r0 ;; sub L = K, KK ;;#ifdef RT add AORIG = r2, AORIG#else nop __LINE__#endif ;;#if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0#else nop __LINE__#endif ;;#if defined(LT) || defined(RN) add AOFFSET = L, AOFFSET#else nop __LINE__#endif ;;#if defined(LT) || defined(RN) shladd BOFFSET = L, 3, BOFFSET#else nop __LINE__#endif ;;#ifdef LT adds KK = 1, KK#elif defined LN adds KK = -1, KK#else nop __LINE__#endif ;;#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif ;; .align 8.L030: { .mib sub L = K, KK tbit.z p6, p0 = M, 1 (p6) br.cond.dptk .L020 } ;; ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 1 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;;#if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE setf.d f73 = r0 mov f65 = f0 } ;;#else { .mfi shladd BOFFSET = r3, 3, B mov f65 = f0#ifdef LN sub AORIG = AORIG, r2#else nop __LINE__#endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f73 = f0 shladd AOFFSET = r3, 1, AORIG } ;;#endif { .mfi setf.d f105 = r0 mov f81 = f0 adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET mov f89 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f113 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f97 = r0 mov f121 = f0 shr L = L, 1 } ;; { .mmf (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE adds L = -1, L } ;; { .mmf (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE cmp.eq p6, p0 = -1, L } ;; { .mib (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L038 } ;;.L032: { .mfb lfetch.nt1 [PREA], 4 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f113 = f41, f62, f113 // A2 * B7 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f121 = f41, f63, f121 // A2 * B8 br.cloop.sptk.few .L032 } ;;.L038:#if defined(LN) || defined(RT)#ifdef LN adds r2 = -2, KK#else adds r2 = -8, KK#endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 3, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;;#if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; { .mfi FSUB f64 = f32, f64 nop __LINE__ } { .mfi nop __LINE__ FSUB f72 = f33, f72 nop __LINE__ } ;; { .mfi FSUB f80 = f34, f80 nop __LINE__ } { .mfi nop __LINE__ FSUB f88 = f35, f88 nop __LINE__ } ;; { .mfi FSUB f96 = f36, f96 nop __LINE__ } { .mfi nop __LINE__ FSUB f104 = f37, f104 nop __LINE__ } ;; { .mfi FSUB f112 = f38, f112 nop __LINE__ } { .mfi nop __LINE__ FSUB f120 = f39, f120 nop __LINE__ } ;; { .mfi FSUB f65 = f40, f65 nop __LINE__ } { .mfi nop __LINE__ FSUB f73 = f41, f73 nop __LINE__ } ;; { .mfi FSUB f81 = f42, f81 nop __LINE__ } { .mfi nop __LINE__ FSUB f89 = f43, f89 nop __LINE__ } ;; { .mfi FSUB f97 = f44, f97 nop __LINE__ } { .mfi nop __LINE__ FSUB f105 = f45, f105 nop __LINE__ } ;; { .mfi FSUB f113 = f46, f113 } { .mfi nop __LINE__ FSUB f121 = f47, f121 nop __LINE__ } ;;#else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f72 = f34, f72 FSUB f73 = f35, f73 FSUB f80 = f36, f80 FSUB f81 = f37, f81 FSUB f88 = f38, f88 FSUB f89 = f39, f89 ;; FSUB f96 = f40, f96 FSUB f97 = f41, f97 ;; FSUB f104 = f42, f104 FSUB f105 = f43, f105 ;; FSUB f112 = f44, f112 FSUB f113 = f45, f113 ;; FSUB f120 = f46, f120 FSUB f121 = f47, f121 ;;#endif#ifdef LN adds AOFFSET = 2 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET] ;; FMPY f65 = f65, f32 FMPY f97 = f97, f32 FMPY f73 = f73, f32 FMPY f105 = f105, f32 FMPY f81 = f81, f32 FMPY f113 = f113, f32 FMPY f89 = f89, f32 FMPY f121 = f121, f32 ;; FNMA f64 = f65, f33, f64 FNMA f96 = f97, f33, f96 FNMA f72 = f73, f33, f72 FNMA f104 = f105, f33, f104 FNMA f80 = f81, f33, f80 FNMA f112 = f113, f33, f112 FNMA f88 = f89, f33, f88 FNMA f120 = f121, f33, f120 ;; FMPY f64 = f64, f34 FMPY f96 = f96, f34 FMPY f72 = f72, f34 FMPY f104 = f104, f34 FMPY f80 = f80, f34 FMPY f112 = f112, f34 FMPY f88 = f88, f34 FMPY f120 = f120, f34 ;; adds BOFFSET = 8 * SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; { .mfi STFD [BOFFSET] = f65, SIZE } { .mfi STFD [BOFFSET2] = f97, SIZE } ;; { .mfi STFD [BOFFSET] = f73, SIZE } { .mfi STFD [BOFFSET2] = f105, SIZE } ;; { .mfi STFD [BOFFSET] = f81, SIZE } { .mfi STFD [BOFFSET2] = f113, SIZE } ;; { .mfi STFD [BOFFSET] = f89, - 11 * SIZE } { .mfi STFD [BOFFSET2] = f121, - 11 * SIZE } ;; { .mmi STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE adds C1 = -2 * SIZE, C1 } ;; { .mmi STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f104, SIZE adds C2 = -2 * SIZE, C2 } ;; { .mmi STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE nop __LINE__ } ;; { .mmi STFD [BOFFSET] = f88, - 3 * SIZE STFD [BOFFSET2] = f120, - 3 * SIZE } ;; adds C3 = -2 * SIZE, C3 adds C4 = -2 * SIZE, C4 adds C5 = -2 * SIZE, C5 adds C6 = -2 * SIZE, C6 adds C7 = -2 * SIZE, C7 adds C8 = -2 * SIZE, C8 ;;#endif#ifdef LT LDFPD f32, f33 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f34 = [AOFFSET], - 3 * SIZE ;; { .mfi FMPY f64 = f64, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f96 = f96, f32 nop __LINE__ } ;; { .mfi FMPY f72 = f72, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f104 = f104, f32 nop __LINE__ } ;; { .mfi FMPY f80 = f80, f32 } { .mfi nop __LINE__ FMPY f112 = f112, f32 nop __LINE__ } ;; { .mfi FMPY f88 = f88, f32 nop __LINE__ } { .mfi nop __LINE__ FMPY f120 = f120, f32 nop __LINE__ } ;; { .mfi FNMA f65 = f64, f33, f65 nop __LINE__ } { .mfi nop __LINE__ FNMA f97 = f96, f33, f97 nop __LINE__ } ;; { .mfi FNMA f73 = f72, f33, f73 nop __LINE__ } { .mfi nop __LINE__ FNMA f105 = f104, f33, f105 nop __LINE__
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -