📄 trsm_kernel_rt.s
字号:
LDFPD f49, f50 = [AOFFSET], 2 * SIZE ;; LDFPD f51, f52 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], 1 * SIZE ;; LDFPD f54, f55 = [AOFFSET], 2 * SIZE ;; LDFPD f56, f57 = [AOFFSET] adds AOFFSET = 6 * SIZE, AOFFSET ;; LDFPD f58, f59 = [AOFFSET], 2 * SIZE ;; LDFPD f60, f61 = [AOFFSET] adds AOFFSET = 7 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], 1 * SIZE ;; LDFPD f17, f18 = [AOFFSET] adds AOFFSET = 8 * SIZE, AOFFSET ;; LDFPD f19, f20 = [AOFFSET] adds AOFFSET = 9 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] adds AOFFSET = -63 * SIZE, AOFFSET ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FNMA f68 = f64, f36, f68 FNMA f76 = f72, f36, f76 ;; FNMA f69 = f64, f37, f69 FNMA f77 = f72, f37, f77 ;; FNMA f70 = f64, f38, f70 FNMA f78 = f72, f38, f78 ;; FNMA f71 = f64, f39, f71 FNMA f79 = f72, f39, f79 ;; FMPY f65 = f65, f40 FMPY f73 = f73, f40 ;; FNMA f66 = f65, f41, f66 FNMA f74 = f73, f41, f74 ;; FNMA f67 = f65, f42, f67 FNMA f75 = f73, f42, f75 ;; FNMA f68 = f65, f43, f68 FNMA f76 = f73, f43, f76 ;; FNMA f69 = f65, f44, f69 FNMA f77 = f73, f44, f77 ;; FNMA f70 = f65, f45, f70 FNMA f78 = f73, f45, f78 ;; FNMA f71 = f65, f46, f71 FNMA f79 = f73, f46, f79 ;; FMPY f66 = f66, f47 FMPY f74 = f74, f47 ;; FNMA f67 = f66, f48, f67 FNMA f75 = f74, f48, f75 ;; FNMA f68 = f66, f49, f68 FNMA f76 = f74, f49, f76 ;; FNMA f69 = f66, f50, f69 FNMA f77 = f74, f50, f77 ;; FNMA f70 = f66, f51, f70 FNMA f78 = f74, f51, f78 ;; FNMA f71 = f66, f52, f71 FNMA f79 = f74, f52, f79 ;; FMPY f67 = f67, f53 FMPY f75 = f75, f53 ;; FNMA f68 = f67, f54, f68 FNMA f76 = f75, f54, f76 ;; FNMA f69 = f67, f55, f69 FNMA f77 = f75, f55, f77 ;; FNMA f70 = f67, f56, f70 FNMA f78 = f75, f56, f78 ;; FNMA f71 = f67, f57, f71 FNMA f79 = f75, f57, f79 ;; FMPY f68 = f68, f58 FMPY f76 = f76, f58 ;; FNMA f69 = f68, f59, f69 FNMA f77 = f76, f59, f77 ;; FNMA f70 = f68, f60, f70 FNMA f78 = f76, f60, f78 ;; FNMA f71 = f68, f61, f71 FNMA f79 = f76, f61, f79 ;; FMPY f69 = f69, f16 FMPY f77 = f77, f16 ;; FNMA f70 = f69, f17, f70 FNMA f78 = f77, f17, f78 ;; FNMA f71 = f69, f18, f71 FNMA f79 = f77, f18, f79 ;; FMPY f70 = f70, f19 FMPY f78 = f78, f19 ;; FNMA f71 = f70, f20, f71 FNMA f79 = f78, f20, f79 ;; FMPY f71 = f71, f21 FMPY f79 = f79, f21 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, 5 * SIZE STFD [BOFFSET2] = f75, 5 * SIZE ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f78, SIZE ;; STFD [BOFFSET] = f69, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f77, -11 * SIZE STFD [BOFFSET2] = f79, -11 * SIZE ;; adds C9 = 4 * SIZE, C1 ;;#endif#ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f68 = f68, f32 FMPY f65 = f65, f32 FMPY f69 = f69, f32 FMPY f66 = f66, f32 FMPY f70 = f70, f32 FMPY f67 = f67, f32 FMPY f71 = f71, f32 ;; FNMA f72 = f64, f33, f72 FNMA f76 = f68, f33, f76 FNMA f73 = f65, f33, f73 FNMA f77 = f69, f33, f77 FNMA f74 = f66, f33, f74 FNMA f78 = f70, f33, f78 FNMA f75 = f67, f33, f75 FNMA f79 = f71, f33, f79 ;; FMPY f72 = f72, f34 FMPY f76 = f76, f34 FMPY f73 = f73, f34 FMPY f77 = f77, f34 FMPY f74 = f74, f34 FMPY f78 = f78, f34 FMPY f75 = f75, f34 FMPY f79 = f79, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f71, 5 * SIZE ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, -11 * SIZE STFD [AOFFSET2] = f79, -11 * SIZE ;;#endif#ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;; LDFPD f33, f32 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET] ;; FMPY f72 = f72, f32 FMPY f76 = f76, f32 FMPY f73 = f73, f32 FMPY f77 = f77, f32 FMPY f74 = f74, f32 FMPY f78 = f78, f32 FMPY f75 = f75, f32 FMPY f79 = f79, f32 ;; FNMA f64 = f72, f33, f64 FNMA f68 = f76, f33, f68 FNMA f65 = f73, f33, f65 FNMA f69 = f77, f33, f69 FNMA f66 = f74, f33, f66 FNMA f70 = f78, f33, f70 FNMA f67 = f75, f33, f67 FNMA f71 = f79, f33, f71 ;; FMPY f64 = f64, f34 FMPY f68 = f68, f34 FMPY f65 = f65, f34 FMPY f69 = f69, f34 FMPY f66 = f66, f34 FMPY f70 = f70, f34 FMPY f67 = f67, f34 FMPY f71 = f71, f34 ;; adds AOFFSET = 8 * SIZE, AOFFSET adds AOFFSET2 = 8 * SIZE, AOFFSET2 ;; STFD [AOFFSET] = f72, SIZE STFD [AOFFSET2] = f76, SIZE ;; STFD [AOFFSET] = f73, SIZE STFD [AOFFSET2] = f77, SIZE ;; STFD [AOFFSET] = f74, SIZE STFD [AOFFSET2] = f78, SIZE ;; STFD [AOFFSET] = f75, - 11 * SIZE STFD [AOFFSET2] = f79, - 11 * SIZE ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f68, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f69, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f70, SIZE ;; STFD [AOFFSET] = f67, - 3 * SIZE STFD [AOFFSET2] = f71, - 3 * SIZE ;;#endif adds C9 = 4 * SIZE, C1 ;; { .mmf STFD [C1 ] = f64, SIZE STFD [C9 ] = f68, SIZE mov f64 = f0 } ;; { .mmi STFD [C1 ] = f65, SIZE STFD [C9 ] = f69, SIZE adds C10 = 4 * SIZE, C2 } ;; { .mmi STFD [C1 ] = f66, SIZE STFD [C9 ] = f70, SIZE } ;; { .mmi#ifndef LN STFD [C1 ] = f67, 5 * SIZE#else STFD [C1 ] = f67, - 3 * SIZE#endif STFD [C9 ] = f71 adds C11 = 4 * SIZE, C3 } ;; { .mmf STFD [C2 ] = f72, SIZE STFD [C10] = f76, SIZE mov f72 = f0 } ;; { .mmi STFD [C2 ] = f73, SIZE STFD [C10] = f77, SIZE } ;; { .mmi STFD [C2 ] = f74, SIZE STFD [C10] = f78, SIZE adds C12 = 4 * SIZE, C4 } ;; { .mmi#ifndef LN STFD [C2 ] = f75, 5 * SIZE#else STFD [C2 ] = f75, - 3 * SIZE#endif STFD [C10] = f79 } ;; { .mmf cmp.ne p6, p0 = 1, I } ;; adds I = -1, I ;; { .mmi shladd r2 = K, BASE_SHIFT, r0 } ;; { .mmi sub L = K, KK } ;; { .mmi#ifdef RT shladd AORIG = r2, 3, AORIG#else nop __LINE__#endif } ;; { .mmi#if defined(LT) || defined(RN) shladd L = L, BASE_SHIFT, r0#else nop __LINE__#endif } ;; ;; { .mmi#if defined(LT) || defined(RN) shladd AOFFSET = L, 3, AOFFSET#else nop __LINE__#endif } ;; { .mmi#if defined(LT) || defined(RN) shladd BOFFSET = L, 1, BOFFSET#else nop __LINE__#endif } ;; { .mmi#ifdef LT adds KK = 8, KK#elif defined LN adds KK = -8, KK#else nop __LINE__#endif } ;; { .mmi#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif } ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 (p6) br.cond.dptk .L092 ;; .align 8.L100: tbit.z p6, p7 = M, 2 (p6) br.cond.dptk .L110 ;; { .mib#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif } ;; { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;;#if defined(LT) || defined(RN) { .mmf (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f65 = f0 } ;;#else { .mfi shladd BOFFSET = r3, 1, B#ifdef LN sub AORIG = AORIG, r2#else nop __LINE__#endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 2, AORIG } ;;#endif { .mfi adds L = 1, L } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET cmp.eq p3, p0 = r0, r0 } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi cmp.eq p6, p0 = -1, L } ;; { .mmf (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE } { .mfi mov ar.lc = L } ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } { .mfb (p6) br.cond.dpnt .L108 } ;;.L102: { .mfi lfetch.nt1 [PREA], 8 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { .mfi adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 2 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 2 * SIZE, C2 } ;; { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfi (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f75 = f43, f57, f75 // A4 * B2 br.cloop.sptk.few .L102 } ;; .align 8.L108:#if defined(LN) || defined(RT)#ifdef LN adds r2 = -4, KK#else adds r2 = -2, KK#endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;;#if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET] adds BOFFSET = -6 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 ;; FSUB f65 = f34, f65 FSUB f73 = f35, f73 ;; FSUB f66 = f36, f66 FSUB f74 = f37, f74 ;; FSUB f67 = f38, f67 FSUB f75 = f39, f75 ;;#else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = -6 * SIZE, AOFFSET ;; FSUB f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f72 = f36, f72 FSUB f73 = f37, f73 FSUB f74 = f38, f74 FSUB f75 = f39, f75 ;;#endif#ifdef LN adds AOFFSET = 14 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], - 2 * SIZE ;; LDFPD f38, f37 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f40, f39 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET] ;; FMPY f67 = f67, f32 FMPY f75 = f75, f32 ;; FNMA f66 = f67, f33, f66 FNMA f74 = f75, f33, f74 ;; FNMA f65 = f67, f34, f65 FNMA f73 = f75, f34, f73 ;; FNMA f64 = f67, f35, f64 FNMA f72 = f75, f35, f72 ;; FMPY f66 = f66, f36 FMPY f74 = f74, f36 ;; FNMA f65 = f66, f37, f65 FNMA f73 = f74, f37, f73 ;; FNMA f64 = f66, f38, f64 FNMA f72 = f74, f38, f72 ;; FMPY f65 = f65, f39 FMPY f73 = f73, f39 ;; FNMA f64 = f65, f40, f64 FNMA f72 = f73, f40, f72 ;; FMPY f64 = f64, f41 FMPY f72 = f72, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE STFD [BOFFSET2] = f75, -3 * SIZE ;; adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 ;;#endif#ifdef LT LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f36 = [AOFFSET], 1 * SIZE ;; LDFPD f37, f38 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f39, f40 = [AOFFSET] adds AOFFSET = 5 * SIZE, AOFFSET ;; LDFD f41 = [AOFFSET], -15 * SIZE ;; FMPY f64 = f64, f32 FMPY f72 = f72, f32 ;; FNMA f65 = f64, f33, f65 FNMA f73 = f72, f33, f73 ;; FNMA f66 = f64, f34, f66 FNMA f74 = f72, f34, f74 ;; FNMA f67 = f64, f35, f67 FNMA f75 = f72, f35, f75 ;; FMPY f65 = f65, f36 FMPY f73 = f73, f36 ;; FNMA f66 = f65, f37, f66 FNMA f74 = f73, f37, f74 ;; FNMA f67 = f65, f38, f67 FNMA f75 = f73, f38, f75 ;; FMPY f66 = f66, f39 FMPY f74 = f74, f39 ;; FNMA f67 = f66, f40, f67 FNMA f75 = f74, f40, f75 ;; FMPY f67 = f67, f41 FMPY f75 = f75, f41 ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, -3 * SIZE STFD [BOFFSET2] = f75, -3 * SIZE ;;#endif#ifdef RN LDFPD f32, f33 = [BOFFSET] adds BOFFSET = 3 * SIZE, BOFFSET ;; LDFD f34 = [BOFFSET], -3 * SIZE ;; FMPY f64 = f64, f32 FMPY f65 = f65, f32 FMPY f66 = f66, f32 FMPY f67 = f67, f32 ;; FNMA f72 = f64, f33, f72 FNMA f73 = f65, f33, f73 FNMA f74 = f66, f33, f74 FNMA f75 = f67, f33, f75 ;; FMPY f72 = f72, f34 FMPY f73 = f73, f34 FMPY f74 = f74, f34 FMPY f75 = f75, f34 ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f72, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f73, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f74, SIZE ;; STFD [AOFFSET] = f67, -3 * SIZE STFD [AOFFSET2] = f75, -3 * SIZE ;;#endif#ifdef RT adds BOFFSET = 2 * SIZE, BOFFSET ;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -