📄 trsm_kernel_rt.s
字号:
} ;; { .mib (p7) LDFD f32 = [AOFFSET], 1 * SIZE mov ar.lc = L (p6) br.cond.dpnt .L168 } ;; .align 8.L162: { .mmf cmp.ne p4, p5 = 0, L (p12) cmp.ne p3, p0 = 0, L FMA f64 = f32, f48, f64 // A1 * B1 } ;; { .mmi (p3) LDFD f56 = [BOFFSET], 1 * SIZE (p3) LDFD f40 = [AOFFSET], 1 * SIZE nop __LINE__ } ;; { .mmi (p4) LDFD f32 = [AOFFSET], 1 * SIZE nop __LINE__ adds L = -1, L } { .mfb (p4) LDFD f48 = [BOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 br.cloop.sptk.few .L162 } ;; .align 8.L168:#if defined(LN) || defined(RT)#ifdef LN adds r2 = -1, KK#else adds r2 = -1, KK#endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B ;; #endif#if defined(LN) || defined(LT) { .mmi LDFD f32 = [BOFFSET] LDFD f33 = [AOFFSET]#ifdef LN adds C1 = -1 * SIZE, C1#else nop __LINE__#endif } ;;#else { .mmi LDFD f32 = [AOFFSET] LDFD f33 = [BOFFSET] nop __LINE__ } ;;#endif { .mmf sub L = K, KK#ifdef RT shladd AORIG = K, BASE_SHIFT, AORIG#else nop __LINE__#endif FSUB f64 = f32, f64 } ;;#ifdef LT adds KK = 1, KK#elif defined LN adds KK = -1, KK#else nop __LINE__#endif ;;#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif ;; FMPY f64 = f64, f33 ;;#if defined(LN) || defined(LT) { .mmf STFD [BOFFSET] = f64#ifndef LN STFD [C1 ] = f64, SIZE#else STFD [C1 ] = f64#endif mov f64 = f0 } ;;#else { .mmf STFD [AOFFSET] = f64 STFD [C1 ] = f64, SIZE mov f64 = f0 } ;;#endif#if defined(LT) || defined(RN) shladd AOFFSET = L, BASE_SHIFT, AOFFSET#else nop __LINE__#endif#if defined(LT) || defined(RN) shladd BOFFSET = L, BASE_SHIFT, BOFFSET#else nop __LINE__#endif ;; .align 8.L169: { .mii#ifdef LN shladd B = K, BASE_SHIFT, B#elif defined(LT) || defined(RN) mov B = BOFFSET#else nop __LINE__#endif#ifdef RN adds KK = 1, KK#elif defined RT adds KK = -1, KK#else nop __LINE__#endif mov AOFFSET = A } ;; .align 16.L090: tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L050 ;;#ifdef RT { .mmi shladd r3 = LDC, 1, r0 nop __LINE__ shl r2 = K, 1 + BASE_SHIFT } ;; { .mmi sub B = B, r2 sub 
C = C, r3 nop __LINE__ }#endif ;; mov f64 = f0 mov f65 = f0 mov f66 = f0 mov f67 = f0 mov f72 = f0 mov f73 = f0 mov f74 = f0 mov f75 = f0 ;; { .mfi shr I = M, 3 } { .mfi mov C1 = C // coffset1 = c + 0 * ldc#ifdef LN add KK = M, OFFSET#elif defined LT mov KK = OFFSET#else nop __LINE__#endif } ;; { .mmf cmp.eq p6, p7 = 0, I#if defined(LN) || defined(RT) mov AORIG = A#else mov AOFFSET = A#endif } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc } ;; { .mfi#ifndef RT shladd C = LDC, 1, C // coffset += 2 * ldc#else nop __LINE__#endif mov f81 = f0#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif }{ .mfb (p6) br.cond.dpnt .L100 } ;; .align 16.L092: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 3 + BASE_SHIFT } { .mmi shladd r3 = KK, BASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;;#if defined(LT) || defined(RN) { .mmi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE nop __LINE__ nop __LINE__ } ;;#else { .mfi shladd BOFFSET = r3, 1, B#ifdef LN sub AORIG = AORIG, r2#else nop __LINE__#endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE shladd AOFFSET = r3, 3, AORIG } ;;#endif (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; { .mmf (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE } ;; { .mmf (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE } { .mfi cmp.eq p3, p0 = r0, r0 } ;; { .mmf (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE } { .mfi adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mmf CPREFETCH [PREC], LDC } { .mfi adds L = 1, L } ;; { .mmf CPREFETCH [PREC] } { .mfi adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } ;; { .mfi adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET } ;; { .mfi tbit.z p12, p0 = L, 0 } { .mfi shr L = L, 1 } ;; { .mfi adds L = -1, L } ;; { .mfi mov ar.lc = L } ;; mov f68 = f0 mov f69 = f0 mov f70 = f0 mov f71 = f0 mov f76 = f0 mov f77 = f0 mov f78 = f0 mov f79 = f0 ;; { .mfb cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L098 } ;; .align 8.L093:/* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 cmp.ne p4, p5 = 0, L } { 
.mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 4 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 adds C12 = 4 * SIZE, C4 } ;; { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA 
f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f71 = f47, f56, f71 // A8 * B1 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA f79 = f47, f57, f79 // A8 * B2 br.cloop.sptk.few .L093 } ;; .align 8.L098:#if defined(LN) || defined(RT)#ifdef LN adds r2 = -8, KK#else adds r2 = -2, KK#endif ;; shladd r2 = r2, BASE_SHIFT, r0 ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 1, B ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET ;;#if defined(LN) || defined(LT) LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [BOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [BOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [BOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [BOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [BOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [BOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f32, f64 FSUB f72 = f33, f72 FSUB f65 = f34, f65 FSUB f73 = f35, f73 FSUB f66 = f36, f66 FSUB f74 = f37, f74 FSUB f67 = f38, f67 FSUB f75 = f39, f75 FSUB f68 = f40, f68 FSUB f76 = f41, f76 FSUB f69 = f42, f69 FSUB f77 = f43, f77 FSUB f70 = f44, f70 FSUB f78 = f45, f78 FSUB f71 = f46, f71 FSUB f79 = f47, f79 ;;#else LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET], 2 * SIZE ;; LDFPD f40, f41 = [AOFFSET], 2 * SIZE ;; LDFPD f42, f43 = [AOFFSET], 2 * SIZE ;; LDFPD f44, f45 = [AOFFSET], 2 * SIZE ;; LDFPD f46, f47 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB 
f64 = f32, f64 FSUB f65 = f33, f65 FSUB f66 = f34, f66 FSUB f67 = f35, f67 FSUB f68 = f36, f68 FSUB f69 = f37, f69 FSUB f70 = f38, f70 FSUB f71 = f39, f71 ;; FSUB f72 = f40, f72 FSUB f73 = f41, f73 FSUB f74 = f42, f74 FSUB f75 = f43, f75 FSUB f76 = f44, f76 FSUB f77 = f45, f77 FSUB f78 = f46, f78 FSUB f79 = f47, f79 ;;#endif#ifdef LN adds AOFFSET = 62 * SIZE, AOFFSET ;; LDFPD f33, f32 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f35, f34 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f37, f36 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f39, f38 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], -2 * SIZE ;; LDFPD f42, f41 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f44, f43 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f46, f45 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f48, f47 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f50, f49 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f52, f51 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFD f53 = [AOFFSET], -2 * SIZE ;; LDFPD f55, f54 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f57, f56 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f59, f58 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f61, f60 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFD f16 = [AOFFSET], -2 * SIZE ;; LDFPD f18, f17 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f20, f19 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFD f21 = [AOFFSET] ;; FMPY f71 = f71, f32 FMPY f79 = f79, f32 ;; FNMA f70 = f71, f33, f70 FNMA f78 = f79, f33, f78 ;; FNMA f69 = f71, f34, f69 FNMA f77 = f79, f34, f77 ;; FNMA f68 = f71, f35, f68 FNMA f76 = f79, f35, f76 ;; FNMA f67 = f71, f36, f67 FNMA f75 = f79, f36, f75 ;; FNMA f66 = f71, f37, f66 FNMA f74 = f79, f37, f74 ;; FNMA f65 = f71, f38, f65 FNMA f73 = f79, f38, f73 ;; FNMA f64 = f71, f39, f64 FNMA f72 = f79, f39, f72 ;; FMPY f70 = f70, f40 FMPY f78 = f78, f40 ;; 
FNMA f69 = f70, f41, f69 FNMA f77 = f78, f41, f77 ;; FNMA f68 = f70, f42, f68 FNMA f76 = f78, f42, f76 ;; FNMA f67 = f70, f43, f67 FNMA f75 = f78, f43, f75 ;; FNMA f66 = f70, f44, f66 FNMA f74 = f78, f44, f74 ;; FNMA f65 = f70, f45, f65 FNMA f73 = f78, f45, f73 ;; FNMA f64 = f70, f46, f64 FNMA f72 = f78, f46, f72 ;; FMPY f69 = f69, f47 FMPY f77 = f77, f47 ;; FNMA f68 = f69, f48, f68 FNMA f76 = f77, f48, f76 ;; FNMA f67 = f69, f49, f67 FNMA f75 = f77, f49, f75 ;; FNMA f66 = f69, f50, f66 FNMA f74 = f77, f50, f74 ;; FNMA f65 = f69, f51, f65 FNMA f73 = f77, f51, f73 ;; FNMA f64 = f69, f52, f64 FNMA f72 = f77, f52, f72 ;; FMPY f68 = f68, f53 FMPY f76 = f76, f53 ;; FNMA f67 = f68, f54, f67 FNMA f75 = f76, f54, f75 ;; FNMA f66 = f68, f55, f66 FNMA f74 = f76, f55, f74 ;; FNMA f65 = f68, f56, f65 FNMA f73 = f76, f56, f73 ;; FNMA f64 = f68, f57, f64 FNMA f72 = f76, f57, f72 ;; FMPY f67 = f67, f58 FMPY f75 = f75, f58 ;; FNMA f66 = f67, f59, f66 FNMA f74 = f75, f59, f74 ;; FNMA f65 = f67, f60, f65 FNMA f73 = f75, f60, f73 ;; FNMA f64 = f67, f61, f64 FNMA f72 = f75, f61, f72 ;; FMPY f66 = f66, f16 FMPY f74 = f74, f16 ;; FNMA f65 = f66, f17, f65 FNMA f73 = f74, f17, f73 ;; FNMA f64 = f66, f18, f64 FNMA f72 = f74, f18, f72 ;; FMPY f65 = f65, f19 FMPY f73 = f73, f19 ;; FNMA f64 = f65, f20, f64 FNMA f72 = f73, f20, f72 ;; FMPY f64 = f64, f21 FMPY f72 = f72, f21 ;; adds BOFFSET = 8 * SIZE, BOFFSET adds BOFFSET2 = 8 * SIZE, BOFFSET2 ;; STFD [BOFFSET] = f68, SIZE STFD [BOFFSET2] = f70, SIZE ;; STFD [BOFFSET] = f76, SIZE STFD [BOFFSET2] = f78, SIZE ;; STFD [BOFFSET] = f69, SIZE STFD [BOFFSET2] = f71, SIZE ;; STFD [BOFFSET] = f77, - 11 * SIZE STFD [BOFFSET2] = f79, - 11 * SIZE ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f66, SIZE ;; STFD [BOFFSET] = f72, SIZE STFD [BOFFSET2] = f74, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f67, SIZE ;; STFD [BOFFSET] = f73, - 3 * SIZE STFD [BOFFSET2] = f75, - 3 * SIZE ;; adds C1 = -8 * SIZE, C1 adds C2 = -8 * SIZE, C2 ;;#endif#ifdef LT 
LDFPD f32, f33 = [AOFFSET], 2 * SIZE ;; LDFPD f34, f35 = [AOFFSET], 2 * SIZE ;; LDFPD f36, f37 = [AOFFSET], 2 * SIZE ;; LDFPD f38, f39 = [AOFFSET] adds AOFFSET = 3 * SIZE, AOFFSET ;; LDFD f40 = [AOFFSET], 1 * SIZE ;; LDFPD f41, f42 = [AOFFSET], 2 * SIZE ;; LDFPD f43, f44 = [AOFFSET], 2 * SIZE ;; LDFPD f45, f46 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f47, f48 = [AOFFSET], 2 * SIZE ;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -