📄 ztrsm_kernel_rt.s
字号:
#ifdef LN adds C1 = -2 * SIZE, C1#endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 ;;#ifdef LN adds C1 = -2 * SIZE, C1#endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;;#ifdef RT add AORIG = r2, AORIG#endif ;;#if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; add AOFFSET = L, AOFFSET add BOFFSET = L, BOFFSET#endif ;;#ifdef LT adds KK = 1, KK#elif defined LN adds KK = -1, KK#else nop __LINE__#endif ;;#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif .align 16.L119:#ifdef LN shladd KK8 = K, ZBASE_SHIFT, r0 ;; add B = KK8, B#endif#if defined(LT) || defined(RN) mov B = BOFFSET#endif#ifdef RN adds KK = 1, KK#endif#ifdef RT adds KK = -1, KK#endif ;; { .mmi mov AOFFSET = A nop __LINE__ } ;; .align 16.L050: { .mmi shr I = M, 2 } { .mib tbit.z p6, p0 = N, 1 (p6) br.cond.dpnt .L010 } ;;#ifdef RT { .mmi shladd r3 = LDC, 1, r0 nop __LINE__ shl r2 = K, 1 + ZBASE_SHIFT } ;; { .mmi sub B = B, r2 sub C = C, r3 nop __LINE__ } ;;#endif mov C1 = C add C2 = LDC, C ;;#ifdef LN add KK = M, OFFSET#elif defined LT mov KK = OFFSET#else nop __LINE__#endif ;;#if defined(LN) || defined(RT) mov AORIG = A#else mov AOFFSET = A#endif ;;#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif ;; { .mib cmp.eq p6, p7 = 0, I#ifndef RT shladd C = LDC, 1, C#else nop __LINE__#endif (p6) br.cond.dpnt .L060 } ;; .align 16.L052: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mmi shladd r3 = KK, ZBASE_SHIFT, r0 nop __LINE__ nop __LINE__ } ;;#if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;;#else { .mfi shladd BOFFSET = r3, 1, B mov f66 = f0#ifdef LN sub AORIG = AORIG, r2#else nop __LINE__#endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;;#endif { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 nop __LINE__ } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi cmp.eq p3, p0 = r0, r0 mov f99 = f0 adds L = 1, L } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi CPREFETCH [PREC], LDC mov f115 = f0 shr L = L, 1 } ;; { .mmi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE adds C5 = 4 * SIZE, C1 adds L = -1, L } ;; { .mmi CPREFETCH [PREC], LDC adds C6 = 4 * SIZE, C2 mov ar.lc = L } ;; cmp.eq p6, p0 = -1, L (p6) br.cond.dpnt .L058 ;; .align 16.L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA_B f65 = f32, f49, f65 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f34, f48, f96 // A3 * B1 nop __LINE__ } { .mfi FMA_B f97 = f34, f49, f97 // A3 * B2 nop __LINE__ } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f34, f50, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f113 = f34, f51, f113 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f97 = f35, f48, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f96 = f35, f49, f96 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f113 = f35, f50, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f112 = f35, f51, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f36, f48, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f67 = f36, f49, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f36, f50, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f83 = f36, f51, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f38, f48, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f38, f49, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f38, f50, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f38, f51, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f37, f48, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f66 = f37, f49, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f37, f50, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f82 = f37, f51, f82 // A6 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f39, f48, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA_A f98 = f39, f49, f98 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f39, f50, f115 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA_A f114 = f39, f51, f114 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f96 = f42, f56, f96 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f112 = f42, f58, f112 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f97 = f43, f56, f97 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f113 = f43, f58, f113 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f66 = f44, f56, f66 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f82 = f44, f58, f82 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f98 = f46, f56, f98 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f114 = f46, f58, f114 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f67 = f45, f56, f67 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f83 = f45, f58, f83 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ (p3) FMA f99 = f47, f56, f99 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 nop __LINE__ } ;; { .mfi nop __LINE__ (p3) FMA f115 = f47, f58, f115 // A8 * B3 adds L = -1, L } { .mfb nop __LINE__ (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 br.cloop.sptk.few .L053 } ;;.L058:#if defined(LN) || defined(RT)#ifdef LN adds r2 = -4, KK#else adds r2 = -2, KK#endif ;; shladd r2 = r2, ZBASE_SHIFT, r0 ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B ;; #endif#if defined(LN) || defined(LT) LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [BOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f104, f105 = [BOFFSET], 2 * SIZE ;; LDFPD f106, f107 = [BOFFSET], 2 * SIZE ;; LDFPD f120, f121 = [BOFFSET], 2 * SIZE ;; LDFPD f122, f123 = [BOFFSET] adds BOFFSET = -14 * SIZE, BOFFSET ;; FSUB f64 = f72, f64 FSUB_A f65 = f73, f65 FSUB f80 = f74, f80 FSUB_A f81 = f75, f81 FSUB f96 = f88, f96 FSUB_A f97 = f89, f97 FSUB f112 = f90, f112 FSUB_A f113 = f91, f113 FSUB f66 = f104, f66 FSUB_A f67 = f105, f67 FSUB f82 = f106, f82 FSUB_A f83 = f107, f83 FSUB f98 = f120, f98 FSUB_A f99 = f121, f99 FSUB f114 = f122, f114 FSUB_A f115 = f123, f115 ;;#else LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [AOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [AOFFSET], 2 * SIZE ;; LDFPD f88, f89 = [AOFFSET], 2 * SIZE ;; LDFPD f90, f91 = [AOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [AOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [AOFFSET] adds AOFFSET = -14 * SIZE, AOFFSET ;; FSUB f64 = f72, f64 FSUB f65 = f73, f65 FSUB f96 = f74, f96 FSUB f97 = f75, f97 FSUB f66 = f76, f66 FSUB f67 = f77, f67 FSUB f98 = f78, f98 FSUB f99 = f79, f99 FSUB f80 = f88, f80 FSUB f81 = f89, f81 FSUB f112 = f90, f112 FSUB f113 = f91, f113 FSUB f82 = f92, f82 FSUB f83 = f93, f83 FSUB f114 = f94, f114 FSUB f115 = f95, f115 ;;#endif#ifdef LN adds AOFFSET = 30 * SIZE, AOFFSET ;; LDFPD f72, f73 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f76, f77 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f78, f79 = [AOFFSET] adds AOFFSET = - 4 * SIZE, AOFFSET ;; LDFPD f88, f89 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f92, f93 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; LDFPD f104, f105 = [AOFFSET] adds AOFFSET = - 2 * SIZE, AOFFSET ;; LDFPD f106, f107 = [AOFFSET] adds AOFFSET = - 8 * SIZE, AOFFSET ;; LDFPD f120, f121 = [AOFFSET] ;; FMPY f32 = f72, f98 FMPY f33 = f73, f98 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f98 = f73, f99, f32 FMA_D f99 = f72, f99, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f66 = f74, f98, f66 FMA_A f67 = f75, f98, f67 FNMA f82 = f74, f114, f82 FMA_A f83 = f75, f114, f83 ;; FMA_B f66 = f75, f99, f66 FNMA f67 = f74, f99, f67 FMA_B f82 = f75, f115, f82 FNMA f83 = f74, f115, f83 ;; FNMA f96 = f76, f98, f96 FMA_A f97 = f77, f98, f97 FNMA f112 = f76, f114, f112 FMA_A f113 = f77, f114, f113 ;; FMA_B f96 = f77, f99, f96 FNMA f97 = f76, f99, f97 FMA_B f112 = f77, f115, f112 FNMA f113 = f76, f115, f113 ;; FNMA f64 = f78, f98, f64 FMA_A f65 = f79, f98, f65 FNMA f80 = f78, f114, f80 FMA_A f81 = f79, f114, f81 ;; FMA_B f64 = f79, f99, f64 FNMA f65 = f78, f99, f65 FMA_B f80 = f79, f115, f80 FNMA f81 = f78, f115, f81 ;; FMPY f32 = f88, f66 FMPY f33 = f89, f66 FMPY f34 = f88, f82 FMPY f35 = f89, f82 ;; FMA_C f66 = f89, f67, f32 FMA_D f67 = f88, f67, f33 FMA_C f82 = f89, f83, f34 FMA_D f83 = f88, f83, f35 ;; FNMA f96 = f90, f66, f96 FMA_A f97 = f91, f66, f97 FNMA f112 = f90, f82, f112 FMA_A f113 = f91, f82, f113 ;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -