📄 ztrsm_kernel_rt.s
字号:
// ---------------------------------------------------------------------
// Tail of the preceding tile (its beginning lies outside this excerpt).
// The values being finished here live in f64/f65, f80/f81, f96/f97 and
// f112/f113.  FMPY / FMA_C / FMA_D are macros defined elsewhere in the
// file; NOTE(review): they appear to form a complex multiply of each
// (re, im) pair by the factor loaded into f72/f73 - confirm against the
// macro definitions.
// ---------------------------------------------------------------------
	FMA_C	f80 = f73, f81, f34
	FMA_D	f81 = f72, f81, f35
	FMA_C	f96 = f73, f97, f36
	FMA_D	f97 = f72, f97, f37
	FMA_C	f112 = f73, f113, f38
	FMA_D	f113 = f72, f113, f39
	;;
#endif
#ifdef RT
	// RT: one factor (f72, f73) read from BOFFSET scales all four pairs.
	LDFPD	f72, f73 = [BOFFSET]
	;;
	FMPY	f32 = f72, f64
	FMPY	f33 = f73, f64
	FMPY	f34 = f72, f80
	FMPY	f35 = f73, f80
	FMPY	f36 = f72, f96
	FMPY	f37 = f73, f96
	FMPY	f38 = f72, f112
	FMPY	f39 = f73, f112
	;;
	FMA_C	f64 = f73, f65, f32
	FMA_D	f65 = f72, f65, f33
	FMA_C	f80 = f73, f81, f34
	FMA_D	f81 = f72, f81, f35
	FMA_C	f96 = f73, f97, f36
	FMA_D	f97 = f72, f97, f37
	FMA_C	f112 = f73, f113, f38
	FMA_D	f113 = f72, f113, f39
	;;
#endif
	// Write the eight results back to the buffer they were read from:
	// BOFFSET for LN/LT, AOFFSET otherwise.  Two pointers 4 * SIZE apart
	// are post-incremented in parallel; the final "5 * SIZE" step followed
	// by "- 8 * SIZE" leaves the base pointer where it started.
#if defined(LN) || defined(LT)
	adds	BOFFSET2 = 4 * SIZE, BOFFSET
	;;
	STFD	[BOFFSET] = f64, SIZE
	STFD	[BOFFSET2] = f96, SIZE
	;;
	STFD	[BOFFSET] = f65, SIZE
	STFD	[BOFFSET2] = f97, SIZE
	;;
	STFD	[BOFFSET] = f80, SIZE
	STFD	[BOFFSET2] = f112, SIZE
	;;
	STFD	[BOFFSET] = f81, 5 * SIZE
	STFD	[BOFFSET2] = f113, 5 * SIZE
	;;
	adds	BOFFSET = - 8 * SIZE, BOFFSET
	;;
#else
	adds	AOFFSET2 = 4 * SIZE, AOFFSET
	;;
	STFD	[AOFFSET] = f64, SIZE
	STFD	[AOFFSET2] = f96, SIZE
	;;
	STFD	[AOFFSET] = f65, SIZE
	STFD	[AOFFSET2] = f97, SIZE
	;;
	STFD	[AOFFSET] = f80, SIZE
	STFD	[AOFFSET2] = f112, SIZE
	;;
	STFD	[AOFFSET] = f81, 5 * SIZE
	STFD	[AOFFSET2] = f113, 5 * SIZE
	;;
	adds	AOFFSET = - 8 * SIZE, AOFFSET
	;;
#endif
	// Store the same eight values through C1 / C5.  Under LN both
	// pointers are stepped back by 8 * SIZE before the stores and again
	// afterwards.
#ifdef LN
	adds	C1 = -8 * SIZE, C1
	adds	C5 = -8 * SIZE, C5
#endif
	;;
	STFD	[C1 ] = f64, SIZE
	STFD	[C5 ] = f96, SIZE
	;;
	STFD	[C1 ] = f65, SIZE
	STFD	[C5 ] = f97, SIZE
	;;
	STFD	[C1 ] = f80, SIZE
	STFD	[C5 ] = f112, SIZE
	;;
	STFD	[C1 ] = f81, 5 * SIZE
	STFD	[C5 ] = f113, 5 * SIZE
	;;
	// Clear the accumulators for the next tile.
	mov	f64 = f0
	mov	f65 = f0
	mov	f80 = f0
	mov	f81 = f0
	mov	f96 = f0
	mov	f97 = f0
	mov	f112 = f0
	mov	f113 = f0
	;;
#ifdef LN
	adds	C1 = -8 * SIZE, C1
	adds	C5 = -8 * SIZE, C5
#endif
	;;
	// p6 = (I != 1), evaluated before I is decremented; it drives the
	// branch back to .L092 below.
	cmp.ne	p6, p0 = 1, I
	;;
	adds	I = -1, I
	;;
	// r2 = K << ZBASE_SHIFT
	shladd	r2 = K, ZBASE_SHIFT, r0
	;;
	sub	L = K, KK
	;;
#ifdef RT
	// AORIG += r2 << 2
	shladd	AORIG = r2, 2, AORIG
#endif
	;;
#if defined(LT) || defined(RN)
	// Advance past the remaining (K - KK) part:
	// AOFFSET += (L << ZBASE_SHIFT) << 2, BOFFSET += L << ZBASE_SHIFT.
	shladd	L = L, ZBASE_SHIFT, r0
	;;
	shladd	AOFFSET = L, 2, AOFFSET
	add	BOFFSET = L, BOFFSET
#endif
	;;
#ifdef LT
	adds	KK = 4, KK
#elif defined LN
	adds	KK = -4, KK
#else
	nop	__LINE__
#endif
	;;
	// Inner-product length for the next tile: KK for LT/RN, K - KK otherwise.
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	;;
	(p6) br.cond.dptk .L092
	;;

// ---------------------------------------------------------------------
// .L100: tile executed only when bit 1 of M is set (otherwise control
// jumps straight to .L110).  It accumulates into f64/f65, f80/f81,
// f96/f97 and f112/f113 from four doubles of A and two doubles of B per
// step.  NOTE(review): this looks like the two-row remainder of the
// M loop for a single column - confirm against the part of the file
// that is not visible here.
// ---------------------------------------------------------------------
	.align 16
.L100:
	{ .mib
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	tbit.z	p6, p7 = M, 1
	(p6) br.cond.dptk .L110
	}
	;;
	{ .mmi
	// p7 = (L != 0): guards the initial operand loads below.
	cmp.ne	p7, p0 = r0, L
	adds	BOFFSET = 0 * SIZE, B
	// r2 = K << (1 + ZBASE_SHIFT); subtracted from AORIG under LN.
	shl	r2 = K, 1 + ZBASE_SHIFT
	}
	{ .mmi
	// r3 = KK << ZBASE_SHIFT
	shladd	r3 = KK, ZBASE_SHIFT, r0
	nop	__LINE__
	nop	__LINE__
	}
	;;
#if defined(LT) || defined(RN)
	{ .mfb
	(p7) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	mov	f66 = f0
	nop	__LINE__
	}
	{ .mmf
	nop	__LINE__
	nop	__LINE__
	mov	f67 = f0
	}
	;;
#else
	{ .mfi
	// BOFFSET = B + r3
	add	BOFFSET = r3, B
	mov	f66 = f0
#ifdef LN
	sub	AORIG = AORIG, r2
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mfi
	(p7) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	mov	f67 = f0
	// AOFFSET = AORIG + (r3 << 1)
	shladd	AOFFSET = r3, 1, AORIG
	}
	;;
#endif
	;;
	// Trip count for the 2x-unrolled loop: ar.lc = ((L + 1) >> 1) - 1.
	// p12 is set when bit 0 of (L + 1) is clear, i.e. the original L is
	// odd; it lets the loop drop the second half of its last pass.
	adds	L = 1, L
	;;
	{ .mii
	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	tbit.z	p12, p0 = L, 0
	shr	L = L, 1
	}
	;;
	{ .mmi
	(p7) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
	nop	__LINE__
	adds	L = -1, L
	}
	;;
	{ .mmi
	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
	// p3 starts out true (r0 == r0); it is only rewritten under p12.
	cmp.eq	p3, p0 = r0, r0
	mov	ar.lc = L
	}
	;;
	// L == -1 here means the original length was zero: skip the loop.
	cmp.eq	p6, p0 = -1, L
	(p6) br.cond.dpnt .L108
	;;

// .L102: counted loop (br.cloop on ar.lc), two steps per pass.
//   first half : operands f32..f35 (A) and f48/f49 (B), unconditional
//   second half: operands f40..f43 (A) and f56/f57 (B), under p3
//   p4 = (L != 0) guards the loads that feed the next pass
//   L is decremented alongside ar.lc so p3/p4 can detect the last pass
	.align 16
.L102:
	{ .mfi
	lfetch.nt1	[PREA], 8 * SIZE
	FMA	f64 = f32, f48, f64 // A1 * B1
	cmp.ne	p4, p5 = 0, L
	}
	{ .mfi
	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
	FMA	f80 = f32, f49, f80 // A1 * B2
	(p12) cmp.ne	p3, p0 = 0, L
	}
	;;
	{ .mfb
	lfetch.nt1	[PREB], 4 * SIZE
	FMA	f65 = f33, f48, f65 // A2 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	FMA	f81 = f33, f49, f81 // A2 * B2
	nop	__LINE__
	}
	;;
	{ .mfb
	(p3) LDFPD	f56, f57 = [BOFFSET], 2 * SIZE
	FMA	f96 = f34, f48, f96 // A3 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	FMA	f112 = f34, f49, f112 // A3 * B2
	nop	__LINE__
	}
	;;
	{ .mfb
	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
	FMA	f97 = f35, f48, f97 // A4 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	FMA	f113 = f35, f49, f113 // A4 * B2
	nop	__LINE__
	}
	;;
	{ .mfb
	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
	(p3) FMA	f64 = f40, f56, f64 // A1 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	(p3) FMA	f80 = f40, f57, f80 // A1 * B2
	nop	__LINE__
	}
	;;
	{ .mfb
	(p4) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	(p3) FMA	f65 = f41, f56, f65 // A2 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	(p3) FMA	f81 = f41, f57, f81 // A2 * B2
	nop	__LINE__
	}
	;;
	{ .mfb
	(p4) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	(p3) FMA	f96 = f42, f56, f96 // A3 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	(p3) FMA	f112 = f42, f57, f112 // A3 * B2
	nop	__LINE__
	}
	;;
	{ .mfi
	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
	(p3) FMA	f97 = f43, f56, f97 // A4 * B1
	adds	L = -1, L
	}
	{ .mfb
	nop	__LINE__
	(p3) FMA	f113 = f43, f57, f113 // A4 * B2
	br.cloop.sptk.few .L102
	}
	;;
	// Fold the partial products into one (re, im) pair per value.
	// FCALC_A / FCALC_B are macros defined elsewhere; NOTE(review):
	// presumably add/sub selected by the conjugation variant - confirm.
	{ .mfb
	nop	__LINE__
	FCALC_A	f64 = f64, f81
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	FCALC_B	f65 = f65, f80
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	FCALC_A	f96 = f96, f113
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	FCALC_B	f97 = f97, f112
	nop	__LINE__
	}
	;;

// .L108: solve step for the .L100 tile.
.L108:
#if defined(LN) || defined(RT)
	// Re-derive the operand pointers from KK:
	// r2 = (KK - 2 [LN] or KK - 1 [RT]) << ZBASE_SHIFT,
	// AOFFSET = AORIG + (r2 << 1), BOFFSET = B + r2.
#ifdef LN
	adds	r2 = -2, KK
#else
	adds	r2 = -1, KK
#endif
	;;
	shladd	r2 = r2, ZBASE_SHIFT, r0
	;;
	shladd	AOFFSET = r2, 1, AORIG
	add	BOFFSET = r2, B
	;;
#endif
	// Load the four stored values (from BOFFSET for LN/LT, AOFFSET
	// otherwise), rewind the pointer, and subtract the accumulated
	// products from them.  LN/LT use FSUB_A for the second element of
	// each pair; the other variants use plain FSUB.
#if defined(LN) || defined(LT)
	LDFPD	f72, f73 = [BOFFSET], 2 * SIZE
	;;
	LDFPD	f88, f89 = [BOFFSET]
	adds	BOFFSET = -2 * SIZE, BOFFSET
	;;
	FSUB	f64 = f72, f64
	FSUB_A	f65 = f73, f65
	FSUB	f96 = f88, f96
	FSUB_A	f97 = f89, f97
	;;
#else
	LDFPD	f72, f73 = [AOFFSET], 2 * SIZE
	;;
	LDFPD	f88, f89 = [AOFFSET]
	adds	AOFFSET = -2 * SIZE, AOFFSET
	;;
	FSUB	f64 = f72, f64
	FSUB	f65 = f73, f65
	FSUB	f96 = f88, f96
	FSUB	f97 = f89, f97
	;;
#endif
#ifdef LN
	// LN: factors are read from AOFFSET + 6, + 4 and + 0 (in SIZE units);
	// the three pointer adjustments cancel out.  The second pair
	// (f96, f97) is scaled first, then eliminated from (f64, f65), which
	// is scaled last.  NOTE(review): the scale factors are presumably
	// pre-inverted diagonal entries, since no divide appears - confirm.
	adds	AOFFSET = 6 * SIZE, AOFFSET
	;;
	LDFPD	f104, f105 = [AOFFSET]
	adds	AOFFSET = - 2 * SIZE, AOFFSET
	;;
	LDFPD	f106, f107 = [AOFFSET]
	adds	AOFFSET = - 4 * SIZE, AOFFSET
	;;
	LDFPD	f120, f121 = [AOFFSET]
	;;
	FMPY	f32 = f104, f96
	FMPY	f33 = f105, f96
	;;
	FMA_C	f96 = f105, f97, f32
	FMA_D	f97 = f104, f97, f33
	;;
	FNMA	f64 = f106, f96, f64
	FMA_A	f65 = f107, f96, f65
	;;
	FMA_B	f64 = f107, f97, f64
	FNMA	f65 = f106, f97, f65
	;;
	FMPY	f32 = f120, f64
	FMPY	f33 = f121, f64
	;;
	FMA_C	f64 = f121, f65, f32
	FMA_D	f65 = f120, f65, f33
	;;
#endif
#ifdef LT
	// LT: factors are read from AOFFSET + 0, + 2 and + 6 (in SIZE units);
	// AOFFSET is restored afterwards.  The first pair (f64, f65) is
	// scaled first, then eliminated from (f96, f97), which is scaled last.
	LDFPD	f72, f73 = [AOFFSET], 2 * SIZE
	;;
	LDFPD	f74, f75 = [AOFFSET]
	adds	AOFFSET = 4 * SIZE, AOFFSET
	;;
	LDFPD	f90, f91 = [AOFFSET]
	adds	AOFFSET = - 6 * SIZE, AOFFSET
	;;
	FMPY	f32 = f72, f64
	FMPY	f33 = f73, f64
	;;
	FMA_C	f64 = f73, f65, f32
	FMA_D	f65 = f72, f65, f33
	;;
	FNMA	f96 = f74, f64, f96
	FMA_A	f97 = f75, f64, f97
	;;
	FMA_B	f96 = f75, f65, f96
	FNMA	f97 = f74, f65, f97
	;;
	FMPY	f32 = f90, f96
	FMPY	f33 = f91, f96
	;;
	FMA_C	f96 = f91, f97, f32
	FMA_D	f97 = f90, f97, f33
	;;
#endif
#ifdef RN
	// RN: a single factor (f72, f73) from BOFFSET scales both pairs.
	LDFPD	f72, f73 = [BOFFSET]
	;;
	FMPY	f32 = f72, f64
	FMPY	f33 = f73, f64
	FMPY	f36 = f72, f96
	FMPY	f37 = f73, f96
	;;
	FMA_C	f64 = f73, f65, f32
	FMA_D	f65 = f72, f65, f33
	FMA_C	f96 = f73, f97, f36
	FMA_D	f97 = f72, f97, f37
	;;
#endif
#ifdef RT
	// RT: same instruction sequence as the RN case above.
	LDFPD	f72, f73 = [BOFFSET]
	;;
	FMPY	f32 = f72, f64
	FMPY	f33 = f73, f64
	FMPY	f36 = f72, f96
	FMPY	f37 = f73, f96
	;;
	FMA_C	f64 = f73, f65, f32
	FMA_D	f65 = f72, f65, f33
	FMA_C	f96 = f73, f97, f36
	FMA_D	f97 = f72, f97, f37
	;;
#endif
	// Write the four results back to the buffer they came from and
	// rewind that pointer by 4 * SIZE.
#if defined(LN) || defined(LT)
	STFD	[BOFFSET] = f64, SIZE
	;;
	STFD	[BOFFSET] = f65, SIZE
	;;
	STFD	[BOFFSET] = f96, SIZE
	;;
	STFD	[BOFFSET] = f97, SIZE
	;;
	adds	BOFFSET = - 4 * SIZE, BOFFSET
	;;
#else
	// NOTE(review): AOFFSET2 is computed here but not read anywhere in
	// this tile's visible code - possibly a leftover; verify before removing.
	adds	AOFFSET2 = 4 * SIZE, AOFFSET
	;;
	STFD	[AOFFSET] = f64, SIZE
	;;
	STFD	[AOFFSET] = f65, SIZE
	;;
	STFD	[AOFFSET] = f96, SIZE
	;;
	STFD	[AOFFSET] = f97, SIZE
	;;
	adds	AOFFSET = - 4 * SIZE, AOFFSET
	;;
#endif
	// Store the results through C1 only; C5 is adjusted under LN but not
	// written to in this tile.
#ifdef LN
	adds	C1 = -4 * SIZE, C1
	adds	C5 = -4 * SIZE, C5
#endif
	;;
	STFD	[C1 ] = f64, SIZE
	;;
	STFD	[C1 ] = f65, SIZE
	;;
	STFD	[C1 ] = f96, SIZE
	;;
	STFD	[C1 ] = f97, SIZE
	;;
	// Clear the accumulators for the next tile.
	mov	f64 = f0
	mov	f65 = f0
	mov	f80 = f0
	mov	f81 = f0
	mov	f96 = f0
	mov	f97 = f0
	mov	f112 = f0
	mov	f113 = f0
	;;
#ifdef LN
	adds	C1 = -4 * SIZE, C1
	adds	C5 = -4 * SIZE, C5
#endif
	;;
	// NOTE(review): p6 is computed here, but no branch consumes it before
	// .L110 overwrites p6 with its own tbit.z; control falls through.
	cmp.ne	p6, p0 = 1, I
	;;
	adds	I = -1, I
	;;
	// r2 = K << ZBASE_SHIFT
	shladd	r2 = K, ZBASE_SHIFT, r0
	;;
	sub	L = K, KK
	;;
#ifdef RT
	// AORIG += r2 << 1
	shladd	AORIG = r2, 1, AORIG
#endif
	;;
#if defined(LT) || defined(RN)
	// AOFFSET += (L << ZBASE_SHIFT) << 1, BOFFSET += L << ZBASE_SHIFT.
	shladd	L = L, ZBASE_SHIFT, r0
	;;
	shladd	AOFFSET = L, 1, AOFFSET
	add	BOFFSET = L, BOFFSET
#endif
	;;
#ifdef LT
	adds	KK = 2, KK
#elif defined LN
	adds	KK = -2, KK
#else
	nop	__LINE__
#endif
	;;
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif

// ---------------------------------------------------------------------
// .L110: tile executed only when bit 0 of M is set (otherwise control
// jumps to .L119, which is outside this excerpt).  It accumulates into
// f64/f65 and f80/f81 from two doubles of A and two doubles of B per
// step.  The end of this tile lies beyond the excerpt.
// ---------------------------------------------------------------------
	.align 16
.L110:
	{ .mib
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	tbit.z	p6, p7 = M, 0
	(p6) br.cond.dptk .L119
	}
	;;
	{ .mmi
	// p7 = (L != 0): guards the initial operand loads below.
	cmp.ne	p7, p0 = r0, L
	adds	BOFFSET = 0 * SIZE, B
	// r2 = K << ZBASE_SHIFT; subtracted from AORIG under LN.
	shl	r2 = K, ZBASE_SHIFT
	}
	{ .mmi
	// r3 = KK << ZBASE_SHIFT
	shladd	r3 = KK, ZBASE_SHIFT, r0
	nop	__LINE__
	nop	__LINE__
	}
	;;
#if defined(LT) || defined(RN)
	{ .mfb
	(p7) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	mov	f66 = f0
	nop	__LINE__
	}
	{ .mmf
	nop	__LINE__
	nop	__LINE__
	mov	f67 = f0
	}
	;;
#else
	{ .mfi
	// BOFFSET = B + r3
	add	BOFFSET = r3, B
	mov	f66 = f0
#ifdef LN
	sub	AORIG = AORIG, r2
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mfi
	(p7) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	mov	f67 = f0
	// AOFFSET = AORIG + r3
	add	AOFFSET = r3, AORIG
	}
	;;
#endif
	;;
	// Same trip-count scheme as .L100: ar.lc = ((L + 1) >> 1) - 1, and
	// p12 is set when the original L is odd.
	adds	L = 1, L
	;;
	{ .mii
	nop	__LINE__
	tbit.z	p12, p0 = L, 0
	shr	L = L, 1
	}
	;;
	{ .mmi
	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	cmp.eq	p3, p0 = r0, r0
	adds	L = -1, L
	}
	;;
	{ .mmi
	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
	mov	ar.lc = L
	}
	;;
	// L == -1 here means the original length was zero: skip the loop.
	cmp.eq	p6, p0 = -1, L
	(p6) br.cond.dpnt .L118
	;;

// .L112: counted loop, two steps per pass; the second step (operands
// f40/f41 and f56/f57) runs under p3, and the loads for the next pass
// run under p4 = (L != 0).
	.align 16
.L112:
	{ .mfi
	lfetch.nt1	[PREA], 4 * SIZE
	FMA	f64 = f32, f48, f64 // A1 * B1
	cmp.ne	p4, p5 = 0, L
	}
	{ .mfi
	lfetch.nt1	[PREB], 4 * SIZE
	FMA	f80 = f32, f49, f80 // A1 * B2
	(p12) cmp.ne	p3, p0 = 0, L
	}
	;;
	{ .mmf
	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
	(p3) LDFPD	f56, f57 = [BOFFSET], 2 * SIZE
	FMA	f65 = f33, f48, f65 // A2 * B1
	}
	{ .mmf
	nop	__LINE__
	nop	__LINE__
	FMA	f81 = f33, f49, f81 // A2 * B2
	}
	;;
	{ .mfb
	(p4) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	(p3) FMA	f64 = f40, f56, f64 // A1 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	(p3) FMA	f80 = f40, f57, f80 // A1 * B2
	nop	__LINE__
	}
	;;
	{ .mfi
	(p4) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	(p3) FMA	f65 = f41, f56, f65 // A2 * B1
	adds	L = -1, L
	}
	// NOTE(review): this .mfb bundle lists only two instructions (no
	// explicit M-slot nop, unlike its siblings) - confirm the assembler
	// pads it as intended.
	{ .mfb
	(p3) FMA	f81 = f41, f57, f81 // A2 * B2
	br.cloop.sptk.few .L112
	}
	;;
	// Fold the partial products into a single (re, im) pair.
	{ .mfb
	nop	__LINE__
	FCALC_A	f64 = f64, f81
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	FCALC_B	f65 = f65, f80
	nop	__LINE__
	}
	;;

// .L118: solve step for the .L110 tile.
.L118:
#if defined(LN) || defined(RT)
	// Both preprocessor arms below emit the same instruction
	// (r2 = KK - 1); AOFFSET = AORIG + r2', BOFFSET = B + r2' where
	// r2' = r2 << ZBASE_SHIFT.
#ifdef LN
	adds	r2 = -1, KK
#else
	adds	r2 = -1, KK
#endif
	;;
	shladd	r2 = r2, ZBASE_SHIFT, r0
	;;
	add	AOFFSET = r2, AORIG
	add	BOFFSET = r2, B
	;;
#endif
	// Load the stored value and subtract the accumulated product.
#if defined(LN) || defined(LT)
	LDFPD	f72, f73 = [BOFFSET]
	;;
	FSUB	f64 = f72, f64
	FSUB_A	f65 = f73, f65
	;;
#else
	LDFPD	f72, f73 = [AOFFSET]
	;;
	FSUB	f64 = f72, f64
	FSUB	f65 = f73, f65
	;;
#endif
	// Scale (f64, f65) by one factor: read from AOFFSET for LN/LT and
	// from BOFFSET for RN/RT.
#ifdef LN
	LDFPD	f120, f121 = [AOFFSET]
	;;
	FMPY	f32 = f120, f64
	FMPY	f33 = f121, f64
	;;
	FMA_C	f64 = f121, f65, f32
	FMA_D	f65 = f120, f65, f33
	;;
#endif
#ifdef LT
	LDFPD	f72, f73 = [AOFFSET]
	;;
	FMPY	f32 = f72, f64
	FMPY	f33 = f73, f64
	;;
	FMA_C	f64 = f73, f65, f32
	FMA_D	f65 = f72, f65, f33
	;;
#endif
#ifdef RN
	LDFPD	f72, f73 = [BOFFSET]
	;;
	FMPY	f32 = f72, f64
	FMPY	f33 = f73, f64
	;;
	FMA_C	f64 = f73, f65, f32
	FMA_D	f65 = f72, f65, f33
	;;
#endif
#ifdef RT
	LDFPD	f72, f73 = [BOFFSET]
	;;
	FMPY	f32 = f72, f64
	FMPY	f33 = f73, f64
	;;
	FMA_C	f64 = f73, f65, f32
	FMA_D	f65 = f72, f65, f33
	;;
#endif
	// Write the result back to the buffer it came from and rewind that
	// pointer by 2 * SIZE.
#if defined(LN) || defined(LT)
	STFD	[BOFFSET] = f64, SIZE
	;;
	STFD	[BOFFSET] = f65, SIZE
	;;
	adds	BOFFSET = - 2 * SIZE, BOFFSET
	;;
#else
	STFD	[AOFFSET] = f64, SIZE
	;;
	STFD	[AOFFSET] = f65, SIZE
	;;
	adds	AOFFSET = - 2 * SIZE, AOFFSET
	;;
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -