📄 ztrsm_kernel_ln.s
字号:
FMA_C f114 = f105, f115, f38 FMA_D f115 = f104, f115, f39 ;; FNMA f64 = f106, f66, f64 FMA_A f65 = f107, f66, f65 FNMA f80 = f106, f82, f80 FMA_A f81 = f107, f82, f81 FNMA f96 = f106, f98, f96 FMA_A f97 = f107, f98, f97 FNMA f112 = f106, f114, f112 FMA_A f113 = f107, f114, f113 ;; FMA_B f64 = f107, f67, f64 FNMA f65 = f106, f67, f65 FMA_B f80 = f107, f83, f80 FNMA f81 = f106, f83, f81 FMA_B f96 = f107, f99, f96 FNMA f97 = f106, f99, f97 FMA_B f112 = f107, f115, f112 FNMA f113 = f106, f115, f113 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f80 FMPY f35 = f121, f80 FMPY f36 = f120, f96 FMPY f37 = f121, f96 FMPY f38 = f120, f112 FMPY f39 = f121, f112 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f80 = f121, f81, f34 FMA_D f81 = f120, f81, f35 FMA_C f96 = f121, f97, f36 FMA_D f97 = f120, f97, f37 FMA_C f112 = f121, f113, f38 FMA_D f113 = f120, f113, f39 ;;#endif#ifdef LT LDFPD f72, f73 = [AOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [AOFFSET] adds AOFFSET = 4 * SIZE, AOFFSET ;; LDFPD f90, f91 = [AOFFSET] adds AOFFSET = - 6 * SIZE, AOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f80 FMPY f35 = f73, f80 FMPY f36 = f72, f96 FMPY f37 = f73, f96 FMPY f38 = f72, f112 FMPY f39 = f73, f112 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f80 = f73, f81, f34 FMA_D f81 = f72, f81, f35 FMA_C f96 = f73, f97, f36 FMA_D f97 = f72, f97, f37 FMA_C f112 = f73, f113, f38 FMA_D f113 = f72, f113, f39 ;; FNMA f66 = f74, f64, f66 FMA_A f67 = f75, f64, f67 FNMA f82 = f74, f80, f82 FMA_A f83 = f75, f80, f83 FNMA f98 = f74, f96, f98 FMA_A f99 = f75, f96, f99 FNMA f114 = f74, f112, f114 FMA_A f115 = f75, f112, f115 ;; FMA_B f66 = f75, f65, f66 FNMA f67 = f74, f65, f67 FMA_B f82 = f75, f81, f82 FNMA f83 = f74, f81, f83 FMA_B f98 = f75, f97, f98 FNMA f99 = f74, f97, f99 FMA_B f114 = f75, f113, f114 FNMA f115 = f74, f113, f115 ;; FMPY f32 = f90, f66 FMPY f33 = f91, f66 FMPY f34 = f90, f82 FMPY f35 = f91, f82 FMPY f36 = f90, f98 FMPY f37 = f91, f98 FMPY f38 = f90, f114 FMPY f39 = f91, f114 ;; FMA_C f66 = f91, f67, f32 FMA_D f67 = f90, f67, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 FMA_C f98 = f91, f99, f36 FMA_D f99 = f90, f99, f37 FMA_C f114 = f91, f115, f38 FMA_D f115 = f90, f115, f39 ;;#endif#ifdef RN LDFPD f72, f73 = [BOFFSET], 2 * SIZE ;; LDFPD f74, f75 = [BOFFSET], 2 * SIZE ;; LDFPD f76, f77 = [BOFFSET], 2 * SIZE ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET], 2 * SIZE ;; LDFPD f92, f93 = [BOFFSET], 2 * SIZE ;; LDFPD f94, f95 = [BOFFSET] adds BOFFSET = 6 * SIZE, BOFFSET ;; LDFPD f108, f109 = [BOFFSET], 2 * SIZE ;; LDFPD f110, f111 = [BOFFSET] adds BOFFSET = 8 * SIZE, BOFFSET ;; LDFPD f126, f127 = [BOFFSET] adds BOFFSET = - 30 * SIZE, BOFFSET ;; FMPY f32 = f72, f64 FMPY f33 = f73, f64 FMPY f34 = f72, f66 FMPY f35 = f73, f66 ;; FMA_C f64 = f73, f65, f32 FMA_D f65 = f72, f65, f33 FMA_C f66 = f73, f67, f34 FMA_D f67 = f72, f67, f35 ;; FNMA f80 = f74, f64, f80 FMA_A f81 = f75, f64, f81 FNMA f82 = f74, f66, f82 FMA_A f83 = f75, f66, f83 ;; FMA_B f80 = f75, f65, f80 FNMA f81 = f74, f65, f81 FMA_B f82 = f75, f67, f82 FNMA f83 = f74, f67, f83 ;; FNMA f96 = f76, f64, f96 FMA_A f97 = f77, f64, f97 FNMA f98 = f76, f66, f98 FMA_A f99 = f77, f66, f99 ;; FMA_B f96 = f77, f65, f96 FNMA f97 = f76, f65, f97 FMA_B f98 = f77, f67, f98 FNMA f99 = f76, f67, f99 ;; FNMA f112 = f78, f64, f112 FMA_A f113 = f79, f64, f113 FNMA f114 = f78, f66, f114 FMA_A f115 = f79, f66, f115 ;; FMA_B f112 = f79, f65, f112 FNMA f113 = f78, f65, f113 FMA_B f114 = f79, f67, f114 FNMA f115 = f78, f67, f115 ;; FMPY f32 = f90, f80 FMPY f33 = f91, f80 FMPY f34 = f90, f82 FMPY f35 = f91, f82 ;; FMA_C f80 = f91, f81, f32 FMA_D f81 = f90, f81, f33 FMA_C f82 = f91, f83, f34 FMA_D f83 = f90, f83, f35 ;; FNMA f96 = f92, f80, f96 FMA_A f97 = f93, f80, f97 FNMA f98 = f92, f82, f98 FMA_A f99 = f93, f82, f99 ;; FMA_B f96 = f93, f81, f96 FNMA f97 = f92, f81, f97 FMA_B f98 = f93, f83, f98 FNMA f99 = f92, f83, f99 ;; FNMA f112 = f94, f80, f112 FMA_A f113 = f95, f80, f113 FNMA f114 = f94, f82, f114 FMA_A f115 = f95, f82, f115 ;; FMA_B f112 = f95, f81, f112 FNMA f113 = f94, f81, f113 FMA_B f114 = f95, f83, f114 FNMA f115 = f94, f83, f115 ;; FMPY f32 = f108, f96 FMPY f33 = f109, f96 FMPY f34 = f108, f98 FMPY f35 = f109, f98 ;; FMA_C f96 = f109, f97, f32 FMA_D f97 = f108, f97, f33 FMA_C f98 = f109, f99, f34 FMA_D f99 = f108, f99, f35 ;; FNMA f112 = f110, f96, f112 FMA_A f113 = f111, f96, f113 FNMA f114 = f110, f98, f114 FMA_A f115 = f111, f98, f115 ;; FMA_B f112 = f111, f97, f112 FNMA f113 = f110, f97, f113 FMA_B f114 = f111, f99, f114 FNMA f115 = f110, f99, f115 ;; FMPY f32 = f126, f112 FMPY f33 = f127, f112 FMPY f34 = f126, f114 FMPY f35 = f127, f114 ;; FMA_C f112 = f127, f113, f32 FMA_D f113 = f126, f113, f33 FMA_C f114 = f127, f115, f34 FMA_D f115 = f126, f115, f35 ;;#endif#ifdef RT adds BOFFSET = 30 * SIZE, BOFFSET ;; LDFPD f72, f73 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f74, f75 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f76, f77 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f78, f79 = [BOFFSET] adds BOFFSET = - 4 * SIZE, BOFFSET ;; LDFPD f88, f89 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f90, f91 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f92, f93 = [BOFFSET] adds BOFFSET = - 6 * SIZE, BOFFSET ;; LDFPD f104, f105 = [BOFFSET] adds BOFFSET = - 2 * SIZE, BOFFSET ;; LDFPD f106, f107 = [BOFFSET] adds BOFFSET = - 8 * SIZE, BOFFSET ;; LDFPD f120, f121 = [BOFFSET] ;; FMPY f32 = f72, f112 FMPY f33 = f73, f112 FMPY f34 = f72, f114 FMPY f35 = f73, f114 ;; FMA_C f112 = f73, f113, f32 FMA_D f113 = f72, f113, f33 FMA_C f114 = f73, f115, f34 FMA_D f115 = f72, f115, f35 ;; FNMA f96 = f74, f112, f96 FMA_A f97 = f75, f112, f97 FNMA f98 = f74, f114, f98 FMA_A f99 = f75, f114, f99 ;; FMA_B f96 = f75, f113, f96 FNMA f97 = f74, f113, f97 FMA_B f98 = f75, f115, f98 FNMA f99 = f74, f115, f99 ;; FNMA f80 = f76, f112, f80 FMA_A f81 = f77, f112, f81 FNMA f82 = f76, f114, f82 FMA_A f83 = f77, f114, f83 ;; FMA_B f80 = f77, f113, f80 FNMA f81 = f76, f113, f81 FMA_B f82 = f77, f115, f82 FNMA f83 = f76, f115, f83 ;; FNMA f64 = f78, f112, f64 FMA_A f65 = f79, f112, f65 FNMA f66 = f78, f114, f66 FMA_A f67 = f79, f114, f67 ;; FMA_B f64 = f79, f113, f64 FNMA f65 = f78, f113, f65 FMA_B f66 = f79, f115, f66 FNMA f67 = f78, f115, f67 ;; FMPY f32 = f88, f96 FMPY f33 = f89, f96 FMPY f34 = f88, f98 FMPY f35 = f89, f98 ;; FMA_C f96 = f89, f97, f32 FMA_D f97 = f88, f97, f33 FMA_C f98 = f89, f99, f34 FMA_D f99 = f88, f99, f35 ;; FNMA f80 = f90, f96, f80 FMA_A f81 = f91, f96, f81 FNMA f82 = f90, f98, f82 FMA_A f83 = f91, f98, f83 ;; FMA_B f80 = f91, f97, f80 FNMA f81 = f90, f97, f81 FMA_B f82 = f91, f99, f82 FNMA f83 = f90, f99, f83 ;; FNMA f64 = f92, f96, f64 FMA_A f65 = f93, f96, f65 FNMA f66 = f92, f98, f66 FMA_A f67 = f93, f98, f67 ;; FMA_B f64 = f93, f97, f64 FNMA f65 = f92, f97, f65 FMA_B f66 = f93, f99, f66 FNMA f67 = f92, f99, f67 ;; FMPY f32 = f104, f80 FMPY f33 = f105, f80 FMPY f34 = f104, f82 FMPY f35 = f105, f82 ;; FMA_C f80 = f105, f81, f32 FMA_D f81 = f104, f81, f33 FMA_C f82 = f105, f83, f34 FMA_D f83 = f104, f83, f35 ;; FNMA f64 = f106, f80, f64 FMA_A f65 = f107, f80, f65 FNMA f66 = f106, f82, f66 FMA_A f67 = f107, f82, f67 ;; FMA_B f64 = f107, f81, f64 FNMA f65 = f106, f81, f65 FMA_B f66 = f107, f83, f66 FNMA f67 = f106, f83, f67 ;; FMPY f32 = f120, f64 FMPY f33 = f121, f64 FMPY f34 = f120, f66 FMPY f35 = f121, f66 ;; FMA_C f64 = f121, f65, f32 FMA_D f65 = f120, f65, f33 FMA_C f66 = f121, f67, f34 FMA_D f67 = f120, f67, f35 ;;#endif#if defined(LN) || defined(LT) adds BOFFSET2 = 4 * SIZE, BOFFSET ;; STFD [BOFFSET] = f64, SIZE STFD [BOFFSET2] = f96, SIZE ;; STFD [BOFFSET] = f65, SIZE STFD [BOFFSET2] = f97, SIZE ;; STFD [BOFFSET] = f80, SIZE STFD [BOFFSET2] = f112, SIZE ;; STFD [BOFFSET] = f81, 5 * SIZE STFD [BOFFSET2] = f113, 5 * SIZE ;; STFD [BOFFSET] = f66, SIZE STFD [BOFFSET2] = f98, SIZE ;; STFD [BOFFSET] = f67, SIZE STFD [BOFFSET2] = f99, SIZE ;; STFD [BOFFSET] = f82, SIZE STFD [BOFFSET2] = f114, SIZE ;; STFD [BOFFSET] = f83, 5 * SIZE STFD [BOFFSET2] = f115, 5 * SIZE ;; adds BOFFSET = - 16 * SIZE, BOFFSET ;;#else adds AOFFSET2 = 4 * SIZE, AOFFSET ;; STFD [AOFFSET] = f64, SIZE STFD [AOFFSET2] = f80, SIZE ;; STFD [AOFFSET] = f65, SIZE STFD [AOFFSET2] = f81, SIZE ;; STFD [AOFFSET] = f66, SIZE STFD [AOFFSET2] = f82, SIZE ;; STFD [AOFFSET] = f67, 5 * SIZE STFD [AOFFSET2] = f83, 5 * SIZE ;; STFD [AOFFSET] = f96, SIZE STFD [AOFFSET2] = f112, SIZE ;; STFD [AOFFSET] = f97, SIZE STFD [AOFFSET2] = f113, SIZE ;; STFD [AOFFSET] = f98, SIZE STFD [AOFFSET2] = f114, SIZE ;; STFD [AOFFSET] = f99, 5 * SIZE STFD [AOFFSET2] = f115, 5 * SIZE ;; adds AOFFSET = - 16 * SIZE, AOFFSET ;;#endif#ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4#endif ;; STFD [C1 ] = f64, SIZE ;; STFD [C1 ] = f65, SIZE ;; STFD [C1 ] = f66, SIZE ;; STFD [C1 ] = f67, SIZE ;; STFD [C2 ] = f80, SIZE ;; STFD [C2 ] = f81, SIZE ;; STFD [C2 ] = f82, SIZE ;; STFD [C2 ] = f83, SIZE ;; STFD [C3 ] = f96, SIZE ;; STFD [C3 ] = f97, SIZE ;; STFD [C3 ] = f98, SIZE ;; STFD [C3 ] = f99, SIZE ;; STFD [C4 ] = f112, SIZE ;; STFD [C4 ] = f113, SIZE ;; STFD [C4 ] = f114, SIZE ;; STFD [C4 ] = f115, SIZE ;; mov f64 = f0 mov f65 = f0 mov f80 = f0 mov f81 = f0 mov f96 = f0 mov f97 = f0 mov f112 = f0 mov f113 = f0 ;;#ifdef LN adds C1 = -4 * SIZE, C1 adds C2 = -4 * SIZE, C2 adds C3 = -4 * SIZE, C3 adds C4 = -4 * SIZE, C4#endif ;; cmp.ne p6, p0 = 1, I ;; adds I = -1, I ;; shladd r2 = K, ZBASE_SHIFT, r0 ;; sub L = K, KK ;;#ifdef RT shladd AORIG = r2, 1, AORIG#endif ;;#if defined(LT) || defined(RN) shladd L = L, ZBASE_SHIFT, r0 ;; shladd AOFFSET = L, 1, AOFFSET shladd BOFFSET = L, 2, BOFFSET#endif ;;#ifdef LT adds KK = 2, KK#elif defined LN adds KK = -2, KK#else nop __LINE__#endif ;;#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif ;; .align 16.L010x:#if defined(LT) || defined(RN) mov L = KK#else sub L = K, KK#endif shr I = M, 2 ;; cmp.eq p6, p7 = 0, I (p6) br.cond.dpnt .L049 ;; .align 16.L011: { .mmi cmp.ne p7, p0 = r0, L adds BOFFSET = 0 * SIZE, B shl r2 = K, 2 + ZBASE_SHIFT } { .mfi shladd r3 = KK, ZBASE_SHIFT, r0 mov f118 = f0 nop __LINE__ } ;;#if defined(LT) || defined(RN) { .mfb (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f66 = f0 nop __LINE__ } { .mmf nop __LINE__ nop __LINE__ mov f67 = f0 } ;;#else { .mfi shladd BOFFSET = r3, 2, B mov f66 = f0#ifdef LN sub AORIG = AORIG, r2#else nop __LINE__#endif } ;; { .mfi (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE mov f67 = f0 shladd AOFFSET = r3, 2, AORIG } ;;#endif ;; { .mfi (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 nop __LINE__ } { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } ;; { .mfi (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f98 = f0 adds L = 1, L } { .mfi (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE mov f99 = f0 adds C5 = 4 * SIZE, C1 } ;; { .mfi (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f114 = f0 tbit.z p12, p0 = L, 0 } { .mfi (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f115 = f0 adds C6 = 4 * SIZE, C2 } ;; { .mfi (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f68 = f0 shr L = L, 1 } { .mfi setf.d f86 = r0 mov f69 = f0 adds C7 = 4 * SIZE, C3 } ;; { .mfi CPREFETCH [PREC], LDC mov f84 = f0 adds L = -1, L } { .mfi setf.d f87 = r0 mov f85 = f0 adds C8 = 4 * SIZE, C4 } ;; { .mfi CPREFETCH [PREC], LDC mov f100 = f0 mov ar.lc = L } { .mfi setf.d f102 = r0 mov f101 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f116 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } { .mfi setf.d f103 = r0 mov f117 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC] mov f70 = f0 cmp.eq p6, p0 = -1, L } { .mfb setf.d f119 = r0 mov f71 = f0 (p6) br.cond.dpnt .L018 } ;; .align 16.L012:/* 1 */ { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfb (p12) cmp.ne p3, p0 = 0, L FMA_B f65 = f32, f49, f65 // A1 * B2 nop __LINE__ } ;;/* 2 */ { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 nop __LINE__ } { .mfb cmp.ne p4, p5 = 0, L FMA_B f81 = f32, f51, f81 // A1 * B4 nop __LINE__ } ;;/* 3 */ { .mfb (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 nop __LINE__ } { .mfb FMA_B f97 = f32, f53, f97 // A1 * B6 nop __LINE__ } ;;/* 4 */ { .mfb (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 nop __LINE__ } { .mfb FMA_B f113 = f32, f55, f113 // A1 * B8 nop __LINE__ } ;;/* 5 */ { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 nop __LINE__ } { .mfb FMA_A f64 = f33, f49, f64 // A2 * B2 nop __LINE__ } ;;/* 6 */ { .mfb (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 nop __LINE__ } { .mfb FMA_A f80 = f33, f51, f80 // A2 * B4 nop __LINE__ } ;;/* 7 */ { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb FMA_A f96 = f33, f53, f96 // A2 * B6 nop __LINE__ } ;;/* 8 */ { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb FMA_A f112 = f33, f55, f112 // A2 * B8 nop __LINE__ } ;;/* 9 */ { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb FMA_B f67 = f34, f49, f67 // A3 * B2 nop __LINE__ } ;;/* 10 */ { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb FMA_B f83 = f34, f51, f83 // A3 * B4 nop __LINE__ } ;;/* 11 */ { .mfb FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA_B f115 = f34, f55, f115 // A3 * B8 nop __LINE__ } ;;/* 13 */ { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 } { .mfb nop __LINE__ FMA_A f66 = f35, f49, f66 // A4 * B2 nop __LINE__
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -