📄 trsm_kernel_rt.s
字号:
// NOTE(review): this is an excerpt from the middle of a larger routine.
// The line structure was restored (one statement per line, preprocessor
// directives at the start of a line, labels on their own line); the
// instruction and directive sequence itself is unchanged.
//
// Names such as AOFFSET, AOFFSET2, BOFFSET, BOFFSET2, AORIG, A, B, C, C1,
// C2, C3, K, KK, KK8, L, M, N, I, LDC, OFFSET, PREA, PREB and the macros
// LDFPD, LDFD, STFD, FMA, FMPY, FNMA, FSUB, SIZE, BASE_SHIFT and
// PREFETCHSIZE are defined outside the visible text.
// Exactly which of LN / LT / RN / RT is defined selects the variant.
//
// NOTE(review): the first #endif below closes a conditional opened before
// this excerpt (presumably "#ifdef RT", by analogy with the later solve
// sections -- confirm against the full file).

	// Tail of a solve step: FMPY of f72-f75 by f32, FNMA update of
	// f64-f67 using f33, FMPY of f64-f67 by f34, then write-back
	// through AOFFSET / AOFFSET2.
	LDFPD	f33, f32 = [BOFFSET]
	adds	BOFFSET = - 2 * SIZE, BOFFSET
	;;
	LDFD	f34 = [BOFFSET]
	;;
	FMPY	f72 = f72, f32
	FMPY	f73 = f73, f32
	FMPY	f74 = f74, f32
	FMPY	f75 = f75, f32
	;;
	FNMA	f64 = f72, f33, f64
	FNMA	f65 = f73, f33, f65
	FNMA	f66 = f74, f33, f66
	FNMA	f67 = f75, f33, f67
	;;
	FMPY	f64 = f64, f34
	FMPY	f65 = f65, f34
	FMPY	f66 = f66, f34
	FMPY	f67 = f67, f34
	;;
	STFD	[AOFFSET] = f64, SIZE
	STFD	[AOFFSET2] = f72, SIZE
	;;
	STFD	[AOFFSET] = f65, SIZE
	STFD	[AOFFSET2] = f73, SIZE
	;;
	STFD	[AOFFSET] = f66, SIZE
	STFD	[AOFFSET2] = f74, SIZE
	;;
	STFD	[AOFFSET] = f67, - 3 * SIZE
	STFD	[AOFFSET2] = f75, - 3 * SIZE
	;;
#endif

	// Store f64-f67 through C1 and f72-f75 through C2, resetting the
	// accumulators to f0. Under LN the last store of each group
	// post-increments by -3 * SIZE instead of SIZE.
	{ .mmf
	STFD	[C1 ] = f64, SIZE
	mov	f64 = f0
	}
	;;
	{ .mmi
	STFD	[C1 ] = f65, SIZE
	}
	;;
	{ .mmi
	STFD	[C1 ] = f66, SIZE
	}
	;;
	{ .mmi
#ifndef LN
	STFD	[C1 ] = f67, SIZE
#else
	STFD	[C1 ] = f67, - 3 * SIZE
#endif
	}
	;;
	{ .mmf
	STFD	[C2 ] = f72, SIZE
	mov	f72 = f0
	}
	;;
	{ .mmi
	STFD	[C2 ] = f73, SIZE
	}
	;;
	{ .mmi
	STFD	[C2 ] = f74, SIZE
	}
	;;
	{ .mmi
#ifndef LN
	STFD	[C2 ] = f75, SIZE
#else
	STFD	[C2 ] = f75, - 3 * SIZE
#endif
	}
	;;
	mov	f65 = f0
	mov	f73 = f0
	mov	f66 = f0
	mov	f74 = f0
	mov	f67 = f0
	mov	f75 = f0
	;;

	// Pointer / counter bookkeeping after the block:
	//   r2 = K << BASE_SHIFT, L = K - KK
	//   RT:    AORIG += r2 << 2
	//   LT/RN: L <<= BASE_SHIFT; AOFFSET += L << 2; BOFFSET += L << 1
	//   LT:    KK += 4      LN: KK -= 4
	shladd	r2 = K, BASE_SHIFT, r0
	;;
	{ .mmi
	sub	L = K, KK
	}
	;;
	{ .mmi
#ifdef RT
	shladd	AORIG = r2, 2, AORIG
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	L = L, BASE_SHIFT, r0
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	AOFFSET = L, 2, AOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	BOFFSET = L, 1, BOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#ifdef LT
	adds	KK = 4, KK
#elif defined LN
	adds	KK = -4, KK
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	.align 8

// ---------------------------------------------------------------------
// .L110: section entered when bit 1 of M is set; otherwise control goes
// straight to .L120. Accumulates into f64, f65, f72, f73.
// ---------------------------------------------------------------------
.L110:
	tbit.z	p6, p7 = M, 1
	(p6)	br.cond.dptk .L120
	;;
	{ .mib
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	// p7 = (L != 0); BOFFSET = B; r2 = K << (1 + BASE_SHIFT);
	// r3 = KK << BASE_SHIFT.
	{ .mmi
	cmp.ne	p7, p0 = r0, L
	adds	BOFFSET = 0 * SIZE, B
	shl	r2 = K, 1 + BASE_SHIFT
	}
	{ .mmi
	shladd	r3 = KK, BASE_SHIFT, r0
	nop	__LINE__
	nop	__LINE__
	}
	;;
#if defined(LT) || defined(RN)
	{ .mmf
	(p7)	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	}
	;;
#else
	// LN/RT: BOFFSET = B + (r3 << 1); LN additionally steps AORIG back
	// by r2; AOFFSET = AORIG + (r3 << 1).
	{ .mfi
	shladd	BOFFSET = r3, 1, B
#ifdef LN
	sub	AORIG = AORIG, r2
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mfi
	(p7)	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	shladd	AOFFSET = r3, 1, AORIG
	}
	;;
#endif
	// Loop count: L = ((L + 1) >> 1) - 1, moved into ar.lc.
	// p12 is set when bit 0 of (L + 1) is clear; p3 starts out true.
	// If the resulting count is -1 the loop is skipped (.L118).
	{ .mfi
	adds	L = 1, L
	}
	{ .mfi
	adds	PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
	cmp.eq	p3, p0 = r0, r0
	}
	;;
	{ .mfi
	tbit.z	p12, p0 = L, 0
	}
	{ .mfi
	shr	L = L, 1
	}
	;;
	{ .mmf
	adds	L = -1, L
	}
	;;
	{ .mmf
	cmp.eq	p6, p0 = -1, L
	}
	;;
	{ .mib
	(p7)	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	mov	ar.lc = L
	(p6)	br.cond.dpnt .L118
	}
	;;

// Counted loop (br.cloop). Each pass does one unconditional FMA step on
// f32/f33 x f48/f49 and a second step on f40/f41 x f56/f57 predicated on
// p3; the reloads for the next pass are predicated on p4 (L != 0).
.L112:
	{ .mfi
	lfetch.nt1	[PREA], 4 * SIZE
	FMA	f64 = f32, f48, f64	// A1 * B1
	cmp.ne	p4, p5 = 0, L
	}
	{ .mfi
	lfetch.nt1	[PREB], 4 * SIZE
	FMA	f72 = f32, f49, f72	// A1 * B2
	(p12)	cmp.ne	p3, p0 = 0, L
	}
	;;
	{ .mmf
	(p3)	LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
	(p3)	LDFPD	f56, f57 = [BOFFSET], 2 * SIZE
	FMA	f65 = f33, f48, f65	// A2 * B1
	}
	{ .mmf
	nop	__LINE__
	nop	__LINE__
	FMA	f73 = f33, f49, f73	// A2 * B2
	}
	;;
	{ .mfb
	(p4)	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	(p3)	FMA	f64 = f40, f56, f64	// A1 * B1
	nop	__LINE__
	}
	{ .mfb
	nop	__LINE__
	(p3)	FMA	f72 = f40, f57, f72	// A1 * B2
	nop	__LINE__
	}
	;;
	{ .mfi
	(p4)	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	(p3)	FMA	f65 = f41, f56, f65	// A2 * B1
	adds	L = -1, L
	}
	{ .mfb
	nop	__LINE__
	(p3)	FMA	f73 = f41, f57, f73	// A2 * B2
	br.cloop.sptk.few .L112
	}
	;;
	.align 8

// .L118: solve / write-back for the section started at .L110.
.L118:
#if defined(LN) || defined(RT)
	// Reposition AOFFSET / BOFFSET from AORIG / B using
	// (KK - 2) << BASE_SHIFT. Both arms of the inner conditional use -2.
#ifdef LN
	adds	r2 = -2, KK
#else
	adds	r2 = -2, KK
#endif
	;;
	shladd	r2 = r2, BASE_SHIFT, r0
	;;
	shladd	AOFFSET = r2, 1, AORIG
	shladd	BOFFSET = r2, 1, B
	;;
#endif
	adds	AOFFSET2 = 4 * SIZE, AOFFSET
	adds	BOFFSET2 = 4 * SIZE, BOFFSET
	;;
	// Load four values (via BOFFSET for LN/LT, via AOFFSET otherwise) and
	// FSUB the accumulators from them. Note the different pairing of
	// loaded registers with accumulators in the two arms.
#if defined(LN) || defined(LT)
	LDFPD	f32, f33 = [BOFFSET], 2 * SIZE
	;;
	LDFPD	f34, f35 = [BOFFSET]
	adds	BOFFSET = -2 * SIZE, BOFFSET
	;;
	FSUB	f64 = f32, f64
	FSUB	f72 = f33, f72
	FSUB	f65 = f34, f65
	FSUB	f73 = f35, f73
	;;
#else
	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	;;
	LDFPD	f34, f35 = [AOFFSET]
	adds	AOFFSET = -2 * SIZE, AOFFSET
	;;
	FSUB	f64 = f32, f64
	FSUB	f65 = f33, f65
	FSUB	f72 = f34, f72
	FSUB	f73 = f35, f73
	;;
#endif
#ifdef LN
	// LN: coefficients read via AOFFSET (offsets 2, 3, then 0);
	// results written back via BOFFSET; C1 / C2 stepped back by 2.
	adds	AOFFSET = 2 * SIZE, AOFFSET
	;;
	LDFPD	f33, f32 = [AOFFSET]
	adds	AOFFSET = - 2 * SIZE, AOFFSET
	;;
	LDFD	f34 = [AOFFSET]
	;;
	FMPY	f65 = f65, f32
	FMPY	f73 = f73, f32
	;;
	FNMA	f64 = f65, f33, f64
	FNMA	f72 = f73, f33, f72
	;;
	FMPY	f64 = f64, f34
	FMPY	f72 = f72, f34
	;;
	STFD	[BOFFSET] = f64, SIZE
	;;
	STFD	[BOFFSET] = f72, SIZE
	;;
	STFD	[BOFFSET] = f65, SIZE
	;;
	STFD	[BOFFSET] = f73, - 3 * SIZE
	;;
	adds	C1 = -2 * SIZE, C1
	adds	C2 = -2 * SIZE, C2
	;;
#endif
#ifdef LT
	// LT: coefficients read via AOFFSET (offsets 0, 1, then 3);
	// results written back via BOFFSET.
	LDFPD	f32, f33 = [AOFFSET]
	adds	AOFFSET = 3 * SIZE, AOFFSET
	;;
	LDFD	f34 = [AOFFSET], - 3 * SIZE
	;;
	FMPY	f64 = f64, f32
	FMPY	f72 = f72, f32
	;;
	FNMA	f65 = f64, f33, f65
	FNMA	f73 = f72, f33, f73
	;;
	FMPY	f65 = f65, f34
	FMPY	f73 = f73, f34
	;;
	STFD	[BOFFSET] = f64, SIZE
	;;
	STFD	[BOFFSET] = f72, SIZE
	;;
	STFD	[BOFFSET] = f65, SIZE
	;;
	STFD	[BOFFSET] = f73, -3 * SIZE
	;;
#endif
#ifdef RN
	// RN: coefficients read via BOFFSET (offsets 0, 1, then 3);
	// results written back via AOFFSET.
	LDFPD	f32, f33 = [BOFFSET]
	adds	BOFFSET = 3 * SIZE, BOFFSET
	;;
	LDFD	f34 = [BOFFSET], -3 * SIZE
	;;
	FMPY	f64 = f64, f32
	FMPY	f65 = f65, f32
	;;
	FNMA	f72 = f64, f33, f72
	FNMA	f73 = f65, f33, f73
	;;
	FMPY	f72 = f72, f34
	FMPY	f73 = f73, f34
	;;
	STFD	[AOFFSET] = f64, SIZE
	;;
	STFD	[AOFFSET] = f65, SIZE
	;;
	STFD	[AOFFSET] = f72, SIZE
	;;
	STFD	[AOFFSET] = f73, -3 * SIZE
	;;
#endif
#ifdef RT
	// RT: coefficients read via BOFFSET (offsets 2, 3, then 0);
	// results written back via AOFFSET.
	adds	BOFFSET = 2 * SIZE, BOFFSET
	;;
	LDFPD	f33, f32 = [BOFFSET]
	adds	BOFFSET = - 2 * SIZE, BOFFSET
	;;
	LDFD	f34 = [BOFFSET]
	;;
	FMPY	f72 = f72, f32
	FMPY	f73 = f73, f32
	;;
	FNMA	f64 = f72, f33, f64
	FNMA	f65 = f73, f33, f65
	;;
	FMPY	f64 = f64, f34
	FMPY	f65 = f65, f34
	;;
	STFD	[AOFFSET] = f64, SIZE
	;;
	STFD	[AOFFSET] = f65, SIZE
	;;
	STFD	[AOFFSET] = f72, SIZE
	;;
	STFD	[AOFFSET] = f73, -3 * SIZE
	;;
#endif
	// Store f64/f65 through C1 and f72/f73 through C2; under LN the
	// second store of each pair post-increments by -SIZE.
	STFD	[C1 ] = f64, SIZE
	mov	f64 = f0
	;;
#ifndef LN
	STFD	[C1 ] = f65, SIZE
#else
	STFD	[C1 ] = f65, -SIZE
#endif
	;;
	STFD	[C2 ] = f72, SIZE
	mov	f72 = f0
	;;
#ifndef LN
	STFD	[C2 ] = f73, SIZE
#else
	STFD	[C2 ] = f73, -SIZE
#endif
	;;
	mov	f65 = f0
	mov	f73 = f0
	;;
	// Bookkeeping, same shape as before .L110 but with step 2:
	//   RT:    AORIG += r2 << 1
	//   LT/RN: AOFFSET += L << 1; BOFFSET += L << 1
	//   LT:    KK += 2      LN: KK -= 2
	shladd	r2 = K, BASE_SHIFT, r0
	;;
	sub	L = K, KK
	;;
#ifdef RT
	shladd	AORIG = r2, 1, AORIG
#else
	nop	__LINE__
#endif
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	L = L, BASE_SHIFT, r0
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	AOFFSET = L, 1, AOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	BOFFSET = L, 1, BOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#ifdef LT
	adds	KK = 2, KK
#elif defined LN
	adds	KK = -2, KK
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	.align 8

// ---------------------------------------------------------------------
// .L120: section entered when bit 0 of M is set; otherwise control goes
// to .L129. Accumulates into f64 and f72.
// ---------------------------------------------------------------------
.L120:
	tbit.z	p6, p7 = M, 0
	(p6)	br.cond.dptk .L129
	;;
	{ .mib
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	// p7 = (L != 0); BOFFSET = B; r2 = K << (0 + BASE_SHIFT);
	// r3 = KK << BASE_SHIFT.
	{ .mmi
	cmp.ne	p7, p0 = r0, L
	adds	BOFFSET = 0 * SIZE, B
	shl	r2 = K, 0 + BASE_SHIFT
	}
	{ .mmi
	shladd	r3 = KK, BASE_SHIFT, r0
	nop	__LINE__
	nop	__LINE__
	}
	;;
#if defined(LT) || defined(RN)
	{ .mmf
	(p7)	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	}
	;;
#else
	// LN/RT: BOFFSET = B + (r3 << 1); LN additionally steps AORIG back
	// by r2; AOFFSET = AORIG + r3.
	{ .mfi
	shladd	BOFFSET = r3, 1, B
#ifdef LN
	sub	AORIG = AORIG, r2
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mfi
	(p7)	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	add	AOFFSET = r3, AORIG
	}
	;;
#endif
	// Loop count computed as in .L110: L = ((L + 1) >> 1) - 1 -> ar.lc;
	// a count of -1 skips the loop (.L128).
	{ .mmi
	adds	L = 1, L
	adds	PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
	cmp.eq	p3, p0 = r0, r0
	}
	;;
	{ .mii
	tbit.z	p12, p0 = L, 0
	shr	L = L, 1
	}
	;;
	{ .mmi
	adds	L = -1, L
	}
	;;
	{ .mmi
	cmp.eq	p6, p0 = -1, L
	}
	;;
	{ .mib
	(p7)	LDFD	f32 = [AOFFSET], 1 * SIZE
	mov	ar.lc = L
	(p6)	br.cond.dpnt .L128
	}
	;;
	.align 8

// Counted loop (br.cloop): one unconditional FMA step on f32 x f48/f49
// and a second step on f40 x f56/f57 predicated on p3.
.L122:
	{ .mfi
	FMA	f64 = f32, f48, f64	// A1 * B1
	cmp.ne	p4, p5 = 0, L
	}
	{ .mfi
	nop	__LINE__
	FMA	f72 = f32, f49, f72	// A1 * B2
	(p12)	cmp.ne	p3, p0 = 0, L
	}
	;;
	{ .mmi
	(p3)	LDFPD	f56, f57 = [BOFFSET], 2 * SIZE
	(p3)	LDFD	f40 = [AOFFSET], 1 * SIZE
	nop	__LINE__
	}
	{ .mmi
	nop	__LINE__
	nop	__LINE__
	nop	__LINE__
	}
	;;
	{ .mfi
	(p4)	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE
	(p3)	FMA	f64 = f40, f56, f64	// A1 * B1
	adds	L = -1, L
	}
	{ .mfb
	(p4)	LDFD	f32 = [AOFFSET], 1 * SIZE
	(p3)	FMA	f72 = f40, f57, f72	// A1 * B2
	br.cloop.sptk.few .L122
	}
	;;

// .L128: solve / write-back for the section started at .L120.
.L128:
#if defined(LN) || defined(RT)
	// Reposition AOFFSET / BOFFSET: LN uses KK - 1, RT uses KK - 2.
#ifdef LN
	adds	r2 = -1, KK
#else
	adds	r2 = -2, KK
#endif
	;;
	shladd	r2 = r2, BASE_SHIFT, r0
	;;
	add	AOFFSET = r2, AORIG
	shladd	BOFFSET = r2, 1, B
	;;
#endif
	adds	AOFFSET2 = 4 * SIZE, AOFFSET
	adds	BOFFSET2 = 4 * SIZE, BOFFSET
	;;
#if defined(LN) || defined(LT)
	LDFPD	f32, f33 = [BOFFSET]
	;;
	FSUB	f64 = f32, f64
	FSUB	f72 = f33, f72
	;;
#else
	LDFPD	f32, f33 = [AOFFSET]
	;;
	FSUB	f64 = f32, f64
	FSUB	f72 = f33, f72
	;;
#endif
#ifdef LN
	LDFD	f32 = [AOFFSET]
	;;
	FMPY	f64 = f64, f32
	FMPY	f72 = f72, f32
	;;
	{ .mmi
	STFD	[BOFFSET] = f64, SIZE
	adds	C1 = -1 * SIZE, C1
	}
	;;
	{ .mmi
	STFD	[BOFFSET] = f72, -SIZE
	adds	C2 = -1 * SIZE, C2
	}
	;;
#endif
#ifdef LT
	LDFD	f32 = [AOFFSET]
	;;
	FMPY	f64 = f64, f32
	FMPY	f72 = f72, f32
	;;
	STFD	[BOFFSET] = f64, SIZE
	;;
	STFD	[BOFFSET] = f72, -SIZE
	;;
#endif
#ifdef RN
	LDFPD	f32, f33 = [BOFFSET]
	adds	BOFFSET = 3 * SIZE, BOFFSET
	;;
	LDFD	f34 = [BOFFSET], -3 * SIZE
	;;
	FMPY	f64 = f64, f32
	;;
	FNMA	f72 = f64, f33, f72
	;;
	FMPY	f72 = f72, f34
	;;
	STFD	[AOFFSET] = f64, SIZE
	;;
	STFD	[AOFFSET] = f72, -SIZE
	;;
#endif
#ifdef RT
	adds	BOFFSET = 2 * SIZE, BOFFSET
	;;
	LDFPD	f33, f32 = [BOFFSET]
	adds	BOFFSET = - 2 * SIZE, BOFFSET
	;;
	LDFD	f34 = [BOFFSET]
	;;
	FMPY	f72 = f72, f32
	;;
	FNMA	f64 = f72, f33, f64
	;;
	FMPY	f64 = f64, f34
	;;
	STFD	[AOFFSET] = f64, SIZE
	;;
	STFD	[AOFFSET] = f72, -SIZE
	;;
#endif
	// Store f64 through C1 and f72 through C2; under LN the stores do
	// not post-increment.
#ifndef LN
	STFD	[C1 ] = f64, SIZE
#else
	STFD	[C1 ] = f64
#endif
#ifndef LN
	STFD	[C2 ] = f72, SIZE
#else
	STFD	[C2 ] = f72
#endif
	mov	f64 = f0
	mov	f72 = f0
	;;
	// Bookkeeping with step 1:
	//   RT:    AORIG += r2
	//   LT/RN: AOFFSET += L; BOFFSET += L << 1   (L = (K - KK) << BASE_SHIFT)
	//   LT:    KK += 1      LN: KK -= 1
	shladd	r2 = K, BASE_SHIFT, r0
	;;
	sub	L = K, KK
	;;
#ifdef RT
	add	AORIG = r2, AORIG
#else
	nop	__LINE__
#endif
	;;
#if defined(LT) || defined(RN)
	shladd	L = L, BASE_SHIFT, r0
#else
	nop	__LINE__
#endif
	;;
#if defined(LT) || defined(RN)
	add	AOFFSET = L, AOFFSET
#else
	nop	__LINE__
#endif
	;;
#if defined(LT) || defined(RN)
	shladd	BOFFSET = L, 1, BOFFSET
#else
	nop	__LINE__
#endif
	;;
#ifdef LT
	adds	KK = 1, KK
#elif defined LN
	adds	KK = -1, KK
#else
	nop	__LINE__
#endif
	;;
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	;;
	.align 8

// .L129: end of this column group.
//   LN:    B += (K << BASE_SHIFT) << 1   (through KK8)
//   LT/RN: B = BOFFSET
//   RN:    KK += 2      RT: KK -= 2
// AOFFSET is reset to A.
.L129:
#ifdef LN
	shladd	KK8 = K, BASE_SHIFT, r0
	;;
	shladd	B = KK8, 1, B
#endif
#if defined(LT) || defined(RN)
	mov	B = BOFFSET
#endif
#ifdef RN
	adds	KK = 2, KK
#endif
#ifdef RT
	adds	KK = -2, KK
#endif
	;;
	mov	AOFFSET = A
	;;
	.align 16

// ---------------------------------------------------------------------
// .L050: branches to .L000 when bit 2 of N is clear.
// RT first steps B back by K << (2 + BASE_SHIFT) and C back by LDC << 2.
// NOTE(review): the excerpt ends in the middle of this section (inside an
// open bundle, with a truncated trailing comment); the remainder is not
// visible here and has been left exactly as found.
// ---------------------------------------------------------------------
.L050:
	{ .mib
	setf.d	f64 = r0
	tbit.z	p6, p0 = N, 2
	(p6)	br.cond.dpnt .L000
	}
	;;
#ifdef RT
	{ .mmi
	shladd	r3 = LDC, 2, r0
	nop	__LINE__
	shl	r2 = K, 2 + BASE_SHIFT
	}
	;;
	{ .mmi
	sub	B = B, r2
	sub	C = C, r3
	nop	__LINE__
	}
#endif
	;;
	{ .mfi
	setf.d	f72 = r0
	mov	f80 = f0
	shr	I = M, 3
	}
	{ .mfi
	mov	C1 = C			// coffset1 = c + 0 * ldc
	mov	f88 = f0
#ifdef LN
	add	KK = M, OFFSET
#elif defined LT
	mov	KK = OFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmf
	cmp.eq	p6, p7 = 0, I
#if defined(LN) || defined(RT)
	mov	AORIG = A
#else
	mov	AOFFSET = A
#endif
	mov	f65 = f0
	}
	{ .mmf
	add	C2 = LDC, C		// coffset2 = c + 1 * ldc
	shladd	C3 = LDC, 1, C		// coff
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -