📄 zgemm3m_kernel.s
字号:
(p5) LDFD f11 = [C2], -SIZE FMA f120 = f32, f55, f120 // A1 * B8 } ;; { .mmf (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE (p4) LDFD f32 = [AOFFSET], 1 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 } { .mmf (p5) LDFD f12 = [C3], SIZE (p5) LDFD f14 = [C4], SIZE (p3) FMA f72 = f40, f57, f72 // A1 * B2 } ;; { .mfi (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mmf (p5) LDFD f13 = [C3], -SIZE (p5) LDFD f15 = [C4], -SIZE (p3) FMA f88 = f40, f59, f88 // A1 * B4 } ;; { .mfi (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mmf (p5) LDFD f16 = [C5], SIZE (p5) LDFD f18 = [C6], SIZE (p3) FMA f104 = f40, f61, f104 // A1 * B6 } ;; { .mfi (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 adds L = -1, L } { .mmb (p5) LDFD f17 = [C5], -SIZE (p5) LDFD f19 = [C6], -SIZE nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } { .mmb (p5) LDFD f20 = [C7], SIZE (p5) LDFD f22 = [C8], SIZE br.cloop.sptk.few .L042 } ;; { .mmf LDFD f21 = [C7], -SIZE LDFD f23 = [C8], -SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f10 = ALPHA_R, f72, f10 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_I, f64, f7 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f72, f11 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f12 = ALPHA_R, f80, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f14 = ALPHA_R, f88, f14 } ;; { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_I, f80, f13 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f88, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C2 ] = f10, SIZE FMA f16 = ALPHA_R, f96, f16 } { .mmf nop __LINE__ nop __LINE__ FMA f18 = ALPHA_R, f104, f18 } ;; { .mmf STFD [C1 ] = f7, SIZE STFD [C2 ] = f11, SIZE FMA f17 = ALPHA_I, f96, f17 } { .mmf nop __LINE__ nop __LINE__ FMA f19 = ALPHA_I, f104, f19 } ;; { .mmf STFD [C3 ] = f12, SIZE STFD [C4 ] = f14, SIZE FMA f20 = ALPHA_R, f112, f20 } { .mmf nop __LINE__ nop __LINE__ FMA f22 = ALPHA_R, f120, f22 } ;; { .mmf STFD [C3 ] = f13, SIZE STFD [C4 ] = f15, SIZE FMA f21 = ALPHA_I, f112, f21 } { .mmf nop __LINE__ nop __LINE__ FMA f23 = ALPHA_I, f120, f23 } ;; { .mmi STFD [C5 ] = f16, SIZE STFD [C6 ] = f18, SIZE nop __LINE__ } ;; { .mmi STFD [C5 ] = f17, SIZE STFD [C6 ] = f19, SIZE nop __LINE__ } ;; { .mmi STFD [C7 ] = f20, SIZE STFD [C8 ] = f22, SIZE nop __LINE__ } ;; { .mmi STFD [C7 ] = f21, SIZE STFD [C8 ] = f23, SIZE nop __LINE__ } ;; .align 32.L049: { .mmi mov B = BOFFSET mov AOFFSET = A nop __LINE__ } ;; { .mmb nop __LINE__ cmp.lt p6, p0 = 0, J (p6) br.cond.dptk .L010 } ;; .align 32.L050: { .mfi mov C1 = C mov f64 = f0 tbit.z p6, p0 = N, 2 } { .mfi add C2 = LDC, C mov f72 = f0 shr I = M, 3 } ;; { .mfi shladd C3 = LDC, 1, C mov f80 = f0 nop __LINE__ } { .mfb mov AOFFSET = A mov f88 = f0 (p6) br.cond.dpnt .L090 } ;; { .mfi cmp.eq p6, p7 = 0, I mov f65 = f0 nop __LINE__ } { .mfi shladd C4 = LDC, 1, C2 mov f73 = f0 nop __LINE__ } ;; { .mfi nop __LINE__ mov f81 = f0 nop __LINE__ } { .mfb shladd C = LDC, 2, C mov f89 = f0 (p6) br.cond.dpnt .L060 } ;; .align 32.L052: { .mfb LDFPD f48, f49 = [B] mov f66 = f0 nop __LINE__ } { .mfb adds BOFFSET = 2 * SIZE, B mov f74 = f0 nop __LINE__ } ;; { .mfi LDFPD f32, f33 = [AOFFSET], 2 * SIZE mov f82 = f0 nop __LINE__ } { .mfi setf.d f84 = r0 mov f90 = f0 nop __LINE__ } ;; { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f67 = f0 adds PREC = CPREFETCHSIZE * SIZE, C1 } { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f75 = f0 adds L = 1, K } ;; { .mfi LDFPD f36, f37 = [AOFFSET], 2 * SIZE mov f83 = f0 tbit.z p12, p0 = L, 0 } { .mfi setf.d f91 = r0 mov f68 = f0 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } ;; { .mfi CPREFETCH [PREC], LDC mov f76 = f0 adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET } { .mfi LDFPD f38, f39 = [AOFFSET], 2 * SIZE mov f92 = f0 cmp.eq p3, p0 = r0, r0 } ;; { .mfi CPREFETCH [PREC], LDC mov f69 = f0 shr L = L, 1 } { .mmf setf.d f77 = r0 setf.d f85 = r0 mov f93 = f0 } ;; { .mfi CPREFETCH [PREC], LDC mov f70 = f0 adds L = -1, L } { .mmf setf.d f78 = r0 setf.d f86 = r0 mov f94 = f0 } ;; { .mfi CPREFETCH [PREC] mov f71 = f0 mov ar.lc = L } { .mmf setf.d f79 = r0 setf.d f87 = r0 mov f95 = f0 } ;; .align 32.L053: { .mfb lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 nop __LINE__ } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 8 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfi nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 adds C9 = 4 * SIZE, C1 } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 adds C10 = 4 * SIZE, C2 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 adds C11 = 4 * SIZE, C3 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 adds C12 = 4 * SIZE, C4 } { .mfb nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f68 = f36, f48, f68 // A5 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f76 = f36, f49, f76 // A5 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f84 = f36, f50, f84 // A5 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f92 = f36, f51, f92 // A5 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f69 = f37, f48, f69 // A6 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f77 = f37, f49, f77 // A6 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f85 = f37, f50, f85 // A6 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f93 = f37, f51, f93 // A6 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f70 = f38, f48, f70 // A7 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f78 = f38, f49, f78 // A7 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f86 = f38, f50, f86 // A7 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f94 = f38, f51, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f71 = f39, f48, f71 // A8 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f79 = f39, f49, f79 // A8 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f87 = f39, f50, f87 // A8 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f95 = f39, f51, f95 // A8 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f6 = [C1 ], SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f12 = [C1 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb (p5) LDFD f13 = [C9 ], SIZE (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f14 = [C1 ], 5 * SIZE (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb (p5) LDFD f15 = [C9 ], 5 * SIZE (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f16 = [C1 ], SIZE (p3) FMA f68 = f44, f56, f68 // A5 * B1 nop __LINE__ } { .mfb (p5) LDFD f17 = [C9], SIZE (p3) FMA f76 = f44, f57, f76 // A5 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f18 = [C1 ], SIZE (p3) FMA f84 = f44, f58, f84 // A5 * B3 nop __LINE__ } { .mfb (p5) LDFD f19 = [C9], SIZE (p3) FMA f92 = f44, f59, f92 // A5 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f20 = [C1 ], SIZE (p3) FMA f69 = f45, f56, f69 // A6 * B1 nop __LINE__ } { .mfb (p5) LDFD f21 = [C9], SIZE (p3) FMA f77 = f45, f57, f77 // A6 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f22 = [C1 ], -11 * SIZE (p3) FMA f85 = f45, f58, f85 // A6 * B3 nop __LINE__ } { .mfb (p5) LDFD f23 = [C9 ], -11 * SIZE (p3) FMA f93 = f45, f59, f93 // A6 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f24 = [C2 ], SIZE (p3) FMA f70 = f46, f56, f70 // A7 * B1 nop __LINE__ } { .mfb (p5) LDFD f25 = [C10], SIZE (p3) FMA f78 = f46, f57, f78 // A7 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f26 = [C2 ], SIZE (p3) FMA f86 = f46, f58, f86 // A7 * B3 nop __LINE__ } { .mfb (p5) LDFD f27 = [C10], SIZE (p3) FMA f94 = f46, f59, f94 // A7 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f28 = [C2 ], SIZE (p3) FMA f71 = f47, f56, f71 // A8 * B1 nop __LINE__ } { .mfb (p5) LDFD f29 = [C10], SIZE (p3) FMA f79 = f47, f57, f79 // A8 * B2 nop __LINE__ } ;; { .mfi (p5) LDFD f30 = [C2 ], 5 * SIZE (p3) FMA f87 = f47, f58, f87 // A8 * B3 adds L = -1, L } { .mfb (p5) LDFD f31 = [C10], 5 * SIZE (p3) FMA f95 = f47, f59, f95 // A8 * B4 br.cloop.sptk.few .L053 } ;; .align 32.L058: { .mmf LDFD f32 = [C2 ], SIZE LDFD f33 = [C10], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f34 = [C2 ], SIZE LDFD f35 = [C10], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f36 = [C2 ], SIZE LDFD f37 = [C10], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f38 = [C2 ], - 11 * SIZE LDFD f39 = [C10], - 11 * SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f68, f16 } { .mmf LDFD f48 = [C3 ], SIZE LDFD f49 = [C11], SIZE FMA f17 = ALPHA_R, f70, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f68, f18 } { .mmf LDFD f50 = [C3 ], SIZE LDFD f51 = [C11], SIZE FMA f19 = ALPHA_I, f70, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f69, f20 } { .mmf LDFD f52 = [C3 ], SIZE LDFD f53 = [C11], SIZE FMA f21 = ALPHA_R, f71, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f69, f22 } { .mmf LDFD f54 = [C3 ], 5 * SIZE LDFD f55 = [C11], 5 * SIZE FMA f23 = ALPHA_I, f71, f23 } ;; { .mmf STFD [C1 ] = f16, SIZE STFD [C9 ] = f17, SIZE FMA f24 = ALPHA_R, f72, f24 } { .mmf LDFD f40 = [C3 ], SIZE LDFD f41 = [C11], SIZE FMA f25 = ALPHA_R, f74, f25 } ;; { .mmf STFD [C1 ] = f18, SIZE STFD [C9 ] = f19, SIZE FMA f26 = ALPHA_I, f72, f26 } { .mmf LDFD f42 = [C3 ], SIZE LDFD f43 = [C11], SIZE FMA f27 = ALPHA_I, f74, f27 } ;; { .mmf STFD [C1 ] = f20, SIZE STFD [C9 ] = f21, SIZE FMA f28 = ALPHA_R, f73, f28 } { .mmf LDFD f44 = [C3 ], SIZE LDFD f45 = [C11], SIZE FMA f29 = ALPHA_R, f75, f29 } ;; { .mmf STF
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -