📄 zgemm3m_kernel.s
字号:
{ .mmf nop __LINE__ nop __LINE__ FMA f49 = ALPHA_I, f110, f49 } ;; { .mmf STFD [C6 ] = f34, SIZE STFD [C14] = f35, SIZE FMA f50 = ALPHA_R, f109, f50 } { .mmf nop __LINE__ nop __LINE__ FMA f51 = ALPHA_R, f111, f51 } ;; { .mmf STFD [C6 ] = f36, 5 * SIZE STFD [C14] = f37, 5 * SIZE FMA f52 = ALPHA_I, f109, f52 } { .mmf nop __LINE__ nop __LINE__ FMA f53 = ALPHA_I, f111, f53 } ;; { .mmf STFD [C6 ] = f38, SIZE STFD [C14] = f39, SIZE FMA f54 = ALPHA_R, f112, f54 } { .mmf nop __LINE__ nop __LINE__ FMA f55 = ALPHA_R, f114, f55 } ;; { .mmf STFD [C6 ] = f48, SIZE STFD [C14] = f49, SIZE FMA f40 = ALPHA_I, f112, f40 } { .mmf nop __LINE__ nop __LINE__ FMA f41 = ALPHA_I, f114, f41 } ;; { .mmf STFD [C6 ] = f50, SIZE STFD [C14] = f51, SIZE FMA f42 = ALPHA_R, f113, f42 } { .mmf nop __LINE__ nop __LINE__ FMA f43 = ALPHA_R, f115, f43 } ;; { .mmf STFD [C6 ] = f52, 5 * SIZE STFD [C14] = f53, 5 * SIZE FMA f44 = ALPHA_I, f113, f44 } { .mmf nop __LINE__ nop __LINE__ FMA f45 = ALPHA_I, f115, f45 } ;; { .mmf STFD [C7 ] = f54, SIZE STFD [C15] = f55, SIZE FMA f46 = ALPHA_R, f116, f46 } { .mmf nop __LINE__ nop __LINE__ FMA f56 = ALPHA_R, f118, f56 } ;; { .mmf STFD [C7 ] = f40, SIZE STFD [C15] = f41, SIZE FMA f57 = ALPHA_I, f116, f57 } { .mmf nop __LINE__ nop __LINE__ FMA f58 = ALPHA_I, f118, f58 } ;; { .mmf STFD [C7 ] = f42, SIZE STFD [C15] = f43, SIZE FMA f59 = ALPHA_R, f117, f59 } { .mmf nop __LINE__ nop __LINE__ FMA f60 = ALPHA_R, f119, f60 } ;; { .mmf STFD [C7 ] = f44, 5 * SIZE STFD [C15] = f45, 5 * SIZE FMA f61 = ALPHA_I, f117, f61 } { .mmf nop __LINE__ nop __LINE__ FMA f62 = ALPHA_I, f119, f62 } ;; { .mmf STFD [C7 ] = f46, SIZE STFD [C15] = f56, SIZE FMA f63 = ALPHA_R, f120, f63 } { .mmf nop __LINE__ nop __LINE__ FMA f47 = ALPHA_R, f122, f47 } ;; { .mmf STFD [C7 ] = f57, SIZE STFD [C15] = f58, SIZE FMA f64 = ALPHA_I, f120, f64 } { .mmf nop __LINE__ nop __LINE__ FMA f65 = ALPHA_I, f122, f65 } ;; { .mmf STFD [C7 ] = f59, SIZE STFD [C15] = f60, SIZE FMA f6 = ALPHA_R, f121, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f123, f7 } ;; { .mmf STFD [C7 ] = f61, 5 * SIZE STFD [C15] = f62, 5 * SIZE FMA f10 = ALPHA_I, f121, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f123, f11 } ;; { .mmf STFD [C8 ] = f63, SIZE STFD [C16] = f47, SIZE FMA f12 = ALPHA_R, f124, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f126, f13 } ;; { .mmf STFD [C8 ] = f64, SIZE STFD [C16] = f65, SIZE FMA f14 = ALPHA_I, f124, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f126, f15 } ;; { .mmf STFD [C8 ] = f6, SIZE STFD [C16] = f7, SIZE FMA f16 = ALPHA_R, f125, f16 } { .mmf nop __LINE__ nop __LINE__ FMA f17 = ALPHA_R, f127, f17 } ;; { .mmf STFD [C8 ] = f10, 5 * SIZE STFD [C16] = f11, 5 * SIZE FMA f18 = ALPHA_I, f125, f18 } { .mmf nop __LINE__ nop __LINE__ FMA f19 = ALPHA_I, f127, f19 } ;; { .mmf STFD [C8 ] = f12, SIZE STFD [C16] = f13, SIZE mov f64 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f72 = f0 } ;; { .mmf STFD [C8 ] = f14, SIZE STFD [C16] = f15, SIZE mov f80 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f88 = f0 } ;; { .mmf STFD [C8 ] = f16, SIZE STFD [C16] = f17, SIZE mov f96 = f0 } { .mmf nop __LINE__ nop __LINE__ mov f104 = f0 } ;; { .mmf STFD [C8 ] = f18, 5 * SIZE STFD [C16] = f19, 5 * SIZE mov f112 = f0 } { .mfb adds I = -1, I mov f120 = f0 (p6) br.cond.dptk .L011 } ;;.L020: { .mfi cmp.eq p3, p0 = r0, r0 mov f89 = f0 tbit.z p6, p7 = M, 2 } { .mfb nop __LINE__ mov f81 = f0 (p6) br.cond.dptk .L030 } ;; { .mfi LDFPD f48, f49 = [B] mov f65 = f0 nop __LINE__ } { .mfi adds BOFFSET = 2 * SIZE, B mov f73 = f0 adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET } ;; { .mmf LDFPD f32, f33 = [AOFFSET], 2 * SIZE setf.d f97 = r0 mov f105 = f0 } { .mfi setf.d f113 = r0 mov f121 = f0 adds L = 1, K } ;; { .mmf LDFPD f50, f51 = [BOFFSET], 2 * SIZE setf.d f66 = r0 mov f74 = f0 } { .mfi setf.d f82 = r0 mov f90 = f0 tbit.z p12, p0 = L, 0 } ;; { .mmf LDFPD f52, f53 = [BOFFSET], 2 * SIZE setf.d f98 = r0 mov f106 = f0 } { .mfi setf.d f114 = r0 mov f122 = f0 shr L = L, 1 } ;; { .mfi LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f75 = f0 adds L = -1, L } { .mmf setf.d f67 = r0 setf.d f83 = r0 mov f91 = f0 } ;; { .mfi LDFPD f34, f35 = [AOFFSET], 2 * SIZE mov f107 = f0 mov ar.lc = L } { .mmf setf.d f99 = r0 setf.d f115 = r0 mov f123 = f0 } ;; .align 32.L022: { .mfi lfetch.nt1 [PREA], 16 * SIZE FMA f64 = f32, f48, f64 // A1 * B1 adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET } { .mfi nop __LINE__ FMA f72 = f32, f49, f72 // A1 * B2 (p12) cmp.ne p3, p0 = 0, L } ;; { .mfi lfetch.nt1 [PREB], 16 * SIZE FMA f80 = f32, f50, f80 // A1 * B3 cmp.ne p4, p5 = 0, L } { .mfb nop __LINE__ FMA f88 = f32, f51, f88 // A1 * B4 nop __LINE__ } ;; { .mfi (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE FMA f96 = f32, f52, f96 // A1 * B5 (p5) adds C9 = 4 * SIZE, C1 } { .mfi nop __LINE__ FMA f104 = f32, f53, f104 // A1 * B6 (p5) adds C10 = 4 * SIZE, C2 } ;; { .mfi (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE FMA f112 = f32, f54, f112 // A1 * B7 (p5) adds C11 = 4 * SIZE, C3 } { .mfi nop __LINE__ FMA f120 = f32, f55, f120 // A1 * B8 (p5) adds C12 = 4 * SIZE, C4 } ;; { .mfi (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE FMA f65 = f33, f48, f65 // A2 * B1 (p5) adds C13 = 4 * SIZE, C5 } { .mfi nop __LINE__ FMA f73 = f33, f49, f73 // A2 * B2 (p5) adds C14 = 4 * SIZE, C6 } ;; { .mfi (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE FMA f81 = f33, f50, f81 // A2 * B3 (p5) adds C15 = 4 * SIZE, C7 } { .mfi nop __LINE__ FMA f89 = f33, f51, f89 // A2 * B4 (p5) adds C16 = 4 * SIZE, C8 } ;; { .mfb (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE FMA f97 = f33, f52, f97 // A2 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f105 = f33, f53, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE FMA f113 = f33, f54, f113 // A2 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f121 = f33, f55, f121 // A2 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f66 = f34, f48, f66 // A3 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f74 = f34, f49, f74 // A3 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f82 = f34, f50, f82 // A3 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f90 = f34, f51, f90 // A3 * B4 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f98 = f34, f52, f98 // A3 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f114 = f34, f54, f114 // A3 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f122 = f34, f55, f122 // A3 * B8 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f67 = f35, f48, f67 // A4 * B1 nop __LINE__ } { .mfb nop __LINE__ FMA f75 = f35, f49, f75 // A4 * B2 nop __LINE__ } ;; { .mfb nop __LINE__ FMA f83 = f35, f50, f83 // A4 * B3 nop __LINE__ } { .mfb nop __LINE__ FMA f91 = f35, f51, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE FMA f99 = f35, f52, f99 // A4 * B5 nop __LINE__ } { .mfb nop __LINE__ FMA f107 = f35, f53, f107 // A4 * B6 nop __LINE__ } ;; { .mfb (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE FMA f115 = f35, f54, f115 // A4 * B7 nop __LINE__ } { .mfb nop __LINE__ FMA f123 = f35, f55, f123 // A4 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE (p3) FMA f64 = f40, f56, f64 // A1 * B1 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f72 = f40, f57, f72 // A1 * B2 nop __LINE__ } ;; { .mfb (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE (p3) FMA f80 = f40, f58, f80 // A1 * B3 nop __LINE__ } { .mfb nop __LINE__ (p3) FMA f88 = f40, f59, f88 // A1 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f6 = [C1 ], SIZE (p3) FMA f96 = f40, f60, f96 // A1 * B5 nop __LINE__ } { .mfb (p5) LDFD f7 = [C9 ], SIZE (p3) FMA f104 = f40, f61, f104 // A1 * B6 nop __LINE__ } ;; { .mfb (p5) LDFD f10 = [C1 ], SIZE (p3) FMA f112 = f40, f62, f112 // A1 * B7 nop __LINE__ } { .mfb (p5) LDFD f11 = [C9 ], SIZE (p3) FMA f120 = f40, f63, f120 // A1 * B8 nop __LINE__ } ;; { .mfb (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE (p3) FMA f65 = f41, f56, f65 // A2 * B1 nop __LINE__ } { .mfb (p3) FMA f73 = f41, f57, f73 // A2 * B2 nop __LINE__ } { .mfb (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE (p3) FMA f81 = f41, f58, f81 // A2 * B3 nop __LINE__ } { .mfb (p3) FMA f89 = f41, f59, f89 // A2 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f12 = [C1 ], SIZE (p3) FMA f97 = f41, f60, f97 // A2 * B5 nop __LINE__ } { .mfb (p5) LDFD f13 = [C9 ], SIZE (p3) FMA f105 = f41, f61, f105 // A2 * B6 nop __LINE__ } ;; { .mfb (p5) LDFD f14 = [C1 ], - 3 * SIZE (p3) FMA f113 = f41, f62, f113 // A2 * B7 nop __LINE__ } { .mfb (p5) LDFD f15 = [C9 ], - 3 * SIZE (p3) FMA f121 = f41, f63, f121 // A2 * B8 nop __LINE__ } ;; { .mfb (p5) LDFD f16 = [C2 ], SIZE (p3) FMA f66 = f42, f56, f66 // A3 * B1 nop __LINE__ } { .mfb (p5) LDFD f17 = [C10], SIZE (p3) FMA f74 = f42, f57, f74 // A3 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f18 = [C2 ], SIZE (p3) FMA f82 = f42, f58, f82 // A3 * B3 nop __LINE__ } { .mfb (p5) LDFD f19 = [C10], SIZE (p3) FMA f90 = f42, f59, f90 // A3 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f20 = [C2 ], SIZE (p3) FMA f98 = f42, f60, f98 // A3 * B5 nop __LINE__ } { .mfb (p5) LDFD f21 = [C10], SIZE (p3) FMA f106 = f42, f61, f106 // A3 * B6 nop __LINE__ } ;; { .mfb (p5) LDFD f22 = [C2 ], - 3 * SIZE (p3) FMA f114 = f42, f62, f114 // A3 * B7 nop __LINE__ } { .mfb (p5) LDFD f23 = [C10], - 3 * SIZE (p3) FMA f122 = f42, f63, f122 // A3 * B8 nop __LINE__ } ;; { .mfb (p5) LDFD f24 = [C3 ], SIZE (p3) FMA f67 = f43, f56, f67 // A4 * B1 nop __LINE__ } { .mfb (p5) LDFD f25 = [C11], SIZE (p3) FMA f75 = f43, f57, f75 // A4 * B2 nop __LINE__ } ;; { .mfb (p5) LDFD f26 = [C3 ], SIZE (p3) FMA f83 = f43, f58, f83 // A4 * B3 nop __LINE__ } { .mfb (p5) LDFD f27 = [C11], SIZE (p3) FMA f91 = f43, f59, f91 // A4 * B4 nop __LINE__ } ;; { .mfb (p5) LDFD f28 = [C3 ], SIZE (p3) FMA f99 = f43, f60, f99 // A4 * B5 nop __LINE__ } { .mfb (p5) LDFD f29 = [C11], SIZE (p3) FMA f107 = f43, f61, f107 // A4 * B6 nop __LINE__ } ;; { .mfi (p5) LDFD f30 = [C3 ], - 3 * SIZE (p3) FMA f115 = f43, f62, f115 // A4 * B7 adds L = -1, L } { .mfb (p5) LDFD f31 = [C11], - 3 * SIZE (p3) FMA f123 = f43, f63, f123 // A4 * B8 br.cloop.sptk.few .L022 } ;;.L028: { .mmf LDFD f68 = [C4 ], SIZE LDFD f69 = [C12], SIZE FMA f6 = ALPHA_R, f64, f6 } { .mmf nop __LINE__ nop __LINE__ FMA f7 = ALPHA_R, f66, f7 } ;; { .mmf LDFD f70 = [C4 ], SIZE LDFD f71 = [C12], SIZE FMA f10 = ALPHA_I, f64, f10 } { .mmf nop __LINE__ nop __LINE__ FMA f11 = ALPHA_I, f66, f11 } ;; { .mmf LDFD f76 = [C4 ], SIZE LDFD f77 = [C12], SIZE FMA f12 = ALPHA_R, f65, f12 } { .mmf nop __LINE__ nop __LINE__ FMA f13 = ALPHA_R, f67, f13 } ;; { .mmf LDFD f78 = [C4 ], -3 * SIZE LDFD f79 = [C12], -3 * SIZE FMA f14 = ALPHA_I, f65, f14 } { .mmf nop __LINE__ nop __LINE__ FMA f15 = ALPHA_I, f67, f15 } ;; { .mmf STFD [C1 ] = f6, SIZE STFD [C9 ] = f7, SIZE FMA f16 = ALPHA_R, f72, f16 } { .mmf LDFD f84 = [C5 ], SIZE LDFD f85 = [C13], SIZE FMA f17 = ALPHA_R, f74, f17 } ;; { .mmf STFD [C1 ] = f10, SIZE STFD [C9 ] = f11, SIZE FMA f18 = ALPHA_I, f72, f18 } { .mmf LDFD f86 = [C5 ], SIZE LDFD f87 = [C13], SIZE FMA f19 = ALPHA_I, f74, f19 } ;; { .mmf STFD [C1 ] = f12, SIZE STFD [C9 ] = f13, SIZE FMA f20 = ALPHA_R, f73, f20 } { .mmf LDFD f92 = [C5 ], SIZE LDFD f93 = [C13], SIZE FMA f21 = ALPHA_R, f75, f21 } ;; { .mmf STFD [C1 ] = f14, 5 * SIZE STFD [C9 ] = f15, 5 * SIZE FMA f22 = ALPHA_I, f73, f22 } { .mmf LDFD f94 = [C5 ], -3 * SIZE LDFD f95 = [C13], -3 * SIZE FMA f23 = ALPHA_I, f75, f23 } ;; { .mmf STFD [C2 ] = f16, SIZE STFD [C10] = f17, SIZE FMA f24 = ALPHA_R, f80, f24 } { .mmf LDFD f100 = [C6 ], SIZE LDFD f101 = [C14], SIZE FMA f25 = ALPHA_R, f82, f25 } ;; { .mmf STFD [C2 ] = f18, SIZE STFD [C10] = f19, SIZE FMA f26 = ALPHA_I, f80, f26 } { .mmf LDFD f102 = [C6 ], SIZE LDFD f103 = [C14], SIZE FMA f27 = ALPHA_I, f82, f27 } ;; { .mmf STFD [C2 ] = f20, SIZE STFD [C10] = f21, SIZE FMA f28 = ALPHA_R, f81, f28 } { .mmf LDFD f108 = [C6 ], SIZE LDFD f109 = [C14], SIZE FMA f29 = ALPHA_R, f83, f29 } ;; { .mmf STFD [C2 ] = f22, 5 * SIZE STFD [C10] = f23, 5 * SIZE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -