⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm3m_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
	FMA	f30 = ALPHA_I, f81, f30	}	{ .mmf	LDFD	f110 = [C6 ], -3 * SIZE	LDFD	f111 = [C14], -3 * SIZE	FMA	f31 = ALPHA_I, f83, f31	}	;;	{ .mmf	STFD	[C3 ] = f24, SIZE	STFD	[C11] = f25, SIZE	FMA	f68 = ALPHA_R, f88, f68	}	{ .mmf	LDFD	f116 = [C7 ], SIZE	LDFD	f117 = [C15], SIZE	FMA	f69 = ALPHA_R, f90, f69	}	;;	{ .mmf	STFD	[C3 ] = f26, SIZE	STFD	[C11] = f27, SIZE	FMA	f70 = ALPHA_I, f88, f70	}	{ .mmf	LDFD	f118 = [C7 ], SIZE	LDFD	f119 = [C15], SIZE	FMA	f71 = ALPHA_I, f90, f71	}	;;	{ .mmf	STFD	[C3 ] = f28, SIZE	STFD	[C11] = f29, SIZE	FMA	f76 = ALPHA_R, f89, f76	}	{ .mmf	LDFD	f124 = [C7 ], SIZE	LDFD	f125 = [C15], SIZE	FMA	f77 = ALPHA_R, f91, f77	}	;;	{ .mmf	STFD	[C3 ] = f30, 5 * SIZE	STFD	[C11] = f31, 5 * SIZE	FMA	f78 = ALPHA_I, f89, f78	}	{ .mmf	LDFD	f126 = [C7 ], -3 * SIZE	LDFD	f127 = [C15], -3 * SIZE	FMA	f79 = ALPHA_I, f91, f79	}	;;	{ .mmf	STFD	[C4 ] = f68, SIZE	STFD	[C12] = f69, SIZE	FMA	f84 = ALPHA_R, f96, f84	}	{ .mmf	LDFD	f32 = [C8 ], SIZE	LDFD	f33 = [C16], SIZE	FMA	f85 = ALPHA_R, f98, f85	}	;;	{ .mmf	STFD	[C4 ] = f70, SIZE	STFD	[C12] = f71, SIZE	FMA	f86 = ALPHA_I, f96, f86	}	{ .mmf	LDFD	f34 = [C8 ], SIZE	LDFD	f35 = [C16], SIZE	FMA	f87 = ALPHA_I, f98, f87	}	;;	{ .mmf	STFD	[C4 ] = f76, SIZE	STFD	[C12] = f77, SIZE	FMA	f92 = ALPHA_R, f97, f92	}	{ .mmf	LDFD	f36 = [C8 ], SIZE	LDFD	f37 = [C16], SIZE	FMA	f93 = ALPHA_R, f99, f93	}	;;	{ .mmf	STFD	[C4 ] = f78, 5 * SIZE	STFD	[C12] = f79, 5 * SIZE	FMA	f94 = ALPHA_I, f97, f94	}	{ .mmf	LDFD	f38 = [C8 ], -3 * SIZE	LDFD	f39 = [C16], -3 * SIZE	FMA	f95 = ALPHA_I, f99, f95	}	;;	{ .mmf	STFD	[C5 ] = f84, SIZE	STFD	[C13] = f85, SIZE	FMA	f100 = ALPHA_R, f104, f100	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f101 = ALPHA_R, f106, f101	}	;;	{ .mmf	STFD	[C5 ] = f86, SIZE	STFD	[C13] = f87, SIZE	FMA	f102 = ALPHA_I, f104, f102	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f103 = ALPHA_I, f106, f103	}	;;	{ .mmf	STFD	[C5 ] = f92, SIZE	STFD	[C13] = f93, SIZE	FMA	f108 = ALPHA_R, f105, f108	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f109 = ALPHA_R, f107, f109	}	;;	{ .mmf	STFD	[C5 ] = f94, 5 * SIZE	STFD	[C13] = f95, 5 * SIZE	FMA	f110 = ALPHA_I, f105, f110	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f111 = ALPHA_I, f107, f111	}	;;	{ .mmf	STFD	[C6 ] = f100, SIZE	STFD	[C14] = f101, SIZE	FMA	f116 = ALPHA_R, f112, f116	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f117 = ALPHA_R, f114, f117	}	;;	{ .mmf	STFD	[C6 ] = f102, SIZE	STFD	[C14] = f103, SIZE	FMA	f118 = ALPHA_I, f112, f118	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f119 = ALPHA_I, f114, f119	}	;;	{ .mmf	STFD	[C6 ] = f108, SIZE	STFD	[C14] = f109, SIZE	FMA	f124 = ALPHA_R, f113, f124	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f125 = ALPHA_R, f115, f125	}	;;	{ .mmf	STFD	[C6 ] = f110, 5 * SIZE	STFD	[C14] = f111, 5 * SIZE	FMA	f126 = ALPHA_I, f113, f126	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f127 = ALPHA_I, f115, f127	}	;;	{ .mmf	STFD	[C7 ] = f116, SIZE	STFD	[C15] = f117, SIZE	FMA	f32 = ALPHA_R, f120, f32	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f33 = ALPHA_R, f122, f33	}	;;	{ .mmf	STFD	[C7 ] = f118, SIZE	STFD	[C15] = f119, SIZE	FMA	f34 = ALPHA_I, f120, f34	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f35 = ALPHA_I, f122, f35	}	;;	{ .mmf	STFD	[C7 ] = f124, SIZE	STFD	[C15] = f125, SIZE	FMA	f36 = ALPHA_R, f121, f36	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f37 = ALPHA_R, f123, f37	}	;;	{ .mmf	STFD	[C7 ] = f126, 5 * SIZE	STFD	[C15] = f127, 5 * SIZE	FMA	f38 = ALPHA_I, f121, f38	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f39 = ALPHA_I, f123, f39	}	;;	{ .mmf	STFD	[C8 ] = f32, SIZE	STFD	[C16] = f33, SIZE	mov	f64  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f72  = f0	}	;;	{ .mmf	STFD	[C8 ] = f34, SIZE	STFD	[C16] = f35, SIZE	mov	f80  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f88  = f0	}	;;	{ .mmf	STFD	[C8 ] = f36, SIZE	STFD	[C16] = f37, SIZE	mov	f96  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f104 = f0	}	;;	{ .mmf	STFD	[C8 ] = f38, 5 * SIZE	STFD	[C16] = f39, 5 * SIZE	mov	f112 = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f120 = f0	}	;;	.align 32.L030:	{ .mib	nop	__LINE__	tbit.z	p6, p7 = M, 1	(p6)	br.cond.dptk .L040	}	;;	{ .mfi	LDFPD	f48, f49 = [B]	mov	f65  = f0	nop	__LINE__	}	{ .mfi	adds	BOFFSET = 2 * SIZE, B	mov	f73  = f0	adds	L =  1, K	}	;;	{ .mfi	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE	mov	f81  = f0	tbit.z	p12, p0 = L, 0	}	{ .mfi	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE	mov	f89  = f0	shr	L = L, 1	}	;;	{ .mfi	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	mov	f97  = f0	adds	L =  -1, L	}	{ .mfi	nop	__LINE__	mov	f105 = f0	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET	}	;;	{ .mfi	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET	mov	f113 = f0	mov	ar.lc = L	}	{ .mfi	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	mov	f121 = f0	cmp.eq	p3, p0 = r0, r0	}	;;	.align 32.L032:	{ .mfb	lfetch.nt1	[PREA],  4 * SIZE	FMA	f64   = f32, f48, f64	// A1 * B1	nop	__LINE__	}	{ .mfi	nop	__LINE__	FMA	f72   = f32, f49, f72	// A1 * B2	(p12) cmp.ne p3, p0 =  0, L	}	;;	{ .mfi	lfetch.nt1	[PREB],  16 * SIZE	FMA	f80   = f32, f50, f80	// A1 * B3	cmp.ne	p4, p5 =  0, L	}	{ .mfb	nop	__LINE__	FMA	f88   = f32, f51, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE	FMA	f96   = f32, f52, f96	// A1 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f104  = f32, f53, f104	// A1 * B6	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE	FMA	f112  = f32, f54, f112	// A1 * B7	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f120  = f32, f55, f120	// A1 * B8	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE	FMA	f65   = f33, f48, f65	// A2 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f73   = f33, f49, f73	// A2 * B2	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE	FMA	f81   = f33, f50, f81	// A2 * B3	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f89   = f33, f51, f89	// A2 * B4	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE	FMA	f97   = f33, f52, f97	// A2 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f105  = f33, f53, f105	// A2 * B6	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f113  = f33, f54, f113	// A2 * B7	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f121  = f33, f55, f121	// A2 * B8	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE	(p3) FMA	f64   = f40, f56, f64	// A1 * B1	nop	__LINE__	}	{ .mfb	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE	(p3) FMA	f72   = f40, f57, f72	// A1 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE	(p3) FMA	f80   = f40, f58, f80	// A1 * B3	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f88   = f40, f59, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	(p3) FMA	f96   = f40, f60, f96	// A1 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f104  = f40, f61, f104	// A1 * B6	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f6   = [C1], SIZE	(p3) FMA	f112  = f40, f62, f112	// A1 * B7	nop	__LINE__	}	{ .mfb	(p5) LDFD	f12  = [C2], SIZE	(p3) FMA	f120  = f40, f63, f120	// A1 * B8	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	(p3) FMA	f65   = f41, f56, f65	// A2 * B1	nop	__LINE__	}	{ .mfb	(p3) FMA	f73   = f41, f57, f73	// A2 * B2	nop	__LINE__	}	{ .mfb	(p5) LDFD	f7   = [C1], SIZE	(p3) FMA	f81   = f41, f58, f81	// A2 * B3	nop	__LINE__	}	{ .mfb	(p5) LDFD	f13  = [C2], SIZE	(p3) FMA	f89   = f41, f59, f89	// A2 * B4	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f10  = [C1], SIZE	(p3) FMA	f97   = f41, f60, f97	// A2 * B5	nop	__LINE__	}	{ .mfb	(p5) LDFD	f14  = [C2], SIZE	(p3) FMA	f105  = f41, f61, f105	// A2 * B6	nop	__LINE__	}	;;	{ .mfi	(p5) LDFD	f11  = [C1], -3 * SIZE	(p3) FMA	f113  = f41, f62, f113	// A2 * B7	adds	L = -1, L	}	{ .mfb	(p5) LDFD	f15  = [C2], -3 * SIZE	(p3) FMA	f121  = f41, f63, f121	// A2 * B8	br.cloop.sptk.few .L032	}	;;.L038:	{ .mmf	LDFD	f16  = [C3], SIZE	LDFD	f20  = [C4], SIZE	FMA	f6  = ALPHA_R, f64, f6	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f12 = ALPHA_R, f72, f12	}	;;	{ .mmf	LDFD	f17  = [C3], SIZE	LDFD	f21  = [C4], SIZE	FMA	f7  = ALPHA_I, f64, f7	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f13 = ALPHA_I, f72, f13	}	;;	{ .mmf	LDFD	f18  = [C3], SIZE	LDFD	f22  = [C4], SIZE	FMA	f10 = ALPHA_R, f65, f10	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f14 = ALPHA_R, f73, f14	}	;;	{ .mmf	LDFD	f19  = [C3], - 3 * SIZE	LDFD	f23  = [C4], - 3 * SIZE	FMA	f11 = ALPHA_I, f65, f11	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f15 = ALPHA_I, f73, f15	}	;;	{ .mmf	STFD	[C1] = f6,  SIZE	STFD	[C2] = f12, SIZE	FMA	f16 = ALPHA_R, f80, f16	}	{ .mmf	LDFD	f24  = [C5], SIZE	LDFD	f28  = [C6], SIZE	FMA	f20 = ALPHA_R, f88, f20	}	;;	{ .mmf	STFD	[C1] = f7,  SIZE	STFD	[C2] = f13, SIZE	FMA	f17 = ALPHA_I, f80, f17	}	{ .mmf	LDFD	f25  = [C5], SIZE	LDFD	f29  = [C6], SIZE	FMA	f21 = ALPHA_I, f88, f21	}	;;	{ .mmf	STFD	[C1] = f10, SIZE	STFD	[C2] = f14, SIZE	FMA	f18 = ALPHA_R, f81, f18	}	{ .mmf	LDFD	f26  = [C5], SIZE	LDFD	f30  = [C6], SIZE	FMA	f22 = ALPHA_R, f89, f22	}	;;	{ .mmf	STFD	[C1] = f11, SIZE	STFD	[C2] = f15, SIZE	FMA	f19 = ALPHA_I, f81, f19	}	{ .mmf	LDFD	f27  = [C5], - 3 * SIZE	LDFD	f31  = [C6], - 3 * SIZE	FMA	f23 = ALPHA_I, f89, f23	}	;;	{ .mmf	STFD	[C3] = f16, SIZE	STFD	[C4] = f20, SIZE	FMA	f24 = ALPHA_R, f96,  f24	}	{ .mmf	LDFD	f32  = [C7], SIZE	LDFD	f36  = [C8], SIZE	FMA	f28 = ALPHA_R, f104, f28	}	;;	{ .mmf	STFD	[C3] = f17, SIZE	STFD	[C4] = f21, SIZE	FMA	f25 = ALPHA_I, f96,  f25	}	{ .mmf	LDFD	f33  = [C7], SIZE	LDFD	f37  = [C8], SIZE	FMA	f29 = ALPHA_I, f104, f29	}	;;	{ .mmf	STFD	[C3] = f18, SIZE	STFD	[C4] = f22, SIZE	FMA	f26 = ALPHA_R, f97,  f26	}	{ .mmf	LDFD	f34  = [C7], SIZE	LDFD	f38  = [C8], SIZE	FMA	f30 = ALPHA_R, f105, f30	}	;;	{ .mmf	STFD	[C3] = f19, SIZE	STFD	[C4] = f23, SIZE	FMA	f27 = ALPHA_I, f97,  f27	}	{ .mmf	LDFD	f35  = [C7], - 3 * SIZE	LDFD	f39  = [C8], - 3 * SIZE	FMA	f31 = ALPHA_I, f105, f31	}	;;	{ .mmf	STFD	[C5] = f24, SIZE	STFD	[C6] = f28, SIZE	FMA	f32 = ALPHA_R, f112, f32	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f36 = ALPHA_R, f120, f36	}	;;	{ .mmf	STFD	[C5] = f25, SIZE	STFD	[C6] = f29, SIZE	FMA	f33 = ALPHA_I, f112, f33	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f37 = ALPHA_I, f120, f37	}	;;	{ .mmf	STFD	[C5] = f26, SIZE	STFD	[C6] = f30, SIZE	FMA	f34 = ALPHA_R, f113, f34	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f38 = ALPHA_R, f121, f38	}	;;	{ .mmf	STFD	[C5] = f27, SIZE	STFD	[C6] = f31, SIZE	FMA	f35 = ALPHA_I, f113,  f35	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f39 = ALPHA_I, f121, f39	}	;;	{ .mmf	STFD	[C7] = f32, SIZE	STFD	[C8] = f36, SIZE	mov	f64  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f72  = f0	}	;;	{ .mmf	STFD	[C7] = f33, SIZE	STFD	[C8] = f37, SIZE	mov	f80  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f88  = f0	}	;;	{ .mmf	STFD	[C7] = f34, SIZE	STFD	[C8] = f38, SIZE	mov	f96  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f104 = f0	}	;;	{ .mmf	STFD	[C7] = f35, SIZE	STFD	[C8] = f39, SIZE	mov	f112 = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f120 = f0	}	;;	.align 32.L040:	{ .mib	nop	__LINE__	tbit.z	p6, p7 = M, 0	(p6)	br.cond.dptk .L049	}	;;	{ .mmi	LDFPD	f48, f49 = [B]	adds	BOFFSET = 2 * SIZE, B	adds	L =  1, K	}	;;	{ .mii	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE	tbit.z	p12, p0 = L, 0	shr	L = L, 1	}	;;	{ .mmi	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	LDFD	f32 = [AOFFSET], 1 * SIZE	adds	L =  -1, L	}	;;	{ .mmi	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET	cmp.eq	p3, p0 = r0, r0	mov	ar.lc = L	}	{ .mmi	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET	nop	__LINE__	}	;;	.align 32.L042:	{ .mfb	lfetch.nt1	[PREB],  16 * SIZE	FMA	f64   = f32, f48, f64	// A1 * B1	nop	__LINE__	}	{ .mfb	(p12) cmp.ne p3, p0 =  0, L	FMA	f72   = f32, f49, f72	// A1 * B2	nop	__LINE__	}	;;	{ .mfi	(p3) LDFD	f40 = [AOFFSET], 1 * SIZE	FMA	f80   = f32, f50, f80	// A1 * B3	cmp.ne	p4, p5 =  0, L	}	{ .mfb	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE	FMA	f88   = f32, f51, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfi	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE	FMA	f96   = f32, f52, f96	// A1 * B5	nop	__LINE__	}	{ .mmf	(p5) LDFD	f6   = [C1], SIZE	(p5) LDFD	f10  = [C2], SIZE	FMA	f104  = f32, f53, f104	// A1 * B6	}	;;	{ .mfi	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE	FMA	f112  = f32, f54, f112	// A1 * B7	nop	__LINE__	}	{ .mmf	(p5) LDFD	f7   = [C1], -SIZE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -