⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm3m_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f49  = ALPHA_I, f110, f49	}	;;	{ .mmf	STFD	[C6 ] = f34, SIZE	STFD	[C14] = f35, SIZE	FMA	f50  = ALPHA_R, f109, f50	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f51  = ALPHA_R, f111, f51	}	;;	{ .mmf	STFD	[C6 ] = f36, 5 * SIZE	STFD	[C14] = f37, 5 * SIZE	FMA	f52  = ALPHA_I, f109, f52	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f53  = ALPHA_I, f111, f53	}	;;	{ .mmf	STFD	[C6 ] = f38, SIZE	STFD	[C14] = f39, SIZE	FMA	f54  = ALPHA_R, f112, f54	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f55  = ALPHA_R, f114, f55	}	;;	{ .mmf	STFD	[C6 ] = f48, SIZE	STFD	[C14] = f49, SIZE	FMA	f40  = ALPHA_I, f112, f40	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f41  = ALPHA_I, f114, f41	}	;;	{ .mmf	STFD	[C6 ] = f50, SIZE	STFD	[C14] = f51, SIZE	FMA	f42  = ALPHA_R, f113, f42	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f43  = ALPHA_R, f115, f43	}	;;	{ .mmf	STFD	[C6 ] = f52, 5 * SIZE	STFD	[C14] = f53, 5 * SIZE	FMA	f44  = ALPHA_I, f113, f44	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f45  = ALPHA_I, f115, f45	}	;;	{ .mmf	STFD	[C7 ] = f54, SIZE	STFD	[C15] = f55, SIZE	FMA	f46  = ALPHA_R, f116, f46	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f56  = ALPHA_R, f118, f56	}	;;	{ .mmf	STFD	[C7 ] = f40, SIZE	STFD	[C15] = f41, SIZE	FMA	f57  = ALPHA_I, f116, f57	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f58  = ALPHA_I, f118, f58	}	;;	{ .mmf	STFD	[C7 ] = f42, SIZE	STFD	[C15] = f43, SIZE	FMA	f59  = ALPHA_R, f117, f59	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f60  = ALPHA_R, f119, f60	}	;;	{ .mmf	STFD	[C7 ] = f44, 5 * SIZE	STFD	[C15] = f45, 5 * SIZE	FMA	f61  = ALPHA_I, f117, f61	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f62  = ALPHA_I, f119, f62	}	;;	{ .mmf	STFD	[C7 ] = f46, SIZE	STFD	[C15] = f56, SIZE	FMA	f63  = ALPHA_R, f120, f63	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f47  = ALPHA_R, f122, f47	}	;;	{ .mmf	STFD	[C7 ] = f57, SIZE	STFD	[C15] = f58, SIZE	FMA	f64  = ALPHA_I, f120, f64	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f65  = ALPHA_I, f122, f65	}	;;	{ .mmf	STFD	[C7 ] = f59, SIZE	STFD	[C15] = f60, SIZE	FMA	f6   = ALPHA_R, f121, f6	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f7   = ALPHA_R, f123, f7	}	;;	{ .mmf	STFD	[C7 ] = f61, 5 * SIZE	STFD	[C15] = f62, 5 * SIZE	FMA	f10  = ALPHA_I, f121, f10	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f11  = ALPHA_I, f123, f11	}	;;	{ .mmf	STFD	[C8 ] = f63, SIZE	STFD	[C16] = f47, SIZE	FMA	f12  = ALPHA_R, f124, f12	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f13  = ALPHA_R, f126, f13	}	;;	{ .mmf	STFD	[C8 ] = f64, SIZE	STFD	[C16] = f65, SIZE	FMA	f14  = ALPHA_I, f124, f14	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f15  = ALPHA_I, f126, f15	}	;;	{ .mmf	STFD	[C8 ] = f6,  SIZE	STFD	[C16] = f7,  SIZE	FMA	f16  = ALPHA_R, f125, f16	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f17  = ALPHA_R, f127, f17	}	;;	{ .mmf	STFD	[C8 ] = f10, 5 * SIZE	STFD	[C16] = f11, 5 * SIZE	FMA	f18  = ALPHA_I, f125, f18	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f19  = ALPHA_I, f127, f19	}	;;	{ .mmf	STFD	[C8 ] = f12, SIZE	STFD	[C16] = f13, SIZE	mov	f64  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f72  = f0	}	;;	{ .mmf	STFD	[C8 ] = f14, SIZE	STFD	[C16] = f15, SIZE 	mov	f80  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f88  = f0	}	;;	{ .mmf	STFD	[C8 ] = f16, SIZE	STFD	[C16] = f17, SIZE	mov	f96  = f0	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f104 = f0	}	;;	{ .mmf	STFD	[C8 ] = f18, 5 * SIZE	STFD	[C16] = f19, 5 * SIZE	mov	f112 = f0	}	{ .mfb	adds	I = -1, I	mov	f120 = f0	(p6)	br.cond.dptk .L011	}	;;.L020:	{ .mfi	cmp.eq	p3, p0 = r0, r0	mov	f89  = f0	tbit.z	p6, p7 = M, 2	}	{ .mfb	nop	__LINE__	mov	f81  = f0	(p6)	br.cond.dptk .L030	}	;;	{ .mfi	LDFPD	f48, f49 = [B]	mov	f65  = f0	nop	__LINE__	}	{ .mfi	adds	BOFFSET = 2 * SIZE, B	mov	f73  = f0	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET	}	;;	{ .mmf	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE	setf.d	f97  = r0	mov	f105 = f0	}	{ .mfi	setf.d	f113 = r0	mov	f121 = f0	adds	L =  1, K	}	;;	{ .mmf	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE	setf.d	f66  = r0	mov	f74  = f0	}	{ .mfi	setf.d	f82  = r0	mov	f90  = f0	tbit.z	p12, p0 = L, 0	}	;;	{ .mmf	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	setf.d	f98   = r0	mov	f106  = f0	}	{ .mfi	setf.d	f114 = r0	mov	f122 = f0	shr	L = L, 1	}	;;	{ .mfi	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	mov	f75  = f0	adds	L =  -1, L	}	{ .mmf	setf.d	f67  = r0	setf.d	f83  = r0	mov	f91  = f0	}	;;	{ .mfi	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE	mov	f107 = f0	mov	ar.lc = L	}	{ .mmf	setf.d	f99  = r0	setf.d	f115 = r0	mov	f123 = f0	}	;;	.align 32.L022:	{ .mfi	lfetch.nt1	[PREA],  16 * SIZE	FMA	f64   = f32, f48, f64	// A1 * B1	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET	}	{ .mfi	nop	__LINE__	FMA	f72   = f32, f49, f72	// A1 * B2	(p12) cmp.ne p3, p0 =  0, L	}	;;	{ .mfi	lfetch.nt1	[PREB],  16 * SIZE	FMA	f80   = f32, f50, f80	// A1 * B3	cmp.ne	p4, p5 =  0, L	}	{ .mfb	nop	__LINE__	FMA	f88   = f32, f51, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfi	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE	FMA	f96   = f32, f52, f96	// A1 * B5	(p5) adds	C9  = 4 * SIZE, C1	}	{ .mfi	nop	__LINE__	FMA	f104  = f32, f53, f104	// A1 * B6	(p5) adds	C10 = 4 * SIZE, C2	}	;;	{ .mfi	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE	FMA	f112  = f32, f54, f112	// A1 * B7	(p5) adds	C11 = 4 * SIZE, C3	}	{ .mfi	nop	__LINE__	FMA	f120  = f32, f55, f120	// A1 * B8	(p5) adds	C12 = 4 * SIZE, C4	}	;;	{ .mfi	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE	FMA	f65   = f33, f48, f65	// A2 * B1	(p5) adds	C13 = 4 * SIZE, C5	}	{ .mfi	nop	__LINE__	FMA	f73   = f33, f49, f73	// A2 * B2	(p5) adds	C14 = 4 * SIZE, C6	}	;;	{ .mfi	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE	FMA	f81   = f33, f50, f81	// A2 * B3	(p5) adds	C15 = 4 * SIZE, C7	}	{ .mfi	nop	__LINE__	FMA	f89   = f33, f51, f89	// A2 * B4	(p5) adds	C16 = 4 * SIZE, C8	}	;;	{ .mfb	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE	FMA	f97   = f33, f52, f97	// A2 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f105  = f33, f53, f105	// A2 * B6	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE	FMA	f113  = f33, f54, f113	// A2 * B7	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f121  = f33, f55, f121	// A2 * B8	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f66   = f34, f48, f66	// A3 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f74   = f34, f49, f74	// A3 * B2	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f82   = f34, f50, f82	// A3 * B3	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f90   = f34, f51, f90	// A3 * B4	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f98   = f34, f52, f98	// A3 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f106  = f34, f53, f106	// A3 * B6	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f114  = f34, f54, f114	// A3 * B7	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f122  = f34, f55, f122	// A3 * B8	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f67   = f35, f48, f67	// A4 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f75   = f35, f49, f75	// A4 * B2	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f83   = f35, f50, f83	// A4 * B3	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f91   = f35, f51, f91	// A4 * B4	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE	FMA	f99   = f35, f52, f99	// A4 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f107  = f35, f53, f107	// A4 * B6	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE	FMA	f115  = f35, f54, f115	// A4 * B7	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f123  = f35, f55, f123	// A4 * B8	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE	(p3) FMA	f64   = f40, f56, f64	// A1 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f72   = f40, f57, f72	// A1 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	(p3) FMA	f80   = f40, f58, f80	// A1 * B3	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f88   = f40, f59, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f6  = [C1 ], SIZE	(p3) FMA	f96   = f40, f60, f96	// A1 * B5	nop	__LINE__	}	{ .mfb	(p5) LDFD	f7  = [C9 ], SIZE	(p3) FMA	f104  = f40, f61, f104	// A1 * B6	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f10 = [C1 ], SIZE	(p3) FMA	f112  = f40, f62, f112	// A1 * B7	nop	__LINE__	}	{ .mfb	(p5) LDFD	f11 = [C9 ], SIZE	(p3) FMA	f120  = f40, f63, f120	// A1 * B8	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	(p3) FMA	f65   = f41, f56, f65	// A2 * B1	nop	__LINE__	}	{ .mfb	(p3) FMA	f73   = f41, f57, f73	// A2 * B2	nop	__LINE__	}	{ .mfb	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE	(p3) FMA	f81   = f41, f58, f81	// A2 * B3	nop	__LINE__	}	{ .mfb	(p3) FMA	f89   = f41, f59, f89	// A2 * B4	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f12 = [C1 ], SIZE	(p3) FMA	f97   = f41, f60, f97	// A2 * B5	nop	__LINE__	}	{ .mfb	(p5) LDFD	f13 = [C9 ], SIZE	(p3) FMA	f105  = f41, f61, f105	// A2 * B6	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f14 = [C1 ], - 3 * SIZE	(p3) FMA	f113  = f41, f62, f113	// A2 * B7	nop	__LINE__	}	{ .mfb	(p5) LDFD	f15 = [C9 ], - 3 * SIZE	(p3) FMA	f121  = f41, f63, f121	// A2 * B8	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f16 = [C2 ], SIZE	(p3) FMA	f66   = f42, f56, f66	// A3 * B1	nop	__LINE__	}	{ .mfb	(p5) LDFD	f17 = [C10], SIZE	(p3) FMA	f74   = f42, f57, f74	// A3 * B2	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f18 = [C2 ], SIZE	(p3) FMA	f82   = f42, f58, f82	// A3 * B3	nop	__LINE__	}	{ .mfb	(p5) LDFD	f19 = [C10], SIZE	(p3) FMA	f90   = f42, f59, f90	// A3 * B4	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f20 = [C2 ], SIZE	(p3) FMA	f98   = f42, f60, f98	// A3 * B5	nop	__LINE__	}	{ .mfb	(p5) LDFD	f21 = [C10], SIZE	(p3) FMA	f106  = f42, f61, f106	// A3 * B6	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f22 = [C2 ], - 3 * SIZE	(p3) FMA	f114  = f42, f62, f114	// A3 * B7	nop	__LINE__	}	{ .mfb	(p5) LDFD	f23 = [C10], - 3 * SIZE	(p3) FMA	f122  = f42, f63, f122	// A3 * B8	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f24 = [C3 ], SIZE	(p3) FMA	f67   = f43, f56, f67	// A4 * B1	nop	__LINE__	}	{ .mfb	(p5) LDFD	f25 = [C11], SIZE	(p3) FMA	f75   = f43, f57, f75	// A4 * B2	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f26 = [C3 ], SIZE	(p3) FMA	f83   = f43, f58, f83	// A4 * B3	nop	__LINE__	}	{ .mfb	(p5) LDFD	f27 = [C11], SIZE	(p3) FMA	f91   = f43, f59, f91	// A4 * B4	nop	__LINE__	}	;;	{ .mfb	(p5) LDFD	f28 = [C3 ], SIZE	(p3) FMA	f99   = f43, f60, f99	// A4 * B5	nop	__LINE__	}	{ .mfb	(p5) LDFD	f29 = [C11], SIZE	(p3) FMA	f107  = f43, f61, f107	// A4 * B6	nop	__LINE__	}	;;	{ .mfi	(p5) LDFD	f30 = [C3 ], - 3 * SIZE	(p3) FMA	f115  = f43, f62, f115	// A4 * B7	adds	L = -1, L	}	{ .mfb	(p5) LDFD	f31 = [C11], - 3 * SIZE	(p3) FMA	f123  = f43, f63, f123	// A4 * B8	br.cloop.sptk.few .L022	}	;;.L028:	{ .mmf	LDFD	f68 = [C4 ], SIZE	LDFD	f69 = [C12], SIZE	FMA	f6  = ALPHA_R, f64, f6	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f7  = ALPHA_R, f66, f7	}	;;	{ .mmf	LDFD	f70 = [C4 ], SIZE	LDFD	f71 = [C12], SIZE	FMA	f10 = ALPHA_I, f64, f10	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f11 = ALPHA_I, f66, f11	}	;;	{ .mmf	LDFD	f76 = [C4 ], SIZE	LDFD	f77 = [C12], SIZE	FMA	f12 = ALPHA_R, f65, f12	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f13 = ALPHA_R, f67, f13	}	;;	{ .mmf	LDFD	f78 = [C4 ], -3 * SIZE	LDFD	f79 = [C12], -3 * SIZE	FMA	f14 = ALPHA_I, f65, f14	}	{ .mmf	nop	__LINE__	nop	__LINE__	FMA	f15 = ALPHA_I, f67, f15	}	;;	{ .mmf	STFD	[C1 ] = f6, SIZE	STFD	[C9 ] = f7, SIZE	FMA	f16 = ALPHA_R, f72, f16	}	{ .mmf	LDFD	f84 = [C5 ], SIZE	LDFD	f85 = [C13], SIZE	FMA	f17 = ALPHA_R, f74, f17	}	;;	{ .mmf	STFD	[C1 ] = f10, SIZE	STFD	[C9 ] = f11, SIZE	FMA	f18 = ALPHA_I, f72, f18	}	{ .mmf	LDFD	f86 = [C5 ], SIZE	LDFD	f87 = [C13], SIZE	FMA	f19 = ALPHA_I, f74, f19	}	;;	{ .mmf	STFD	[C1 ] = f12, SIZE	STFD	[C9 ] = f13, SIZE	FMA	f20 = ALPHA_R, f73, f20	}	{ .mmf	LDFD	f92 = [C5 ], SIZE	LDFD	f93 = [C13], SIZE	FMA	f21 = ALPHA_R, f75, f21	}	;;	{ .mmf	STFD	[C1 ] = f14, 5 * SIZE	STFD	[C9 ] = f15, 5 * SIZE	FMA	f22 = ALPHA_I, f73, f22	}	{ .mmf	LDFD	f94 = [C5 ], -3 * SIZE	LDFD	f95 = [C13], -3 * SIZE	FMA	f23 = ALPHA_I, f75, f23	}	;;	{ .mmf	STFD	[C2 ] = f16, SIZE	STFD	[C10] = f17, SIZE	FMA	f24 = ALPHA_R, f80, f24	}	{ .mmf	LDFD	f100 = [C6 ], SIZE	LDFD	f101 = [C14], SIZE	FMA	f25 = ALPHA_R, f82, f25	}	;;	{ .mmf	STFD	[C2 ] = f18, SIZE	STFD	[C10] = f19, SIZE	FMA	f26 = ALPHA_I, f80, f26	}	{ .mmf	LDFD	f102 = [C6 ], SIZE	LDFD	f103 = [C14], SIZE	FMA	f27 = ALPHA_I, f82, f27	}	;;	{ .mmf	STFD	[C2 ] = f20, SIZE	STFD	[C10] = f21, SIZE	FMA	f28 = ALPHA_R, f81, f28	}	{ .mmf	LDFD	f108 = [C6 ], SIZE	LDFD	f109 = [C14], SIZE	FMA	f29 = ALPHA_R, f83, f29	}	;;	{ .mmf	STFD	[C2 ] = f22, 5 * SIZE	STFD	[C10] = f23, 5 * SIZE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -