⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_tcopy.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	(p12) LD	f77  = [A2], TEMP1	(p18) add	B2 = B2, TEMP2	}	;;	{ .mmi	(p13) ST	[B1] = f58,  1 * SIZE	(p13) ST	[B2] = f70,  1 * SIZE	adds TEMP3 = 5 * SIZE, TEMP3	}	{ .mmi	(p16) LD	f80  = [A1], SIZE	(p16) LD	f92  = [A2], SIZE	adds	TEMP1 = -3 * SIZE, LDA	}	;;	{ .mmi	(p13) ST	[B1] = f61,  1 * SIZE	(p13) ST	[B2] = f73,  1 * SIZE	nop   __LINE__	}	{ .mmi	(p16) LD	f83  = [A1], SIZE	(p16) LD	f95  = [A2], SIZE	(p14) mov  TEMP1 = TEMP3	}	;;	{ .mmi	(p13) ST	[B1] = f64,  1 * SIZE	(p13) ST	[B2] = f76,  1 * SIZE	nop   __LINE__	}	{ .mmi	(p16) LD	f86  = [A1], SIZE	(p16) LD	f98  = [A2], SIZE	(p12) mov TEMP1	= 5 * SIZE	}	;;	{ .mmi	(p13) ST	[B1] = f67,  5 * SIZE	(p13) ST	[B2] = f79,  5 * SIZE	(p14) mov	LCOUNT = 0	}	{ .mmi	(p16) LD	f89  = [A1], TEMP1	(p16) LD	f101 = [A2], TEMP1	(p15) mov	SCOUNT = 0	}	;;	{ .mmi	(p13) ST	[B1] = f106,  1 * SIZE	(p13) ST	[B2] = f118,  1 * SIZE	mov	TEMP2 = 5 * SIZE	}	{ .mmi	(p12) LD	f104 = [A1], SIZE	(p12) LD	f116 = [A2], SIZE	nop   __LINE__	}	;;	{ .mmi	(p13) ST	[B1] = f109,  1 * SIZE	(p13) ST	[B2] = f121,  1 * SIZE	sub	TEMP2 = TEMP2, LDB	}	{ .mmi	(p12) LD	f107 = [A1], SIZE	(p12) LD	f119 = [A2], SIZE	adds	TEMP1 = -11 * SIZE, LDA	}	;;	{ .mmi	(p13) ST	[B1] = f112,  1 * SIZE	(p13) ST	[B2] = f124,  1 * SIZE	(p15) adds TEMP2 = -27 * SIZE, LDB	}	{ .mmi	(p12) LD	f110 = [A1], SIZE	(p12) LD	f122 = [A2], SIZE	(p14) mov TEMP1 = TEMP3	}	;;	{ .mmi	(p13) ST	[B1] = f115	(p13) ST	[B2] = f127	(p13) add	B1 = B1, TEMP2	}	{ .mmi	(p12) LD	f113 = [A1], TEMP1	(p12) LD	f125 = [A2], TEMP1	(p13) add	B2 = B2, TEMP2	}	;;	{ .mmb	(p14) adds	I = -2, I	(p15) adds	II = -2, II	br.ctop.sptk .L112	}	;;	.align 32.L120:	{ .mmi	add	A2 = A1, LDA	nop	__LINE__	tbit.nz p7, p0 = N, 2	}	;;	{ .mmi	(p7) LD	f32  = [A1], SIZE	(p7) LD	f36  = [A2], SIZE	tbit.nz p8, p0 = N, 1	}	;;	{ .mmi	(p7) LD	f33  = [A1], SIZE	(p7) LD	f37  = [A2], SIZE	adds	TEMP1 = -3 * SIZE, LDA	}	;;	{ .mmi	(p7) LD	f34  = [A1], SIZE	(p7) LD	f38  = [A2], SIZE	add	TEMP1 = TEMP1, LDA	}	;;	{ .mmi	(p7) LD	f35  = [A1], TEMP1	(p7) LD	f39  = [A2], TEMP1	tbit.nz p9, p0 = N, 0	}	;;	{ .mmi	(p7) LD	f40  = [A1], SIZE	(p7) LD	f44  = [A2], SIZE	mov	TEMP2 = -1 * SIZE	}	;;	{ .mmi	(p7) LD	f41  = [A1], SIZE	(p7) LD	f45  = [A2], SIZE	shladd	TEMP2 = LDA, 1, TEMP2	}	;;	{ .mmi	(p7) LD	f42  = [A1], SIZE	(p7) LD	f46  = [A2], SIZE	sub	TEMP2 = 0, TEMP2	}	;;	{ .mmi	(p7) LD	f43  = [A1], TEMP2	(p7) LD	f47  = [A2]	nop	__LINE__	}	;;	{ .mmi	add	A2 = A1, LDA	adds	TEMP1 = -1 * SIZE, LDA	mov	TEMP2 = -1 * SIZE	}	;;	{ .mmi	(p8) LD	f48  = [A1], SIZE	(p8) LD	f50  = [A2], SIZE	add	TEMP1 = TEMP1, LDA	}	;;	{ .mmi	(p8) LD	f49  = [A1], TEMP1	(p8) LD	f51  = [A2], TEMP1	shladd	TEMP2 = LDA, 1, TEMP2	}	;;	{ .mmi	(p8) LD	f52  = [A1], SIZE	(p8) LD	f54  = [A2], SIZE	sub	TEMP2 = r0, TEMP2	}	;;	{ .mmi	(p8) LD	f53  = [A1], TEMP2	(p8) LD	f55  = [A2], TEMP2	nop	__LINE__	}	;;	{ .mmi	add	A2 = A1, LDA	adds	B2 = 4 * SIZE, BO2	nop	__LINE__	}	;;	{ .mmi	(p9) LD	f56  = [A1]	nop	__LINE__	(p9) shladd	A1 = LDA, 1, A1	}	{ .mmi	(p9) LD	f57  = [A2]	nop	__LINE__	(p9) shladd	A2 = LDA, 1, A2	}	;;	{ .mmi	(p7) ST	[BO2] = f32,  1 * SIZE	(p7) ST	[B2 ] = f36,  1 * SIZE	nop  __LINE__	}	{ .mmi	(p9) LD	f58  = [A1]	(p9) LD	f59  = [A2]	nop  __LINE__	}	;;	;;	{ .mmi	(p7) ST	[BO2] = f33,  1 * SIZE	(p7) ST	[B2 ] = f37,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f34,  1 * SIZE	(p7) ST	[B2 ] = f38,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f35,  5 * SIZE	(p7) ST	[B2 ] = f39,  5 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f40,  1 * SIZE	(p7) ST	[B2 ] = f44,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f41,  1 * SIZE	(p7) ST	[B2 ] = f45,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f42,  1 * SIZE	(p7) ST	[B2 ] = f46,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f43,  5 * SIZE	(p7) ST	[B2 ] = f47	adds	B2 = 4 * SIZE, BO3	}	;;	{ .mmi	(p8) ST	[BO3] = f48,  1 * SIZE	(p8) ST	[B2 ] = f52,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p8) ST	[BO3] = f49,  1 * SIZE	(p8) ST	[B2 ] = f53,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p8) ST	[BO3] = f50,  1 * SIZE	(p8) ST	[B2 ] = f54,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p8) ST	[BO3] = f51,  5 * SIZE	(p8) ST	[B2 ] = f55	adds	B2 = 2 * SIZE, BO4	}	;;	{ .mmi	(p9) ST	[BO4] = f56,  1 * SIZE	(p9) ST	[B2 ] = f58,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p9) ST	[BO4] = f57,  3 * SIZE	(p9) ST	[B2 ] = f59	nop  __LINE__	}	;;	.align 32.L200:	{ .mmi	add	I = 8, N	mov	A1 = A	mov	pr.rot = 0	}	{ .mmi	adds	A2 = 4 * SIZE, A	nop	__LINE__	tbit.z p6, p0 = M, 1	}	;;	{ .mmi	mov	B1 =  B	cmp.eq	p16, p0 = r0, r0	mov	ar.ec  = 3	}	{ .mib	adds	B2 =  4 * SIZE, B	shr	I = I, 4	(p6)	br.cond.dpnt .L300	}	;;	{ .mmi	shladd	A = LDA, 1, A	adds	B  = 16 * SIZE, B	shr	II = N, 3	}	{ .mmi	cmp.eq	p8, p0 = 0, I	adds	I = -1, I	nop	__LINE__	}	;;	{ .mmi	nop	__LINE__	nop	__LINE__	mov	ar.lc = I	}	{ .mib	mov	I = II	nop	__LINE__	(p8)	br.cond.dpnt .L220	}	;;	.align 32.L212:	{ .mmi	(p18) ST	[B1] = f34,  1 * SIZE	(p18) ST	[B2] = f46,  1 * SIZE	(p16) cmp.ne.unc p12, p0 = 1, I	}	{ .mmi	(p16) LD	f32  = [A1], SIZE	(p16) LD	f44  = [A2], SIZE	(p18) cmp.ne.unc p13, p0 = 1, II	}	;;	{ .mmi	(p18) ST	[B1] = f37,  1 * SIZE	(p18) ST	[B2] = f49,  1 * SIZE	adds	TEMP1 = -3 * SIZE, LDA	}	{ .mmi	(p16) LD	f35  = [A1], SIZE	(p16) LD	f47  = [A2], SIZE	nop   __LINE__	}	;;	{ .mmi	(p18) ST	[B1] = f40,  1 * SIZE	(p18) ST	[B2] = f52,  1 * SIZE	(p12) mov TEMP1 = 5 * SIZE	}	{ .mmi	(p16) LD	f38  = [A1], SIZE	(p16) LD	f50  = [A2], SIZE	nop   __LINE__	}	;;	{ .mmi	(p18) ST	[B1] = f43,  5 * SIZE	(p18) ST	[B2] = f55,  5 * SIZE	nop   __LINE__	}	{ .mmi	(p16) LD	f41  = [A1], TEMP1	(p16) LD	f53  = [A2], TEMP1	nop   __LINE__	}	;;	{ .mmi	(p18) ST	[B1] = f82,  1 * SIZE	(p18) ST	[B2] = f94,  1 * SIZE	nop   __LINE__	}	{ .mmi	(p12) LD	f56  = [A1], SIZE	(p12) LD	f68  = [A2], SIZE	nop   __LINE__	}	;;	{ .mmi	(p18) ST	[B1] = f85,  1 * SIZE	(p18) ST	[B2] = f97,  1 * SIZE	mov	TEMP2 = 5 * SIZE	}	{ .mmi	(p12) LD	f59  = [A1], SIZE	(p12) LD	f71  = [A2], SIZE	nop   __LINE__	}	;;	{ .mmi	(p18) ST	[B1] = f88,  1 * SIZE	(p18) ST	[B2] = f100, 1 * SIZE	(p13) adds TEMP2 = - 11 * SIZE, LDB	}	{ .mmi	(p12) LD	f62  = [A1], SIZE	(p12) LD	f74  = [A2], SIZE	(p12) adds  TEMP1 = - 11 * SIZE, LDA	}	;;	{ .mmi	(p18) ST	[B1] = f91	(p18) ST	[B2] = f103	(p18) add	B1 = B1, TEMP2	}	{ .mmi	(p12) LD	f65  = [A1], TEMP1	(p12) LD	f77  = [A2], TEMP1	(p18) add	B2 = B2, TEMP2	}	;;	{ .mmi	(p13) ST	[B1] = f58,  1 * SIZE	(p13) ST	[B2] = f70,  1 * SIZE	nop   __LINE__	}	{ .mmi	(p16) LD	f80  = [A1], SIZE	(p16) LD	f92  = [A2], SIZE	sub TEMP1 = r0, LDA	}	;;	{ .mmi	(p13) ST	[B1] = f61,  1 * SIZE	(p13) ST	[B2] = f73,  1 * SIZE	nop   __LINE__	}	{ .mmi	(p16) LD	f83  = [A1], SIZE	(p16) LD	f95  = [A2], SIZE	(p16) adds TEMP1 = 5 * SIZE, TEMP1	}	;;	{ .mmi	(p13) ST	[B1] = f64,  1 * SIZE	(p13) ST	[B2] = f76,  1 * SIZE	nop   __LINE__	}	{ .mmi	(p16) LD	f86  = [A1], SIZE	(p16) LD	f98  = [A2], SIZE	(p12) mov TEMP1	= 5 * SIZE	}	;;	{ .mmi	(p13) ST	[B1] = f67,  5 * SIZE	(p13) ST	[B2] = f79,  5 * SIZE	nop   __LINE__	}	{ .mmi	(p16) LD	f89  = [A1], TEMP1	(p16) LD	f101 = [A2], TEMP1	adds	TEMP1 = -11 * SIZE, LDA	}	;;	{ .mmi	(p13) ST	[B1] = f106,  1 * SIZE	(p13) ST	[B2] = f118,  1 * SIZE	mov	TEMP2 = 5 * SIZE	}	{ .mmi	(p12) LD	f104 = [A1], SIZE	(p12) LD	f116 = [A2], SIZE	(p16) shladd TEMP1 = LDA, 1, r0	}	;;	{ .mmi	(p13) ST	[B1] = f109,  1 * SIZE	(p13) ST	[B2] = f121,  1 * SIZE	sub	TEMP2 = TEMP2, LDB	}	{ .mmi	(p12) LD	f107 = [A1], SIZE	(p12) LD	f119 = [A2], SIZE	(p16) sub TEMP1 = LDA, TEMP1	}	;;	{ .mmi	(p13) ST	[B1] = f112,  1 * SIZE	(p13) ST	[B2] = f124,  1 * SIZE	(p18) adds TEMP2 = -11 * SIZE, LDB	}	{ .mmi	(p12) LD	f110 = [A1], SIZE	(p12) LD	f122 = [A2], SIZE	(p16) adds TEMP1 = 5 * SIZE, TEMP1	}	;;	{ .mmi	(p13) ST	[B1] = f115	(p13) ST	[B2] = f127	(p13) add	B1 = B1, TEMP2	}	{ .mmi	(p12) LD	f113 = [A1], TEMP1	(p12) LD	f125 = [A2], TEMP1	(p13) add	B2 = B2, TEMP2	}	;;	{ .mmb	(p16) adds	I = -2, I	(p18) adds	II = -2, II	br.ctop.sptk .L212	}	;;	.align 32.L220:	{ .mmi	add	A2 = A1, LDA	nop  __LINE__	tbit.nz p7, p0 = N, 2	}	;;	{ .mmi	(p7) LD	f32  = [A1], SIZE	(p7) LD	f36  = [A2], SIZE	tbit.nz p8, p0 = N, 1	}	;;	{ .mmi	(p7) LD	f33  = [A1], SIZE	(p7) LD	f37  = [A2], SIZE	tbit.nz p9, p0 = N, 0	}	;;	{ .mmi	(p7) LD	f34  = [A1], SIZE	(p7) LD	f38  = [A2], SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) LD	f35  = [A1], SIZE	(p7) LD	f39  = [A2]	nop  __LINE__	}	;;	{ .mmi	add	A2 = A1, LDA	nop  __LINE__	nop  __LINE__	}	;;	{ .mmi	(p8) LD	f40  = [A1], SIZE	(p8) LD	f42  = [A2], SIZE	nop  __LINE__	}	;;	{ .mmi	(p8) LD	f41  = [A1], SIZE	(p8) LD	f43  = [A2]	nop  __LINE__	}	;;	{ .mmi	add	A2 = A1, LDA	nop  __LINE__	nop  __LINE__	}	;;	{ .mmi	(p9) LD	f44  = [A1]	(p9) LD	f45  = [A2]	adds	B2 = 4 * SIZE, BO2	}	;;	{ .mmi	(p7) ST	[BO2] = f32,  1 * SIZE	(p7) ST	[B2 ] = f36,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f33,  1 * SIZE	(p7) ST	[B2 ] = f37,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f34,  1 * SIZE	(p7) ST	[B2 ] = f38,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f35,  5 * SIZE	(p7) ST	[B2 ] = f39	adds	B2 = 2 * SIZE, BO3	}	;;	{ .mmi	(p8) ST	[BO3] = f40,  1 * SIZE	(p8) ST	[B2 ] = f42,  1 * SIZE	nop  __LINE__	}	;;	{ .mmi	(p8) ST	[BO3] = f41,  3 * SIZE	(p8) ST	[B2 ] = f43	adds	B2 = 1 * SIZE, BO4	}	;;	{ .mmi	(p9) ST	[BO4] = f44,  2 * SIZE	(p9) ST	[B2 ] = f45	nop  __LINE__	}	;;	.align 32.L300:	{ .mmi	add	I = 8, N	mov	A1 = A	mov	pr.rot = 0	}	{ .mmi	mov	B1 =  B	adds	A2 = 4 * SIZE, A	tbit.z p6, p0 = M, 0	}	;;	{ .mmi	adds	B2 =  4 * SIZE, B	cmp.eq	p16, p0 = r0, r0	mov	ar.ec  = 3	}	{ .mib	nop	__LINE__	shr	I = I, 4	(p6)	br.cond.dpnt .L999	}	;;	{ .mmi	cmp.eq	p8, p0 = 0, I	adds	I = -1, I	shr	II = N, 3	}	;;	{ .mmi	nop	__LINE__	nop	__LINE__	mov	ar.lc = I	}	{ .mib	nop	__LINE__	mov	I = II	(p8)	br.cond.dpnt .L320	}	;;	.align 32.L312:	{ .mmi	(p18) ST	[B1] = f34,  1 * SIZE	(p18) ST	[B2] = f46,  1 * SIZE	(p16) cmp.ne.unc p12, p0 = 1, I	}	{ .mmi	(p16) LD	f32  = [A1], SIZE	(p16) LD	f44  = [A2], SIZE	(p18) cmp.ne.unc p13, p0 = 1, II	}	;;	{ .mmi	(p18) ST	[B1] = f37,  1 * SIZE	(p18) ST	[B2] = f49,  1 * SIZE	adds	TEMP2 = - 3 * SIZE, LDB	}	{ .mmi	(p16) LD	f35  = [A1], SIZE	(p16) LD	f47  = [A2], SIZE	nop	__LINE__	}	;;	{ .mmi	(p18) ST	[B1] = f40,  1 * SIZE	(p18) ST	[B2] = f52,  1 * SIZE	nop	__LINE__	}	{ .mmi	(p16) LD	f38  = [A1], SIZE	(p16) LD	f50  = [A2], SIZE	nop	__LINE__	}	;;	{ .mmi	(p18) ST	[B1] = f43	(p18) ST	[B2] = f55	(p18) add	B1 = B1, TEMP2	}	{ .mmi	(p16) LD	f41  = [A1], 5 * SIZE	(p16) LD	f53  = [A2], 5 * SIZE	(p18) add	B2 = B2, TEMP2	}	;;	{ .mmi	(p13) ST	[B1] = f58,  1 * SIZE	(p13) ST	[B2] = f70,  1 * SIZE	(p16) adds	I = -2, I	}	{ .mmi	(p12) LD	f56  = [A1], SIZE	(p12) LD	f68  = [A2], SIZE	(p18) adds	II = -2, II	}	;;	{ .mmi	(p13) ST	[B1] = f61,  1 * SIZE	(p13) ST	[B2] = f73,  1 * SIZE	nop	__LINE__	}	{ .mmi	(p12) LD	f59  = [A1], SIZE	(p12) LD	f71  = [A2], SIZE	nop	__LINE__	}	;;	{ .mmi	(p13) ST	[B1] = f64,  1 * SIZE	(p13) ST	[B2] = f76,  1 * SIZE	nop	__LINE__	}	{ .mmi	(p12) LD	f62  = [A1], SIZE	(p12) LD	f74  = [A2], SIZE	nop	__LINE__	}	;;	{ .mmi	(p13) ST	[B1] = f67	(p13) ST	[B2] = f79	(p13) add	B1 = B1, TEMP2	}	{ .mmi	(p12) LD	f65  = [A1], 5 * SIZE	(p12) LD	f77  = [A2], 5 * SIZE	(p13) add	B2 = B2, TEMP2	}	;;	{ .mmb	nop	__LINE__	nop	__LINE__	br.ctop.sptk .L312	}	;;	.align 32.L320:	{ .mmi	adds	A2 = 2 * SIZE, A1	adds	B2 = 2 * SIZE, BO2	tbit.nz p7, p0 = N, 2	}	;;	{ .mmi	(p7) LD	f32  = [A1], SIZE	(p7) LD	f34  = [A2], SIZE	tbit.nz p8, p0 = N, 1	}	;;	{ .mmi	(p7) LD	f33  = [A1], 3 * SIZE	(p7) LD	f35  = [A2]	nop  __LINE__	}	;;	{ .mmi	adds	A2 = SIZE, A1	nop	__LINE__	nop	__LINE__	}	;;	{ .mmi	(p8) LD	f36  = [A1], 2 * SIZE	(p8) LD	f37  = [A2]	tbit.nz p9, p0 = N, 0	}	;;	{ .mmi	(p9) LD	f38  = [A1]	nop	__LINE__	nop	__LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f32,  1 * SIZE	(p7) ST	[B2 ] = f34,  1 * SIZE	nop	__LINE__	}	;;	{ .mmi	(p7) ST	[BO2] = f33,  3 * SIZE	(p7) ST	[B2 ] = f35	adds	B2 = SIZE, BO3	}	;;	{ .mmi	(p8) ST	[BO3] = f36,  2 * SIZE	(p8) ST	[B2 ] = f37	nop	__LINE__	}	;;	{ .mmi	(p9) ST	[BO4] = f38,  1 * SIZE	nop	__LINE__	nop	__LINE__	}	;;	.align 32.L999:	mov pr    = PR, -1	mov	 ar.lc = ARLC	br.ret.sptk.many b0	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -