⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 qgemv_n.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
	;;	{ .mmf	(p16) LDFD	f88  = [AO1], LDA7M8	(p16) LDFD	f89  = [AO2], LDA7M8	(p17) FMA	f107 = ALPHA8,  f91, f107	}	{ .mmf	(p16) LDFD	f90  = [AO3], LDA7M8	(p16) LDFD	f91  = [AO4], LDA7M8	(p17) FMA	f110 = ALPHA8,  f92, f110	}	;;	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f113 = ALPHA8,  f93, f113	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f116 = ALPHA8,  f94, f116	}	;;	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f119 = ALPHA8,  f95, f119	}	{ .mfb	nop   __LINE__	(p17) FMA	f122 = ALPHA8,  f96, f122	br.ctop.sptk.few .L12	}	;;	{ .mmi	(p18) STFD	[YST1] = f102, 2 * SIZE	(p18) STFD	[YST2] = f105, 2 * SIZE	nop   __LINE__	}	;;	{ .mmi	(p18) STFD	[YST1] = f108, 2 * SIZE	(p18) STFD	[YST2] = f111, 2 * SIZE	nop   __LINE__	}	;;	{ .mmi	(p18) STFD	[YST1] = f114, 2 * SIZE	(p18) STFD	[YST2] = f117, 2 * SIZE	nop   __LINE__	}	;;	{ .mmi	(p18) STFD	[YST1] = f120, 2 * SIZE	(p18) STFD	[YST2] = f123, 2 * SIZE	nop   __LINE__	}	;;	.align 16.L15:	{ .mmi	(p7) cmp.eq.unc p9,  p0 = r0, II	(p8) cmp.eq.unc p10, p0 = r0, II	(p11) adds	AO5  = - 4 * SIZE, AO5	}	{ .mbb	(p11) adds	AO7  = - 4 * SIZE, AO7	(p9) br.cond.dptk   .L11	(p10) br.cond.dptk  .L20	}	;;	{ .mmi	(p13) LDFD	f32  = [AO1], LDA	(p13) LDFD	f33  = [AO2], LDA	tbit.nz	p15, p0  = M, 0	}	{ .mmi	(p13) LDFD	f34  = [AO3], LDA	(p11) adds	AO6  = - 4 * SIZE, AO6	(p12) adds	AO7  = - 2 * SIZE, AO7	}	;;	(p13) LDFD	f35  = [AO4], LDA	(p14) LDFD	f36  = [AO5], LDA	(p14) LDFD	f37  = [AO6], LDA	(p15) LDFD	f38  = [AO7], LDA	;;	(p13) LDFD	f40  = [AO1], LDA	(p13) LDFD	f41  = [AO2], LDA	(p13) LDFD	f42  = [AO3], LDA	(p13) LDFD	f43  = [AO4], LDA	;;	(p14) LDFD	f44  = [AO5], LDA	(p14) LDFD	f45  = [AO6], LDA	(p15) LDFD	f46  = [AO7], LDA	;;	(p13) LDFD	f48  = [AO1], LDA	(p13) LDFD	f49  = [AO2], LDA	(p13) LDFD	f50  = [AO3], LDA	(p13) LDFD	f51  = [AO4], LDA	;;	(p14) LDFD	f52  = [AO5], LDA	(p14) LDFD	f53  = [AO6], LDA	(p15) LDFD	f54  = [AO7], LDA	;;	(p13) LDFD	f56  = [AO1], LDA	(p13) LDFD	f57  = [AO2], LDA	(p13) LDFD	f58  = [AO3], LDA	(p13) LDFD	f59  = [AO4], LDA	;;	(p14) LDFD	f60  = [AO5], LDA	(p14) LDFD	f61  = [AO6], LDA	(p15) LDFD	f62  = [AO7], LDA	;;	(p13) LDFD	f64  = [AO1], LDA	(p13) LDFD	f65  = [AO2], LDA	(p13) LDFD	f66  = [AO3], LDA	(p13) LDFD	f67  = [AO4], LDA	;;	(p14) LDFD	f68  = [AO5], LDA	(p14) LDFD	f69  = [AO6], LDA	(p15) LDFD	f70  = [AO7], LDA	;;	(p13) LDFD	f72  = [AO1], LDA	(p13) LDFD	f73  = [AO2], LDA	(p13) LDFD	f74  = [AO3], LDA	(p13) LDFD	f75  = [AO4], LDA	;;	(p14) LDFD	f76  = [AO5], LDA	(p14) LDFD	f77  = [AO6], LDA	(p15) LDFD	f78  = [AO7], LDA	;;	(p13) LDFD	f80  = [AO1], LDA	(p13) LDFD	f81  = [AO2], LDA	(p13) LDFD	f82  = [AO3], LDA	(p13) LDFD	f83  = [AO4], LDA	;;	(p14) LDFD	f84  = [AO5], LDA	(p14) LDFD	f85  = [AO6], LDA	(p15) LDFD	f86  = [AO7], LDA	;;	(p13) LDFD	f88  = [AO1]	(p13) LDFD	f89  = [AO2]	(p13) LDFD	f90  = [AO3]	(p13) LDFD	f91  = [AO4]	;;	(p14) LDFD	f92  = [AO5]	(p14) LDFD	f93  = [AO6]	(p15) LDFD	f94  = [AO7]	;;	(p13) LDFD	f96  = [YLD1], 2 * SIZE	(p13) LDFD	f97  = [YLD2], 2 * SIZE	;;	(p13) LDFD	f98  = [YLD1], 2 * SIZE	(p13) LDFD	f99  = [YLD2], 2 * SIZE	;;	(p14) LDFD	f100 = [YLD1], 1 * SIZE	;;	(p14) LDFD	f101 = [YLD1], 1 * SIZE	;;	(p15) LDFD	f102 = [YLD1], 1 * SIZE	;;	(p13) FMA	f96  = ALPHA1,  f32, f96	(p13) FMA	f97  = ALPHA1,  f33, f97	(p13) FMA	f98  = ALPHA1,  f34, f98	(p13) FMA	f99  = ALPHA1,  f35, f99	(p14) FMA	f100 = ALPHA1,  f36, f100	(p14) FMA	f101 = ALPHA1,  f37, f101	(p15) FMA	f102 = ALPHA1,  f38, f102	;;	(p13) FMA	f96  = ALPHA2,  f40, f96	(p13) FMA	f97  = ALPHA2,  f41, f97	(p13) FMA	f98  = ALPHA2,  f42, f98	(p13) FMA	f99  = ALPHA2,  f43, f99	(p14) FMA	f100 = ALPHA2,  f44, f100	(p14) FMA	f101 = ALPHA2,  f45, f101	(p15) FMA	f102 = ALPHA2,  f46, f102	;;	(p13) FMA	f96  = ALPHA3,  f48, f96	(p13) FMA	f97  = ALPHA3,  f49, f97	(p13) FMA	f98  = ALPHA3,  f50, f98	(p13) FMA	f99  = ALPHA3,  f51, f99	(p14) FMA	f100 = ALPHA3,  f52, f100	(p14) FMA	f101 = ALPHA3,  f53, f101	(p15) FMA	f102 = ALPHA3,  f54, f102	;;	(p13) FMA	f96  = ALPHA4,  f56, f96	(p13) FMA	f97  = ALPHA4,  f57, f97	(p13) FMA	f98  = ALPHA4,  f58, f98	(p13) FMA	f99  = ALPHA4,  f59, f99	(p14) FMA	f100 = ALPHA4,  f60, f100	(p14) FMA	f101 = ALPHA4,  f61, f101	(p15) FMA	f102 = ALPHA4,  f62, f102	;;	(p13) FMA	f96  = ALPHA5,  f64, f96	(p13) FMA	f97  = ALPHA5,  f65, f97	(p13) FMA	f98  = ALPHA5,  f66, f98	(p13) FMA	f99  = ALPHA5,  f67, f99	(p14) FMA	f100 = ALPHA5,  f68, f100	(p14) FMA	f101 = ALPHA5,  f69, f101	(p15) FMA	f102 = ALPHA5,  f70, f102	;;	(p13) FMA	f96  = ALPHA6,  f72, f96	(p13) FMA	f97  = ALPHA6,  f73, f97	(p13) FMA	f98  = ALPHA6,  f74, f98	(p13) FMA	f99  = ALPHA6,  f75, f99	(p14) FMA	f100 = ALPHA6,  f76, f100	(p14) FMA	f101 = ALPHA6,  f77, f101	(p15) FMA	f102 = ALPHA6,  f78, f102	;;	(p13) FMA	f96  = ALPHA7,  f80, f96	(p13) FMA	f97  = ALPHA7,  f81, f97	(p13) FMA	f98  = ALPHA7,  f82, f98	(p13) FMA	f99  = ALPHA7,  f83, f99	(p14) FMA	f100 = ALPHA7,  f84, f100	(p14) FMA	f101 = ALPHA7,  f85, f101	(p15) FMA	f102 = ALPHA7,  f86, f102	;; 	(p13) FMA	f16  = ALPHA8,  f88, f96	(p13) FMA	f17  = ALPHA8,  f89, f97	(p13) FMA	f18  = ALPHA8,  f90, f98	(p13) FMA	f19  = ALPHA8,  f91, f99	(p14) FMA	f20  = ALPHA8,  f92, f100	(p14) FMA	f21  = ALPHA8,  f93, f101	(p15) FMA	f22  = ALPHA8,  f94, f102	;;	{ .mmi	(p13) STFD	[YST1] = f16, 2 * SIZE	(p13) STFD	[YST2] = f17, 2 * SIZE	nop   __LINE__	}	;;	{ .mmi	(p13) STFD	[YST1] = f18, 2 * SIZE	(p13) STFD	[YST2] = f19	nop   __LINE__	}	;;	{ .mmi	(p14) STFD	[YST1] = f20, 1 * SIZE	;;	(p14) STFD	[YST1] = f21, 1 * SIZE	nop   __LINE__	}	;;	{ .mib	(p15) STFD	[YST1] = f22	cmp.lt p11, p12 = r0, J	(p11) br.cond.dptk .L11	}	;;	.align 16.L20:	tbit.z	p6, p0 = N, 2	;;	(p6) br.cond.dpnt .L30	;;	shladd	LDA7M8 = LDA, 2, r0	;;	sub	LDA7M8 = LDA, LDA7M8		;;	adds	LDA7M8 = 8 * SIZE, LDA7M8	;;		mov	YLD1 = YY	mov	YST1 = YY	adds	YLD2 = 2 * SIZE, YY	adds	YST2 = 2 * SIZE, YY	;;	LDFD	ALPHA1 = [X], INCX	;;	LDFD	ALPHA2 = [X], INCX	;;	LDFD	ALPHA3 = [X], INCX	;;	LDFD	ALPHA4 = [X], INCX	;;	FMPY	ALPHA1 = ALPHA, ALPHA1	FMPY	ALPHA2 = ALPHA, ALPHA2	FMPY	ALPHA3 = ALPHA, ALPHA3	FMPY	ALPHA4 = ALPHA, ALPHA4	;;	mov	AO1 = A	adds	AO2 = 1 * SIZE, A	adds	AO3 = 2 * SIZE, A	adds	AO4 = 3 * SIZE, A	adds	AO5 = 4 * SIZE, A	adds	AO6 = 5 * SIZE, A	adds	AO7 = 6 * SIZE, A	adds	AO8 = 7 * SIZE, A	shladd	A   = LDA, 2, A	;;	shr	I = M, 3	mov	pr.rot= 0	;;	cmp.eq	p16, p0 = r0, r0	;;	adds	I = -1, I	adds J = -1, J	;;	cmp.lt p7, p8 = r0, J	tbit.nz	p13, p11 = M, 2	tbit.nz	p14, p12 = M, 1	mov	ar.ec= 1	;;	{ .mfi	and	II = 7, M	mov	ar.lc = I	}	{ .mfb	cmp.eq	p6, p0 = -1, I	(p6) br.cond.dpnt .L25	}	;;	.align 16.L22:	(p16) LDFD	f32  = [AO1], LDA	(p16) LDFD	f34  = [AO3], LDA	(p16) LDFD	f36  = [AO5], LDA	(p16) LDFD	f38  = [AO7], LDA	;;	(p16) LDFD	f33  = [AO2], LDA	(p16) LDFD	f35  = [AO4], LDA	(p16) LDFD	f37  = [AO6], LDA	(p16) LDFD	f39  = [AO8], LDA	;;	(p16) LDFD	f40  = [AO1], LDA	(p16) LDFD	f42  = [AO3], LDA	(p16) LDFD	f44  = [AO5], LDA	(p16) LDFD	f46  = [AO7], LDA	;;	(p16) LDFD	f41  = [AO2], LDA	(p16) LDFD	f43  = [AO4], LDA	(p16) LDFD	f45  = [AO6], LDA	(p16) LDFD	f47  = [AO8], LDA	;;	(p16) LDFD	f48  = [AO1], LDA	(p16) LDFD	f50  = [AO3], LDA	(p16) LDFD	f52  = [AO5], LDA	(p16) LDFD	f54  = [AO7], LDA	;;	(p16) LDFD	f49  = [AO2], LDA	(p16) LDFD	f51  = [AO4], LDA	(p16) LDFD	f53  = [AO6], LDA	(p16) LDFD	f55  = [AO8], LDA	;;	(p16) LDFD	f56  = [AO1], LDA7M8	(p16) LDFD	f58  = [AO3], LDA7M8	(p16) LDFD	f60  = [AO5], LDA7M8	(p16) LDFD	f62  = [AO7], LDA7M8	;;	(p16) LDFD	f57  = [AO2], LDA7M8	(p16) LDFD	f59  = [AO4], LDA7M8	(p16) LDFD	f61  = [AO6], LDA7M8	(p16) LDFD	f63  = [AO8], LDA7M8	;;	(p16) LDFD	f96  = [YLD1], 1 * SIZE	(p16) LDFD	f98  = [YLD2], 1 * SIZE	;;	(p16) LDFD	f97  = [YLD1], 3 * SIZE	(p16) LDFD	f99  = [YLD2], 3 * SIZE	;;	(p16) LDFD	f100 = [YLD1], 1 * SIZE	(p16) LDFD	f102 = [YLD2], 1 * SIZE	;;	(p16) LDFD	f101 = [YLD1], 3 * SIZE	(p16) LDFD	f103 = [YLD2], 3 * SIZE	;;	(p16) FMA	f96  = ALPHA1,  f32, f96	(p16) FMA	f98  = ALPHA1,  f34, f98	(p16) FMA	f97  = ALPHA1,  f33, f97	(p16) FMA	f99  = ALPHA1,  f35, f99	(p16) FMA	f100 = ALPHA1,  f36, f100	(p16) FMA	f102 = ALPHA1,  f38, f102	(p16) FMA	f101 = ALPHA1,  f37, f101	(p16) FMA	f103 = ALPHA1,  f39, f103	;;	(p16) FMA	f96  = ALPHA2,  f40, f96	(p16) FMA	f98  = ALPHA2,  f42, f98	(p16) FMA	f97  = ALPHA2,  f41, f97	(p16) FMA	f99  = ALPHA2,  f43, f99	(p16) FMA	f100 = ALPHA2,  f44, f100	(p16) FMA	f102 = ALPHA2,  f46, f102	(p16) FMA	f101 = ALPHA2,  f45, f101	(p16) FMA	f103 = ALPHA2,  f47, f103	;;	(p16) FMA	f96  = ALPHA3,  f48, f96	(p16) FMA	f98  = ALPHA3,  f50, f98	(p16) FMA	f97  = ALPHA3,  f49, f97	(p16) FMA	f99  = ALPHA3,  f51, f99	(p16) FMA	f100 = ALPHA3,  f52, f100	(p16) FMA	f102 = ALPHA3,  f54, f102	(p16) FMA	f101 = ALPHA3,  f53, f101	(p16) FMA	f103 = ALPHA3,  f55, f103	;;	(p16) FMA	f16  = ALPHA4,  f56, f96	(p16) FMA	f18  = ALPHA4,  f58, f98	(p16) FMA	f17  = ALPHA4,  f57, f97	(p16) FMA	f19  = ALPHA4,  f59, f99	(p16) FMA	f20  = ALPHA4,  f60, f100	(p16) FMA	f22  = ALPHA4,  f62, f102	(p16) FMA	f21  = ALPHA4,  f61, f101	(p16) FMA	f23  = ALPHA4,  f63, f103	;;	(p16) STFD	[YST1] = f16, 1 * SIZE	(p16) STFD	[YST2] = f18, 1 * SIZE	;;	(p16) STFD	[YST1] = f17, 3 * SIZE	(p16) STFD	[YST2] = f19, 3 * SIZE	;;	(p16) STFD	[YST1] = f20, 1 * SIZE	(p16) STFD	[YST2] = f22, 1 * SIZE	;;	(p16) STFD	[YST1] = f21, 3 * SIZE	(p16) STFD	[YST2] = f23, 3 * SIZE	br.ctop.sptk.few .L22	;;	.align 16.L25:	{ .mmi	(p8) cmp.eq.unc p10, p0 = r0, II	(p11) adds	AO5  = - 4 * SIZE, AO5	}	{ .mbb	(p11) adds	AO7  = - 4 * SIZE, AO7	(p10) br.cond.dptk  .L30	}	;;	{ .mmi	(p13) LDFD	f32  = [AO1], LDA	(p13) LDFD	f34  = [AO3], LDA	tbit.nz	p15, p0  = M, 0	}	{ .mmi	(p14) LDFD	f36  = [AO5], LDA	(p11) adds	AO6  = - 4 * SIZE, AO6	(p12) adds	AO7  = - 2 * SIZE, AO7	}	;;	(p13) LDFD	f33  = [AO2], LDA	(p13) LDFD	f35  = [AO4], LDA	(p14) LDFD	f37  = [AO6], LDA	(p15) LDFD	f38  = [AO7], LDA	;;	(p13) LDFD	f40  = [AO1], LDA	(p13) LDFD	f42  = [AO3], LDA	(p14) LDFD	f44  = [AO5], LDA	(p15) LDFD	f46  = [AO7], LDA	;;	(p13) LDFD	f41  = [AO2], LDA	(p13) LDFD	f43  = [AO4], LDA	(p14) LDFD	f45  = [AO6], LDA	;;	(p13) LDFD	f48  = [AO1], LDA	(p13) LDFD	f50  = [AO3], LDA	(p14) LDFD	f52  = [AO5], LDA	(p15) LDFD	f54  = [AO7], LDA	;;	(p13) LDFD	f49  = [AO2], LDA	(p13) LDFD	f51  = [AO4], LDA	(p14) LDFD	f53  = [AO6], LDA	;;	(p13) LDFD	f56  = [AO1]	(p13) LDFD	f58  = [AO3]	(p14) LDFD	f60  = [AO5]	(p15) LDFD	f62  = [AO7]	;;	(p13) LDFD	f57  = [AO2]	(p13) LDFD	f59  = [AO4]	(p14) LDFD	f61  = [AO6]	;;	(p13) LDFD	f96  = [YLD1], 1 * SIZE	(p13) LDFD	f98  = [YLD2], 1 * SIZE	;;	(p13) LDFD	f97  = [YLD1], 3 * SIZE	(p13) LDFD	f99  = [YLD2], 3 * SIZE	;;	(p14) LDFD	f100 = [YLD1], 1 * SIZE	;;	(p14) LDFD	f101 = [YLD1], 1 * SIZE	;;	(p15) LDFD	f102 = [YLD1], 1 * SIZE	;;	(p13) FMA	f96  = ALPHA1,  f32, f96	(p13) FMA	f98  = ALPHA1,  f34, f98	(p13) FMA	f97  = ALPHA1,  f33, f97	(p13) FMA	f99  = ALPHA1,  f35, f99	(p14) FMA	f100 = ALPHA1,  f36, f100	(p15) FMA	f102 = ALPHA1,  f38, f102	(p14) FMA	f101 = ALPHA1,  f37, f101	;;	(p13) FMA	f96  = ALPHA2,  f40, f96	(p13) FMA	f98  = ALPHA2,  f42, f98	(p13) FMA	f97  = ALPHA2,  f41, f97	(p13) FMA	f99  = ALPHA2,  f43, f99	(p14) FMA	f100 = ALPHA2,  f44, f100	(p15) FMA	f102 = ALPHA2,  f46, f102	(p14) FMA	f101 = ALPHA2,  f45, f101	;;	(p13) FMA	f96  = ALPHA3,  f48, f96	(p13) FMA	f98  = ALPHA3,  f50, f98	(p13) FMA	f97  = ALPHA3,  f49, f97	(p13) FMA	f99  = ALPHA3,  f51, f99	(p14) FMA	f100 = ALPHA3,  f52, f100	(p15) FMA	f102 = ALPHA3,  f54, f102	(p14) FMA	f101 = ALPHA3,  f53, f101	;;	(p13) FMA	f16  = ALPHA4,  f56, f96	(p13) FMA	f18  = ALPHA4,  f58, f98	(p13) FMA	f17  = ALPHA4,  f57, f97	(p13) FMA	f19  = ALPHA4,  f59, f99	(p14) FMA	f20  = ALPHA4,  f60, f100	(p15) FMA	f22  = ALPHA4,  f62, f102	(p14) FMA	f21  = ALPHA4,  f61, f101	;;	{ .mmi	(p13) STFD	[YST1] = f16, 1 * SIZE	(p13) STFD	[YST2] = f18, 1 * SIZE	nop   __LINE__	}	;;	{ .mmi	(p13) STFD	[YST1] = f17, 3 * SIZE	(p13) STFD	[YST2] = f19	nop   __LINE__	}	;;	{ .mmi	(p14) STFD	[YST1] = f20, 1 * SIZE	;;	(p14) STFD	[YST1] = f21, 1 * SIZE	nop   __LINE__	}	;;	{ .mib	(p15) STFD	[YST1] = f22	}	;;	.align 16.L30:	tbit.z	p6, p0 = N, 1	;;	(p6) br.cond.dpnt .L40	;;	shladd	LDA7M8 = LDA, 1, r0	;;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -