⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 qgemv_n.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
	/*
	 * Tail of the kernel listed on this page as qgemv_n.s (IA-64, run through
	 * the C preprocessor).  The function entry, the register aliases (A, X, Y,
	 * YY, LDA, LDA7M8, AO1..AO8, YLD1/2, YST1/2, I, J, II, ALPHA, ALPHA1/2,
	 * ARLC, PR, ...) and the LDFD/STFD/FMA/FMPY/FADD/SIZE/EPILOGUE macros are
	 * all defined outside this view.
	 *
	 * Phases visible here:
	 *   (top)/.L32/.L35  two-column pass: two values are read through X and
	 *                    scaled by ALPHA, A advances by 2 * LDA, results are
	 *                    accumulated into the YY buffer.
	 *   .L40/.L42/.L45   one-column pass, entered only when bit 0 of N is set.
	 *   .L990-.L995      when INCY != SIZE, adds the contiguous YY buffer
	 *                    into Y, which is walked with stride INCY.
	 *   .L999            restores ar.lc and the predicates, then returns.
	 *
	 * NOTE(review): FMA d = a, b, c is read below as d = a * b + c, following
	 * the IA-64 fma operand order -- confirm against the macro definition.
	 * ";;" is an IA-64 stop bit (instruction-group boundary), not a comment.
	 */

	/* ---- two-column pass: setup ------------------------------------- */
	sub	LDA7M8 = LDA, LDA7M8
	;;
	adds	LDA7M8 = 8 * SIZE, LDA7M8
	;;
	mov	YLD1 = YY
	mov	YST1 = YY
	adds	YLD2 = 2 * SIZE, YY
	adds	YST2 = 2 * SIZE, YY
	;;
	LDFD	ALPHA1 = [X], INCX
	;;
	LDFD	ALPHA2 = [X], INCX
	;;
	FMPY	ALPHA1 = ALPHA, ALPHA1
	FMPY	ALPHA2 = ALPHA, ALPHA2
	;;
	/* AO1..AO8 address eight consecutive elements starting at A. */
	mov	AO1 = A
	adds	AO2 = 1 * SIZE, A
	adds	AO3 = 2 * SIZE, A
	adds	AO4 = 3 * SIZE, A
	adds	AO5 = 4 * SIZE, A
	adds	AO6 = 5 * SIZE, A
	adds	AO7 = 6 * SIZE, A
	adds	AO8 = 7 * SIZE, A
	shladd	A   = LDA, 1, A			/* A += 2 * LDA */
	;;
	shr	I = M, 3			/* number of 8-element blocks */
	mov	pr.rot= 0
	;;
	cmp.eq	p16, p0 = r0, r0		/* p16 = 1: enable loop stage */
	;;
	adds	I = -1, I			/* ar.lc holds count - 1 */
	adds J = -1, J
	;;
	cmp.lt p7, p8 = r0, J
	tbit.nz	p13, p11 = M, 2			/* p13: 4-element remainder */
	tbit.nz	p14, p12 = M, 1			/* p14: 2-element remainder */
	mov	ar.ec= 1
	;;
	{ .mfi
	and	II = 7, M			/* II = M mod 8 */
	mov	ar.lc = I
	}
	{ .mfb
	cmp.eq	p6, p0 = -1, I			/* no full block: skip the loop */
	(p6) br.cond.dpnt .L35
	}
	;;

	/* ---- two-column pass: 8 elements per iteration ------------------ */
	.align 16
.L32:
	/* First group of loads; post-increment by LDA moves to the second. */
	(p16) LDFD	f32  = [AO1], LDA
	(p16) LDFD	f34  = [AO3], LDA
	(p16) LDFD	f36  = [AO5], LDA
	(p16) LDFD	f38  = [AO7], LDA
	;;
	(p16) LDFD	f33  = [AO2], LDA
	(p16) LDFD	f35  = [AO4], LDA
	(p16) LDFD	f37  = [AO6], LDA
	(p16) LDFD	f39  = [AO8], LDA
	;;
	/* Second group; post-increment by LDA7M8 positions the next iteration. */
	(p16) LDFD	f40  = [AO1], LDA7M8
	(p16) LDFD	f42  = [AO3], LDA7M8
	(p16) LDFD	f44  = [AO5], LDA7M8
	(p16) LDFD	f46  = [AO7], LDA7M8
	;;
	(p16) LDFD	f41  = [AO2], LDA7M8
	(p16) LDFD	f43  = [AO4], LDA7M8
	(p16) LDFD	f45  = [AO6], LDA7M8
	(p16) LDFD	f47  = [AO8], LDA7M8
	;;
	/* Eight buffer values: YLD1 covers elements 0,1,4,5; YLD2 covers 2,3,6,7. */
	(p16) LDFD	f96  = [YLD1], 1 * SIZE
	(p16) LDFD	f98  = [YLD2], 1 * SIZE
	;;
	(p16) LDFD	f97  = [YLD1], 3 * SIZE
	(p16) LDFD	f99  = [YLD2], 3 * SIZE
	;;
	(p16) LDFD	f100 = [YLD1], 1 * SIZE
	(p16) LDFD	f102 = [YLD2], 1 * SIZE
	;;
	(p16) LDFD	f101 = [YLD1], 3 * SIZE
	(p16) LDFD	f103 = [YLD2], 3 * SIZE
	;;
	(p16) FMA	f96  = ALPHA1,  f32, f96
	(p16) FMA	f98  = ALPHA1,  f34, f98
	(p16) FMA	f97  = ALPHA1,  f33, f97
	(p16) FMA	f99  = ALPHA1,  f35, f99
	(p16) FMA	f100 = ALPHA1,  f36, f100
	(p16) FMA	f102 = ALPHA1,  f38, f102
	(p16) FMA	f101 = ALPHA1,  f37, f101
	(p16) FMA	f103 = ALPHA1,  f39, f103
	;;
	(p16) FMA	f16  = ALPHA2,  f40, f96
	(p16) FMA	f18  = ALPHA2,  f42, f98
	(p16) FMA	f17  = ALPHA2,  f41, f97
	(p16) FMA	f19  = ALPHA2,  f43, f99
	(p16) FMA	f20  = ALPHA2,  f44, f100
	(p16) FMA	f22  = ALPHA2,  f46, f102
	(p16) FMA	f21  = ALPHA2,  f45, f101
	(p16) FMA	f23  = ALPHA2,  f47, f103
	;;
	(p16) STFD	[YST1] = f16, 1 * SIZE
	(p16) STFD	[YST2] = f18, 1 * SIZE
	;;
	(p16) STFD	[YST1] = f17, 3 * SIZE
	(p16) STFD	[YST2] = f19, 3 * SIZE
	;;
	(p16) STFD	[YST1] = f20, 1 * SIZE
	(p16) STFD	[YST2] = f22, 1 * SIZE
	;;
	(p16) STFD	[YST1] = f21, 3 * SIZE
	(p16) STFD	[YST2] = f23, 3 * SIZE
	br.ctop.sptk.few .L32
	;;

	/* ---- two-column pass: remainder (M mod 8) ------------------------ */
	.align 16
.L35:
	{ .mmi
	(p8) cmp.eq.unc p10, p0 = r0, II	/* p10 = p8 && (II == 0) */
	(p11) adds	AO5  = - 4 * SIZE, AO5	/* no 4-block: pull AO5 back */
	}
	{ .mbb
	(p11) adds	AO7  = - 4 * SIZE, AO7
	(p10) br.cond.dptk  .L40
	}
	;;
	{ .mmi
	(p13) LDFD	f32  = [AO1], LDA
	(p13) LDFD	f34  = [AO3], LDA
	tbit.nz	p15, p0  = M, 0			/* p15: 1-element remainder */
	}
	{ .mmi
	(p14) LDFD	f36  = [AO5], LDA
	(p11) adds	AO6  = - 4 * SIZE, AO6
	(p12) adds	AO7  = - 2 * SIZE, AO7	/* no 2-block: pull AO7 back */
	}
	;;
	(p13) LDFD	f33  = [AO2], LDA
	(p13) LDFD	f35  = [AO4], LDA
	(p14) LDFD	f37  = [AO6], LDA
	(p15) LDFD	f38  = [AO7], LDA
	;;
	(p13) LDFD	f40  = [AO1], LDA
	(p13) LDFD	f42  = [AO3], LDA
	(p14) LDFD	f44  = [AO5], LDA
	(p15) LDFD	f46  = [AO7], LDA
	;;
	(p13) LDFD	f41  = [AO2]
	(p13) LDFD	f43  = [AO4]
	(p14) LDFD	f45  = [AO6]
	;;
	(p13) LDFD	f96  = [YLD1], 1 * SIZE
	(p13) LDFD	f98  = [YLD2], 1 * SIZE
	;;
	(p13) LDFD	f97  = [YLD1], 3 * SIZE
	(p13) LDFD	f99  = [YLD2], 3 * SIZE
	;;
	(p14) LDFD	f100 = [YLD1], 1 * SIZE
	;;
	(p14) LDFD	f101 = [YLD1], 1 * SIZE
	;;
	(p15) LDFD	f102 = [YLD1], 1 * SIZE
	;;
	(p13) FMA	f96  = ALPHA1,  f32, f96
	(p13) FMA	f98  = ALPHA1,  f34, f98
	(p13) FMA	f97  = ALPHA1,  f33, f97
	(p13) FMA	f99  = ALPHA1,  f35, f99
	(p14) FMA	f100 = ALPHA1,  f36, f100
	(p15) FMA	f102 = ALPHA1,  f38, f102
	(p14) FMA	f101 = ALPHA1,  f37, f101
	;;
	(p13) FMA	f16  = ALPHA2,  f40, f96
	(p13) FMA	f18  = ALPHA2,  f42, f98
	(p13) FMA	f17  = ALPHA2,  f41, f97
	(p13) FMA	f19  = ALPHA2,  f43, f99
	(p14) FMA	f20  = ALPHA2,  f44, f100
	(p15) FMA	f22  = ALPHA2,  f46, f102
	(p14) FMA	f21  = ALPHA2,  f45, f101
	;;
	{ .mmi
	(p13) STFD	[YST1] = f16, 1 * SIZE
	(p13) STFD	[YST2] = f18, 1 * SIZE
	nop   __LINE__
	}
	;;
	{ .mmi
	(p13) STFD	[YST1] = f17, 3 * SIZE
	(p13) STFD	[YST2] = f19
	nop   __LINE__
	}
	;;
	{ .mmi
	(p14) STFD	[YST1] = f20, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f21, 1 * SIZE
	nop   __LINE__
	}
	;;
	{ .mib
	(p15) STFD	[YST1] = f22
	}
	;;

	/* ---- one-column pass: setup -------------------------------------- */
	.align 16
.L40:
	tbit.z	p6, p0 = N, 0			/* bit 0 of N clear: nothing left */
	;;
	(p6) br.cond.dpnt .L990
	;;
	mov	LDA7M8 = 8 * SIZE		/* not read again in the visible code */
	;;
	mov	YLD1 = YY
	mov	YST1 = YY
	adds	YLD2 = 2 * SIZE, YY
	adds	YST2 = 2 * SIZE, YY
	;;
	/*
	 * Exactly one value is fetched through X here.  This pass handles a
	 * single column (A advances by one LDA and every FMA below uses ALPHA1
	 * only), so a second post-incremented load through X would read one
	 * element past the value this column consumes.
	 */
	LDFD	ALPHA1 = [X], INCX
	;;
	FMPY	ALPHA1 = ALPHA, ALPHA1
	;;
	mov	AO1 = A
	adds	AO2 = 1 * SIZE, A
	adds	AO3 = 2 * SIZE, A
	adds	AO4 = 3 * SIZE, A
	adds	AO5 = 4 * SIZE, A
	adds	AO6 = 5 * SIZE, A
	adds	AO7 = 6 * SIZE, A
	adds	AO8 = 7 * SIZE, A
	add	A   = LDA, A			/* A += LDA */
	;;
	shr	I = M, 3			/* number of 8-element blocks */
	mov	pr.rot= 0
	;;
	cmp.eq	p16, p0 = r0, r0		/* p16 = 1: enable loop stage */
	;;
	adds	I = -1, I			/* ar.lc holds count - 1 */
	adds J = -1, J
	;;
	cmp.lt p7, p8 = r0, J
	tbit.nz	p13, p11 = M, 2			/* p13: 4-element remainder */
	tbit.nz	p14, p12 = M, 1			/* p14: 2-element remainder */
	mov	ar.ec= 1
	;;
	{ .mfi
	and	II = 7, M			/* II = M mod 8 */
	mov	ar.lc = I
	}
	{ .mfb
	cmp.eq	p6, p0 = -1, I			/* no full block: skip the loop */
	(p6) br.cond.dpnt .L45
	}
	;;

	/* ---- one-column pass: 8 elements per iteration ------------------- */
	.align 16
.L42:
	(p16) LDFD	f32  = [AO1], 8 * SIZE
	(p16) LDFD	f34  = [AO3], 8 * SIZE
	(p16) LDFD	f36  = [AO5], 8 * SIZE
	(p16) LDFD	f38  = [AO7], 8 * SIZE
	;;
	(p16) LDFD	f33  = [AO2], 8 * SIZE
	(p16) LDFD	f35  = [AO4], 8 * SIZE
	(p16) LDFD	f37  = [AO6], 8 * SIZE
	(p16) LDFD	f39  = [AO8], 8 * SIZE
	;;
	(p16) LDFD	f96  = [YLD1], 1 * SIZE
	(p16) LDFD	f98  = [YLD2], 1 * SIZE
	;;
	(p16) LDFD	f97  = [YLD1], 3 * SIZE
	(p16) LDFD	f99  = [YLD2], 3 * SIZE
	;;
	(p16) LDFD	f100 = [YLD1], 1 * SIZE
	(p16) LDFD	f102 = [YLD2], 1 * SIZE
	;;
	(p16) LDFD	f101 = [YLD1], 3 * SIZE
	(p16) LDFD	f103 = [YLD2], 3 * SIZE
	;;
	(p16) FMA	f16  = ALPHA1,  f32, f96
	(p16) FMA	f18  = ALPHA1,  f34, f98
	(p16) FMA	f17  = ALPHA1,  f33, f97
	(p16) FMA	f19  = ALPHA1,  f35, f99
	(p16) FMA	f20  = ALPHA1,  f36, f100
	(p16) FMA	f22  = ALPHA1,  f38, f102
	(p16) FMA	f21  = ALPHA1,  f37, f101
	(p16) FMA	f23 = ALPHA1,  f39, f103
	;;
	(p16) STFD	[YST1] = f16, 1 * SIZE
	(p16) STFD	[YST2] = f18, 1 * SIZE
	;;
	(p16) STFD	[YST1] = f17, 3 * SIZE
	(p16) STFD	[YST2] = f19, 3 * SIZE
	;;
	(p16) STFD	[YST1] = f20, 1 * SIZE
	(p16) STFD	[YST2] = f22, 1 * SIZE
	;;
	(p16) STFD	[YST1] = f21, 3 * SIZE
	(p16) STFD	[YST2] = f23, 3 * SIZE
	br.ctop.sptk.few .L42
	;;

	/* ---- one-column pass: remainder (M mod 8) ------------------------ */
	.align 16
.L45:
	{ .mmi
	(p8) cmp.eq.unc p10, p0 = r0, II	/* p10 = p8 && (II == 0) */
	(p11) adds	AO5  = - 4 * SIZE, AO5	/* no 4-block: pull AO5 back */
	}
	{ .mbb
	(p11) adds	AO7  = - 4 * SIZE, AO7
	(p10) br.cond.dptk  .L990
	}
	;;
	{ .mmi
	(p13) LDFD	f32  = [AO1], LDA
	(p13) LDFD	f34  = [AO3], LDA
	tbit.nz	p15, p0  = M, 0			/* p15: 1-element remainder */
	}
	{ .mmi
	(p14) LDFD	f36  = [AO5], LDA
	(p11) adds	AO6  = - 4 * SIZE, AO6
	(p12) adds	AO7  = - 2 * SIZE, AO7	/* no 2-block: pull AO7 back */
	}
	;;
	(p13) LDFD	f33  = [AO2], LDA
	(p13) LDFD	f35  = [AO4], LDA
	(p14) LDFD	f37  = [AO6], LDA
	(p15) LDFD	f38  = [AO7], LDA
	;;
	(p13) LDFD	f96  = [YLD1], 1 * SIZE
	(p13) LDFD	f98  = [YLD2], 1 * SIZE
	;;
	(p13) LDFD	f97  = [YLD1], 3 * SIZE
	(p13) LDFD	f99  = [YLD2], 3 * SIZE
	;;
	(p14) LDFD	f100 = [YLD1], 1 * SIZE
	;;
	(p14) LDFD	f101 = [YLD1], 1 * SIZE
	;;
	(p15) LDFD	f102 = [YLD1], 1 * SIZE
	;;
	(p13) FMA	f16  = ALPHA1,  f32, f96
	(p13) FMA	f18  = ALPHA1,  f34, f98
	(p13) FMA	f17  = ALPHA1,  f33, f97
	(p13) FMA	f19  = ALPHA1,  f35, f99
	(p14) FMA	f20  = ALPHA1,  f36, f100
	(p15) FMA	f22  = ALPHA1,  f38, f102
	(p14) FMA	f21 = ALPHA1,  f37, f101
	;;
	{ .mmi
	(p13) STFD	[YST1] = f16, 1 * SIZE
	(p13) STFD	[YST2] = f18, 1 * SIZE
	nop   __LINE__
	}
	;;
	{ .mmi
	(p13) STFD	[YST1] = f17, 3 * SIZE
	(p13) STFD	[YST2] = f19
	nop   __LINE__
	}
	;;
	{ .mmi
	(p14) STFD	[YST1] = f20, 1 * SIZE
	;;
	(p14) STFD	[YST1] = f21, 1 * SIZE
	nop   __LINE__
	}
	;;
	{ .mib
	(p15) STFD	[YST1] = f22
	}
	;;

	/* ---- write-back: Y[i * INCY] += YY[i] when INCY != SIZE ----------- */
	.align 16
.L990:
	cmp.eq	p10, p0 = SIZE, INCY
	;;
	{ .mmi
	mov	YLD1 = YY			/* read pointer, stride SIZE */
	mov	YST1 = Y			/* read pointer, stride INCY */
	mov	pr.rot= 0
	}
	{ .mib
	mov	YST2 = Y			/* write pointer, stride INCY */
	shr	J = M, 3
	(p10) br.cond.dptk .L999		/* INCY == SIZE: skip this phase */
	}
	;;
	{ .mmi
	cmp.eq	p6, p0 = r0, J			/* no full 8-element block */
	adds	J = -1, J
	mov	ar.ec = 4
	}
	{ .mmi
	cmp.eq	p16, p0 = r0, r0
	nop	__LINE__
	tbit.nz	p13, p0 = M, 2
	}
	;;
	{ .mib
	nop	__LINE__
	mov	ar.lc = J
	(p6) br.cond.dpnt .L995
	}
	;;

	/*
	 * Rotating-register loop, 8 elements per iteration: stage p16 loads,
	 * stage p18 adds, stage p19 stores (register numbers shift by one per
	 * rotation, e.g. f32 -> f34 -> f35).
	 */
.L992:
	{ .mfi
	(p19)	STFD	[YST2] = f35
	(p18)	FADD	f34 = f34, f66
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmi
	(p16)	LDFD	f64 = [YLD1], 1 * SIZE
	(p16)	LDFD	f32 = [YST1], INCY
	}
	;;
	{ .mfi
	(p19)	STFD	[YST2] = f39
	(p18)	FADD	f38 = f38, f70
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmi
	(p16)	LDFD	f36 = [YST1], INCY
	(p16)	LDFD	f68 = [YLD1], 1 * SIZE
	}
	;;
	{ .mfi
	(p19)	STFD	[YST2] = f43
	(p18)	FADD	f42 = f42, f74
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmi
	(p16)	LDFD	f72 = [YLD1], 1 * SIZE
	(p16)	LDFD	f40 = [YST1], INCY
	}
	;;
	{ .mfi
	(p19)	STFD	[YST2] = f47
	(p18)	FADD	f46 = f46, f78
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmi
	(p16)	LDFD	f76 = [YLD1], 1 * SIZE
	(p16)	LDFD	f44 = [YST1], INCY
	}
	;;
	{ .mfi
	(p19)	STFD	[YST2] = f51
	(p18)	FADD	f50 = f50, f82
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmi
	(p16)	LDFD	f80 = [YLD1], 1 * SIZE
	(p16)	LDFD	f48 = [YST1], INCY
	}
	;;
	{ .mfi
	(p19)	STFD	[YST2] = f55
	(p18)	FADD	f54 = f54, f86
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmi
	(p16)	LDFD	f84 = [YLD1], 1 * SIZE
	(p16)	LDFD	f52 = [YST1], INCY
	}
	;;
	{ .mfi
	(p19)	STFD	[YST2] = f59
	(p18)	FADD	f58 = f58, f90
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmi
	(p16)	LDFD	f88 = [YLD1], 1 * SIZE
	(p16)	LDFD	f56 = [YST1], INCY
	}
	;;
	{ .mfi
	(p19)	STFD	[YST2] = f63
	(p18)	FADD	f62 = f62, f94
	(p19)	add YST2 = YST2, INCY
	}
	{ .mmb
	(p16)	LDFD	f92 = [YLD1], 1 * SIZE
	(p16)	LDFD	f60 = [YST1], INCY
	br.ctop.sptk.few .L992
	}
	;;

	/* Remainder of the write-back: 4 (p13), 2 (p14) and 1 (p15) elements. */
.L995:
	(p13)	LDFD	f32 = [YST1], INCY
	(p13)	LDFD	f40 = [YLD1], 1 * SIZE
	tbit.nz	p14, p0 = M, 1
	;;
	(p13)	LDFD	f33 = [YST1], INCY
	(p13)	LDFD	f41 = [YLD1], 1 * SIZE
	tbit.nz	p15, p0 = M, 0
	;;
	(p13)	LDFD	f34 = [YST1], INCY
	(p13)	LDFD	f42 = [YLD1], 1 * SIZE
	;;
	(p13)	LDFD	f35 = [YST1], INCY
	(p13)	LDFD	f43 = [YLD1], 1 * SIZE
	;;
	(p14)	LDFD	f36 = [YST1], INCY
	(p14)	LDFD	f44 = [YLD1], 1 * SIZE
	;;
	(p14)	LDFD	f37 = [YST1], INCY
	(p14)	LDFD	f45 = [YLD1], 1 * SIZE
	;;
	(p15)	LDFD	f38 = [YST1], INCY
	(p15)	LDFD	f46 = [YLD1], 1 * SIZE
	;;
	(p13)	FADD	f32 = f32, f40
	(p13)	FADD	f33 = f33, f41
	(p13)	FADD	f34 = f34, f42
	(p13)	FADD	f35 = f35, f43
	(p14)	FADD	f36 = f36, f44
	(p14)	FADD	f37 = f37, f45
	(p15)	FADD	f38 = f38, f46
	;;
	(p13)	STFD	[YST2] = f32
	(p13)	add YST2 = YST2, INCY
	;;
	(p13)	STFD	[YST2] = f33
	(p13)	add YST2 = YST2, INCY
	;;
	(p13)	STFD	[YST2] = f34
	(p13)	add YST2 = YST2, INCY
	;;
	(p13)	STFD	[YST2] = f35
	(p13)	add YST2 = YST2, INCY
	;;
	(p14)	STFD	[YST2] = f36
	(p14)	add YST2 = YST2, INCY
	;;
	(p14)	STFD	[YST2] = f37
	(p14)	add YST2 = YST2, INCY
	;;
	(p15)	STFD	[YST2] = f38
	;;

	/* ---- exit: restore loop counter and predicates, return ------------ */
.L999:
	mov	 ar.lc = ARLC
	mov pr    = PR, -1
	br.ret.sptk.many b0
	;;
	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -