⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemv_t.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
	{ .mmf	(p16) cmp.eq.unc p14, p0 = 6, I	(p16) cmp.eq.unc p15, p0 = 7, I	(p20) ADD4	f15 = f116, f71, f15	}	;;	{ .mmf	(p12) PREFETCH [RPRE5], 16 * SIZE	(p16) LDFD	f72 = [AO5], 1 * SIZE	(p20) ADD1	f16 = f116, f76, f16	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f17 = f121, f76, f17	}	;;	{ .mmf	(p16) LDFD	f77 = [AO5], 1 * SIZE	nop   __LINE__	(p20) ADD1	f18 = f116, f86, f18	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f19 = f121, f86, f19	}	;;	{ .mmf	(p13) PREFETCH [RPRE6], 16 * SIZE	(p16) LDFD	f82 = [AO6], 1 * SIZE	(p20) ADD1	f20 = f116, f96, f20	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f21 = f121, f96, f21	}	;;	{ .mmf	(p16) LDFD	f87 = [AO6], 1 * SIZE	nop   __LINE__	(p20) ADD1	f22 = f116, f106, f22	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f23 = f121, f106, f23	}	;;	{ .mmf	(p14) PREFETCH [RPRE7], 16 * SIZE	(p16) LDFD	f92 = [AO7], 1 * SIZE	(p20) ADD3	f16 = f121, f81, f16	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD4	f17 = f116, f81, f17	}	;;	{ .mmf	(p16) LDFD	f97 = [AO7], 1 * SIZE	nop   __LINE__	(p20) ADD3	f18 = f121, f91, f18	}	{ .mmf	nop   __LINE__	(p16) adds I = 1, I	(p20) ADD4	f19 = f116, f91, f19	}	;;	{ .mmf	(p15) PREFETCH [RPRE8], 16 * SIZE	(p16) LDFD	f102 = [AO8], 1 * SIZE	(p20) ADD3	f20 = f121, f101, f20	}	{ .mmf	(p15) mov I = 0	nop   __LINE__	(p20) ADD4	f21 = f116, f101, f21	}	;;	{ .mmf	(p16) LDFD	f107 = [AO8], 1 * SIZE	nop   __LINE__	(p20) ADD3	f22 = f121, f111, f22	}	{ .mfb	(p16) cmp.eq.unc p12, p0 = 0, I	(p20) ADD4	f23 = f116, f111, f23	br.ctop.sptk.few .L116	}	;;.L118:	LDFD	f32 = [CLD1], SIZE	LDFD	f36 = [CLD2], SIZE	shladd	CST2  = INCY, 1, CST1	;;	LDFD	f33 = [CLD1], INCYM1	LDFD	f37 = [CLD2], INCYM1	;;	LDFD	f34 = [CLD1], SIZE	LDFD	f38 = [CLD2], SIZE	;;	LDFD	f35 = [CLD1], INCY3M1	LDFD	f39 = [CLD2], INCY3M1	;;	LDFD	f40 = [CLD1], SIZE	LDFD	f44 = [CLD2], SIZE	;;	LDFD	f41 = [CLD1], INCYM1	LDFD	f45 = [CLD2], INCYM1	;;	LDFD	f42 = [CLD1], SIZE	LDFD	f46 = [CLD2], SIZE	;;	LDFD	f43 = [CLD1], INCY3M1	LDFD	f47 = [CLD2], INCY3M1	;;	FMA	f32 = ALPHA_R, f8,  f32	FMA	f36 = ALPHA_R, f12, f36	FMA	f33 = ALPHA_I, f8,  f33	FMA	f37 = ALPHA_I, f12, f37	FMA	f34 = ALPHA_R, f10, f34	FMA	f38 = ALPHA_R, f14, f38	FMA	f35 = ALPHA_I, f10, f35	FMA	f39 = ALPHA_I, f14, f39	;;	FNMA	f32 = ALPHA_I, f9,  f32	FNMA	f36 = ALPHA_I, f13, f36	FMA	f33 = ALPHA_R, f9,  f33	FMA	f37 = ALPHA_R, f13, f37	FNMA	f34 = ALPHA_I, f11, f34	FNMA	f38 = ALPHA_I, f15, f38	FMA	f35 = ALPHA_R, f11, f35	FMA	f39 = ALPHA_R, f15, f39	;;	FMA	f40 = ALPHA_R, f16, f40	FMA	f44 = ALPHA_R, f20, f44	FMA	f41 = ALPHA_I, f16, f41	FMA	f45 = ALPHA_I, f20, f45	FMA	f42 = ALPHA_R, f18, f42	FMA	f46 = ALPHA_R, f22, f46	FMA	f43 = ALPHA_I, f18, f43	FMA	f47 = ALPHA_I, f22, f47	;;	{ .mmf	STFD [CST1] = f32, SIZE	STFD [CST2] = f36, SIZE	FNMA	f40 = ALPHA_I, f17, f40	}	{ .mmf	nop	__LINE__	nop	__LINE__	FNMA	f44 = ALPHA_I, f21, f44	}	;;	{ .mmf	STFD [CST1] = f33	STFD [CST2] = f37	FMA	f41 = ALPHA_R, f17, f41	}	{ .mmf	add  CST1 = CST1, INCYM1	add  CST2 = CST2, INCYM1	FMA	f45 = ALPHA_R, f21, f45	}	;;	{ .mmf	STFD [CST1] = f34, SIZE	STFD [CST2] = f38, SIZE	FNMA	f42 = ALPHA_I, f19, f42	}	{ .mmf	nop	__LINE__	nop	__LINE__	FNMA	f46 = ALPHA_I, f23, f46	}	;;	{ .mmf	STFD [CST1] = f35	STFD [CST2] = f39	FMA	f43 = ALPHA_R, f19, f43	}	{ .mmf	add  CST1 = CST1, INCY3M1	add  CST2 = CST2, INCY3M1	FMA	f47 = ALPHA_R, f23, f47	}	;;	{ .mmi	STFD [CST1] = f40, SIZE	STFD [CST2] = f44, SIZE	adds J = -1, J	}	;;	{ .mmi	STFD [CST1] = f41	STFD [CST2] = f45	add  CST1 = CST1, INCYM1	}	{ .mmi	nop  __LINE__	nop  __LINE__	add  CST2 = CST2, INCYM1	}	;;	{ .mmi	STFD [CST1] = f42, SIZE	STFD [CST2] = f46, SIZE	cmp.lt p6, p0 = 0, J	}	;;	{ .mmi	STFD [CST1] = f43	STFD [CST2] = f47	add  CST1 = CST1, INCY3M1	}	{ .mmb	add  CST2 = CST2, INCY3M1	(p6) br.cond.dptk .L111	}	;;	.align 16.L120:	{ .mfi	mov	AO1 = A	mov	f8  = f0	mov	pr.rot= 0	}	{ .mfi	add	AO2 = LDA, A	mov	f10 = f0	tbit.z	p6, p0  = N, 2	}	;;	{ .mmf	shladd	AO3 = LDA, 1, A	shladd	AO4 = LDA, 1, AO2	mov	f12 = f0	}	{ .mfb	mov	BO  = BUFFER	mov	f14 = f0	(p6) br.cond.dpnt .L130	}	;;	{ .mfi	adds	RPRE1  = (RPREFETCH +  0) * SIZE, AO1	mov	f9  = f0	mov	ar.ec= 5	}	{ .mmf	adds	RPRE2  = (RPREFETCH +  2) * SIZE, AO2	adds	I = -1, MIN_M	mov	f11 = f0	}	;;	{ .mmf	adds	RPRE3  = (RPREFETCH +  4) * SIZE, AO3	adds	RPRE4  = (RPREFETCH +  6) * SIZE, AO4	mov	f13 = f0	}	{ .mmf	cmp.eq	p16, p0 = r0, r0	shladd	A   = LDA, 2, A	mov	f15 = f0	}	;;	{ .mmi	lfetch.excl.nt1	[WPRE]	adds	PREB   = RPREFETCH * SIZE, BO	mov	ar.lc = I	}	{ .mmi	adds	WPRE = 16 * SIZE, CLD1	cmp.eq  p12, p0 = r0, r0	mov	I = 0	}	;;	.align 16.L126:	{ .mmf	(p12) PREFETCH [RPRE1], 16 * SIZE	(p16) LDFD	f32 = [AO1], 1 * SIZE	(p20) ADD1	f8  = f116, f36, f8	}	{ .mmf	(p16) cmp.eq.unc p13, p0 = 2, I	(p16) cmp.eq.unc p14, p0 = 4, I	(p20) ADD2	f9  = f121, f36, f9	}	;;	{ .mmf	(p12) PREFETCH [PREB], 16 * SIZE	(p16) LDFPD	f112, f117 = [BO], 2 * SIZE	(p20) ADD1	f10 = f116, f46, f10	}	{ .mmf	(p16) LDFD	f37 = [AO1], 1 * SIZE	(p16) cmp.eq.unc p15, p0 = 6, I	(p20) ADD2	f11 = f121, f46, f11	}	;;	{ .mmf	(p16) LDFD	f42 = [AO2], 1 * SIZE	nop   __LINE__	(p20) ADD1	f12 = f116, f56, f12	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f13 = f121, f56, f13	}	;;	{ .mmf	(p13) PREFETCH [RPRE2], 16 * SIZE	(p16) LDFD	f47 = [AO2], 1 * SIZE	(p20) ADD1	f14 = f116, f66, f14	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f15 = f121, f66, f15	}	;;	{ .mmf	(p16) LDFD	f52 = [AO3], 1 * SIZE	nop   __LINE__	(p20) ADD3	f8  = f121, f41, f8	}	{ .mmf	nop   __LINE__	(p16) adds I = 1, I	(p20) ADD4	f9  = f116, f41, f9	}	;;	{ .mmf	(p14) PREFETCH [RPRE3], 16 * SIZE	(p16) LDFD	f57 = [AO3], 1 * SIZE	(p20) ADD3	f10 = f121, f51, f10	}	{ .mmf	nop   __LINE__	(p16) cmp.eq.unc p15, p0 = 8, I	(p20) ADD4	f11 = f116, f51, f11	}	;;	{ .mmf	(p16) LDFD	f62 = [AO4], 1 * SIZE	nop   __LINE__	(p20) ADD3	f12 = f121, f61, f12	}	{ .mmf	(p15) mov I = 0	nop   __LINE__	(p20) ADD4	f13 = f116, f61, f13	}	;;	{ .mmf	(p15) PREFETCH [RPRE4], 16 * SIZE	(p16) LDFD	f67 = [AO4], 1 * SIZE	(p20) ADD3	f14 = f121, f71, f14	}	{ .mfb	(p16) cmp.eq.unc p12, p0 = 0, I	(p20) ADD4	f15 = f116, f71, f15	br.ctop.sptk.few .L126	}	;;.L128:	LDFD	f32 = [CLD1], SIZE	LDFD	f36 = [CLD2], SIZE	shladd	CST2  = INCY, 1, CST1	;;	LDFD	f33 = [CLD1], INCYM1	LDFD	f37 = [CLD2], INCYM1	;;	LDFD	f34 = [CLD1], SIZE	LDFD	f38 = [CLD2], SIZE	;;	LDFD	f35 = [CLD1], INCY3M1	LDFD	f39 = [CLD2], INCY3M1	;;	FMA	f32 = ALPHA_R, f8,  f32	FMA	f36 = ALPHA_R, f12, f36	FMA	f33 = ALPHA_I, f8,  f33	FMA	f37 = ALPHA_I, f12, f37	FMA	f34 = ALPHA_R, f10, f34	FMA	f38 = ALPHA_R, f14, f38	FMA	f35 = ALPHA_I, f10, f35	FMA	f39 = ALPHA_I, f14, f39	;;	FNMA	f32 = ALPHA_I, f9,  f32	FNMA	f36 = ALPHA_I, f13, f36	FMA	f33 = ALPHA_R, f9,  f33	FMA	f37 = ALPHA_R, f13, f37	FNMA	f34 = ALPHA_I, f11, f34	FNMA	f38 = ALPHA_I, f15, f38	FMA	f35 = ALPHA_R, f11, f35	FMA	f39 = ALPHA_R, f15, f39	;;	STFD [CST1] = f32, SIZE	STFD [CST2] = f36, SIZE	;;	STFD [CST1] = f33	STFD [CST2] = f37	add  CST1 = CST1, INCYM1	add  CST2 = CST2, INCYM1	;;	STFD [CST1] = f34, SIZE	STFD [CST2] = f38, SIZE	;;	STFD [CST1] = f35	STFD [CST2] = f39	add  CST1 = CST1, INCY3M1	add  CST2 = CST2, INCY3M1	;;	.align 16.L130:	{ .mfi	mov	AO1 = A	mov	f8  = f0	mov	pr.rot= 0	}	{ .mfi	add	AO2 = LDA, A	mov	f10 = f0	tbit.z	p6, p0  = N, 1	}	;;	{ .mmf	adds	RPRE1  = (RPREFETCH +  0) * SIZE, AO1	adds	RPRE2  = (RPREFETCH +  2) * SIZE, AO2	mov	f12 = f0	}	{ .mfb	adds	I = -1, MIN_M	mov	f14 = f0	(p6) br.cond.dpnt .L140	}	;;	{ .mfi	mov	BO  = BUFFER	mov	f9  = f0	mov	ar.ec= 5	}	{ .mmf	cmp.eq	p16, p0 = r0, r0	shladd	A   = LDA, 1, A	mov	f11 = f0	}	;;	{ .mfi	adds	WPRE = 16 * SIZE, CLD1	mov	f13 = f0	mov	ar.lc = I	}	{ .mmf	adds	PREB   = RPREFETCH * SIZE, BO	nop	__LINE__	mov	f15 = f0	}	;;	{ .mmi	lfetch.excl.nt1	[WPRE]	cmp.eq  p12, p0 = r0, r0	mov	I = 0	}	;;	.align 16.L136:	{ .mmf	(p12) PREFETCH [RPRE1], 16 * SIZE	(p16) LDFD	f32 = [AO1], 1 * SIZE	(p20) ADD1	f8  = f116, f36, f8	}	{ .mmf	(p16) cmp.eq.unc p13, p0 = 4, I	(p16) adds I = 1, I	(p20) ADD2	f9  = f121, f36, f9	}	;;	{ .mmf	(p12) PREFETCH [PREB], 16 * SIZE	(p16) LDFPD	f112, f117 = [BO], 2 * SIZE	(p20) ADD1	f10 = f116, f46, f10	}	{ .mmf	(p16) LDFD	f37 = [AO1], 1 * SIZE	(p16) cmp.eq.unc p12, p0 = 8, I	(p20) ADD2	f11 = f121, f46, f11	}	;;	{ .mmf	(p13) PREFETCH [RPRE2], 16 * SIZE	(p16) LDFD	f42 = [AO2], 1 * SIZE	(p20) ADD3	f12 = f121, f41, f12	}	{ .mmf	(p12) mov I = 0	nop	__LINE__	(p20) ADD4	f13 = f116, f41, f13	}	;;	{ .mmf	(p16) LDFD	f47 = [AO2], 1 * SIZE	nop	__LINE__	(p20) ADD3	f14 = f121, f51, f14	}	{ .mfb	nop	__LINE__	(p20) ADD4	f15 = f116, f51, f15	br.ctop.sptk.few .L136	}	;;.L138:	LDFD	f32 = [CLD1], SIZE	FADD	f8  = f8,  f12	shladd	CST2  = INCY, 1, CST1	;;	LDFD	f33 = [CLD1], INCYM1	FADD	f10 = f10, f14	;;	LDFD	f34 = [CLD1], SIZE	FADD	f9  = f9,  f13	;;	LDFD	f35 = [CLD1], INCYM1	FADD	f11 = f11, f15	;;	FMA	f32 = ALPHA_R, f8,  f32	FMA	f33 = ALPHA_I, f8,  f33	FMA	f34 = ALPHA_R, f10, f34	FMA	f35 = ALPHA_I, f10, f35	;;	FNMA	f32 = ALPHA_I, f9,  f32	FMA	f33 = ALPHA_R, f9,  f33	FNMA	f34 = ALPHA_I, f11, f34	FMA	f35 = ALPHA_R, f11, f35	;;	STFD [CST1] = f32, SIZE	;;	STFD [CST1] = f33	add  CST1 = CST1, INCYM1	;;	STFD [CST1] = f34, SIZE	;;	STFD [CST1] = f35	add  CST1 = CST1, INCYM1	;;	.align 16.L140:	{ .mfi	mov	AO1 = A	mov	f8  = f0	mov	pr.rot= 0	}	{ .mfi	mov	f9  = f0	tbit.z	p6, p0  = N, 0	}	;;	{ .mfi	adds	RPRE1  = (RPREFETCH +  0) * SIZE, AO1	mov	f10 = f0	mov	ar.ec= 5	}	{ .mfb	adds	I = -1, MIN_M	mov	f11 = f0	(p6) br.cond.dpnt .L199	}	;;	{ .mmi	cmp.eq	p16, p0 = r0, r0	shladd	A   = LDA, 1, A	mov	ar.lc = I	}	{ .mmi	adds	WPRE = 16 * SIZE, CLD1	adds	PREB   = RPREFETCH * SIZE, BO	mov	BO  = BUFFER	}	;;	{ .mmi	lfetch.excl.nt1	[WPRE]	cmp.eq  p12, p0 = r0, r0	mov	I = 0	}	;;	.align 16.L146:	{ .mmf	(p12) PREFETCH [RPRE1], 16 * SIZE	(p16) LDFD	f32  = [AO1], 1 * SIZE	(p20) ADD1	f8  = f116, f36, f8	}	{ .mmf	(p16) cmp.eq.unc p12, p0 = 7, I	(p16) adds I = 1, I	(p20) ADD2	f9  = f121, f36, f9	}	;;	{ .mmf	(p16) LDFPD	f112, f117 = [BO], 2 * SIZE	(p16) LDFD	f37  = [AO1], 1 * SIZE	(p20) ADD3	f10  = f121, f41, f10	}	{ .mfb	(p12) mov I = 0	(p20) ADD4	f11  = f116, f41, f11	br.ctop.sptk.few .L146	}	;;.L148:	LDFD	f32 = [CLD1], SIZE	FADD	f8  = f8,  f10	shladd	CST2  = INCY, 1, CST1	;;	LDFD	f33 = [CLD1], INCYM1	FADD	f9  = f9,  f11	;;	FMA	f32 = ALPHA_R, f8,  f32	FMA	f33 = ALPHA_I, f8,  f33	;;	FNMA	f32 = ALPHA_I, f9,  f32	FMA	f33 = ALPHA_R, f9,  f33	;;	STFD [CST1] = f32, SIZE	;;	STFD [CST1] = f33	add  CST1 = CST1, INCYM1	;;	.align 16.L199:	adds	IS = P, IS	shladd	A  = LDAP, ZBASE_SHIFT, A	;;	cmp.gt	p6, p0 = M, IS	(p6) br.cond.dptk .LIs_loop	.align 16	;;.L999:	mov	r8 = r0	adds	r9 = 1 * 16, SP	;;	ldf.fill  f16 = [SP], 32	ldf.fill  f17 = [r9], 32	mov	 ar.lc = ARLC	;;		ldf.fill  f18 = [SP], 32	ldf.fill  f19 = [r9], 32	mov pr    = PR, -1	;;		ldf.fill  f20 = [SP], 32	ldf.fill  f21 = [r9], 32	mov	ar.pfs = ARPFS	;;		ldf.fill  f22 = [SP], 32	ldf.fill  f23 = [r9]	br.ret.sptk.many b0	;;	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -