⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ddot.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	.align 32/* INCY == 1 */.L112:	{ .mmf	(p16) LDFPD	f32, f35  = [Y1], 2 * SIZE	(p16) lfetch.nt1 [PREX], INCX16	(p18) FMA	f8  = f34, f82,  f8	}	{ .mmf	(p16) LDFD	f80 = [X1], INCX	(p16) LDFD	f86 = [X2], INCX	(p18) FMA	f9  = f37, f85,  f9	}	;;	{ .mmf	(p16) LDFPD	f38,  f41  = [Y1], 2 * SIZE	(p16) lfetch.nt1 [PREY], INCY16	(p18) FMA	f10 = f40, f88,  f10	}	{ .mmf	(p16) LDFD	f83  = [X1], INCX3	(p16) LDFD	f89  = [X2], INCX3	(p18) FMA	f11 = f43, f91,  f11	}	;;	{ .mmf	(p16) LDFPD	f44,  f47  = [Y1], 2 * SIZE	(p18) FMA	f12 = f46, f94,  f12	}	{ .mmf	(p16) LDFD	f92  = [X1], INCX	(p16) LDFD	f98  = [X2], INCX	(p18) FMA	f13 = f49, f97,  f13	}	;;	{ .mmf	(p16) LDFPD	f50,  f53  = [Y1], 2 * SIZE	(p18) FMA	f14 = f52, f100,  f14	}	{ .mmf	(p16) LDFD	f95  = [X1], INCX3	(p16) LDFD	f101 = [X2], INCX3	(p18) FMA	f15 = f55, f103, f15	}	;;	{ .mmf	(p16) LDFPD	f56,  f59  = [Y1], 2 * SIZE	(p18) FMA	f8  = f58, f106, f8	}	{ .mmf	(p16) LDFD	f104 = [X1], INCX	(p16) LDFD	f110 = [X2], INCX	(p18) FMA	f9   = f61, f109, f9	}	;;	{ .mmf	(p16) LDFPD	f62,  f65  = [Y1], 2 * SIZE	(p18) FMA	f10 = f64, f112, f10	}	{ .mmf	(p16) LDFD	f107 = [X1], INCX3	(p16) LDFD	f113 = [X2], INCX3	(p18) FMA	f11 = f67, f115, f11	}	;;	{ .mmf	(p16) LDFPD	f68,  f71  = [Y1], 2 * SIZE	(p18) FMA	f12 = f70, f118, f12	}	{ .mmf	(p16) LDFD	f116 = [X1], INCX	(p16) LDFD	f122 = [X2], INCX	(p18) FMA	f13 = f73, f121, f13	}	;;	{ .mmf	(p16) LDFPD	f74,  f77  = [Y1], 2 * SIZE	(p16) LDFD	f119 = [X1], INCX3	(p18) FMA	f14 = f76, f124, f14	}	{ .mfb	(p16) LDFD	f125 = [X2], INCX3	(p18) FMA	f15 = f79, f127, f15	br.ctop.sptk.few .L112	}	;;	.align 32.L115:	{ .mmi	(p12) LDFPD	f32, f33  = [Y1], 2 * SIZE	mov	XX = X1	tbit.z	p0, p13 = N, 2	}	{ .mmb	(p12) LDFD	f34 = [X1], INCX	(p12) LDFD	f38 = [X2], INCX	(p7) br.cond.dptk .L999	}	;;	{ .mmi	(p12) LDFPD	f36, f37  = [Y1], 2 * SIZE	(p12) shladd XX = INCX, 3, XX	tbit.z	p0, p14 = N, 1	}	{ .mmi	(p12) LDFD	f35 = [X1], INCX3	(p12) LDFD	f39 = [X2], INCX3	tbit.z	p0, p15 = N, 0	}	;;	{ .mmi	(p12) LDFPD	f40, f41  = [Y1], 2 * SIZE	(p13) shladd XX = INCX, 2, XX	}	{ .mmi	(p12) LDFD	f42 = [X1], INCX	(p12) LDFD	f46 = [X2], INCX	}	;;	(p12) LDFPD	f44, f45  = [Y1], 2 * SIZE	(p12) LDFD	f43 = [X1], INCX3	(p12) LDFD	f47 = [X2], INCX3	(p14) shladd XX = INCX, 1, XX	;;	(p13) LDFPD	f48, f49  = [Y1], 2 * SIZE	(p13) LDFD	f50 = [X1], INCX	(p13) LDFD	f54 = [X2], INCX	;;	(p13) LDFPD	f52, f53  = [Y1], 2 * SIZE	(p13) LDFD	f51 = [X1], INCX3	(p13) LDFD	f55 = [X2], INCX3	;;	(p14) LDFPD	f56, f57  = [Y1], 2 * SIZE	(p14) LDFD	f58 = [X1], INCX	(p15) LDFD	f61 = [XX]	;;	(p14) LDFD	f59 = [X1]	(p15) LDFD	f60 = [Y1]	;;	(p12) FMA	f8  = f32, f34, f8	(p12) FMA	f9  = f33, f35, f9	(p12) FMA	f10 = f36, f38, f10	(p12) FMA	f11 = f37, f39, f11	(p12) FMA	f12 = f40, f42, f12	(p12) FMA	f13 = f41, f43, f13	(p12) FMA	f14 = f44, f46, f14	(p12) FMA	f15 = f45, f47, f15	;;		      	(p13) FMA	f8  = f48, f50, f8	(p13) FMA	f9  = f49, f51, f9	(p13) FMA	f10 = f52, f54, f10	(p13) FMA	f11 = f53, f55, f11	(p14) FMA	f12 = f56, f58, f12	(p14) FMA	f13 = f57, f59, f13	(p15) FMA	f14 = f60, f61, f14	br	.L999	;;	.align 32.L120:	{ .mmi	adds	PREX = (PREFETCH_SIZE + 17) * SIZE, X1	adds	PREY = (PREFETCH_SIZE + 19) * SIZE, X1	mov	ar.lc = I	}	{ .mfb	cmp.eq	p6 ,p0  =   -1, I	FMA  f15 = f32, f80, f0	(p6) br.cond.dpnt  .L125	}	;;	.align 32.L122:	{ .mmf	(p16) LDFPD	f32, f35  = [Y1], 2 * SIZE	(p16) lfetch.nt1 [PREX], INCX16	(p18) FMA	f8  = f34, f82,  f8	}	{ .mmf	(p17) LDFD	f105 = [X1], INCX	(p17) LDFD	f111 = [X2], INCX	(p18) FMA	f9  = f37, f85,  f9	}	;;	{ .mmf	(p16) LDFPD	f38,  f41  = [Y1], 2 * SIZE	(p16) lfetch.nt1 [PREY], INCY16	(p18) FMA	f10 = f40, f88,  f10	}	{ .mmf	(p17) LDFD	f108 = [X1], INCX3	(p17) LDFD	f114 = [X2], INCX3	(p18) FMA	f11 = f43, f91,  f11	}	;;	{ .mmf	(p16) LDFPD	f44,  f47  = [Y1], 2 * SIZE	(p18) FMA	f12 = f46, f94,  f12	}	{ .mmf	(p17) LDFD	f117 = [X1], INCX	(p17) LDFD	f123 = [X2], INCX	(p18) FMA	f13 = f49, f97,  f13	}	;;	{ .mmf	(p16) LDFPD	f50,  f53  = [Y1], 2 * SIZE	(p18) FMA	f14 = f52, f100,  f14	}	{ .mmf	(p17) LDFD	f120 = [X1], INCX3	(p17) LDFD	f126 = [X2], INCX3	(p18) FMA	f15 = f55, f103, f15	}	;;	{ .mmf	(p16) LDFPD	f56,  f59  = [Y1], 2 * SIZE	(p18) FMA	f8  = f58, f106, f8	}	{ .mmf	(p16) LDFD	f80 = [X1], INCX	(p16) LDFD	f86 = [X2], INCX	(p18) FMA	f9   = f61, f109, f9	}	;;	{ .mmf	(p16) LDFPD	f62,  f65  = [Y1], 2 * SIZE	(p18) FMA	f10 = f64, f112, f10	}	{ .mmf	(p16) LDFD	f83 = [X1], INCX3	(p16) LDFD	f89 = [X2], INCX3	(p18) FMA	f11 = f67, f115, f11	}	;;	{ .mmf	(p16) LDFPD	f68,  f71  = [Y1], 2 * SIZE	(p18) FMA	f12 = f70, f118, f12	}	{ .mmf	(p16) LDFD	f92 = [X1], INCX	(p16) LDFD	f98 = [X2], INCX	(p18) FMA	f13 = f73, f121, f13	}	;;	{ .mmf	(p16) LDFPD	f74,  f77  = [Y1], 2 * SIZE	(p16) LDFD	f95 = [X1], INCX3	(p18) FMA	f14 = f76, f124, f14	}	{ .mfb	(p16) LDFD	f101 = [X2], INCX3	(p18) FMA	f15 = f79, f127, f15	br.ctop.sptk.few .L122	}	;;	.align 32.L125:	{ .mmi	(p12) LDFPD	f32, f33  = [Y1], 2 * SIZE	mov	XX = X1	tbit.z	p0, p13 = N, 2	}	{ .mmb	(p12) LDFD	f34 = [X1], INCX	(p12) LDFD	f38 = [X2], INCX	(p7) br.cond.dptk .L999	}	;;	{ .mmi	(p12) LDFPD	f36, f37  = [Y1], 2 * SIZE	(p12) shladd XX = INCX, 3, XX	tbit.z	p0, p14 = N, 1	}	{ .mmi	(p12) LDFD	f35 = [X1], INCX3	(p12) LDFD	f39 = [X2], INCX3	tbit.z	p0, p15 = N, 0	}	;;	{ .mmi	(p12) LDFPD	f40, f41  = [Y1], 2 * SIZE	(p13) shladd XX = INCX, 2, XX	}	{ .mmi	(p12) LDFD	f42 = [X1], INCX	(p12) LDFD	f46 = [X2], INCX	}	;;	(p12) LDFPD	f44, f45  = [Y1], 2 * SIZE	(p12) LDFD	f43 = [X1], INCX3	(p12) LDFD	f47 = [X2], INCX3	(p14) shladd XX = INCX, 1, XX	;;	(p13) LDFPD	f48, f49  = [Y1], 2 * SIZE	(p13) LDFD	f50 = [X1], INCX	(p13) LDFD	f54 = [X2], INCX	;;	(p13) LDFPD	f52, f53  = [Y1], 2 * SIZE	(p13) LDFD	f51 = [X1], INCX3	(p13) LDFD	f55 = [X2], INCX3	;;	(p14) LDFPD	f56, f57  = [Y1], 2 * SIZE	(p14) LDFD	f58 = [X1], INCX	(p15) LDFD	f61 = [XX]	;;	(p14) LDFD	f59 = [X1]	(p15) LDFD	f60 = [Y1]	;;	(p12) FMA	f8  = f32, f34, f8	(p12) FMA	f9  = f33, f35, f9	(p12) FMA	f10 = f36, f38, f10	(p12) FMA	f11 = f37, f39, f11	(p12) FMA	f12 = f40, f42, f12	(p12) FMA	f13 = f41, f43, f13	(p12) FMA	f14 = f44, f46, f14	(p12) FMA	f15 = f45, f47, f15	;;		      	(p13) FMA	f8  = f48, f50, f8	(p13) FMA	f9  = f49, f51, f9	(p13) FMA	f10 = f52, f54, f10	(p13) FMA	f11 = f53, f55, f11	(p14) FMA	f12 = f56, f58, f12	(p14) FMA	f13 = f57, f59, f13	(p15) FMA	f14 = f60, f61, f14	br	.L999	;;	.align 32.L200:	{ .mfi	shladd	INCX3 = INCX, 1, INCX	mov	f12 = f0	mov	pr.rot= 0	}	{ .mfi	and	J =  15, N	mov	f13 = f0	shr	I =  N, 4	}	;;	{ .mmf	cmp.eq	p16, p0 = r0, r0	shladd	INCY3 = INCY, 1, INCY	mov	f14 = f0	}	{ .mmi	shladd	INCX16 = INCX, 4, r0	shladd	INCY16 = INCY, 4, r0	tbit.z	p0, p12 = N, 3	}	;;	{ .mmi	cmp.eq	p7, p0  =   r0, J	adds	I = -1, I	mov	ar.ec= 3	}	{ .mmi	shladd	Y2 = INCY, 1, Y1	mov	XX = X1	mov	YY = Y1	}	;;	{ .mmi	adds	PREX = (PREFETCH_SIZE + 5) * SIZE, X1	adds	PREY = (PREFETCH_SIZE + 3) * SIZE, Y1	mov	ar.lc = I	}	{ .mfb	cmp.eq	p6 ,p0  =   -1, I	mov	f15 = f0	(p6) br.cond.dpnt  .L215	}	;;	.align 32/* INCY == 1 */.L212:	{ .mmf	(p16) lfetch.nt1 [PREX], INCX16	(p16) lfetch.nt1 [PREY], INCY16	(p18) FMA	f8  = f34, f82,  f8	}	{ .mmf	(p16) LDFD	f32 = [Y1], INCY	(p16) LDFD	f38 = [Y2], INCY	(p18) FMA	f9  = f37, f85,  f9	}	;;	{ .mmf	(p16) LDFD	f80 = [X1], INCX	(p16) LDFD	f86 = [X2], INCX	(p18) FMA	f10 = f40, f88,  f10	}	{ .mmf	(p16) LDFD	f35 = [Y1], INCY3	(p16) LDFD	f41 = [Y2], INCY3	(p18) FMA	f11 = f43, f91,  f11	}	;;	{ .mmf	(p16) LDFD	f83  = [X1], INCX3	(p16) LDFD	f89  = [X2], INCX3	(p18) FMA	f12 = f46, f94,  f12	}	{ .mmf	(p16) LDFD	f44 = [Y1], INCY	(p16) LDFD	f50 = [Y2], INCY	(p18) FMA	f13 = f49, f97,  f13	}	;;	{ .mmf	(p16) LDFD	f92  = [X1], INCX	(p16) LDFD	f98  = [X2], INCX	(p18) FMA	f14 = f52, f100,  f14	}	{ .mmf	(p16) LDFD	f47 = [Y1], INCY3	(p16) LDFD	f53 = [Y2], INCY3	(p18) FMA	f15 = f55, f103, f15	}	;;	{ .mmf	(p16) LDFD	f95  = [X1], INCX3	(p16) LDFD	f101 = [X2], INCX3	(p18) FMA	f8  = f58, f106, f8	}	{ .mmf	(p16) LDFD	f56 = [Y1], INCY	(p16) LDFD	f62 = [Y2], INCY	(p18) FMA	f9   = f61, f109, f9	}	;;	{ .mmf	(p16) LDFD	f104 = [X1], INCX	(p16) LDFD	f110 = [X2], INCX	(p18) FMA	f10 = f64, f112, f10	}	{ .mmf	(p16) LDFD	f59 = [Y1], INCY3	(p16) LDFD	f65 = [Y2], INCY3	(p18) FMA	f11 = f67, f115, f11	}	;;	{ .mmf	(p16) LDFD	f107 = [X1], INCX3	(p16) LDFD	f113 = [X2], INCX3	(p18) FMA	f12 = f70, f118, f12	}	{ .mmf	(p16) LDFD	f68 = [Y1], INCY	(p16) LDFD	f74 = [Y2], INCY	(p18) FMA	f13 = f73, f121, f13	}	;;	{ .mmf	(p16) LDFD	f116 = [X1], INCX	(p16) LDFD	f122 = [X2], INCX	(p18) FMA	f14 = f76, f124, f14	}	{ .mmf	(p16) LDFD	f71 = [Y1], INCY3	(p16) LDFD	f77 = [Y2], INCY3	(p18) FMA	f15 = f79, f127, f15	}	;;	{ .mmi	(p16) LDFD	f119 = [X1], INCX3	(p16) LDFD	f125 = [X2], INCX3	}	{ .mmb	(p16) add XX = INCX16, XX	(p16) add YY = INCY16, YY	br.ctop.sptk.few .L212	}	;;	.align 32.L215:	{ .mmi	(p12) LDFD	f34 = [X1], INCX	(p12) LDFD	f38 = [X2], INCX	tbit.z	p0, p13 = N, 2	}	{ .mmb	(p12) LDFD	f32 = [Y1], INCY	(p12) LDFD	f36 = [Y2], INCY	(p7) br.cond.dptk .L999	}	;;	{ .mmi	(p12) LDFD	f35 = [X1], INCX3	(p12) LDFD	f39 = [X2], INCX3	tbit.z	p0, p14 = N, 1	}	{ .mmi	(p12) LDFD	f33 = [Y1], INCY3	(p12) LDFD	f37 = [Y2], INCY3	tbit.z	p0, p15 = N, 0	}	;;	{ .mmi	(p12) LDFD	f42 = [X1], INCX	(p12) LDFD	f46 = [X2], INCX	(p12) shladd XX = INCX, 3, XX	}	{ .mmi	(p12) LDFD	f40 = [Y1], INCY	(p12) LDFD	f44 = [Y2], INCY	(p12) shladd YY = INCY, 3, YY	}	;;	{ .mmi	(p12) LDFD	f43 = [X1], INCX3	(p12) LDFD	f47 = [X2], INCX3	(p13) shladd XX = INCX, 2, XX	}	{ .mmi	(p12) LDFD	f41 = [Y1], INCY3	(p12) LDFD	f45 = [Y2], INCY3	(p13) shladd YY = INCY, 2, YY	}	;;	(p13) LDFD	f50 = [X1], INCX	(p13) LDFD	f54 = [X2], INCX	(p14) shladd XX = INCX, 1, XX	(p13) LDFD	f48 = [Y1], INCY	(p13) LDFD	f52 = [Y2], INCY	(p14) shladd YY = INCY, 1, YY	;;	(p13) LDFD	f51 = [X1], INCX3	(p13) LDFD	f55 = [X2]	(p13) LDFD	f49 = [Y1], INCY3	(p13) LDFD	f53 = [Y2]	;;	(p14) LDFD	f58 = [X1], INCX	(p15) LDFD	f61 = [XX]	(p14) LDFD	f56 = [Y1], INCY	(p15) LDFD	f60 = [YY]	;;	(p14) LDFD	f59 = [X1]	(p14) LDFD	f57 = [Y1]	;;	;;	;;	(p12) FMA	f8  = f32, f34, f8	(p12) FMA	f9  = f33, f35, f9	(p12) FMA	f10 = f36, f38, f10	(p12) FMA	f11 = f37, f39, f11	(p12) FMA	f12 = f40, f42, f12	(p12) FMA	f13 = f41, f43, f13	(p12) FMA	f14 = f44, f46, f14	(p12) FMA	f15 = f45, f47, f15	;;		      	(p13) FMA	f8  = f48, f50, f8	(p13) FMA	f9  = f49, f51, f9	(p13) FMA	f10 = f52, f54, f10	(p13) FMA	f11 = f53, f55, f11	(p14) FMA	f12 = f56, f58, f12	(p14) FMA	f13 = f57, f59, f13	(p15) FMA	f14 = f60, f61, f14	;;	.align 32.L999:	FADD	f8  = f8,  f9	FADD	f10 = f10, f11	FADD	f12 = f12, f13	FADD	f14 = f14, f15	;;	FADD	f8  = f8,  f10	FADD	f12 = f12, f14	mov	ar.lc = ARLC	;;	FADD	f8  = f8,  f12	mov	pr = PR, -65474	br.ret.sptk.many b0	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -