⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
#endif#endif	tbit.z	p6, p7 = M, 1	(p6)	br.cond.dptk .L040	}	;;#if !defined(TRMMKERNEL) || \    defined(TRMMKERNEL) && \    ((defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))	{ .mfi	LDFPD	f48, f49 = [B]	mov	f65  = f0	nop	__LINE__	}	{ .mfi	adds	BOFFSET = 2 * SIZE, B	mov	f73  = f0#ifndef TRMMKERNEL	adds	L =  1, K#else	adds	L =  1, L#endif	}#else	{ .mmf	shladd	BOFFSET = KK8, 3, B	shladd	AOFFSET = KK8, 1, AOFFSET	mov	f65  = f0	}	;;	{ .mfi	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE	mov	f73  = f0#ifndef TRMMKERNEL	adds	L =  1, K#else	adds	L =  1, L#endif	}#endif	;;	{ .mfi	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE	mov	f81  = f0	tbit.z	p12, p0 = L, 0	}	{ .mfi	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE	mov	f89  = f0	shr	L = L, 1	}	;;	{ .mfi	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	mov	f97  = f0	adds	L =  -1, L	}	{ .mfi	nop	__LINE__	mov	f105 = f0	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET	}	;;	{ .mfi	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET	mov	f113 = f0	mov	ar.lc = L	}	{ .mfi	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	mov	f121 = f0	cmp.eq	p3, p0 = r0, r0	}	;;	.align 32.L032:	{ .mfb	lfetch.nt1	[PREA],  4 * SIZE	FMA	f64   = f32, f48, f64	// A1 * B1	nop	__LINE__	}	{ .mfi	nop	__LINE__	FMA	f72   = f32, f49, f72	// A1 * B2	(p12) cmp.ne p3, p0 =  0, L	}	;;	{ .mfi	lfetch.nt1	[PREB],  16 * SIZE	FMA	f80   = f32, f50, f80	// A1 * B3	cmp.ne	p4, p5 =  0, L	}	{ .mfb	nop	__LINE__	FMA	f88   = f32, f51, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE	FMA	f96   = f32, f52, f96	// A1 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f104  = f32, f53, f104	// A1 * B6	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE	FMA	f112  = f32, f54, f112	// A1 * B7	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f120  = f32, f55, f120	// A1 * B8	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE	FMA	f65   = f33, f48, f65	// A2 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f73   = f33, f49, f73	// A2 * B2	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE	FMA	f81   = f33, f50, f81	// A2 * B3	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f89   = f33, f51, f89	// A2 * B4	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE	FMA	f97   = f33, f52, f97	// A2 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f105  = f33, f53, f105	// A2 * B6	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMA	f113  = f33, f54, f113	// A2 * B7	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f121  = f33, f55, f121	// A2 * B8	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE	(p3) FMA	f64   = f40, f56, f64	// A1 * B1	nop	__LINE__	}	{ .mfb	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE	(p3) FMA	f72   = f40, f57, f72	// A1 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE	(p3) FMA	f80   = f40, f58, f80	// A1 * B3	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f88   = f40, f59, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	(p3) FMA	f96   = f40, f60, f96	// A1 * B5	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f104  = f40, f61, f104	// A1 * B6	nop	__LINE__	}	;;	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f68  = [C1], SIZE#else	nop	__LINE__#endif	(p3) FMA	f112  = f40, f62, f112	// A1 * B7	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f76  = [C2], SIZE#else	nop	__LINE__#endif	(p3) FMA	f120  = f40, f63, f120	// A1 * B8	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	(p3) FMA	f65   = f41, f56, f65	// A2 * B1	nop	__LINE__	}	{ .mfb	(p3) FMA	f73   = f41, f57, f73	// A2 * B2	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f69  = [C1], -1 * SIZE#else	nop	__LINE__#endif	(p3) FMA	f81   = f41, f58, f81	// A2 * B3	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f77  = [C2], -1 * SIZE#else	nop	__LINE__#endif	(p3) FMA	f89   = f41, f59, f89	// A2 * B4	nop	__LINE__	}	;;	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f84  = [C3], SIZE#else	nop	__LINE__#endif	(p3) FMA	f97   = f41, f60, f97	// A2 * B5	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f92  = [C4], SIZE#else	nop	__LINE__#endif	(p3) FMA	f105  = f41, f61, f105	// A2 * B6	nop	__LINE__	}	;;	{ .mfi#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f85  = [C3], -1 * SIZE#else	nop	__LINE__#endif	(p3) FMA	f113  = f41, f62, f113	// A2 * B7	adds	L = -1, L	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f93  = [C4], -1 * SIZE#else	nop	__LINE__#endif	(p3) FMA	f121  = f41, f63, f121	// A2 * B8	br.cloop.sptk.few .L032	}	;;.L038:#if! defined(TRMMKERNEL) && !defined(BETAZERO)	{ .mfb	LDFD	f100 = [C5], SIZE	FMA	f64  = ALPHA, f64, f68	nop	__LINE__	}	{ .mfb	LDFD	f108 = [C6], SIZE	FMA	f65  = ALPHA, f65, f69	nop	__LINE__	}	;;	{ .mfb	LDFD	f101 = [C5], -1 * SIZE	FMA	f72  = ALPHA, f72, f76	nop	__LINE__	}	{ .mfb	LDFD	f109 = [C6], -1 * SIZE	FMA	f73  = ALPHA, f73, f77	nop	__LINE__	}	;;	{ .mfb	LDFD	f116 = [C7], SIZE	FMA	f80  = ALPHA, f80, f84	nop	__LINE__	}	{ .mfb	LDFD	f124 = [C8], SIZE	FMA	f81  = ALPHA, f81, f85	nop	__LINE__	}	;;	{ .mfb	LDFD	f117 = [C7], -1 * SIZE	FMA	f88  = ALPHA, f88, f92	nop	__LINE__	}	{ .mfb	LDFD	f125 = [C8], -1 * SIZE	FMA	f89  = ALPHA, f89, f93	nop	__LINE__	}	;;	{ .mfb	STFD	[C1 ] = f64, SIZE	FMA	f96  = ALPHA, f96,  f100	nop	__LINE__	}	{ .mfb	STFD	[C2 ] = f72, SIZE	FMA	f104 = ALPHA, f104, f108	nop	__LINE__	}	;;	{ .mfb	STFD	[C1 ] = f65, SIZE	FMA	f97  = ALPHA, f97,  f101	nop	__LINE__	}	{ .mfb	STFD	[C2 ] = f73, SIZE	FMA	f105 = ALPHA, f105, f109	nop	__LINE__	}	;;	{ .mfb	STFD	[C3 ] = f80, SIZE	FMA	f112 = ALPHA, f112, f116	nop	__LINE__	}	{ .mfb	STFD	[C4 ] = f88, SIZE	FMA	f120 = ALPHA, f120, f124	nop	__LINE__	}	;;	{ .mfb	STFD	[C3 ] = f81, SIZE	FMA	f113 = ALPHA, f113, f117	nop	__LINE__	}	{ .mfb	STFD	[C4 ] = f89, SIZE	FMA	f121 = ALPHA, f121, f125	nop	__LINE__	}	;;	{ .mfb	STFD	[C5 ] = f96, SIZE	mov	f64  = f0	nop	__LINE__	}	{ .mfb	STFD	[C6 ] = f104, SIZE	mov	f72  = f0	nop	__LINE__	}	;;	{ .mfb	STFD	[C5 ] = f97,  SIZE	mov	f80  = f0	nop	__LINE__	}	{ .mfb	STFD	[C6 ] = f105, SIZE	mov	f88  = f0	nop	__LINE__	}	;;	{ .mfb	STFD	[C7 ] = f112, SIZE	mov	f96  = f0	nop	__LINE__	}	{ .mfb	STFD	[C8 ] = f120, SIZE	mov	f104 = f0	nop	__LINE__	}	;;	{ .mfb	STFD	[C7 ] = f113, SIZE	mov	f112 = f0	nop	__LINE__	}	{ .mfb	STFD	[C8 ] = f121, SIZE	mov	f120 = f0	nop	__LINE__	}	;;#else	{ .mfb	nop	__LINE__	FMPY	f64  = ALPHA, f64	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMPY	f65  = ALPHA, f65	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMPY	f72  = ALPHA, f72	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMPY	f73  = ALPHA, f73	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMPY	f80  = ALPHA, f80	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMPY	f81  = ALPHA, f81	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	FMPY	f88  = ALPHA, f88	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMPY	f89  = ALPHA, f89	nop	__LINE__	}	;;	{ .mfb	STFD	[C1 ] = f64, SIZE	FMPY	f96  = ALPHA, f96	nop	__LINE__	}	{ .mfb	STFD	[C2 ] = f72, SIZE	FMPY	f104 = ALPHA, f104	nop	__LINE__	}	;;	{ .mfb	STFD	[C1 ] = f65, SIZE	FMPY	f97  = ALPHA, f97	nop	__LINE__	}	{ .mfb	STFD	[C2 ] = f73, SIZE	FMPY	f105 = ALPHA, f105	nop	__LINE__	}	;;	{ .mfi	STFD	[C3 ] = f80, SIZE	FMPY	f112 = ALPHA, f112#if defined(TRMMKERNEL) && \    ((defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))	sub	L = K, KK#else	nop	__LINE__#endif	}	{ .mfb	STFD	[C4 ] = f88, SIZE	FMPY	f120 = ALPHA, f120	nop	__LINE__	}	;;	{ .mfi	STFD	[C3 ] = f81, SIZE	FMPY	f113 = ALPHA, f113#if defined(TRMMKERNEL) && (defined(LEFT) &&  defined(TRANSA))	adds	L = -2, L#else	nop	__LINE__#endif	}	{ .mfi	STFD	[C4 ] = f89, SIZE	FMPY	f121 = ALPHA, f121#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))	adds	L = -8, L#else	nop	__LINE__#endif	}	;;	{ .mfi	STFD	[C5 ] = f96, SIZE	mov	f64  = f0#if defined(TRMMKERNEL) && \    ((defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))	shladd	KK8 = L, BASE_SHIFT, r0#else	nop	__LINE__#endif	}	{ .mfb	STFD	[C6 ] = f104, SIZE	mov	f72  = f0	nop	__LINE__	}	;;	{ .mfi	STFD	[C5 ] = f97,  SIZE	mov	f80  = f0#if defined(TRMMKERNEL) && \    ((defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))	shladd	AOFFSET = KK8, 1, AOFFSET#else	nop	__LINE__#endif	}	{ .mfi	STFD	[C6 ] = f105, SIZE	mov	f88  = f0#if defined(TRMMKERNEL) && \    ((defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))	shladd	BOFFSET = KK8, 3, BOFFSET#else	nop	__LINE__#endif	}	;;	{ .mfi	STFD	[C7 ] = f112, SIZE	mov	f96  = f0#if defined(TRMMKERNEL) && defined(LEFT)	adds	KK = 2, KK#else	nop	__LINE__#endif	}	{ .mfb	STFD	[C8 ] = f120, SIZE	mov	f104 = f0	nop	__LINE__	}	;;	{ .mfi	STFD	[C7 ] = f113, SIZE	mov	f112 = f0#ifdef TRMMKERNEL	shladd	KK8 = KK, BASE_SHIFT, r0#else	nop	__LINE__#endif	}	{ .mfb	STFD	[C8 ] = f121, SIZE	mov	f120 = f0	nop	__LINE__	}	;;#endif	.align 32.L040:	{ .mib#ifndef TRMMKERNEL	nop	__LINE__#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	L = K, KK#elif defined(LEFT)	adds	L = 1, KK#else	adds	L = 8, KK#endif#endif	tbit.z	p6, p7 = M, 0	(p6)	br.cond.dptk .L049	}	;;#if !defined(TRMMKERNEL) || \    defined(TRMMKERNEL) && \    ((defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))	{ .mmi	LDFPD	f48, f49 = [B]	adds	BOFFSET = 2 * SIZE, B#ifndef TRMMKERNEL	adds	L =  1, K#else	adds	L =  1, L#endif	}#else	{ .mmi	shladd	BOFFSET = KK8, 3, B	add	AOFFSET = KK8, AOFFSET	nop	__LINE__	}	;;	{ .mmi	LDFPD	f48, f49 = [BOFFSET], 2 * SIZE	nop	__LINE__#ifndef TRMMKERNEL	adds	L =  1, K#else	adds	L =  1, L#endif	}#endif	;;	{ .mii	LDFPD	f50, f51 = [BOFFSET], 2 * SIZE	tbit.z	p12, p0 = L, 0	shr	L = L, 1	}	;;	{ .mmi	LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	LDFD	f32 = [AOFFSET], 1 * SIZE	adds	L =  -1, L	}	;;	{ .mmi	adds	PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET	cmp.eq	p3, p0 = r0, r0	mov	ar.lc = L	}	{ .mmi	LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	adds	PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET	nop	__LINE__	}	;;	.align 32.L042:	{ .mfb	lfetch.nt1	[PREB],  16 * SIZE	FMA	f64   = f32, f48, f64	// A1 * B1	nop	__LINE__	}	{ .mfb	(p12) cmp.ne p3, p0 =  0, L	FMA	f72   = f32, f49, f72	// A1 * B2	nop	__LINE__	}	;;	{ .mfi	(p3) LDFD	f40 = [AOFFSET], 1 * SIZE	FMA	f80   = f32, f50, f80	// A1 * B3	cmp.ne	p4, p5 =  0, L	}	{ .mfb	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE	FMA	f88   = f32, f51, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f58, f59 = [BOFFSET],  2 * SIZE	FMA	f96   = f32, f52, f96	// A1 * B5	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f68 = [C1]#else	nop	__LINE__#endif	FMA	f104  = f32, f53, f104	// A1 * B6	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f60, f61 = [BOFFSET], 2 * SIZE	FMA	f112  = f32, f54, f112	// A1 * B7	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f76 = [C2]#else	nop	__LINE__#endif	FMA	f120  = f32, f55, f120	// A1 * B8	nop	__LINE__	}	;;	{ .mfb	(p4) LDFD	f32 = [AOFFSET],   1 * SIZE	(p3) FMA	f64   = f40, f56, f64	// A1 * B1	nop	__LINE__	}	{ .mfb	(p3) LDFPD	f62, f63 = [BOFFSET], 2 * SIZE	(p3) FMA	f72   = f40, f57, f72	// A1 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f48, f49 = [BOFFSET],   2 * SIZE	(p3) FMA	f80   = f40, f58, f80	// A1 * B3	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f84 = [C3]#else	nop	__LINE__#endif	(p3) FMA	f88   = f40, f59, f88	// A1 * B4	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f50, f51 = [BOFFSET],  2 * SIZE	(p3) FMA	f96   = f40, f60, f96	// A1 * B5	nop	__LINE__	}	{ .mfb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f92 = [C4]#else	nop	__LINE__#endif	(p3) FMA	f104  = f40, f61, f104	// A1 * B6	nop	__LINE__	}	;;	{ .mfi	(p4) LDFPD	f52, f53 = [BOFFSET], 2 * SIZE	(p3) FMA	f112  = f40, f62, f112	// A1 * B7	adds	L = -1, L	}	{ .mmb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f100 = [C5]	(p5) LDFD	f108 = [C6]#else	nop	__LINE__	nop	__LINE__#endif	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f54, f55 = [BOFFSET], 2 * SIZE	(p3) FMA	f120  = f40, f63, f120	// A1 * B8	nop	__LINE__	}	{ .mmb#if! defined(TRMMKERNEL) && !defined(BETAZERO)	(p5) LDFD	f116 = [C7]	(p5) LDFD	f124 = [C8]#else	nop	__LINE__	nop	__LINE__#endif	br.cloop.sptk.few .L042	}	;;#if! defined(TRMMKERNEL) && !defined(BETAZERO)	FMA	f64  = ALPHA, f64, f68	FMA	f72  = ALPHA, f72, f76	FMA	f80  = ALPHA, f80, f84	FMA	f88  = ALPHA, f88, f92	FMA	f96  = ALPHA, f96,  f100	FMA	f104 = ALPHA, f104, f108	FMA	f112 = ALPHA, f112, f116	FMA	f120 = ALPHA, f120, f124	;;	STFD	[C1 ] = f64, SIZE	mov	f64 = f0	STFD	[C2 ] = f72, SIZE	mov	f72 = f0	;;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -