⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trsm_kernel_rt.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
	// ---------------------------------------------------------------------
	// IA-64 TRSM kernel, middle chunk.  One source serves four variants,
	// selected by the preprocessor symbols LN / LT / RN / RT.
	//
	// The load/store/arithmetic macros (STFD, LDFD, LDFPD, FMA, FNMA, FSUB,
	// FMPY), the constants SIZE, BASE_SHIFT, PREFETCHSIZE and the register
	// aliases (AOFFSET, BOFFSET, AORIG, B, C1, C9, K, KK, L, I, M, PREA, ...)
	// are all defined outside this chunk.
	//
	// Layout restored to one statement / one cpp directive per line; the
	// instruction stream itself is unchanged.
	// ---------------------------------------------------------------------

	// Tail of a store sequence whose beginning (and opening #if) lies before
	// this chunk: f65..f67 go out through AOFFSET, f69..f71 through AOFFSET2,
	// and the last store of each stream rewinds its pointer by 3 elements.
	STFD	[AOFFSET]  = f65, SIZE
	STFD	[AOFFSET2] = f69, SIZE
	;;
	STFD	[AOFFSET]  = f66, SIZE
	STFD	[AOFFSET2] = f70, SIZE
	;;
	STFD	[AOFFSET]  = f67, -3 * SIZE
	STFD	[AOFFSET2] = f71, -3 * SIZE
	;;
#endif
	// Write the eight results to C: f64..f67 through C1 and f68..f71 through
	// C9 = C1 + 4 elements.
	adds	C9  = 4 * SIZE, C1
	;;
	{ .mmf
	STFD	[C1 ] = f64, SIZE
	STFD	[C9 ] = f68, SIZE
	mov	f64  = f0
	}
	;;
	{ .mmi
	STFD	[C1 ] = f65, SIZE
	STFD	[C9 ] = f69, SIZE
	}
	;;
	{ .mmi
	STFD	[C1 ] = f66, SIZE
	STFD	[C9 ] = f70, SIZE
	}
	;;
	{ .mmi
	// Non-LN: C1 ends 8 elements past its starting value (1 + 1 + 1 + 5).
	// LN:     C1 ends back at its starting value (1 + 1 + 1 - 3).
#ifndef LN
	STFD	[C1 ] = f67, 5 * SIZE
#else
	STFD	[C1 ] = f67, - 3 * SIZE
#endif
	STFD	[C9 ] = f71
	}
	;;
	// p6 = (I != 1) is taken before I is decremented; it drives the branch
	// back to .L132 further down.
	{ .mmf
	cmp.ne	p6, p0 = 1, I
	}
	;;
	adds	I = -1, I
	;;
	// r2 = K << BASE_SHIFT
	{ .mmi
	shladd	r2 = K, BASE_SHIFT, r0
	}
	;;
	{ .mmi
	sub	L = K, KK
	}
	;;
	{ .mmi
	// RT: AORIG += r2 << 3
#ifdef RT
	shladd	AORIG = r2, 3, AORIG
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
	// LT/RN: L = (K - KK) << BASE_SHIFT
#if defined(LT) || defined(RN)
	shladd	L = L, BASE_SHIFT, r0
#else
	nop	__LINE__
#endif
	}
	;;
	;;
	{ .mmi
	// LT/RN: AOFFSET += L << 3
#if defined(LT) || defined(RN)
	shladd	AOFFSET = L, 3, AOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
	// LT/RN: BOFFSET += L
#if defined(LT) || defined(RN)
	add	BOFFSET = L, BOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
	// KK moves by 8 for the L-side variants (forward for LT, backward for LN).
#ifdef LT
	adds	KK =  8, KK
#elif defined LN
	adds	KK = -8, KK
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	// Clear the eight accumulators, then loop back to .L132 (defined before
	// this chunk) while p6 is set.
	mov	f64  = f0
	mov	f65  = f0
	mov	f66  = f0
	mov	f67  = f0
	mov	f68  = f0
	mov	f69  = f0
	mov	f70  = f0
	mov	f71  = f0
	(p6)	br.cond.dptk .L132
	.align 8

	// ---------------------------------------------------------------------
	// .L140: executed when bit 2 of M is set, otherwise skipped to .L150.
	// Builds four sums f64..f67 (four A values times one B value per step),
	// then runs the variant-specific solve and stores four results.
	// ---------------------------------------------------------------------
.L140:
	tbit.z	p6, p7 = M, 2
	(p6)	br.cond.dptk .L150
	;;
	{ .mib
	// Step count for the accumulation loop: KK (LT/RN) or K - KK (LN/RT).
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	{ .mmi
	// p7 = (L != 0) guards the initial operand loads.
	// r2 = K << (2 + BASE_SHIFT).
	cmp.ne	p7, p0 = r0, L
	adds	BOFFSET = 0 * SIZE, B
	shl	r2 = K, 2 + BASE_SHIFT
	}
	;;
#if defined(LT) || defined(RN)
	{ .mmf
	(p7) LDFD	f48 = [BOFFSET], 1 * SIZE
	mov	f65  = f0
	}
	;;
#else
	{ .mfi
	// BOFFSET = B + (KK << BASE_SHIFT); LN additionally steps AORIG back by r2.
	shladd	BOFFSET = KK, BASE_SHIFT, B
#ifdef LN
	sub	AORIG = AORIG, r2
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mfi
	// NOTE(review): r3 is read here but is not written anywhere in this
	// chunk; presumably it holds KK << BASE_SHIFT set earlier - confirm.
	(p7) LDFD	f48 = [BOFFSET], 1 * SIZE
	shladd	AOFFSET = r3, 2, AORIG
	}
	;;
#endif
	// Loop count: ar.lc = ((L + 1) >> 1) - 1, because each .L142 iteration
	// covers two steps.  p12 is set when L + 1 is even (odd step count);
	// p3 starts out true.  A resulting count of -1 (L was 0) skips the loop.
	{ .mfi
	adds	L =  1, L
	}
	{ .mfi
	adds	PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
	cmp.eq	p3, p0 = r0, r0
	}
	;;
	{ .mfi
	tbit.z	p12, p0 = L, 0
	}
	{ .mfi
	shr	L = L, 1
	}
	;;
	{ .mfi
	adds	L =  -1, L
	}
	;;
	{ .mfi
	cmp.eq  p6, p0 = -1, L
	}
	;;
	{ .mmf
	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	}
	{ .mfi
	mov	ar.lc = L
	}
	;;
	{ .mmf
	(p7) LDFPD	f34, f35  = [AOFFSET], 2 * SIZE
	}
	{ .mfb
	(p6) br.cond.dpnt   .L148
	}
	;;

	// Accumulation loop, two steps per iteration.
	//   p4 = (L != 0): more iterations follow, so preload the next operands.
	//   p5 = (L == 0): final iteration.
	//   p3: second step is live; cleared on the final iteration when p12 is set.
.L142:
	{ .mfi
	lfetch.nt1	[PREA],  8 * SIZE
	FMA	f64   = f32, f48, f64	// A1 * B1
	cmp.ne	p4, p5 =  0, L
	}
	{ .mfi
	nop	__LINE__
	FMA	f65   = f33, f48, f65	// A2 * B1
	(p12) cmp.ne p3, p0 =  0, L
	}
	;;
	{ .mfi
	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
	FMA	f66   = f34, f48, f66	// A3 * B1
	(p5) adds	C9  = 2 * SIZE, C1
	}
	{ .mmf
	nop	__LINE__
	(p3) LDFD	f56 = [BOFFSET],   1 * SIZE
	FMA	f67   = f35, f48, f67	// A4 * B1
	}
	;;
	{ .mfi
	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE
	(p3) FMA	f64   = f40, f56, f64	// A1 * B1 (second step of the pair)
	(p5) adds	C10 = 2 * SIZE, C2
	}
	{ .mfb
	nop	__LINE__
	(p3) FMA	f65   = f41, f56, f65	// A2 * B1 (second step of the pair)
	nop	__LINE__
	}
	;;
	{ .mfb
	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
	(p3) FMA	f66   = f42, f56, f66	// A3 * B1 (second step of the pair)
	nop	__LINE__
	}
	{ .mmf
	(p4) LDFD	f48 = [BOFFSET],   1 * SIZE
	nop	__LINE__
	(p3) FMA	f67   = f43, f56, f67	// A4 * B1 (second step of the pair)
	}
	;;
	{ .mfi
	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE
	nop	__LINE__
	adds	L = -1, L
	}
	{ .mfb
	nop	__LINE__
	nop.f 0
	br.cloop.sptk.few .L142
	}
	;;

	// Solve phase for the 4-wide case.
.L148:
#if defined(LN) || defined(RT)
	// Re-derive AOFFSET / BOFFSET from AORIG / B:
	//   r2 = (KK - 4 for LN, KK - 1 for RT) << BASE_SHIFT
	//   AOFFSET = AORIG + (r2 << 2),  BOFFSET = B + r2
#ifdef LN
	adds	r2 = -4, KK
#else
	adds	r2 = -1, KK
#endif
	;;
	shladd	r2 = r2, BASE_SHIFT, r0
	;;
	shladd	AOFFSET = r2, 2, AORIG
	add	BOFFSET = r2, B
	;;
#endif
	adds	AOFFSET2 = 4 * SIZE, AOFFSET
	adds	BOFFSET2 = 4 * SIZE, BOFFSET
	;;
	// Subtract the accumulated sums from the four packed values
	// (read from BOFFSET for LN/LT, from AOFFSET otherwise); the source
	// pointer is left where it started.
#if defined(LN) || defined(LT)
	LDFPD	f32, f33 = [BOFFSET], 2 * SIZE
	;;
	LDFPD	f34, f35 = [BOFFSET]
	adds	BOFFSET = -2 * SIZE, BOFFSET
	;;
	FSUB	f64  = f32, f64
	FSUB	f65  = f33, f65
	FSUB	f66  = f34, f66
	FSUB	f67  = f35, f67
	;;
#else
	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	;;
	LDFPD	f34, f35 = [AOFFSET]
	adds	AOFFSET = -2 * SIZE, AOFFSET
	;;
	FSUB	f64  = f32, f64
	FSUB	f65  = f33, f65
	FSUB	f66  = f34, f66
	FSUB	f67  = f35, f67
	;;
#endif
#ifdef LN
	// LN: substitution from the last unknown (f67) up to the first (f64).
	// Coefficients are read from the 16-element block at AOFFSET, walking
	// from element 15 down to element 0:
	//   f32 = a[15], f33 = a[14], f34 = a[13], f35 = a[12],
	//   f36 = a[10], f37 = a[9],  f38 = a[8],
	//   f39 = a[5],  f40 = a[4],  f41 = a[0]
	// Diagonal entries are applied with FMPY (presumably they are stored
	// pre-inverted by the packing code - confirm).
	adds	AOFFSET = 14 * SIZE, AOFFSET
	;;
	LDFPD	f33, f32 = [AOFFSET]
	adds	AOFFSET = - 2 * SIZE, AOFFSET
	;;
	LDFPD	f35, f34 = [AOFFSET]
	adds	AOFFSET = - 2 * SIZE, AOFFSET
	;;
	LDFD	f36 = [AOFFSET], - 2 * SIZE
	;;
	LDFPD	f38, f37 = [AOFFSET]
	adds	AOFFSET = - 4 * SIZE, AOFFSET
	;;
	LDFPD	f40, f39 = [AOFFSET]
	adds	AOFFSET = - 4 * SIZE, AOFFSET
	;;
	LDFD	f41 = [AOFFSET]
	;;
	FMPY	f67  = f67,  f32
	;;
	FNMA	f66  = f67,  f33, f66
	;;
	FNMA	f65  = f67,  f34, f65
	;;
	FNMA	f64  = f67,  f35, f64
	;;
	FMPY	f66  = f66,  f36
	;;
	FNMA	f65  = f66,  f37, f65
	;;
	FNMA	f64  = f66,  f38, f64
	;;
	FMPY	f65  = f65,  f39
	;;
	FNMA	f64  = f65,  f40, f64
	;;
	FMPY	f64  = f64,  f41
	;;
	// Write the solution back through BOFFSET (pointer restored afterwards)
	// and move C1 back by 4 elements before the C stores below.
	STFD	[BOFFSET]  = f64, SIZE
	;;
	STFD	[BOFFSET]  = f65, SIZE
	;;
	STFD	[BOFFSET]  = f66, SIZE
	;;
	STFD	[BOFFSET]  = f67, -3 * SIZE
	;;
	adds	C1 = -4 * SIZE, C1
	;;
#endif
#ifdef LT
	// LT: substitution from the first unknown (f64) down to the last (f67).
	// Coefficients are read from the 16-element block at AOFFSET:
	//   f32..f35 = a[0..3], f36 = a[5], f37 = a[6], f38 = a[7],
	//   f39 = a[10], f40 = a[11], f41 = a[15]
	// The final load rewinds AOFFSET by 15 elements, back to the block start.
	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	;;
	LDFPD	f34, f35 = [AOFFSET]
	adds	AOFFSET = 3 * SIZE, AOFFSET
	;;
	LDFD	f36 = [AOFFSET], 1 * SIZE
	;;
	LDFPD	f37, f38 = [AOFFSET]
	adds	AOFFSET = 4 * SIZE, AOFFSET
	;;
	LDFPD	f39, f40 = [AOFFSET]
	adds	AOFFSET = 5 * SIZE, AOFFSET
	;;
	LDFD	f41 = [AOFFSET], -15 * SIZE
	;;
	FMPY	f64  = f64,  f32
	;;
	FNMA	f65  = f64,  f33, f65
	;;
	FNMA	f66  = f64,  f34, f66
	;;
	FNMA	f67  = f64,  f35, f67
	;;
	FMPY	f65  = f65,  f36
	;;
	FNMA	f66  = f65,  f37, f66
	;;
	FNMA	f67  = f65,  f38, f67
	;;
	FMPY	f66  = f66,  f39
	;;
	FNMA	f67  = f66,  f40, f67
	;;
	FMPY	f67  = f67,  f41
	;;
	STFD	[BOFFSET]  = f64, SIZE
	;;
	STFD	[BOFFSET]  = f65, SIZE
	;;
	STFD	[BOFFSET]  = f66, SIZE
	;;
	STFD	[BOFFSET]  = f67, -3 * SIZE
	;;
#endif
#ifdef RN
	// RN: scale all four values by the single coefficient at BOFFSET and
	// write them back through AOFFSET (pointer restored afterwards).
	LDFD	f32 = [BOFFSET]
	;;
	FMPY	f64  = f64,  f32
	FMPY	f65  = f65,  f32
	FMPY	f66  = f66,  f32
	FMPY	f67  = f67,  f32
	;;
	STFD	[AOFFSET]  = f64, SIZE
	;;
	STFD	[AOFFSET]  = f65, SIZE
	;;
	STFD	[AOFFSET]  = f66, SIZE
	;;
	STFD	[AOFFSET]  = f67,  -3 * SIZE
	;;
#endif
#ifdef RT
	// RT: same scaling and write-back as the RN path.
	LDFD	f32 = [BOFFSET]
	;;
	FMPY	f64  = f64,  f32
	FMPY	f65  = f65,  f32
	FMPY	f66  = f66,  f32
	FMPY	f67  = f67,  f32
	;;
	STFD	[AOFFSET]  = f64, SIZE
	;;
	STFD	[AOFFSET]  = f65, SIZE
	;;
	STFD	[AOFFSET]  = f66, SIZE
	;;
	STFD	[AOFFSET]  = f67, - 3 * SIZE
	;;
#endif
	// Store the four results to C through C1.
	// Non-LN: C1 advances by 4 elements; LN: C1 returns to the value it had
	// after the -4 adjustment above.
	{ .mmf
	STFD	[C1 ] = f64, SIZE
	mov	f64  = f0
	}
	;;
	{ .mmi
	STFD	[C1 ] = f65, SIZE
	}
	;;
	{ .mmi
	STFD	[C1 ] = f66, SIZE
	}
	;;
	{ .mmi
#ifndef LN
	STFD	[C1 ] = f67, SIZE
#else
	STFD	[C1 ] = f67, - 3 * SIZE
#endif
	}
	;;
	// Reset accumulators for the following sections.
	{ .mmf
	mov	f72  = f0
	}
	;;
	mov	f65 = f0
	mov	f73 = f0
	mov	f66 = f0
	mov	f74 = f0
	mov	f67 = f0
	mov	f75 = f0
	;;
	// Pointer / offset bookkeeping, same pattern as after the 8-wide case
	// but with a shift of 2 and a KK step of 4.
	shladd	r2 = K, BASE_SHIFT, r0
	;;
	{ .mmi
	sub	L = K, KK
	}
	;;
	{ .mmi
#ifdef RT
	shladd	AORIG = r2, 2, AORIG
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	L = L, BASE_SHIFT, r0
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	AOFFSET = L, 2, AOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	add	BOFFSET = L, BOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#ifdef LT
	adds	KK =  4, KK
#elif defined LN
	adds	KK = -4, KK
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	.align 8

	// ---------------------------------------------------------------------
	// .L150: executed when bit 1 of M is set, otherwise skipped to .L160.
	// Same structure as .L140 with two sums (f64, f65) instead of four.
	// ---------------------------------------------------------------------
.L150:
	tbit.z	p6, p7 = M, 1
	(p6)	br.cond.dptk .L160
	;;
	{ .mib
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	{ .mmi
	// p7 = (L != 0); r2 = K << (1 + BASE_SHIFT).
	cmp.ne	p7, p0 = r0, L
	adds	BOFFSET = 0 * SIZE, B
	shl	r2 = K, 1 + BASE_SHIFT
	}
	;;
#if defined(LT) || defined(RN)
	{ .mmf
	(p7) LDFD	f48 = [BOFFSET], 1 * SIZE
	}
	;;
#else
	{ .mfi
	shladd	BOFFSET = KK, BASE_SHIFT, B
#ifdef LN
	sub	AORIG = AORIG, r2
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mfi
	// NOTE(review): r3 is read here but is not written anywhere in this
	// chunk; presumably it holds KK << BASE_SHIFT set earlier - confirm.
	(p7) LDFD	f48 = [BOFFSET], 1 * SIZE
	shladd	AOFFSET = r3, 1, AORIG
	}
	;;
#endif
	// Loop count ar.lc = ((L + 1) >> 1) - 1; p12 marks an odd step count;
	// a count of -1 skips the loop entirely.
	{ .mfi
	adds	L =  1, L
	}
	{ .mfi
	adds	PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
	cmp.eq	p3, p0 = r0, r0
	}
	;;
	{ .mfi
	tbit.z	p12, p0 = L, 0
	}
	{ .mfi
	shr	L = L, 1
	}
	;;
	{ .mmf
	adds	L =  -1, L
	}
	;;
	{ .mmf
	cmp.eq  p6, p0 = -1, L
	}
	;;
	{ .mib
	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE
	mov	ar.lc = L
	(p6) br.cond.dpnt   .L158
	}
	;;

	// Accumulation loop, two steps per iteration (p3 / p4 / p12 used as in
	// .L142).
.L152:
	{ .mfi
	cmp.ne	p4, p5 =  0, L
	FMA	f64   = f32, f48, f64	// A1 * B1
	(p12) cmp.ne p3, p0 =  0, L
	}
	;;
	{ .mmf
	(p3) LDFD	f56 = [BOFFSET],   1 * SIZE
	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE
	FMA	f65   = f33, f48, f65	// A2 * B1
	}
	;;
	{ .mfi
	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE
	(p3) FMA	f64   = f40, f56, f64	// A1 * B1 (second step of the pair)
	adds	L = -1, L
	}
	;;
	{ .mfb
	(p4) LDFD	f48 = [BOFFSET],   1 * SIZE
	(p3) FMA	f65   = f41, f56, f65	// A2 * B1 (second step of the pair)
	br.cloop.sptk.few .L152
	}
	;;

	// Solve phase for the 2-wide case.
.L158:
#if defined(LN) || defined(RT)
	//   r2 = (KK - 2 for LN, KK - 1 for RT) << BASE_SHIFT
	//   AOFFSET = AORIG + (r2 << 1),  BOFFSET = B + r2
#ifdef LN
	adds	r2 = -2, KK
#else
	adds	r2 = -1, KK
#endif
	;;
	shladd	r2 = r2, BASE_SHIFT, r0
	;;
	shladd	AOFFSET = r2, 1, AORIG
	add	BOFFSET = r2, B
	;;
#endif
	adds	AOFFSET2 = 4 * SIZE, AOFFSET
	adds	BOFFSET2 = 4 * SIZE, BOFFSET
	;;
#if defined(LN) || defined(LT)
	LDFPD	f32, f33 = [BOFFSET]
	;;
	FSUB	f64  = f32, f64
	FSUB	f65  = f33, f65
	;;
#else
	LDFPD	f32, f33 = [AOFFSET]
	;;
	FSUB	f64  = f32, f64
	FSUB	f65  = f33, f65
	;;
#endif
#ifdef LN
	// LN: f32 = a[3], f33 = a[2], f34 = a[0]; solve f65 first, then f64.
	adds	AOFFSET = 2 * SIZE, AOFFSET
	;;
	LDFPD	f33, f32 = [AOFFSET]
	adds	AOFFSET = - 2 * SIZE, AOFFSET
	;;
	LDFD	f34 = [AOFFSET]
	;;
	FMPY	f65  = f65,  f32
	;;
	FNMA	f64  = f65,  f33, f64
	;;
	FMPY	f64  = f64,  f34
	;;
	STFD	[BOFFSET]  = f64, SIZE
	;;
	STFD	[BOFFSET]  = f65, - SIZE
	;;
	adds	C1 = -2 * SIZE, C1
	;;
#endif
#ifdef LT
	// LT: f32 = a[0], f33 = a[1], f34 = a[3]; solve f64 first, then f65.
	LDFPD	f32, f33 = [AOFFSET]
	adds	AOFFSET = 3 * SIZE, AOFFSET
	;;
	LDFD	f34 = [AOFFSET], - 3 * SIZE
	;;
	FMPY	f64  = f64,  f32
	;;
	FNMA	f65  = f64,  f33, f65
	;;
	FMPY	f65  = f65,  f34
	;;
	STFD	[BOFFSET]  = f64, SIZE
	;;
	STFD	[BOFFSET]  = f65, -SIZE
	;;
#endif
#ifdef RN
	// RN: scale both values by the coefficient at BOFFSET, write back via AOFFSET.
	LDFD	f32 = [BOFFSET]
	;;
	FMPY	f64  = f64,  f32
	FMPY	f65  = f65,  f32
	;;
	STFD	[AOFFSET]  = f64, SIZE
	;;
	STFD	[AOFFSET]  = f65, - SIZE
	;;
#endif
#ifdef RT
	// RT: same scaling and write-back as the RN path.
	LDFD	f32 = [BOFFSET]
	;;
	FMPY	f64  = f64,  f32
	FMPY	f65  = f65,  f32
	;;
	STFD	[AOFFSET]  = f64, SIZE
	;;
	STFD	[AOFFSET]  = f65, - SIZE
	;;
#endif
	// Store the two results to C through C1.
	STFD	[C1 ] = f64, SIZE
	;;
#ifndef LN
	STFD	[C1 ] = f65, SIZE
#else
	STFD	[C1 ] = f65, -SIZE
#endif
	;;
	mov	f64  = f0
	mov	f65  = f0
	;;
	// Pointer / offset bookkeeping with a shift of 1 and a KK step of 2.
	shladd	r2 = K, BASE_SHIFT, r0
	;;
	sub	L = K, KK
	;;
#ifdef RT
	shladd	AORIG = r2, 1, AORIG
#else
	nop	__LINE__
#endif
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	L = L, BASE_SHIFT, r0
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	shladd	AOFFSET = L, 1, AOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	add	BOFFSET = L, BOFFSET
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#ifdef LT
	adds	KK =  2, KK
#elif defined LN
	adds	KK = -2, KK
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	}
	;;
	.align 8

	// ---------------------------------------------------------------------
	// .L160: executed when bit 0 of M is set, otherwise skipped to .L169
	// (defined after this chunk).  Only the setup is visible here.
	// ---------------------------------------------------------------------
.L160:
	{ .mib
#if defined(LT) || defined(RN)
	mov	L = KK
#else
	sub	L = K, KK
#endif
	tbit.z	p6, p7 = M, 0
	(p6)	br.cond.dptk .L169
	}
	;;
	{ .mmi
	// p7 = (L != 0); r2 = K << BASE_SHIFT.
	cmp.ne	p7, p0 = r0, L
	adds	BOFFSET = 0 * SIZE, B
	shl	r2 = K, 0 + BASE_SHIFT
	}
	;;
#if defined(LT) || defined(RN)
	{ .mmi
	(p7) LDFD	f48 = [BOFFSET], 1 * SIZE
	nop	__LINE__
	adds	L =  1, L
	}
	;;
#else
	{ .mmi
	shladd	BOFFSET = KK, BASE_SHIFT, B
	nop	__LINE__
#ifdef LN
	sub	AORIG = AORIG, r2
#else
	nop	__LINE__
#endif
	}
	;;
	{ .mmi
	// NOTE(review): r3 is read here but is not written anywhere in this
	// chunk; presumably it holds KK << BASE_SHIFT set earlier - confirm.
	(p7) LDFD	f48 = [BOFFSET], 1 * SIZE
	adds	L =  1, L
	add	AOFFSET = r3, AORIG
	}
	;;
#endif
	;;
	{ .mii
	tbit.z	p12, p0 = L, 0
	shr	L = L, 1
	}
	;;
	// The compare against 0 reads L before the decrement in the same
	// instruction group.  This bundle is closed after the end of this chunk.
	{ .mmi
	cmp.eq  p6, p0 = 0, L
	adds	L =  -1, L
	cmp.eq	p3, p0 = r0, r0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -