⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trsm_kernel_rt.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
	}	;;	{ .mib	(p7) LDFD	f32 = [AOFFSET], 1 * SIZE	mov	ar.lc = L	(p6) br.cond.dpnt   .L168	}	;;	.align 8.L162:	{ .mmf	cmp.ne	p4, p5 =  0, L	(p12) cmp.ne p3, p0 =  0, L	FMA	f64   = f32, f48, f64	// A1 * B1	}	;;	{ .mmi	(p3) LDFD	f56 = [BOFFSET], 1 * SIZE	(p3) LDFD	f40 = [AOFFSET], 1 * SIZE	nop	__LINE__	}	;;	{ .mmi	(p4) LDFD	f32 = [AOFFSET],   1 * SIZE	nop	__LINE__	adds	L = -1, L	}	{ .mfb	(p4) LDFD	f48 = [BOFFSET],   1 * SIZE	(p3) FMA	f64   = f40, f56, f64	// A1 * B1	br.cloop.sptk.few .L162	}	;;	.align 8.L168:#if defined(LN) || defined(RT)#ifdef LN	adds	r2 = -1, KK#else	adds	r2 = -1, KK#endif	;;	shladd	r2 = r2, BASE_SHIFT, r0	;;	add	AOFFSET = r2, AORIG	add	BOFFSET = r2, B	;;	#endif#if defined(LN) || defined(LT)	{ .mmi	LDFD	f32 = [BOFFSET]	LDFD	f33 = [AOFFSET]#ifdef LN	adds	C1 = -1 * SIZE, C1#else	nop	__LINE__#endif	}	;;#else	{ .mmi	LDFD	f32 = [AOFFSET]	LDFD	f33 = [BOFFSET]	nop	__LINE__	}	;;#endif	{ .mmf	sub	L = K, KK#ifdef RT	shladd	AORIG = K, BASE_SHIFT, AORIG#else	nop	__LINE__#endif	FSUB	f64  = f32, f64	}	;;#ifdef LT	adds	KK =  1, KK#elif defined LN	adds	KK = -1, KK#else	nop	__LINE__#endif	;;#if defined(LT) || defined(RN)	mov	L = KK#else	sub	L = K, KK#endif	;;	FMPY	f64  = f64,  f33	;;#if defined(LN) || defined(LT)	{ .mmf	STFD	[BOFFSET]  = f64#ifndef LN	STFD	[C1 ] = f64, SIZE#else	STFD	[C1 ] = f64#endif	mov	f64  = f0	}	;;#else	{ .mmf	STFD	[AOFFSET]  = f64	STFD	[C1 ] = f64, SIZE	mov	f64  = f0	}	;;#endif#if defined(LT) || defined(RN)	shladd	AOFFSET = L, BASE_SHIFT, AOFFSET#else	nop	__LINE__#endif#if defined(LT) || defined(RN)	shladd	BOFFSET = L, BASE_SHIFT, BOFFSET#else	nop	__LINE__#endif	;;	.align 8.L169:	{ .mii#ifdef LN	shladd	B = K, BASE_SHIFT, B#elif defined(LT) || defined(RN)	mov	B =  BOFFSET#else	nop	__LINE__#endif#ifdef RN	adds	KK =  1,  KK#elif defined RT	adds	KK = -1,  KK#else	nop	__LINE__#endif	mov	AOFFSET = A	}	;;	.align 16.L090:	tbit.z	p6, p0 = N, 1	(p6)	br.cond.dpnt .L050	;;#ifdef RT       { .mmi	shladd	r3 = LDC, 1, r0	nop	__LINE__	shl	r2 = K, 1 + BASE_SHIFT	}	;;	{ .mmi	sub	B = B, r2	sub	C = C, r3	nop	__LINE__	}#endif	;;	mov	f64  = f0	mov	f65  = f0	mov	f66  = f0	mov	f67  = f0	mov	f72  = f0	mov	f73  = f0	mov	f74  = f0	mov	f75  = f0	;;	{ .mfi	shr	I  = M, 3	} 	{ .mfi	mov	C1 = C			// coffset1 = c + 0 * ldc#ifdef LN	add	KK = M, OFFSET#elif defined LT	mov	KK = OFFSET#else	nop	__LINE__#endif	}	;;	{ .mmf	cmp.eq	p6, p7 = 0, I#if defined(LN) || defined(RT)	mov	AORIG = A#else	mov	AOFFSET = A#endif	}	{ .mmf	add	C2 = LDC, C		// coffset2 = c + 1 * ldc	}	;;	{ .mfi#ifndef RT	shladd	C = LDC, 1, C		// coffset += 8 * ldc#else	nop	__LINE__#endif	mov	f81  = f0#if defined(LT) || defined(RN)	mov	L = KK#else	sub	L = K, KK#endif	}{ .mfb	(p6)	br.cond.dpnt .L100	}	;;	.align 16.L092:	{ .mmi	cmp.ne	p7, p0 = r0, L	adds	BOFFSET = 0 * SIZE, B	shl	r2 = K, 3 + BASE_SHIFT	}	{ .mmi	shladd	r3 = KK, BASE_SHIFT, r0	nop	__LINE__	nop	__LINE__	}	;;#if defined(LT) || defined(RN)	{ .mmi	(p7) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE	nop	__LINE__	nop	__LINE__	}	;;#else	{ .mfi	shladd	BOFFSET = r3, 1, B#ifdef LN	sub	AORIG = AORIG, r2#else	nop	__LINE__#endif	}	;;	{ .mfi	(p7) LDFPD	f48, f49 = [BOFFSET], 2 * SIZE	shladd	AOFFSET = r3, 3, AORIG	}	;;#endif	(p7) LDFPD	f32, f33 = [AOFFSET], 2 * SIZE	;;	{ .mmf	(p7) LDFPD	f34, f35  = [AOFFSET], 2 * SIZE	}	;;	{ .mmf	(p7) LDFPD	f36, f37  = [AOFFSET], 2 * SIZE	}	{ .mfi	cmp.eq	p3, p0 = r0, r0	}	;;	{ .mmf	(p7) LDFPD	f38, f39  = [AOFFSET], 2 * SIZE	}	{ .mfi	adds	PREC = CPREFETCHSIZE * SIZE, C1	}	;;	{ .mmf	CPREFETCH [PREC], LDC	}	{ .mfi	adds	L =  1, L	}	;;	{ .mmf	CPREFETCH [PREC]	}	{ .mfi	adds	PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET	}	;;	{ .mfi	adds	PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET	}	;;	{ .mfi	tbit.z	p12, p0 = L, 0	}	{ .mfi	shr	L = L, 1	}	;;	{ .mfi	adds	L =  -1, L	}	;;	{ .mfi	mov	ar.lc = L	}	;;	mov	f68  = f0	mov	f69  = f0	mov	f70  = f0	mov	f71  = f0	mov	f76  = f0	mov	f77  = f0	mov	f78  = f0	mov	f79  = f0	;;	{ .mfb	cmp.eq  p6, p0 = -1, L	(p6) br.cond.dpnt   .L098	}	;;	.align 8.L093:/*  1 */	{ .mfi	lfetch.nt1	[PREA],  16 * SIZE	FMA	f64   = f32, f48, f64	// A1 * B1	cmp.ne	p4, p5 =  0, L	}	{ .mfi	nop	__LINE__	FMA	f72   = f32, f49, f72	// A1 * B2	(p12) cmp.ne p3, p0 =  0, L	}	;;	{ .mfi	lfetch.nt1	[PREB],   4 * SIZE	FMA	f65   = f33, f48, f65	// A2 * B1	adds	C9  = 4 * SIZE, C1	}	{ .mfi	nop	__LINE__	FMA	f73   = f33, f49, f73	// A2 * B2	adds	C10 = 4 * SIZE, C2	}	;;	{ .mfi	(p3) LDFPD	f56, f57 = [BOFFSET],   2 * SIZE	FMA	f66   = f34, f48, f66	// A3 * B1	adds	C11 = 4 * SIZE, C3	}	{ .mfi	nop	__LINE__	FMA	f74   = f34, f49, f74	// A3 * B2	adds	C12 = 4 * SIZE, C4	}	;;	{ .mfb	(p3) LDFPD	f40, f41 = [AOFFSET], 2 * SIZE	FMA	f67   = f35, f48, f67	// A4 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f75   = f35, f49, f75	// A4 * B2	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f42, f43 = [AOFFSET], 2 * SIZE	FMA	f68   = f36, f48, f68	// A5 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f76   = f36, f49, f76	// A5 * B2	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f44, f45 = [AOFFSET], 2 * SIZE	FMA	f69   = f37, f48, f69	// A6 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f77   = f37, f49, f77	// A6 * B2	nop	__LINE__	}	;;	{ .mfb	(p3) LDFPD	f46, f47 = [AOFFSET], 2 * SIZE	FMA	f70   = f38, f48, f70	// A7 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f78   = f38, f49, f78	// A7 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f32, f33 = [AOFFSET],   2 * SIZE	FMA	f71   = f39, f48, f71	// A8 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	FMA	f79   = f39, f49, f79	// A8 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f48, f49 = [BOFFSET],  2 * SIZE	(p3) FMA	f64   = f40, f56, f64	// A1 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f72   = f40, f57, f72	// A1 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f34, f35 = [AOFFSET], 2 * SIZE	(p3) FMA	f65   = f41, f56, f65	// A2 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f73   = f41, f57, f73	// A2 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f36, f37 = [AOFFSET], 2 * SIZE	(p3) FMA	f66   = f42, f56, f66	// A3 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f74   = f42, f57, f74	// A3 * B2	nop	__LINE__	}	;;	{ .mfb	(p4) LDFPD	f38, f39 = [AOFFSET], 2 * SIZE	(p3) FMA	f67   = f43, f56, f67	// A4 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f75   = f43, f57, f75	// A4 * B2	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	(p3) FMA	f68   = f44, f56, f68	// A5 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f76   = f44, f57, f76	// A5 * B2	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	(p3) FMA	f69   = f45, f56, f69	// A6 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f77   = f45, f57, f77	// A6 * B2	nop	__LINE__	}	;;	{ .mfb	nop	__LINE__	(p3) FMA	f70   = f46, f56, f70	// A7 * B1	nop	__LINE__	}	{ .mfb	nop	__LINE__	(p3) FMA	f78   = f46, f57, f78	// A7 * B2	nop	__LINE__	}	;;	{ .mfi	nop	__LINE__	(p3) FMA	f71   = f47, f56, f71	// A8 * B1	adds	L = -1, L	}	{ .mfb	nop	__LINE__	(p3) FMA	f79   = f47, f57, f79	// A8 * B2	br.cloop.sptk.few .L093	}	;;	.align 8.L098:#if defined(LN) || defined(RT)#ifdef LN	adds	r2 = -8, KK#else	adds	r2 = -2, KK#endif	;;	shladd	r2 = r2, BASE_SHIFT, r0	;;	shladd	AOFFSET = r2, 3, AORIG	shladd	BOFFSET = r2, 1, B	;;	#endif 	adds	AOFFSET2 = 4 * SIZE, AOFFSET 	adds	BOFFSET2 = 4 * SIZE, BOFFSET	;;#if defined(LN) || defined(LT)	LDFPD	f32, f33 = [BOFFSET], 2 * SIZE	;;	LDFPD	f34, f35 = [BOFFSET], 2 * SIZE	;;	LDFPD	f36, f37 = [BOFFSET], 2 * SIZE	;;	LDFPD	f38, f39 = [BOFFSET], 2 * SIZE	;;	LDFPD	f40, f41 = [BOFFSET], 2 * SIZE	;;	LDFPD	f42, f43 = [BOFFSET], 2 * SIZE	;;	LDFPD	f44, f45 = [BOFFSET], 2 * SIZE	;;	LDFPD	f46, f47 = [BOFFSET]	adds	BOFFSET = -14 * SIZE, BOFFSET	;;	FSUB	f64  = f32, f64	FSUB	f72  = f33, f72	FSUB	f65  = f34, f65	FSUB	f73  = f35, f73	FSUB	f66  = f36, f66	FSUB	f74  = f37, f74	FSUB	f67  = f38, f67	FSUB	f75  = f39, f75	FSUB	f68  = f40, f68	FSUB	f76  = f41, f76	FSUB	f69  = f42, f69	FSUB	f77  = f43, f77	FSUB	f70  = f44, f70	FSUB	f78  = f45, f78	FSUB	f71  = f46, f71	FSUB	f79  = f47, f79	;;#else	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE	;;	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE	;;	LDFPD	f36, f37 = [AOFFSET], 2 * SIZE	;;	LDFPD	f38, f39 = [AOFFSET], 2 * SIZE	;;	LDFPD	f40, f41 = [AOFFSET], 2 * SIZE	;;	LDFPD	f42, f43 = [AOFFSET], 2 * SIZE	;;	LDFPD	f44, f45 = [AOFFSET], 2 * SIZE	;;	LDFPD	f46, f47 = [AOFFSET]	adds	AOFFSET = -14 * SIZE, AOFFSET	;;	FSUB	f64  = f32, f64	FSUB	f65  = f33, f65	FSUB	f66  = f34, f66	FSUB	f67  = f35, f67	FSUB	f68  = f36, f68	FSUB	f69  = f37, f69	FSUB	f70  = f38, f70	FSUB	f71  = f39, f71	;;	FSUB	f72  = f40, f72	FSUB	f73  = f41, f73	FSUB	f74  = f42, f74	FSUB	f75  = f43, f75	FSUB	f76  = f44, f76	FSUB	f77  = f45, f77	FSUB	f78  = f46, f78	FSUB	f79  = f47, f79	;;#endif#ifdef LN	adds	AOFFSET = 62 * SIZE, AOFFSET	;;	LDFPD	f33, f32 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f35, f34 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f37, f36 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f39, f38 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFD	f40 = [AOFFSET], -2 * SIZE	;;	LDFPD	f42, f41 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f44, f43 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f46, f45 = [AOFFSET]	adds	AOFFSET = - 4 * SIZE, AOFFSET	;;	LDFPD	f48, f47 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f50, f49 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f52, f51 = [AOFFSET]	adds	AOFFSET = - 4 * SIZE, AOFFSET	;;	LDFD	f53 = [AOFFSET], -2 * SIZE	;;	LDFPD	f55, f54 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f57, f56 = [AOFFSET]	adds	AOFFSET = - 6 * SIZE, AOFFSET	;;	LDFPD	f59, f58 = [AOFFSET]	adds	AOFFSET = - 2 * SIZE, AOFFSET	;;	LDFPD	f61, f60 = [AOFFSET]	adds	AOFFSET = - 6 * SIZE, AOFFSET	;;	LDFD	f16 = [AOFFSET], -2 * SIZE	;;	LDFPD	f18, f17 = [AOFFSET]	adds	AOFFSET = - 8 * SIZE, AOFFSET	;;	LDFPD	f20, f19 = [AOFFSET]	adds	AOFFSET = - 8 * SIZE, AOFFSET	;;	LDFD	f21 = [AOFFSET]	;;	FMPY	f71  = f71,  f32	FMPY	f79  = f79,  f32	;;	FNMA	f70  = f71,  f33, f70	FNMA	f78  = f79,  f33, f78	;;	FNMA	f69  = f71,  f34, f69	FNMA	f77  = f79,  f34, f77	;;	FNMA	f68  = f71,  f35, f68	FNMA	f76  = f79,  f35, f76	;;	FNMA	f67  = f71,  f36, f67	FNMA	f75  = f79,  f36, f75	;;	FNMA	f66  = f71,  f37, f66	FNMA	f74  = f79,  f37, f74	;;	FNMA	f65  = f71,  f38, f65	FNMA	f73  = f79,  f38, f73	;;	FNMA	f64  = f71,  f39, f64	FNMA	f72  = f79,  f39, f72	;;	FMPY	f70  = f70,  f40	FMPY	f78  = f78,  f40	;;	FNMA	f69  = f70,  f41, f69	FNMA	f77  = f78,  f41, f77	;;	FNMA	f68  = f70,  f42, f68	FNMA	f76  = f78,  f42, f76	;;	FNMA	f67  = f70,  f43, f67	FNMA	f75  = f78,  f43, f75	;;	FNMA	f66  = f70,  f44, f66	FNMA	f74  = f78,  f44, f74	;;	FNMA	f65  = f70,  f45, f65	FNMA	f73  = f78,  f45, f73	;;	FNMA	f64  = f70,  f46, f64	FNMA	f72  = f78,  f46, f72	;;	FMPY	f69  = f69,  f47	FMPY	f77  = f77,  f47	;;	FNMA	f68  = f69,  f48, f68	FNMA	f76  = f77,  f48, f76	;;	FNMA	f67  = f69,  f49, f67	FNMA	f75  = f77,  f49, f75	;;	FNMA	f66  = f69,  f50, f66	FNMA	f74  = f77,  f50, f74	;;	FNMA	f65  = f69,  f51, f65	FNMA	f73  = f77,  f51, f73	;;	FNMA	f64  = f69,  f52, f64	FNMA	f72  = f77,  f52, f72	;;	FMPY	f68  = f68,  f53	FMPY	f76  = f76,  f53	;;	FNMA	f67  = f68,  f54, f67	FNMA	f75  = f76,  f54, f75	;;	FNMA	f66  = f68,  f55, f66	FNMA	f74  = f76,  f55, f74	;;	FNMA	f65  = f68,  f56, f65	FNMA	f73  = f76,  f56, f73	;;	FNMA	f64  = f68,  f57, f64	FNMA	f72  = f76,  f57, f72	;;	FMPY	f67  = f67,  f58	FMPY	f75  = f75,  f58	;;	FNMA	f66  = f67,  f59, f66	FNMA	f74  = f75,  f59, f74	;;	FNMA	f65  = f67,  f60, f65	FNMA	f73  = f75,  f60, f73	;;	FNMA	f64  = f67,  f61, f64	FNMA	f72  = f75,  f61, f72	;;	FMPY	f66  = f66,  f16	FMPY	f74  = f74,  f16	;;	FNMA	f65  = f66,  f17, f65	FNMA	f73  = f74,  f17, f73	;;	FNMA	f64  = f66,  f18, f64	FNMA	f72  = f74,  f18, f72	;;	FMPY	f65  = f65,  f19	FMPY	f73  = f73,  f19	;;	FNMA	f64  = f65,  f20, f64	FNMA	f72  = f73,  f20, f72	;;	FMPY	f64  = f64,  f21	FMPY	f72  = f72,  f21	;;	adds	BOFFSET  =  8 * SIZE, BOFFSET	adds	BOFFSET2 =  8 * SIZE, BOFFSET2	;;	STFD	[BOFFSET]  = f68, SIZE	STFD	[BOFFSET2] = f70, SIZE	;;	STFD	[BOFFSET]  = f76, SIZE	STFD	[BOFFSET2] = f78, SIZE	;;	STFD	[BOFFSET]  = f69, SIZE	STFD	[BOFFSET2] = f71, SIZE	;;	STFD	[BOFFSET]  = f77, - 11 * SIZE	STFD	[BOFFSET2] = f79, - 11 * SIZE	;;	STFD	[BOFFSET]  = f64, SIZE	STFD	[BOFFSET2] = f66, SIZE	;;	STFD	[BOFFSET]  = f72, SIZE	STFD	[BOFFSET2] = f74, SIZE	;;	STFD	[BOFFSET]  = f65, SIZE	STFD	[BOFFSET2] = f67, SIZE	;;	STFD	[BOFFSET]  = f73, - 3 * SIZE	STFD	[BOFFSET2] = f75, - 3 * SIZE	;;	adds	C1 = -8 * SIZE, C1	adds	C2 = -8 * SIZE, C2	;;#endif#ifdef LT	LDFPD	f32, f33 = [AOFFSET], 2 * SIZE	;;	LDFPD	f34, f35 = [AOFFSET], 2 * SIZE	;;	LDFPD	f36, f37 = [AOFFSET], 2 * SIZE	;;	LDFPD	f38, f39 = [AOFFSET]	adds	AOFFSET = 3 * SIZE, AOFFSET	;;	LDFD	f40 = [AOFFSET], 1 * SIZE	;;	LDFPD	f41, f42 = [AOFFSET], 2 * SIZE	;;	LDFPD	f43, f44 = [AOFFSET], 2 * SIZE	;;	LDFPD	f45, f46 = [AOFFSET]	adds	AOFFSET = 4 * SIZE, AOFFSET	;;	LDFPD	f47, f48 = [AOFFSET], 2 * SIZE	;;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -