⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trsm_kernel_lt.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
#if defined(LT) || defined(RN)	mov	BO, B#endif#ifdef RN	add	KK, 2, KK#endif#ifdef RT	sub	KK, 2, KK#endif.LL200:	and	N, 1, J	cmp	J, 0	ble,pn	%icc, .LL999	nop#ifdef RT	sll	K, 0 + BASE_SHIFT, TEMP1	sub	B, TEMP1, B	sub	C, LDC, C#endif	mov	C, C1#ifdef LN	add	M, OFFSET, KK#endif#ifdef LT	mov	OFFSET, KK#endif#if defined(LN) || defined(RT)	mov	A, AORIG#else	mov	A, AO#endif#ifndef RT	add	C, LDC, C#endif	sra	M, 2, I	cmp	I, 0	ble,pn	%icc, .LL250	nop.LL221:#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  2 + BASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 2 + BASE_SHIFT, TEMP1	sll	KK, 0 + BASE_SHIFT, TEMP2	add	AORIG, TEMP1, AO	add	B,     TEMP2, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, t4	ble,pn	%icc, .LL225	prefetch [C1 + 4 * SIZE], 2.LL222:	FADD	c01, t1, c01	add	BO,  4 * SIZE, BO	FMUL	a1, b1, t1	LDF	[AO +  4 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b1, t2	LDF	[AO +  5 * SIZE], a2	FADD	c03, t3, c03	add	L, -1, L	FMUL	a3, b1, t3	LDF	[AO +  6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b1, t4	LDF	[AO +  7 * SIZE], a4	LDF	[BO +  0 * SIZE], b1	FADD	c01, t1, c01	cmp	L,  0	FMUL	a1, b2, t1	LDF	[AO +  8 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b2, t2	LDF	[AO +  9 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b2, t3	LDF	[AO + 10 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b2, t4	LDF	[AO + 11 * SIZE], a4	LDF	[BO +  1 * SIZE], b2	FADD	c01, t1, c01	FMUL	a1, b3, t1	LDF	[AO + 12 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b3, t2	LDF	[AO + 13 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b3, t3	LDF	[AO + 14 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b3, t4	LDF	[AO + 15 * SIZE], a4	LDF	[BO +  2 * SIZE], b3	FADD	c01, t1, c01	FMUL	a1, b4, t1	LDF	[AO + 16 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b4, t2	LDF	[AO + 17 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b4, t3	LDF	[AO + 18 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[AO + 19 * SIZE], a4	add	AO, 16 * SIZE, AO	bg,pt	%icc, .LL222	LDF	[BO +  3 * SIZE], b4.LL225:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,a,pn %icc, .LL229	nop.LL226:	FADD	c01, t1, c01	add	BO, 1 * SIZE, BO	FMUL	a1, b1, t1	LDF	[AO + 4 * SIZE], a1	FADD	c02, t2, c02	add	L, -1, L	FMUL	a2, b1, t2	LDF	[AO + 5 * SIZE], a2	FADD	c03, t3, c03	cmp	L, 0	FMUL	a3, b1, t3	LDF	[AO + 6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b1, t4	LDF	[AO + 7 * SIZE], a4	add	AO, 4 * SIZE, AO	bg,pt	%icc, .LL226	LDF	[BO + 0 * SIZE], b1.LL229:	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04#if defined(LN) || defined(RT)#ifdef LN	sub	KK, 4, TEMP1#else	sub	KK, 1, TEMP1#endif	sll	TEMP1, 2 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AORIG, TEMP2, AO	add	B,     TEMP1, BO#endif#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	LDF	[BO +  2 * SIZE], a3	LDF	[BO +  3 * SIZE], a4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04#else	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04#endif#ifdef LN	LDF	[AO + 15 * SIZE], a1	LDF	[AO + 14 * SIZE], a2	LDF	[AO + 13 * SIZE], a3	LDF	[AO + 12 * SIZE], a4	FMUL	a1, c04, c04	FMUL	a2, c04, t1	FSUB	c03, t1, c03	FMUL	a3, c04, t1	FSUB	c02, t1, c02	FMUL	a4, c04, t1	FSUB	c01, t1, c01	LDF	[AO + 10 * SIZE], a1	LDF	[AO +  9 * SIZE], a2	LDF	[AO +  8 * SIZE], a3	FMUL	a1, c03, c03	FMUL	a2, c03, t1	FSUB	c02, t1, c02	FMUL	a3, c03, t1	FSUB	c01, t1, c01	LDF	[AO +  5 * SIZE], a1	LDF	[AO +  4 * SIZE], a2	FMUL	a1, c02, c02	FMUL	a2, c02, t1	FSUB	c01, t1, c01	LDF	[AO +  0 * SIZE], a1	FMUL	a1, c01, c01#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	FMUL	a1, c01, c01	FMUL	a2, c01, t1	FSUB	c02, t1, c02	FMUL	a3, c01, t1	FSUB	c03, t1, c03	FMUL	a4, c01, t1	FSUB	c04, t1, c04	LDF	[AO +  5 * SIZE], a1	LDF	[AO +  6 * SIZE], a2	LDF	[AO +  7 * SIZE], a3	FMUL	a1, c02, c02	FMUL	a2, c02, t1	FSUB	c03, t1, c03	FMUL	a3, c02, t1	FSUB	c04, t1, c04	LDF	[AO + 10 * SIZE], a1	LDF	[AO + 11 * SIZE], a2	FMUL	a1, c03, c03	FMUL	a2, c03, t1	FSUB	c04, t1, c04	LDF	[AO + 15 * SIZE], a1	FMUL	a1, c04, c04#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02	FMUL	a1, c03, c03	FMUL	a1, c04, c04#endif#ifdef RT	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02	FMUL	a1, c03, c03	FMUL	a1, c04, c04#endif#ifdef LN	add	C1, -4 * SIZE, C1#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]	STF	c02, [BO +  1 * SIZE]	STF	c03, [BO +  2 * SIZE]	STF	c04, [BO +  3 * SIZE]#else	STF	c01, [AO +  0 * SIZE]	STF	c02, [AO +  1 * SIZE]	STF	c03, [AO +  2 * SIZE]	STF	c04, [AO +  3 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C1 + 2 * SIZE]	STF	c04, [C1 + 3 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4#ifndef LN	add	C1, 4 * SIZE, C1#endif#ifdef RT	sll	K, 2 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	sll	TEMP1, 2 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 4, KK#endif#ifdef LN	sub	KK, 4, KK#endif	add	I, -1, I	cmp	I, 0	bg,pt	%icc, .LL221	nop.LL250:	and	M, 2, I	cmp	I, 0	ble,pn	%icc, .LL270	nop#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  1 + BASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 1 + BASE_SHIFT, TEMP1	sll	KK, 0 + BASE_SHIFT, TEMP2	add	AORIG, TEMP1, AO	add	B,     TEMP2, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, t4	ble,pn	%icc, .LL255	nop.LL252:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 4 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b1, t2	LDF	[AO +  5 * SIZE], a2	LDF	[BO +  4 * SIZE], b1	FADD	c03, t3, c03	cmp	L, 0	FMUL	a3, b2, t3	LDF	[AO +  6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b2, t4	LDF	[AO +  7 * SIZE], a4	LDF	[BO +  5 * SIZE], b2	FADD	c01, t1, c01	FMUL	a1, b3, t1	LDF	[AO +  8 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b3, t2	LDF	[AO +  9 * SIZE], a2	LDF	[BO +  6 * SIZE], b3	FADD	c03, t3, c03	FMUL	a3, b4, t3	LDF	[AO + 10 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[AO + 11 * SIZE], a4	add	AO,  8 * SIZE, AO	LDF	[BO +  7 * SIZE], b4	bg,pt	%icc, .LL252	add	BO,  4 * SIZE, BO.LL255:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,a,pn %icc, .LL259	nop.LL256:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 2 * SIZE], a1	FADD	c02, t2, c02	cmp	L, 0	FMUL	a2, b1, t2	LDF	[AO + 3 * SIZE], a2	LDF	[BO + 1 * SIZE], b1	add	AO, 2 * SIZE, AO	bg,pt	%icc, .LL256	add	BO, 1 * SIZE, BO.LL259:	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04	FADD	c01, c03, c01	FADD	c02, c04, c02#if defined(LN) || defined(RT)#ifdef LN	sub	KK, 2, TEMP1#else	sub	KK, 1, TEMP1#endif	sll	TEMP1, 1 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AORIG, TEMP2, AO	add	B,     TEMP1, BO#endif#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	FSUB	a1, c01, c01	FSUB	a2, c02, c02#else	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	FSUB	a1, c01, c01	FSUB	a2, c02, c02#endif#ifdef LN	LDF	[AO +  3 * SIZE], a1	LDF	[AO +  2 * SIZE], a2	LDF	[AO +  0 * SIZE], a3	FMUL	a1, c02, c02	FMUL	a2, c02, t1	FSUB	c01, t1, c01	FMUL	a3, c01, c01#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  3 * SIZE], a3	FMUL	a1, c01, c01	FMUL	a2, c01, t1	FSUB	c02, t1, c02	FMUL	a3, c02, c02#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02#endif#ifdef RT	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02#endif#ifdef LN	add	C1, -2 * SIZE, C1#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]	STF	c02, [BO +  1 * SIZE]#else	STF	c01, [AO +  0 * SIZE]	STF	c02, [AO +  1 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4#ifndef LN	add	C1, 2 * SIZE, C1#endif#ifdef RT	sll	K, 1 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	sll	TEMP1, 1 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 2, KK#endif#ifdef LN	sub	KK, 2, KK#endif.LL270:	and	M, 1, I	cmp	I, 0	ble,pn	%icc, .LL299	nop#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  0 + BASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 0 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AO	add	B,     TEMP1, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, t1 	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c01	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, t2	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c02	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t3	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t4	LDF	[BO + 2 * SIZE], b3	ble,pn	%icc, .LL275	LDF	[BO + 3 * SIZE], b4.LL272:	FADD	c01, t1, c01	add	L, -1, L	add	AO,  4 * SIZE, AO	FMUL	a1, b1, t1	add	BO,  4 * SIZE, BO	LDF	[AO + 0 * SIZE], a1	FADD	c02, t2, c02	cmp	L, 0	LDF	[BO + 0 * SIZE], b1	FMUL	a2, b2, t2	LDF	[AO + 1 * SIZE], a2	FADD	c01, t3, c01	LDF	[BO + 1 * SIZE], b2	FMUL	a3, b3, t3	LDF	[AO + 2 * SIZE], a3	FADD	c02, t4, c02	LDF	[BO + 2 * SIZE], b3	FMUL	a4, b4, t4	LDF	[AO + 3 * SIZE], a4	bg,pt	%icc, .LL272	LDF	[BO + 3 * SIZE], b4.LL275:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,a,pn %icc, .LL279	nop.LL276:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 1 * SIZE], a1	LDF	[BO + 1 * SIZE], b1	add	BO, 1 * SIZE, BO	cmp	L, 0	bg,pt	%icc, .LL276	add	AO, 1 * SIZE, AO.LL279:	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c01, t3, c01	FADD	c02, t4, c02	FADD	c01, c02, c01#if defined(LN) || defined(RT)	sub	KK, 1, TEMP1	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AO	add	B,     TEMP1, BO#endif#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	FSUB	a1, c01, c01#else	LDF	[AO +  0 * SIZE], a1	FSUB	a1, c01, c01#endif#ifdef LN	LDF	[AO +  0 * SIZE], a1	FMUL	a1, c01, c01#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	FMUL	a1, c01, c01#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01#endif#ifdef RT	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01#endif#ifdef LN	add	C1, -1 * SIZE, C1#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]#else	STF	c01, [AO +  0 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4#ifndef LN	add	C1, 1 * SIZE, C1#endif#ifdef RT	sll	K, 0 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP1, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 1, KK#endif#ifdef LN	sub	KK, 1, KK#endif.LL299:#ifdef LN	sll	K, 0 + BASE_SHIFT, TEMP1	add	B, TEMP1, B#endif#if defined(LT) || defined(RN)	mov	BO, B#endif#ifdef RN	add	KK, 1, KK#endif#ifdef RT	sub	KK, 1, KK#endif.LL999:	return	%i7 + 8	clr	%o0	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -