⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	LDF	[BO + 10 * SIZE], b3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[BO + 11 * SIZE], b4	add	BO,  8 * SIZE, BO	bg,pt	%icc, .LL172	LDF	[AO + 3 * SIZE], a4.LL175:#ifndef TRMMKERNEL	and	K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 1, L#else	add	KK, 2, L#endif	and	L, 3,  L#endif	cmp	L,  0	ble,a,pn %icc, .LL179	nop.LL176:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	add	AO, 1 * SIZE, AO	LDF	[BO + 2 * SIZE], b1	FADD	c02, t2, c02	cmp	L, 0	FMUL	a1, b2, t2	LDF	[BO + 3 * SIZE], b2	add	BO, 2 * SIZE, BO	bg,pt	%icc, .LL176	LDF	[AO + 0 * SIZE], a1.LL179:#ifndef TRMMKERNEL	FADD	c01, t1, c01	LDF	[C1 + 0 * SIZE], a1	FADD	c02, t2, c02	LDF	[C2 + 0 * SIZE], a2	FADD	c03, t3, c03	FADD	c04, t4, c04	FADD	c01, c03, c01	FADD	c02, c04, c02	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	FADD	c01, a1, c01	FADD	c02, a2, c02	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C2 + 0 * SIZE]#else	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04	FADD	c01, c03, c01	FADD	c02, c04, c02	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C2 + 0 * SIZE]#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	K, KK, TEMP1#ifdef LEFT	add	TEMP1, -1, TEMP1#else	add	TEMP1, -2, TEMP1#endif	sll	TEMP1, 0 + BASE_SHIFT, TEMP2	sll	TEMP1, 1 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LEFT	add	KK, 1, KK#endif#endif.LL199:	mov	BO, B#if defined(TRMMKERNEL) && !defined(LEFT)	add	KK, 2, KK#else	nop#endif.LL200:	and	N, 1, J	sra	M, 2, I	cmp	J, 0	ble,pn	%icc, .LL999	mov	A, AO#if defined(TRMMKERNEL) &&  defined(LEFT)	mov	OFFSET, KK#endif	cmp	I, 0	ble,pn	%icc, .LL250	mov	C, C1.LL221:#if !defined(TRMMKERNEL)	LDF	[AO + 0 * SIZE], a1	sra	K, 2, L	FMOV	FZERO, c01	LDF	[B  + 0 * SIZE], b1	mov	B, BO	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	cmp	L,  0	FMOV	FZERO, c02	LDF	[B  + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[B  + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[B  + 3 * SIZE], b4	FMOV	FZERO, t4#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	mov	B, BO#else	sll	KK, 2 + BASE_SHIFT, TEMP1	sll	KK, 0 + BASE_SHIFT, TEMP2	add	AO, TEMP1, AO	add	B,  TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 4, L#else	add	KK, 1, L#endif	sra	L, 2, L	cmp	L,  0	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO  + 0 * SIZE], b1	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, t4#endif	ble,pn	%icc, .LL225	prefetch [C1 + 4 * SIZE], 2.LL222:	FADD	c01, t1, c01	add	BO,  4 * SIZE, BO	FMUL	a1, b1, t1	LDF	[AO +  4 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b1, t2	LDF	[AO +  5 * SIZE], a2	FADD	c03, t3, c03	add	L, -1, L	FMUL	a3, b1, t3	LDF	[AO +  6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b1, t4	LDF	[AO +  7 * SIZE], a4	LDF	[BO +  0 * SIZE], b1	FADD	c01, t1, c01	cmp	L,  0	FMUL	a1, b2, t1	LDF	[AO +  8 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b2, t2	LDF	[AO +  9 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b2, t3	LDF	[AO + 10 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b2, t4	LDF	[AO + 11 * SIZE], a4	LDF	[BO +  1 * SIZE], b2	FADD	c01, t1, c01	FMUL	a1, b3, t1	LDF	[AO + 12 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b3, t2	LDF	[AO + 13 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b3, t3	LDF	[AO + 14 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b3, t4	LDF	[AO + 15 * SIZE], a4	LDF	[BO +  2 * SIZE], b3	FADD	c01, t1, c01	FMUL	a1, b4, t1	LDF	[AO + 16 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b4, t2	LDF	[AO + 17 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b4, t3	LDF	[AO + 18 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[AO + 19 * SIZE], a4	add	AO, 16 * SIZE, AO	bg,pt	%icc, .LL222	LDF	[BO +  3 * SIZE], b4.LL225:#ifndef TRMMKERNEL	and	K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 4, L#else	add	KK, 1, L#endif	and	L, 3,  L#endif	cmp	L,  0	ble,a,pn %icc, .LL229	nop.LL226:	FADD	c01, t1, c01	add	BO, 1 * SIZE, BO	FMUL	a1, b1, t1	LDF	[AO + 4 * SIZE], a1	FADD	c02, t2, c02	add	L, -1, L	FMUL	a2, b1, t2	LDF	[AO + 5 * SIZE], a2	FADD	c03, t3, c03	cmp	L, 0	FMUL	a3, b1, t3	LDF	[AO + 6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b1, t4	LDF	[AO + 7 * SIZE], a4	add	AO, 4 * SIZE, AO	bg,pt	%icc, .LL226	LDF	[BO + 0 * SIZE], b1.LL229:#ifndef TRMMKERNEL	FADD	c01, t1, c01	add	I, -1, I	FADD	c02, t2, c02	cmp	I, 0	FADD	c03, t3, c03	FADD	c04, t4, c04	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	FMUL	c03, ALPHA, c03	FMUL	c04, ALPHA, c04	LDF	[C1 + 0 * SIZE], a1	LDF	[C1 + 1 * SIZE], a2	LDF	[C1 + 2 * SIZE], a3	LDF	[C1 + 3 * SIZE], a4	FADD	c01, a1, c01	FADD	c02, a2, c02	FADD	c03, a3, c03	FADD	c04, a4, c04	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C1 + 2 * SIZE]	STF	c04, [C1 + 3 * SIZE]	add	C1, 4 * SIZE, C1#else	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	FMUL	c03, ALPHA, c03	FMUL	c04, ALPHA, c04	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C1 + 2 * SIZE]	STF	c04, [C1 + 3 * SIZE]	add	C1, 4 * SIZE, C1#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	K, KK, TEMP1#ifdef LEFT	add	TEMP1, -4, TEMP1#else	add	TEMP1, -1, TEMP1#endif	sll	TEMP1, 2 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LEFT	add	KK, 4, KK#endif	add	I, -1, I	cmp	I, 0#endif	bg,pt	%icc, .LL221	nop.LL250:	and	M, 2, I	cmp	I, 0	ble,pn	%icc, .LL270	nop.LL251:#if !defined(TRMMKERNEL)	LDF	[AO + 0 * SIZE], a1	sra	K, 2, L	FMOV	FZERO, c01	LDF	[B  + 0 * SIZE], b1	mov	B, BO	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	cmp	L,  0	FMOV	FZERO, c02	LDF	[B  + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[B  + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[B  + 3 * SIZE], b4	FMOV	FZERO, t4#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	mov	B, BO#else	sll	KK, 1 + BASE_SHIFT, TEMP1	sll	KK, 0 + BASE_SHIFT, TEMP2	add	AO, TEMP1, AO	add	B,  TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 2, L#else	add	KK, 1, L#endif	sra	L, 2, L	cmp	L,  0	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, t4#endif	ble,pn	%icc, .LL255	nop.LL252:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 4 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b1, t2	LDF	[AO +  5 * SIZE], a2	LDF	[BO +  4 * SIZE], b1	FADD	c03, t3, c03	cmp	L, 0	FMUL	a3, b2, t3	LDF	[AO +  6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b2, t4	LDF	[AO +  7 * SIZE], a4	LDF	[BO +  5 * SIZE], b2	FADD	c01, t1, c01	FMUL	a1, b3, t1	LDF	[AO +  8 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b3, t2	LDF	[AO +  9 * SIZE], a2	LDF	[BO +  6 * SIZE], b3	FADD	c03, t3, c03	FMUL	a3, b4, t3	LDF	[AO + 10 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[AO + 11 * SIZE], a4	add	AO,  8 * SIZE, AO	LDF	[BO +  7 * SIZE], b4	bg,pt	%icc, .LL252	add	BO,  4 * SIZE, BO.LL255:#ifndef TRMMKERNEL	and	K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 2, L#else	add	KK, 1, L#endif	and	L, 3,  L#endif	cmp	L,  0	ble,a,pn %icc, .LL259	nop.LL256:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 2 * SIZE], a1	FADD	c02, t2, c02	cmp	L, 0	FMUL	a2, b1, t2	LDF	[AO + 3 * SIZE], a2	LDF	[BO + 1 * SIZE], b1	add	AO, 2 * SIZE, AO	bg,pt	%icc, .LL256	add	BO, 1 * SIZE, BO.LL259:#ifndef TRMMKERNEL	FADD	c01, t1, c01	LDF	[C1 + 0 * SIZE], a1	FADD	c02, t2, c02	LDF	[C1 + 1 * SIZE], a2	FADD	c03, t3, c03	FADD	c04, t4, c04	FADD	c01, c03, c01	FADD	c02, c04, c02	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	FADD	c01, a1, c01	FADD	c02, a2, c02	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	add	C1, 2 * SIZE, C1#else	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04	FADD	c01, c03, c01	FADD	c02, c04, c02	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	add	C1, 2 * SIZE, C1#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	K, KK, TEMP1#ifdef LEFT	add	TEMP1, -2, TEMP1#else	add	TEMP1, -1, TEMP1#endif	sll	TEMP1, 1 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LEFT	add	KK, 2, KK#endif#endif.LL270:	and	M, 1, I	cmp	I, 0	ble,pn	%icc, .LL999	nop.LL271:#if !defined(TRMMKERNEL)	LDF	[AO + 0 * SIZE], a1	sra	K, 2, L	FMOV	FZERO, t1 	LDF	[AO + 1 * SIZE], a2	mov	B, BO	FMOV	FZERO, c01	LDF	[AO + 2 * SIZE], a3	cmp	L,  0	FMOV	FZERO, t2	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c02	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t3	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t4	LDF	[BO + 2 * SIZE], b3#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	mov	B, BO#else	sll	KK, 0 + BASE_SHIFT, TEMP1	sll	KK, 0 + BASE_SHIFT, TEMP2	add	AO, TEMP1, AO	add	B,  TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 1, L#else	add	KK, 1, L#endif	sra	L, 2, L	cmp	L,  0	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, t1 	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c01	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, t2	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c02	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t3	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t4	LDF	[BO + 2 * SIZE], b3#endif	ble,pn	%icc, .LL275	LDF	[BO + 3 * SIZE], b4.LL272:	FADD	c01, t1, c01	add	L, -1, L	add	AO,  4 * SIZE, AO	FMUL	a1, b1, t1	add	BO,  4 * SIZE, BO	LDF	[AO + 0 * SIZE], a1	FADD	c02, t2, c02	cmp	L, 0	LDF	[BO + 0 * SIZE], b1	FMUL	a2, b2, t2	LDF	[AO + 1 * SIZE], a2	FADD	c01, t3, c01	LDF	[BO + 1 * SIZE], b2	FMUL	a3, b3, t3	LDF	[AO + 2 * SIZE], a3	FADD	c02, t4, c02	LDF	[BO + 2 * SIZE], b3	FMUL	a4, b4, t4	LDF	[AO + 3 * SIZE], a4	bg,pt	%icc, .LL272	LDF	[BO + 3 * SIZE], b4.LL275:#ifndef TRMMKERNEL	and	K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 1, L#else	add	KK, 1, L#endif	and	L, 3,  L#endif	cmp	L,  0	ble,a,pn %icc, .LL279	nop.LL276:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 1 * SIZE], a1	LDF	[BO + 1 * SIZE], b1	add	BO, 1 * SIZE, BO	cmp	L, 0	bg,pt	%icc, .LL276	add	AO, 1 * SIZE, AO.LL279:#ifndef TRMMKERNEL	FADD	c01, t1, c01	LDF	[C1 + 0 * SIZE], a1	FADD	c02, t2, c02	FADD	c01, t3, c01	FADD	c02, t4, c02	FADD	c01, c02, c01	FMUL	c01, ALPHA, c01	FADD	c01, a1, c01	STF	c01, [C1 + 0 * SIZE]#else	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c01, t3, c01	FADD	c02, t4, c02	FADD	c01, c02, c01	FMUL	c01, ALPHA, c01	STF	c01, [C1 + 0 * SIZE]#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	K, KK, TEMP1#ifdef LEFT	add	TEMP1, -1, TEMP1#else	add	TEMP1, -1, TEMP1#endif	sll	TEMP1, 0 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LEFT	add	KK, 1, KK#endif#endif.LL999:	return	%i7 + 8	clr	%o0	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -