⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
	FMUL	c15, ALPHA, c15	add	C3, 4 * SIZE, C3	FADD	c03, a3, c03	LDF	[C4 + 2 * SIZE], a3	FMUL	c16, ALPHA, c16	nop	FADD	c04, a4, c04	LDF	[C4 + 3 * SIZE], a4	STF	c01, [C1 - 4 * SIZE]	FADD	c05, b1, c05	STF	c02, [C1 - 3 * SIZE]	FADD	c06, b2, c06	STF	c03, [C1 - 2 * SIZE]	FADD	c07, b3, c07	STF	c04, [C1 - 1 * SIZE]	FADD	c08, b4, c08	STF	c05, [C2 - 4 * SIZE]	FADD	c09, t1, c09	STF	c06, [C2 - 3 * SIZE]	FADD	c10, t2, c10	STF	c07, [C2 - 2 * SIZE]	FADD	c11, t3, c11	STF	c08, [C2 - 1 * SIZE]	FADD	c12, t4, c12	STF	c09, [C3 - 4 * SIZE]	FADD	c13, a1, c13	STF	c10, [C3 - 3 * SIZE]	FADD	c14, a2, c14	STF	c11, [C3 - 2 * SIZE]	FADD	c15, a3, c15	STF	c12, [C3 - 1 * SIZE]	FADD	c16, a4, c16	STF	c13, [C4 + 0 * SIZE]	FMOV	FZERO, t1	STF	c14, [C4 + 1 * SIZE]	FMOV	FZERO, t2	STF	c15, [C4 + 2 * SIZE]	FMOV	FZERO, t3	STF	c16, [C4 + 3 * SIZE]	FMOV	FZERO, t4	add	C4, 4 * SIZE, C4#else	FADD	c04, t1, c04	FMUL	c01, ALPHA, c01	FADD	c08, t2, c08	FMUL	c02, ALPHA, c02	FADD	c12, t3, c12	FMUL	c03, ALPHA, c03	FADD	c16, t4, c16	FMUL	c04, ALPHA, c04	STF	c01, [C1 + 0 * SIZE]	FMUL	c05, ALPHA, c05	STF	c02, [C1 + 1 * SIZE]	FMUL	c06, ALPHA, c06	STF	c03, [C1 + 2 * SIZE]	FMUL	c07, ALPHA, c07	STF	c04, [C1 + 3 * SIZE]	FMUL	c08, ALPHA, c08	STF	c05, [C2 + 0 * SIZE]	FMUL	c09, ALPHA, c09	STF	c06, [C2 + 1 * SIZE]	FMUL	c10, ALPHA, c10	STF	c07, [C2 + 2 * SIZE]	FMUL	c11, ALPHA, c11	STF	c08, [C2 + 3 * SIZE]	FMUL	c12, ALPHA, c12	STF	c09, [C3 + 0 * SIZE]	FMUL	c13, ALPHA, c13	STF	c10, [C3 + 1 * SIZE]	FMUL	c14, ALPHA, c14	STF	c11, [C3 + 2 * SIZE]	FMUL	c15, ALPHA, c15	STF	c12, [C3 + 3 * SIZE]	FMUL	c16, ALPHA, c16	STF	c13, [C4 + 0 * SIZE]	STF	c14, [C4 + 1 * SIZE]	STF	c15, [C4 + 2 * SIZE]	STF	c16, [C4 + 3 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4	add	C1, 4 * SIZE, C1	add	C2, 4 * SIZE, C2	add	C3, 4 * SIZE, C3	add	C4, 4 * SIZE, C4#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	K, KK, TEMP1#ifdef LEFT	add	TEMP1, -4, TEMP1#else	add	TEMP1, -4, TEMP1#endif	sll	TEMP1, 2 + BASE_SHIFT, TEMP1	add	AO, TEMP1, AO	add	BO, TEMP1, BO#endif#ifdef LEFT	add	KK, 4, KK#endif	add	I, -1, I	cmp	I, 0#endif	sra	K, 2, L	bg,pt	%icc, .LL21	FMOV	FZERO, c01.LL50:	and	M, 2, I	FMOV	FZERO, c02	cmp	I, 0	FMOV	FZERO, t1	ble,pn	%icc, .LL70	FMOV	FZERO, c04#if !defined(TRMMKERNEL)	LDF	[AO + 0 * SIZE], a1	sra	K, 2, L	FMOV	FZERO, t2	LDF	[B  + 0 * SIZE], b1	mov	B, BO	FMOV	FZERO, c06	LDF	[AO + 1 * SIZE], a2	cmp	L,  0	FMOV	FZERO, t3	LDF	[B  + 1 * SIZE], b2	FMOV	FZERO, c08	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, t4	LDF	[B  + 2 * SIZE], b3	FMOV	FZERO, c01	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c03	LDF	[B  + 3 * SIZE], b4	FMOV	FZERO, c05#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	mov	B, BO#else	sll	KK, 1 + BASE_SHIFT, TEMP1	sll	KK, 2 + BASE_SHIFT, TEMP2	add	AO, TEMP1, AO	add	B,  TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 2, L#else	add	KK, 4, L#endif	sra	L, 2, L	cmp	L,  0	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, t2	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, c06	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, t3	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, c08	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, t4	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, c01	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c03	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, c05#endif	ble,pn	%icc, .LL55	FMOV	FZERO, c07.LL52:	FADD	c02, t1, c02	add	AO,  8 * SIZE, AO	prefetch [AO + APREFETCHSIZE * SIZE], 0	FMUL	a1, b1, t1	add	BO, 16 * SIZE, BO	FADD	c04, t2, c04	add	L, -1, L	FMUL	a1, b2, t2	FADD	c06, t3, c06	cmp	L, 0	FMUL	a1, b3, t3	FADD	c08, t4, c08	FMUL	a1, b4, t4	LDF	[AO -  4 * SIZE], a1	FADD	c01, t1, c01	FMUL	a2, b1, t1	LDF	[BO - 12 * SIZE], b1	FADD	c03, t2, c03	FMUL	a2, b2, t2	LDF	[BO - 11 * SIZE], b2	FADD	c05, t3, c05	FMUL	a2, b3, t3	LDF	[BO - 10 * SIZE], b3	FADD	c07, t4, c07	FMUL	a2, b4, t4	LDF	[BO -  9 * SIZE], b4	FADD	c02, t1, c02	FMUL	a3, b1, t1	LDF	[AO -  3 * SIZE], a2	FADD	c04, t2, c04	FMUL	a3, b2, t2	FADD	c06, t3, c06	FMUL	a3, b3, t3	FADD	c08, t4, c08	FMUL	a3, b4, t4	LDF	[AO -  2 * SIZE], a3	FADD	c01, t1, c01	FMUL	a4, b1, t1	LDF	[BO -  8 * SIZE], b1	FADD	c03, t2, c03	FMUL	a4, b2, t2	LDF	[BO -  7 * SIZE], b2	FADD	c05, t3, c05	FMUL	a4, b3, t3	LDF	[BO -  6 * SIZE], b3	FADD	c07, t4, c07	FMUL	a4, b4, t4	LDF	[BO -  5 * SIZE], b4	FADD	c02, t1, c02	FMUL	a1, b1, t1	LDF	[AO -  1 * SIZE], a4	FADD	c04, t2, c04	FMUL	a1, b2, t2	FADD	c06, t3, c06	FMUL	a1, b3, t3	FADD	c08, t4, c08	FMUL	a1, b4, t4	LDF	[AO +  0 * SIZE], a1	FADD	c01, t1, c01	FMUL	a2, b1, t1	LDF	[BO -  4 * SIZE], b1	FADD	c03, t2, c03	FMUL	a2, b2, t2	LDF	[BO -  3 * SIZE], b2	FADD	c05, t3, c05	FMUL	a2, b3, t3	LDF	[BO -  2 * SIZE], b3	FADD	c07, t4, c07	FMUL	a2, b4, t4	LDF	[BO -  1 * SIZE], b4	FADD	c02, t1, c02	FMUL	a3, b1, t1	LDF	[AO +  1 * SIZE], a2	FADD	c04, t2, c04	FMUL	a3, b2, t2	FADD	c06, t3, c06	FMUL	a3, b3, t3	FADD	c08, t4, c08	FMUL	a3, b4, t4	LDF	[AO +  2 * SIZE], a3	FADD	c01, t1, c01	FMUL	a4, b1, t1	LDF	[BO +  0 * SIZE], b1	FADD	c03, t2, c03	FMUL	a4, b2, t2	LDF	[BO +  1 * SIZE], b2	FADD	c05, t3, c05	FMUL	a4, b3, t3	LDF	[BO +  2 * SIZE], b3	FADD	c07, t4, c07	FMUL	a4, b4, t4	LDF	[BO +  3 * SIZE], b4	bg,pt	%icc, .LL52	LDF	[AO +  3 * SIZE], a4.LL55:#ifndef TRMMKERNEL	and	K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 2, L#else	add	KK, 4, L#endif	and	L, 3,  L#endif	cmp	L,  0	ble,a,pn %icc, .LL59	nop.LL56:	FADD	c02, t1, c02	add	AO, 2 * SIZE, AO	FMUL	a1, b1, t1	add	L, -1, L	add	BO, 4 * SIZE, BO	FADD	c04, t2, c04	cmp	L, 0	FMUL	a1, b2, t2	FADD	c06, t3, c06	FMUL	a1, b3, t3	FADD	c08, t4, c08	FMUL	a1, b4, t4	LDF	[AO + 0 * SIZE], a1	FADD	c01, t1, c01	FMUL	a2, b1, t1	LDF	[BO + 0 * SIZE], b1	FADD	c03, t2, c03	FMUL	a2, b2, t2	LDF	[BO + 1 * SIZE], b2	FADD	c05, t3, c05	FMUL	a2, b3, t3	LDF	[BO + 2 * SIZE], b3	FADD	c07, t4, c07	FMUL	a2, b4, t4	LDF	[BO + 3 * SIZE], b4	bg,pt	%icc, .LL56	LDF	[AO + 1 * SIZE], a2.LL59:#ifndef TRMMKERNEL	FADD	c02, t1, c02	FMUL	c01, ALPHA, c01	LDF	[C1 + 0 * SIZE], a1	FADD	c04, t2, c04	FMUL	c03, ALPHA, c03	LDF	[C1 + 1 * SIZE], a2	FADD	c06, t3, c06	FMUL	c05, ALPHA, c05	LDF	[C2 + 0 * SIZE], a3	FADD	c08, t4, c08	FMUL	c07, ALPHA, c07	LDF	[C2 + 1 * SIZE], a4	FMUL	c02, ALPHA, c02	FADD	c01, a1, c01	LDF	[C3 + 0 * SIZE], b1	FMUL	c04, ALPHA, c04	FADD	c02, a2, c02	LDF	[C3 + 1 * SIZE], b2	FMUL	c06, ALPHA, c06	FADD	c03, a3, c03	LDF	[C4 + 0 * SIZE], b3	FMUL	c08, ALPHA, c08	FADD	c04, a4, c04	LDF	[C4 + 1 * SIZE], b4	STF	c01, [C1 + 0 * SIZE]	FADD	c05, b1, c05	STF	c02, [C1 + 1 * SIZE]	FADD	c06, b2, c06	add	C1, 2 * SIZE, C1	STF	c03, [C2 + 0 * SIZE]	FADD	c07, b3, c07	STF	c04, [C2 + 1 * SIZE]	FADD	c08, b4, c08	add	C2, 2 * SIZE, C2	STF	c05, [C3 + 0 * SIZE]	STF	c06, [C3 + 1 * SIZE]	add	C3, 2 * SIZE, C3	STF	c07, [C4 + 0 * SIZE]	STF	c08, [C4 + 1 * SIZE]	add	C4, 2 * SIZE, C4#else	FADD	c02, t1, c02	FADD	c04, t2, c04	FADD	c06, t3, c06	FADD	c08, t4, c08	FMUL	c01, ALPHA, c01	FMUL	c03, ALPHA, c03	FMUL	c05, ALPHA, c05	FMUL	c07, ALPHA, c07	FMUL	c02, ALPHA, c02	FMUL	c04, ALPHA, c04	FMUL	c06, ALPHA, c06	FMUL	c08, ALPHA, c08	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C2 + 0 * SIZE]	STF	c04, [C2 + 1 * SIZE]	STF	c05, [C3 + 0 * SIZE]	STF	c06, [C3 + 1 * SIZE]	STF	c07, [C4 + 0 * SIZE]	STF	c08, [C4 + 1 * SIZE]	add	C1, 2 * SIZE, C1	add	C2, 2 * SIZE, C2	add	C3, 2 * SIZE, C3	add	C4, 2 * SIZE, C4#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	K, KK, TEMP1#ifdef LEFT	add	TEMP1, -2, TEMP1#else	add	TEMP1, -4, TEMP1#endif	sll	TEMP1, 1 + BASE_SHIFT, TEMP2	sll	TEMP1, 2 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LEFT	add	KK, 2, KK#endif#endif.LL70:	and	M, 1, I	cmp	I, 0	ble,pn	%icc, .LL99	nop.LL71:#if !defined(TRMMKERNEL)	LDF	[AO + 0 * SIZE], a1	sra	K, 2, L	FMOV	FZERO, c01	LDF	[B  + 0 * SIZE], b1	mov	B, BO	FMOV	FZERO, t1 	LDF	[AO + 1 * SIZE], a2	cmp	L,  0	FMOV	FZERO, c02	LDF	[B  + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[B  + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[B  + 3 * SIZE], b4	FMOV	FZERO, t4#else#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	mov	B, BO#else	sll	KK, 0 + BASE_SHIFT, TEMP1	sll	KK, 2 + BASE_SHIFT, TEMP2	add	AO, TEMP1, AO	add	B,  TEMP2, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 1, L#else	add	KK, 4, L#endif	sra	L, 2, L	cmp	L,  0	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO  + 0 * SIZE], b1	FMOV	FZERO, t1 	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO  + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO  + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO  + 3 * SIZE], b4	FMOV	FZERO, t4#endif	ble,pn	%icc, .LL75	nop.LL72:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[BO + 4 * SIZE], b1	FADD	c02, t2, c02	cmp	L, 0	FMUL	a1, b2, t2	LDF	[BO + 5 * SIZE], b2	FADD	c03, t3, c03	FMUL	a1, b3, t3	LDF	[BO + 6 * SIZE], b3	FADD	c04, t4, c04	FMUL	a1, b4, t4	LDF	[BO + 7 * SIZE], b4	LDF	[AO +  4 * SIZE], a1	FADD	c01, t1, c01	add	AO,  4 * SIZE, AO	FMUL	a2, b1, t1	LDF	[BO +  8 * SIZE], b1	FADD	c02, t2, c02	FMUL	a2, b2, t2	LDF	[BO +  9 * SIZE], b2	FADD	c03, t3, c03	FMUL	a2, b3, t3	LDF	[BO + 10 * SIZE], b3	FADD	c04, t4, c04	FMUL	a2, b4, t4	LDF	[BO + 11 * SIZE], b4	LDF	[AO +  1 * SIZE], a2	FADD	c01, t1, c01	FMUL	a3, b1, t1	LDF	[BO + 12 * SIZE], b1	FADD	c02, t2, c02	FMUL	a3, b2, t2	LDF	[BO + 13 * SIZE], b2	FADD	c03, t3, c03	FMUL	a3, b3, t3	LDF	[BO + 14 * SIZE], b3	FADD	c04, t4, c04	FMUL	a3, b4, t4	LDF	[BO + 15 * SIZE], b4	LDF	[AO +  2 * SIZE], a3	FADD	c01, t1, c01	FMUL	a4, b1, t1	LDF	[BO + 16 * SIZE], b1	FADD	c02, t2, c02	FMUL	a4, b2, t2	LDF	[BO + 17 * SIZE], b2	FADD	c03, t3, c03	FMUL	a4, b3, t3	LDF	[BO + 18 * SIZE], b3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[BO + 19 * SIZE], b4	add	BO, 16 * SIZE, BO	bg,pt	%icc, .LL72	LDF	[AO +  3 * SIZE], a4.LL75:#ifndef TRMMKERNEL	and	K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 1, L#else	add	KK, 4, L#endif	and	L, 3,  L#endif	cmp	L,  0	ble,a,pn %icc, .LL79	nop.LL76:	FADD	c01, t1, c01	add	AO, 1 * SIZE, AO	FMUL	a1, b1, t1	LDF	[BO + 4 * SIZE], b1	FADD	c02, t2, c02	add	L, -1, L	FMUL	a1, b2, t2	LDF	[BO + 5 * SIZE], b2	FADD	c03, t3, c03	cmp	L, 0	FMUL	a1, b3, t3	LDF	[BO + 6 * SIZE], b3	FADD	c04, t4, c04	add	BO, 4 * SIZE, BO	FMUL	a1, b4, t4	LDF	[AO + 0 * SIZE], a1	bg,pt	%icc, .LL76	LDF	[BO + 3 * SIZE], b4.LL79:#ifndef TRMMKERNEL	FADD	c01, t1, c01	LDF	[C1 + 0 * SIZE], a1	FADD	c02, t2, c02	LDF	[C2 + 0 * SIZE], a2	FADD	c03, t3, c03	LDF	[C3 + 0 * SIZE], a3	FADD	c04, t4, c04	LDF	[C4 + 0 * SIZE], a4	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	FMUL	c03, ALPHA, c03	FMUL	c04, ALPHA, c04	FADD	c01, a1, c01	FADD	c02, a2, c02	FADD	c03, a3, c03	FADD	c04, a4, c04	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C2 + 0 * SIZE]	STF	c03, [C3 + 0 * SIZE]	STF	c04, [C4 + 0 * SIZE]#else	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04	FMUL	c01, ALPHA, c01	FMUL	c02, ALPHA, c02	FMUL	c03, ALPHA, c03	FMUL	c04, ALPHA, c04	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C2 + 0 * SIZE]	STF	c03, [C3 + 0 * SIZE]	STF	c04, [C4 + 0 * SIZE]#if ( defined(LEFT) &&  defined(TRANSA)) || \    (!defined(LEFT) && !defined(TRANSA))	sub	K, KK, TEMP1#ifdef LEFT	add	TEMP1, -1, TEMP1#else	add	TEMP1, -4, TEMP1#endif	sll	TEMP1, 0 + BASE_SHIFT, TEMP2	sll	TEMP1, 2 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LEFT	add	KK, 1, KK#endif#endif.LL99:	add	J, -1, J	mov	BO, B	cmp	J, 0	bg,pt	%icc, .LL11#if defined(TRMMKERNEL) && !defined(LEFT)	add	KK, 4, KK#else	nop#endif.LL100:  /* n & 2 */	sra	M, 2, I	and	N, 2, J	cmp	J, 0	add	C, LDC, C2	ble,pn	%icc, .LL200	mov	A, AO#if defined(TRMMKERNEL) &&  defined(LEFT)	mov	OFFSET, KK#endif	mov	C, C1	add	C2, LDC, C	cmp	I, 0	ble,pn	%icc, .LL150	FMOV	FZERO, c03.LL121:#if !defined(TRMMKERNEL)	LDF	[AO + 0 * SIZE], a1	sra	K, 2, L

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -