⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trsm_kernel_rt.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY#else	nop#endif	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO - 2 * SIZE], a3	FADD	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO -  0 * SIZE], b1	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO - 3 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO - 2 * SIZE], b3	FADD	c15, t4, c15	nop	FMUL	a4, b4, t4	LDF	[BO - 1 * SIZE], b4	FADD	c04, t1, c04	nop	FMUL	a5, b5, t1	LDF	[AO - 1 * SIZE], a4	FADD	c08, t2, c08	FMUL	a5, b2, t2	FADD	c12, t3, c12	FMUL	a5, b3, t3	FADD	c16, t4, c16	nop	FMUL	a5, b4, t4	LDF	[AO +  4 * SIZE], a5	FADD	c01, t1, c01	nop	FMUL	a2, b5, t1	nop	FADD	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO +  1 * SIZE], a2	FADD	c02, t1, c02	nop	FMUL	a3, b5, t1	nop	FADD	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO +  2 * SIZE], a3	FADD	c03, t1, c03	cmp	L, 0	FMUL	a4, b5, t1	LDF	[BO +  4 * SIZE], b5	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO +  1 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO +  2 * SIZE], b3	FADD	c15, t4, c15	FMUL	a4, b4, t4	bg,pt	%icc, .LL22	LDF	[BO +  3 * SIZE], b4.LL25:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,a,pn %icc, .LL29	nop.LL26:	FADD	c04, t1, c04	LDF	[AO +  3 * SIZE], a4	FMUL	a1, b1, t1	add	AO, 4 * SIZE, AO	FADD	c08, t2, c08	add	BO, 4 * SIZE, BO	FMUL	a1, b2, t2	add	L, -1, L	FADD	c12, t3, c12	nop	FMUL	a1, b3, t3	cmp	L, 0	FADD	c16, t4, c16	nop	FMUL	a1, b4, t4	LDF	[AO + 0 * SIZE], a1	FADD	c01, t1, c01	nop	FMUL	a2, b1, t1	nop	FADD	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO + 1 * SIZE], a2	FADD	c02, t1, c02	nop	FMUL	a3, b1, t1	nop	FADD	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO + 2 * SIZE], a3	FADD	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO + 0 * SIZE], b1	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO + 1 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO + 2 * SIZE], b3	FADD	c15, t4, c15	FMUL	a4, b4, t4	bg,pt	%icc, .LL26	LDF	[BO + 3 * SIZE], b4.LL29:#if defined(LN) || defined(RT)	sub	KK, 4, TEMP1	sll	TEMP1, 2 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AO	add	B,     TEMP1, BO#endif	FADD	c04, t1, c04	FADD	c08, t2, c08	FADD	c12, t3, c12	FADD	c16, t4, c16#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	LDF	[BO +  2 * SIZE], a3	LDF	[BO +  3 * SIZE], a4	LDF	[BO +  4 * SIZE], b1	LDF	[BO +  5 * SIZE], b2	LDF	[BO +  6 * SIZE], b3	LDF	[BO +  7 * SIZE], b4	FSUB	a1, c01, c01	FSUB	a2, c05, c05	FSUB	a3, c09, c09	FSUB	a4, c13, c13	FSUB	b1, c02, c02	FSUB	b2, c06, c06	FSUB	b3, c10, c10	FSUB	b4, c14, c14	LDF	[BO +  8 * SIZE], a1	LDF	[BO +  9 * SIZE], a2	LDF	[BO + 10 * SIZE], a3	LDF	[BO + 11 * SIZE], a4	LDF	[BO + 12 * SIZE], b1	LDF	[BO + 13 * SIZE], b2	LDF	[BO + 14 * SIZE], b3	LDF	[BO + 15 * SIZE], b4	FSUB	a1, c03, c03	FSUB	a2, c07, c07	FSUB	a3, c11, c11	FSUB	a4, c15, c15	FSUB	b1, c04, c04	FSUB	b2, c08, c08	FSUB	b3, c12, c12	FSUB	b4, c16, c16#else	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	LDF	[AO +  4 * SIZE], b1	LDF	[AO +  5 * SIZE], b2	LDF	[AO +  6 * SIZE], b3	LDF	[AO +  7 * SIZE], b4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04	FSUB	b1, c05, c05	FSUB	b2, c06, c06	FSUB	b3, c07, c07	FSUB	b4, c08, c08	LDF	[AO +  8 * SIZE], a1	LDF	[AO +  9 * SIZE], a2	LDF	[AO + 10 * SIZE], a3	LDF	[AO + 11 * SIZE], a4	LDF	[AO + 12 * SIZE], b1	LDF	[AO + 13 * SIZE], b2	LDF	[AO + 14 * SIZE], b3	LDF	[AO + 15 * SIZE], b4	FSUB	a1, c09, c09	FSUB	a2, c10, c10	FSUB	a3, c11, c11	FSUB	a4, c12, c12	FSUB	b1, c13, c13	FSUB	b2, c14, c14	FSUB	b3, c15, c15	FSUB	b4, c16, c16#endif#ifdef LN	LDF	[AO + 15 * SIZE], a1	LDF	[AO + 14 * SIZE], a2	LDF	[AO + 13 * SIZE], a3	LDF	[AO + 12 * SIZE], a4	FMUL	a1, c04, c04	FMUL	a1, c08, c08	FMUL	a1, c12, c12	FMUL	a1, c16, c16	FMUL	a2, c04, t1	FMUL	a2, c08, t2	FMUL	a2, c12, t3	FMUL	a2, c16, t4	FSUB	c03, t1, c03	FSUB	c07, t2, c07	FSUB	c11, t3, c11	FSUB	c15, t4, c15	FMUL	a3, c04, t1	FMUL	a3, c08, t2	FMUL	a3, c12, t3	FMUL	a3, c16, t4	FSUB	c02, t1, c02	FSUB	c06, t2, c06	FSUB	c10, t3, c10	FSUB	c14, t4, c14	FMUL	a4, c04, t1	FMUL	a4, c08, t2	FMUL	a4, c12, t3	FMUL	a4, c16, t4	FSUB	c01, t1, c01	FSUB	c05, t2, c05	FSUB	c09, t3, c09	FSUB	c13, t4, c13	LDF	[AO + 10 * SIZE], a1	LDF	[AO +  9 * SIZE], a2	LDF	[AO +  8 * SIZE], a3	FMUL	a1, c03, c03	FMUL	a1, c07, c07	FMUL	a1, c11, c11	FMUL	a1, c15, c15	FMUL	a2, c03, t1	FMUL	a2, c07, t2	FMUL	a2, c11, t3	FMUL	a2, c15, t4	FSUB	c02, t1, c02	FSUB	c06, t2, c06	FSUB	c10, t3, c10	FSUB	c14, t4, c14	FMUL	a3, c03, t1	FMUL	a3, c07, t2	FMUL	a3, c11, t3	FMUL	a3, c15, t4	FSUB	c01, t1, c01	FSUB	c05, t2, c05	FSUB	c09, t3, c09	FSUB	c13, t4, c13	LDF	[AO +  5 * SIZE], a1	LDF	[AO +  4 * SIZE], a2	FMUL	a1, c02, c02	FMUL	a1, c06, c06	FMUL	a1, c10, c10	FMUL	a1, c14, c14	FMUL	a2, c02, t1	FMUL	a2, c06, t2	FMUL	a2, c10, t3	FMUL	a2, c14, t4	FSUB	c01, t1, c01	FSUB	c05, t2, c05	FSUB	c09, t3, c09	FSUB	c13, t4, c13	LDF	[AO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c05, c05	FMUL	a1, c09, c09	FMUL	a1, c13, c13#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	FMUL	a1, c01, c01	FMUL	a1, c05, c05	FMUL	a1, c09, c09	FMUL	a1, c13, c13	FMUL	a2, c01, t1	FMUL	a2, c05, t2	FMUL	a2, c09, t3	FMUL	a2, c13, t4	FSUB	c02, t1, c02	FSUB	c06, t2, c06	FSUB	c10, t3, c10	FSUB	c14, t4, c14	FMUL	a3, c01, t1	FMUL	a3, c05, t2	FMUL	a3, c09, t3	FMUL	a3, c13, t4	FSUB	c03, t1, c03	FSUB	c07, t2, c07	FSUB	c11, t3, c11	FSUB	c15, t4, c15	FMUL	a4, c01, t1	FMUL	a4, c05, t2	FMUL	a4, c09, t3	FMUL	a4, c13, t4	FSUB	c04, t1, c04	FSUB	c08, t2, c08	FSUB	c12, t3, c12	FSUB	c16, t4, c16	LDF	[AO +  5 * SIZE], a1	LDF	[AO +  6 * SIZE], a2	LDF	[AO +  7 * SIZE], a3	FMUL	a1, c02, c02	FMUL	a1, c06, c06	FMUL	a1, c10, c10	FMUL	a1, c14, c14	FMUL	a2, c02, t1	FMUL	a2, c06, t2	FMUL	a2, c10, t3	FMUL	a2, c14, t4	FSUB	c03, t1, c03	FSUB	c07, t2, c07	FSUB	c11, t3, c11	FSUB	c15, t4, c15	FMUL	a3, c02, t1	FMUL	a3, c06, t2	FMUL	a3, c10, t3	FMUL	a3, c14, t4	FSUB	c04, t1, c04	FSUB	c08, t2, c08	FSUB	c12, t3, c12	FSUB	c16, t4, c16	LDF	[AO + 10 * SIZE], a1	LDF	[AO + 11 * SIZE], a2	FMUL	a1, c03, c03	FMUL	a1, c07, c07	FMUL	a1, c11, c11	FMUL	a1, c15, c15	FMUL	a2, c03, t1	FMUL	a2, c07, t2	FMUL	a2, c11, t3	FMUL	a2, c15, t4	FSUB	c04, t1, c04	FSUB	c08, t2, c08	FSUB	c12, t3, c12	FSUB	c16, t4, c16	LDF	[AO + 15 * SIZE], a1	FMUL	a1, c04, c04	FMUL	a1, c08, c08	FMUL	a1, c12, c12	FMUL	a1, c16, c16#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	LDF	[BO +  2 * SIZE], a3	LDF	[BO +  3 * SIZE], a4	FMUL	a1, c01, c01	FMUL	a1, c02, c02	FMUL	a1, c03, c03	FMUL	a1, c04, c04	FMUL	a2, c01, t1	FMUL	a2, c02, t2	FMUL	a2, c03, t3	FMUL	a2, c04, t4	FSUB	c05, t1, c05	FSUB	c06, t2, c06	FSUB	c07, t3, c07	FSUB	c08, t4, c08	FMUL	a3, c01, t1	FMUL	a3, c02, t2	FMUL	a3, c03, t3	FMUL	a3, c04, t4	FSUB	c09, t1, c09	FSUB	c10, t2, c10	FSUB	c11, t3, c11	FSUB	c12, t4, c12	FMUL	a4, c01, t1	FMUL	a4, c02, t2	FMUL	a4, c03, t3	FMUL	a4, c04, t4	FSUB	c13, t1, c13	FSUB	c14, t2, c14	FSUB	c15, t3, c15	FSUB	c16, t4, c16	LDF	[BO +  5 * SIZE], a1	LDF	[BO +  6 * SIZE], a2	LDF	[BO +  7 * SIZE], a3	FMUL	a1, c05, c05	FMUL	a1, c06, c06	FMUL	a1, c07, c07	FMUL	a1, c08, c08	FMUL	a2, c05, t1	FMUL	a2, c06, t2	FMUL	a2, c07, t3	FMUL	a2, c08, t4	FSUB	c09, t1, c09	FSUB	c10, t2, c10	FSUB	c11, t3, c11	FSUB	c12, t4, c12	FMUL	a3, c05, t1	FMUL	a3, c06, t2	FMUL	a3, c07, t3	FMUL	a3, c08, t4	FSUB	c13, t1, c13	FSUB	c14, t2, c14	FSUB	c15, t3, c15	FSUB	c16, t4, c16	LDF	[BO + 10 * SIZE], a1	LDF	[BO + 11 * SIZE], a2	FMUL	a1, c09, c09	FMUL	a1, c10, c10	FMUL	a1, c11, c11	FMUL	a1, c12, c12	FMUL	a2, c09, t1	FMUL	a2, c10, t2	FMUL	a2, c11, t3	FMUL	a2, c12, t4	FSUB	c13, t1, c13	FSUB	c14, t2, c14	FSUB	c15, t3, c15	FSUB	c16, t4, c16	LDF	[BO + 15 * SIZE], a1	FMUL	a1, c13, c13	FMUL	a1, c14, c14	FMUL	a1, c15, c15	FMUL	a1, c16, c16#endif#ifdef RT	LDF	[BO + 15 * SIZE], a1	LDF	[BO + 14 * SIZE], a2	LDF	[BO + 13 * SIZE], a3	LDF	[BO + 12 * SIZE], a4	FMUL	a1, c13, c13	FMUL	a1, c14, c14	FMUL	a1, c15, c15	FMUL	a1, c16, c16	FMUL	a2, c13, t1	FMUL	a2, c14, t2	FMUL	a2, c15, t3	FMUL	a2, c16, t4	FSUB	c09, t1, c09	FSUB	c10, t2, c10	FSUB	c11, t3, c11	FSUB	c12, t4, c12	FMUL	a3, c13, t1	FMUL	a3, c14, t2	FMUL	a3, c15, t3	FMUL	a3, c16, t4	FSUB	c05, t1, c05	FSUB	c06, t2, c06	FSUB	c07, t3, c07	FSUB	c08, t4, c08	FMUL	a4, c13, t1	FMUL	a4, c14, t2	FMUL	a4, c15, t3	FMUL	a4, c16, t4	FSUB	c01, t1, c01	FSUB	c02, t2, c02	FSUB	c03, t3, c03	FSUB	c04, t4, c04	LDF	[BO + 10 * SIZE], a1	LDF	[BO +  9 * SIZE], a2	LDF	[BO +  8 * SIZE], a3	FMUL	a1, c09, c09	FMUL	a1, c10, c10	FMUL	a1, c11, c11	FMUL	a1, c12, c12	FMUL	a2, c09, t1	FMUL	a2, c10, t2	FMUL	a2, c11, t3	FMUL	a2, c12, t4	FSUB	c05, t1, c05	FSUB	c06, t2, c06	FSUB	c07, t3, c07	FSUB	c08, t4, c08	FMUL	a3, c09, t1	FMUL	a3, c10, t2	FMUL	a3, c11, t3	FMUL	a3, c12, t4	FSUB	c01, t1, c01	FSUB	c02, t2, c02	FSUB	c03, t3, c03	FSUB	c04, t4, c04	LDF	[BO +  5 * SIZE], a1	LDF	[BO +  4 * SIZE], a2	FMUL	a1, c05, c05	FMUL	a1, c06, c06	FMUL	a1, c07, c07	FMUL	a1, c08, c08	FMUL	a2, c05, t1	FMUL	a2, c06, t2	FMUL	a2, c07, t3	FMUL	a2, c08, t4	FSUB	c01, t1, c01	FSUB	c02, t2, c02	FSUB	c03, t3, c03	FSUB	c04, t4, c04	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02	FMUL	a1, c03, c03	FMUL	a1, c04, c04#endif#ifdef LN	add	C1, -4 * SIZE, C1	add	C2, -4 * SIZE, C2	add	C3, -4 * SIZE, C3	add	C4, -4 * SIZE, C4#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]	STF	c05, [BO +  1 * SIZE]	STF	c09, [BO +  2 * SIZE]	STF	c13, [BO +  3 * SIZE]	STF	c02, [BO +  4 * SIZE]	STF	c06, [BO +  5 * SIZE]	STF	c10, [BO +  6 * SIZE]	STF	c14, [BO +  7 * SIZE]	STF	c03, [BO +  8 * SIZE]	STF	c07, [BO +  9 * SIZE]	STF	c11, [BO + 10 * SIZE]	STF	c15, [BO + 11 * SIZE]	STF	c04, [BO + 12 * SIZE]	STF	c08, [BO + 13 * SIZE]	STF	c12, [BO + 14 * SIZE]	STF	c16, [BO + 15 * SIZE]#else	STF	c01, [AO +  0 * SIZE]	STF	c02, [AO +  1 * SIZE]	STF	c03, [AO +  2 * SIZE]	STF	c04, [AO +  3 * SIZE]	STF	c05, [AO +  4 * SIZE]	STF	c06, [AO +  5 * SIZE]	STF	c07, [AO +  6 * SIZE]	STF	c08, [AO +  7 * SIZE]	STF	c09, [AO +  8 * SIZE]	STF	c10, [AO +  9 * SIZE]	STF	c11, [AO + 10 * SIZE]	STF	c12, [AO + 11 * SIZE]	STF	c13, [AO + 12 * SIZE]	STF	c14, [AO + 13 * SIZE]	STF	c15, [AO + 14 * SIZE]	STF	c16, [AO + 15 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C1 + 2 * SIZE]	STF	c04, [C1 + 3 * SIZE]	STF	c05, [C2 + 0 * SIZE]	STF	c06, [C2 + 1 * SIZE]	STF	c07, [C2 + 2 * SIZE]	STF	c08, [C2 + 3 * SIZE]	STF	c09, [C3 + 0 * SIZE]	STF	c10, [C3 + 1 * SIZE]	STF	c11, [C3 + 2 * SIZE]	STF	c12, [C3 + 3 * SIZE]	STF	c13, [C4 + 0 * SIZE]	STF	c14, [C4 + 1 * SIZE]	STF	c15, [C4 + 2 * SIZE]	STF	c16, [C4 + 3 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4#ifndef LN	add	C1, 4 * SIZE, C1	add	C2, 4 * SIZE, C2	add	C3, 4 * SIZE, C3	add	C4, 4 * SIZE, C4#endif#ifdef RT	sll	K, 2 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	sll	TEMP1, 2 + BASE_SHIFT, TEMP1	add	AO, TEMP1, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 4, KK#endif#ifdef LN	sub	KK, 4, KK#endif	add	I, -1, I	cmp	I, 0	sra	K, 2, L	bg,pt	%icc, .LL21	FMOV	FZERO, c01.LL50:	and	M, 2, I	cmp	I, 0	ble,pn	%icc, .LL70	nop#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  1 + BASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 1 + BASE_SHIFT, TEMP1	sll	KK, 2 + BASE_SHIFT, TEMP2	add	AORIG, TEMP1, AO	add	B,     TEMP2, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	FMOV	FZERO, c02	FMOV	FZERO, t1	FMOV	FZERO, c04	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, t2	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, c06	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, t3	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, c08	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, t4	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, c01	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c03	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, c05	ble,pn	%icc, .LL55	FMOV	FZERO, c07.LL52:	FADD	c02, t1, c02	add	AO,  8 * SIZE, AO	prefetch [AO + APREFETCHSIZE * SIZE], 0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -