⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ztrsm_kernel_ln.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
	FADD1	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD3	c13, t4, c13	add	L, -1, L	FMUL	a2, b4, t4	LDF	[AO - 11 * SIZE], a2	FADD2	c02, t1, c02	nop	FMUL	a3, b1, t1	nop	FADD4	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD2	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD4	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO - 10 * SIZE], a3	FADD1	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO -  8 * SIZE], b1	FADD3	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO - 11 * SIZE], b2	FADD1	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO - 10 * SIZE], b3	FADD3	c15, t4, c15	nop	FMUL	a4, b4, t4	LDF	[BO -  9 * SIZE], b4	FADD2	c04, t1, c04	nop	FMUL	a5, b5, t1	LDF	[AO -  9 * SIZE], a4	FADD4	c08, t2, c08	nop	FMUL	a5, b2, t2	nop	FADD2	c12, t3, c12	nop	FMUL	a5, b3, t3	nop	FADD4	c16, t4, c16	nop	FMUL	a5, b4, t4	LDF	[AO - 4 * SIZE], a5	FADD1	c01, t1, c01	nop	FMUL	a2, b5, t1	nop	FADD3	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD1	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD3	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO -  7 * SIZE], a2	FADD2	c02, t1, c02	nop	FMUL	a3, b5, t1	nop	FADD4	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD2	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD4	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO -  6 * SIZE], a3	FADD1	c03, t1, c03	nop	FMUL	a4, b5, t1	LDF	[BO - 4 * SIZE], b5	FADD3	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO -  7 * SIZE], b2	FADD1	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO -  6 * SIZE], b3	FADD3	c15, t4, c15	nop	FMUL	a4, b4, t4	LDF	[BO -  5 * SIZE], b4	FADD2	c04, t1, c04	nop	FMUL	a1, b1, t1	LDF	[AO -  5 * SIZE], a4	FADD4	c08, t2, c08	nop	FMUL	a1, b2, t2	nop	FADD2	c12, t3, c12	nop	FMUL	a1, b3, t3	nop	FADD4	c16, t4, c16	nop	FMUL	a1, b4, t4	LDF	[AO -  0 * SIZE], a1	FADD1	c01, t1, c01	nop	FMUL	a2, b1, t1	nop#ifdef DOUBLE	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY#else	nop#endif	FADD3	c05, t2, c05	nop	FMUL	a2, b2, t2	FADD1	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD3	c13, t4, c13	nop	FMUL	a2, b4, t4	nop	FADD2	c02, t1, c02	nop	FMUL	a3, b1, t1	LDF	[AO - 3 * SIZE], a2	FADD4	c06, t2, c06#ifdef DOUBLE	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY#else	nop#endif	FMUL	a3, b2, t2	nop	FADD2	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD4	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO - 2 * SIZE], a3	FADD1	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO -  0 * SIZE], b1	FADD3	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO - 3 * SIZE], b2	FADD1	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO - 2 * SIZE], b3	FADD3	c15, t4, c15	nop	FMUL	a4, b4, t4	LDF	[BO - 1 * SIZE], b4	FADD2	c04, t1, c04	nop	FMUL	a5, b5, t1	LDF	[AO - 1 * SIZE], a4	FADD4	c08, t2, c08	FMUL	a5, b2, t2	FADD2	c12, t3, c12	FMUL	a5, b3, t3	FADD4	c16, t4, c16	nop	FMUL	a5, b4, t4	LDF	[AO +  4 * SIZE], a5	FADD1	c01, t1, c01	nop	FMUL	a2, b5, t1	nop	FADD3	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD1	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD3	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO +  1 * SIZE], a2	FADD2	c02, t1, c02	nop	FMUL	a3, b5, t1	nop	FADD4	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD2	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD4	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO +  2 * SIZE], a3	FADD1	c03, t1, c03	cmp	L, 0	FMUL	a4, b5, t1	LDF	[BO +  4 * SIZE], b5	FADD3	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO +  1 * SIZE], b2	FADD1	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO +  2 * SIZE], b3	FADD3	c15, t4, c15	FMUL	a4, b4, t4	bg,pt	%icc, .LL22	LDF	[BO +  3 * SIZE], b4.LL25:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,pn %icc, .LL29	nop	.LL26:	FADD2	c04, t1, c04	LDF	[AO +  3 * SIZE], a4	FMUL	a1, b1, t1	add	AO, 4 * SIZE, AO	FADD4	c08, t2, c08	add	BO, 4 * SIZE, BO	FMUL	a1, b2, t2	add	L, -1, L	FADD2	c12, t3, c12	nop	FMUL	a1, b3, t3	cmp	L, 0	FADD4	c16, t4, c16	nop	FMUL	a1, b4, t4	LDF	[AO + 0 * SIZE], a1	FADD1	c01, t1, c01	nop	FMUL	a2, b1, t1	nop	FADD3	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD1	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD3	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO + 1 * SIZE], a2	FADD2	c02, t1, c02	nop	FMUL	a3, b1, t1	nop	FADD4	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD2	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD4	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO + 2 * SIZE], a3	FADD1	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO + 0 * SIZE], b1	FADD3	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO + 1 * SIZE], b2	FADD1	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO + 2 * SIZE], b3	FADD3	c15, t4, c15	FMUL	a4, b4, t4	bg,pt	%icc, .LL26	LDF	[BO + 3 * SIZE], b4.LL29:#if defined(LN) || defined(RT)	sub	KK, 2, TEMP1	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1	add	AORIG, TEMP1, AO	add	B,     TEMP1, BO#endif	FADD2	c04, t1, c04	FADD4	c08, t2, c08	FADD2	c12, t3, c12	FADD4	c16, t4, c16	FADD	c01, c06, c01	FADD	c02, c05, c02	FADD	c03, c08, c03	FADD	c04, c07, c04	FADD	c09, c14, c09	FADD	c10, c13, c10	FADD	c11, c16, c11	FADD	c12, c15, c12#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	LDF	[BO +  2 * SIZE], a3	LDF	[BO +  3 * SIZE], a4	LDF	[BO +  4 * SIZE], b1	LDF	[BO +  5 * SIZE], b2	LDF	[BO +  6 * SIZE], b3	LDF	[BO +  7 * SIZE], b4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c09, c09	FSUB	a4, c10, c10	FSUB	b1, c03, c03	FSUB	b2, c04, c04	FSUB	b3, c11, c11	FSUB	b4, c12, c12#else	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	LDF	[AO +  4 * SIZE], b1	LDF	[AO +  5 * SIZE], b2	LDF	[AO +  6 * SIZE], b3	LDF	[AO +  7 * SIZE], b4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04	FSUB	b1, c09, c09	FSUB	b2, c10, c10	FSUB	b3, c11, c11	FSUB	b4, c12, c12#endif#ifdef LN	LDF	[AO +  6 * SIZE], a1	LDF	[AO +  7 * SIZE], a2	LDF	[AO +  4 * SIZE], a3	LDF	[AO +  5 * SIZE], a4	LDF	[AO +  0 * SIZE], b1	LDF	[AO +  1 * SIZE], b2	FMUL	a1, c03, t1	FMUL	a2, c04, t2	FMUL	a1, c04, t3	FMUL	a2, c03, t4	FMUL	a1, c11, t5	FMUL	a2, c12, t6	FMUL	a1, c12, t7	FMUL	a2, c11, t8	FADD4	t1, t2, c03	FADD2	t3, t4, c04	FADD4	t5, t6, c11	FADD2	t7, t8, c12	FMUL	a3, c03, t1	FMUL	a3, c04, t2	FMUL	a3, c11, t3	FMUL	a3, c12, t4	FMUL	a4, c04, t5	FMUL	a4, c03, t6	FMUL	a4, c12, t7	FMUL	a4, c11, t8	FSUB	c01, t1, c01	FSUB	c02, t2, c02	FSUB	c09, t3, c09	FSUB	c10, t4, c10	FADD2	c01, t5, c01	FADD4	c02, t6, c02	FADD2	c09, t7, c09	FADD4	c10, t8, c10	FMUL	b1, c01, t1	FMUL	b2, c02, t2	FMUL	b1, c02, t3	FMUL	b2, c01, t4	FMUL	b1, c09, t5	FMUL	b2, c10, t6	FMUL	b1, c10, t7	FMUL	b2, c09, t8	FADD4	t1, t2, c01	FADD2	t3, t4, c02	FADD4	t5, t6, c09	FADD2	t7, t8, c10#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	LDF	[AO +  6 * SIZE], b1	LDF	[AO +  7 * SIZE], b2	FMUL	a1, c01, t1	FMUL	a2, c02, t2	FMUL	a1, c02, t3	FMUL	a2, c01, t4	FMUL	a1, c09, t5	FMUL	a2, c10, t6	FMUL	a1, c10, t7	FMUL	a2, c09, t8	FADD4	t1, t2, c01	FADD2	t3, t4, c02	FADD4	t5, t6, c09	FADD2	t7, t8, c10	FMUL	a3, c01, t1	FMUL	a3, c02, t2	FMUL	a3, c09, t3	FMUL	a3, c10, t4	FMUL	a4, c02, t5	FMUL	a4, c01, t6	FMUL	a4, c10, t7	FMUL	a4, c09, t8	FSUB	c03, t1, c03	FSUB	c04, t2, c04	FSUB	c11, t3, c11	FSUB	c12, t4, c12	FADD2	c03, t5, c03	FADD4	c04, t6, c04	FADD2	c11, t7, c11	FADD4	c12, t8, c12	FMUL	b1, c03, t1	FMUL	b2, c04, t2	FMUL	b1, c04, t3	FMUL	b2, c03, t4	FMUL	b1, c11, t5	FMUL	b2, c12, t6	FMUL	b1, c12, t7	FMUL	b2, c11, t8	FADD4	t1, t2, c03	FADD2	t3, t4, c04	FADD4	t5, t6, c11	FADD2	t7, t8, c12#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	LDF	[BO +  2 * SIZE], a3	LDF	[BO +  3 * SIZE], a4	LDF	[BO +  6 * SIZE], b1	LDF	[BO +  7 * SIZE], b2	FMUL	a1, c01, t1	FMUL	a2, c02, t2	FMUL	a1, c02, t3	FMUL	a2, c01, t4	FMUL	a1, c03, t5	FMUL	a2, c04, t6	FMUL	a1, c04, t7	FMUL	a2, c03, t8	FADD4	t1, t2, c01	FADD3	t3, t4, c02	FADD4	t5, t6, c03	FADD3	t7, t8, c04	FMUL	a3, c01, t1	FMUL	a3, c02, t2	FMUL	a3, c03, t3	FMUL	a3, c04, t4	FMUL	a4, c02, t5	FMUL	a4, c01, t6	FMUL	a4, c04, t7	FMUL	a4, c03, t8	FSUB	c09, t1, c09	FSUB	c10, t2, c10	FSUB	c11, t3, c11	FSUB	c12, t4, c12	FADD3	c09, t5, c09	FADD4	c10, t6, c10	FADD3	c11, t7, c11	FADD4	c12, t8, c12	FMUL	b1, c09, t1	FMUL	b2, c10, t2	FMUL	b1, c10, t3	FMUL	b2, c09, t4	FMUL	b1, c11, t5	FMUL	b2, c12, t6	FMUL	b1, c12, t7	FMUL	b2, c11, t8	FADD4	t1, t2, c09	FADD3	t3, t4, c10	FADD4	t5, t6, c11	FADD3	t7, t8, c12#endif#ifdef RT	LDF	[BO +  6 * SIZE], a1	LDF	[BO +  7 * SIZE], a2	LDF	[BO +  4 * SIZE], a3	LDF	[BO +  5 * SIZE], a4	LDF	[BO +  0 * SIZE], b1	LDF	[BO +  1 * SIZE], b2	FMUL	a1, c09, t1	FMUL	a2, c10, t2	FMUL	a1, c10, t3	FMUL	a2, c09, t4	FMUL	a1, c11, t5	FMUL	a2, c12, t6	FMUL	a1, c12, t7	FMUL	a2, c11, t8	FADD4	t1, t2, c09	FADD3	t3, t4, c10	FADD4	t5, t6, c11	FADD3	t7, t8, c12	FMUL	a3, c09, t1	FMUL	a3, c10, t2	FMUL	a3, c11, t3	FMUL	a3, c12, t4	FMUL	a4, c10, t5	FMUL	a4, c09, t6	FMUL	a4, c12, t7	FMUL	a4, c11, t8	FSUB	c01, t1, c01	FSUB	c02, t2, c02	FSUB	c03, t3, c03	FSUB	c04, t4, c04	FADD3	c01, t5, c01	FADD4	c02, t6, c02	FADD3	c03, t7, c03	FADD4	c04, t8, c04	FMUL	b1, c01, t1	FMUL	b2, c02, t2	FMUL	b1, c02, t3	FMUL	b2, c01, t4	FMUL	b1, c03, t5	FMUL	b2, c04, t6	FMUL	b1, c04, t7	FMUL	b2, c03, t8	FADD4	t1, t2, c01	FADD3	t3, t4, c02	FADD4	t5, t6, c03	FADD3	t7, t8, c04#endif#ifdef LN	add	C1, -4 * SIZE, C1	add	C2, -4 * SIZE, C2#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]	STF	c02, [BO +  1 * SIZE]	STF	c09, [BO +  2 * SIZE]	STF	c10, [BO +  3 * SIZE]	STF	c03, [BO +  4 * SIZE]	STF	c04, [BO +  5 * SIZE]	STF	c11, [BO +  6 * SIZE]	STF	c12, [BO +  7 * SIZE]#else	STF	c01, [AO +  0 * SIZE]	STF	c02, [AO +  1 * SIZE]	STF	c03, [AO +  2 * SIZE]	STF	c04, [AO +  3 * SIZE]	STF	c09, [AO +  4 * SIZE]	STF	c10, [AO +  5 * SIZE]	STF	c11, [AO +  6 * SIZE]	STF	c12, [AO +  7 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C1 + 2 * SIZE]	STF	c04, [C1 + 3 * SIZE]	STF	c09, [C2 + 0 * SIZE]	STF	c10, [C2 + 1 * SIZE]	STF	c11, [C2 + 2 * SIZE]	STF	c12, [C2 + 3 * SIZE]#ifndef LN	add	C1, 4 * SIZE, C1	add	C2, 4 * SIZE, C2#endif#ifdef RT	sll	K, 1 + ZBASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP1	add	AO, TEMP1, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 2, KK#endif#ifdef LN	sub	KK, 2, KK#endif	add	I, -1, I	cmp	I, 0	bg,pt	%icc, .LL21	nop.LL99:#ifdef LN	sll	K, 1 + ZBASE_SHIFT, TEMP1	add	B, TEMP1, B#endif#if defined(LT) || defined(RN)	mov	BO, B#endif#ifdef RN	add	KK, 2, KK#endif#ifdef RT	sub	KK, 2, KK#endif	add	J, -1, J	cmp	J, 0	bg,pt	%icc, .LL11	nop.LL100:	and	N, 1, J	cmp	J, 0	ble,pn	%icc, .LL999	nop#ifdef RT	sll	K, 0 + ZBASE_SHIFT, TEMP1	sub	B, TEMP1, B	sub	C, LDC, C#endif	mov	C, C1#ifdef LN	add	M, OFFSET, KK#endif#ifdef LT	mov	OFFSET, KK#endif#if defined(LN) || defined(RT)	mov	A, AORIG

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -