⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
	/*
	 * Excerpt of the complex GEMM/TRMM micro-kernel (C-preprocessed
	 * PowerPC assembly).  Visible here:
	 *   - tail of the K remainder loop LL(16) of the 2x2 tile,
	 *   - LL(KERNEL_MainFinish): combine partial products, scale by
	 *     alpha and store the 2x2 tile to CO1/CO2,
	 *   - LL(20)..LL(27): the same for one leftover row (M & 1),
	 *   - LL(29)/LL(30): advance to the next column pair / odd column,
	 *   - LL(31): start of the single-column path (continues past the
	 *     end of this excerpt).
	 * Macros such as LL(), SIZE, LFD/STFD, FMADD, ZBASE_SHIFT and the
	 * register aliases (AO, BO, CO1, CO2, K, KK, TEMP, ...) are defined
	 * elsewhere.  Comments below assume interleaved (re, im) storage of
	 * A, B and C -- TODO confirm against the packing routines.
	 */
	fmadd	f1,  f17, f20, f1
	fmadd	f2,  f18, f20, f2
	fmadd	f3,  f19, f20, f3
	fmadd	f4,  f16, f21, f4
	fmadd	f6,  f18, f21, f6
	fmadd	f7,  f19, f21, f7
	fmadd	f8,  f16, f22, f8
	fmadd	f9,  f17, f22, f9
	fmadd	f11, f19, f22, f11
	fmadd	f12, f16, f23, f12
	fmadd	f13, f17, f23, f13
	fmadd	f14, f18, f23, f14

	/* Preload the next four A and B values, then step both panels. */
	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)
	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)
	addi	BO, BO,  4 * SIZE
	addi	AO, AO,  4 * SIZE
	bdnz	LL(16)
	.align 4

LL(KERNEL_MainFinish):
	/*
	 * 2x2 tile epilogue.  f0..f15 hold the sixteen partial products
	 * accumulated from f16..f19 (A) times f20..f23 (B).  They are
	 * folded pairwise so that f0/f1, f2/f3 (first column, CO1) and
	 * f8/f9, f10/f11 (second column, CO2) carry the result; which
	 * operand is subtracted depends on the conjugation variant.
	 */
#ifndef TRMMKERNEL
	/* GEMM accumulates into C, so fetch the current CO1 values. */
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 2 * SIZE(CO1)
	LFD	f19, 3 * SIZE(CO1)
#endif
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(CC) || defined(CR) || defined(RC) || defined(RR)
	FSUB	  f0,  f0,  f5
	FADD	  f1,  f1,  f4
	FSUB	  f2,  f2,  f7
	FADD	  f3,  f3,  f6
#ifndef TRMMKERNEL
	LFD	f20, 0 * SIZE(CO2)
	LFD	f21, 1 * SIZE(CO2)
	LFD	f22, 2 * SIZE(CO2)
	LFD	f23, 3 * SIZE(CO2)
#endif
	FSUB	  f8,  f8,  f13
	FADD	  f9,  f9,  f12
	FSUB	  f10, f10, f15
	FADD	  f11, f11, f14
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
	FADD	  f0,  f0,  f5
	FSUB	  f1,  f1,  f4
	FADD	  f2,  f2,  f7
	FSUB	  f3,  f3,  f6
#ifndef TRMMKERNEL
	LFD	f20, 0 * SIZE(CO2)
	LFD	f21, 1 * SIZE(CO2)
	LFD	f22, 2 * SIZE(CO2)
	LFD	f23, 3 * SIZE(CO2)
#endif
	FADD	  f8,  f8,  f13
	FSUB	  f9,  f9,  f12
	FADD	  f10, f10, f15
	FSUB	  f11, f11, f14
#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
	FADD	  f0,  f0,  f5
	FSUB	  f1,  f4,  f1
	FADD	  f2,  f2,  f7
	FSUB	  f3,  f6,  f3
#ifndef TRMMKERNEL
	LFD	f20, 0 * SIZE(CO2)
	LFD	f21, 1 * SIZE(CO2)
	LFD	f22, 2 * SIZE(CO2)
	LFD	f23, 3 * SIZE(CO2)
#endif
	FADD	  f8,  f8,  f13
	FSUB	  f9,  f12, f9
	FADD	  f10, f10, f15
	FSUB	  f11, f14, f11
#endif

	/*
	 * Scale by alpha.  f30/f31 are used as ALPHA_R/ALPHA_I (the 1x2
	 * path loads them at LL(25); for this path the load is expected
	 * to have happened before LL(KERNEL_MainFinish) -- TODO confirm).
	 */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	/* even regs: c + alpha_r*re - alpha_i*im; odd: c + alpha_r*im + alpha_i*re */
#ifndef TRMMKERNEL
	FMADD	f16, f30, f0,  f16
	FMADD	f17, f30, f1,  f17
	FMADD	f18, f30, f2,  f18
	FMADD	f19, f30, f3,  f19
	FMADD	f20, f30, f8,  f20
	FMADD	f21, f30, f9,  f21
	FMADD	f22, f30, f10, f22
	FMADD	f23, f30, f11, f23
#else
	/* TRMM overwrites C: start from the product instead of c. */
	FMUL	f16, f30, f0
	FMUL	f17, f30, f1
	FMUL	f18, f30, f2
	FMUL	f19, f30, f3
	FMUL	f20, f30, f8
	FMUL	f21, f30, f9
	FMUL	f22, f30, f10
	FMUL	f23, f30, f11
#endif
	FNMSUB	f16, f31, f1,  f16
	FMADD	f17, f31, f0,  f17
	FNMSUB	f18, f31, f3,  f18
	FMADD	f19, f31, f2,  f19
	FNMSUB	f20, f31, f9,  f20
	FMADD	f21, f31, f8,  f21
	FNMSUB	f22, f31, f11, f22
	FMADD	f23, f31, f10, f23
#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
      /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
      /* defined(RC)|| defined(RR) */
	/*
	 * In the conjugated variants the odd accumulators (f1, f3, f9,
	 * f11) were folded with the opposite sign above, so here:
	 * even regs: c + alpha_r*f_even + alpha_i*f_odd
	 * odd  regs: c - alpha_r*f_odd  + alpha_i*f_even
	 */
#ifndef TRMMKERNEL
	FMADD	f16, f30, f0,  f16
	FNMSUB	f17, f30, f1,  f17
	FMADD	f18, f30, f2,  f18
	FNMSUB	f19, f30, f3,  f19
	FMADD	f20, f30, f8,  f20
	FNMSUB	f21, f30, f9,  f21
	FMADD	f22, f30, f10, f22
	FNMSUB	f23, f30, f11, f23
	FMADD	f16, f31, f1,  f16
	FMADD	f17, f31, f0,  f17
	FMADD	f18, f31, f3,  f18
	FMADD	f19, f31, f2,  f19
	FMADD	f20, f31, f9,  f20
	FMADD	f21, f31, f8,  f21
	FMADD	f22, f31, f11, f22
	FMADD	f23, f31, f10, f23
#else
	/*
	 * Same formula as the branch above with c = 0.  The odd registers
	 * are seeded with alpha_i*f_even and then FNMSUB subtracts
	 * alpha_r*f_odd, giving alpha_i*f_even - alpha_r*f_odd.  (An
	 * FMUL-by-alpha_r followed by FNMADD would negate the alpha_i
	 * term as well and match the GEMM branch only for ALPHA_I == 0.)
	 */
	FMUL	f16, f30, f0
	FMUL	f17, f31, f0
	FMUL	f18, f30, f2
	FMUL	f19, f31, f2
	FMUL	f20, f30, f8
	FMUL	f21, f31, f8
	FMUL	f22, f30, f10
	FMUL	f23, f31, f10
	FMADD	f16, f31, f1,  f16
	FNMSUB	f17, f30, f1,  f17
	FMADD	f18, f31, f3,  f18
	FNMSUB	f19, f30, f3,  f19
	FMADD	f20, f31, f9,  f20
	FNMSUB	f21, f30, f9,  f21
	FMADD	f22, f31, f11, f22
	FNMSUB	f23, f30, f11, f23
#endif
#endif

	/* Store the tile; the accumulator reset is interleaved with the stores. */
	STFD	f16,  0 * SIZE(CO1)
	STFD	f17,  1 * SIZE(CO1)
	STFD	f18,  2 * SIZE(CO1)
	STFD	f19,  3 * SIZE(CO1)
	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	STFD	f20,  0 * SIZE(CO2)
	STFD	f21,  1 * SIZE(CO2)
	STFD	f22,  2 * SIZE(CO2)
	STFD	f23,  3 * SIZE(CO2)
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0
	fmr	f8,  f0
	fmr	f9,  f0
	fmr	f10, f0
	fmr	f11, f0
	fmr	f12, f0
	fmr	f13, f0
	fmr	f14, f0
	fmr	f15, f0
	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the (K - KK - 2) iterations that were not processed for this tile. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -2
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, TEMP
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	addi	KK, KK, 2
#endif
#endif
	/* Next 2-row tile while I > 0. */
	addic.	I, I, -1
	bgt	LL(11)
	.align 4

LL(20):
	/* Leftover single row (M & 1) against the current two columns. */
	andi.	I,  M,  1
	ble	LL(29)
#ifndef TRMMKERNEL
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)
	lfs	f0, FZERO
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	fmr	f4, f0
	fmr	f5, f0
	fmr	f6, f0
	fmr	f7, f0
	/* CTR = K / 4 passes of the unrolled loop; skip it when that is <= 0. */
	srawi.	r0,  K,  2
	mr	BO,  B
	mtspr	CTR, r0
	ble	LL(25)
#else
#if	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)
	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)
	mr	BO,  B
#else
	/* Start KK iterations into the panels: one complex per step in A, two in B. */
	slwi	r0,   KK, 0 + ZBASE_SHIFT
	slwi	TEMP, KK, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, B,  TEMP
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)
	LFD	f20,  0 * SIZE(BO)
	LFD	f21,  1 * SIZE(BO)
	LFD	f22,  2 * SIZE(BO)
	LFD	f23,  3 * SIZE(BO)
	LFD	f24,  4 * SIZE(BO)
	LFD	f25,  5 * SIZE(BO)
	LFD	f26,  6 * SIZE(BO)
	LFD	f27,  7 * SIZE(BO)
#endif
	/* TEMP = number of K iterations for this TRMM tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 2
#endif
	srawi.	TEMP,  TEMP,  2
	mtspr	CTR, TEMP
	ble	LL(25)
#endif
	.align 4

LL(22):
	/*
	 * Main loop of the 1x2 tile, four K steps per pass: consumes
	 * 8 * SIZE of A and 16 * SIZE of B, with loads for later steps
	 * interleaved between the multiply-adds.
	 */
	fmadd	f0,  f16, f20, f0
	fmadd	f1,  f16, f21, f1
	fmadd	f2,  f16, f22, f2
	fmadd	f3,  f16, f23, f3
	fmadd	f4,  f17, f20, f4
	fmadd	f5,  f17, f21, f5
	fmadd	f6,  f17, f22, f6
	fmadd	f7,  f17, f23, f7
	LFD	f20,  8 * SIZE(BO)
	LFD	f21,  9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)
	fmadd	f0,  f18, f24, f0
	fmadd	f1,  f18, f25, f1
	fmadd	f2,  f18, f26, f2
	fmadd	f3,  f18, f27, f3
	fmadd	f4,  f19, f24, f4
	fmadd	f5,  f19, f25, f5
	fmadd	f6,  f19, f26, f6
	fmadd	f7,  f19, f27, f7
	LFD	f24, 12 * SIZE(BO)
	LFD	f25, 13 * SIZE(BO)
	LFD	f26, 14 * SIZE(BO)
	LFD	f27, 15 * SIZE(BO)
	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)
	fmadd	f0,  f16, f20, f0
	fmadd	f1,  f16, f21, f1
	fmadd	f2,  f16, f22, f2
	fmadd	f3,  f16, f23, f3
	fmadd	f4,  f17, f20, f4
	fmadd	f5,  f17, f21, f5
	fmadd	f6,  f17, f22, f6
	fmadd	f7,  f17, f23, f7
	LFD	f20, 16 * SIZE(BO)
	LFD	f21, 17 * SIZE(BO)
	LFD	f22, 18 * SIZE(BO)
	LFD	f23, 19 * SIZE(BO)
	fmadd	f0,  f18, f24, f0
	fmadd	f1,  f18, f25, f1
	fmadd	f2,  f18, f26, f2
	fmadd	f3,  f18, f27, f3
	fmadd	f4,  f19, f24, f4
	fmadd	f5,  f19, f25, f5
	fmadd	f6,  f19, f26, f6
	fmadd	f7,  f19, f27, f7
	LFD	f16,  8 * SIZE(AO)
	LFD	f17,  9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)
	LFD	f24, 20 * SIZE(BO)
	LFD	f25, 21 * SIZE(BO)
	LFD	f26, 22 * SIZE(BO)
	LFD	f27, 23 * SIZE(BO)
	addi	BO,  BO, 16 * SIZE
	addi	AO,  AO,  8 * SIZE
	bdnz	LL(22)
	.align 4

LL(25):
	/* Remainder count (iterations mod 4) and alpha load for the 1x2 tile. */
#ifndef TRMMKERNEL
	andi.	r0,  K,  3
	lfd	f30, ALPHA_R
	lfd	f31, ALPHA_I
	mtspr	CTR, r0
	ble	LL(27)
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 2
#endif
	andi.	TEMP,  TEMP,  3
	lfd	f30, ALPHA_R
	lfd	f31, ALPHA_I
	mtspr	CTR, TEMP
	ble	LL(27)
#endif
	.align 4

LL(26):
	/* One K step per pass: 2 * SIZE of A against 4 * SIZE of B. */
	fmadd	f0,  f16, f20, f0
	fmadd	f1,  f16, f21, f1
	fmadd	f2,  f16, f22, f2
	fmadd	f3,  f16, f23, f3
	fmadd	f4,  f17, f20, f4
	fmadd	f5,  f17, f21, f5
	fmadd	f6,  f17, f22, f6
	fmadd	f7,  f17, f23, f7
	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)
	LFD	f16,  2 * SIZE(AO)
	LFD	f17,  3 * SIZE(AO)
	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(26)
	.align 4

LL(27):
	/*
	 * 1x2 tile epilogue.  Here f0..f3 come from the first A value
	 * (f16) and f4..f7 from the second (f17), so the roles of f1 and
	 * f4 are swapped relative to the 2x2 tile; that is why the FSUB
	 * operand order of the two conjugated cases is exchanged compared
	 * with LL(KERNEL_MainFinish).
	 */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(CC) || defined(CR) || defined(RC) || defined(RR)
	FSUB	  f0,  f0,  f5
	FADD	  f1,  f1,  f4
	FSUB	  f2,  f2,  f7
	FADD	  f3,  f3,  f6
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
	FADD	  f0,  f0,  f5
	FSUB	  f1,  f4,  f1
	FADD	  f2,  f2,  f7
	FSUB	  f3,  f6,  f3
#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
	FADD	  f0,  f0,  f5
	FSUB	  f1,  f1,  f4
	FADD	  f2,  f2,  f7
	FSUB	  f3,  f3,  f6
#endif
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f19, 1 * SIZE(CO2)
#endif
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#ifndef TRMMKERNEL
	FMADD	f16, f30, f0,  f16
	FMADD	f17, f30, f1,  f17
	FMADD	f18, f30, f2,  f18
	FMADD	f19, f30, f3,  f19
#else
	FMUL	f16, f30, f0
	FMUL	f17, f30, f1
	FMUL	f18, f30, f2
	FMUL	f19, f30, f3
#endif
	FNMSUB	f16, f31, f1,  f16
	FMADD	f17, f31, f0,  f17
	FNMSUB	f18, f31, f3,  f18
	FMADD	f19, f31, f2,  f19
#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
      /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
      /* defined(RC)|| defined(RR) */
#ifndef TRMMKERNEL
	FMADD	f16, f30, f0,  f16
	FNMSUB	f17, f30, f1,  f17
	FMADD	f18, f30, f2,  f18
	FNMSUB	f19, f30, f3,  f19
	FMADD	f16, f31, f1,  f16
	FMADD	f17, f31, f0,  f17
	FMADD	f18, f31, f3,  f18
	FMADD	f19, f31, f2,  f19
#else
	/*
	 * Same result as the GEMM branch above with c = 0:
	 * f17 = alpha_i*f0 - alpha_r*f1, f19 = alpha_i*f2 - alpha_r*f3.
	 * Seeding with alpha_i*f_even and using FNMSUB keeps the sign of
	 * the alpha_i term correct for a non-zero ALPHA_I.
	 */
	FMUL	f16, f30, f0
	FMUL	f17, f31, f0
	FMUL	f18, f30, f2
	FMUL	f19, f31, f2
	FMADD	f16, f31, f1,  f16
	FNMSUB	f17, f30, f1,  f17
	FMADD	f18, f31, f3,  f18
	FNMSUB	f19, f30, f3,  f19
#endif
#endif
	STFD	f16, 0 * SIZE(CO1)
	STFD	f17, 1 * SIZE(CO1)
	STFD	f18, 0 * SIZE(CO2)
	STFD	f19, 1 * SIZE(CO2)
	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unprocessed tail: one complex per step in A, two in B. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -1
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	r0,   TEMP, 0 + ZBASE_SHIFT
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	addi	KK, KK, 1
#endif
#endif
	.align 4

LL(29):
	/* End of a column pair: B continues where BO stopped; loop while J > 0. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	addi	KK, KK, 2
#endif
	mr	B,  BO
	addic.	J, J, -1
	/* lfs leaves CR0 untouched, so bgt still tests the addic. result. */
	lfs	f0, FZERO
	bgt	LL(10)
	.align 4

LL(30):
	/* Odd trailing column (N & 1); nothing left to do when it is absent. */
	andi.	J, N,  1
	ble	LL(999)
#if defined(TRMMKERNEL) && defined(LEFT)
	mr	KK, OFFSET
#endif
	/* I = M / 2 two-row tiles; CO1 takes this column and C moves on by LDC. */
	srawi.	I,  M,  1
	mr	CO1, C
	add	C, C, LDC
	mr	AO, A
	ble	LL(40)
	.align 4

LL(31):
	/* 2x1 tile setup: eight values of A in f20..f27, four of B in f16..f19. */
#ifndef TRMMKERNEL
	LFD	f20,  0 * SIZE(AO)
	LFD	f21,  1 * SIZE(AO)
	LFD	f22,  2 * SIZE(AO)
	LFD	f23,  3 * SIZE(AO)
	LFD	f24,  4 * SIZE(AO)
	LFD	f25,  5 * SIZE(AO)
	LFD	f26,  6 * SIZE(AO)
	LFD	f27,  7 * SIZE(AO)
	LFD	f16, 0 * SIZE(B)
	LFD	f17, 1 * SIZE(B)
	LFD	f18, 2 * SIZE(B)
	LFD	f19, 3 * SIZE(B)
	lfs	f0, FZERO
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	fmr	f4, f0
	fmr	f5, f0
	fmr	f6, f0
	fmr	f7, f0
	srawi.	r0,  K,  2
	mr	BO, B
	mtspr	CTR, r0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -