⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm_kernel_power3.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
	mr	BO,  B	mtspr	CTR, r0	ble	LL(KERNEL_M_AND_3_K_AND_3)	.align 4LL(KERNEL_M_AND_3_MainLoop):	fmadd	f0,  f16, f20, f0	fmadd	f1,  f16, f21, f1	fmadd	f2,  f16, f22, f2	fmadd	f3,  f16, f23, f3	fmadd	f4,  f17, f20, f4	fmadd	f5,  f17, f21, f5	fmadd	f6,  f17, f22, f6	fmadd	f7,  f17, f23, f7 	LFD	f20,  8 * SIZE(BO)	LFD	f21,  9 * SIZE(BO)	LFD	f22, 10 * SIZE(BO)	LFD	f23, 11 * SIZE(BO)	fmadd	f0,  f18, f24, f0	fmadd	f1,  f18, f25, f1	fmadd	f2,  f18, f26, f2	fmadd	f3,  f18, f27, f3	fmadd	f4,  f19, f24, f4	fmadd	f5,  f19, f25, f5	fmadd	f6,  f19, f26, f6	fmadd	f7,  f19, f27, f7 	LFD	f24, 12 * SIZE(BO)	LFD	f25, 13 * SIZE(BO)	LFD	f26, 14 * SIZE(BO)	LFD	f27, 15 * SIZE(BO)	LFD	f16,  4 * SIZE(AO)	LFD	f17,  5 * SIZE(AO)	LFD	f18,  6 * SIZE(AO)	LFD	f19,  7 * SIZE(AO)	fmadd	f0,  f16, f20, f0	fmadd	f1,  f16, f21, f1	fmadd	f2,  f16, f22, f2	fmadd	f3,  f16, f23, f3	fmadd	f4,  f17, f20, f4	fmadd	f5,  f17, f21, f5	fmadd	f6,  f17, f22, f6	fmadd	f7,  f17, f23, f7 	LFD	f20, 16 * SIZE(BO)	LFD	f21, 17 * SIZE(BO)	LFD	f22, 18 * SIZE(BO)	LFD	f23, 19 * SIZE(BO)	fmadd	f0,  f18, f24, f0	fmadd	f1,  f18, f25, f1	fmadd	f2,  f18, f26, f2	fmadd	f3,  f18, f27, f3	fmadd	f4,  f19, f24, f4	fmadd	f5,  f19, f25, f5	fmadd	f6,  f19, f26, f6	fmadd	f7,  f19, f27, f7	LFD	f16,  8 * SIZE(AO)	LFD	f17,  9 * SIZE(AO)	LFD	f18, 10 * SIZE(AO)	LFD	f19, 11 * SIZE(AO) 	LFD	f24, 20 * SIZE(BO)	LFD	f25, 21 * SIZE(BO)	LFD	f26, 22 * SIZE(BO)	LFD	f27, 23 * SIZE(BO)	addi	BO,  BO, 16 * SIZE	addi	AO,  AO,  8 * SIZE	bdnz	LL(KERNEL_M_AND_3_MainLoop)	.align 4LL(KERNEL_M_AND_3_K_AND_3):	andi.	r0,  K,  3	lfd	f30, ALPHA_R	lfd	f31, ALPHA_I	mtspr	CTR, r0	ble	LL(KERNEL_M_AND3_Finish)	.align 4LL(KERNEL_M_AND_3_SubLoop):	fmadd	f0,  f16, f20, f0	fmadd	f1,  f16, f21, f1	fmadd	f2,  f16, f22, f2	fmadd	f3,  f16, f23, f3	fmadd	f4,  f17, f20, f4	fmadd	f5,  f17, f21, f5	fmadd	f6,  f17, f22, f6	fmadd	f7,  f17, f23, f7 	LFD	f20,  4 * SIZE(BO)	LFD	f21,  5 * SIZE(BO)	LFD	f22,  6 * SIZE(BO)	LFD	f23,  7 * SIZE(BO)	LFD	f16,  2 * SIZE(AO)	LFD	f17,  3 * SIZE(AO)	addi	AO, AO,  2 * SIZE	addi	BO, BO,  4 * SIZE	bdnz	LL(KERNEL_M_AND_3_SubLoop)	.align 4LL(KERNEL_M_AND3_Finish):#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \      defined(CC) || defined(CR) || defined(RC) || defined(RR)	FSUB	  f0,  f0,  f5	FADD	  f1,  f1,  f4	FSUB	  f2,  f2,  f7	FADD	  f3,  f3,  f6#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)	FADD	  f0,  f0,  f5	FSUB	  f1,  f4,  f1	FADD	  f2,  f2,  f7	FSUB	  f3,  f6,  f3#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */	FADD	  f0,  f0,  f5	FSUB	  f1,  f1,  f4	FADD	  f2,  f2,  f7	FSUB	  f3,  f3,  f6#endif	LFD	f16, 0 * SIZE(CO1)	LFD	f17, 1 * SIZE(CO1)	LFD	f18, 0 * SIZE(CO2)	LFD	f19, 1 * SIZE(CO2)#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)	FMADD	f16, f30, f0,  f16	FMADD	f17, f30, f1,  f17	FMADD	f18, f30, f2,  f18	FMADD	f19, f30, f3,  f19	FNMSUB	f16, f31, f1,  f16	FMADD	f17, f31, f0,  f17	FNMSUB	f18, f31, f3,  f18	FMADD	f19, f31, f2,  f19#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */      /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */      /* defined(RC)|| defined(RR) */	FMADD	f16, f30, f0,  f16	FNMSUB	f17, f30, f1,  f17	FMADD	f18, f30, f2,  f18	FNMSUB	f19, f30, f3,  f19	FMADD	f16, f31, f1,  f16	FMADD	f17, f31, f0,  f17	FMADD	f18, f31, f3,  f18	FMADD	f19, f31, f2,  f19#endif	STFD	f16, 0 * SIZE(CO1)	STFD	f17, 1 * SIZE(CO1)	STFD	f18, 0 * SIZE(CO2)	STFD	f19, 1 * SIZE(CO2)	addi	CO1, CO1, 2 * SIZE	addi	CO2, CO2, 2 * SIZE	addic.	I, I, -1	bgt	LL(KERNEL_M_AND_3_SubHead)	.align 4LL(KERNEL_MainTail):	mr	B,  BO	addic.	J, J, -1	lfs	f0, FZERO	bgt	LL(KERNEL_MainHead)	.align 4LL(KERNEL_N_AND_3_HEAD):	andi.	J, N,  1	ble	LL(999)	.align 4LL(KERNEL_N_AND_3_MainHead):	srawi.	I,  M,  1	mr	CO1, C	add	C, C, LDC	mr	AO, A	ble	LL(KERNEL_MN_AND_3_Head)	.align 4LL(KERNEL_N_AND_3_SubHead):	LFD	f20,  0 * SIZE(AO)	LFD	f21,  1 * SIZE(AO)	LFD	f22,  2 * SIZE(AO)	LFD	f23,  3 * SIZE(AO)	LFD	f24,  4 * SIZE(AO)	LFD	f25,  5 * SIZE(AO)	LFD	f26,  6 * SIZE(AO)	LFD	f27,  7 * SIZE(AO)	LFD	f16, 0 * SIZE(B)	LFD	f17, 1 * SIZE(B)	LFD	f18, 2 * SIZE(B)	LFD	f19, 3 * SIZE(B)	lfs	f0, FZERO	fmr	f1, f0	fmr	f2, f0	fmr	f3, f0	fmr	f4, f0	fmr	f5, f0	fmr	f6, f0	fmr	f7, f0	srawi.	r0,  K,  2	mr	BO, B	mtspr	CTR, r0	ble	LL(KERNEL_N_AND_3_K_AND_3)	.align 4LL(KERNEL_N_AND_3_MainLoop):	fmadd	f0,  f16, f20, f0	fmadd	f1,  f16, f21, f1	fmadd	f2,  f16, f22, f2	fmadd	f3,  f16, f23, f3	fmadd	f4,  f17, f20, f4	fmadd	f5,  f17, f21, f5	fmadd	f6,  f17, f22, f6	fmadd	f7,  f17, f23, f7	LFD	f20,  8 * SIZE(AO)	LFD	f21,  9 * SIZE(AO)	LFD	f22, 10 * SIZE(AO)	LFD	f23, 11 * SIZE(AO)	fmadd	f0,  f18, f24, f0	fmadd	f1,  f18, f25, f1	fmadd	f2,  f18, f26, f2	fmadd	f3,  f18, f27, f3	fmadd	f4,  f19, f24, f4	fmadd	f5,  f19, f25, f5	fmadd	f6,  f19, f26, f6	fmadd	f7,  f19, f27, f7	LFD	f24, 12 * SIZE(AO)	LFD	f25, 13 * SIZE(AO)	LFD	f26, 14 * SIZE(AO)	LFD	f27, 15 * SIZE(AO)	LFD	f16,  4 * SIZE(BO)	LFD	f17,  5 * SIZE(BO)	LFD	f18,  6 * SIZE(BO)	LFD	f19,  7 * SIZE(BO)	fmadd	f0,  f16, f20, f0	fmadd	f1,  f16, f21, f1	fmadd	f2,  f16, f22, f2	fmadd	f3,  f16, f23, f3	fmadd	f4,  f17, f20, f4	fmadd	f5,  f17, f21, f5	fmadd	f6,  f17, f22, f6	fmadd	f7,  f17, f23, f7	LFD	f20, 16 * SIZE(AO)	LFD	f21, 17 * SIZE(AO)	LFD	f22, 18 * SIZE(AO)	LFD	f23, 19 * SIZE(AO)	fmadd	f0,  f18, f24, f0	fmadd	f1,  f18, f25, f1	fmadd	f2,  f18, f26, f2	fmadd	f3,  f18, f27, f3	fmadd	f4,  f19, f24, f4	fmadd	f5,  f19, f25, f5	fmadd	f6,  f19, f26, f6	fmadd	f7,  f19, f27, f7	LFD	f24, 20 * SIZE(AO)	LFD	f25, 21 * SIZE(AO)	LFD	f26, 22 * SIZE(AO)	LFD	f27, 23 * SIZE(AO)	LFD	f16,  8 * SIZE(BO)	LFD	f17,  9 * SIZE(BO)	LFD	f18, 10 * SIZE(BO)	LFD	f19, 11 * SIZE(BO)	addi	AO, AO, 16 * SIZE	addi	BO, BO,  8 * SIZE 	dcbt	PREA, AO 	dcbt	PREA, BO	bdnz	LL(KERNEL_N_AND_3_MainLoop)	.align 4LL(KERNEL_N_AND_3_K_AND_3):	andi.	r0,  K,  3	lfd	f30, ALPHA_R	lfd	f31, ALPHA_I	mtspr	CTR, r0	ble	LL(KERNEL_N_AND_3_Finish)	.align 4LL(KERNEL_N_AND_3_SubLoop):	fmadd	f0,  f16, f20, f0	fmadd	f1,  f16, f21, f1	fmadd	f2,  f16, f22, f2	fmadd	f3,  f16, f23, f3	fmadd	f4,  f17, f20, f4	fmadd	f5,  f17, f21, f5	fmadd	f6,  f17, f22, f6	fmadd	f7,  f17, f23, f7	LFD	f20, 4 * SIZE(AO)	LFD	f21, 5 * SIZE(AO)	LFD	f22, 6 * SIZE(AO)	LFD	f23, 7 * SIZE(AO)	LFD	f16, 2 * SIZE(BO)	LFD	f17, 3 * SIZE(BO)	addi	BO,  BO, 2 * SIZE	addi	AO,  AO, 4 * SIZE	bdnz	LL(KERNEL_N_AND_3_SubLoop)	.align 4LL(KERNEL_N_AND_3_Finish):#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \      defined(CC) || defined(CR) || defined(RC) || defined(RR)	FSUB	  f0,  f0,  f5	FADD	  f1,  f1,  f4	FSUB	  f2,  f2,  f7	FADD	  f3,  f3,  f6#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)	FADD	  f0,  f0,  f5	FSUB	  f1,  f1,  f4	FADD	  f2,  f2,  f7	FSUB	  f3,  f3,  f6#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */	FADD	  f0,  f0,  f5	FSUB	  f1,  f4,  f1	FADD	  f2,  f2,  f7	FSUB	  f3,  f6,  f3#endif	LFD	f16, 0 * SIZE(CO1)	LFD	f17, 1 * SIZE(CO1)	LFD	f18, 2 * SIZE(CO1)	LFD	f19, 3 * SIZE(CO1)#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)	FMADD	f16, f30, f0,  f16	FMADD	f17, f30, f1,  f17	FMADD	f18, f30, f2,  f18	FMADD	f19, f30, f3,  f19	FNMSUB	f16, f31, f1,  f16	FMADD	f17, f31, f0,  f17	FNMSUB	f18, f31, f3,  f18	FMADD	f19, f31, f2,  f19#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */      /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */      /* defined(RC)|| defined(RR) */	FMADD	f16, f30, f0,  f16	FNMSUB	f17, f30, f1,  f17	FMADD	f18, f30, f2,  f18	FNMSUB	f19, f30, f3,  f19	FMADD	f16, f31, f1,  f16	FMADD	f17, f31, f0,  f17	FMADD	f18, f31, f3,  f18	FMADD	f19, f31, f2,  f19#endif	STFD	f16, 0 * SIZE(CO1)	STFD	f17, 1 * SIZE(CO1)	STFD	f18, 2 * SIZE(CO1)	STFD	f19, 3 * SIZE(CO1)	addi	CO1, CO1, 4 * SIZE	addic.	I, I, -1	bgt	LL(KERNEL_N_AND_3_SubHead)	.align 4LL(KERNEL_MN_AND_3_Head):	andi.	I,  M,  1	ble	LL(KERNEL_SubEnd)	.align 4LL(KERNEL_MN_AND_3_SubHead):	LFD	f16, 0 * SIZE(AO)	LFD	f17, 1 * SIZE(AO)	LFD	f18, 2 * SIZE(AO)	LFD	f19, 3 * SIZE(AO)	LFD	f20, 0 * SIZE(B)	LFD	f21, 1 * SIZE(B)	LFD	f22, 2 * SIZE(B)	LFD	f23, 3 * SIZE(B)	lfs	f0, FZERO	fmr	f1, f0	fmr	f2, f0	fmr	f3, f0	fmr	f4, f0	fmr	f5, f0	fmr	f6, f0	fmr	f7, f0	srawi.	r0,  K,  2	mr	BO, B	mtspr	CTR, r0	ble	LL(KERNEL_MN_AND_3_K_AND_3)	.align 4LL(KERNEL_MN_AND_3_MainLoop):	fmadd	f0,  f16,  f20,  f0	fmadd	f1,  f17,  f21,  f1	fmadd	f2,  f17,  f20,  f2	fmadd	f3,  f16,  f21,  f3	LFD	f16,  4 * SIZE(AO)	LFD	f17,  5 * SIZE(AO)	LFD	f20,  4 * SIZE(BO)	LFD	f21,  5 * SIZE(BO)	fmadd	f4,  f18,  f22,  f4	fmadd	f5,  f19,  f23,  f5	fmadd	f6,  f19,  f22,  f6	fmadd	f7,  f18,  f23,  f7	LFD	f18,  6 * SIZE(AO)	LFD	f19,  7 * SIZE(AO)	LFD	f22,  6 * SIZE(BO)	LFD	f23,  7 * SIZE(BO)	fmadd	f0,  f16,  f20,  f0	fmadd	f1,  f17,  f21,  f1	fmadd	f2,  f17,  f20,  f2	fmadd	f3,  f16,  f21,  f3	LFD	f16,  8 * SIZE(AO)	LFD	f17,  9 * SIZE(AO)	LFD	f20,  8 * SIZE(BO)	LFD	f21,  9 * SIZE(BO)	fmadd	f4,  f18,  f22,  f4	fmadd	f5,  f19,  f23,  f5	fmadd	f6,  f19,  f22,  f6	fmadd	f7,  f18,  f23,  f7	LFD	f18, 10 * SIZE(AO)	LFD	f19, 11 * SIZE(AO)	LFD	f22, 10 * SIZE(BO)	LFD	f23, 11 * SIZE(BO)	addi	AO, AO,  8 * SIZE	addi	BO, BO,  8 * SIZE	bdnz	LL(KERNEL_MN_AND_3_MainLoop)	.align 4LL(KERNEL_MN_AND_3_K_AND_3):	fadd	f0, f0, f4	fadd	f1, f1, f5	fadd	f2, f2, f6	fadd	f3, f3, f7	andi.	r0,  K,  3	lfd	f30, ALPHA_R	lfd	f31, ALPHA_I	mtspr	CTR,r0	ble	LL(KERNEL_MN_AND_3_Finish)	.align 4LL(KERNEL_MN_AND_3_SubLoop):	fmadd	f0,  f16,  f20,  f0	fmadd	f1,  f17,  f21,  f1	fmadd	f2,  f17,  f20,  f2	fmadd	f3,  f16,  f21,  f3	LFD	f16, 2 * SIZE(AO)	LFD	f17, 3 * SIZE(AO)	LFD	f20, 2 * SIZE(BO)	LFD	f21, 3 * SIZE(BO)	addi	AO, AO, 2 * SIZE	addi	BO, BO, 2 * SIZE	bdnz	LL(KERNEL_MN_AND_3_SubLoop)	.align 4LL(KERNEL_MN_AND_3_Finish):#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \      defined(CC) || defined(CR) || defined(RC) || defined(RR)	fsub	f0, f0, f1	fadd	f2, f2, f3#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)	fadd	f0, f0, f1	fsub	f2, f2, f3#else	fadd	f0, f0, f1	fsub	f2, f3, f2#endif	LFD	f16,  0 * SIZE(CO1)	LFD	f17,  1 * SIZE(CO1)#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)	FMADD	f16, f30, f0, f16	FMADD	f17, f30, f2, f17	FNMSUB	f16, f31, f2, f16	FMADD	f17, f31, f0, f17#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */      /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */      /* defined(RC) || defined(RR) */	FMADD	f16, f30, f0, f16	FNMSUB	f17, f30, f2, f17	FMADD	f16, f31, f2, f16	FMADD	f17, f31, f0, f17#endif	STFD	f16,  0 * SIZE(CO1)	STFD	f17,  1 * SIZE(CO1)	addi	CO1, CO1, 2 * SIZE	addic.	I, I, -1	bgt	LL(KERNEL_MN_AND_3_SubHead)	.align 4LL(KERNEL_SubEnd):	mr	B,  BO	addic.	J, J, -1	bgt	LL(KERNEL_N_AND_3_MainHead)	.align 4LL(999):	addi	r3, 0, 0	lfd	f14,    0(SP)	lfd	f15,    8(SP)	lfd	f16,   16(SP)	lfd	f17,   24(SP)	lfd	f18,   32(SP)	lfd	f19,   40(SP)	lfd	f20,   48(SP)	lfd	f21,   56(SP)	lfd	f22,   64(SP)	lfd	f23,   72(SP)	lfd	f24,   80(SP)	lfd	f25,   88(SP)	lfd	f26,   96(SP)	lfd	f27,  104(SP)	lfd	f28,  112(SP)	lfd	f29,  120(SP)	lfd	f30,  128(SP)	lfd	f31,  136(SP)#ifdef __64BIT__	ld	r31,  144(SP)	ld	r30,  152(SP)	ld	r29,  160(SP)	ld	r28,  168(SP)	ld	r27,  176(SP)	ld	r26,  184(SP)	ld	r25,  192(SP)	ld	r24,  200(SP)#else	lwz	r31,  144(SP)	lwz	r30,  148(SP)	lwz	r29,  152(SP)	lwz	r28,  156(SP)	lwz	r27,  160(SP)	lwz	r26,  164(SP)	lwz	r25,  168(SP)	lwz	r24,  172(SP)#endif	addi	SP, SP, STACKSIZE	blr	EPILOGUE#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -