📄 zgemm_kernel_power3.s
字号:
mr BO, B mtspr CTR, r0 ble LL(KERNEL_M_AND_3_K_AND_3) .align 4LL(KERNEL_M_AND_3_MainLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(BO) LFD f21, 17 * SIZE(BO) LFD f22, 18 * SIZE(BO) LFD f23, 19 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 20 * SIZE(BO) LFD f25, 21 * SIZE(BO) LFD f26, 22 * SIZE(BO) LFD f27, 23 * SIZE(BO) addi BO, BO, 16 * SIZE addi AO, AO, 8 * SIZE bdnz LL(KERNEL_M_AND_3_MainLoop) .align 4LL(KERNEL_M_AND_3_K_AND_3): andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(KERNEL_M_AND3_Finish) .align 4LL(KERNEL_M_AND_3_SubLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) addi AO, AO, 2 * SIZE addi BO, BO, 4 * SIZE bdnz LL(KERNEL_M_AND_3_SubLoop) .align 4LL(KERNEL_M_AND3_Finish):#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6#endif LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2)#if defined(NN) || defined(NT) || defined(TN) || defined(TT) FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19#endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 0 * SIZE(CO2) STFD f19, 1 * SIZE(CO2) addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE addic. I, I, -1 bgt LL(KERNEL_M_AND_3_SubHead) .align 4LL(KERNEL_MainTail): mr B, BO addic. J, J, -1 lfs f0, FZERO bgt LL(KERNEL_MainHead) .align 4LL(KERNEL_N_AND_3_HEAD): andi. J, N, 1 ble LL(999) .align 4LL(KERNEL_N_AND_3_MainHead): srawi. I, M, 1 mr CO1, C add C, C, LDC mr AO, A ble LL(KERNEL_MN_AND_3_Head) .align 4LL(KERNEL_N_AND_3_SubHead): LFD f20, 0 * SIZE(AO) LFD f21, 1 * SIZE(AO) LFD f22, 2 * SIZE(AO) LFD f23, 3 * SIZE(AO) LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) LFD f27, 7 * SIZE(AO) LFD f16, 0 * SIZE(B) LFD f17, 1 * SIZE(B) LFD f18, 2 * SIZE(B) LFD f19, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(KERNEL_N_AND_3_K_AND_3) .align 4LL(KERNEL_N_AND_3_MainLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 8 * SIZE(AO) LFD f21, 9 * SIZE(AO) LFD f22, 10 * SIZE(AO) LFD f23, 11 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 12 * SIZE(AO) LFD f25, 13 * SIZE(AO) LFD f26, 14 * SIZE(AO) LFD f27, 15 * SIZE(AO) LFD f16, 4 * SIZE(BO) LFD f17, 5 * SIZE(BO) LFD f18, 6 * SIZE(BO) LFD f19, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 16 * SIZE(AO) LFD f21, 17 * SIZE(AO) LFD f22, 18 * SIZE(AO) LFD f23, 19 * SIZE(AO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f18, f26, f2 fmadd f3, f18, f27, f3 fmadd f4, f19, f24, f4 fmadd f5, f19, f25, f5 fmadd f6, f19, f26, f6 fmadd f7, f19, f27, f7 LFD f24, 20 * SIZE(AO) LFD f25, 21 * SIZE(AO) LFD f26, 22 * SIZE(AO) LFD f27, 23 * SIZE(AO) LFD f16, 8 * SIZE(BO) LFD f17, 9 * SIZE(BO) LFD f18, 10 * SIZE(BO) LFD f19, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE dcbt PREA, AO dcbt PREA, BO bdnz LL(KERNEL_N_AND_3_MainLoop) .align 4LL(KERNEL_N_AND_3_K_AND_3): andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR, r0 ble LL(KERNEL_N_AND_3_Finish) .align 4LL(KERNEL_N_AND_3_SubLoop): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f16, f22, f2 fmadd f3, f16, f23, f3 fmadd f4, f17, f20, f4 fmadd f5, f17, f21, f5 fmadd f6, f17, f22, f6 fmadd f7, f17, f23, f7 LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) LFD f23, 7 * SIZE(AO) LFD f16, 2 * SIZE(BO) LFD f17, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(KERNEL_N_AND_3_SubLoop) .align 4LL(KERNEL_N_AND_3_Finish):#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) FSUB f0, f0, f5 FADD f1, f1, f4 FSUB f2, f2, f7 FADD f3, f3, f6#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) FADD f0, f0, f5 FSUB f1, f1, f4 FADD f2, f2, f7 FSUB f3, f3, f6#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ FADD f0, f0, f5 FSUB f1, f4, f1 FADD f2, f2, f7 FSUB f3, f6, f3#endif LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1)#if defined(NN) || defined(NT) || defined(TN) || defined(TT) FMADD f16, f30, f0, f16 FMADD f17, f30, f1, f17 FMADD f18, f30, f2, f18 FMADD f19, f30, f3, f19 FNMSUB f16, f31, f1, f16 FMADD f17, f31, f0, f17 FNMSUB f18, f31, f3, f18 FMADD f19, f31, f2, f19#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC)|| defined(RR) */ FMADD f16, f30, f0, f16 FNMSUB f17, f30, f1, f17 FMADD f18, f30, f2, f18 FNMSUB f19, f30, f3, f19 FMADD f16, f31, f1, f16 FMADD f17, f31, f0, f17 FMADD f18, f31, f3, f18 FMADD f19, f31, f2, f19#endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) STFD f18, 2 * SIZE(CO1) STFD f19, 3 * SIZE(CO1) addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt LL(KERNEL_N_AND_3_SubHead) .align 4LL(KERNEL_MN_AND_3_Head): andi. I, M, 1 ble LL(KERNEL_SubEnd) .align 4LL(KERNEL_MN_AND_3_SubHead): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. r0, K, 2 mr BO, B mtspr CTR, r0 ble LL(KERNEL_MN_AND_3_K_AND_3) .align 4LL(KERNEL_MN_AND_3_MainLoop): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) fmadd f4, f18, f22, f4 fmadd f5, f19, f23, f5 fmadd f6, f19, f22, f6 fmadd f7, f18, f23, f7 LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(KERNEL_MN_AND_3_MainLoop) .align 4LL(KERNEL_MN_AND_3_K_AND_3): fadd f0, f0, f4 fadd f1, f1, f5 fadd f2, f2, f6 fadd f3, f3, f7 andi. r0, K, 3 lfd f30, ALPHA_R lfd f31, ALPHA_I mtspr CTR,r0 ble LL(KERNEL_MN_AND_3_Finish) .align 4LL(KERNEL_MN_AND_3_SubLoop): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f17, f20, f2 fmadd f3, f16, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi AO, AO, 2 * SIZE addi BO, BO, 2 * SIZE bdnz LL(KERNEL_MN_AND_3_SubLoop) .align 4LL(KERNEL_MN_AND_3_Finish):#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(CC) || defined(CR) || defined(RC) || defined(RR) fsub f0, f0, f1 fadd f2, f2, f3#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) fadd f0, f0, f1 fsub f2, f2, f3#else fadd f0, f0, f1 fsub f2, f3, f2#endif LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1)#if defined(NN) || defined(NT) || defined(TN) || defined(TT) FMADD f16, f30, f0, f16 FMADD f17, f30, f2, f17 FNMSUB f16, f31, f2, f16 FMADD f17, f31, f0, f17#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ /* defined(RC) || defined(RR) */ FMADD f16, f30, f0, f16 FNMSUB f17, f30, f2, f17 FMADD f16, f31, f2, f16 FMADD f17, f31, f0, f17#endif STFD f16, 0 * SIZE(CO1) STFD f17, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE addic. I, I, -1 bgt LL(KERNEL_MN_AND_3_SubHead) .align 4LL(KERNEL_SubEnd): mr B, BO addic. J, J, -1 bgt LL(KERNEL_N_AND_3_MainHead) .align 4LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP)#ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP)#else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP)#endif addi SP, SP, STACKSIZE blr EPILOGUE#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -