📄 zgemm_kernel.s
字号:
/*
 * Fragment of a complex GEMM/TRMM micro-kernel (PowerPC assembly, run
 * through the C preprocessor).  Visible here:
 *   - tail of the loop that branches back to LL(16) (head not in view)
 *   - LL(KERNEL_MainFinish): combine + scale + store of a tile written to
 *     4 elements of CO1 and 4 elements of CO2
 *   - LL(20)..LL(27): the "M & 1" tile (2 elements of CO1, 2 of CO2)
 *   - LL(29)/LL(30): end of the J loop and entry of the "N & 1" path
 *   - LL(31): first instructions only (continues outside this view)
 *
 * NOTE(review): comments below assume FMADD/FNMSUB/FNMADD/FMUL/FADD/FSUB
 * and LFD/STFD expand to the PowerPC instructions of the same name; the
 * macro definitions are not part of this view.
 */

/* ---- tail of the LL(16) loop: one more step of the product sums ---- */
	fmadd	f1,  f17, f20, f1
	fmadd	f2,  f18, f20, f2
	fmadd	f3,  f19, f20, f3

	fmadd	f4,  f16, f21, f4
	fmadd	f6,  f18, f21, f6
	fmadd	f7,  f19, f21, f7

	fmadd	f8,  f16, f22, f8
	fmadd	f9,  f17, f22, f9
	fmadd	f11, f19, f22, f11

	fmadd	f12, f16, f23, f12
	fmadd	f13, f17, f23, f13
	fmadd	f14, f18, f23, f14

	/* Reload operands for the next step, then advance both pointers by 4. */
	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)

	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)

	addi	BO, BO,  4 * SIZE
	addi	AO, AO,  4 * SIZE
	bdnz	LL(16)
	.align 4

/* ---- combine partial sums, scale by f30/f31, write back to CO1/CO2 ---- */
LL(KERNEL_MainFinish):
#ifndef TRMMKERNEL
	/* Non-TRMM build accumulates into the existing CO1 contents. */
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 2 * SIZE(CO1)
	LFD	f19, 3 * SIZE(CO1)
#endif

	/*
	 * Fold the sixteen partial sums into eight values; the add/subtract
	 * pattern depends on the variant macro.  The CO2 loads are placed
	 * between the two halves of each arm.
	 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(CC) || defined(CR) || defined(RC) || defined(RR)
	FSUB	f0, f0, f5
	FADD	f1, f1, f4
	FSUB	f2, f2, f7
	FADD	f3, f3, f6

#ifndef TRMMKERNEL
	LFD	f20, 0 * SIZE(CO2)
	LFD	f21, 1 * SIZE(CO2)
	LFD	f22, 2 * SIZE(CO2)
	LFD	f23, 3 * SIZE(CO2)
#endif

	FSUB	f8,  f8,  f13
	FADD	f9,  f9,  f12
	FSUB	f10, f10, f15
	FADD	f11, f11, f14

#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
	FADD	f0, f0, f5
	FSUB	f1, f1, f4
	FADD	f2, f2, f7
	FSUB	f3, f3, f6

#ifndef TRMMKERNEL
	LFD	f20, 0 * SIZE(CO2)
	LFD	f21, 1 * SIZE(CO2)
	LFD	f22, 2 * SIZE(CO2)
	LFD	f23, 3 * SIZE(CO2)
#endif

	FADD	f8,  f8,  f13
	FSUB	f9,  f9,  f12
	FADD	f10, f10, f15
	FSUB	f11, f11, f14

#else	/* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
	FADD	f0, f0, f5
	FSUB	f1, f4, f1
	FADD	f2, f2, f7
	FSUB	f3, f6, f3

#ifndef TRMMKERNEL
	LFD	f20, 0 * SIZE(CO2)
	LFD	f21, 1 * SIZE(CO2)
	LFD	f22, 2 * SIZE(CO2)
	LFD	f23, 3 * SIZE(CO2)
#endif

	FADD	f8,  f8,  f13
	FSUB	f9,  f12, f9
	FADD	f10, f10, f15
	FSUB	f11, f14, f11
#endif

	/*
	 * Scale by f30/f31.  NOTE(review): presumably f30 = ALPHA_R and
	 * f31 = ALPHA_I, as loaded at LL(25) for the smaller tile; the load
	 * for this path is outside the visible range.
	 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#ifndef TRMMKERNEL
	/* out = C + f30 * value */
	FMADD	f16, f30, f0,  f16
	FMADD	f17, f30, f1,  f17
	FMADD	f18, f30, f2,  f18
	FMADD	f19, f30, f3,  f19

	FMADD	f20, f30, f8,  f20
	FMADD	f21, f30, f9,  f21
	FMADD	f22, f30, f10, f22
	FMADD	f23, f30, f11, f23
#else
	/* TRMM: nothing is read from C, start from f30 * value. */
	FMUL	f16, f30, f0
	FMUL	f17, f30, f1
	FMUL	f18, f30, f2
	FMUL	f19, f30, f3

	FMUL	f20, f30, f8
	FMUL	f21, f30, f9
	FMUL	f22, f30, f10
	FMUL	f23, f30, f11
#endif

	/* Cross terms: even outputs -= f31 * odd, odd outputs += f31 * even. */
	FNMSUB	f16, f31, f1,  f16
	FMADD	f17, f31, f0,  f17
	FNMSUB	f18, f31, f3,  f18
	FMADD	f19, f31, f2,  f19

	FNMSUB	f20, f31, f9,  f20
	FMADD	f21, f31, f8,  f21
	FNMSUB	f22, f31, f11, f22
	FMADD	f23, f31, f10, f23

#else	/* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
	/* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
	/* defined(RC)|| defined(RR) */

#ifndef TRMMKERNEL
	/*
	 * even = C + f30*even_in + f31*odd_in
	 * odd  = C - f30*odd_in  + f31*even_in
	 */
	FMADD	f16, f30, f0,  f16
	FNMSUB	f17, f30, f1,  f17
	FMADD	f18, f30, f2,  f18
	FNMSUB	f19, f30, f3,  f19

	FMADD	f20, f30, f8,  f20
	FNMSUB	f21, f30, f9,  f21
	FMADD	f22, f30, f10, f22
	FNMSUB	f23, f30, f11, f23

	FMADD	f16, f31, f1,  f16
	FMADD	f17, f31, f0,  f17
	FMADD	f18, f31, f3,  f18
	FMADD	f19, f31, f2,  f19

	FMADD	f20, f31, f9,  f20
	FMADD	f21, f31, f8,  f21
	FMADD	f22, f31, f11, f22
	FMADD	f23, f31, f10, f23
#else
	/*
	 * TRMM arm.  It has to produce the same expressions as the non-TRMM
	 * arm above with the C term dropped:
	 *     even =  f30*even_in + f31*odd_in
	 *     odd  =  f31*even_in - f30*odd_in
	 * The earlier sequence (FMUL odd,f30,odd_in ; FNMADD odd,f31,even_in,odd)
	 * produced -(f31*even_in + f30*odd_in), i.e. the f31 term had the
	 * wrong sign whenever f31 != 0.  The odd outputs are therefore seeded
	 * with f31*even_in and finished with FNMSUB (d = b - a*c).
	 */
	FMUL	f16, f30, f0
	FMUL	f17, f31, f0
	FMUL	f18, f30, f2
	FMUL	f19, f31, f2

	FMUL	f20, f30, f8
	FMUL	f21, f31, f8
	FMUL	f22, f30, f10
	FMUL	f23, f31, f10

	FMADD	f16, f31, f1,  f16
	FNMSUB	f17, f30, f1,  f17
	FMADD	f18, f31, f3,  f18
	FNMSUB	f19, f30, f3,  f19

	FMADD	f20, f31, f9,  f20
	FNMSUB	f21, f30, f9,  f21
	FMADD	f22, f31, f11, f22
	FNMSUB	f23, f30, f11, f23
#endif
#endif

	/* Store the tile; the accumulators f0..f15 are cleared in between. */
	STFD	f16, 0 * SIZE(CO1)
	STFD	f17, 1 * SIZE(CO1)
	STFD	f18, 2 * SIZE(CO1)
	STFD	f19, 3 * SIZE(CO1)

	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0

	STFD	f20, 0 * SIZE(CO2)
	STFD	f21, 1 * SIZE(CO2)
	STFD	f22, 2 * SIZE(CO2)
	STFD	f23, 3 * SIZE(CO2)

	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	fmr	f8,  f0
	fmr	f9,  f0
	fmr	f10, f0
	fmr	f11, f0

	fmr	f12, f0
	fmr	f13, f0
	fmr	f14, f0
	fmr	f15, f0

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip AO and BO past the remaining (K - KK - 2) steps. */
	sub	TEMP, K, KK
	/* Both arms subtract 2 for this tile; kept parallel to the 1-wide tile. */
#ifdef LEFT
	addi	TEMP, TEMP, -2
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, TEMP
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 2
#endif
#endif

	/* Next tile of the I loop. */
	addic.	I, I, -1
	bgt	LL(11)
	.align 4

/* ---- remaining tile when M is odd: 2 elements of CO1, 2 of CO2 ---- */
LL(20):
	andi.	I,  M,  1
	ble	LL(29)			/* M even: nothing left in this J step */

#ifndef TRMMKERNEL
	/* Preload 4 values from AO and 8 from B, clear f0..f7. */
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)

	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)

	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	/* CTR = K / 4: LL(22) consumes four steps per trip. */
	srawi.	r0,  K,  2
	mr	BO,  B
	mtspr	CTR, r0
	ble	LL(25)
#else
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(B)
	LFD	f21,  1 * SIZE(B)
	LFD	f22,  2 * SIZE(B)
	LFD	f23,  3 * SIZE(B)

	LFD	f24,  4 * SIZE(B)
	LFD	f25,  5 * SIZE(B)
	LFD	f26,  6 * SIZE(B)
	LFD	f27,  7 * SIZE(B)
	mr	BO,  B
#else
	/* Offset AO by KK << ZBASE_SHIFT and BO by KK << (1 + ZBASE_SHIFT). */
	slwi	r0,   KK, 0 + ZBASE_SHIFT
	slwi	TEMP, KK, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, B,  TEMP

	LFD	f16,  0 * SIZE(AO)
	LFD	f17,  1 * SIZE(AO)
	LFD	f18,  2 * SIZE(AO)
	LFD	f19,  3 * SIZE(AO)

	LFD	f20,  0 * SIZE(BO)
	LFD	f21,  1 * SIZE(BO)
	LFD	f22,  2 * SIZE(BO)
	LFD	f23,  3 * SIZE(BO)

	LFD	f24,  4 * SIZE(BO)
	LFD	f25,  5 * SIZE(BO)
	LFD	f26,  6 * SIZE(BO)
	LFD	f27,  7 * SIZE(BO)
#endif

	/* Trip count for TRMM: K - KK, KK + 1 or KK + 2 depending on variant. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 2
#endif
	srawi.	TEMP,  TEMP,  2
	mtspr	CTR, TEMP
	ble	LL(25)
#endif
	.align 4

/* Main loop: four steps per trip; AO advances by 8, BO by 16. */
LL(22):
	fmadd	f0,  f16, f20, f0
	fmadd	f1,  f16, f21, f1
	fmadd	f2,  f16, f22, f2
	fmadd	f3,  f16, f23, f3

	fmadd	f4,  f17, f20, f4
	fmadd	f5,  f17, f21, f5
	fmadd	f6,  f17, f22, f6
	fmadd	f7,  f17, f23, f7

	LFD	f20,  8 * SIZE(BO)
	LFD	f21,  9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)

	fmadd	f0,  f18, f24, f0
	fmadd	f1,  f18, f25, f1
	fmadd	f2,  f18, f26, f2
	fmadd	f3,  f18, f27, f3

	fmadd	f4,  f19, f24, f4
	fmadd	f5,  f19, f25, f5
	fmadd	f6,  f19, f26, f6
	fmadd	f7,  f19, f27, f7

	LFD	f24, 12 * SIZE(BO)
	LFD	f25, 13 * SIZE(BO)
	LFD	f26, 14 * SIZE(BO)
	LFD	f27, 15 * SIZE(BO)

	LFD	f16,  4 * SIZE(AO)
	LFD	f17,  5 * SIZE(AO)
	LFD	f18,  6 * SIZE(AO)
	LFD	f19,  7 * SIZE(AO)

	fmadd	f0,  f16, f20, f0
	fmadd	f1,  f16, f21, f1
	fmadd	f2,  f16, f22, f2
	fmadd	f3,  f16, f23, f3

	fmadd	f4,  f17, f20, f4
	fmadd	f5,  f17, f21, f5
	fmadd	f6,  f17, f22, f6
	fmadd	f7,  f17, f23, f7

	LFD	f20, 16 * SIZE(BO)
	LFD	f21, 17 * SIZE(BO)
	LFD	f22, 18 * SIZE(BO)
	LFD	f23, 19 * SIZE(BO)

	fmadd	f0,  f18, f24, f0
	fmadd	f1,  f18, f25, f1
	fmadd	f2,  f18, f26, f2
	fmadd	f3,  f18, f27, f3

	fmadd	f4,  f19, f24, f4
	fmadd	f5,  f19, f25, f5
	fmadd	f6,  f19, f26, f6
	fmadd	f7,  f19, f27, f7

	LFD	f16,  8 * SIZE(AO)
	LFD	f17,  9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)

	LFD	f24, 20 * SIZE(BO)
	LFD	f25, 21 * SIZE(BO)
	LFD	f26, 22 * SIZE(BO)
	LFD	f27, 23 * SIZE(BO)

	addi	BO, BO, 16 * SIZE
	addi	AO, AO,  8 * SIZE
	bdnz	LL(22)
	.align 4

/* Remainder setup: CTR = (trip count) & 3; load f30/f31 from ALPHA_R/ALPHA_I. */
LL(25):
#ifndef TRMMKERNEL
	andi.	r0,  K,  3
	lfd	f30,  ALPHA_R
	lfd	f31,  ALPHA_I
	mtspr	CTR, r0
	ble	LL(27)
#else
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 2
#endif
	andi.	TEMP,  TEMP,  3
	lfd	f30,  ALPHA_R
	lfd	f31,  ALPHA_I
	mtspr	CTR, TEMP
	ble	LL(27)
#endif
	.align 4

/* Remainder loop: one step per trip; AO advances by 2, BO by 4. */
LL(26):
	fmadd	f0,  f16, f20, f0
	fmadd	f1,  f16, f21, f1
	fmadd	f2,  f16, f22, f2
	fmadd	f3,  f16, f23, f3

	fmadd	f4,  f17, f20, f4
	fmadd	f5,  f17, f21, f5
	fmadd	f6,  f17, f22, f6
	fmadd	f7,  f17, f23, f7

	LFD	f20,  4 * SIZE(BO)
	LFD	f21,  5 * SIZE(BO)
	LFD	f22,  6 * SIZE(BO)
	LFD	f23,  7 * SIZE(BO)

	LFD	f16,  2 * SIZE(AO)
	LFD	f17,  3 * SIZE(AO)

	addi	AO, AO,  2 * SIZE
	addi	BO, BO,  4 * SIZE
	bdnz	LL(26)
	.align 4

/* ---- combine, scale and store the 1-wide tile ---- */
LL(27):
	/*
	 * Here f1/f3 were built from f16 and f4/f6 from f17, the opposite
	 * pairing to the wider tile above, so the operand order of the FSUBs
	 * in the CN.. and NC.. arms is swapped relative to that tile.
	 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(CC) || defined(CR) || defined(RC) || defined(RR)
	FSUB	f0, f0, f5
	FADD	f1, f1, f4
	FSUB	f2, f2, f7
	FADD	f3, f3, f6

#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
	FADD	f0, f0, f5
	FSUB	f1, f4, f1
	FADD	f2, f2, f7
	FSUB	f3, f6, f3

#else	/* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
	FADD	f0, f0, f5
	FSUB	f1, f1, f4
	FADD	f2, f2, f7
	FSUB	f3, f3, f6
#endif

#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f19, 1 * SIZE(CO2)
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#ifndef TRMMKERNEL
	FMADD	f16, f30, f0, f16
	FMADD	f17, f30, f1, f17
	FMADD	f18, f30, f2, f18
	FMADD	f19, f30, f3, f19
#else
	FMUL	f16, f30, f0
	FMUL	f17, f30, f1
	FMUL	f18, f30, f2
	FMUL	f19, f30, f3
#endif

	FNMSUB	f16, f31, f1, f16
	FMADD	f17, f31, f0, f17
	FNMSUB	f18, f31, f3, f18
	FMADD	f19, f31, f2, f19

#else	/* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
	/* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
	/* defined(RC)|| defined(RR) */

#ifndef TRMMKERNEL
	FMADD	f16, f30, f0, f16
	FNMSUB	f17, f30, f1, f17
	FMADD	f18, f30, f2, f18
	FNMSUB	f19, f30, f3, f19

	FMADD	f16, f31, f1, f16
	FMADD	f17, f31, f0, f17
	FMADD	f18, f31, f3, f18
	FMADD	f19, f31, f2, f19
#else
	/*
	 * TRMM arm: must equal the non-TRMM arm above without the C term,
	 *     f16 = f30*f0 + f31*f1        f17 = f31*f0 - f30*f1
	 * The earlier FMUL f17,f30,f1 ; FNMADD f17,f31,f0,f17 pair gave
	 * -(f31*f0 + f30*f1), flipping the sign of the f31*f0 term whenever
	 * f31 != 0.  Seed with f31*even and finish with FNMSUB instead.
	 */
	FMUL	f16, f30, f0
	FMUL	f17, f31, f0
	FMUL	f18, f30, f2
	FMUL	f19, f31, f2

	FMADD	f16, f31, f1, f16
	FNMSUB	f17, f30, f1, f17
	FMADD	f18, f31, f3, f18
	FNMSUB	f19, f30, f3, f19
#endif
#endif

	STFD	f16, 0 * SIZE(CO1)
	STFD	f17, 1 * SIZE(CO1)
	STFD	f18, 0 * SIZE(CO2)
	STFD	f19, 1 * SIZE(CO2)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE

#ifdef TRMMKERNEL
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* Skip the unused tail: AO by one unit per step, BO by two. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -1
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	r0,   TEMP, 0 + ZBASE_SHIFT
	slwi	TEMP, TEMP, 1 + ZBASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif

#ifdef LEFT
	addi	KK, KK, 1
#endif
#endif
	.align 4

/* ---- end of one J step: commit B, loop while J > 0 ---- */
LL(29):
#if defined(TRMMKERNEL) && !defined(LEFT)
	addi	KK, KK, 2
#endif

	mr	B,  BO
	addic.	J, J, -1
	lfs	f0, FZERO
	bgt	LL(10)
	.align 4

/* ---- N odd: handle the final single step of the J dimension ---- */
LL(30):
	andi.	J, N,  1
	ble	LL(999)

#if defined(TRMMKERNEL) && defined(LEFT)
	mr	KK, OFFSET
#endif

	/* I = M / 2; only CO1 is used on this path. */
	srawi.	I, M,  1
	mr	CO1, C
	add	C,  C, LDC
	mr	AO, A
	ble	LL(40)
	.align 4

/* Head of the next tile loop; the rest of it lies outside this view. */
LL(31):
#ifndef TRMMKERNEL
	LFD	f20,  0 * SIZE(AO)
	LFD	f21,  1 * SIZE(AO)
	LFD	f22,  2 * SIZE(AO)
	LFD	f23,  3 * SIZE(AO)

	LFD	f24,  4 * SIZE(AO)
	LFD	f25,  5 * SIZE(AO)
	LFD	f26,  6 * SIZE(AO)
	LFD	f27,  7 * SIZE(AO)

	LFD	f16,  0 * SIZE(B)
	LFD	f17,  1 * SIZE(B)
	LFD	f18,  2 * SIZE(B)
	LFD	f19,  3 * SIZE(B)

	lfs	f0,  FZERO
	fmr	f1,  f0
	fmr	f2,  f0
	fmr	f3,  f0
	fmr	f4,  f0
	fmr	f5,  f0
	fmr	f6,  f0
	fmr	f7,  f0

	srawi.	r0,  K,  2
	mr	BO,  B
	mtspr	CTR, r0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -