📄 gemm_kernel_cell.s
字号:
/*
 * Excerpt from the interior of a PowerPC GEMM/TRMM micro-kernel; the routine's
 * entry, LL(10), LL(22), LL(60) and LL(70) lie outside this excerpt.
 *
 * Register use visible in this excerpt:
 *   f0..f15   accumulators (reset from FZERO after each tile is stored)
 *   f16..f19  values loaded through AO
 *   f20..f27  values loaded through B / BO
 *   f30       alpha, loaded from ALPHA
 *   CTR       trip count for every bdnz loop
 *   CO1..CO4  output pointers; AO / BO walking input pointers
 *
 * FMADD / FMUL / FADD / LFD / STFD, SIZE and BASE_SHIFT are macros defined
 * elsewhere. Comments below assume FMADD d, a, b, c yields a * b + c like the
 * PowerPC fmadd instruction -- TODO confirm against the macro definitions.
 *
 * Under TRMMKERNEL the loop trip counts are derived from K and KK instead of
 * K alone, and AO / BO / KK are adjusted after each tile.
 */

/* ---- tail of loop LL(22) (loop head is outside this excerpt) ---- */
	bdnz	LL(22)

	/* Fold the second accumulator set into the first. */
	fadd	f0, f2, f0
	fadd	f1, f3, f1
	fadd	f4, f6, f4
	fadd	f5, f7, f5
	fadd	f8, f10, f8
	fadd	f9, f11, f9
	fadd	f12, f14, f12
	fadd	f13, f15, f13
	.align 4

/* ---- LL(25): remainder setup for the tile with 2 A values x 4 B values ---- */
LL(25):
	lfd	f30, ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 4
#endif
	/* CTR = low two bits of the TRMM trip count. */
	andi.	TEMP, TEMP, 3
	mtspr	CTR, TEMP
#else
	/* CTR = K & 3. */
	andi.	r0, K, 3
	mtspr	CTR, r0
#endif
	/* andi. set CR0; skip the remainder loop when the count is zero. */
	ble+	LL(28)
	.align 4

/* LL(26): one step per iteration; AO advances 2 elements, BO advances 4. */
LL(26):
	FMADD	f0, f16, f20, f0
	FMADD	f1, f17, f20, f1
	FMADD	f4, f16, f21, f4
	FMADD	f5, f17, f21, f5
	FMADD	f8, f16, f22, f8
	FMADD	f9, f17, f22, f9
	FMADD	f12, f16, f23, f12
	FMADD	f13, f17, f23, f13
	/* Load the operands for the next step before bumping the pointers. */
	LFD	f16, 2 * SIZE(AO)
	LFD	f17, 3 * SIZE(AO)
	LFD	f20, 4 * SIZE(BO)
	LFD	f21, 5 * SIZE(BO)
	LFD	f22, 6 * SIZE(BO)
	LFD	f23, 7 * SIZE(BO)
	addi	BO, BO, 4 * SIZE
	addi	AO, AO, 2 * SIZE
	bdnz	LL(26)
	.align 4

/* ---- LL(28): scale by alpha and write 2 elements to each of CO1..CO4 ---- */
LL(28):
#ifndef TRMMKERNEL
	/* Non-TRMM build: result = accumulator * alpha + value already at CO*. */
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f19, 1 * SIZE(CO2)
	FMADD	f0, f0, f30, f16
	FMADD	f1, f1, f30, f17
	FMADD	f4, f4, f30, f18
	FMADD	f5, f5, f30, f19
	LFD	f20, 0 * SIZE(CO3)
	LFD	f21, 1 * SIZE(CO3)
	LFD	f22, 0 * SIZE(CO4)
	LFD	f23, 1 * SIZE(CO4)
	FMADD	f8, f8, f30, f20
	FMADD	f9, f9, f30, f21
	FMADD	f12, f12, f30, f22
	FMADD	f13, f13, f30, f23
#else
	/* TRMM build: result = accumulator * alpha; CO* is not read. */
	FMUL	f0, f0, f30
	FMUL	f1, f1, f30
	FMUL	f4, f4, f30
	FMUL	f5, f5, f30
	FMUL	f8, f8, f30
	FMUL	f9, f9, f30
	FMUL	f12, f12, f30
	FMUL	f13, f13, f30
#endif
	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 1 * SIZE(CO1)
	STFD	f4, 0 * SIZE(CO2)
	STFD	f5, 1 * SIZE(CO2)
	/* Reset accumulators f0..f15 from FZERO, interleaved with the stores. */
	lfs	f0, FZERO
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	STFD	f8, 0 * SIZE(CO3)
	STFD	f9, 1 * SIZE(CO3)
	STFD	f12, 0 * SIZE(CO4)
	STFD	f13, 1 * SIZE(CO4)
	fmr	f4, f0
	fmr	f5, f0
	fmr	f6, f0
	fmr	f7, f0
	fmr	f8, f0
	fmr	f9, f0
	fmr	f10, f0
	fmr	f11, f0
	fmr	f12, f0
	fmr	f13, f0
	fmr	f14, f0
	fmr	f15, f0
	/* Step each output pointer past the 2 elements just written. */
	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	addi	CO3, CO3, 2 * SIZE
	addi	CO4, CO4, 2 * SIZE
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (2 or 4); advance AO by TEMP*2 and BO by TEMP*4 elements. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -2
#else
	addi	TEMP, TEMP, -4
#endif
	slwi	r0, TEMP, 1 + BASE_SHIFT
	slwi	TEMP, TEMP, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	addi	KK, KK, 2
#endif
#endif
	.align 4

/* ---- LL(30): tile with 1 A value x 4 B values, taken only when M & 1 ---- */
LL(30):
	andi.	I, M, 1
	ble	LL(39)
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* Start from the current AO and the beginning of B. */
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(B)
	LFD	f21, 1 * SIZE(B)
	LFD	f22, 2 * SIZE(B)
	LFD	f23, 3 * SIZE(B)
	LFD	f24, 4 * SIZE(B)
	LFD	f25, 5 * SIZE(B)
	LFD	f26, 6 * SIZE(B)
	LFD	f27, 7 * SIZE(B)
	mr	BO, B
#else
	/* Offset the start: AO += KK*1 elements, BO = B + KK*4 elements. */
	slwi	r0, KK, 0 + BASE_SHIFT
	slwi	TEMP, KK, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B, TEMP
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(BO)
	LFD	f21, 1 * SIZE(BO)
	LFD	f22, 2 * SIZE(BO)
	LFD	f23, 3 * SIZE(BO)
	LFD	f24, 4 * SIZE(BO)
	LFD	f25, 5 * SIZE(BO)
	LFD	f26, 6 * SIZE(BO)
	LFD	f27, 7 * SIZE(BO)
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 4
#endif
	/* CTR = TRMM trip count / 4 (the main loop is unrolled four times). */
	srawi.	TEMP, TEMP, 2
	mtspr	CTR, TEMP
#else
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(B)
	LFD	f21, 1 * SIZE(B)
	LFD	f22, 2 * SIZE(B)
	LFD	f23, 3 * SIZE(B)
	LFD	f24, 4 * SIZE(B)
	LFD	f25, 5 * SIZE(B)
	LFD	f26, 6 * SIZE(B)
	LFD	f27, 7 * SIZE(B)
	/* CTR = K / 4. */
	srawi.	r0, K, 2
	mtspr	CTR, r0
	mr	BO, B
#endif
	/* No full group of four steps: go straight to the remainder code. */
	ble	LL(35)
	.align 5

/*
 * LL(32): four steps per iteration. Steps alternate between accumulator sets
 * f0/f4/f8/f12 and f1/f5/f9/f13. AO advances 4 elements, BO advances 16.
 */
LL(32):
	FMADD	f0, f16, f20, f0
	FMADD	f4, f16, f21, f4
	FMADD	f8, f16, f22, f8
	FMADD	f12, f16, f23, f12
	LFD	f20, 8 * SIZE(BO)
	LFD	f21, 9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)
	FMADD	f1, f17, f24, f1
	FMADD	f5, f17, f25, f5
	FMADD	f9, f17, f26, f9
	FMADD	f13, f17, f27, f13
	LFD	f24, 12 * SIZE(BO)
	LFD	f25, 13 * SIZE(BO)
	LFD	f26, 14 * SIZE(BO)
	LFD	f27, 15 * SIZE(BO)
	FMADD	f0, f18, f20, f0
	FMADD	f4, f18, f21, f4
	FMADD	f8, f18, f22, f8
	FMADD	f12, f18, f23, f12
	LFD	f20, 16 * SIZE(BO)
	LFD	f21, 17 * SIZE(BO)
	LFD	f22, 18 * SIZE(BO)
	LFD	f23, 19 * SIZE(BO)
	FMADD	f1, f19, f24, f1
	FMADD	f5, f19, f25, f5
	FMADD	f9, f19, f26, f9
	FMADD	f13, f19, f27, f13
	/* Operands for the next iteration, loaded before the pointers move. */
	LFD	f16, 4 * SIZE(AO)
	LFD	f17, 5 * SIZE(AO)
	LFD	f18, 6 * SIZE(AO)
	LFD	f19, 7 * SIZE(AO)
	LFD	f24, 20 * SIZE(BO)
	LFD	f25, 21 * SIZE(BO)
	LFD	f26, 22 * SIZE(BO)
	LFD	f27, 23 * SIZE(BO)
	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 16 * SIZE
	/* Cache-touch hint relative to BO / PREB. */
	dcbt	0, BO, PREB
	bdnz	LL(32)

	/* Fold the second accumulator set into the first (reached only via the loop). */
	fadd	f0, f1, f0
	fadd	f4, f5, f4
	fadd	f8, f9, f8
	fadd	f12, f13, f12
	.align 4

/* ---- LL(35): remainder setup for the 1 x 4 tile ---- */
LL(35):
	lfd	f30, ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 1
#else
	addi	TEMP, KK, 4
#endif
	andi.	TEMP, TEMP, 3
	mtspr	CTR, TEMP
#else
	andi.	r0, K, 3
	mtspr	CTR, r0
#endif
	ble+	LL(38)
	.align 4

/* LL(36): one step per iteration; AO advances 1 element, BO advances 4. */
LL(36):
	FMADD	f0, f16, f20, f0
	FMADD	f4, f16, f21, f4
	FMADD	f8, f16, f22, f8
	FMADD	f12, f16, f23, f12
	LFD	f16, 1 * SIZE(AO)
	LFD	f20, 4 * SIZE(BO)
	LFD	f21, 5 * SIZE(BO)
	LFD	f22, 6 * SIZE(BO)
	LFD	f23, 7 * SIZE(BO)
	addi	BO, BO, 4 * SIZE
	addi	AO, AO, 1 * SIZE
	bdnz	LL(36)
	.align 4

/* ---- LL(38): scale by alpha and write 1 element to each of CO1..CO4 ---- */
LL(38):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f20, 0 * SIZE(CO3)
	LFD	f22, 0 * SIZE(CO4)
	FMADD	f0, f0, f30, f16
	FMADD	f4, f4, f30, f18
	FMADD	f8, f8, f30, f20
	FMADD	f12, f12, f30, f22
#else
	FMUL	f0, f0, f30
	FMUL	f4, f4, f30
	FMUL	f8, f8, f30
	FMUL	f12, f12, f30
#endif
	STFD	f0, 0 * SIZE(CO1)
	STFD	f4, 0 * SIZE(CO2)
	STFD	f8, 0 * SIZE(CO3)
	STFD	f12, 0 * SIZE(CO4)
	/* Reset only the accumulators this tile used; CO1..CO4 are not advanced here. */
	lfs	f0, FZERO
	fmr	f1, f0
	fmr	f4, f0
	fmr	f5, f0
	fmr	f8, f0
	fmr	f9, f0
	fmr	f12, f0
	fmr	f13, f0
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (1 or 4); advance AO by TEMP*1 and BO by TEMP*4 elements. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -1
#else
	addi	TEMP, TEMP, -4
#endif
	slwi	r0, TEMP, 0 + BASE_SHIFT
	slwi	TEMP, TEMP, 2 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	/*
	 * NOTE(review): KK advances by 2 here although this tile's other LEFT
	 * offsets use 1 (addi TEMP, KK, 1 / addi TEMP, TEMP, -1). Within this
	 * excerpt the LEFT path at LL(40) reloads KK from OFFSET before KK is
	 * read again; LL(10) and LL(70) are not visible -- confirm whether 2 is
	 * intentional.
	 */
	addi	KK, KK, 2
#endif
#endif
	.align 4

/* ---- LL(39): end of one J iteration of the four-pointer section ---- */
LL(39):
#if defined(TRMMKERNEL) && !defined(LEFT)
	addi	KK, KK, 4
#endif
	/* B continues from where BO stopped. */
	mr	B, BO
	/* Decrement J and loop back to LL(10) (outside this excerpt) while J > 0. */
	addic.	J, J, -1
	bgt	LL(10)
	.align 4

/* ---- LL(40): section using two output pointers, taken only when N & 2 ---- */
LL(40):
	mr	CO1, C
	add	CO2, C, LDC
	andi.	J, N, 2
	ble	LL(70)
#if defined(TRMMKERNEL) && defined(LEFT)
	mr	KK, OFFSET
#endif
	/* Clear accumulators f0..f7. */
	lfs	f0, FZERO
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	fmr	f4, f0
	fmr	f5, f0
	fmr	f6, f0
	fmr	f7, f0
	/* I = M / 4 tiles; C moves on to CO2 + LDC; AO restarts at A. */
	srawi.	I, M, 2
	add	C, CO2, LDC
	mr	AO, A
	ble	LL(50)
	.align 4

/* ---- LL(41): setup for one tile with 4 A values x 2 B values ---- */
LL(41):
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(B)
	LFD	f21, 1 * SIZE(B)
	LFD	f22, 2 * SIZE(B)
	LFD	f23, 3 * SIZE(B)
	mr	BO, B
#else
	/* Offset the start: AO += KK*4 elements, BO = B + KK*2 elements. */
	slwi	r0, KK, 2 + BASE_SHIFT
	slwi	TEMP, KK, 1 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B, TEMP
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(BO)
	LFD	f21, 1 * SIZE(BO)
	LFD	f22, 2 * SIZE(BO)
	LFD	f23, 3 * SIZE(BO)
#endif
	/* Cache-touch hints for the output locations. */
	dcbt	CO1, PREC
	dcbt	CO2, PREC
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 4
#else
	addi	TEMP, KK, 2
#endif
	srawi.	TEMP, TEMP, 2
	mtspr	CTR, TEMP
#else
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(B)
	LFD	f21, 1 * SIZE(B)
	LFD	f22, 2 * SIZE(B)
	LFD	f23, 3 * SIZE(B)
	dcbt	CO1, PREC
	dcbt	CO2, PREC
	/* CTR = K / 4. */
	srawi.	r0, K, 2
	mtspr	CTR, r0
	mr	BO, B
#endif
	ble	LL(45)
	.align 5

/* LL(42): four steps per iteration; AO advances 16 elements, BO advances 8. */
LL(42):
	FMADD	f0, f16, f20, f0
	FMADD	f1, f17, f20, f1
	FMADD	f2, f18, f20, f2
	FMADD	f3, f19, f20, f3
	FMADD	f4, f16, f21, f4
	FMADD	f5, f17, f21, f5
	FMADD	f6, f18, f21, f6
	FMADD	f7, f19, f21, f7
	LFD	f16, 4 * SIZE(AO)
	LFD	f17, 5 * SIZE(AO)
	LFD	f18, 6 * SIZE(AO)
	LFD	f19, 7 * SIZE(AO)
	FMADD	f0, f16, f22, f0
	FMADD	f1, f17, f22, f1
	FMADD	f2, f18, f22, f2
	FMADD	f3, f19, f22, f3
	FMADD	f4, f16, f23, f4
	FMADD	f5, f17, f23, f5
	FMADD	f6, f18, f23, f6
	FMADD	f7, f19, f23, f7
	LFD	f16, 8 * SIZE(AO)
	LFD	f17, 9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)
	LFD	f20, 4 * SIZE(BO)
	LFD	f21, 5 * SIZE(BO)
	LFD	f22, 6 * SIZE(BO)
	LFD	f23, 7 * SIZE(BO)
	FMADD	f0, f16, f20, f0
	FMADD	f1, f17, f20, f1
	FMADD	f2, f18, f20, f2
	FMADD	f3, f19, f20, f3
	FMADD	f4, f16, f21, f4
	FMADD	f5, f17, f21, f5
	FMADD	f6, f18, f21, f6
	FMADD	f7, f19, f21, f7
	LFD	f16, 12 * SIZE(AO)
	LFD	f17, 13 * SIZE(AO)
	LFD	f18, 14 * SIZE(AO)
	LFD	f19, 15 * SIZE(AO)
	FMADD	f0, f16, f22, f0
	FMADD	f1, f17, f22, f1
	FMADD	f2, f18, f22, f2
	FMADD	f3, f19, f22, f3
	FMADD	f4, f16, f23, f4
	FMADD	f5, f17, f23, f5
	FMADD	f6, f18, f23, f6
	FMADD	f7, f19, f23, f7
	/* Operands for the next iteration, loaded before the pointers move. */
	LFD	f16, 16 * SIZE(AO)
	LFD	f17, 17 * SIZE(AO)
	LFD	f18, 18 * SIZE(AO)
	LFD	f19, 19 * SIZE(AO)
	LFD	f20, 8 * SIZE(BO)
	LFD	f21, 9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)
	addi	AO, AO, 16 * SIZE
	addi	BO, BO, 8 * SIZE
	dcbt	0, BO, PREB
	bdnz	LL(42)
	.align 4

/* ---- LL(45): remainder setup for the 4 x 2 tile ---- */
LL(45):
	lfd	f30, ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 4
#else
	addi	TEMP, KK, 2
#endif
	andi.	TEMP, TEMP, 3
	mtspr	CTR, TEMP
#else
	andi.	r0, K, 3
	mtspr	CTR, r0
#endif
	ble+	LL(48)
	.align 4

/* LL(46): one step per iteration; AO advances 4 elements, BO advances 2. */
LL(46):
	FMADD	f0, f16, f20, f0
	FMADD	f1, f17, f20, f1
	FMADD	f2, f18, f20, f2
	FMADD	f3, f19, f20, f3
	FMADD	f4, f16, f21, f4
	FMADD	f5, f17, f21, f5
	FMADD	f6, f18, f21, f6
	FMADD	f7, f19, f21, f7
	LFD	f16, 4 * SIZE(AO)
	LFD	f17, 5 * SIZE(AO)
	LFD	f18, 6 * SIZE(AO)
	LFD	f19, 7 * SIZE(AO)
	LFD	f20, 2 * SIZE(BO)
	LFD	f21, 3 * SIZE(BO)
	addi	BO, BO, 2 * SIZE
	addi	AO, AO, 4 * SIZE
	bdnz	LL(46)
	.align 4

/* ---- LL(48): scale by alpha and write 4 elements to each of CO1, CO2 ---- */
LL(48):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 2 * SIZE(CO1)
	LFD	f19, 3 * SIZE(CO1)
	LFD	f20, 0 * SIZE(CO2)
	LFD	f21, 1 * SIZE(CO2)
	LFD	f22, 2 * SIZE(CO2)
	LFD	f23, 3 * SIZE(CO2)
	FMADD	f0, f0, f30, f16
	FMADD	f1, f1, f30, f17
	FMADD	f2, f2, f30, f18
	FMADD	f3, f3, f30, f19
	FMADD	f4, f4, f30, f20
	FMADD	f5, f5, f30, f21
	FMADD	f6, f6, f30, f22
	FMADD	f7, f7, f30, f23
#else
	FMUL	f0, f0, f30
	FMUL	f1, f1, f30
	FMUL	f2, f2, f30
	FMUL	f3, f3, f30
	FMUL	f4, f4, f30
	FMUL	f5, f5, f30
	FMUL	f6, f6, f30
	FMUL	f7, f7, f30
#endif
	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 1 * SIZE(CO1)
	STFD	f2, 2 * SIZE(CO1)
	STFD	f3, 3 * SIZE(CO1)
	/* Reset accumulators f0..f7 from FZERO, interleaved with the stores. */
	lfs	f0, FZERO
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	STFD	f4, 0 * SIZE(CO2)
	STFD	f5, 1 * SIZE(CO2)
	STFD	f6, 2 * SIZE(CO2)
	STFD	f7, 3 * SIZE(CO2)
	fmr	f4, f0
	fmr	f5, f0
	fmr	f6, f0
	fmr	f7, f0
	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
#ifdef TRMMKERNEL
#if ( defined(LEFT) && defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	/* TEMP = K - KK - (4 or 2); advance AO by TEMP*4 and BO by TEMP*2 elements. */
	sub	TEMP, K, KK
#ifdef LEFT
	addi	TEMP, TEMP, -4
#else
	addi	TEMP, TEMP, -2
#endif
	slwi	r0, TEMP, 2 + BASE_SHIFT
	slwi	TEMP, TEMP, 1 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, BO, TEMP
#endif
#ifdef LEFT
	addi	KK, KK, 4
#endif
#endif
	/* Next 4 x 2 tile while I > 0. */
	addic.	I, I, -1
	bgt+	LL(41)
	.align 4

/* ---- LL(50): tile with 2 A values x 2 B values, taken only when M & 2 ---- */
LL(50):
	andi.	I, M, 2
	ble	LL(60)
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(B)
	LFD	f21, 1 * SIZE(B)
	LFD	f22, 2 * SIZE(B)
	LFD	f23, 3 * SIZE(B)
	LFD	f24, 4 * SIZE(B)
	LFD	f25, 5 * SIZE(B)
	LFD	f26, 6 * SIZE(B)
	LFD	f27, 7 * SIZE(B)
	mr	BO, B
#else
	/* Offset the start: AO += KK*2 elements, BO = B + KK*2 elements. */
	slwi	r0, KK, 1 + BASE_SHIFT
	slwi	TEMP, KK, 1 + BASE_SHIFT
	add	AO, AO, r0
	add	BO, B, TEMP
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(BO)
	LFD	f21, 1 * SIZE(BO)
	LFD	f22, 2 * SIZE(BO)
	LFD	f23, 3 * SIZE(BO)
	LFD	f24, 4 * SIZE(BO)
	LFD	f25, 5 * SIZE(BO)
	LFD	f26, 6 * SIZE(BO)
	LFD	f27, 7 * SIZE(BO)
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 2
#endif
	srawi.	TEMP, TEMP, 2
	mtspr	CTR, TEMP
#else
	LFD	f16, 0 * SIZE(AO)
	LFD	f17, 1 * SIZE(AO)
	LFD	f18, 2 * SIZE(AO)
	LFD	f19, 3 * SIZE(AO)
	LFD	f20, 0 * SIZE(B)
	LFD	f21, 1 * SIZE(B)
	LFD	f22, 2 * SIZE(B)
	LFD	f23, 3 * SIZE(B)
	LFD	f24, 4 * SIZE(B)
	LFD	f25, 5 * SIZE(B)
	LFD	f26, 6 * SIZE(B)
	LFD	f27, 7 * SIZE(B)
	/* CTR = K / 4. */
	srawi.	r0, K, 2
	mtspr	CTR, r0
	mr	BO, B
#endif
	ble	LL(55)
	.align 5

/*
 * LL(52): four steps per iteration. Steps alternate between accumulator sets
 * f0..f3 and f4..f7; the sets are summed at LL(58). AO and BO each advance
 * 8 elements.
 */
LL(52):
	FMADD	f0, f16, f20, f0
	FMADD	f1, f17, f20, f1
	FMADD	f2, f16, f21, f2
	FMADD	f3, f17, f21, f3
	FMADD	f4, f18, f22, f4
	FMADD	f5, f19, f22, f5
	FMADD	f6, f18, f23, f6
	FMADD	f7, f19, f23, f7
	LFD	f16, 4 * SIZE(AO)
	LFD	f17, 5 * SIZE(AO)
	LFD	f18, 6 * SIZE(AO)
	LFD	f19, 7 * SIZE(AO)
	LFD	f20, 8 * SIZE(BO)
	LFD	f21, 9 * SIZE(BO)
	LFD	f22, 10 * SIZE(BO)
	LFD	f23, 11 * SIZE(BO)
	FMADD	f0, f16, f24, f0
	FMADD	f1, f17, f24, f1
	FMADD	f2, f16, f25, f2
	FMADD	f3, f17, f25, f3
	FMADD	f4, f18, f26, f4
	FMADD	f5, f19, f26, f5
	FMADD	f6, f18, f27, f6
	FMADD	f7, f19, f27, f7
	/* Operands for the next iteration, loaded before the pointers move. */
	LFD	f16, 8 * SIZE(AO)
	LFD	f17, 9 * SIZE(AO)
	LFD	f18, 10 * SIZE(AO)
	LFD	f19, 11 * SIZE(AO)
	LFD	f24, 12 * SIZE(BO)
	LFD	f25, 13 * SIZE(BO)
	LFD	f26, 14 * SIZE(BO)
	LFD	f27, 15 * SIZE(BO)
	addi	AO, AO, 8 * SIZE
	addi	BO, BO, 8 * SIZE
	dcbt	0, BO, PREB
	bdnz	LL(52)
	.align 4

/* ---- LL(55): remainder setup for the 2 x 2 tile ---- */
LL(55):
	lfd	f30, ALPHA
#if defined(TRMMKERNEL)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	TEMP, K, KK
#elif defined(LEFT)
	addi	TEMP, KK, 2
#else
	addi	TEMP, KK, 2
#endif
	andi.	TEMP, TEMP, 3
	mtspr	CTR, TEMP
#else
	andi.	r0, K, 3
	mtspr	CTR, r0
#endif
	ble+	LL(58)
	.align 4

/* LL(56): one step per iteration; AO and BO each advance 2 elements. */
LL(56):
	FMADD	f0, f16, f20, f0
	FMADD	f1, f17, f20, f1
	FMADD	f2, f16, f21, f2
	FMADD	f3, f17, f21, f3
	LFD	f16, 2 * SIZE(AO)
	LFD	f17, 3 * SIZE(AO)
	LFD	f20, 2 * SIZE(BO)
	LFD	f21, 3 * SIZE(BO)
	addi	BO, BO, 2 * SIZE
	addi	AO, AO, 2 * SIZE
	bdnz	LL(56)
	.align 4

/*
 * ---- LL(58): sum the two accumulator sets and scale by alpha ----
 * The stores that follow are outside this excerpt.
 */
LL(58):
#ifndef TRMMKERNEL
	LFD	f16, 0 * SIZE(CO1)
	LFD	f17, 1 * SIZE(CO1)
	LFD	f18, 0 * SIZE(CO2)
	LFD	f19, 1 * SIZE(CO2)
	FADD	f0, f4, f0
	FADD	f1, f5, f1
	FADD	f2, f6, f2
	FADD	f3, f7, f3
	FMADD	f0, f0, f30, f16
	FMADD	f1, f1, f30, f17
	FMADD	f2, f2, f30, f18
	FMADD	f3, f3, f30, f19
#else
	FADD	f0, f4, f0
	FADD	f1, f5, f1
	FADD	f2, f6, f2
	FADD	f3, f7, f3
	FMUL	f0, f0, f30
	FMUL	f1, f1, f30
	FMUL	f2, f2, f30
	FMUL	f3, f3, f30
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -