📄 gemm_kernel_power3.s
字号:
fmadd f0, f0, f30, f16 fmadd f4, f4, f30, f18 fmadd f8, f8, f30, f20 fmadd f12, f12, f30, f22 STFD f0, 0 * SIZE(CO1) STFD f4, 0 * SIZE(CO2) STFD f8, 0 * SIZE(CO3) STFD f12, 0 * SIZE(CO4) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 fmr f8, f0 fmr f9, f0 fmr f12, f0 fmr f13, f0 .align 4LL(39): mr B, BO addic. J, J, -1 lfs f0, FZERO bgt LL(10) .align 4LL(40): mr CO1, C add CO2, C, LDC andi. J, N, 2 ble LL(70) fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 srawi. I, M, 2 add C, CO2, LDC mr AO, A ble LL(50) .align 4LL(41): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) PREFETCH_C1 PREFETCH_C2 srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(45) .align 5LL(42): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f22, f3 fmadd f4, f16, f23, f4 fmadd f5, f17, f23, f5 fmadd f6, f18, f23, f6 fmadd f7, f19, f23, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f22, f3 fmadd f4, f16, f23, f4 fmadd f5, f17, f23, f5 fmadd f6, f18, f23, f6 fmadd f7, f19, f23, f7 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 8 * SIZE PREFETCH_B bdnz LL(42) .align 4LL(45): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(48) .align 4LL(46): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 fmadd f4, f16, f21, f4 fmadd f5, f17, f21, f5 fmadd f6, f18, f21, f6 fmadd f7, f19, f21, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 4 * SIZE bdnz LL(46) .align 4LL(48): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) LFD f20, 0 * SIZE(CO2) LFD f21, 1 * SIZE(CO2) LFD f22, 2 * SIZE(CO2) LFD f23, 3 * SIZE(CO2) fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 fmadd f2, f2, f30, f18 fmadd f3, f3, f30, f19 fmadd f4, f4, f30, f20 fmadd f5, f5, f30, f21 fmadd f6, f6, f30, f22 fmadd f7, f7, f30, f23 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 STFD f4, 0 * SIZE(CO2) STFD f5, 1 * SIZE(CO2) STFD f6, 2 * SIZE(CO2) STFD f7, 3 * SIZE(CO2) fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE addic. I, I, -1 bgt+ LL(41) .align 4LL(50): andi. I, M, 2 ble LL(60) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(55) .align 5LL(52): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f16, f21, f2 fmadd f3, f17, f21, f3 fmadd f4, f18, f22, f4 fmadd f5, f19, f22, f5 fmadd f6, f18, f23, f6 fmadd f7, f19, f23, f7 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f16, f24, f0 fmadd f1, f17, f24, f1 fmadd f2, f16, f25, f2 fmadd f3, f17, f25, f3 fmadd f4, f18, f26, f4 fmadd f5, f19, f26, f5 fmadd f6, f18, f27, f6 fmadd f7, f19, f27, f7 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE PREFETCH_B bdnz LL(52) .align 4LL(55): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(58) .align 4LL(56): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f16, f21, f2 fmadd f3, f17, f21, f3 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 2 * SIZE bdnz LL(56) .align 4LL(58): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) LFD f19, 1 * SIZE(CO2) fadd f0, f4, f0 fadd f1, f5, f1 fadd f2, f6, f2 fadd f3, f7, f3 fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 fmadd f2, f2, f30, f18 fmadd f3, f3, f30, f19 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 0 * SIZE(CO2) STFD f3, 1 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 fmr f4, f0 fmr f5, f0 fmr f6, f0 fmr f7, f0 addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE .align 4LL(60): andi. I, M, 1 ble LL(69) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) LFD f24, 4 * SIZE(B) LFD f25, 5 * SIZE(B) LFD f26, 6 * SIZE(B) LFD f27, 7 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(65) .align 5LL(62): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 fmadd f2, f17, f22, f2 fmadd f3, f17, f23, f3 LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) fmadd f0, f18, f24, f0 fmadd f1, f18, f25, f1 fmadd f2, f19, f26, f2 fmadd f3, f19, f27, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f24, 12 * SIZE(BO) LFD f25, 13 * SIZE(BO) LFD f26, 14 * SIZE(BO) LFD f27, 15 * SIZE(BO) addi AO, AO, 4 * SIZE addi BO, BO, 8 * SIZE bdnz LL(62) .align 4LL(65): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(68) .align 4LL(66): fmadd f0, f16, f20, f0 fmadd f1, f16, f21, f1 LFD f16, 1 * SIZE(AO) LFD f20, 2 * SIZE(BO) LFD f21, 3 * SIZE(BO) addi BO, BO, 2 * SIZE addi AO, AO, 1 * SIZE bdnz LL(66) .align 4LL(68): LFD f16, 0 * SIZE(CO1) LFD f18, 0 * SIZE(CO2) fadd f0, f2, f0 fadd f1, f3, f1 fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f18 STFD f0, 0 * SIZE(CO1) STFD f1, 0 * SIZE(CO2) lfs f0, FZERO fmr f1, f0 fmr f4, f0 fmr f5, f0 .align 4LL(69): mr B, BO lfs f0, FZERO .align 4LL(70): mr CO1, C andi. J, N, 1 ble LL(999) fmr f1, f0 fmr f2, f0 fmr f3, f0 srawi. I, M, 2 mr AO, A ble LL(80) .align 4LL(71): LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) PREFETCH_C1 srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(75) .align 5LL(72): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f21, f0 fmadd f1, f17, f21, f1 fmadd f2, f18, f21, f2 fmadd f3, f19, f21, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f22, f3 LFD f16, 12 * SIZE(AO) LFD f17, 13 * SIZE(AO) LFD f18, 14 * SIZE(AO) LFD f19, 15 * SIZE(AO) fmadd f0, f16, f23, f0 fmadd f1, f17, f23, f1 fmadd f2, f18, f23, f2 fmadd f3, f19, f23, f3 LFD f16, 16 * SIZE(AO) LFD f17, 17 * SIZE(AO) LFD f18, 18 * SIZE(AO) LFD f19, 19 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 16 * SIZE addi BO, BO, 4 * SIZE PREFETCH_B bdnz LL(72) .align 4LL(75): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(78) .align 4LL(76): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f20, f2 fmadd f3, f19, f20, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 4 * SIZE bdnz LL(76) .align 4LL(78): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) LFD f18, 2 * SIZE(CO1) LFD f19, 3 * SIZE(CO1) fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 fmadd f2, f2, f30, f18 fmadd f3, f3, f30, f19 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) STFD f2, 2 * SIZE(CO1) STFD f3, 3 * SIZE(CO1) lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 addi CO1, CO1, 4 * SIZE addic. I, I, -1 bgt+ LL(71) .align 4LL(80): andi. I, M, 2 ble LL(90) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 2 mtspr CTR, r0 mr BO, B ble LL(85) .align 5LL(82): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 fmadd f2, f18, f21, f2 fmadd f3, f19, f21, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) fmadd f0, f16, f22, f0 fmadd f1, f17, f22, f1 fmadd f2, f18, f23, f2 fmadd f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 4 * SIZE PREFETCH_B bdnz LL(82) .align 4LL(85): lfd f30, ALPHA andi. r0, K, 3 mtspr CTR, r0 ble+ LL(88) .align 4LL(86): fmadd f0, f16, f20, f0 fmadd f1, f17, f20, f1 LFD f16, 2 * SIZE(AO) LFD f17, 3 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 2 * SIZE bdnz LL(86) .align 4LL(88): LFD f16, 0 * SIZE(CO1) LFD f17, 1 * SIZE(CO1) fadd f0, f2, f0 fadd f1, f3, f1 fmadd f0, f0, f30, f16 fmadd f1, f1, f30, f17 STFD f0, 0 * SIZE(CO1) STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE lfs f0, FZERO fmr f1, f0 fmr f2, f0 fmr f3, f0 .align 4LL(90): andi. I, M, 1 ble LL(999) LFD f16, 0 * SIZE(AO) LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) LFD f20, 0 * SIZE(B) LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) srawi. r0, K, 3 mtspr CTR, r0 mr BO, B ble LL(95) .align 5LL(92): fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f23, f3 LFD f16, 4 * SIZE(AO) LFD f17, 5 * SIZE(AO) LFD f18, 6 * SIZE(AO) LFD f19, 7 * SIZE(AO) LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) LFD f23, 7 * SIZE(BO) fmadd f0, f16, f20, f0 fmadd f1, f17, f21, f1 fmadd f2, f18, f22, f2 fmadd f3, f19, f23, f3 LFD f16, 8 * SIZE(AO) LFD f17, 9 * SIZE(AO) LFD f18, 10 * SIZE(AO) LFD f19, 11 * SIZE(AO) LFD f20, 8 * SIZE(BO) LFD f21, 9 * SIZE(BO) LFD f22, 10 * SIZE(BO) LFD f23, 11 * SIZE(BO) addi AO, AO, 8 * SIZE addi BO, BO, 8 * SIZE bdnz LL(92) .align 4LL(95): lfd f30, ALPHA andi. r0, K, 7 mtspr CTR, r0 ble+ LL(98) .align 4LL(96): fmadd f0, f16, f20, f0 LFD f16, 1 * SIZE(AO) LFD f20, 1 * SIZE(BO) addi BO, BO, 1 * SIZE addi AO, AO, 1 * SIZE bdnz LL(96) .align 4LL(98): LFD f16, 0 * SIZE(CO1) fadd f0, f1, f0 fadd f2, f3, f2 fadd f0, f2, f0 fmadd f0, f0, f30, f16 STFD f0, 0 * SIZE(CO1) .align 4LL(999): addi r3, 0, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP)#ifdef __64BIT__ ld r31, 144(SP) ld r30, 152(SP) ld r29, 160(SP) ld r28, 168(SP) ld r27, 176(SP) ld r26, 184(SP) ld r25, 192(SP) ld r24, 200(SP) ld r23, 208(SP) ld r22, 216(SP) ld r21, 224(SP)#else lwz r31, 144(SP) lwz r30, 148(SP) lwz r29, 152(SP) lwz r28, 156(SP) lwz r27, 160(SP) lwz r26, 164(SP) lwz r25, 168(SP) lwz r24, 172(SP) lwz r23, 176(SP) lwz r22, 180(SP) lwz r21, 184(SP)#endif addi SP, SP, STACKSIZE blr EPILOGUE#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -