zgemm_kernel_altivec.s
/* End of the 8x2 micro-tile (I = M/8 loop): finish combining the real/imaginary */
/* partial products (swap/neg), scale by alpha, and accumulate into C through    */
/* unaligned lvsr/vperm loads and stores.                                         */
    vxor    c14, c14, neg
    vxor    c15, c15, neg
    vxor    c16, c16, neg

    vaddfp  c01, c01, c05
    vaddfp  c02, c02, c06
    vaddfp  c03, c03, c07
    vaddfp  c04, c04, c08
    vaddfp  c09, c09, c13
    vaddfp  c10, c10, c14
    vaddfp  c11, c11, c15
    vaddfp  c12, c12, c16

    vperm   c05, c01, c01, swap
    vperm   c06, c02, c02, swap
    vperm   c07, c03, c03, swap
    vperm   c08, c04, c04, swap
    vperm   c13, c09, c09, swap
    vperm   c14, c10, c10, swap
    vperm   c15, c11, c11, swap
    vperm   c16, c12, c12, swap

    vmaddfp c01, alpha_r, c01, VZERO
    vmaddfp c02, alpha_r, c02, VZERO
    vmaddfp c03, alpha_r, c03, VZERO
    vmaddfp c04, alpha_r, c04, VZERO
    vmaddfp c01, alpha_i, c05, c01
    vmaddfp c02, alpha_i, c06, c02
    vmaddfp c03, alpha_i, c07, c03
    vmaddfp c04, alpha_i, c08, c04
    vmaddfp c09, alpha_r, c09, VZERO
    vmaddfp c10, alpha_r, c10, VZERO
    vmaddfp c11, alpha_r, c11, VZERO
    vmaddfp c12, alpha_r, c12, VZERO
    vmaddfp c09, alpha_i, c13, c09
    vmaddfp c10, alpha_i, c14, c10
    vmaddfp c11, alpha_i, c15, c11
    vmaddfp c12, alpha_i, c16, c12

/* Read-modify-write the first column of C (CO1) with unaligned stores. */
    lvx     C1, OFFSET_0, CO1
    lvx     C2, OFFSET_1, CO1
    lvx     C3, OFFSET_2, CO1
    lvx     C4, OFFSET_3, CO1
    lvx     C5, OFFSET_4, CO1

    lvsr    PERMRSHIFT1, 0, CO1
    lvsr    PERMRSHIFT2, 0, CO2

    vperm   c00, VZERO, c01, PERMRSHIFT1
    vperm   c01, c01, c02, PERMRSHIFT1
    vperm   c02, c02, c03, PERMRSHIFT1
    vperm   c03, c03, c04, PERMRSHIFT1
    vperm   c04, c04, VZERO, PERMRSHIFT1

    vaddfp  c00, c00, C1
    vaddfp  c01, c01, C2
    vaddfp  c02, c02, C3
    vaddfp  c03, c03, C4
    vaddfp  c04, c04, C5

    stvx    c00, OFFSET_0, CO1
    stvx    c01, OFFSET_1, CO1
    stvx    c02, OFFSET_2, CO1
    stvx    c03, OFFSET_3, CO1
    stvx    c04, OFFSET_4, CO1

/* Same for the second column of C (CO2). */
    lvx     C1, OFFSET_0, CO2
    lvx     C2, OFFSET_1, CO2
    lvx     C3, OFFSET_2, CO2
    lvx     C4, OFFSET_3, CO2
    lvx     C5, OFFSET_4, CO2

    vperm   c00, VZERO, c09, PERMRSHIFT2
    vperm   c09, c09, c10, PERMRSHIFT2
    vperm   c10, c10, c11, PERMRSHIFT2
    vperm   c11, c11, c12, PERMRSHIFT2
    vperm   c12, c12, VZERO, PERMRSHIFT2

    vaddfp  c00, c00, C1
    vaddfp  c09, c09, C2
    vaddfp  c10, c10, C3
    vaddfp  c11, c11, C4
    vaddfp  c12, c12, C5

    stvx    c00, OFFSET_0, CO2
    stvx    c09, OFFSET_1, CO2
    stvx    c10, OFFSET_2, CO2
    stvx    c11, OFFSET_3, CO2
    stvx    c12, OFFSET_4, CO2

    addi    CO1, CO1, 16 * SIZE
    addi    CO2, CO2, 16 * SIZE
    addic.  I, I, -1
    bgt+    LL(11)
    .align 4

LL(20):
/* M & 4 remainder: 4 complex elements per column of C. */
    andi.   I, M, 4
    ble     LL(30)

    vxor    c01, c01, c01
    LOAD_A  a1, OFFSET_0, AO
    vxor    c02, c02, c02
    LOAD_A  a2, OFFSET_1, AO
    vxor    c05, c05, c05
    LOAD_A  a3, OFFSET_2, AO
    vxor    c06, c06, c06
    LOAD_A  a4, OFFSET_3, AO
    vxor    c09, c09, c09
    LOAD_B  b1, OFFSET_0, B
    vxor    c10, c10, c10
    LOAD_B  b2, OFFSET_1, B
    vxor    c13, c13, c13
    vxor    c14, c14, c14

    mr      BO, B
    vspltw  bp1, b1, 0
    srawi.  r0, K, 1
    mtspr   CTR, r0
    ble     LL(25)
    .align 4

LL(22):
/* K/2-unrolled inner loop for the 4x2 block. */
    vmaddfp c01, a1, bp1, c01
    vspltw  bp2, b1, 1
    addi    AO, AO, 16 * SIZE
    vmaddfp c02, a2, bp1, c02
    addi    BO, BO, 8 * SIZE
    vmaddfp c05, a1, bp2, c05
    vspltw  bp1, b1, 2
    vmaddfp c06, a2, bp2, c06
    vmaddfp c09, a1, bp1, c09
    vspltw  bp2, b1, 3
    LOAD_B  b1, OFFSET_0, BO
    vmaddfp c10, a2, bp1, c10
    vmaddfp c13, a1, bp2, c13
    LOAD_A  a1, OFFSET_0, AO
    vspltw  bp1, b2, 0
    vmaddfp c14, a2, bp2, c14
    LOAD_A  a2, OFFSET_1, AO

    vmaddfp c01, a3, bp1, c01
    vspltw  bp2, b2, 1
    vmaddfp c02, a4, bp1, c02
    vmaddfp c05, a3, bp2, c05
    vspltw  bp1, b2, 2
    vmaddfp c06, a4, bp2, c06
    vmaddfp c09, a3, bp1, c09
    vspltw  bp2, b2, 3
    LOAD_B  b2, OFFSET_1, BO
    vmaddfp c10, a4, bp1, c10
    vmaddfp c13, a3, bp2, c13
    LOAD_A  a3, OFFSET_2, AO
    vmaddfp c14, a4, bp2, c14
    LOAD_A  a4, OFFSET_3, AO
    vspltw  bp1, b1, 0
    bdnz    LL(22)
    .align 4

LL(25):
    andi.   r0, K, 1
    ble+    LL(28)
    .align 4

LL(26):
/* Left-over K iteration. */
    vmaddfp c01, a1, bp1, c01
    vspltw  bp2, b1, 1
    vmaddfp c02, a2, bp1, c02
    nop
    vmaddfp c05, a1, bp2, c05
    vspltw  bp1, b1, 2
    vmaddfp c06, a2, bp2, c06
    nop
    vmaddfp c09, a1, bp1, c09
    vspltw  bp2, b1, 3
    vmaddfp c10, a2, bp1, c10
    addi    AO, AO, 8 * SIZE
    vmaddfp c13, a1, bp2, c13
    addi    BO, BO, 4 * SIZE
    vmaddfp c14, a2, bp2, c14
    nop
    .align 4

LL(28):
/* Combine the real/imaginary partial products (swap exchanges the re/im halves */
/* of each pair, neg holds the variant's sign mask), scale by alpha and         */
/* accumulate the 4x2 result into C.                                            */
    vxor    VZERO, VZERO, VZERO

    lvx     swap, OFFSET_0, SP
    lvx     neg, OFFSET_1, SP
    lvx     alpha_r, OFFSET_2, SP
    lvx     alpha_i, OFFSET_3, SP

    vperm   c05, c05, c05, swap
    vperm   c06, c06, c06, swap
    vperm   c13, c13, c13, swap
    vperm   c14, c14, c14, swap

    vxor    c05, c05, neg
    vxor    c06, c06, neg
    vxor    c13, c13, neg
    vxor    c14, c14, neg

    vaddfp  c01, c01, c05
    vaddfp  c02, c02, c06
    vaddfp  c09, c09, c13
    vaddfp  c10, c10, c14

    vperm   c05, c01, c01, swap
    vperm   c06, c02, c02, swap
    vperm   c13, c09, c09, swap
    vperm   c14, c10, c10, swap

    vmaddfp c01, alpha_r, c01, VZERO
    vmaddfp c02, alpha_r, c02, VZERO
    vmaddfp c01, alpha_i, c05, c01
    vmaddfp c02, alpha_i, c06, c02
    vmaddfp c09, alpha_r, c09, VZERO
    vmaddfp c10, alpha_r, c10, VZERO
    vmaddfp c09, alpha_i, c13, c09
    vmaddfp c10, alpha_i, c14, c10

    lvx     C1, OFFSET_0, CO1
    lvx     C2, OFFSET_1, CO1
    lvx     C3, OFFSET_2, CO1

    lvsr    PERMRSHIFT1, 0, CO1
    lvsr    PERMRSHIFT2, 0, CO2

    vperm   c00, VZERO, c01, PERMRSHIFT1
    vperm   c01, c01, c02, PERMRSHIFT1
    vperm   c02, c02, VZERO, PERMRSHIFT1

    vaddfp  c00, c00, C1
    vaddfp  c01, c01, C2
    vaddfp  c02, c02, C3

    stvx    c00, OFFSET_0, CO1
    stvx    c01, OFFSET_1, CO1
    stvx    c02, OFFSET_2, CO1

    lvx     C1, OFFSET_0, CO2
    lvx     C2, OFFSET_1, CO2
    lvx     C3, OFFSET_2, CO2

    vperm   c00, VZERO, c09, PERMRSHIFT2
    vperm   c09, c09, c10, PERMRSHIFT2
    vperm   c10, c10, VZERO, PERMRSHIFT2

    vaddfp  c00, c00, C1
    vaddfp  c09, c09, C2
    vaddfp  c10, c10, C3

    stvx    c00, OFFSET_0, CO2
    stvx    c09, OFFSET_1, CO2
    stvx    c10, OFFSET_2, CO2

    addi    CO1, CO1, 8 * SIZE
    addi    CO2, CO2, 8 * SIZE
    .align 4

LL(30):
/* M & 2 remainder: 2 complex elements per column of C. */
    andi.   I, M, 2
    ble     LL(40)

    vxor    c01, c01, c01
    LOAD_A  a1, OFFSET_0, AO
    vxor    c02, c02, c02
    LOAD_A  a2, OFFSET_1, AO
    vxor    c05, c05, c05
    LOAD_B  b1, OFFSET_0, B
    vxor    c06, c06, c06
    LOAD_B  b2, OFFSET_1, B
    vxor    c09, c09, c09
    vxor    c10, c10, c10
    vxor    c13, c13, c13
    vxor    c14, c14, c14

    vspltw  bp1, b1, 0
    mr      BO, B
    srawi.  r0, K, 1
    mtspr   CTR, r0
    ble     LL(35)
    .align 4

LL(32):
    vmaddfp c01, a1, bp1, c01
    addi    AO, AO, 8 * SIZE
    vspltw  bp2, b1, 1
    vmaddfp c05, a1, bp2, c05
    addi    BO, BO, 8 * SIZE
    vspltw  bp1, b1, 2
    vmaddfp c09, a1, bp1, c09
    vspltw  bp2, b1, 3
    vmaddfp c13, a1, bp2, c13
    LOAD_A  a1, OFFSET_0, AO
    vspltw  bp1, b2, 0
    LOAD_B  b1, OFFSET_0, BO
    vmaddfp c02, a2, bp1, c02
    vspltw  bp2, b2, 1
    vmaddfp c06, a2, bp2, c06
    vspltw  bp1, b2, 2
    vmaddfp c10, a2, bp1, c10
    vspltw  bp2, b2, 3
    LOAD_B  b2, OFFSET_1, BO
    vmaddfp c14, a2, bp2, c14
    LOAD_A  a2, OFFSET_1, AO
    vspltw  bp1, b1, 0
    bdnz    LL(32)
    .align 4

LL(35):
    andi.   r0, K, 1
    ble+    LL(38)
    .align 4

LL(36):
    vmaddfp c01, a1, bp1, c01
    vspltw  bp2, b1, 1
    vmaddfp c05, a1, bp2, c05
    vspltw  bp1, b1, 2
    vmaddfp c09, a1, bp1, c09
    vspltw  bp2, b1, 3
    vmaddfp c13, a1, bp2, c13
    addi    AO, AO, 4 * SIZE
    addi    BO, BO, 4 * SIZE
    .align 4

LL(38):
/* Combine, scale by alpha and store the 2x2 result. */
    vaddfp  c01, c01, c02
    vaddfp  c05, c05, c06
    vaddfp  c09, c09, c10
    vaddfp  c13, c13, c14

    vxor    VZERO, VZERO, VZERO

    lvx     swap, OFFSET_0, SP
    lvx     neg, OFFSET_1, SP
    lvx     alpha_r, OFFSET_2, SP
    lvx     alpha_i, OFFSET_3, SP

    vperm   c05, c05, c05, swap
    vperm   c13, c13, c13, swap

    vxor    c05, c05, neg
    vxor    c13, c13, neg

    vaddfp  c01, c01, c05
    vaddfp  c09, c09, c13

    vperm   c05, c01, c01, swap
    vperm   c13, c09, c09, swap

    vmaddfp c01, alpha_r, c01, VZERO
    vmaddfp c01, alpha_i, c05, c01
    vmaddfp c09, alpha_r, c09, VZERO
    vmaddfp c09, alpha_i, c13, c09

    lvx     C1, OFFSET_0, CO1
    lvx     C2, OFFSET_1, CO1

    lvsr    PERMRSHIFT1, 0, CO1
    lvsr    PERMRSHIFT2, 0, CO2

    vperm   c00, VZERO, c01, PERMRSHIFT1
    vperm   c01, c01, VZERO, PERMRSHIFT1

    vaddfp  c00, c00, C1
    vaddfp  c01, c01, C2

    stvx    c00, OFFSET_0, CO1
    stvx    c01, OFFSET_1, CO1

    lvx     C1, OFFSET_0, CO2
    lvx     C2, OFFSET_1, CO2

    vperm   c00, VZERO, c09, PERMRSHIFT2
    vperm   c09, c09, VZERO, PERMRSHIFT2

    vaddfp  c00, c00, C1
    vaddfp  c09, c09, C2

    stvx    c00, OFFSET_0, CO2
    stvx    c09, OFFSET_1, CO2

    addi    CO1, CO1, 4 * SIZE
    addi    CO2, CO2, 4 * SIZE
    .align 4

LL(40):
/* M & 1 remainder: scalar FPU path, one complex element per column of C. */
    andi.   I, M, 1
    ble     LL(49)

    mr      BO, B

    LFD     f8,  0 * SIZE(AO)
    LFD     f9,  1 * SIZE(AO)
    LFD     f10, 0 * SIZE(BO)
    LFD     f11, 1 * SIZE(BO)
    LFD     f12, 2 * SIZE(BO)
    LFD     f13, 3 * SIZE(BO)

    lfs     f0, FZERO(SP)
    fmr     f1, f0
    fmr     f2, f0
    fmr     f3, f0
    fmr     f4, f0
    fmr     f5, f0
    fmr     f6, f0
    fmr     f7, f0

    srawi.  r0, K, 1
    mtspr   CTR, r0
    ble     LL(45)
    .align 4

LL(42):
    fmadd   f0, f8, f10, f0
    fmadd   f2, f8, f11, f2
    fmadd   f4, f8, f12, f4
    fmadd   f6, f8, f13, f6

    fmadd   f1, f9, f10, f1
    fmadd   f3, f9, f11, f3
    fmadd   f5, f9, f12, f5
    fmadd   f7, f9, f13, f7

    LFD     f8,  2 * SIZE(AO)
    LFD     f9,  3 * SIZE(AO)
    LFD     f10, 4 * SIZE(BO)
    LFD     f11, 5 * SIZE(BO)
    LFD     f12, 6 * SIZE(BO)
    LFD     f13, 7 * SIZE(BO)

    fmadd   f0, f8, f10, f0
    fmadd   f2, f8, f11, f2
    fmadd   f4, f8, f12, f4
    fmadd   f6, f8, f13, f6

    fmadd   f1, f9, f10, f1
    fmadd   f3, f9, f11, f3
    fmadd   f5, f9, f12, f5
    fmadd   f7, f9, f13, f7

    LFD     f8,  4 * SIZE(AO)
    LFD     f9,  5 * SIZE(AO)
    LFD     f10, 8 * SIZE(BO)
    LFD     f11, 9 * SIZE(BO)
    LFD     f12, 10 * SIZE(BO)
    LFD     f13, 11 * SIZE(BO)

    addi    AO, AO, 4 * SIZE
    addi    BO, BO, 8 * SIZE
    bdnz    LL(42)
    .align 4

LL(45):
    andi.   r0, K, 1
    ble     LL(48)
    .align 4

LL(46):
    fmadd   f0, f8, f10, f0
    fmadd   f2, f8, f11, f2
    fmadd   f4, f8, f12, f4
    fmadd   f6, f8, f13, f6

    fmadd   f1, f9, f10, f1
    fmadd   f3, f9, f11, f3
    fmadd   f5, f9, f12, f5
    fmadd   f7, f9, f13, f7

    addi    AO, AO, 2 * SIZE
    addi    BO, BO, 4 * SIZE
    .align 4

LL(48):
/* Recombine according to the conjugation variant, then C += alpha * result. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
    fsub    f0, f0, f3
    fadd    f1, f1, f2
    fsub    f4, f4, f7
    fadd    f5, f5, f6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
    fadd    f0, f0, f3
    fsub    f1, f1, f2
    fadd    f4, f4, f7
    fsub    f5, f5, f6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
    fadd    f0, f0, f3
    fsub    f1, f2, f1
    fadd    f4, f4, f7
    fsub    f5, f6, f5
#else  /* RR, RC, CR, CC */
    fsub    f0, f0, f3
    fadd    f1, f1, f2
    fsub    f4, f4, f7
    fadd    f5, f5, f6
#endif

    LFD     f8,  0 * SIZE(CO1)
    LFD     f9,  1 * SIZE(CO1)
    LFD     f10, 0 * SIZE(CO2)
    LFD     f11, 1 * SIZE(CO2)

    lfs     f12, ALPHA_R + 0(SP)
    lfs     f13, ALPHA_I + 4(SP)

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
    fmadd   f8,  f12, f0, f8
    fnmsub  f9,  f12, f1, f9
    fmadd   f10, f12, f4, f10
    fnmsub  f11, f12, f5, f11

    fmadd   f8,  f13, f1, f8
    fmadd   f9,  f13, f0, f9
    fmadd   f10, f13, f5, f10
    fmadd   f11, f13, f4, f11
#else
    fmadd   f8,  f12, f0, f8
    fmadd   f9,  f12, f1, f9
    fmadd   f10, f12, f4, f10
    fmadd   f11, f12, f5, f11

    fnmsub  f8,  f13, f1, f8
    fmadd   f9,  f13, f0, f9
    fnmsub  f10, f13, f5, f10
    fmadd   f11, f13, f4, f11
#endif

    STFD    f8,  0 * SIZE(CO1)
    STFD    f9,  1 * SIZE(CO1)
    STFD    f10, 0 * SIZE(CO2)
    STFD    f11, 1 * SIZE(CO2)

LL(49):
/* End of the N-unrolled-by-2 (J) loop. */
    mr      B, BO
    addic.  J, J, -1
    bgt     LL(01)
    .align 4

LL(50):
/* N & 1 remainder. */
    andi.   J, N, 1
    ble     LL(999)
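For reference, below is a minimal C sketch of what the scalar M & 1 path (LL(40) through LL(48)) computes in the plain NN/NT/TN/TT case: one complex element of each of the two C columns updated as C += alpha * (a dot b). The function name, the interleaved re/im layout of the packed A and B buffers, and the two-columns-per-step B stride are illustrative assumptions, not part of the kernel source.

/* Hypothetical reference routine; names and packing layout are assumptions. */
#include <stddef.h>

static void zgemm_m1_n2_ref(size_t k,
                            const float *a,        /* packed A row: k complex values (re, im) */
                            const float *b,        /* packed B: k steps of {b1.re, b1.im, b2.re, b2.im} */
                            float *c1, float *c2,  /* one complex element per C column (re, im) */
                            float alpha_r, float alpha_i)
{
    float f0 = 0.f, f1 = 0.f, f2 = 0.f, f3 = 0.f;  /* column-1 partial products */
    float f4 = 0.f, f5 = 0.f, f6 = 0.f, f7 = 0.f;  /* column-2 partial products */

    for (size_t l = 0; l < k; l++) {
        float ar  = a[2 * l + 0], ai  = a[2 * l + 1];
        float b1r = b[4 * l + 0], b1i = b[4 * l + 1];
        float b2r = b[4 * l + 2], b2i = b[4 * l + 3];

        /* the eight fmadd accumulators f0..f7 of LL(42)/LL(46) */
        f0 += ar * b1r;  f1 += ai * b1r;  f2 += ar * b1i;  f3 += ai * b1i;
        f4 += ar * b2r;  f5 += ai * b2r;  f6 += ar * b2i;  f7 += ai * b2i;
    }

    /* NN-style recombination (the fsub/fadd group at LL(48)) */
    float t1r = f0 - f3, t1i = f1 + f2;
    float t2r = f4 - f7, t2i = f5 + f6;

    /* C += alpha * t (the fmadd/fnmsub group after the alpha loads) */
    c1[0] += alpha_r * t1r - alpha_i * t1i;
    c1[1] += alpha_r * t1i + alpha_i * t1r;
    c2[0] += alpha_r * t2r - alpha_i * t2i;
    c2[1] += alpha_r * t2i + alpha_i * t2r;
}

The vector paths above (LL(22), LL(32)) perform the same arithmetic four or two complex elements at a time, with the swap permute and the neg sign mask standing in for the explicit fsub/fadd recombination.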