gemm_kernel_altivec_g4.s
/* (continued from the section above: write-back of the 16 x 4 tile,
   C = alpha * A*B + C, realigned with vperm for unaligned columns of C) */

	vmaddfp	c03, alpha, c03, C4
	lvx	C4, OFFSET_3, CO3
	vmaddfp	c04, alpha, c04, C5
	lvx	C5, OFFSET_4, CO3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2

	vperm	c00, VZERO, c05, PERMRSHIFT2
	vperm	c05, c05, c06, PERMRSHIFT2
	vperm	c06, c06, c07, PERMRSHIFT2
	vperm	c07, c07, c08, PERMRSHIFT2
	vperm	c08, c08, VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C6
	lvx	C6, OFFSET_1, CO4
	vmaddfp	c06, alpha, c06, C7
	lvx	C7, OFFSET_2, CO4
	vmaddfp	c07, alpha, c07, C8
	lvx	C8, OFFSET_3, CO4
	vmaddfp	c08, alpha, c08, C9
	lvx	C9, OFFSET_4, CO4

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2

	lvx	C1, OFFSET_0, CO3

	vperm	c00, VZERO, c09, PERMRSHIFT3
	vperm	c09, c09, c10, PERMRSHIFT3
	vperm	c10, c10, c11, PERMRSHIFT3
	vperm	c11, c11, c12, PERMRSHIFT3
	vperm	c12, c12, VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3
	vmaddfp	c11, alpha, c11, C4
	vmaddfp	c12, alpha, c12, C5

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3
	stvx	c11, OFFSET_3, CO3
	stvx	c12, OFFSET_4, CO3

	lvx	C1, OFFSET_0, CO4

	vperm	c00, VZERO, c13, PERMRSHIFT4
	vperm	c13, c13, c14, PERMRSHIFT4
	vperm	c14, c14, c15, PERMRSHIFT4
	vperm	c15, c15, c16, PERMRSHIFT4
	vperm	c16, c16, VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C6
	vmaddfp	c14, alpha, c14, C7
	vmaddfp	c15, alpha, c15, C8
	vmaddfp	c16, alpha, c16, C9

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4
	stvx	c15, OFFSET_3, CO4
	stvx	c16, OFFSET_4, CO4

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addi	CO3, CO3, 16 * SIZE
	addi	CO4, CO4, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(11)
	.align 4

LL(20):	/* M & 8: 8 x 4 tile */
	andi.	I, M, 8
	ble	LL(30)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c09, c09, c09
	LOAD_B	b1, OFFSET_0, B
	vxor	c10, c10, c10
	LOAD_B	b2, OFFSET_1, B
	vxor	c13, c13, c13
	vxor	c14, c14, c14
	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(25)
	.align 4

LL(22):	/* inner loop, K unrolled by 2 */
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	addi	AO, AO, 16 * SIZE
	vmaddfp	c02, a2, bp1, c02
	addi	BO, BO, 8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	LOAD_B	b1, OFFSET_0, BO
	vmaddfp	c10, a2, bp1, c10
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO

	vmaddfp	c01, a3, bp1, c01
	vspltw	bp2, b2, 1
	vmaddfp	c02, a4, bp1, c02
	vmaddfp	c05, a3, bp2, c05
	vspltw	bp1, b2, 2
	vmaddfp	c06, a4, bp2, c06
	vmaddfp	c09, a3, bp1, c09
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c10, a4, bp1, c10
	vmaddfp	c13, a3, bp2, c13
	LOAD_A	a3, OFFSET_2, AO
	vmaddfp	c14, a4, bp2, c14
	LOAD_A	a4, OFFSET_3, AO
	vspltw	bp1, b1, 0
	bdnz	LL(22)
	.align 4

LL(25):	/* K odd: one trailing rank-1 update */
	andi.	r0, K, 1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(28)
	.align 4

LL(26):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	nop
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	nop
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c10, a2, bp1, c10
	addi	AO, AO, 8 * SIZE
	vmaddfp	c13, a1, bp2, c13
	addi	BO, BO, 4 * SIZE
	vmaddfp	c14, a2, bp2, c14
	nop
	.align 4

LL(28):	/* write back the 8 x 4 tile */
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, c02, PERMRSHIFT1
	vperm	c02, c02, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c05, PERMRSHIFT2
	vperm	c05, c05, c06, PERMRSHIFT2
	vperm	c06, c06, VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2

	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3
	lvx	C3, OFFSET_2, CO3

	vperm	c00, VZERO, c09, PERMRSHIFT3
	vperm	c09, c09, c10, PERMRSHIFT3
	vperm	c10, c10, VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2
	vmaddfp	c10, alpha, c10, C3

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3
	stvx	c10, OFFSET_2, CO3

	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4
	lvx	C3, OFFSET_2, CO4

	vperm	c00, VZERO, c13, PERMRSHIFT4
	vperm	c13, c13, c14, PERMRSHIFT4
	vperm	c14, c14, VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2
	vmaddfp	c14, alpha, c14, C3

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4
	stvx	c14, OFFSET_2, CO4

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	addi	CO3, CO3, 8 * SIZE
	addi	CO4, CO4, 8 * SIZE
	.align 4

LL(30):	/* M & 4: 4 x 4 tile */
	andi.	I, M, 4
	ble	LL(40)

	vxor	c01, c01, c01
	LOAD_A	a1, OFFSET_0, AO
	vxor	c02, c02, c02
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_B	b1, OFFSET_0, B
	vxor	c06, c06, c06
	LOAD_B	b2, OFFSET_1, B
	vxor	c09, c09, c09
	vxor	c10, c10, c10
	vxor	c13, c13, c13
	vxor	c14, c14, c14

	vspltw	bp1, b1, 0
	mr	BO, B

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(35)
	.align 4

LL(32):
	vmaddfp	c01, a1, bp1, c01
	addi	AO, AO, 8 * SIZE
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO, 8 * SIZE
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	LOAD_A	a1, OFFSET_0, AO
	vspltw	bp1, b2, 0
	LOAD_B	b1, OFFSET_0, BO

	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b2, 1
	vmaddfp	c06, a2, bp2, c06
	vspltw	bp1, b2, 2
	vmaddfp	c10, a2, bp1, c10
	vspltw	bp2, b2, 3
	LOAD_B	b2, OFFSET_1, BO
	vmaddfp	c14, a2, bp2, c14
	LOAD_A	a2, OFFSET_1, AO
	vspltw	bp1, b1, 0
	bdnz	LL(32)
	.align 4

LL(35):
	andi.	r0, K, 1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(38)
	.align 4

LL(36):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c09, a1, bp1, c09
	vspltw	bp2, b1, 3
	vmaddfp	c13, a1, bp2, c13
	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 4 * SIZE
	.align 4

LL(38):	/* fold the two partial sums, then write back the 4 x 4 tile */
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c05, PERMRSHIFT2
	vperm	c05, c05, VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2

	lvx	C1, OFFSET_0, CO3
	lvx	C2, OFFSET_1, CO3

	vperm	c00, VZERO, c09, PERMRSHIFT3
	vperm	c09, c09, VZERO, PERMRSHIFT3

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c09, alpha, c09, C2

	stvx	c00, OFFSET_0, CO3
	stvx	c09, OFFSET_1, CO3

	lvx	C1, OFFSET_0, CO4
	lvx	C2, OFFSET_1, CO4

	vperm	c00, VZERO, c13, PERMRSHIFT4
	vperm	c13, c13, VZERO, PERMRSHIFT4

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c13, alpha, c13, C2

	stvx	c00, OFFSET_0, CO4
	stvx	c13, OFFSET_1, CO4

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	addi	CO3, CO3, 4 * SIZE
	addi	CO4, CO4, 4 * SIZE
	.align 4

LL(40):	/* M & 2: 2 x 4 tile on the scalar FPU */
	andi.	I, M, 2
	ble	LL(50)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 0 * SIZE(B)
	LFD	f11, 1 * SIZE(B)
	LFD	f12, 2 * SIZE(B)
	LFD	f13, 3 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	fmr	f4, f0
	fmr	f5, f0
	fmr	f6, f0
	fmr	f7, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(45)
	.align 4

LL(42):
	FMADD	f0, f8, f10, f0
	FMADD	f2, f8, f11, f2
	FMADD	f4, f8, f12, f4
	FMADD	f6, f8, f13, f6

	FMADD	f1, f9, f10, f1
	FMADD	f3, f9, f11, f3
	FMADD	f5, f9, f12, f5
	FMADD	f7, f9, f13, f7

	LFD	f8,  2 * SIZE(AO)
	LFD	f9,  3 * SIZE(AO)
	LFD	f10, 4 * SIZE(BO)
	LFD	f11, 5 * SIZE(BO)
	LFD	f12, 6 * SIZE(BO)
	LFD	f13, 7 * SIZE(BO)

	FMADD	f0, f8, f10, f0
	FMADD	f2, f8, f11, f2
	FMADD	f4, f8, f12, f4
	FMADD	f6, f8, f13, f6

	FMADD	f1, f9, f10, f1
	FMADD	f3, f9, f11, f3
	FMADD	f5, f9, f12, f5
	FMADD	f7, f9, f13, f7

	LFD	f8,  4 * SIZE(AO)
	LFD	f9,  5 * SIZE(AO)
	LFD	f10, 8 * SIZE(BO)
	LFD	f11, 9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 8 * SIZE
	bdnz	LL(42)
	.align 4

LL(45):
	andi.	r0, K, 1
	ble	LL(48)
	.align 4

LL(46):
	FMADD	f0, f8, f10, f0
	FMADD	f2, f8, f11, f2
	FMADD	f4, f8, f12, f4
	FMADD	f6, f8, f13, f6

	FMADD	f1, f9, f10, f1
	FMADD	f3, f9, f11, f3
	FMADD	f5, f9, f12, f5
	FMADD	f7, f9, f13, f7

	LFD	f8,  2 * SIZE(AO)
	LFD	f9,  3 * SIZE(AO)
	LFD	f10, 4 * SIZE(BO)
	LFD	f11, 5 * SIZE(BO)
	LFD	f12, 6 * SIZE(BO)
	LFD	f13, 7 * SIZE(BO)

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 4 * SIZE
	.align 4

LL(48):	/* write back the 2 x 4 tile */
	lfs	f13, ALPHA(SP)

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	FMADD	f0, f0, f13, f8
	FMADD	f1, f1, f13, f9
	FMADD	f2, f2, f13, f10
	FMADD	f3, f3, f13, f11

	LFD	f8,  0 * SIZE(CO3)
	LFD	f9,  1 * SIZE(CO3)
	LFD	f10, 0 * SIZE(CO4)
	LFD	f11, 1 * SIZE(CO4)

	FMADD	f4, f4, f13, f8
	FMADD	f5, f5, f13, f9
	FMADD	f6, f6, f13, f10
	FMADD	f7, f7, f13, f11

	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 1 * SIZE(CO1)
	STFD	f2, 0 * SIZE(CO2)
	STFD	f3, 1 * SIZE(CO2)
	STFD	f4, 0 * SIZE(CO3)
	STFD	f5, 1 * SIZE(CO3)
	STFD	f6, 0 * SIZE(CO4)
	STFD	f7, 1 * SIZE(CO4)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	addi	CO3, CO3, 2 * SIZE
	addi	CO4, CO4, 2 * SIZE
	.align 4

LL(50):	/* M & 1: 1 x 4 tile on the scalar FPU */
	andi.	I, M, 1
	ble	LL(59)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 0 * SIZE(B)
	LFD	f11, 1 * SIZE(B)
	LFD	f12, 2 * SIZE(B)
	LFD	f13, 3 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(55)
	.align 4

LL(52):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f8, f11, f1
	FMADD	f2, f8, f12, f2
	FMADD	f3, f8, f13, f3

	LFD	f8,  2 * SIZE(AO)
	LFD	f10, 4 * SIZE(BO)
	LFD	f11, 5 * SIZE(BO)
	LFD	f12, 6 * SIZE(BO)
	LFD	f13, 7 * SIZE(BO)

	FMADD	f0, f9, f10, f0
	FMADD	f1, f9, f11, f1
	FMADD	f2, f9, f12, f2
	FMADD	f3, f9, f13, f3

	LFD	f9,  3 * SIZE(AO)
	LFD	f10, 8 * SIZE(BO)
	LFD	f11, 9 * SIZE(BO)
	LFD	f12, 10 * SIZE(BO)
	LFD	f13, 11 * SIZE(BO)

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 8 * SIZE
	bdnz	LL(52)
	.align 4

LL(55):
	andi.	r0, K, 1
	ble	LL(58)
	.align 4

LL(56):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f8, f11, f1
	FMADD	f2, f8, f12, f2
	FMADD	f3, f8, f13, f3

	LFD	f8,  2 * SIZE(AO)
	LFD	f10, 4 * SIZE(BO)
	LFD	f11, 5 * SIZE(BO)
	LFD	f12, 6 * SIZE(BO)
	LFD	f13, 7 * SIZE(BO)

	addi	AO, AO, 1 * SIZE
	addi	BO, BO, 4 * SIZE
	.align 4

LL(58):	/* write back the 1 x 4 tile */
	lfs	f13, ALPHA(SP)

	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  0 * SIZE(CO2)
	LFD	f10, 0 * SIZE(CO3)
	LFD	f11, 0 * SIZE(CO4)

	FMADD	f0, f0, f13, f8
	FMADD	f1, f1, f13, f9
	FMADD	f2, f2, f13, f10
	FMADD	f3, f3, f13, f11

	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 0 * SIZE(CO2)
	STFD	f2, 0 * SIZE(CO3)
	STFD	f3, 0 * SIZE(CO4)
	.align 4

LL(59):	/* advance to the next group of 4 columns */
	mr	B, BO
	addic.	J, J, -1
	bgt	LL(01)
	.align 4

LL(60):	/* N & 2: two remaining columns */
	andi.	r0, N, 2
	ble	LL(120)

	mr	CO1, C
	add	CO2, C, LDC
	add	C, CO2, LDC
	mr	AO, A
	srawi.	I, M, 4
	ble	LL(80)
	.align 4

LL(71):	/* 16 x 2 tile: clear accumulators, prime loads */
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
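
/*
 * For reference: the code above implements the single-precision update
 * C = alpha * A * B + C.  vspltw broadcasts one element of the packed B
 * panel across a vector register, vmaddfp folds the resulting rank-1
 * update into the c01..c16 accumulators, and the lvsr/vperm pairs shift
 * the accumulators so stvx can store to possibly unaligned columns of C.
 * The disabled block below is a minimal scalar C sketch of the same
 * computation; the function name, signature, and packed-panel layout are
 * illustrative assumptions, not this kernel's actual ABI.
 */
#if 0
/* Hypothetical scalar model of one call into this kernel:
 * A is an m x k packed panel (column-major), B a k x n packed panel
 * (n consecutive values per k step), C the m x n output tile with
 * leading dimension ldc. */
static void gemm_kernel_ref(long m, long n, long k, float alpha,
                            const float *A, const float *B,
                            float *C, long ldc)
{
    for (long j = 0; j < n; j++)            /* columns: CO1..CO4        */
        for (long i = 0; i < m; i++) {      /* rows, tiled 16/8/4/2/1   */
            float acc = 0.0f;               /* vxor c01, c01, c01 ...   */
            for (long l = 0; l < k; l++)    /* CTR loop, unrolled by 2  */
                acc += A[i + l * m] * B[j + l * n];   /* vmaddfp        */
            C[i + j * ldc] += alpha * acc;  /* alpha merge + store      */
        }
}
#endif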