/* gemm_kernel_altivec_cell.s */
	addic.	J, J, -1
	bgt	LL(01)
	.align 4

LL(60):
	andi.	r0, N, 2
	ble	LL(120)

	mr	CO1, C
	add	CO2, C, LDC
	add	C, CO2, LDC
	mr	AO, A
	srawi.	I, M, 4
	ble	LL(80)
	.align 4

LL(71):
	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08
	mr	BO, B

	dcbtst	CO1, PREC
	dcbtst	CO2, PREC

	vspltw	bp1, b1, 0
	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(75)
	.align 4

LL(72):
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO

	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	vmaddfp	c01, a5, bp1, c01
	vspltw	bp2, b1, 3
	vmaddfp	c02, a6, bp1, c02
	vmaddfp	c03, a7, bp1, c03
	vmaddfp	c04, a8, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c05, a5, bp2, c05
	vmaddfp	c06, a6, bp2, c06
	vmaddfp	c07, a7, bp2, c07
	vmaddfp	c08, a8, bp2, c08

	addi	AO, AO, 32 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(72)
	.align 4

LL(75):
	andi.	r0, K, 1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(78)
	.align 4

LL(76):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 16 * SIZE
	vmaddfp	c03, a3, bp1, c03
	addi	BO, BO,  2 * SIZE
	vmaddfp	c04, a4, bp1, c04
	nop

	vmaddfp	c05, a1, bp2, c05
	vmaddfp	c06, a2, bp2, c06
	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08
	.align 4

LL(78):
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1
	lvx	C4, OFFSET_3, CO1
	lvx	C5, OFFSET_4, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, c02, PERMRSHIFT1
	vperm	c02, c02, c03, PERMRSHIFT1
	vperm	c03, c03, c04, PERMRSHIFT1
	vperm	c04, c04, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3
	vmaddfp	c03, alpha, c03, C4
	vmaddfp	c04, alpha, c04, C5

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1
	stvx	c03, OFFSET_3, CO1
	stvx	c04, OFFSET_4, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2
	lvx	C4, OFFSET_3, CO2
	lvx	C5, OFFSET_4, CO2

	vperm	c00, VZERO, c05, PERMRSHIFT2
	vperm	c05, c05, c06, PERMRSHIFT2
	vperm	c06, c06, c07, PERMRSHIFT2
	vperm	c07, c07, c08, PERMRSHIFT2
	vperm	c08, c08, VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3
	vmaddfp	c07, alpha, c07, C4
	vmaddfp	c08, alpha, c08, C5

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2
	stvx	c07, OFFSET_3, CO2
	stvx	c08, OFFSET_4, CO2

	addi	CO1, CO1, 16 * SIZE
	addi	CO2, CO2, 16 * SIZE
	addic.	I, I, -1
	bgt+	LL(71)
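
/*
 * Commentary added for readability; register roles are inferred from the
 * code above.  C is not assumed to be 16-byte aligned, so the kernel
 * cannot stvx the accumulators directly.  lvsr derives a permute control
 * vector from the low four bits of each C pointer, and the vperm chain
 * over (VZERO, c01, c02, ..., VZERO) shifts the accumulator stream so
 * every stvx lands on an aligned 16-byte slot; the zero vectors pad the
 * ragged ends, and because 0 * alpha + C = C, bytes outside the tile are
 * stored back unchanged.  The vmaddfp against the alpha vector then folds
 * the alpha scaling and the C accumulation into a single FMA per store.
 */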
	.align 4

LL(80):
	andi.	I, M, 8
	ble	LL(90)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	LOAD_A	a1, OFFSET_0, AO
	vxor	c04, c04, c04
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	LOAD_A	a3, OFFSET_2, AO
	vxor	c06, c06, c06
	LOAD_A	a4, OFFSET_3, AO
	vxor	c07, c07, c07
	vxor	c08, c08, c08
	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(85)
	.align 4

LL(82):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02

	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c06, a2, bp2, c06

	vmaddfp	c03, a3, bp1, c03
	vspltw	bp2, b1, 3
	vmaddfp	c04, a4, bp1, c04

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c07, a3, bp2, c07
	vmaddfp	c08, a4, bp2, c08

	addi	AO, AO, 16 * SIZE
	addi	BO, BO,  4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	bdnz	LL(82)
	.align 4

LL(85):
	andi.	r0, K, 1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(88)
	.align 4

LL(86):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c02, a2, bp1, c02
	addi	AO, AO, 8 * SIZE
	vmaddfp	c05, a1, bp2, c05
	addi	BO, BO, 2 * SIZE
	vmaddfp	c06, a2, bp2, c06
	.align 4

LL(88):
	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1
	lvx	C3, OFFSET_2, CO1

	vaddfp	c01, c01, c03
	vaddfp	c02, c02, c04
	vaddfp	c05, c05, c07
	vaddfp	c06, c06, c08

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, c02, PERMRSHIFT1
	vperm	c02, c02, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2
	vmaddfp	c02, alpha, c02, C3

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1
	stvx	c02, OFFSET_2, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2
	lvx	C3, OFFSET_2, CO2

	vperm	c00, VZERO, c05, PERMRSHIFT2
	vperm	c05, c05, c06, PERMRSHIFT2
	vperm	c06, c06, VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2
	vmaddfp	c06, alpha, c06, C3

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2
	stvx	c06, OFFSET_2, CO2

	addi	CO1, CO1, 8 * SIZE
	addi	CO2, CO2, 8 * SIZE
	.align 4

LL(90):
	andi.	I, M, 4
	ble	LL(100)

	vxor	c01, c01, c01
	LOAD_B	b1, OFFSET_0, B
	vxor	c02, c02, c02
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	vxor	c05, c05, c05
	vxor	c06, c06, c06
	mr	BO, B

	vspltw	bp1, b1, 0
	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(95)
	.align 4

LL(92):
	vmaddfp	c01, a1, bp1, c01
	vspltw	bp2, b1, 1
	vmaddfp	c05, a1, bp2, c05
	vspltw	bp1, b1, 2
	vmaddfp	c02, a2, bp1, c02
	vspltw	bp2, b1, 3

	LOAD_B	b1, OFFSET_1, BO
	vspltw	bp1, b1, 0

	vmaddfp	c06, a2, bp2, c06

	addi	AO, AO, 8 * SIZE
	addi	BO, BO, 4 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	bdnz	LL(92)
	.align 4

LL(95):
	andi.	r0, K, 1
	lvx	alpha, OFFSET_0, SP
	vxor	VZERO, VZERO, VZERO
	ble+	LL(98)
	.align 4

LL(96):
	vspltw	bp2, b1, 1
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c05, a1, bp2, c05
	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 2 * SIZE
	.align 4

LL(98):
	vaddfp	c01, c01, c02
	vaddfp	c05, c05, c06
	vaddfp	c09, c09, c10
	vaddfp	c13, c13, c14

	lvx	C1, OFFSET_0, CO1
	lvx	C2, OFFSET_1, CO1

	lvsr	PERMRSHIFT1, 0, CO1
	lvsr	PERMRSHIFT2, 0, CO2
	lvsr	PERMRSHIFT3, 0, CO3
	lvsr	PERMRSHIFT4, 0, CO4

	vperm	c00, VZERO, c01, PERMRSHIFT1
	vperm	c01, c01, VZERO, PERMRSHIFT1

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c01, alpha, c01, C2

	stvx	c00, OFFSET_0, CO1
	stvx	c01, OFFSET_1, CO1

	lvx	C1, OFFSET_0, CO2
	lvx	C2, OFFSET_1, CO2

	vperm	c00, VZERO, c05, PERMRSHIFT2
	vperm	c05, c05, VZERO, PERMRSHIFT2

	vmaddfp	c00, alpha, c00, C1
	vmaddfp	c05, alpha, c05, C2

	stvx	c00, OFFSET_0, CO2
	stvx	c05, OFFSET_1, CO2

	addi	CO1, CO1, 4 * SIZE
	addi	CO2, CO2, 4 * SIZE
	.align 4
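
/*
 * The remaining M & 2 and M & 1 edges below fall back to the scalar FPU
 * (LFD/FMADD/STFD): with at most two floats per column there is nothing
 * left to vectorize, and scalar accesses need no alignment fix-up.
 * FZERO and ALPHA are stack slots that, from the surrounding code,
 * evidently hold 0.0f and the scalar alpha.
 */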
LL(100):
	andi.	I, M, 2
	ble	LL(110)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 0 * SIZE(B)
	LFD	f11, 1 * SIZE(B)
	LFD	f12, 2 * SIZE(B)
	LFD	f13, 3 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0
	fmr	f4, f0
	fmr	f5, f0
	fmr	f6, f0
	fmr	f7, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(105)
	.align 4

LL(102):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f9, f10, f1
	FMADD	f2, f8, f11, f2
	FMADD	f3, f9, f11, f3

	LFD	f8, 2 * SIZE(AO)
	LFD	f9, 3 * SIZE(AO)

	FMADD	f4, f8, f12, f4
	FMADD	f5, f9, f12, f5
	FMADD	f6, f8, f13, f6
	FMADD	f7, f9, f13, f7

	LFD	f8,  4 * SIZE(AO)
	LFD	f9,  5 * SIZE(AO)
	LFD	f10, 4 * SIZE(BO)
	LFD	f11, 5 * SIZE(BO)
	LFD	f12, 6 * SIZE(BO)
	LFD	f13, 7 * SIZE(BO)

	addi	AO, AO, 4 * SIZE
	addi	BO, BO, 4 * SIZE
	bdnz	LL(102)
	.align 4

LL(105):
	andi.	r0, K, 1
	lfs	f13, ALPHA(SP)
	ble	LL(108)
	.align 4

LL(106):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f9, f10, f1
	FMADD	f2, f8, f11, f2
	FMADD	f3, f9, f11, f3

	LFD	f8,  2 * SIZE(AO)
	LFD	f9,  3 * SIZE(AO)
	LFD	f10, 2 * SIZE(BO)
	LFD	f11, 3 * SIZE(BO)

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 2 * SIZE
	.align 4

LL(108):
	LFD	f8,  0 * SIZE(CO1)
	LFD	f9,  1 * SIZE(CO1)
	LFD	f10, 0 * SIZE(CO2)
	LFD	f11, 1 * SIZE(CO2)

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7

	FMADD	f0, f0, f13, f8
	FMADD	f1, f1, f13, f9
	FMADD	f2, f2, f13, f10
	FMADD	f3, f3, f13, f11

	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 1 * SIZE(CO1)
	STFD	f2, 0 * SIZE(CO2)
	STFD	f3, 1 * SIZE(CO2)

	addi	CO1, CO1, 2 * SIZE
	addi	CO2, CO2, 2 * SIZE
	.align 4

LL(110):
	andi.	I, M, 1
	ble	LL(119)

	mr	BO, B

	LFD	f8,  0 * SIZE(AO)
	LFD	f9,  1 * SIZE(AO)
	LFD	f10, 0 * SIZE(B)
	LFD	f11, 1 * SIZE(B)
	LFD	f12, 2 * SIZE(B)
	LFD	f13, 3 * SIZE(B)

	lfs	f0, FZERO(SP)
	fmr	f1, f0
	fmr	f2, f0
	fmr	f3, f0

	srawi.	r0, K, 1
	mtspr	CTR, r0
	ble	LL(115)
	.align 4

LL(112):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f8, f11, f1
	FMADD	f2, f9, f12, f2
	FMADD	f3, f9, f13, f3

	LFD	f8,  2 * SIZE(AO)
	LFD	f9,  3 * SIZE(AO)
	LFD	f10, 4 * SIZE(BO)
	LFD	f11, 5 * SIZE(BO)
	LFD	f12, 6 * SIZE(BO)
	LFD	f13, 7 * SIZE(BO)

	addi	AO, AO, 2 * SIZE
	addi	BO, BO, 4 * SIZE
	bdnz	LL(112)
	.align 4

LL(115):
	andi.	r0, K, 1
	lfs	f13, ALPHA(SP)
	ble	LL(118)
	.align 4

LL(116):
	FMADD	f0, f8, f10, f0
	FMADD	f1, f8, f11, f1

	LFD	f8,  1 * SIZE(AO)
	LFD	f10, 2 * SIZE(BO)
	LFD	f11, 3 * SIZE(BO)

	addi	AO, AO, 1 * SIZE
	addi	BO, BO, 2 * SIZE
	.align 4

LL(118):
	LFD	f8, 0 * SIZE(CO1)
	LFD	f9, 0 * SIZE(CO2)

	FADD	f0, f0, f2
	FADD	f1, f1, f3

	FMADD	f0, f0, f13, f8
	FMADD	f1, f1, f13, f9

	STFD	f0, 0 * SIZE(CO1)
	STFD	f1, 0 * SIZE(CO2)
	.align 4

LL(119):
	mr	B, BO
	.align 4
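
/*
 * Final single column (N & 1).  BO now advances one float per k iteration,
 * so the packed B panel can sit off the 16-byte lvx granularity.  When
 * (B & 15) != 0, the prologue of LL(130) peels up to two k iterations,
 * multiplying by elements 2 and 3 of the first loaded b vector, before
 * the unrolled-by-4 main loop at LL(133) takes over with aligned B loads.
 */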
LL(120):
	andi.	r0, N, 1
	ble	LL(999)

	mr	CO1, C
	mr	AO, A
	srawi.	I, M, 4
	ble	LL(140)
	.align 4

LL(130):
	vxor	c01, c01, c01
	vxor	c02, c02, c02
	vxor	c03, c03, c03
	vxor	c04, c04, c04
	mr	BO, B

	dcbtst	CO1, PREC

	mr	J, K

	andi.	r0, B, 15
	ble+	LL(131)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_B	b1, OFFSET_0, BO

	vspltw	bp1, b1, 2
	vspltw	bp2, b1, 3

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	addi	AO, AO, 16 * SIZE
	addi	BO, BO, SIZE

	vmaddfp	c01, a1, bp2, c01
	vmaddfp	c02, a2, bp2, c02
	vmaddfp	c03, a3, bp2, c03
	vmaddfp	c04, a4, bp2, c04

	subi	J, J, 1
	cmpwi	cr0, J, 0
	ble	LL(138)
	.align 4

LL(131):
	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO
	LOAD_A	a5, OFFSET_4, AO
	LOAD_A	a6, OFFSET_5, AO
	LOAD_A	a7, OFFSET_6, AO
	LOAD_A	a8, OFFSET_7, AO
	LOAD_B	b1, OFFSET_0, BO

	srawi.	r0, J, 2
	mtspr	CTR, r0
	ble	LL(135)
	.align 4

LL(133):
	vspltw	bp1, b1, 0
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04

	vspltw	bp2, b1, 1
	vmaddfp	c01, a5, bp2, c01
	vmaddfp	c02, a6, bp2, c02
	vmaddfp	c03, a7, bp2, c03
	vmaddfp	c04, a8, bp2, c04

	addi	AO, AO, 32 * SIZE

	LOAD_A	a1, OFFSET_0, AO
	LOAD_A	a2, OFFSET_1, AO
	LOAD_A	a3, OFFSET_2, AO
	LOAD_A	a4, OFFSET_3, AO

	vspltw	bp1, b1, 2
	vmaddfp	c01, a1, bp1, c01
	vmaddfp	c02, a2, bp1, c02
	vmaddfp	c03, a3, bp1, c03
	vmaddfp	c04, a4, bp1, c04
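
/*
 * For reference, a minimal scalar C sketch of the update this kernel
 * performs, C := alpha * A * B + C on packed panels.  The function name
 * and the exact packed layouts are illustrative assumptions, not part of
 * this file.
 *
 *     // m x k panel of A (column-major), k x n panel of B (row-major),
 *     // C column-major with leading dimension ldc.
 *     static void sgemm_kernel_ref(long m, long n, long k, float alpha,
 *                                  const float *a, const float *b,
 *                                  float *c, long ldc)
 *     {
 *         for (long j = 0; j < n; j++)
 *             for (long i = 0; i < m; i++) {
 *                 float s = 0.0f;
 *                 for (long l = 0; l < k; l++)
 *                     s += a[i + l * m] * b[l * n + j];
 *                 c[i + j * ldc] += alpha * s;  // beta pass handled elsewhere
 *             }
 *     }
 */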