gemm_kernel_altivec.s
    LOAD_A a5, OFFSET_4, AO
    LOAD_A a6, OFFSET_5, AO
    LOAD_A a7, OFFSET_6, AO
    LOAD_A a8, OFFSET_7, AO

    vspltw  bp2, b1, 3
    vmaddfp c01, a5, bp2, c01
    vmaddfp c02, a6, bp2, c02
    vmaddfp c03, a7, bp2, c03
    vmaddfp c04, a8, bp2, c04

    addi AO, AO, 32 * SIZE
    addi BO, BO,  4 * SIZE

    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_A a3, OFFSET_2, AO
    LOAD_A a4, OFFSET_3, AO
    LOAD_A a5, OFFSET_4, AO
    LOAD_A a6, OFFSET_5, AO
    LOAD_A a7, OFFSET_6, AO
    LOAD_A a8, OFFSET_7, AO
    LOAD_B b1, OFFSET_0, BO
    bdnz LL(133)
    .align 4

LL(135):  /* 16-row block: remaining (k & 3) iterations */
    andi. r0, J, 3
    ble+  LL(138)

    cmpwi cr0, r0, 3
    bne   LL(136)

    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02
    vmaddfp c03, a3, bp1, c03
    vmaddfp c04, a4, bp1, c04

    addi AO, AO, 16 * SIZE

    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_A a3, OFFSET_2, AO
    LOAD_A a4, OFFSET_3, AO

    vspltw  bp2, b1, 1
    vmaddfp c01, a1, bp2, c01
    vmaddfp c02, a2, bp2, c02
    vmaddfp c03, a3, bp2, c03
    vmaddfp c04, a4, bp2, c04

    addi AO, AO, 16 * SIZE

    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_A a3, OFFSET_2, AO
    LOAD_A a4, OFFSET_3, AO

    vspltw  bp1, b1, 2
    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02
    vmaddfp c03, a3, bp1, c03
    vmaddfp c04, a4, bp1, c04

    addi AO, AO, 16 * SIZE
    addi BO, BO,  3 * SIZE
    b LL(138)
    .align 4

LL(136):
    cmpwi cr0, r0, 2
    bne   LL(137)

    vspltw bp1, b1, 0
    vspltw bp2, b1, 1

    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02
    vmaddfp c03, a3, bp1, c03
    vmaddfp c04, a4, bp1, c04

    LOAD_A a1, OFFSET_4, AO
    LOAD_A a2, OFFSET_5, AO
    LOAD_A a3, OFFSET_6, AO
    LOAD_A a4, OFFSET_7, AO

    vmaddfp c01, a1, bp2, c01
    vmaddfp c02, a2, bp2, c02
    vmaddfp c03, a3, bp2, c03
    vmaddfp c04, a4, bp2, c04

    addi AO, AO, 32 * SIZE
    addi BO, BO,  2 * SIZE
    b LL(138)
    .align 4

LL(137):
    cmpwi cr0, r0, 1
    bne   LL(138)

    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02
    vmaddfp c03, a3, bp1, c03
    vmaddfp c04, a4, bp1, c04

    addi AO, AO, 16 * SIZE
    addi BO, BO,  1 * SIZE
    .align 4

LL(138):  /* C += alpha * acc; lvsr/vperm re-align the accumulators
             to CO1, which need not be 16-byte aligned */
    lvx  alpha, OFFSET_0, SP
    vxor VZERO, VZERO, VZERO

    lvx C1, OFFSET_0, CO1
    lvx C2, OFFSET_1, CO1
    lvx C3, OFFSET_2, CO1
    lvx C4, OFFSET_3, CO1
    lvx C5, OFFSET_4, CO1

    lvsr PERMRSHIFT1, 0, CO1

    vperm c00, VZERO, c01,   PERMRSHIFT1
    vperm c01, c01,   c02,   PERMRSHIFT1
    vperm c02, c02,   c03,   PERMRSHIFT1
    vperm c03, c03,   c04,   PERMRSHIFT1
    vperm c04, c04,   VZERO, PERMRSHIFT1

    vmaddfp c00, alpha, c00, C1
    vmaddfp c01, alpha, c01, C2
    vmaddfp c02, alpha, c02, C3
    vmaddfp c03, alpha, c03, C4
    vmaddfp c04, alpha, c04, C5

    stvx c00, OFFSET_0, CO1
    stvx c01, OFFSET_1, CO1
    stvx c02, OFFSET_2, CO1
    stvx c03, OFFSET_3, CO1
    stvx c04, OFFSET_4, CO1

    addi   CO1, CO1, 16 * SIZE
    addic. I, I, -1
    bgt+   LL(130)
    .align 4

LL(140):  /* M & 8: eight-row tail */
    andi. I, M, 8
    ble   LL(150)

    vxor c01, c01, c01
    vxor c02, c02, c02

    mr BO, B
    mr J, K

    andi. r0, B, 15          /* B not 16-byte aligned: peel k iterations */
    ble+  LL(141)

    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_B b1, OFFSET_0, BO

    vspltw bp1, b1, 2
    vspltw bp2, b1, 3

    addi AO, AO, 8 * SIZE
    addi BO, BO, SIZE

    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02

    subi  J, J, 1
    cmpwi cr0, J, 0
    ble   LL(148)

    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO

    addi AO, AO, 8 * SIZE
    addi BO, BO, SIZE

    vmaddfp c01, a1, bp2, c01
    vmaddfp c02, a2, bp2, c02

    subi  J, J, 1
    cmpwi cr0, J, 0
    ble   LL(148)
    .align 4

LL(141):
    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_A a3, OFFSET_2, AO
    LOAD_A a4, OFFSET_3, AO
    LOAD_A a5, OFFSET_4, AO
    LOAD_A a6, OFFSET_5, AO
    LOAD_A a7, OFFSET_6, AO
    LOAD_A a8, OFFSET_7, AO
    LOAD_B b1, OFFSET_0, BO

    srawi. r0, J, 2
    mtspr CTR, r0
    ble   LL(145)
    .align 4

LL(143):  /* main k loop for the eight-row tail, unrolled 4x */
    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02

    vspltw  bp2, b1, 1
    vmaddfp c01, a3, bp2, c01
    vmaddfp c02, a4, bp2, c02

    vspltw  bp1, b1, 2
    vmaddfp c01, a5, bp1, c01
    vmaddfp c02, a6, bp1, c02

    vspltw  bp2, b1, 3
    vmaddfp c01, a7, bp2, c01
    vmaddfp c02, a8, bp2, c02

    addi AO, AO, 32 * SIZE
    addi BO, BO,  4 * SIZE

    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_A a3, OFFSET_2, AO
    LOAD_A a4, OFFSET_3, AO
    LOAD_A a5, OFFSET_4, AO
    LOAD_A a6, OFFSET_5, AO
    LOAD_A a7, OFFSET_6, AO
    LOAD_A a8, OFFSET_7, AO
    LOAD_B b1, OFFSET_0, BO
    bdnz LL(143)
    .align 4

LL(145):  /* remaining (k & 3) iterations */
    andi. r0, J, 3
    ble+  LL(148)

    cmpwi cr0, r0, 3
    bne   LL(146)

    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02

    vspltw  bp2, b1, 1
    vmaddfp c01, a3, bp2, c01
    vmaddfp c02, a4, bp2, c02

    LOAD_A a1, OFFSET_4, AO
    LOAD_A a2, OFFSET_5, AO

    vspltw  bp1, b1, 2
    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02

    addi AO, AO, 24 * SIZE
    addi BO, BO,  3 * SIZE
    b LL(148)
    .align 4

LL(146):
    cmpwi cr0, r0, 2
    bne   LL(147)

    vspltw bp1, b1, 0
    vspltw bp2, b1, 1

    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02
    vmaddfp c01, a3, bp2, c01
    vmaddfp c02, a4, bp2, c02

    addi AO, AO, 16 * SIZE
    addi BO, BO,  2 * SIZE
    b LL(148)
    .align 4

LL(147):
    cmpwi cr0, r0, 1
    bne   LL(148)

    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01
    vmaddfp c02, a2, bp1, c02

    addi AO, AO, 8 * SIZE
    addi BO, BO, 1 * SIZE
    .align 4

LL(148):  /* C += alpha * acc for the eight-row tail */
    lvx  alpha, OFFSET_0, SP
    vxor VZERO, VZERO, VZERO

    lvx C1, OFFSET_0, CO1
    lvx C2, OFFSET_1, CO1
    lvx C3, OFFSET_2, CO1

    lvsr PERMRSHIFT1, 0, CO1

    vperm c00, VZERO, c01,   PERMRSHIFT1
    vperm c01, c01,   c02,   PERMRSHIFT1
    vperm c02, c02,   VZERO, PERMRSHIFT1

    vmaddfp c00, alpha, c00, C1
    vmaddfp c01, alpha, c01, C2
    vmaddfp c02, alpha, c02, C3

    stvx c00, OFFSET_0, CO1
    stvx c01, OFFSET_1, CO1
    stvx c02, OFFSET_2, CO1

    addi CO1, CO1, 8 * SIZE
    .align 4

LL(150):  /* M & 4: four-row tail (one vector accumulator) */
    andi. I, M, 4
    ble   LL(160)

    vxor c01, c01, c01

    mr BO, B
    mr J, K

    andi. r0, B, 15          /* B not 16-byte aligned: peel k iterations */
    ble+  LL(151)

    LOAD_A a1, OFFSET_0, AO
    LOAD_B b1, OFFSET_0, BO

    vspltw bp1, b1, 2
    vspltw bp2, b1, 3

    addi AO, AO, 4 * SIZE
    addi BO, BO, SIZE

    vmaddfp c01, a1, bp1, c01

    subi  J, J, 1
    cmpwi cr0, J, 0
    ble   LL(158)

    LOAD_A a1, OFFSET_0, AO

    addi AO, AO, 4 * SIZE
    addi BO, BO, SIZE

    vmaddfp c01, a1, bp2, c01

    subi  J, J, 1
    cmpwi cr0, J, 0
    ble   LL(158)
    .align 4

LL(151):
    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_A a3, OFFSET_2, AO
    LOAD_A a4, OFFSET_3, AO
    LOAD_B b1, OFFSET_0, BO

    srawi. r0, J, 2
    mtspr  CTR, r0
    ble    LL(155)
    .align 4

LL(153):
    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01

    vspltw  bp2, b1, 1
    vmaddfp c01, a2, bp2, c01

    vspltw  bp1, b1, 2
    vmaddfp c01, a3, bp1, c01

    vspltw  bp2, b1, 3
    vmaddfp c01, a4, bp2, c01

    addi AO, AO, 16 * SIZE
    addi BO, BO,  4 * SIZE

    LOAD_A a1, OFFSET_0, AO
    LOAD_A a2, OFFSET_1, AO
    LOAD_A a3, OFFSET_2, AO
    LOAD_A a4, OFFSET_3, AO
    LOAD_B b1, OFFSET_0, BO
    bdnz LL(153)
    .align 4

LL(155):
    andi. r0, J, 3
    ble+  LL(158)

    cmpwi cr0, r0, 3
    bne   LL(156)

    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01

    vspltw  bp2, b1, 1
    vmaddfp c01, a2, bp2, c01

    vspltw  bp1, b1, 2
    vmaddfp c01, a3, bp1, c01

    addi AO, AO, 12 * SIZE
    addi BO, BO,  3 * SIZE
    b LL(158)
    .align 4

LL(156):
    cmpwi cr0, r0, 2
    bne   LL(157)

    vspltw bp1, b1, 0
    vspltw bp2, b1, 1

    vmaddfp c01, a1, bp1, c01
    vmaddfp c01, a2, bp2, c01

    addi AO, AO, 8 * SIZE
    addi BO, BO, 2 * SIZE
    b LL(158)
    .align 4

LL(157):
    cmpwi cr0, r0, 1
    bne   LL(158)

    vspltw  bp1, b1, 0
    vmaddfp c01, a1, bp1, c01

    addi AO, AO, 4 * SIZE
    addi BO, BO, 1 * SIZE
    .align 4

LL(158):  /* C += alpha * acc for the four-row tail */
    lvx  alpha, OFFSET_0, SP
    vxor VZERO, VZERO, VZERO

    lvx C1, OFFSET_0, CO1
    lvx C2, OFFSET_1, CO1

    lvsr PERMRSHIFT1, 0, CO1

    vperm c00, VZERO, c01,   PERMRSHIFT1
    vperm c01, c01,   VZERO, PERMRSHIFT1

    vmaddfp c00, alpha, c00, C1
    vmaddfp c01, alpha, c01, C2

    stvx c00, OFFSET_0, CO1
    stvx c01, OFFSET_1, CO1

    addi CO1, CO1, 4 * SIZE
    .align 4

LL(160):  /* M & 2: two rows, scalar FPU path */
    andi. I, M, 2
    ble   LL(170)

    mr BO, B

    LFD f8,  0 * SIZE(AO)
    LFD f9,  1 * SIZE(AO)
    LFD f10, 2 * SIZE(AO)
    LFD f11, 3 * SIZE(AO)

    LFD f12, 0 * SIZE(B)
    LFD f13, 1 * SIZE(B)

    lfs f0, FZERO(SP)
    fmr f1, f0
    fmr f2, f0
    fmr f3, f0

    srawi. r0, K, 1
    mtspr  CTR, r0
    ble    LL(165)
    .align 4

LL(162):
    FMADD f0, f8,  f12, f0
    FMADD f1, f9,  f12, f1
    FMADD f2, f10, f13, f2
    FMADD f3, f11, f13, f3

    LFD f8,  4 * SIZE(AO)
    LFD f9,  5 * SIZE(AO)
    LFD f10, 6 * SIZE(AO)
    LFD f11, 7 * SIZE(AO)

    LFD f12, 2 * SIZE(BO)
    LFD f13, 3 * SIZE(BO)

    addi AO, AO, 4 * SIZE
    addi BO, BO, 2 * SIZE
    bdnz LL(162)
    .align 4

LL(165):
    andi. r0, K, 1
    lfs   f13, ALPHA(SP)
    ble   LL(168)
    .align 4

LL(166):
    FMADD f0, f8, f12, f0
    FMADD f1, f9, f12, f1

    addi AO, AO, 2 * SIZE
    addi BO, BO, 1 * SIZE
    .align 4

LL(168):
    LFD f8, 0 * SIZE(CO1)
    LFD f9, 1 * SIZE(CO1)

    FADD f0, f0, f2
    FADD f1, f1, f3

    FMADD f0, f0, f13, f8
    FMADD f1, f1, f13, f9

    STFD f0, 0 * SIZE(CO1)
    STFD f1, 1 * SIZE(CO1)
    addi CO1, CO1, 2 * SIZE
    .align 4

LL(170):  /* M & 1: last row */
    andi. I, M, 1
    ble   LL(999)

    mr BO, B

    LFD f8,  0 * SIZE(AO)
    LFD f9,  1 * SIZE(AO)

    LFD f10, 0 * SIZE(B)
    LFD f11, 1 * SIZE(B)

    lfs f0, FZERO(SP)
    fmr f1, f0

    srawi. r0, K, 1
    mtspr  CTR, r0
    ble    LL(175)
    .align 4

LL(172):
    FMADD f0, f8, f10, f0
    FMADD f1, f9, f11, f1

    LFD f8,  2 * SIZE(AO)
    LFD f9,  3 * SIZE(AO)
    LFD f10, 2 * SIZE(BO)
    LFD f11, 3 * SIZE(BO)

    addi AO, AO, 2 * SIZE
    addi BO, BO, 2 * SIZE
    bdnz LL(172)
    .align 4

LL(175):
    andi. r0, K, 1
    lfs f13, ALPHA(SP)
    ble LL(178)
    .align 4

LL(176):
    FMADD f0, f8, f10, f0

    addi AO, AO, 1 * SIZE
    addi BO, BO, 1 * SIZE
    .align 4

LL(178):
    LFD f8, 0 * SIZE(CO1)

    FADD  f0, f0, f1
    FMADD f0, f0, f13, f8

    STFD f0, 0 * SIZE(CO1)
    .align 4

LL(999):  /* epilogue: restore v20-v31, VRsave and the non-volatile GPRs */
    mr SP, STACK

    li  r0,  0 * 16
    lvx v20, SP, r0
    li  r0,  1 * 16
    lvx v21, SP, r0
    li  r0,  2 * 16
    lvx v22, SP, r0
    li  r0,  3 * 16
    lvx v23, SP, r0
    li  r0,  4 * 16
    lvx v24, SP, r0
    li  r0,  5 * 16
    lvx v25, SP, r0
    li  r0,  6 * 16
    lvx v26, SP, r0
    li  r0,  7 * 16
    lvx v27, SP, r0
    li  r0,  8 * 16
    lvx v28, SP, r0
    li  r0,  9 * 16
    lvx v29, SP, r0
    li  r0, 10 * 16
    lvx v30, SP, r0
    li  r0, 11 * 16
    lvx v31, SP, r0

    mtspr VRsave, VREG

#ifdef __64BIT__
    ld r31, 192(SP)
    ld r30, 200(SP)
    ld r29, 208(SP)
    ld r28, 216(SP)
    ld r27, 224(SP)
    ld r26, 232(SP)
    ld r25, 240(SP)
    ld r24, 248(SP)
    ld r23, 256(SP)
    ld r22, 264(SP)
    ld r21, 272(SP)
    ld r20, 280(SP)
    ld r19, 288(SP)
    ld r18, 296(SP)
    ld r17, 304(SP)
    ld r16, 312(SP)
    ld r15, 320(SP)
    ld r14, 328(SP)
#else
    lwz r31, 192(SP)
    lwz r30, 196(SP)
    lwz r29, 200(SP)
    lwz r28, 204(SP)
    lwz r27, 208(SP)
    lwz r26, 212(SP)
    lwz r25, 216(SP)
    lwz r24, 220(SP)
    lwz r23, 224(SP)
    lwz r22, 228(SP)
    lwz r21, 232(SP)
    lwz r20, 236(SP)
    lwz r19, 240(SP)
    lwz r18, 244(SP)
    lwz r17, 248(SP)
    lwz r16, 252(SP)
    lwz r15, 256(SP)
    lwz r14, 260(SP)
#endif

    addi SP, SP, STACKSIZE
    blr

    EPILOGUE
#endif
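
For reference, every N = 1 path above computes the same update: a block of mr
rows of C (mr = 16, 8 or 4 on the vector paths, 2 or 1 in the scalar tails)
gets c[i] += alpha * sum over l of a[l][i] * b[l]. A is consumed as a packed
panel in which each k step stores mr consecutive floats, which is what the
addi AO, AO, mr * SIZE strides express. A minimal plain-C sketch of that
contract, assuming the packed layout; the name gemm_n1_block and its
signature are illustrative, not part of this source:

/* Scalar reference for the n = 1 micro-kernel paths above (sketch). */
void gemm_n1_block(int mr, long k, float alpha,
                   const float *a,  /* packed panel: k * mr floats */
                   const float *b,  /* one column of B: k floats   */
                   float *c)        /* mr floats of C              */
{
    for (int i = 0; i < mr; i++) {
        float acc = 0.0f;
        for (long l = 0; l < k; l++)
            acc += a[l * (long)mr + i] * b[l];  /* the vmaddfp accumulations */
        c[i] += alpha * acc;   /* the LL(138)/LL(148)/LL(158) alpha update */
    }
}

In the assembly, vspltw broadcasts one element of b1 across all four lanes so
a single vmaddfp advances four rows at once, and the bdnz loops unroll this
four k steps per iteration.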
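
The lvsr/vperm sequences deserve a note: lvx and stvx silently truncate their
effective address to a 16-byte boundary, so the kernel loads the aligned
vectors covering C, shifts the accumulators right by (CO1 & 15) bytes, and
relies on the zero lanes shifted in at the edges; vmaddfp then produces
alpha * 0 + C there, leaving the bytes outside the tile unchanged. A hedged
AltiVec-intrinsics rendering of the four-float case; update_c4 and its
signature are mine, not the kernel's, and like the assembly it performs a
read-modify-write of the full 32-byte window around c:

#include <altivec.h>

void update_c4(float *c, vector float acc, vector float valpha)
{
    vector float vzero = vec_xor(acc, acc);       /* mirrors vxor VZERO,...  */
    vector unsigned char rshift = vec_lvsr(0, c); /* shift = c & 15 bytes    */

    vector float c1 = vec_ld(0, c);               /* aligned loads covering  */
    vector float c2 = vec_ld(16, c);              /* the unaligned span of C */

    vector float lo = vec_perm(vzero, acc, rshift);  /* zeros enter at the   */
    vector float hi = vec_perm(acc, vzero, rshift);  /* shifted-out edges    */

    c1 = vec_madd(valpha, lo, c1);   /* alpha * 0 + C keeps edge lanes as-is */
    c2 = vec_madd(valpha, hi, c2);

    vec_st(c1, 0, c);
    vec_st(c2, 16, c);
}

If c happens to be 16-byte aligned, rshift degenerates so that lo == acc and
hi == zero, and the second store rewrites its vector unchanged, exactly as
the assembly does.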