📄 zgemv_n_ppc440.s
字号:
/*
 * Tail of the complex GEMV-N (y += alpha * A * x) kernel for PowerPC 440.
 * Recovered from a newline-stripped scrape; instructions reformatted one per
 * line, tokens unchanged.  Register names (AO1..AO4, Y1, Y2, X, INCX, INCY,
 * J, M, N, LDA, SP, alpha1r/alpha1i.., a1..a8, y01..y16, SIZE, STACKSIZE,
 * ALPHA_R/ALPHA_I, BUFFER, YY, PREA) and the FMADD/FMSUBX/FMADDR/... macros
 * are defined earlier in the file, outside this view.
 *
 * Visible structure: remainder handling for the 4-column pass (LL(17)..LL(19)),
 * then a 2-column pass (LL(20)..LL(27)), a 1-column pass (LL(30)..LL(37)),
 * a strided-y copy-back (LL(990)..LL(996)), and the epilogue (LL(999)).
 * Each complex element update uses an FMADD/FMSUBX/FMADDX quartet:
 *   y_re += ar*xr - ai*xi,  y_im += ai*xr + ar*xi  (signs folded into macros).
 */

/* ---- continuation of the (M & 2) remainder of the 4-column pass: ----
 * columns 3 and 4 contribute to two complex y elements, then store them.
 * (The start of this remainder section is above this view.)            */
	LFDU	a8, 1 * SIZE(AO4)
	FMADD	y01, alpha3r, a1, y01
	FMADD	y02, alpha3i, a1, y02
	FMADD	y03, alpha3r, a3, y03
	FMADD	y04, alpha3i, a3, y04
	FMSUBX	y01, alpha3i, a2, y01
	FMADDX	y02, alpha3r, a2, y02
	FMSUBX	y03, alpha3i, a4, y03
	FMADDX	y04, alpha3r, a4, y04
	FMADD	y01, alpha4r, a5, y01
	FMADD	y02, alpha4i, a5, y02
	FMADD	y03, alpha4r, a7, y03
	FMADD	y04, alpha4i, a7, y04
	FMSUBX	y01, alpha4i, a6, y01
	FMADDX	y02, alpha4r, a6, y02
	FMSUBX	y03, alpha4i, a8, y03
	FMADDX	y04, alpha4r, a8, y04
	STFDU	y01, 1 * SIZE(Y2)
	STFDU	y02, 1 * SIZE(Y2)
	STFDU	y03, 1 * SIZE(Y2)
	STFDU	y04, 1 * SIZE(Y2)
	.align 4

/* ---- (M & 1) remainder of the 4-column pass: one complex y element
 * accumulates contributions from all four current columns.            */
LL(17):
	andi.	r0, M, 1
	ble	LL(19)

	LFDU	y01, 1 * SIZE(Y1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a1, 1 * SIZE(AO1)
	LFDU	a2, 1 * SIZE(AO1)
	LFDU	a3, 1 * SIZE(AO2)
	LFDU	a4, 1 * SIZE(AO2)

	FMADD	y01, alpha1r, a1, y01
	LFDU	a5, 1 * SIZE(AO3)
	FMADD	y02, alpha1i, a1, y02
	LFDU	a6, 1 * SIZE(AO3)
	FMSUBX	y01, alpha1i, a2, y01
	LFDU	a7, 1 * SIZE(AO4)
	FMADDX	y02, alpha1r, a2, y02
	LFDU	a8, 1 * SIZE(AO4)

	FMADD	y01, alpha2r, a3, y01
	FMADD	y02, alpha2i, a3, y02
	FMSUBX	y01, alpha2i, a4, y01
	FMADDX	y02, alpha2r, a4, y02
	FMADD	y01, alpha3r, a5, y01
	FMADD	y02, alpha3i, a5, y02
	FMSUBX	y01, alpha3i, a6, y01
	FMADDX	y02, alpha3r, a6, y02
	FMADD	y01, alpha4r, a7, y01
	FMADD	y02, alpha4i, a7, y02
	FMSUBX	y01, alpha4i, a8, y01
	FMADDX	y02, alpha4r, a8, y02

	STFDU	y01, 1 * SIZE(Y2)
	STFDU	y02, 1 * SIZE(Y2)
	.align 4

/* ---- 4-column loop control: next group of 4 columns while J > 0 ---- */
LL(19):
	addi	J, J, -1
	cmpi	cr0, 0, J, 0
	bgt	LL(11)
	.align 4

/* ---- (N & 2) pass: process two remaining columns.  Load alpha and two
 * complex x elements, form the per-column complex scale factors
 * alpha{1,2}{r,i} = alpha * x[k], and set up the column pointers.      */
LL(20):
	andi.	J, N, 2
	ble	LL(30)

	lfd	alpha_r, ALPHA_R
	lfd	alpha_i, ALPHA_I

	LFDUX	a1, X, INCX
	LFDU	a2, 1 * SIZE(X)
	LFDUX	a3, X, INCX
	LFDU	a4, 1 * SIZE(X)

	FMUL	alpha1r, alpha_r, a1
	mr	Y1, YY
	FMUL	alpha1i, alpha_i, a1
	mr	Y2, YY
	FMUL	alpha2r, alpha_r, a3
	mr	AO1, A
	FMUL	alpha2i, alpha_i, a3
	add	AO2, A, LDA

	FMSUBR	alpha1r, alpha_i, a2, alpha1r
	add	A, AO2, LDA
	FMADDR	alpha1i, alpha_r, a2, alpha1i
	srawi.	r0, M, 2			/* CTR = M / 4 (unroll factor 4 complex) */
	FMSUBR	alpha2r, alpha_i, a4, alpha2r
	mtspr	CTR, r0
	FMADDR	alpha2i, alpha_r, a4, alpha2i
	ble	LL(25)
	.align 4

	/* Software-pipeline preload: first 4 complex A and y elements,
	 * plus the column-1 real-part FMADDs issued ahead of the loop.  */
	LFDU	a1, 1 * SIZE(AO1)
	LFDU	y01, 1 * SIZE(Y1)
	LFDU	a2, 1 * SIZE(AO1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a3, 1 * SIZE(AO1)
	LFDU	y03, 1 * SIZE(Y1)
	LFDU	a4, 1 * SIZE(AO1)
	LFDU	y04, 1 * SIZE(Y1)
	LFDU	a5, 1 * SIZE(AO1)
	LFDU	y05, 1 * SIZE(Y1)
	LFDU	a6, 1 * SIZE(AO1)
	LFDU	y06, 1 * SIZE(Y1)
	LFDU	a7, 1 * SIZE(AO1)
	LFDU	y07, 1 * SIZE(Y1)
	LFDU	a8, 1 * SIZE(AO1)
	LFDU	y08, 1 * SIZE(Y1)

	FMADD	y09, alpha1r, a1, y01
	FMADD	y10, alpha1i, a1, y02
	FMADD	y11, alpha1r, a3, y03
	FMADD	y12, alpha1i, a3, y04
	FMADD	y13, alpha1r, a5, y05
	FMADD	y14, alpha1i, a5, y06
	FMADD	y15, alpha1r, a7, y07
	FMADD	y16, alpha1i, a7, y08
	bdz	LL(23)
	.align 4

/* ---- 2-column main loop, unrolled 4 complex elements per iteration,
 * loads interleaved with FP ops; dcbt/dcbtst cache touches on PPCG4.  */
LL(22):
	FMSUBX	y09, alpha1i, a2, y09
	LFDU	a1, 1 * SIZE(AO2)
	FMADDX	y10, alpha1r, a2, y10
	LFDU	a2, 1 * SIZE(AO2)
	FMSUBX	y11, alpha1i, a4, y11
	LFDU	a3, 1 * SIZE(AO2)
	FMADDX	y12, alpha1r, a4, y12
	LFDU	a4, 1 * SIZE(AO2)

#ifdef PPCG4
	dcbt	AO2, PREA
#endif

	FMSUBX	y13, alpha1i, a6, y13
	LFDU	a5, 1 * SIZE(AO2)
	FMADDX	y14, alpha1r, a6, y14
	LFDU	a6, 1 * SIZE(AO2)
	FMSUBX	y15, alpha1i, a8, y15
	LFDU	a7, 1 * SIZE(AO2)
	FMADDX	y16, alpha1r, a8, y16
	LFDU	a8, 1 * SIZE(AO2)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO2, PREA
#endif

	FMADD	y09, alpha2r, a1, y09
	LFDU	y01, 1 * SIZE(Y1)
	FMADD	y10, alpha2i, a1, y10
	LFDU	y02, 1 * SIZE(Y1)
	FMADD	y11, alpha2r, a3, y11
	LFDU	y03, 1 * SIZE(Y1)
	FMADD	y12, alpha2i, a3, y12
	LFDU	y04, 1 * SIZE(Y1)

#ifdef PPCG4
	dcbtst	Y1, PREA
#endif

	FMADD	y13, alpha2r, a5, y13
	LFDU	y05, 1 * SIZE(Y1)
	FMADD	y14, alpha2i, a5, y14
	LFDU	y06, 1 * SIZE(Y1)
	FMADD	y15, alpha2r, a7, y15
	LFDU	y07, 1 * SIZE(Y1)
	FMADD	y16, alpha2i, a7, y16
	LFDU	y08, 1 * SIZE(Y1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbtst	Y1, PREA
#endif

	FMSUBX	y09, alpha2i, a2, y09
	LFDU	a1, 1 * SIZE(AO1)
	FMADDX	y10, alpha2r, a2, y10
	LFDU	a2, 1 * SIZE(AO1)
	FMSUBX	y11, alpha2i, a4, y11
	LFDU	a3, 1 * SIZE(AO1)
	FMADDX	y12, alpha2r, a4, y12
	LFDU	a4, 1 * SIZE(AO1)

#ifdef PPCG4
	dcbt	AO1, PREA
#endif

	FMSUBX	y13, alpha2i, a6, y13
	LFDU	a5, 1 * SIZE(AO1)
	FMADDX	y14, alpha2r, a6, y14
	LFDU	a6, 1 * SIZE(AO1)
	FMSUBX	y15, alpha2i, a8, y15
	LFDU	a7, 1 * SIZE(AO1)
	FMADDX	y16, alpha2r, a8, y16
	LFDU	a8, 1 * SIZE(AO1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO1, PREA
#endif

	/* Store the finished group while starting the next group's
	 * column-1 products (software pipelining).                     */
	STFDU	y09, 1 * SIZE(Y2)
	FMADD	y09, alpha1r, a1, y01
	STFDU	y10, 1 * SIZE(Y2)
	FMADD	y10, alpha1i, a1, y02
	STFDU	y11, 1 * SIZE(Y2)
	FMADD	y11, alpha1r, a3, y03
	STFDU	y12, 1 * SIZE(Y2)
	FMADD	y12, alpha1i, a3, y04
	STFDU	y13, 1 * SIZE(Y2)
	FMADD	y13, alpha1r, a5, y05
	STFDU	y14, 1 * SIZE(Y2)
	FMADD	y14, alpha1i, a5, y06
	STFDU	y15, 1 * SIZE(Y2)
	FMADD	y15, alpha1r, a7, y07
	STFDU	y16, 1 * SIZE(Y2)
	FMADD	y16, alpha1i, a7, y08
	bdnz	LL(22)
	.align 4

/* ---- 2-column pipeline drain: finish the in-flight group (column-1
 * imaginary parts, full column-2 contribution) and store it.          */
LL(23):
	FMSUBX	y09, alpha1i, a2, y09
	LFDU	a1, 1 * SIZE(AO2)
	FMADDX	y10, alpha1r, a2, y10
	LFDU	a2, 1 * SIZE(AO2)
	FMSUBX	y11, alpha1i, a4, y11
	LFDU	a3, 1 * SIZE(AO2)
	FMADDX	y12, alpha1r, a4, y12
	LFDU	a4, 1 * SIZE(AO2)
	FMSUBX	y13, alpha1i, a6, y13
	LFDU	a5, 1 * SIZE(AO2)
	FMADDX	y14, alpha1r, a6, y14
	LFDU	a6, 1 * SIZE(AO2)
	FMSUBX	y15, alpha1i, a8, y15
	LFDU	a7, 1 * SIZE(AO2)
	FMADDX	y16, alpha1r, a8, y16
	LFDU	a8, 1 * SIZE(AO2)

	FMADD	y09, alpha2r, a1, y09
	FMADD	y10, alpha2i, a1, y10
	FMADD	y11, alpha2r, a3, y11
	FMADD	y12, alpha2i, a3, y12
	FMADD	y13, alpha2r, a5, y13
	FMADD	y14, alpha2i, a5, y14
	FMADD	y15, alpha2r, a7, y15
	FMADD	y16, alpha2i, a7, y16

	FMSUBX	y09, alpha2i, a2, y09
	FMADDX	y10, alpha2r, a2, y10
	FMSUBX	y11, alpha2i, a4, y11
	FMADDX	y12, alpha2r, a4, y12
	FMSUBX	y13, alpha2i, a6, y13
	STFDU	y09, 1 * SIZE(Y2)
	FMADDX	y14, alpha2r, a6, y14
	STFDU	y10, 1 * SIZE(Y2)
	FMSUBX	y15, alpha2i, a8, y15
	STFDU	y11, 1 * SIZE(Y2)
	FMADDX	y16, alpha2r, a8, y16
	STFDU	y12, 1 * SIZE(Y2)
	STFDU	y13, 1 * SIZE(Y2)
	STFDU	y14, 1 * SIZE(Y2)
	STFDU	y15, 1 * SIZE(Y2)
	STFDU	y16, 1 * SIZE(Y2)
	.align 4

/* ---- (M & 2) remainder of the 2-column pass: two complex elements. ---- */
LL(25):
	andi.	r0, M, 2
	ble	LL(27)

	LFDU	a1, 1 * SIZE(AO1)
	LFDU	y01, 1 * SIZE(Y1)
	LFDU	a2, 1 * SIZE(AO1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a3, 1 * SIZE(AO1)
	LFDU	y03, 1 * SIZE(Y1)
	LFDU	a4, 1 * SIZE(AO1)
	LFDU	y04, 1 * SIZE(Y1)

	FMADD	y01, alpha1r, a1, y01
	LFDU	a5, 1 * SIZE(AO2)
	FMADD	y02, alpha1i, a1, y02
	LFDU	a6, 1 * SIZE(AO2)
	FMADD	y03, alpha1r, a3, y03
	LFDU	a7, 1 * SIZE(AO2)
	FMADD	y04, alpha1i, a3, y04
	LFDU	a8, 1 * SIZE(AO2)

	FMSUBX	y01, alpha1i, a2, y01
	FMADDX	y02, alpha1r, a2, y02
	FMSUBX	y03, alpha1i, a4, y03
	FMADDX	y04, alpha1r, a4, y04

	FMADD	y01, alpha2r, a5, y01
	FMADD	y02, alpha2i, a5, y02
	FMADD	y03, alpha2r, a7, y03
	FMADD	y04, alpha2i, a7, y04

	FMSUBX	y01, alpha2i, a6, y01
	FMADDX	y02, alpha2r, a6, y02
	FMSUBX	y03, alpha2i, a8, y03
	FMADDX	y04, alpha2r, a8, y04

	STFDU	y01, 1 * SIZE(Y2)
	STFDU	y02, 1 * SIZE(Y2)
	STFDU	y03, 1 * SIZE(Y2)
	STFDU	y04, 1 * SIZE(Y2)
	.align 4

/* ---- (M & 1) remainder of the 2-column pass: one complex element. ---- */
LL(27):
	andi.	r0, M, 1
	ble	LL(30)

	LFDU	y01, 1 * SIZE(Y1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a1, 1 * SIZE(AO1)
	LFDU	a2, 1 * SIZE(AO1)
	LFDU	a3, 1 * SIZE(AO2)
	LFDU	a4, 1 * SIZE(AO2)

	FMADD	y01, alpha1r, a1, y01
	FMADD	y02, alpha1i, a1, y02
	FMSUBX	y01, alpha1i, a2, y01
	FMADDX	y02, alpha1r, a2, y02
	FMADD	y01, alpha2r, a3, y01
	FMADD	y02, alpha2i, a3, y02
	FMSUBX	y01, alpha2i, a4, y01
	FMADDX	y02, alpha2r, a4, y02

	STFDU	y01, 1 * SIZE(Y2)
	STFDU	y02, 1 * SIZE(Y2)
	.align 4

/* ---- (N & 1) pass: one remaining column.  Same structure as the
 * 2-column pass but with a single alpha1{r,i} scale factor.           */
LL(30):
	andi.	J, N, 1
	ble	LL(990)
	.align 4

	lfd	alpha_r, ALPHA_R
	lfd	alpha_i, ALPHA_I

	LFDUX	a1, X, INCX
	LFDU	a2, 1 * SIZE(X)

	FMUL	alpha1r, alpha_r, a1
	mr	Y1, YY
	mr	Y2, YY
	FMUL	alpha1i, alpha_i, a1
	mr	AO1, A
	add	A, A, LDA

	FMSUBR	alpha1r, alpha_i, a2, alpha1r
	srawi.	r0, M, 2			/* CTR = M / 4 */
	mtspr	CTR, r0
	FMADDR	alpha1i, alpha_r, a2, alpha1i
	ble	LL(35)
	.align 4

	/* Pipeline preload for the 1-column loop. */
	LFDU	a1, 1 * SIZE(AO1)
	LFDU	y01, 1 * SIZE(Y1)
	LFDU	a2, 1 * SIZE(AO1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a3, 1 * SIZE(AO1)
	LFDU	y03, 1 * SIZE(Y1)
	LFDU	a4, 1 * SIZE(AO1)
	LFDU	y04, 1 * SIZE(Y1)
	LFDU	a5, 1 * SIZE(AO1)
	LFDU	y05, 1 * SIZE(Y1)
	LFDU	a6, 1 * SIZE(AO1)
	LFDU	y06, 1 * SIZE(Y1)
	LFDU	a7, 1 * SIZE(AO1)
	LFDU	y07, 1 * SIZE(Y1)
	LFDU	a8, 1 * SIZE(AO1)
	LFDU	y08, 1 * SIZE(Y1)

	FMADD	y09, alpha1r, a1, y01
	FMADD	y10, alpha1i, a1, y02
	FMADD	y11, alpha1r, a3, y03
	FMADD	y12, alpha1i, a3, y04
	FMADD	y13, alpha1r, a5, y05
	FMADD	y14, alpha1i, a5, y06
	FMADD	y15, alpha1r, a7, y07
	FMADD	y16, alpha1i, a7, y08
	bdz	LL(33)
	.align 4

/* ---- 1-column main loop, 4 complex elements per iteration. ---- */
LL(32):
	FMSUBX	y09, alpha1i, a2, y09
	LFDU	a1, 1 * SIZE(AO1)
	FMADDX	y10, alpha1r, a2, y10
	LFDU	a2, 1 * SIZE(AO1)
	FMSUBX	y11, alpha1i, a4, y11
	LFDU	a3, 1 * SIZE(AO1)
	FMADDX	y12, alpha1r, a4, y12
	LFDU	a4, 1 * SIZE(AO1)

#ifdef PPCG4
	dcbt	AO1, PREA
#endif

	LFDU	y01, 1 * SIZE(Y1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	y03, 1 * SIZE(Y1)
	LFDU	y04, 1 * SIZE(Y1)

#ifdef PPCG4
	dcbtst	Y1, PREA
#endif

	FMSUBX	y13, alpha1i, a6, y13
	LFDU	a5, 1 * SIZE(AO1)
	FMADDX	y14, alpha1r, a6, y14
	LFDU	a6, 1 * SIZE(AO1)
	FMSUBX	y15, alpha1i, a8, y15
	LFDU	a7, 1 * SIZE(AO1)
	FMADDX	y16, alpha1r, a8, y16
	LFDU	a8, 1 * SIZE(AO1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbt	AO1, PREA
#endif

	LFDU	y05, 1 * SIZE(Y1)
	LFDU	y06, 1 * SIZE(Y1)
	LFDU	y07, 1 * SIZE(Y1)
	LFDU	y08, 1 * SIZE(Y1)

#if defined(PPCG4) && defined(DOUBLE)
	dcbtst	Y1, PREA
#endif

	/* Store finished group while starting the next (pipelined). */
	STFDU	y09, 1 * SIZE(Y2)
	FMADD	y09, alpha1r, a1, y01
	STFDU	y10, 1 * SIZE(Y2)
	FMADD	y10, alpha1i, a1, y02
	STFDU	y11, 1 * SIZE(Y2)
	FMADD	y11, alpha1r, a3, y03
	STFDU	y12, 1 * SIZE(Y2)
	FMADD	y12, alpha1i, a3, y04
	STFDU	y13, 1 * SIZE(Y2)
	FMADD	y13, alpha1r, a5, y05
	STFDU	y14, 1 * SIZE(Y2)
	FMADD	y14, alpha1i, a5, y06
	STFDU	y15, 1 * SIZE(Y2)
	FMADD	y15, alpha1r, a7, y07
	STFDU	y16, 1 * SIZE(Y2)
	FMADD	y16, alpha1i, a7, y08
	bdnz	LL(32)
	.align 4

/* ---- 1-column pipeline drain: finish imaginary parts and store. ---- */
LL(33):
	FMSUBX	y09, alpha1i, a2, y09
	FMADDX	y10, alpha1r, a2, y10
	FMSUBX	y11, alpha1i, a4, y11
	FMADDX	y12, alpha1r, a4, y12
	FMSUBX	y13, alpha1i, a6, y13
	STFDU	y09, 1 * SIZE(Y2)
	FMADDX	y14, alpha1r, a6, y14
	STFDU	y10, 1 * SIZE(Y2)
	FMSUBX	y15, alpha1i, a8, y15
	STFDU	y11, 1 * SIZE(Y2)
	FMADDX	y16, alpha1r, a8, y16
	STFDU	y12, 1 * SIZE(Y2)
	STFDU	y13, 1 * SIZE(Y2)
	STFDU	y14, 1 * SIZE(Y2)
	STFDU	y15, 1 * SIZE(Y2)
	STFDU	y16, 1 * SIZE(Y2)
	.align 4

/* ---- (M & 2) remainder of the 1-column pass. ---- */
LL(35):
	andi.	r0, M, 2
	ble	LL(37)

	LFDU	a1, 1 * SIZE(AO1)
	LFDU	y01, 1 * SIZE(Y1)
	LFDU	a2, 1 * SIZE(AO1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a3, 1 * SIZE(AO1)
	LFDU	y03, 1 * SIZE(Y1)
	LFDU	a4, 1 * SIZE(AO1)
	LFDU	y04, 1 * SIZE(Y1)

	FMADD	y01, alpha1r, a1, y01
	FMADD	y02, alpha1i, a1, y02
	FMADD	y03, alpha1r, a3, y03
	FMADD	y04, alpha1i, a3, y04

	FMSUBX	y01, alpha1i, a2, y01
	FMADDX	y02, alpha1r, a2, y02
	FMSUBX	y03, alpha1i, a4, y03
	FMADDX	y04, alpha1r, a4, y04

	STFDU	y01, 1 * SIZE(Y2)
	STFDU	y02, 1 * SIZE(Y2)
	STFDU	y03, 1 * SIZE(Y2)
	STFDU	y04, 1 * SIZE(Y2)
	.align 4

/* ---- (M & 1) remainder of the 1-column pass. ---- */
LL(37):
	andi.	r0, M, 1
	ble	LL(990)

	LFDU	y01, 1 * SIZE(Y1)
	LFDU	a1, 1 * SIZE(AO1)
	LFDU	y02, 1 * SIZE(Y1)
	LFDU	a2, 1 * SIZE(AO1)

	FMADD	y01, alpha1r, a1, y01
	FMADD	y02, alpha1i, a1, y02
	FMSUBX	y01, alpha1i, a2, y01
	FMADDX	y02, alpha1r, a2, y02

	STFDU	y01, 1 * SIZE(Y2)
	STFDU	y02, 1 * SIZE(Y2)
	.align 4

/* ---- Copy-back for strided y: skipped when INCY == SIZE (contiguous).
 * Visible effect: y[i] += buffer[i] for all M complex elements, reading
 * y with stride INCY and the buffer contiguously.
 * NOTE(review): BUFFER/YY setup is above this view -- presumably the
 * buffer holds the accumulated alpha*A*x partial result; confirm there. */
LL(990):
	cmpi	cr0, 0, INCY, SIZE
	beq	LL(999)

	addi	YY, BUFFER, -SIZE		/* pre-decrement base for LFDU */
	mr	Y1, Y
	srawi.	r0, M, 2			/* 4 complex elements per iteration */
	mtspr	CTR, r0
	ble	LL(995)
	.align 4

LL(991):
	LFDUX	f0, Y, INCY
	LFDU	f1, 1 * SIZE(Y)
	LFDUX	f2, Y, INCY
	LFDU	f3, 1 * SIZE(Y)
	LFDUX	f4, Y, INCY
	LFDU	f5, 1 * SIZE(Y)
	LFDUX	f6, Y, INCY
	LFDU	f7, 1 * SIZE(Y)

	LFDU	f8, 1 * SIZE(YY)
	LFDU	f9, 1 * SIZE(YY)
	LFDU	f10, 1 * SIZE(YY)
	LFDU	f11, 1 * SIZE(YY)
	LFDU	f12, 1 * SIZE(YY)
	LFDU	f13, 1 * SIZE(YY)
	LFDU	f14, 1 * SIZE(YY)
	LFDU	f15, 1 * SIZE(YY)

	FADD	f8, f8, f0
	FADD	f9, f9, f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3
	FADD	f12, f12, f4
	FADD	f13, f13, f5
	FADD	f14, f14, f6
	FADD	f15, f15, f7

	STFDUX	f8, Y1, INCY
	STFDU	f9, 1 * SIZE(Y1)
	STFDUX	f10, Y1, INCY
	STFDU	f11, 1 * SIZE(Y1)
	STFDUX	f12, Y1, INCY
	STFDU	f13, 1 * SIZE(Y1)
	STFDUX	f14, Y1, INCY
	STFDU	f15, 1 * SIZE(Y1)
	bdnz	LL(991)
	.align 4

/* ---- (M & 2) remainder of the copy-back. ---- */
LL(995):
	andi.	J, M, 2
	ble	LL(996)

	LFDUX	f0, Y, INCY
	LFDU	f1, 1 * SIZE(Y)
	LFDUX	f2, Y, INCY
	LFDU	f3, 1 * SIZE(Y)

	LFDU	f8, 1 * SIZE(YY)
	LFDU	f9, 1 * SIZE(YY)
	LFDU	f10, 1 * SIZE(YY)
	LFDU	f11, 1 * SIZE(YY)

	FADD	f8, f8, f0
	FADD	f9, f9, f1
	FADD	f10, f10, f2
	FADD	f11, f11, f3

	STFDUX	f8, Y1, INCY
	STFDU	f9, 1 * SIZE(Y1)
	STFDUX	f10, Y1, INCY
	STFDU	f11, 1 * SIZE(Y1)
	.align 4

/* ---- (M & 1) remainder of the copy-back. ---- */
LL(996):
	andi.	J, M, 1
	ble	LL(999)

	LFDUX	f0, Y, INCY
	LFDU	f1, 1 * SIZE(Y)

	LFDU	f8, 1 * SIZE(YY)
	LFDU	f9, 1 * SIZE(YY)

	FADD	f8, f8, f0
	FADD	f9, f9, f1

	STFDUX	f8, Y1, INCY
	STFDU	f9, 1 * SIZE(Y1)
	.align 4

/* ---- Epilogue: return 0 in r3, restore callee-saved FPRs f14-f31 and
 * GPRs r14-r22 (64- vs 32-bit load widths and offsets), pop the frame. */
LL(999):
	li	r3, 0

	lfd	f14, 0(SP)
	lfd	f15, 8(SP)
	lfd	f16, 16(SP)
	lfd	f17, 24(SP)
	lfd	f18, 32(SP)
	lfd	f19, 40(SP)
	lfd	f20, 48(SP)
	lfd	f21, 56(SP)
	lfd	f22, 64(SP)
	lfd	f23, 72(SP)
	lfd	f24, 80(SP)
	lfd	f25, 88(SP)
	lfd	f26, 96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)
	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

#ifdef __64BIT__
	ld	r14, 144(SP)
	ld	r15, 152(SP)
	ld	r16, 160(SP)
	ld	r17, 168(SP)
	ld	r18, 176(SP)
	ld	r19, 184(SP)
	ld	r20, 192(SP)
	ld	r21, 200(SP)
	ld	r22, 208(SP)
#else
	lwz	r14, 144(SP)
	lwz	r15, 148(SP)
	lwz	r16, 152(SP)
	lwz	r17, 156(SP)
	lwz	r18, 160(SP)
	lwz	r19, 164(SP)
	lwz	r20, 168(SP)
	lwz	r21, 172(SP)
	lwz	r22, 176(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
/* closes a conditional opened before this view */
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -