📄 zgemv_n.s
字号:
LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY PREFETCH_Y bdnz LL(112) .align 4LL(113): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 8 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a5, 12 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 9 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y09, alpha3i, a2, y09 FMADDX y10, alpha3r, a2, y10 FMSUBX y11, alpha3i, a4, y11 FMADDX y12, alpha3r, a4, y12 FMSUBX y13, alpha3i, a6, y13 FMADDX y14, alpha3r, a6, y14 FMSUBX y15, alpha3i, a8, y15 FMADDX y16, alpha3r, a8, y16 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 LFD a1, 8 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a5, 12 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 STFD y01, 0 * SIZE(Y2) nop STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 LFD a2, 9 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE .align 4LL(115): andi. r0, M, 7 ble LL(119) andi. r0, M, 4 ble LL(116) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 8 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) addi AO2, AO2, 8 * SIZE STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y05, 0 * SIZE(Y2) addi AO3, AO3, 8 * SIZE STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y07, 0 * SIZE(Y2) addi AO4, AO4, 8 * SIZE STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4LL(116): andi. r0, M, 2 ble LL(117) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 LFD a1, 0 * SIZE(AO3) LFD a2, 1 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 FMADDX y02, alpha2r, a6, y02 FMSUBX y03, alpha2i, a8, y03 FMADDX y04, alpha2r, a8, y04 LFD a5, 0 * SIZE(AO4) LFD a6, 1 * SIZE(AO4) LFD a7, 2 * SIZE(AO4) LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMADD y01, alpha4r, a5, y01 FMADD y02, alpha4i, a5, y02 FMADD y03, alpha4r, a7, y03 FMADD y04, alpha4i, a7, y04 FMSUBX y01, alpha4i, a6, y01 FMADDX y02, alpha4r, a6, y02 FMSUBX y03, alpha4i, a8, y03 FMADDX y04, alpha4r, a8, y04 STFD y01, 0 * SIZE(Y2) addi AO1, AO1, 4 * SIZE STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY STFD y03, 0 * SIZE(Y2) addi AO2, AO2, 4 * SIZE STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE .align 4LL(117): andi. r0, M, 1 ble LL(119) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 FMADD y01, alpha3r, a5, y01 FMADD y02, alpha3i, a5, y02 FMSUBX y01, alpha3i, a6, y01 FMADDX y02, alpha3r, a6, y02 FMADD y01, alpha4r, a7, y01 FMADD y02, alpha4i, a7, y02 FMSUBX y01, alpha4i, a8, y01 FMADDX y02, alpha4r, a8, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY .align 4LL(119): addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(111) .align 4LL(120): andi. J, N, 2 ble LL(130) .align 4LL(121): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i FMSUBR alpha2r, alpha_i, a4, alpha2r FMADDR alpha2i, alpha_r, a4, alpha2i mr AO1, A add AO2, A, LDA add A, AO2, LDA mr Y1, Y mr Y2, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(125) .align 4 LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y09, 0 * SIZE(Y1) LFD y10,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -