📄 zgemv_n.s
字号:
FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) addi Y1, Y1, 16 * SIZE addi Y2, Y2, 16 * SIZE PREFETCH_Y bdnz LL(32) .align 4LL(33): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) addi AO1, AO1, 16 * SIZE addi Y2, Y2, 16 * SIZE .align 4LL(35): andi. r0, M, 7 ble LL(999) andi. r0, M, 4 ble LL(36) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) addi AO1, AO1, 8 * SIZE addi Y1, Y1, 8 * SIZE addi Y2, Y2, 8 * SIZE .align 4LL(36): andi. r0, M, 2 ble LL(37) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) addi AO1, AO1, 4 * SIZE addi Y1, Y1, 4 * SIZE addi Y2, Y2, 4 * SIZE .align 4LL(37): andi. r0, M, 1 ble LL(999) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y1, Y1, INCY add Y2, Y2, INCY b LL(999) .align 4 LL(100): srawi. J, N, 2 ble LL(120) .align 4LL(111): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX LFD a5, 0 * SIZE(X) LFD a6, 1 * SIZE(X) add X, X, INCX LFD a7, 0 * SIZE(X) LFD a8, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMUL alpha3r, alpha_r, a5 FMUL alpha3i, alpha_i, a5 FMUL alpha4r, alpha_r, a7 FMUL alpha4i, alpha_i, a7 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i FMSUBR alpha2r, alpha_i, a4, alpha2r FMADDR alpha2i, alpha_r, a4, alpha2i FMSUBR alpha3r, alpha_i, a6, alpha3r FMADDR alpha3i, alpha_r, a6, alpha3i FMSUBR alpha4r, alpha_i, a8, alpha4r FMADDR alpha4i, alpha_r, a8, alpha4i mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr Y1, Y mr Y2, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(115) .align 4 LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y03, 0 * SIZE(Y1) LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y05, 0 * SIZE(Y1) LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y07, 0 * SIZE(Y1) LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y09, 0 * SIZE(Y1) LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y11, 0 * SIZE(Y1) LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y13, 0 * SIZE(Y1) LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY LFD y15, 0 * SIZE(Y1) LFD y16, 1 * SIZE(Y1) add Y1, Y1, INCY LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) bdz LL(113) .align 4LL(112): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) addi AO1, AO1, 16 * SIZE nop PREFETCH_A1 nop FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) addi AO2, AO2, 16 * SIZE nop PREFETCH_A2 nop FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 8 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a5, 12 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 9 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) addi AO3, AO3, 16 * SIZE nop PREFETCH_A3 nop FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y09, alpha3i, a2, y09 FMADDX y10, alpha3r, a2, y10 FMSUBX y11, alpha3i, a4, y11 FMADDX y12, alpha3r, a4, y12 FMSUBX y13, alpha3i, a6, y13 FMADDX y14, alpha3r, a6, y14 FMSUBX y15, alpha3i, a8, y15 FMADDX y16, alpha3r, a8, y16 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 LFD a1, 8 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a5, 12 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 STFD y01, 0 * SIZE(Y2) nop STFD y02, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y01, 0 * SIZE(Y1) nop LFD y02, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y03, 0 * SIZE(Y2) nop STFD y04, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y03, 0 * SIZE(Y1) nop LFD y04, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 LFD a2, 9 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) addi AO4, AO4, 16 * SIZE nop PREFETCH_A4 nop STFD y05, 0 * SIZE(Y2) nop STFD y06, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y05, 0 * SIZE(Y1) nop LFD y06, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y07, 0 * SIZE(Y2) nop STFD y08, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y07, 0 * SIZE(Y1) nop LFD y08, 1 * SIZE(Y1) add Y1, Y1, INCY FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 STFD y09, 0 * SIZE(Y2) nop STFD y10, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y09, 0 * SIZE(Y1) nop LFD y10, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y11, 0 * SIZE(Y2) nop STFD y12, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y11, 0 * SIZE(Y1) nop LFD y12, 1 * SIZE(Y1) add Y1, Y1, INCY FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 0 * SIZE(Y2) nop STFD y14, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y13, 0 * SIZE(Y1) nop LFD y14, 1 * SIZE(Y1) add Y1, Y1, INCY STFD y15, 0 * SIZE(Y2) nop STFD y16, 1 * SIZE(Y2) add Y2, Y2, INCY LFD y15, 0 * SIZE(Y1) nop
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -