📄 zgemv_n.s
字号:
LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) addi Y2, Y2, 16 * SIZE addi Y1, Y1, 16 * SIZE PREFETCH_Y bdnz LL(12) .align 4LL(13): FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 8 * SIZE(AO1) LFD a3, 10 * SIZE(AO1) LFD a5, 12 * SIZE(AO1) LFD a7, 14 * SIZE(AO1) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 9 * SIZE(AO1) LFD a4, 11 * SIZE(AO1) LFD a6, 13 * SIZE(AO1) LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1r, a1, y09 FMADD y10, alpha1i, a1, y10 FMADD y11, alpha1r, a3, y11 FMADD y12, alpha1i, a3, y12 FMADD y13, alpha1r, a5, y13 FMADD y14, alpha1i, a5, y14 FMADD y15, alpha1r, a7, y15 FMADD y16, alpha1i, a7, y16 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y09, alpha1i, a2, y09 FMADDX y10, alpha1r, a2, y10 FMSUBX y11, alpha1i, a4, y11 FMADDX y12, alpha1r, a4, y12 FMSUBX y13, alpha1i, a6, y13 FMADDX y14, alpha1r, a6, y14 FMSUBX y15, alpha1i, a8, y15 FMADDX y16, alpha1r, a8, y16 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 8 * SIZE(AO2) LFD a3, 10 * SIZE(AO2) LFD a5, 12 * SIZE(AO2) LFD a7, 14 * SIZE(AO2) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 9 * SIZE(AO2) LFD a4, 11 * SIZE(AO2) LFD a6, 13 * SIZE(AO2) LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2r, a1, y09 FMADD y10, alpha2i, a1, y10 FMADD y11, alpha2r, a3, y11 FMADD y12, alpha2i, a3, y12 FMADD y13, alpha2r, a5, y13 FMADD y14, alpha2i, a5, y14 FMADD y15, alpha2r, a7, y15 FMADD y16, alpha2i, a7, y16 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y09, alpha2i, a2, y09 FMADDX y10, alpha2r, a2, y10 FMSUBX y11, alpha2i, a4, y11 FMADDX y12, alpha2r, a4, y12 FMSUBX y13, alpha2i, a6, y13 FMADDX y14, alpha2r, a6, y14 FMSUBX y15, alpha2i, a8, y15 FMADDX y16, alpha2r, a8, y16 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 8 * SIZE(AO3) LFD a3, 10 * SIZE(AO3) LFD a5, 12 * SIZE(AO3) LFD a7, 14 * SIZE(AO3) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 9 * SIZE(AO3) LFD a4, 11 * SIZE(AO3) LFD a6, 13 * SIZE(AO3) LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3r, a1, y09 FMADD y10, alpha3i, a1, y10 FMADD y11, alpha3r, a3, y11 FMADD y12, alpha3i, a3, y12 FMADD y13, alpha3r, a5, y13 FMADD y14, alpha3i, a5, y14 FMADD y15, alpha3r, a7, y15 FMADD y16, alpha3i, a7, y16 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y09, alpha3i, a2, y09 FMADDX y10, alpha3r, a2, y10 FMSUBX y11, alpha3i, a4, y11 FMADDX y12, alpha3r, a4, y12 FMSUBX y13, alpha3i, a6, y13 FMADDX y14, alpha3r, a6, y14 FMSUBX y15, alpha3i, a8, y15 FMADDX y16, alpha3r, a8, y16 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 LFD a1, 8 * SIZE(AO4) LFD a3, 10 * SIZE(AO4) LFD a5, 12 * SIZE(AO4) LFD a7, 14 * SIZE(AO4) FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 LFD a2, 9 * SIZE(AO4) LFD a4, 11 * SIZE(AO4) LFD a6, 13 * SIZE(AO4) LFD a8, 15 * SIZE(AO4) FMADD y09, alpha4r, a1, y09 FMADD y10, alpha4i, a1, y10 FMADD y11, alpha4r, a3, y11 FMADD y12, alpha4i, a3, y12 FMADD y13, alpha4r, a5, y13 FMADD y14, alpha4i, a5, y14 FMADD y15, alpha4r, a7, y15 FMADD y16, alpha4i, a7, y16 LFD a1, 16 * SIZE(AO1) LFD a3, 18 * SIZE(AO1) LFD a5, 20 * SIZE(AO1) LFD a7, 22 * SIZE(AO1) FMSUBX y09, alpha4i, a2, y09 FMADDX y10, alpha4r, a2, y10 FMSUBX y11, alpha4i, a4, y11 FMADDX y12, alpha4r, a4, y12 FMSUBX y13, alpha4i, a6, y13 FMADDX y14, alpha4r, a6, y14 FMSUBX y15, alpha4i, a8, y15 FMADDX y16, alpha4r, a8, y16 LFD a2, 17 * SIZE(AO1) LFD a4, 19 * SIZE(AO1) LFD a6, 21 * SIZE(AO1) LFD a8, 23 * SIZE(AO1) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) STFD y09, 8 * SIZE(Y2) STFD y10, 9 * SIZE(Y2) STFD y11, 10 * SIZE(Y2) STFD y12, 11 * SIZE(Y2) STFD y13, 12 * SIZE(Y2) STFD y14, 13 * SIZE(Y2) STFD y15, 14 * SIZE(Y2) STFD y16, 15 * SIZE(Y2) addi Y2, Y2, 16 * SIZE .align 4LL(15): andi. r0, M, 7 ble LL(19) andi. r0, M, 4 ble LL(16) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a2, 1 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMADD y05, alpha1r, a5, y05 FMADD y06, alpha1i, a5, y06 FMADD y07, alpha1r, a7, y07 FMADD y08, alpha1i, a7, y08 LFD a1, 0 * SIZE(AO2) LFD a3, 2 * SIZE(AO2) LFD a5, 4 * SIZE(AO2) LFD a7, 6 * SIZE(AO2) FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 FMSUBX y05, alpha1i, a6, y05 FMADDX y06, alpha1r, a6, y06 FMSUBX y07, alpha1i, a8, y07 FMADDX y08, alpha1r, a8, y08 LFD a2, 1 * SIZE(AO2) LFD a4, 3 * SIZE(AO2) LFD a6, 5 * SIZE(AO2) LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2r, a1, y01 FMADD y02, alpha2i, a1, y02 FMADD y03, alpha2r, a3, y03 FMADD y04, alpha2i, a3, y04 FMADD y05, alpha2r, a5, y05 FMADD y06, alpha2i, a5, y06 FMADD y07, alpha2r, a7, y07 FMADD y08, alpha2i, a7, y08 LFD a1, 0 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a5, 4 * SIZE(AO3) LFD a7, 6 * SIZE(AO3) FMSUBX y01, alpha2i, a2, y01 FMADDX y02, alpha2r, a2, y02 FMSUBX y03, alpha2i, a4, y03 FMADDX y04, alpha2r, a4, y04 FMSUBX y05, alpha2i, a6, y05 FMADDX y06, alpha2r, a6, y06 FMSUBX y07, alpha2i, a8, y07 FMADDX y08, alpha2r, a8, y08 LFD a2, 1 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) LFD a6, 5 * SIZE(AO3) LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMADD y05, alpha3r, a5, y05 FMADD y06, alpha3i, a5, y06 FMADD y07, alpha3r, a7, y07 FMADD y08, alpha3i, a7, y08 LFD a1, 0 * SIZE(AO4) LFD a3, 2 * SIZE(AO4) LFD a5, 4 * SIZE(AO4) LFD a7, 6 * SIZE(AO4) FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMSUBX y05, alpha3i, a6, y05 FMADDX y06, alpha3r, a6, y06 FMSUBX y07, alpha3i, a8, y07 FMADDX y08, alpha3r, a8, y08 LFD a2, 1 * SIZE(AO4) LFD a4, 3 * SIZE(AO4) LFD a6, 5 * SIZE(AO4) LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4r, a1, y01 FMADD y02, alpha4i, a1, y02 FMADD y03, alpha4r, a3, y03 FMADD y04, alpha4i, a3, y04 FMADD y05, alpha4r, a5, y05 FMADD y06, alpha4i, a5, y06 FMADD y07, alpha4r, a7, y07 FMADD y08, alpha4i, a7, y08 FMSUBX y01, alpha4i, a2, y01 FMADDX y02, alpha4r, a2, y02 FMSUBX y03, alpha4i, a4, y03 FMADDX y04, alpha4r, a4, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) FMSUBX y05, alpha4i, a6, y05 FMADDX y06, alpha4r, a6, y06 FMSUBX y07, alpha4i, a8, y07 FMADDX y08, alpha4r, a8, y08 STFD y05, 4 * SIZE(Y2) STFD y06, 5 * SIZE(Y2) STFD y07, 6 * SIZE(Y2) STFD y08, 7 * SIZE(Y2) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi AO3, AO3, 8 * SIZE addi AO4, AO4, 8 * SIZE addi Y1, Y1, 8 * SIZE addi Y2, Y2, 8 * SIZE .align 4LL(16): andi. r0, M, 2 nop nop ble LL(17) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMADD y03, alpha1r, a3, y03 FMADD y04, alpha1i, a3, y04 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMSUBX y03, alpha1i, a4, y03 FMADDX y04, alpha1r, a4, y04 LFD a1, 0 * SIZE(AO3) LFD a2, 1 * SIZE(AO3) LFD a3, 2 * SIZE(AO3) LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2r, a5, y01 FMADD y02, alpha2i, a5, y02 FMADD y03, alpha2r, a7, y03 FMADD y04, alpha2i, a7, y04 FMSUBX y01, alpha2i, a6, y01 FMADDX y02, alpha2r, a6, y02 FMSUBX y03, alpha2i, a8, y03 FMADDX y04, alpha2r, a8, y04 LFD a5, 0 * SIZE(AO4) LFD a6, 1 * SIZE(AO4) LFD a7, 2 * SIZE(AO4) LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3r, a1, y01 FMADD y02, alpha3i, a1, y02 FMADD y03, alpha3r, a3, y03 FMADD y04, alpha3i, a3, y04 FMSUBX y01, alpha3i, a2, y01 FMADDX y02, alpha3r, a2, y02 FMSUBX y03, alpha3i, a4, y03 FMADDX y04, alpha3r, a4, y04 FMADD y01, alpha4r, a5, y01 FMADD y02, alpha4i, a5, y02 FMADD y03, alpha4r, a7, y03 FMADD y04, alpha4i, a7, y04 FMSUBX y01, alpha4i, a6, y01 FMADDX y02, alpha4r, a6, y02 FMSUBX y03, alpha4i, a8, y03 FMADDX y04, alpha4r, a8, y04 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) STFD y03, 2 * SIZE(Y2) STFD y04, 3 * SIZE(Y2) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi AO3, AO3, 4 * SIZE addi AO4, AO4, 4 * SIZE addi Y1, Y1, 4 * SIZE addi Y2, Y2, 4 * SIZE .align 4LL(17): andi. r0, M, 1 ble LL(19) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1r, a1, y01 FMADD y02, alpha1i, a1, y02 FMSUBX y01, alpha1i, a2, y01 FMADDX y02, alpha1r, a2, y02 FMADD y01, alpha2r, a3, y01 FMADD y02, alpha2i, a3, y02 FMSUBX y01, alpha2i, a4, y01 FMADDX y02, alpha2r, a4, y02 FMADD y01, alpha3r, a5, y01 FMADD y02, alpha3i, a5, y02 FMSUBX y01, alpha3i, a6, y01 FMADDX y02, alpha3r, a6, y02 FMADD y01, alpha4r, a7, y01 FMADD y02, alpha4i, a7, y02 FMSUBX y01, alpha4i, a8, y01 FMADDX y02, alpha4r, a8, y02 STFD y01, 0 * SIZE(Y2) STFD y02, 1 * SIZE(Y2) add Y1, Y1, INCY add Y2, Y2, INCY .align 4LL(19): addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(11) .align 4LL(20): andi. J, N, 2 ble LL(30) .align 4LL(21): lfd alpha_r, ALPHA_R lfd alpha_i, ALPHA_I LFD a1, 0 * SIZE(X) LFD a2, 1 * SIZE(X) add X, X, INCX LFD a3, 0 * SIZE(X) LFD a4, 1 * SIZE(X) add X, X, INCX FMUL alpha1r, alpha_r, a1 FMUL alpha1i, alpha_i, a1 FMUL alpha2r, alpha_r, a3 FMUL alpha2i, alpha_i, a3 FMSUBR alpha1r, alpha_i, a2, alpha1r FMADDR alpha1i, alpha_r, a2, alpha1i FMSUBR alpha2r, alpha_i, a4, alpha2r FMADDR alpha2i, alpha_r, a4, alpha2i mr AO1, A add AO2, A, LDA add A, AO2, LDA mr Y1, Y mr Y2, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(25) .align 4 LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -