📄 gemv_n.s
字号:
FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2, a1, y09 LFD a1, 0 * SIZE(AO3) FMADD y10, alpha2, a2, y10 LFD a2, 1 * SIZE(AO3) FMADD y11, alpha2, a3, y11 LFD a3, 2 * SIZE(AO3) FMADD y12, alpha2, a4, y12 LFD a4, 3 * SIZE(AO3) FMADD y13, alpha2, a5, y13 LFD a5, 4 * SIZE(AO3) FMADD y14, alpha2, a6, y14 LFD a6, 5 * SIZE(AO3) FMADD y15, alpha2, a7, y15 LFD a7, 6 * SIZE(AO3) FMADD y16, alpha2, a8, y16 LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFD a1, 8 * SIZE(AO3) FMADD y02, alpha3, a2, y02 LFD a2, 9 * SIZE(AO3) FMADD y03, alpha3, a3, y03 LFD a3, 10 * SIZE(AO3) FMADD y04, alpha3, a4, y04 LFD a4, 11 * SIZE(AO3) FMADD y05, alpha3, a5, y05 LFD a5, 12 * SIZE(AO3) FMADD y06, alpha3, a6, y06 LFD a6, 13 * SIZE(AO3) FMADD y07, alpha3, a7, y07 LFD a7, 14 * SIZE(AO3) FMADD y08, alpha3, a8, y08 LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3, a1, y09 LFD a1, 0 * SIZE(AO4) FMADD y10, alpha3, a2, y10 LFD a2, 1 * SIZE(AO4) FMADD y11, alpha3, a3, y11 LFD a3, 2 * SIZE(AO4) FMADD y12, alpha3, a4, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, alpha3, a5, y13 LFD a5, 4 * SIZE(AO4) FMADD y14, alpha3, a6, y14 LFD a6, 5 * SIZE(AO4) FMADD y15, alpha3, a7, y15 LFD a7, 6 * SIZE(AO4) FMADD y16, alpha3, a8, y16 LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 LFD a1, 8 * SIZE(AO4) FMADD y02, alpha4, a2, y02 LFD a2, 9 * SIZE(AO4) FMADD y03, alpha4, a3, y03 LFD a3, 10 * SIZE(AO4) FMADD y04, alpha4, a4, y04 LFD a4, 11 * SIZE(AO4) FMADD y05, alpha4, a5, y05 LFD a5, 12 * SIZE(AO4) FMADD y06, alpha4, a6, y06 LFD a6, 13 * SIZE(AO4) FMADD y07, alpha4, a7, y07 LFD a7, 14 * SIZE(AO4) FMADD y08, alpha4, a8, y08 LFD a8, 15 * SIZE(AO4) FMADD y09, alpha4, a1, y09 addi AO1, AO1, 16 * SIZE FMADD y10, alpha4, a2, y10 addi AO2, AO2, 16 * SIZE FMADD y11, alpha4, a3, y11 addi AO3, AO3, 16 * SIZE FMADD y12, alpha4, a4, y12 addi AO4, AO4, 16 * SIZE FMADD y13, alpha4, a5, y13 FMADD y14, alpha4, a6, y14 FMADD y15, alpha4, a7, y15 FMADD y16, alpha4, a8, y16 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) addi Y1, Y1, 16 * SIZE .align 4LL(25): andi. r0, M, 15 ble LL(30) andi. r0, M, 8 ble LL(26) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO2) FMADD y05, alpha1, a5, y05 LFD a5, 4 * SIZE(AO2) FMADD y06, alpha1, a6, y06 LFD a6, 5 * SIZE(AO2) FMADD y07, alpha1, a7, y07 LFD a7, 6 * SIZE(AO2) FMADD y08, alpha1, a8, y08 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 0 * SIZE(AO3) FMADD y02, alpha2, a2, y02 LFD a2, 1 * SIZE(AO3) FMADD y03, alpha2, a3, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, alpha2, a4, y04 LFD a4, 3 * SIZE(AO3) FMADD y05, alpha2, a5, y05 LFD a5, 4 * SIZE(AO3) FMADD y06, alpha2, a6, y06 LFD a6, 5 * SIZE(AO3) FMADD y07, alpha2, a7, y07 LFD a7, 6 * SIZE(AO3) FMADD y08, alpha2, a8, y08 LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFD a1, 0 * SIZE(AO4) FMADD y02, alpha3, a2, y02 LFD a2, 1 * SIZE(AO4) FMADD y03, alpha3, a3, y03 LFD a3, 2 * SIZE(AO4) FMADD y04, alpha3, a4, y04 LFD a4, 3 * SIZE(AO4) FMADD y05, alpha3, a5, y05 LFD a5, 4 * SIZE(AO4) FMADD y06, alpha3, a6, y06 LFD a6, 5 * SIZE(AO4) FMADD y07, alpha3, a7, y07 LFD a7, 6 * SIZE(AO4) FMADD y08, alpha3, a8, y08 LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 addi AO1, AO1, 8 * SIZE FMADD y02, alpha4, a2, y02 addi AO2, AO2, 8 * SIZE FMADD y03, alpha4, a3, y03 addi AO3, AO3, 8 * SIZE FMADD y04, alpha4, a4, y04 addi AO4, AO4, 8 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) FMADD y05, alpha4, a5, y05 FMADD y06, alpha4, a6, y06 FMADD y07, alpha4, a7, y07 FMADD y08, alpha4, a8, y08 STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi Y1, Y1, 8 * SIZE .align 4LL(26): andi. r0, M, 4 ble LL(27) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO3) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO3) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2, a5, y01 LFD a5, 0 * SIZE(AO4) FMADD y02, alpha2, a6, y02 LFD a6, 1 * SIZE(AO4) FMADD y03, alpha2, a7, y03 LFD a7, 2 * SIZE(AO4) FMADD y04, alpha2, a8, y04 LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3, a1, y01 addi AO1, AO1, 4 * SIZE FMADD y02, alpha3, a2, y02 addi AO2, AO2, 4 * SIZE FMADD y03, alpha3, a3, y03 addi AO3, AO3, 4 * SIZE FMADD y04, alpha3, a4, y04 addi AO4, AO4, 4 * SIZE FMADD y01, alpha4, a5, y01 FMADD y02, alpha4, a6, y02 FMADD y03, alpha4, a7, y03 FMADD y04, alpha4, a8, y04 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi Y1, Y1, 4 * SIZE .align 4LL(27): andi. r0, M, 2 ble LL(28) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1, a1, y01 addi AO1, AO1, 2 * SIZE FMADD y02, alpha1, a2, y02 addi AO2, AO2, 2 * SIZE FMADD y01, alpha2, a3, y01 addi AO3, AO3, 2 * SIZE FMADD y02, alpha2, a4, y02 addi AO4, AO4, 2 * SIZE FMADD y01, alpha3, a5, y01 FMADD y02, alpha3, a6, y02 FMADD y01, alpha4, a7, y01 FMADD y02, alpha4, a8, y02 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi Y1, Y1, 2 * SIZE .align 4LL(28): andi. r0, M, 1 ble LL(30) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 0 * SIZE(AO2) LFD a3, 0 * SIZE(AO3) LFD a4, 0 * SIZE(AO4) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 FMADD y01, alpha3, a3, y01 FMADD y01, alpha4, a4, y01 STFD y01, 0 * SIZE(Y1) .align 4LL(30): andi. J, N, 2 lfd alpha, ALPHA ble LL(40) .align 4 LFD alpha1, 0 * SIZE(X) add X, X, INCX LFD alpha2, 0 * SIZE(X) add X, X, INCX FMUL alpha1, alpha, alpha1 FMUL alpha2, alpha, alpha2 mr AO1, A add AO2, A, LDA add A, AO2, LDA mr Y1, YY srawi. r0, M, 4 mtspr CTR, r0 ble LL(35) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) bdz LL(33) .align 4LL(32): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, alpha1, a2, y10 LFD a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2) FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2, a1, y09 LFD a1, 16 * SIZE(AO1) FMADD y10, alpha2, a2, y10 LFD a2, 17 * SIZE(AO1) FMADD y11, alpha2, a3, y11 LFD a3, 18 * SIZE(AO1) FMADD y12, alpha2, a4, y12 LFD a4, 19 * SIZE(AO1) FMADD y13, alpha2, a5, y13 LFD a5, 20 * SIZE(AO1) FMADD y14, alpha2, a6, y14 LFD a6, 21 * SIZE(AO1) FMADD y15, alpha2, a7, y15 LFD a7, 22 * SIZE(AO1) FMADD y16, alpha2, a8, y16 LFD a8, 23 * SIZE(AO1) STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) LFD y01, 16 * SIZE(Y1) LFD y02, 17 * SIZE(Y1) LFD y03, 18 * SIZE(Y1) LFD y04, 19 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) LFD y05, 20 * SIZE(Y1) LFD y06, 21 * SIZE(Y1) LFD y07, 22 * SIZE(Y1) LFD y08, 23 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) LFD y09, 24 * SIZE(Y1) LFD y10, 25 * SIZE(Y1) LFD y11, 26 * SIZE(Y1) LFD y12, 27 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) LFD y13, 28 * SIZE(Y1) LFD y14, 29 * SIZE(Y1) LFD y15, 30 * SIZE(Y1) LFD y16, 31 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi Y1, Y1, 16 * SIZE PREFETCH_A1 PREFETCH_A2 PREFETCH_Y bdnz LL(32) .align 4LL(33): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, alpha1, a2, y10 LFD a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2) FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) FMADD y09, alpha2, a1, y09 FMADD y10, alpha2, a2, y10 FMADD y11, alpha2, a3, y11 FMADD y12, alpha2, a4, y12 FMADD y13, alpha2, a5, y13 FMADD y14, alpha2, a6, y14 FMADD y15, alpha2, a7, y15 FMADD y16, alpha2, a8, y16 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi Y1, Y1, 16 * SIZE .align 4LL(35): andi. r0, M, 15 ble LL(40) andi. r0, M, 8 ble LL(36) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -