gemv_n.s
来自「Optimized GotoBLAS libraries」· S 代码 · 共 3,077 行 · 第 1/5 页
S
3,077 行
FMADD y03, alpha3, a3, y03 LFD a3, 2 * SIZE(AO4) FMADD y04, alpha3, a4, y04 LFD a4, 3 * SIZE(AO4) FMADD y05, alpha3, a5, y05 LFD a5, 4 * SIZE(AO4) FMADD y06, alpha3, a6, y06 LFD a6, 5 * SIZE(AO4) FMADD y07, alpha3, a7, y07 LFD a7, 6 * SIZE(AO4) FMADD y08, alpha3, a8, y08 LFD a8, 7 * SIZE(AO4) FMADD y01, alpha4, a1, y01 LFD a1, 0 * SIZE(AO5) FMADD y02, alpha4, a2, y02 LFD a2, 1 * SIZE(AO5) FMADD y03, alpha4, a3, y03 LFD a3, 2 * SIZE(AO5) FMADD y04, alpha4, a4, y04 LFD a4, 3 * SIZE(AO5) FMADD y05, alpha4, a5, y05 LFD a5, 4 * SIZE(AO5) FMADD y06, alpha4, a6, y06 LFD a6, 5 * SIZE(AO5) FMADD y07, alpha4, a7, y07 LFD a7, 6 * SIZE(AO5) FMADD y08, alpha4, a8, y08 LFD a8, 7 * SIZE(AO5) FMADD y01, alpha5, a1, y01 LFD a1, 0 * SIZE(AO6) FMADD y02, alpha5, a2, y02 LFD a2, 1 * SIZE(AO6) FMADD y03, alpha5, a3, y03 LFD a3, 2 * SIZE(AO6) FMADD y04, alpha5, a4, y04 LFD a4, 3 * SIZE(AO6) FMADD y05, alpha5, a5, y05 LFD a5, 4 * SIZE(AO6) FMADD y06, alpha5, a6, y06 LFD a6, 5 * SIZE(AO6) FMADD y07, alpha5, a7, y07 LFD a7, 6 * SIZE(AO6) FMADD y08, alpha5, a8, y08 LFD a8, 7 * SIZE(AO6) FMADD y01, alpha6, a1, y01 LFD a1, 0 * SIZE(AO7) FMADD y02, alpha6, a2, y02 LFD a2, 1 * SIZE(AO7) FMADD y03, alpha6, a3, y03 LFD a3, 2 * SIZE(AO7) FMADD y04, alpha6, a4, y04 LFD a4, 3 * SIZE(AO7) FMADD y05, alpha6, a5, y05 LFD a5, 4 * SIZE(AO7) FMADD y06, alpha6, a6, y06 LFD a6, 5 * SIZE(AO7) FMADD y07, alpha6, a7, y07 LFD a7, 6 * SIZE(AO7) FMADD y08, alpha6, a8, y08 LFD a8, 7 * SIZE(AO7) FMADD y01, alpha7, a1, y01 LFD a1, 0 * SIZE(AO8) FMADD y02, alpha7, a2, y02 LFD a2, 1 * SIZE(AO8) FMADD y03, alpha7, a3, y03 LFD a3, 2 * SIZE(AO8) FMADD y04, alpha7, a4, y04 LFD a4, 3 * SIZE(AO8) FMADD y05, alpha7, a5, y05 LFD a5, 4 * SIZE(AO8) FMADD y06, alpha7, a6, y06 LFD a6, 5 * SIZE(AO8) FMADD y07, alpha7, a7, y07 LFD a7, 6 * SIZE(AO8) FMADD y08, alpha7, a8, y08 LFD a8, 7 * SIZE(AO8) FMADD y01, alpha8, a1, y01 addi AO1, AO1, 8 * SIZE FMADD y02, alpha8, a2, y02 addi AO2, AO2, 8 * SIZE FMADD y03, alpha8, a3, y03 addi 
AO3, AO3, 8 * SIZE FMADD y04, alpha8, a4, y04 addi AO4, AO4, 8 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) FMADD y05, alpha8, a5, y05 addi AO5, AO5, 8 * SIZE FMADD y06, alpha8, a6, y06 addi AO6, AO6, 8 * SIZE FMADD y07, alpha8, a7, y07 addi AO7, AO7, 8 * SIZE FMADD y08, alpha8, a8, y08 addi AO8, AO8, 8 * SIZE STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi Y1, Y1, 8 * SIZE .align 4LL(16): andi. r0, M, 4 ble LL(17) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO3) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO3) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO3) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO3) FMADD y01, alpha2, a5, y01 LFD a5, 0 * SIZE(AO4) FMADD y02, alpha2, a6, y02 LFD a6, 1 * SIZE(AO4) FMADD y03, alpha2, a7, y03 LFD a7, 2 * SIZE(AO4) FMADD y04, alpha2, a8, y04 LFD a8, 3 * SIZE(AO4) FMADD y01, alpha3, a1, y01 LFD a1, 0 * SIZE(AO5) FMADD y02, alpha3, a2, y02 LFD a2, 1 * SIZE(AO5) FMADD y03, alpha3, a3, y03 LFD a3, 2 * SIZE(AO5) FMADD y04, alpha3, a4, y04 LFD a4, 3 * SIZE(AO5) FMADD y01, alpha4, a5, y01 LFD a5, 0 * SIZE(AO6) FMADD y02, alpha4, a6, y02 LFD a6, 1 * SIZE(AO6) FMADD y03, alpha4, a7, y03 LFD a7, 2 * SIZE(AO6) FMADD y04, alpha4, a8, y04 LFD a8, 3 * SIZE(AO6) FMADD y01, alpha5, a1, y01 LFD a1, 0 * SIZE(AO7) FMADD y02, alpha5, a2, y02 LFD a2, 1 * SIZE(AO7) FMADD y03, alpha5, a3, y03 LFD a3, 2 * SIZE(AO7) FMADD y04, alpha5, a4, y04 LFD a4, 3 * SIZE(AO7) FMADD y01, alpha6, a5, y01 LFD a5, 0 * SIZE(AO8) FMADD y02, alpha6, a6, y02 LFD a6, 1 * SIZE(AO8) FMADD y03, alpha6, a7, y03 LFD a7, 2 * SIZE(AO8) FMADD y04, alpha6, a8, y04 LFD a8, 3 * SIZE(AO8) FMADD y01, alpha7, a1, y01 addi AO1, 
AO1, 4 * SIZE FMADD y02, alpha7, a2, y02 addi AO2, AO2, 4 * SIZE FMADD y03, alpha7, a3, y03 addi AO3, AO3, 4 * SIZE FMADD y04, alpha7, a4, y04 addi AO4, AO4, 4 * SIZE FMADD y01, alpha8, a5, y01 addi AO5, AO5, 4 * SIZE FMADD y02, alpha8, a6, y02 addi AO6, AO6, 4 * SIZE FMADD y03, alpha8, a7, y03 addi AO7, AO7, 4 * SIZE FMADD y04, alpha8, a8, y04 addi AO8, AO8, 4 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi Y1, Y1, 4 * SIZE .align 4LL(17): andi. r0, M, 2 ble LL(18) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) LFD a5, 0 * SIZE(AO3) LFD a6, 1 * SIZE(AO3) LFD a7, 0 * SIZE(AO4) LFD a8, 1 * SIZE(AO4) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO5) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO5) FMADD y01, alpha2, a3, y01 LFD a3, 0 * SIZE(AO6) FMADD y02, alpha2, a4, y02 LFD a4, 1 * SIZE(AO6) FMADD y01, alpha3, a5, y01 LFD a5, 0 * SIZE(AO7) FMADD y02, alpha3, a6, y02 LFD a6, 1 * SIZE(AO7) FMADD y01, alpha4, a7, y01 LFD a7, 0 * SIZE(AO8) FMADD y02, alpha4, a8, y02 LFD a8, 1 * SIZE(AO8) FMADD y01, alpha5, a1, y01 addi AO1, AO1, 2 * SIZE FMADD y02, alpha5, a2, y02 addi AO2, AO2, 2 * SIZE FMADD y01, alpha6, a3, y01 addi AO3, AO3, 2 * SIZE FMADD y02, alpha6, a4, y02 addi AO4, AO4, 2 * SIZE FMADD y01, alpha7, a5, y01 addi AO5, AO5, 2 * SIZE FMADD y02, alpha7, a6, y02 addi AO6, AO6, 2 * SIZE FMADD y01, alpha8, a7, y01 addi AO7, AO7, 2 * SIZE FMADD y02, alpha8, a8, y02 addi AO8, AO8, 2 * SIZE STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi Y1, Y1, 2 * SIZE .align 4LL(18): andi. 
r0, M, 1 ble LL(19) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 0 * SIZE(AO2) LFD a3, 0 * SIZE(AO3) LFD a4, 0 * SIZE(AO4) LFD a5, 0 * SIZE(AO5) LFD a6, 0 * SIZE(AO6) LFD a7, 0 * SIZE(AO7) LFD a8, 0 * SIZE(AO8) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 FMADD y01, alpha3, a3, y01 FMADD y01, alpha4, a4, y01 FMADD y01, alpha5, a5, y01 FMADD y01, alpha6, a6, y01 FMADD y01, alpha7, a7, y01 FMADD y01, alpha8, a8, y01 STFD y01, 0 * SIZE(Y1) .align 4LL(19): addi J, J, -1 lfd alpha, ALPHA cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 4 mr AO1, A add AO2, A, LDA ble LL(30) .align 4 LFD alpha1, 0 * SIZE(X) add X, X, INCX LFD alpha2, 0 * SIZE(X) add X, X, INCX LFD alpha3, 0 * SIZE(X) add X, X, INCX LFD alpha4, 0 * SIZE(X) add X, X, INCX FMUL alpha1, alpha, alpha1 add AO3, AO2, LDA FMUL alpha2, alpha, alpha2 add AO4, AO3, LDA FMUL alpha3, alpha, alpha3 add A, AO4, LDA FMUL alpha4, alpha, alpha4 mr Y1, YY srawi. r0, M, 4 mtspr CTR, r0 ble LL(25) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) bdz LL(23) .align 4LL(22): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, 
alpha1, a2, y10 LFD a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2) FMADD y04, alpha2, a4, y04 LFD a4, 11 * SIZE(AO2) FMADD y05, alpha2, a5, y05 LFD a5, 12 * SIZE(AO2) FMADD y06, alpha2, a6, y06 LFD a6, 13 * SIZE(AO2) FMADD y07, alpha2, a7, y07 LFD a7, 14 * SIZE(AO2) FMADD y08, alpha2, a8, y08 LFD a8, 15 * SIZE(AO2) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE PREFETCH_A1 PREFETCH_A2 FMADD y09, alpha2, a1, y09 LFD a1, 0 * SIZE(AO3) FMADD y10, alpha2, a2, y10 LFD a2, 1 * SIZE(AO3) FMADD y11, alpha2, a3, y11 LFD a3, 2 * SIZE(AO3) FMADD y12, alpha2, a4, y12 LFD a4, 3 * SIZE(AO3) FMADD y13, alpha2, a5, y13 LFD a5, 4 * SIZE(AO3) FMADD y14, alpha2, a6, y14 LFD a6, 5 * SIZE(AO3) FMADD y15, alpha2, a7, y15 LFD a7, 6 * SIZE(AO3) FMADD y16, alpha2, a8, y16 LFD a8, 7 * SIZE(AO3) FMADD y01, alpha3, a1, y01 LFD a1, 8 * SIZE(AO3) FMADD y02, alpha3, a2, y02 LFD a2, 9 * SIZE(AO3) FMADD y03, alpha3, a3, y03 LFD a3, 10 * SIZE(AO3) FMADD y04, alpha3, a4, y04 LFD a4, 11 * SIZE(AO3) FMADD y05, alpha3, a5, y05 LFD a5, 12 * SIZE(AO3) FMADD y06, alpha3, a6, y06 LFD a6, 13 * SIZE(AO3) FMADD y07, alpha3, a7, y07 LFD a7, 14 * SIZE(AO3) FMADD y08, alpha3, a8, y08 LFD a8, 15 * SIZE(AO3) FMADD y09, alpha3, a1, y09 LFD a1, 0 * SIZE(AO4) FMADD y10, alpha3, a2, y10 LFD a2, 1 * SIZE(AO4) FMADD y11, alpha3, a3, y11 LFD a3, 2 * SIZE(AO4) FMADD y12, alpha3, a4, y12 LFD a4, 3 * SIZE(AO4) FMADD y13, alpha3, a5, y13 LFD a5, 4 * SIZE(AO4) FMADD y14, alpha3, a6, y14 LFD a6, 5 * SIZE(AO4) FMADD y15, alpha3, a7, y15 LFD a7, 6 * SIZE(AO4) FMADD y16, alpha3, a8, y16 LFD a8, 7 * SIZE(AO4) FMADD y01, 
alpha4, a1, y01 LFD a1, 8 * SIZE(AO4) FMADD y02, alpha4, a2, y02 LFD a2, 9 * SIZE(AO4) FMADD y03, alpha4, a3, y03 LFD a3, 10 * SIZE(AO4) FMADD y04, alpha4, a4, y04 LFD a4, 11 * SIZE(AO4) STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) LFD y01, 16 * SIZE(Y1) LFD y02, 17 * SIZE(Y1) LFD y03, 18 * SIZE(Y1) LFD y04, 19 * SIZE(Y1) FMADD y05, alpha4, a5, y05 LFD a5, 12 * SIZE(AO4) FMADD y06, alpha4, a6, y06 LFD a6, 13 * SIZE(AO4) FMADD y07, alpha4, a7, y07 LFD a7, 14 * SIZE(AO4) FMADD y08, alpha4, a8, y08 LFD a8, 15 * SIZE(AO4) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) LFD y05, 20 * SIZE(Y1) LFD y06, 21 * SIZE(Y1) LFD y07, 22 * SIZE(Y1) LFD y08, 23 * SIZE(Y1) addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE PREFETCH_A3 PREFETCH_A4 FMADD y09, alpha4, a1, y09 LFD a1, 0 * SIZE(AO1) FMADD y10, alpha4, a2, y10 LFD a2, 1 * SIZE(AO1) FMADD y11, alpha4, a3, y11 LFD a3, 2 * SIZE(AO1) FMADD y12, alpha4, a4, y12 LFD a4, 3 * SIZE(AO1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) LFD y09, 24 * SIZE(Y1) LFD y10, 25 * SIZE(Y1) LFD y11, 26 * SIZE(Y1) LFD y12, 27 * SIZE(Y1) FMADD y13, alpha4, a5, y13 LFD a5, 4 * SIZE(AO1) FMADD y14, alpha4, a6, y14 LFD a6, 5 * SIZE(AO1) FMADD y15, alpha4, a7, y15 LFD a7, 6 * SIZE(AO1) FMADD y16, alpha4, a8, y16 LFD a8, 7 * SIZE(AO1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) LFD y13, 28 * SIZE(Y1) LFD y14, 29 * SIZE(Y1) LFD y15, 30 * SIZE(Y1) LFD y16, 31 * SIZE(Y1) addi Y1, Y1, 16 * SIZE PREFETCH_Y bdnz LL(22) .align 4LL(23): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * 
SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 0 * SIZE(AO2) FMADD y10, alpha1, a2, y10 LFD a2, 1 * SIZE(AO2) FMADD y11, alpha1, a3, y11 LFD a3, 2 * SIZE(AO2) FMADD y12, alpha1, a4, y12 LFD a4, 3 * SIZE(AO2) FMADD y13, alpha1, a5, y13 LFD a5, 4 * SIZE(AO2) FMADD y14, alpha1, a6, y14 LFD a6, 5 * SIZE(AO2) FMADD y15, alpha1, a7, y15 LFD a7, 6 * SIZE(AO2) FMADD y16, alpha1, a8, y16 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 LFD a1, 8 * SIZE(AO2) FMADD y02, alpha2, a2, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, alpha2, a3, y03 LFD a3, 10 * SIZE(AO2)
⌨️ 快捷键说明
复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?