📄 gemv_t.s
字号:
FMADD a4, alpha, y04, a4 FMADD a5, alpha, y05, a5 FMADD a6, alpha, y06, a6 FMADD a7, alpha, y07, a7 FMADD a8, alpha, y08, a8 STFD a1, 1 * SIZE(CO) STFD a2, 2 * SIZE(CO) STFD a3, 3 * SIZE(CO) STFD a4, 4 * SIZE(CO) STFD a5, 5 * SIZE(CO) STFD a6, 6 * SIZE(CO) STFD a7, 7 * SIZE(CO) STFD a8, 8 * SIZE(CO) addi J, J, -1 addi CO, CO, 8 * SIZE cmpi cr0, 0, J, 0 bgt LL(11) b LL(20) .align 4LL(19): LFDUX a1, CO, INCY LFDUX a2, CO, INCY LFDUX a3, CO, INCY LFDUX a4, CO, INCY LFDUX a5, CO, INCY LFDUX a6, CO, INCY LFDUX a7, CO, INCY LFDUX a8, CO, INCY FADD y01, y09, y01 FADD y02, y10, y02 FADD y03, y11, y03 FADD y04, y12, y04 FADD y05, y13, y05 FADD y06, y14, y06 FADD y07, y15, y07 FADD y08, y16, y08 FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 FMADD a4, alpha, f3, a4 FMADD a5, alpha, f4, a5 FMADD a6, alpha, f5, a6 FMADD a7, alpha, f6, a7 FMADD a8, alpha, f7, a8 STFDUX a1, BO, INCY STFDUX a2, BO, INCY STFDUX a3, BO, INCY STFDUX a4, BO, INCY STFDUX a5, BO, INCY STFDUX a6, BO, INCY STFDUX a7, BO, INCY STFDUX a8, BO, INCY addi J, J, -1 cmpi cr0, 0, J, 0 bgt LL(11) .align 4 LL(20): andi. J, N, 7 ble LL(99) andi. J, N, 4 ble LL(30) mr AO1, A add AO2, A, LDA add AO3, AO2, LDA add AO4, AO3, LDA add A, AO4, LDA mr BO, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 fmr y09, y01 fmr y10, y01 fmr y11, y01 fmr y12, y01 PREFETCH_Y srawi. r0, MIN_N, 4 mtspr CTR, r0 ble LL(24) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD b5, 5 * SIZE(BO) LFD b6, 6 * SIZE(BO) LFD b7, 7 * SIZE(BO) LFD b8, 8 * SIZE(BO) bdz LL(23) .align 4LL(22): FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 5 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 6 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 6 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 6 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 7 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 7 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 7 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 8 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 8 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 8 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 9 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) FMADD y09, a5, b8, y09 LFD a5, 10 * SIZE(AO1) FMADD y10, a6, b8, y10 LFD a6, 10 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 10 * SIZE(AO3) FMADD y12, a8, b8, y12 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 11 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 11 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 11 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 12 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 12 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 12 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 13 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 13 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 14 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 14 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 14 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 14 * SIZE(AO4) LFD b1, 17 * SIZE(BO) LFD b2, 18 * SIZE(BO) LFD b3, 19 * SIZE(BO) LFD b4, 20 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 15 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 15 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 15 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 16 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 16 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 16 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 LFD a1, 17 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 17 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 17 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 17 * SIZE(AO4) FMADD y09, a5, b8, y09 LFD a5, 18 * SIZE(AO1) FMADD y10, a6, b8, y10 LFD a6, 18 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 18 * SIZE(AO3) FMADD y12, a8, b8, y12 LFD a8, 18 * SIZE(AO4) LFD b5, 21 * SIZE(BO) LFD b6, 22 * SIZE(BO) LFD b7, 23 * SIZE(BO) LFD b8, 24 * SIZE(BO) addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE PREFETCH_A1 PREFETCH_A2 addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE PREFETCH_A3 PREFETCH_A4 addi BO, BO, 16 * SIZE bdnz LL(22) .align 4 LL(23): FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 5 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 6 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 6 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 6 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 6 * SIZE(AO4) LFD b1, 9 * SIZE(BO) LFD b2, 10 * SIZE(BO) LFD b3, 11 * SIZE(BO) LFD b4, 12 * SIZE(BO) FMADD y01, a1, b5, y01 LFD a1, 7 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 7 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 7 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 7 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 8 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 8 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 8 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b7, y01 LFD a1, 9 * SIZE(AO1) FMADD y02, a2, b7, y02 LFD a2, 9 * SIZE(AO2) FMADD y03, a3, b7, y03 LFD a3, 9 * SIZE(AO3) FMADD y04, a4, b7, y04 LFD a4, 9 * SIZE(AO4) FMADD y09, a5, b8, y09 LFD a5, 10 * SIZE(AO1) FMADD y10, a6, b8, y10 LFD a6, 10 * SIZE(AO2) FMADD y11, a7, b8, y11 LFD a7, 10 * SIZE(AO3) FMADD y12, a8, b8, y12 LFD a8, 10 * SIZE(AO4) LFD b5, 13 * SIZE(BO) LFD b6, 14 * SIZE(BO) LFD b7, 15 * SIZE(BO) LFD b8, 16 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 11 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 11 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 11 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 11 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 12 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 12 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 12 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 12 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 13 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 13 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 13 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 13 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 14 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 14 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 14 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 14 * SIZE(AO4) FMADD y01, a1, b5, y01 LFD a1, 15 * SIZE(AO1) FMADD y02, a2, b5, y02 LFD a2, 15 * SIZE(AO2) FMADD y03, a3, b5, y03 LFD a3, 15 * SIZE(AO3) FMADD y04, a4, b5, y04 LFD a4, 15 * SIZE(AO4) FMADD y09, a5, b6, y09 LFD a5, 16 * SIZE(AO1) FMADD y10, a6, b6, y10 LFD a6, 16 * SIZE(AO2) FMADD y11, a7, b6, y11 LFD a7, 16 * SIZE(AO3) FMADD y12, a8, b6, y12 LFD a8, 16 * SIZE(AO4) FMADD y01, a1, b7, y01 FMADD y02, a2, b7, y02 FMADD y03, a3, b7, y03 FMADD y04, a4, b7, y04 FMADD y09, a5, b8, y09 FMADD y10, a6, b8, y10 FMADD y11, a7, b8, y11 FMADD y12, a8, b8, y12 addi AO1, AO1, 16 * SIZE addi AO2, AO2, 16 * SIZE addi AO3, AO3, 16 * SIZE addi AO4, AO4, 16 * SIZE addi BO, BO, 16 * SIZE .align 4LL(24): andi. r0, MIN_N, 15 ble LL(28) andi. r0, MIN_N, 8 ble LL(25) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 LFD a1, 5 * SIZE(AO1) FMADD y02, a2, b3, y02 LFD a2, 5 * SIZE(AO2) FMADD y03, a3, b3, y03 LFD a3, 5 * SIZE(AO3) FMADD y04, a4, b3, y04 LFD a4, 5 * SIZE(AO4) FMADD y09, a5, b4, y09 LFD a5, 6 * SIZE(AO1) FMADD y10, a6, b4, y10 LFD a6, 6 * SIZE(AO2) FMADD y11, a7, b4, y11 LFD a7, 6 * SIZE(AO3) FMADD y12, a8, b4, y12 LFD a8, 6 * SIZE(AO4) LFD b1, 5 * SIZE(BO) LFD b2, 6 * SIZE(BO) LFD b3, 7 * SIZE(BO) LFD b4, 8 * SIZE(BO) FMADD y01, a1, b1, y01 LFD a1, 7 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 7 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 7 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 7 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 8 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 8 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 8 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 8 * SIZE(AO4) FMADD y01, a1, b3, y01 FMADD y02, a2, b3, y02 FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 FMADD y09, a5, b4, y09 addi AO1, AO1, 8 * SIZE FMADD y10, a6, b4, y10 addi AO2, AO2, 8 * SIZE FMADD y11, a7, b4, y11 addi AO3, AO3, 8 * SIZE FMADD y12, a8, b4, y12 addi AO4, AO4, 8 * SIZE addi BO, BO, 8 * SIZE .align 4LL(25): andi. r0, MIN_N, 4 ble LL(26) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) FMADD y01, a1, b1, y01 LFD a1, 3 * SIZE(AO1) FMADD y02, a2, b1, y02 LFD a2, 3 * SIZE(AO2) FMADD y03, a3, b1, y03 LFD a3, 3 * SIZE(AO3) FMADD y04, a4, b1, y04 LFD a4, 3 * SIZE(AO4) FMADD y09, a5, b2, y09 LFD a5, 4 * SIZE(AO1) FMADD y10, a6, b2, y10 LFD a6, 4 * SIZE(AO2) FMADD y11, a7, b2, y11 LFD a7, 4 * SIZE(AO3) FMADD y12, a8, b2, y12 LFD a8, 4 * SIZE(AO4) FMADD y01, a1, b3, y01 FMADD y02, a2, b3, y02 FMADD y03, a3, b3, y03 FMADD y04, a4, b3, y04 FMADD y09, a5, b4, y09 addi AO1, AO1, 4 * SIZE FMADD y10, a6, b4, y10 addi AO2, AO2, 4 * SIZE FMADD y11, a7, b4, y11 addi AO3, AO3, 4 * SIZE FMADD y12, a8, b4, y12 addi AO4, AO4, 4 * SIZE addi BO, BO, 4 * SIZE .align 4LL(26): andi. r0, MIN_N, 2 ble LL(27) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) LFD a5, 2 * SIZE(AO1) LFD a6, 2 * SIZE(AO2) LFD a7, 2 * SIZE(AO3) LFD a8, 2 * SIZE(AO4) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 FMADD y09, a5, b2, y09 addi AO1, AO1, 2 * SIZE FMADD y10, a6, b2, y10 addi AO2, AO2, 2 * SIZE FMADD y11, a7, b2, y11 addi AO3, AO3, 2 * SIZE FMADD y12, a8, b2, y12 addi AO4, AO4, 2 * SIZE addi BO, BO, 2 * SIZE .align 4LL(27): andi. r0, MIN_N, 1 ble LL(28) LFD a1, 1 * SIZE(AO1) LFD b1, 1 * SIZE(BO) LFD a2, 1 * SIZE(AO2) LFD a3, 1 * SIZE(AO3) LFD a4, 1 * SIZE(AO4) FMADD y01, a1, b1, y01 FMADD y02, a2, b1, y02 FMADD y03, a3, b1, y03 FMADD y04, a4, b1, y04 .align 4LL(28): mr BO, CO lfd alpha, ALPHA cmpi cr0, 0, INCY, SIZE bne LL(29) LFD a1, 1 * SIZE(CO) LFD a2, 2 * SIZE(CO) LFD a3, 3 * SIZE(CO) LFD a4, 4 * SIZE(CO) FADD y01, y09, y01 FADD y02, y10, y02 FADD y03, y11, y03 FADD y04, y12, y04 FMADD a1, alpha, y01, a1 FMADD a2, alpha, y02, a2 FMADD a3, alpha, y03, a3 FMADD a4, alpha, y04, a4 STFD a1, 1 * SIZE(CO) STFD a2, 2 * SIZE(CO) STFD a3, 3 * SIZE(CO) STFD a4, 4 * SIZE(CO) addi CO, CO, 4 * SIZE b LL(30) .align 4LL(29): LFDUX a1, CO, INCY LFDUX a2, CO, INCY LFDUX a3, CO, INCY LFDUX a4, CO, INCY FADD y01, y09, y01 FADD y02, y10, y02 FADD y03, y11, y03 FADD y04, y12, y04 FMADD a1, alpha, f0, a1 FMADD a2, alpha, f1, a2 FMADD a3, alpha, f2, a3 FMADD a4, alpha, f3, a4 STFDUX a1, BO, INCY STFDUX a2, BO, INCY STFDUX a3, BO, INCY STFDUX a4, BO, INCY .align 4LL(30): andi. J, N, 2 ble LL(40) mr AO1, A add AO2, A, LDA add A, AO2, LDA mr BO, XP lfd y01, FZERO fmr y02, y01 fmr y03, y01 fmr y04, y01 fmr y09, y01 fmr y10, y01 fmr y11, y01 fmr y12, y01 PREFETCH_Y srawi. r0, MIN_N, 4 mtspr CTR, r0 ble LL(34) LFD a1, 1 * SIZE(AO1) LFD a2, 1 * SIZE(AO2) LFD a3, 2 * SIZE(AO1) LFD a4, 2 * SIZE(AO2) LFD a5, 3 * SIZE(AO1) LFD a6, 3 * SIZE(AO2) LFD a7, 4 * SIZE(AO1) LFD a8, 4 * SIZE(AO2) LFD b1, 1 * SIZE(BO) LFD b2, 2 * SIZE(BO) LFD b3, 3 * SIZE(BO) LFD b4, 4 * SIZE(BO) LFD b5, 5 * SIZE(BO)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -