📄 gemv_t.s
字号:
/* ---------------------------------------------------------------------
 * Two-column dot-product kernel, then the single leftover column, then
 * the outer-loop tail and the register-restoring epilogue.
 *
 * Register roles as used below:
 *   AO1, AO2 : read pointers into two columns of A
 *   BO       : read pointer into the buffered x vector (reloaded from XP),
 *              later reused as the store pointer for the strided y update
 *   CO       : pointer into y
 *   y01,y03,y09,y11 : partial sums fed from AO1 data
 *   y02,y04,y10,y12 : partial sums fed from AO2 data
 *   CTR      : trip count of the 16-element unrolled loops
 *
 * All element loads start at offset 1 * SIZE, so the pointers are
 * presumably biased one element low by setup code that is not shown
 * here -- TODO confirm against the prologue.
 * ------------------------------------------------------------------ */

	/* Last three x operands of the preload that precedes the loop. */
	LFD	b6, 6 * SIZE(BO)
	LFD	b7, 7 * SIZE(BO)
	LFD	b8, 8 * SIZE(BO)
	bdz	LL(33)			/* only one 16-element group: go drain it */
	.align 4

/* Main loop, two columns: 16 elements per trip, software pipelined.
 * Each FMADD consumes operands loaded earlier while the paired LFD
 * fetches the operands for a later FMADD. */
LL(32):
	/* elements 1..4 (x in b1..b4) */
	FMADD	y01, a1, b1, y01
	LFD	a1, 5 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFD	a2, 5 * SIZE(AO2)
	FMADD	y03, a3, b2, y03
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y04, a4, b2, y04
	LFD	a4, 6 * SIZE(AO2)
	FMADD	y09, a5, b3, y09
	LFD	a5, 7 * SIZE(AO1)
	FMADD	y10, a6, b3, y10
	LFD	a6, 7 * SIZE(AO2)
	FMADD	y11, a7, b4, y11
	LFD	a7, 8 * SIZE(AO1)
	FMADD	y12, a8, b4, y12
	LFD	a8, 8 * SIZE(AO2)
	LFD	b1, 9 * SIZE(BO)
	LFD	b2, 10 * SIZE(BO)
	LFD	b3, 11 * SIZE(BO)
	LFD	b4, 12 * SIZE(BO)

	/* elements 5..8 (x in b5..b8) */
	FMADD	y01, a1, b5, y01
	LFD	a1, 9 * SIZE(AO1)
	FMADD	y02, a2, b5, y02
	LFD	a2, 9 * SIZE(AO2)
	FMADD	y03, a3, b6, y03
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y04, a4, b6, y04
	LFD	a4, 10 * SIZE(AO2)
	FMADD	y09, a5, b7, y09
	LFD	a5, 11 * SIZE(AO1)
	FMADD	y10, a6, b7, y10
	LFD	a6, 11 * SIZE(AO2)
	FMADD	y11, a7, b8, y11
	LFD	a7, 12 * SIZE(AO1)
	FMADD	y12, a8, b8, y12
	LFD	a8, 12 * SIZE(AO2)
	LFD	b5, 13 * SIZE(BO)
	LFD	b6, 14 * SIZE(BO)
	LFD	b7, 15 * SIZE(BO)
	LFD	b8, 16 * SIZE(BO)

	/* elements 9..12 */
	FMADD	y01, a1, b1, y01
	LFD	a1, 13 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFD	a2, 13 * SIZE(AO2)
	FMADD	y03, a3, b2, y03
	LFD	a3, 14 * SIZE(AO1)
	FMADD	y04, a4, b2, y04
	LFD	a4, 14 * SIZE(AO2)
	FMADD	y09, a5, b3, y09
	LFD	a5, 15 * SIZE(AO1)
	FMADD	y10, a6, b3, y10
	LFD	a6, 15 * SIZE(AO2)
	FMADD	y11, a7, b4, y11
	LFD	a7, 16 * SIZE(AO1)
	FMADD	y12, a8, b4, y12
	LFD	a8, 16 * SIZE(AO2)
	LFD	b1, 17 * SIZE(BO)
	LFD	b2, 18 * SIZE(BO)
	LFD	b3, 19 * SIZE(BO)
	LFD	b4, 20 * SIZE(BO)

	/* elements 13..16; the loads already reach into the next trip */
	FMADD	y01, a1, b5, y01
	LFD	a1, 17 * SIZE(AO1)
	FMADD	y02, a2, b5, y02
	LFD	a2, 17 * SIZE(AO2)
	FMADD	y03, a3, b6, y03
	LFD	a3, 18 * SIZE(AO1)
	FMADD	y04, a4, b6, y04
	LFD	a4, 18 * SIZE(AO2)
	FMADD	y09, a5, b7, y09
	LFD	a5, 19 * SIZE(AO1)
	FMADD	y10, a6, b7, y10
	LFD	a6, 19 * SIZE(AO2)
	FMADD	y11, a7, b8, y11
	LFD	a7, 20 * SIZE(AO1)
	FMADD	y12, a8, b8, y12
	LFD	a8, 20 * SIZE(AO2)
	LFD	b5, 21 * SIZE(BO)
	LFD	b6, 22 * SIZE(BO)
	LFD	b7, 23 * SIZE(BO)
	LFD	b8, 24 * SIZE(BO)

	addi	AO1, AO1, 16 * SIZE
	addi	AO2, AO2, 16 * SIZE
	PREFETCH_A1
	PREFETCH_A2
	addi	BO, BO, 16 * SIZE
	bdnz	LL(32)
	.align 4

/* Drain: same 16-element group as above, but nothing is loaded beyond
 * offset 16 * SIZE, so the final four-element step is FMADD only. */
LL(33):
	FMADD	y01, a1, b1, y01
	LFD	a1, 5 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFD	a2, 5 * SIZE(AO2)
	FMADD	y03, a3, b2, y03
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y04, a4, b2, y04
	LFD	a4, 6 * SIZE(AO2)
	FMADD	y09, a5, b3, y09
	LFD	a5, 7 * SIZE(AO1)
	FMADD	y10, a6, b3, y10
	LFD	a6, 7 * SIZE(AO2)
	FMADD	y11, a7, b4, y11
	LFD	a7, 8 * SIZE(AO1)
	FMADD	y12, a8, b4, y12
	LFD	a8, 8 * SIZE(AO2)
	LFD	b1, 9 * SIZE(BO)
	LFD	b2, 10 * SIZE(BO)
	LFD	b3, 11 * SIZE(BO)
	LFD	b4, 12 * SIZE(BO)

	FMADD	y01, a1, b5, y01
	LFD	a1, 9 * SIZE(AO1)
	FMADD	y02, a2, b5, y02
	LFD	a2, 9 * SIZE(AO2)
	FMADD	y03, a3, b6, y03
	LFD	a3, 10 * SIZE(AO1)
	FMADD	y04, a4, b6, y04
	LFD	a4, 10 * SIZE(AO2)
	FMADD	y09, a5, b7, y09
	LFD	a5, 11 * SIZE(AO1)
	FMADD	y10, a6, b7, y10
	LFD	a6, 11 * SIZE(AO2)
	FMADD	y11, a7, b8, y11
	LFD	a7, 12 * SIZE(AO1)
	FMADD	y12, a8, b8, y12
	LFD	a8, 12 * SIZE(AO2)
	LFD	b5, 13 * SIZE(BO)
	LFD	b6, 14 * SIZE(BO)
	LFD	b7, 15 * SIZE(BO)
	LFD	b8, 16 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	LFD	a1, 13 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFD	a2, 13 * SIZE(AO2)
	FMADD	y03, a3, b2, y03
	LFD	a3, 14 * SIZE(AO1)
	FMADD	y04, a4, b2, y04
	LFD	a4, 14 * SIZE(AO2)
	FMADD	y09, a5, b3, y09
	LFD	a5, 15 * SIZE(AO1)
	FMADD	y10, a6, b3, y10
	LFD	a6, 15 * SIZE(AO2)
	FMADD	y11, a7, b4, y11
	LFD	a7, 16 * SIZE(AO1)
	FMADD	y12, a8, b4, y12
	LFD	a8, 16 * SIZE(AO2)

	FMADD	y01, a1, b5, y01
	FMADD	y02, a2, b5, y02
	FMADD	y03, a3, b6, y03
	FMADD	y04, a4, b6, y04
	FMADD	y09, a5, b7, y09
	FMADD	y10, a6, b7, y10
	FMADD	y11, a7, b8, y11
	FMADD	y12, a8, b8, y12

	addi	AO1, AO1, 16 * SIZE
	addi	AO2, AO2, 16 * SIZE
	addi	BO, BO, 16 * SIZE
	.align 4

/* Remainder of MIN_N modulo 16 for the two-column case, peeled as
 * 8 + 4 + 2 + 1.  andi. leaves 0 or the tested bit in r0, so each
 * "ble" is taken exactly when that bit is clear. */
LL(34):
	andi.	r0, MIN_N, 15
	ble	LL(38)			/* no remainder at all */

	andi.	r0, MIN_N, 8
	ble	LL(35)

	/* 8 elements; only y01/y02/y09/y10 are used as accumulators here */
	LFD	a1, 1 * SIZE(AO1)
	LFD	a2, 1 * SIZE(AO2)
	LFD	a3, 2 * SIZE(AO1)
	LFD	a4, 2 * SIZE(AO2)
	LFD	b1, 1 * SIZE(BO)
	LFD	b2, 2 * SIZE(BO)
	LFD	b3, 3 * SIZE(BO)
	LFD	b4, 4 * SIZE(BO)
	LFD	a5, 3 * SIZE(AO1)
	LFD	a6, 3 * SIZE(AO2)
	LFD	a7, 4 * SIZE(AO1)
	LFD	a8, 4 * SIZE(AO2)
	LFD	b5, 5 * SIZE(BO)
	LFD	b6, 6 * SIZE(BO)
	LFD	b7, 7 * SIZE(BO)
	LFD	b8, 8 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	LFD	a1, 5 * SIZE(AO1)
	FMADD	y02, a2, b1, y02
	LFD	a2, 5 * SIZE(AO2)
	FMADD	y09, a3, b2, y09
	LFD	a3, 6 * SIZE(AO1)
	FMADD	y10, a4, b2, y10
	LFD	a4, 6 * SIZE(AO2)
	FMADD	y01, a5, b3, y01
	LFD	a5, 7 * SIZE(AO1)
	FMADD	y02, a6, b3, y02
	LFD	a6, 7 * SIZE(AO2)
	FMADD	y09, a7, b4, y09
	LFD	a7, 8 * SIZE(AO1)
	FMADD	y10, a8, b4, y10
	LFD	a8, 8 * SIZE(AO2)

	FMADD	y01, a1, b5, y01
	FMADD	y02, a2, b5, y02
	FMADD	y09, a3, b6, y09
	FMADD	y10, a4, b6, y10
	FMADD	y01, a5, b7, y01
	addi	AO1, AO1, 8 * SIZE
	FMADD	y02, a6, b7, y02
	addi	AO2, AO2, 8 * SIZE
	FMADD	y09, a7, b8, y09
	addi	BO, BO, 8 * SIZE
	FMADD	y10, a8, b8, y10
	nop
	.align 4

LL(35):
	andi.	r0, MIN_N, 4
	ble	LL(36)

	/* 4 elements */
	LFD	a1, 1 * SIZE(AO1)
	LFD	a2, 1 * SIZE(AO2)
	LFD	a3, 2 * SIZE(AO1)
	LFD	a4, 2 * SIZE(AO2)
	LFD	a5, 3 * SIZE(AO1)
	LFD	a6, 3 * SIZE(AO2)
	LFD	a7, 4 * SIZE(AO1)
	LFD	a8, 4 * SIZE(AO2)
	LFD	b1, 1 * SIZE(BO)
	LFD	b2, 2 * SIZE(BO)
	LFD	b3, 3 * SIZE(BO)
	LFD	b4, 4 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	FMADD	y02, a2, b1, y02
	FMADD	y09, a3, b2, y09
	FMADD	y10, a4, b2, y10
	FMADD	y01, a5, b3, y01
	addi	AO1, AO1, 4 * SIZE
	FMADD	y02, a6, b3, y02
	addi	AO2, AO2, 4 * SIZE
	FMADD	y09, a7, b4, y09
	addi	BO, BO, 4 * SIZE
	FMADD	y10, a8, b4, y10
	.align 4

LL(36):
	andi.	r0, MIN_N, 2
	ble	LL(37)

	/* 2 elements */
	LFD	a1, 1 * SIZE(AO1)
	LFD	a2, 1 * SIZE(AO2)
	LFD	b1, 1 * SIZE(BO)
	LFD	b2, 2 * SIZE(BO)
	LFD	a3, 2 * SIZE(AO1)
	LFD	a4, 2 * SIZE(AO2)

	FMADD	y01, a1, b1, y01
	FMADD	y02, a2, b1, y02
	FMADD	y09, a3, b2, y09
	FMADD	y10, a4, b2, y10

	addi	AO1, AO1, 2 * SIZE
	addi	AO2, AO2, 2 * SIZE
	addi	BO, BO, 2 * SIZE
	.align 4

LL(37):
	andi.	r0, MIN_N, 1
	ble	LL(38)

	/* final single element; pointers are not advanced afterwards */
	LFD	a1, 1 * SIZE(AO1)
	LFD	b1, 1 * SIZE(BO)
	LFD	a2, 1 * SIZE(AO2)

	FMADD	y01, a1, b1, y01
	FMADD	y02, a2, b1, y02
	.align 4

/* Write-back for the two columns: fold the partial sums into y01 / y02
 * and add alpha * sum to two elements of y. */
LL(38):
	mr	BO, CO			/* BO = store pointer for the strided path */
	lfd	alpha, ALPHA
	cmpi	cr0, 0, INCY, SIZE
	bne	LL(39)			/* INCY != SIZE: strided y */

	/* y stride equals SIZE: plain displacement loads/stores */
	LFD	a1, 1 * SIZE(CO)
	LFD	a2, 2 * SIZE(CO)

	FADD	y01, y03, y01
	FADD	y02, y04, y02
	FADD	y09, y11, y09
	FADD	y10, y12, y10
	FADD	y01, y09, y01
	FADD	y02, y10, y02

	FMADD	a1, alpha, y01, a1
	FMADD	a2, alpha, y02, a2

	STFD	a1, 1 * SIZE(CO)
	STFD	a2, 2 * SIZE(CO)
	addi	CO, CO, 2 * SIZE
	b	LL(40)
	.align 4

/* Strided y: update-form loads walk CO, update-form stores walk BO
 * (the copy of CO taken above), each stepping by INCY. */
LL(39):
	LFDUX	a1, CO, INCY
	LFDUX	a2, CO, INCY

	FADD	y01, y03, y01
	FADD	y02, y04, y02
	FADD	y09, y11, y09
	FADD	y10, y12, y10
	FADD	y01, y09, y01
	FADD	y02, y10, y02

	/* Multiply by the named accumulators that hold the sums just
	 * computed, exactly as the unit-stride path does. */
	FMADD	a1, alpha, y01, a1
	FMADD	a2, alpha, y02, a2

	STFDUX	a1, BO, INCY
	STFDUX	a2, BO, INCY
	.align 4

/* Single leftover column, handled only when bit 0 of N is set. */
LL(40):
	andi.	J, N, 1
	ble	LL(99)

	mr	AO1, A
	add	A, A, LDA		/* A advances past this column */
	mr	BO, XP			/* restart at the beginning of buffered x */

	/* clear the accumulators from the FZERO slot */
	lfd	y01, FZERO
	fmr	y02, y01
	fmr	y03, y01
	fmr	y04, y01
	fmr	y09, y01
	fmr	y10, y01
	fmr	y11, y01
	fmr	y12, y01

	PREFETCH_Y

	srawi.	r0, MIN_N, 4		/* trip count = MIN_N / 16 */
	mtspr	CTR, r0
	ble	LL(44)			/* fewer than 16 elements: remainder only */

	/* preload the first eight A and x operands */
	LFD	a1, 1 * SIZE(AO1)
	LFD	a2, 2 * SIZE(AO1)
	LFD	a3, 3 * SIZE(AO1)
	LFD	a4, 4 * SIZE(AO1)
	LFD	a5, 5 * SIZE(AO1)
	LFD	a6, 6 * SIZE(AO1)
	LFD	a7, 7 * SIZE(AO1)
	LFD	a8, 8 * SIZE(AO1)

	LFD	b1, 1 * SIZE(BO)
	LFD	b2, 2 * SIZE(BO)
	LFD	b3, 3 * SIZE(BO)
	LFD	b4, 4 * SIZE(BO)
	LFD	b5, 5 * SIZE(BO)
	LFD	b6, 6 * SIZE(BO)
	LFD	b7, 7 * SIZE(BO)
	LFD	b8, 8 * SIZE(BO)
	bdz	LL(43)
	.align 4

/* Main loop, one column: 16 elements per trip spread over y01..y04.
 * NOTE(review): the nops look like issue-slot padding -- confirm
 * before removing any of them. */
LL(42):
	FMADD	y01, a1, b1, y01
	nop
	LFD	a1, 9 * SIZE(AO1)
	LFD	b1, 9 * SIZE(BO)

	FMADD	y02, a2, b2, y02
	nop
	LFD	a2, 10 * SIZE(AO1)
	LFD	b2, 10 * SIZE(BO)

	FMADD	y03, a3, b3, y03
	nop
	LFD	a3, 11 * SIZE(AO1)
	LFD	b3, 11 * SIZE(BO)

	FMADD	y04, a4, b4, y04
	nop
	LFD	a4, 12 * SIZE(AO1)
	LFD	b4, 12 * SIZE(BO)

	FMADD	y01, a5, b5, y01
	nop
	LFD	a5, 13 * SIZE(AO1)
	LFD	b5, 13 * SIZE(BO)

	FMADD	y02, a6, b6, y02
	nop
	LFD	a6, 14 * SIZE(AO1)
	LFD	b6, 14 * SIZE(BO)

	FMADD	y03, a7, b7, y03
	nop
	LFD	a7, 15 * SIZE(AO1)
	LFD	b7, 15 * SIZE(BO)

	FMADD	y04, a8, b8, y04
	nop
	LFD	a8, 16 * SIZE(AO1)
	LFD	b8, 16 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	nop
	LFD	a1, 17 * SIZE(AO1)
	LFD	b1, 17 * SIZE(BO)

	FMADD	y02, a2, b2, y02
	nop
	LFD	a2, 18 * SIZE(AO1)
	LFD	b2, 18 * SIZE(BO)

	FMADD	y03, a3, b3, y03
	nop
	LFD	a3, 19 * SIZE(AO1)
	LFD	b3, 19 * SIZE(BO)

	FMADD	y04, a4, b4, y04
	nop
	LFD	a4, 20 * SIZE(AO1)
	LFD	b4, 20 * SIZE(BO)

	FMADD	y01, a5, b5, y01
	nop
	LFD	a5, 21 * SIZE(AO1)
	LFD	b5, 21 * SIZE(BO)

	FMADD	y02, a6, b6, y02
	nop
	LFD	a6, 22 * SIZE(AO1)
	LFD	b6, 22 * SIZE(BO)

	FMADD	y03, a7, b7, y03
	nop
	LFD	a7, 23 * SIZE(AO1)
	LFD	b7, 23 * SIZE(BO)

	FMADD	y04, a8, b8, y04
	nop
	LFD	a8, 24 * SIZE(AO1)
	LFD	b8, 24 * SIZE(BO)

	addi	AO1, AO1, 16 * SIZE
	addi	BO, BO, 16 * SIZE
	PREFETCH_A1
	bdnz	LL(42)
	.align 4

/* Drain of the one-column loop: nothing is loaded past 16 * SIZE. */
LL(43):
	FMADD	y01, a1, b1, y01
	nop
	LFD	a1, 9 * SIZE(AO1)
	LFD	b1, 9 * SIZE(BO)

	FMADD	y02, a2, b2, y02
	nop
	LFD	a2, 10 * SIZE(AO1)
	LFD	b2, 10 * SIZE(BO)

	FMADD	y03, a3, b3, y03
	nop
	LFD	a3, 11 * SIZE(AO1)
	LFD	b3, 11 * SIZE(BO)

	FMADD	y04, a4, b4, y04
	nop
	LFD	a4, 12 * SIZE(AO1)
	LFD	b4, 12 * SIZE(BO)

	FMADD	y01, a5, b5, y01
	nop
	LFD	a5, 13 * SIZE(AO1)
	LFD	b5, 13 * SIZE(BO)

	FMADD	y02, a6, b6, y02
	nop
	LFD	a6, 14 * SIZE(AO1)
	LFD	b6, 14 * SIZE(BO)

	FMADD	y03, a7, b7, y03
	nop
	LFD	a7, 15 * SIZE(AO1)
	LFD	b7, 15 * SIZE(BO)

	FMADD	y04, a8, b8, y04
	nop
	LFD	a8, 16 * SIZE(AO1)
	LFD	b8, 16 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	FMADD	y02, a2, b2, y02
	FMADD	y03, a3, b3, y03
	FMADD	y04, a4, b4, y04

	FMADD	y01, a5, b5, y01
	addi	AO1, AO1, 16 * SIZE
	FMADD	y02, a6, b6, y02
	addi	BO, BO, 16 * SIZE
	FMADD	y03, a7, b7, y03
	nop
	FMADD	y04, a8, b8, y04
	nop
	.align 4

/* Remainder of MIN_N modulo 16 for the single column: 8 + 4 + 2 + 1. */
LL(44):
	andi.	r0, MIN_N, 15
	ble	LL(48)

	andi.	r0, MIN_N, 8
	ble	LL(45)

	/* 8 elements */
	LFD	a1, 1 * SIZE(AO1)
	LFD	a2, 2 * SIZE(AO1)
	LFD	a3, 3 * SIZE(AO1)
	LFD	a4, 4 * SIZE(AO1)
	LFD	b1, 1 * SIZE(BO)
	LFD	b2, 2 * SIZE(BO)
	LFD	b3, 3 * SIZE(BO)
	LFD	b4, 4 * SIZE(BO)
	LFD	a5, 5 * SIZE(AO1)
	LFD	a6, 6 * SIZE(AO1)
	LFD	a7, 7 * SIZE(AO1)
	LFD	a8, 8 * SIZE(AO1)
	LFD	b5, 5 * SIZE(BO)
	LFD	b6, 6 * SIZE(BO)
	LFD	b7, 7 * SIZE(BO)
	LFD	b8, 8 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	FMADD	y02, a2, b2, y02
	FMADD	y03, a3, b3, y03
	FMADD	y04, a4, b4, y04

	FMADD	y01, a5, b5, y01
	addi	AO1, AO1, 8 * SIZE
	FMADD	y02, a6, b6, y02
	addi	BO, BO, 8 * SIZE
	FMADD	y03, a7, b7, y03
	nop
	FMADD	y04, a8, b8, y04
	nop
	.align 4

LL(45):
	andi.	r0, MIN_N, 4
	ble	LL(46)

	/* 4 elements */
	LFD	a1, 1 * SIZE(AO1)
	LFD	b1, 1 * SIZE(BO)
	LFD	a2, 2 * SIZE(AO1)
	LFD	b2, 2 * SIZE(BO)
	LFD	a3, 3 * SIZE(AO1)
	LFD	b3, 3 * SIZE(BO)
	LFD	a4, 4 * SIZE(AO1)
	LFD	b4, 4 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	addi	AO1, AO1, 4 * SIZE
	FMADD	y02, a2, b2, y02
	/* NOTE(review): AO2 is never read anywhere in this single-column
	 * path, so this increment looks like a leftover from the
	 * two-column code.  Kept because code outside this section may
	 * depend on AO2 -- confirm before deleting. */
	addi	AO2, AO2, 4 * SIZE
	FMADD	y03, a3, b3, y03
	addi	BO, BO, 4 * SIZE
	FMADD	y04, a4, b4, y04
	nop
	.align 4

LL(46):
	andi.	r0, MIN_N, 2
	ble	LL(47)

	/* 2 elements */
	LFD	a1, 1 * SIZE(AO1)
	LFD	b1, 1 * SIZE(BO)
	LFD	a2, 2 * SIZE(AO1)
	LFD	b2, 2 * SIZE(BO)

	FMADD	y01, a1, b1, y01
	addi	AO1, AO1, 2 * SIZE
	FMADD	y02, a2, b2, y02
	addi	BO, BO, 2 * SIZE
	.align 4

LL(47):
	andi.	r0, MIN_N, 1
	ble	LL(48)

	/* final single element */
	LFD	a1, 1 * SIZE(AO1)
	LFD	b1, 1 * SIZE(BO)
	FMADD	y01, a1, b1, y01
	.align 4

/* Write-back for the single column: y01 = y01 + y02 + y03 + y04, then
 * one element of y gets alpha * y01 added. */
LL(48):
	mr	BO, CO
	lfd	alpha, ALPHA
	cmpi	cr0, 0, INCY, SIZE
	bne	LL(49)

	LFD	a1, 1 * SIZE(CO)

	FADD	y01, y02, y01
	FADD	y03, y04, y03
	FADD	y01, y03, y01

	FMADD	a1, alpha, y01, a1
	STFD	a1, 1 * SIZE(CO)
	b	LL(99)
	.align 4

LL(49):
	LFDUX	a1, CO, INCY

	FADD	y01, y02, y01
	FADD	y03, y04, y03
	FADD	y01, y03, y01

	/* use the named accumulator, as the unit-stride path does */
	FMADD	a1, alpha, y01, a1
	STFDUX	a1, BO, INCY
	.align 4

/* Outer-loop tail: A -= PLDA_M, IS += P, repeat while IS < M. */
LL(99):
	subf	A, PLDA_M, A
	addi	IS, IS, P
	cmp	cr0, 0, IS, M
	blt	LL(ISLoop)
	.align 4

/* Exit: r3 = 0, reload f14-f31 and r14-r29 from the stack frame,
 * release STACKSIZE bytes and return. */
LL(999):
	li	r3, 0

	lfd	f14, 0(SP)
	lfd	f15, 8(SP)
	lfd	f16, 16(SP)
	lfd	f17, 24(SP)
	lfd	f18, 32(SP)
	lfd	f19, 40(SP)
	lfd	f20, 48(SP)
	lfd	f21, 56(SP)
	lfd	f22, 64(SP)
	lfd	f23, 72(SP)
	lfd	f24, 80(SP)
	lfd	f25, 88(SP)
	lfd	f26, 96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)
	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

#ifdef __64BIT__
	/* 64-bit build: 8-byte GPR save slots */
	ld	r14, 160(SP)
	ld	r15, 168(SP)
	ld	r16, 176(SP)
	ld	r17, 184(SP)
	ld	r18, 192(SP)
	ld	r19, 200(SP)
	ld	r20, 208(SP)
	ld	r21, 216(SP)
	ld	r22, 224(SP)
	ld	r23, 232(SP)
	ld	r24, 240(SP)
	ld	r25, 248(SP)
	ld	r26, 256(SP)
	ld	r27, 264(SP)
	ld	r28, 272(SP)
	ld	r29, 280(SP)
#else
	/* 32-bit build: 4-byte GPR save slots */
	lwz	r14, 160(SP)
	lwz	r15, 164(SP)
	lwz	r16, 168(SP)
	lwz	r17, 172(SP)
	lwz	r18, 176(SP)
	lwz	r19, 180(SP)
	lwz	r20, 184(SP)
	lwz	r21, 188(SP)
	lwz	r22, 192(SP)
	lwz	r23, 196(SP)
	lwz	r24, 200(SP)
	lwz	r25, 204(SP)
	lwz	r26, 208(SP)
	lwz	r27, 212(SP)
	lwz	r28, 216(SP)
	lwz	r29, 220(SP)
#endif

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -