📄 gemv_n.s
字号:
LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1, a1, y01 LFD a1, 0 * SIZE(AO2) FMADD y02, alpha1, a2, y02 LFD a2, 1 * SIZE(AO2) FMADD y03, alpha1, a3, y03 LFD a3, 2 * SIZE(AO2) FMADD y04, alpha1, a4, y04 LFD a4, 3 * SIZE(AO2) FMADD y05, alpha1, a5, y05 LFD a5, 4 * SIZE(AO2) FMADD y06, alpha1, a6, y06 LFD a6, 5 * SIZE(AO2) FMADD y07, alpha1, a7, y07 LFD a7, 6 * SIZE(AO2) FMADD y08, alpha1, a8, y08 LFD a8, 7 * SIZE(AO2) FMADD y01, alpha2, a1, y01 FMADD y02, alpha2, a2, y02 FMADD y03, alpha2, a3, y03 FMADD y04, alpha2, a4, y04 FMADD y05, alpha2, a5, y05 FMADD y06, alpha2, a6, y06 FMADD y07, alpha2, a7, y07 FMADD y08, alpha2, a8, y08 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi AO1, AO1, 8 * SIZE addi AO2, AO2, 8 * SIZE addi Y1, Y1, 8 * SIZE .align 4LL(36): andi. r0, M, 4 ble LL(37) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 0 * SIZE(AO2) LFD a6, 1 * SIZE(AO2) LFD a7, 2 * SIZE(AO2) LFD a8, 3 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 FMADD y01, alpha2, a5, y01 FMADD y02, alpha2, a6, y02 FMADD y03, alpha2, a7, y03 FMADD y04, alpha2, a8, y04 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi AO1, AO1, 4 * SIZE addi AO2, AO2, 4 * SIZE addi Y1, Y1, 4 * SIZE .align 4LL(37): andi. r0, M, 2 ble LL(38) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 0 * SIZE(AO2) LFD a4, 1 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y01, alpha2, a3, y01 FMADD y02, alpha2, a4, y02 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi AO1, AO1, 2 * SIZE addi AO2, AO2, 2 * SIZE addi Y1, Y1, 2 * SIZE .align 4LL(38): andi. r0, M, 1 ble LL(40) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 0 * SIZE(AO2) FMADD y01, alpha1, a1, y01 FMADD y01, alpha2, a2, y01 STFD y01, 0 * SIZE(Y1) .align 4LL(40): andi. J, N, 1 lfd alpha, ALPHA ble LL(990) .align 4 LFD alpha1, 0 * SIZE(X) FMUL alpha1, alpha, alpha1 mr AO1, A mr Y1, YY srawi. r0, M, 4 mtspr CTR, r0 ble LL(45) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) LFD y09, 8 * SIZE(Y1) LFD y10, 9 * SIZE(Y1) LFD y11, 10 * SIZE(Y1) LFD y12, 11 * SIZE(Y1) LFD y13, 12 * SIZE(Y1) LFD y14, 13 * SIZE(Y1) LFD y15, 14 * SIZE(Y1) LFD y16, 15 * SIZE(Y1) bdz LL(43) .align 4LL(42): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 LFD a1, 16 * SIZE(AO1) FMADD y10, alpha1, a2, y10 LFD a2, 17 * SIZE(AO1) FMADD y11, alpha1, a3, y11 LFD a3, 18 * SIZE(AO1) FMADD y12, alpha1, a4, y12 LFD a4, 19 * SIZE(AO1) FMADD y13, alpha1, a5, y13 LFD a5, 20 * SIZE(AO1) FMADD y14, alpha1, a6, y14 LFD a6, 21 * SIZE(AO1) FMADD y15, alpha1, a7, y15 LFD a7, 22 * SIZE(AO1) FMADD y16, alpha1, a8, y16 LFD a8, 23 * SIZE(AO1) STFD y01, 0 * SIZE(Y1) LFD y01, 16 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) LFD y02, 17 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) LFD y03, 18 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) LFD y04, 19 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) LFD y05, 20 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) LFD y06, 21 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) LFD y07, 22 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) LFD y08, 23 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) LFD y09, 24 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) LFD y10, 25 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) LFD y11, 26 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) LFD y12, 27 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) LFD y13, 28 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) LFD y14, 29 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) LFD y15, 30 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) LFD y16, 31 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi Y1, Y1, 16 * SIZE PREFETCH_A1 PREFETCH_Y bdnz LL(42) .align 4LL(43): FMADD y01, alpha1, a1, y01 LFD a1, 8 * SIZE(AO1) FMADD y02, alpha1, a2, y02 LFD a2, 9 * SIZE(AO1) FMADD y03, alpha1, a3, y03 LFD a3, 10 * SIZE(AO1) FMADD y04, alpha1, a4, y04 LFD a4, 11 * SIZE(AO1) FMADD y05, alpha1, a5, y05 LFD a5, 12 * SIZE(AO1) FMADD y06, alpha1, a6, y06 LFD a6, 13 * SIZE(AO1) FMADD y07, alpha1, a7, y07 LFD a7, 14 * SIZE(AO1) FMADD y08, alpha1, a8, y08 LFD a8, 15 * SIZE(AO1) FMADD y09, alpha1, a1, y09 FMADD y10, alpha1, a2, y10 FMADD y11, alpha1, a3, y11 FMADD y12, alpha1, a4, y12 FMADD y13, alpha1, a5, y13 FMADD y14, alpha1, a6, y14 FMADD y15, alpha1, a7, y15 FMADD y16, alpha1, a8, y16 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) STFD y09, 8 * SIZE(Y1) STFD y10, 9 * SIZE(Y1) STFD y11, 10 * SIZE(Y1) STFD y12, 11 * SIZE(Y1) STFD y13, 12 * SIZE(Y1) STFD y14, 13 * SIZE(Y1) STFD y15, 14 * SIZE(Y1) STFD y16, 15 * SIZE(Y1) addi AO1, AO1, 16 * SIZE addi Y1, Y1, 16 * SIZE .align 4LL(45): andi. r0, M, 15 ble LL(990) andi. r0, M, 8 ble LL(46) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) LFD y05, 4 * SIZE(Y1) LFD y06, 5 * SIZE(Y1) LFD y07, 6 * SIZE(Y1) LFD y08, 7 * SIZE(Y1) LFD a5, 4 * SIZE(AO1) LFD a6, 5 * SIZE(AO1) LFD a7, 6 * SIZE(AO1) LFD a8, 7 * SIZE(AO1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 FMADD y05, alpha1, a5, y05 FMADD y06, alpha1, a6, y06 FMADD y07, alpha1, a7, y07 FMADD y08, alpha1, a8, y08 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) STFD y05, 4 * SIZE(Y1) STFD y06, 5 * SIZE(Y1) STFD y07, 6 * SIZE(Y1) STFD y08, 7 * SIZE(Y1) addi AO1, AO1, 8 * SIZE addi Y1, Y1, 8 * SIZE .align 4LL(46): andi. r0, M, 4 ble LL(47) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD y03, 2 * SIZE(Y1) LFD y04, 3 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) LFD a3, 2 * SIZE(AO1) LFD a4, 3 * SIZE(AO1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 FMADD y03, alpha1, a3, y03 FMADD y04, alpha1, a4, y04 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) STFD y03, 2 * SIZE(Y1) STFD y04, 3 * SIZE(Y1) addi AO1, AO1, 4 * SIZE addi Y1, Y1, 4 * SIZE .align 4LL(47): andi. r0, M, 2 ble LL(48) LFD y01, 0 * SIZE(Y1) LFD y02, 1 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) LFD a2, 1 * SIZE(AO1) FMADD y01, alpha1, a1, y01 FMADD y02, alpha1, a2, y02 STFD y01, 0 * SIZE(Y1) STFD y02, 1 * SIZE(Y1) addi AO1, AO1, 2 * SIZE addi Y1, Y1, 2 * SIZE .align 4LL(48): andi. r0, M, 1 ble LL(990) LFD y01, 0 * SIZE(Y1) LFD a1, 0 * SIZE(AO1) FMADD y01, alpha1, a1, y01 STFD y01, 0 * SIZE(Y1) .align 4LL(990): cmpi cr0, 0, INCY, SIZE beq LL(999) mr YY, BUFFER mr Y1, Y srawi. r0, M, 3 mtspr CTR, r0 ble LL(995) .align 4LL(991): LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f4, 0 * SIZE(Y) add Y, Y, INCY LFD f5, 0 * SIZE(Y) add Y, Y, INCY LFD f6, 0 * SIZE(Y) add Y, Y, INCY LFD f7, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(YY) LFD f9, 1 * SIZE(YY) LFD f10, 2 * SIZE(YY) LFD f11, 3 * SIZE(YY) LFD f12, 4 * SIZE(YY) LFD f13, 5 * SIZE(YY) LFD f14, 6 * SIZE(YY) LFD f15, 7 * SIZE(YY) addi YY, YY, 8 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 FADD f12, f12, f4 FADD f13, f13, f5 FADD f14, f14, f6 FADD f15, f15, f7 STFD f8, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f9, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f10, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f11, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f12, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f13, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f14, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f15, 0 * SIZE(Y1) add Y1, Y1, INCY bdnz LL(991) .align 4LL(995): andi. J, M, 4 ble LL(996) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f2, 0 * SIZE(Y) add Y, Y, INCY LFD f3, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(YY) LFD f9, 1 * SIZE(YY) LFD f10, 2 * SIZE(YY) LFD f11, 3 * SIZE(YY) addi YY, YY, 4 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 FADD f10, f10, f2 FADD f11, f11, f3 STFD f8, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f9, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f10, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f11, 0 * SIZE(Y1) add Y1, Y1, INCY .align 4LL(996): andi. J, M, 2 ble LL(997) LFD f0, 0 * SIZE(Y) add Y, Y, INCY LFD f1, 0 * SIZE(Y) add Y, Y, INCY LFD f8, 0 * SIZE(YY) LFD f9, 1 * SIZE(YY) addi YY, YY, 2 * SIZE FADD f8, f8, f0 FADD f9, f9, f1 STFD f8, 0 * SIZE(Y1) add Y1, Y1, INCY STFD f9, 0 * SIZE(Y1) add Y1, Y1, INCY .align 4LL(997): andi. J, M, 1 ble LL(999) LFD f0, 0 * SIZE(Y) LFD f8, 0 * SIZE(YY) FADD f8, f8, f0 STFD f8, 0 * SIZE(Y1) .align 4LL(999): li r3, 0 lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) lfd f17, 24(SP) lfd f18, 32(SP) lfd f19, 40(SP) lfd f20, 48(SP) lfd f21, 56(SP) lfd f22, 64(SP) lfd f23, 72(SP) lfd f24, 80(SP) lfd f25, 88(SP) lfd f26, 96(SP) lfd f27, 104(SP) lfd f28, 112(SP) lfd f29, 120(SP) lfd f30, 128(SP) lfd f31, 136(SP)#ifdef __64BIT__ ld r14, 144(SP) ld r15, 152(SP) ld r16, 160(SP) ld r17, 168(SP) ld r18, 176(SP) ld r19, 184(SP) ld r20, 192(SP) ld r21, 200(SP) ld r22, 208(SP) ld r23, 216(SP) ld r24, 224(SP) ld r25, 232(SP) ld r26, 240(SP) ld r27, 248(SP)#else lwz r14, 144(SP) lwz r15, 148(SP) lwz r16, 152(SP) lwz r17, 156(SP) lwz r18, 160(SP) lwz r19, 164(SP) lwz r20, 168(SP) lwz r21, 172(SP) lwz r22, 176(SP) lwz r23, 180(SP) lwz r24, 184(SP) lwz r25, 188(SP) lwz r26, 192(SP) lwz r27, 196(SP)#endif addi SP, SP, STACKSIZE blr EPILOGUE#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -