atl_dmm4x4x16_hppa.c
来自「基于 BLAS、CLAPACK 的代码,用过的人知道是干啥的」· C语言 代码 · 共 1,251 行 · 第 1/3 页
C
1,251 行
fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 32(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 40(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 40(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 40(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 40(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 40(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 40(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 40(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 40(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 48(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 48(pB0), rB0 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 fldd 48(pA1), rA1 fmpyfadd,dbl ra2, rB1, rC21, rC21 fmpyfadd,dbl ra3, rB1, rC31, rC31 fldd 48(pB1), rB1 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 fldd 48(pA2), rA2 fmpyfadd,dbl ra2, rB2, rC22, rC22 fmpyfadd,dbl ra3, rB2, rC32, rC32 fldd 48(pB2), rB2 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 fldd 48(pA3), rA3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 48(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 56(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 56(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 56(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 56(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 56(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 56(pB2), rB2 fmpyfadd,dbl rA0, 
rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 56(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 56(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 64(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 64(pB0), rB0 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 fldd 64(pA1), rA1 fmpyfadd,dbl ra2, rB1, rC21, rC21 fmpyfadd,dbl ra3, rB1, rC31, rC31 fldd 64(pB1), rB1 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 fldd 64(pA2), rA2 fmpyfadd,dbl ra2, rB2, rC22, rC22 fmpyfadd,dbl ra3, rB2, rC32, rC32 fldd 64(pB2), rB2 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 fldd 64(pA3), rA3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 64(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 72(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 72(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 72(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 72(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 72(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 72(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 72(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 72(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 80(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 80(pB0), rB0 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 fldd 80(pA1), rA1 fmpyfadd,dbl ra2, rB1, rC21, rC21 fmpyfadd,dbl ra3, rB1, rC31, rC31 fldd 80(pB1), rB1 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 fldd 80(pA2), rA2 fmpyfadd,dbl ra2, rB2, rC22, rC22 
fmpyfadd,dbl ra3, rB2, rC32, rC32 fldd 80(pB2), rB2 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 fldd 80(pA3), rA3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 80(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 88(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 88(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 88(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 88(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 88(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 88(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 88(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 88(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 96(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 96(pB0), rB0 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 fldd 96(pA1), rA1 fmpyfadd,dbl ra2, rB1, rC21, rC21 fmpyfadd,dbl ra3, rB1, rC31, rC31 fldd 96(pB1), rB1 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 fldd 96(pA2), rA2 fmpyfadd,dbl ra2, rB2, rC22, rC22 fmpyfadd,dbl ra3, rB2, rC32, rC32 fldd 96(pB2), rB2 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 fldd 96(pA3), rA3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 96(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 104(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 104(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 104(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 104(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, 
rB2, rC12, rC12 fldd 104(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 104(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 104(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 104(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 fldd 112(pA0), rA0 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 fldd 112(pB0), rB0 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 fldd 112(pA1), rA1 fmpyfadd,dbl ra2, rB1, rC21, rC21 fmpyfadd,dbl ra3, rB1, rC31, rC31 fldd 112(pB1), rB1 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 fldd 112(pA2), rA2 fmpyfadd,dbl ra2, rB2, rC22, rC22 fmpyfadd,dbl ra3, rB2, rC32, rC32 fldd 112(pB2), rB2 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 fldd 112(pA3), rA3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33 fldd 112(pB3), rB3 fmpyfadd,dbl rA0, rB0, rC00, rC00 fmpyfadd,dbl rA1, rB0, rC10, rC10 fldd 120(pA0), ra0 fmpyfadd,dbl rA2, rB0, rC20, rC20 fmpyfadd,dbl rA3, rB0, rC30, rC30 fldd 120(pB0), rB0 fmpyfadd,dbl rA0, rB1, rC01, rC01 fmpyfadd,dbl rA1, rB1, rC11, rC11 fldd 120(pA1), ra1 fmpyfadd,dbl rA2, rB1, rC21, rC21 fmpyfadd,dbl rA3, rB1, rC31, rC31 fldd 120(pB1), rB1 fmpyfadd,dbl rA0, rB2, rC02, rC02 fmpyfadd,dbl rA1, rB2, rC12, rC12 fldd 120(pA2), ra2 fmpyfadd,dbl rA2, rB2, rC22, rC22 fmpyfadd,dbl rA3, rB2, rC32, rC32 fldd 120(pB2), rB2 fmpyfadd,dbl rA0, rB3, rC03, rC03 fmpyfadd,dbl rA1, rB3, rC13, rC13 fldd 120(pA3), ra3 fmpyfadd,dbl rA2, rB3, rC23, rC23 fmpyfadd,dbl rA3, rB3, rC33, rC33 fldd 120(pB3), rB3 fmpyfadd,dbl ra0, rB0, rC00, rC00 fmpyfadd,dbl ra1, rB0, rC10, rC10 ldo 8*(3*KB+16)(pA0), pA0 ldo 8*(3*KB+16)(pA1), pA1 fmpyfadd,dbl ra2, rB0, rC20, rC20 fmpyfadd,dbl ra3, rB0, rC30, rC30 ldo 8*(3*KB+16)(pA2), pA2 ldo 8*(3*KB+16)(pA3), pA3 fmpyfadd,dbl ra0, rB1, rC01, rC01 fmpyfadd,dbl ra1, rB1, rC11, rC11 ldo 
-8*(KB-16)(pB0), pB0 ldo -8*(KB-16)(pB1), pB1 fmpyfadd,dbl ra2, rB1, rC21, rC21 fmpyfadd,dbl ra3, rB1, rC31, rC31 fmpyfadd,dbl ra0, rB2, rC02, rC02 fmpyfadd,dbl ra1, rB2, rC12, rC12 ldo -8*(KB-16)(pB2), pB2 fmpyfadd,dbl ra2, rB2, rC22, rC22 fmpyfadd,dbl ra3, rB2, rC32, rC32 fmpyfadd,dbl ra0, rB3, rC03, rC03 fmpyfadd,dbl ra1, rB3, rC13, rC13 ldo -8*(KB-16)(pB3), pB3 fmpyfadd,dbl ra2, rB3, rC23, rC23 fmpyfadd,dbl ra3, rB3, rC33, rC33;; end drain KLOOP;;; Write to C; fstd rC00, 0(pC0) fstd rC10, CMUL(8)(pC0) fstd rC20, CMUL(16)(pC0) fstd rC30, CMUL(24)(pC0) ldo CMUL(32)(pC0), pC0 fstd rC01, 0(pC1) fstd rC11, CMUL(8)(pC1) fstd rC21, CMUL(16)(pC1) fstd rC31, CMUL(24)(pC1) ldo CMUL(32)(pC1), pC1 fstd rC02, 0(pC2) fstd rC12, CMUL(8)(pC2) fstd rC22, CMUL(16)(pC2) fstd rC32, CMUL(24)(pC2) ldo CMUL(32)(pC2), pC2 fstd rC03, 0(pC3) fstd rC13, CMUL(8)(pC3) fstd rC23, CMUL(16)(pC3) fstd rC33, CMUL(24)(pC3);; while (M);;; ldo -6(rM), rM; cmpib,<> 0, rM, MLOOP addib,<> -4, rM, MLOOP ldo CMUL(32)(pC3), pC3 ldo 8*4*KB(pB0), pB0 ldo 8*4*KB(pB1), pB1 ldo 8*4*KB(pB2), pB2 ldo 8*4*KB(pB3), pB3 sub pA0, incAn, pA0 sub pA1, incAn, pA1 sub pA2, incAn, pA2 sub pA3, incAn, pA3 add pC0, incCn, pC0 add pC1, incCn, pC1 add pC2, incCn, pC2;; while (N);; addib,<> -4, rN, NLOOP add pC3, incCn, pC3/* * Restore regs and return */ fldd -8(%r30), %fr12 fldd -16(%r30), %fr13 fldd -24(%r30), %fr14 fldd -32(%r30), %fr15 fldd -40(%r30), %fr16 fldd -48(%r30), %fr17 fldd -56(%r30), %fr18 fldd -64(%r30), %fr19 fldd -72(%r30), %fr20 fldd -80(%r30), %fr21 ldw -84(%r30), %r3 ldw -88(%r30), %r4 ldw -92(%r30), %r5; ldw -96(%r30), %r6; ldw -100(%r30), %r7; ldw -104(%r30), %r8; ldw -108(%r30), %r9 ldw -112(%r30), %r10 ldw -116(%r30), %r11 ldw -120(%r30), %r12 ldw -124(%r30), %r13 bve (%r2) ldo -FSIZE(%r30), %r30 .EXIT .PROCEND
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?