atl_dmm4x4x32_ppc.c
来自「基于 BLAS、CLAPACK 的，用过的人知道是干啥的」· C语言 代码 · 共 2,609 行 · 第 1/5 页
C
2,609 行
fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 504+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 504+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 504+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 504+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 504+KB3*8(pA0)#endif#if KB > 64 fmadd rC00, rA0, rB0, rC00 lfd rb3, 504+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 512+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 512+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 512+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 512+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 512+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 512+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 512+KB3*8(pA0)#endif#if KB > 65 fmadd rC00, ra0, rb0, rC00 lfd rB3, 512+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 520+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 520+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 520+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 520+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 520+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 520+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 520+KB3*8(pA0)#endif#if KB > 66 fmadd rC00, rA0, rB0, rC00 lfd rb3, 520+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 528+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 528+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, 
rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 528+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 528+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 528+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 528+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 528+KB3*8(pA0)#endif#if KB > 67 fmadd rC00, ra0, rb0, rC00 lfd rB3, 528+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 536+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 536+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 536+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 536+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 536+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 536+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 536+KB3*8(pA0)#endif#if KB > 68 fmadd rC00, rA0, rB0, rC00 lfd rb3, 536+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 544+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 544+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 544+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 544+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 544+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 544+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 544+KB3*8(pA0)#endif#if KB > 69 fmadd rC00, ra0, rb0, rC00 lfd rB3, 544+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 552+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 552+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 552+KB2*8(pB0) fmadd rC03, ra0, 
rb3, rC03 lfd ra0, 552+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 552+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 552+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 552+KB3*8(pA0)#endif#if KB > 70 fmadd rC00, rA0, rB0, rC00 lfd rb3, 552+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 560+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 560+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 560+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 560+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 560+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 560+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 560+KB3*8(pA0)#endif#if KB > 71 fmadd rC00, ra0, rb0, rC00 lfd rB3, 560+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 568+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 568+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 568+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 568+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 568+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 568+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 568+KB3*8(pA0)#endif#if KB > 72 fmadd rC00, rA0, rB0, rC00 lfd rb3, 568+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 576+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 576+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 576+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 576+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 
576+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 576+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 576+KB3*8(pA0)#endif#if KB > 73 fmadd rC00, ra0, rb0, rC00 lfd rB3, 576+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 584+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 584+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 584+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 584+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 584+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 584+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 584+KB3*8(pA0)#endif#if KB > 74 fmadd rC00, rA0, rB0, rC00 lfd rb3, 584+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 592+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 592+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 592+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 592+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 592+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 592+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 592+KB3*8(pA0)#endif#if KB > 75 fmadd rC00, ra0, rb0, rC00 lfd rB3, 592+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 600+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 600+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 600+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 600+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 600+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 600+KB2*8(pA0) fmadd 
rC33, ra3, rb3, rC33 lfd ra3, 600+KB3*8(pA0)#endif#if KB > 76 fmadd rC00, rA0, rB0, rC00 lfd rb3, 600+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 608+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 608+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 608+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 608+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 608+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 608+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 608+KB3*8(pA0)#endif#if KB > 77 fmadd rC00, ra0, rb0, rC00 lfd rB3, 608+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 616+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 616+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 616+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 616+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 616+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 616+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 616+KB3*8(pA0)#endif#if KB > 78 fmadd rC00, rA0, rB0, rC00 lfd rb3, 616+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 624+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 624+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 624+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 624+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 624+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 624+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 624+KB3*8(pA0)#endif#if KB > 79 fmadd rC00, 
ra0, rb0, rC00 lfd rB3, 624+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 632+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 632+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 632+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 632+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 632+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 632+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 632+KB3*8(pA0)#endif fmadd rC00, rA0, rB0, rC00#if (KB/2)*2 == KB lfd rb3, (KB-1)*8+KB3*8(pB0)#else lfd rB3, (KB-1)*8+KB3*8(pB0)#endif fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 fmadd rC03, rA0, rB3, rC03 fmadd rC13, rA1, rB3, rC13 fmadd rC23, rA2, rB3, rC23 fmadd rC33, rA3, rB3, rC33#if KB > 1 fmadd rC00, ra0, rb0, rC00 fmadd rC10, ra1, rb0, rC10 dcbt 0, pfA, 0 addi pfA, pfA, 128 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 fmadd rC03, ra0, rb3, rC03 fmadd rC13, ra1, rb3, rC13 fmadd rC23, ra2, rb3, rC23 fmadd rC33, ra3, rb3, rC33#endif/* * Store to C, iterate loop */ stfd rC00, 0(pC0) stfd rC10, CMUL(8)(pC0) stfd rC20, CMUL(16)(pC0) stfd rC30, CMUL(24)(pC0) stfd rC01, 0(pC1) stfd rC11, CMUL(8)(pC1) stfd rC21, CMUL(16)(pC1) stfd rC31, CMUL(24)(pC1) stfd rC02, 0(pC2) stfd rC12, CMUL(8)(pC2) stfd rC22, CMUL(16)(pC2) stfd rC32, CMUL(24)(pC2) stfd rC03, 0(pC3) stfd rC13, CMUL(8)(pC3) stfd rC23, CMUL(16)(pC3) stfd 
rC33, CMUL(24)(pC3)/* * Mov ptrs, while(M) */ addi pA0, pA0, KB4*8 /* pA0 += 4*lda */ addi pC0, pC0, CMUL(4)*8 /* pC0 += 4 */ addi pC1, pC1, CMUL(4)*8 addi pC2, pC2, CMUL(4)*8 addi
⌨️ 快捷键说明
复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?