atl_dmm4x4x32_ppc.c

来自「基于 BLAS、CLAPACK 的代码，用过的人知道是干啥的」· C语言 代码 · 共 2,609 行 · 第 1/5 页

C
2,609
字号
	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 504+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 504+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 504+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 504+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 504+KB3*8(pA0)#endif#if KB > 64	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 504+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 512+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 512+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 512+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 512+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 512+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 512+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 512+KB3*8(pA0)#endif#if KB > 65	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 512+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 520+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 520+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 520+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 520+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 520+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 520+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 520+KB3*8(pA0)#endif#if KB > 66	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 520+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 528+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 528+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, 
rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 528+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 528+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 528+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 528+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 528+KB3*8(pA0)#endif#if KB > 67	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 528+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 536+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 536+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 536+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 536+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 536+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 536+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 536+KB3*8(pA0)#endif#if KB > 68	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 536+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 544+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 544+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 544+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 544+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 544+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 544+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 544+KB3*8(pA0)#endif#if KB > 69	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 544+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 552+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 552+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 552+KB2*8(pB0)	fmadd	rC03, ra0, 
rb3, rC03	lfd	ra0, 552+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 552+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 552+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 552+KB3*8(pA0)#endif#if KB > 70	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 552+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 560+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 560+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 560+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 560+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 560+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 560+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 560+KB3*8(pA0)#endif#if KB > 71	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 560+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 568+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 568+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 568+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 568+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 568+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 568+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 568+KB3*8(pA0)#endif#if KB > 72	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 568+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 576+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 576+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 576+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 576+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 
576+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 576+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 576+KB3*8(pA0)#endif#if KB > 73	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 576+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 584+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 584+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 584+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 584+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 584+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 584+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 584+KB3*8(pA0)#endif#if KB > 74	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 584+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 592+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 592+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 592+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 592+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 592+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 592+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 592+KB3*8(pA0)#endif#if KB > 75	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 592+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 600+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 600+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 600+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 600+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 600+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 600+KB2*8(pA0)	fmadd	
rC33, ra3, rb3, rC33	lfd	ra3, 600+KB3*8(pA0)#endif#if KB > 76	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 600+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 608+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 608+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 608+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 608+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 608+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 608+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 608+KB3*8(pA0)#endif#if KB > 77	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 608+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 616+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 616+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 616+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 616+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 616+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 616+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 616+KB3*8(pA0)#endif#if KB > 78	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 616+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 624+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 624+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 624+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 624+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 624+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 624+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 624+KB3*8(pA0)#endif#if KB > 79	fmadd	rC00, 
ra0, rb0, rC00	lfd	rB3, 624+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 632+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 632+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 632+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 632+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 632+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 632+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 632+KB3*8(pA0)#endif	fmadd	rC00, rA0, rB0, rC00#if (KB/2)*2 == KB	lfd	rb3, (KB-1)*8+KB3*8(pB0)#else	lfd	rB3, (KB-1)*8+KB3*8(pB0)#endif	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	fmadd	rC03, rA0, rB3, rC03	fmadd	rC13, rA1, rB3, rC13	fmadd	rC23, rA2, rB3, rC23	fmadd	rC33, rA3, rB3, rC33#if KB > 1	fmadd	rC00, ra0, rb0, rC00	fmadd	rC10, ra1, rb0, rC10               dcbt    0, pfA, 0               addi    pfA, pfA, 128	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	fmadd	rC03, ra0, rb3, rC03	fmadd	rC13, ra1, rb3, rC13	fmadd	rC23, ra2, rb3, rC23	fmadd	rC33, ra3, rb3, rC33#endif/* *      Store to C, iterate loop */        stfd    rC00, 0(pC0)        stfd    rC10, CMUL(8)(pC0)        stfd    rC20, CMUL(16)(pC0)        stfd    rC30, CMUL(24)(pC0)        stfd    rC01, 0(pC1)        stfd    rC11, CMUL(8)(pC1)        stfd    rC21, CMUL(16)(pC1)        stfd    rC31, CMUL(24)(pC1)        stfd    rC02, 0(pC2)        stfd    
rC12, CMUL(8)(pC2)        stfd    rC22, CMUL(16)(pC2)        stfd    rC32, CMUL(24)(pC2)        stfd    rC03, 0(pC3)        stfd    rC13, CMUL(8)(pC3)        stfd    rC23, CMUL(16)(pC3)        stfd    rC33, CMUL(24)(pC3)/* *      Mov ptrs, while(M) */        addi    pA0, pA0, KB4*8         /* pA0 += 4*lda */        addi    pC0, pC0, CMUL(4)*8     /* pC0 += 4 */        addi    pC1, pC1, CMUL(4)*8        addi    pC2, pC2, CMUL(4)*8        addi  

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?