atl_dmm4x4x16_hppa.c

来自「基于BLAS CLAPACK的.用过的人知道是干啥的」· C语言 代码 · 共 1,251 行 · 第 1/3 页

C
1,251
字号
	fmpyfadd,dbl	ra2, rB3, rC23, rC23	fmpyfadd,dbl	ra3, rB3, rC33, rC33						fldd   	32(pB3), rB3	fmpyfadd,dbl	rA0, rB0, rC00, rC00	fmpyfadd,dbl	rA1, rB0, rC10, rC10						fldd   	40(pA0), ra0	fmpyfadd,dbl	rA2, rB0, rC20, rC20	fmpyfadd,dbl	rA3, rB0, rC30, rC30						fldd   	40(pB0), rB0	fmpyfadd,dbl	rA0, rB1, rC01, rC01	fmpyfadd,dbl	rA1, rB1, rC11, rC11						fldd   	40(pA1), ra1	fmpyfadd,dbl	rA2, rB1, rC21, rC21	fmpyfadd,dbl	rA3, rB1, rC31, rC31						fldd   	40(pB1), rB1	fmpyfadd,dbl	rA0, rB2, rC02, rC02	fmpyfadd,dbl	rA1, rB2, rC12, rC12						fldd   	40(pA2), ra2	fmpyfadd,dbl	rA2, rB2, rC22, rC22	fmpyfadd,dbl	rA3, rB2, rC32, rC32						fldd   	40(pB2), rB2	fmpyfadd,dbl	rA0, rB3, rC03, rC03	fmpyfadd,dbl	rA1, rB3, rC13, rC13						fldd   	40(pA3), ra3	fmpyfadd,dbl	rA2, rB3, rC23, rC23	fmpyfadd,dbl	rA3, rB3, rC33, rC33						fldd   	40(pB3), rB3	fmpyfadd,dbl	ra0, rB0, rC00, rC00	fmpyfadd,dbl	ra1, rB0, rC10, rC10						fldd   	48(pA0), rA0	fmpyfadd,dbl	ra2, rB0, rC20, rC20	fmpyfadd,dbl	ra3, rB0, rC30, rC30						fldd   	48(pB0), rB0	fmpyfadd,dbl	ra0, rB1, rC01, rC01	fmpyfadd,dbl	ra1, rB1, rC11, rC11						fldd   	48(pA1), rA1	fmpyfadd,dbl	ra2, rB1, rC21, rC21	fmpyfadd,dbl	ra3, rB1, rC31, rC31						fldd   	48(pB1), rB1	fmpyfadd,dbl	ra0, rB2, rC02, rC02	fmpyfadd,dbl	ra1, rB2, rC12, rC12						fldd   	48(pA2), rA2	fmpyfadd,dbl	ra2, rB2, rC22, rC22	fmpyfadd,dbl	ra3, rB2, rC32, rC32						fldd   	48(pB2), rB2	fmpyfadd,dbl	ra0, rB3, rC03, rC03	fmpyfadd,dbl	ra1, rB3, rC13, rC13						fldd   	48(pA3), rA3	fmpyfadd,dbl	ra2, rB3, rC23, rC23	fmpyfadd,dbl	ra3, rB3, rC33, rC33						fldd   	48(pB3), rB3	fmpyfadd,dbl	rA0, rB0, rC00, rC00	fmpyfadd,dbl	rA1, rB0, rC10, rC10						fldd   	56(pA0), ra0	fmpyfadd,dbl	rA2, rB0, rC20, rC20	fmpyfadd,dbl	rA3, rB0, rC30, rC30						fldd   	56(pB0), rB0	fmpyfadd,dbl	rA0, rB1, rC01, rC01	fmpyfadd,dbl	rA1, rB1, rC11, rC11						fldd   	56(pA1), ra1	fmpyfadd,dbl	rA2, rB1, rC21, rC21	fmpyfadd,dbl	rA3, rB1, rC31, rC31						fldd   	56(pB1), rB1	fmpyfadd,dbl	rA0, rB2, 
rC02, rC02	fmpyfadd,dbl	rA1, rB2, rC12, rC12						fldd   	56(pA2), ra2	fmpyfadd,dbl	rA2, rB2, rC22, rC22	fmpyfadd,dbl	rA3, rB2, rC32, rC32						fldd   	56(pB2), rB2	fmpyfadd,dbl	rA0, rB3, rC03, rC03	fmpyfadd,dbl	rA1, rB3, rC13, rC13						fldd   	56(pA3), ra3	fmpyfadd,dbl	rA2, rB3, rC23, rC23	fmpyfadd,dbl	rA3, rB3, rC33, rC33						fldd   	56(pB3), rB3	fmpyfadd,dbl	ra0, rB0, rC00, rC00	fmpyfadd,dbl	ra1, rB0, rC10, rC10						fldd   	64(pA0), rA0	fmpyfadd,dbl	ra2, rB0, rC20, rC20	fmpyfadd,dbl	ra3, rB0, rC30, rC30						fldd   	64(pB0), rB0	fmpyfadd,dbl	ra0, rB1, rC01, rC01	fmpyfadd,dbl	ra1, rB1, rC11, rC11						fldd   	64(pA1), rA1	fmpyfadd,dbl	ra2, rB1, rC21, rC21	fmpyfadd,dbl	ra3, rB1, rC31, rC31						fldd   	64(pB1), rB1	fmpyfadd,dbl	ra0, rB2, rC02, rC02	fmpyfadd,dbl	ra1, rB2, rC12, rC12						fldd   	64(pA2), rA2	fmpyfadd,dbl	ra2, rB2, rC22, rC22	fmpyfadd,dbl	ra3, rB2, rC32, rC32						fldd   	64(pB2), rB2	fmpyfadd,dbl	ra0, rB3, rC03, rC03	fmpyfadd,dbl	ra1, rB3, rC13, rC13						fldd   	64(pA3), rA3	fmpyfadd,dbl	ra2, rB3, rC23, rC23	fmpyfadd,dbl	ra3, rB3, rC33, rC33						fldd   	64(pB3), rB3	fmpyfadd,dbl	rA0, rB0, rC00, rC00	fmpyfadd,dbl	rA1, rB0, rC10, rC10						fldd   	72(pA0), ra0	fmpyfadd,dbl	rA2, rB0, rC20, rC20	fmpyfadd,dbl	rA3, rB0, rC30, rC30						fldd   	72(pB0), rB0	fmpyfadd,dbl	rA0, rB1, rC01, rC01	fmpyfadd,dbl	rA1, rB1, rC11, rC11						fldd   	72(pA1), ra1	fmpyfadd,dbl	rA2, rB1, rC21, rC21	fmpyfadd,dbl	rA3, rB1, rC31, rC31						fldd   	72(pB1), rB1	fmpyfadd,dbl	rA0, rB2, rC02, rC02	fmpyfadd,dbl	rA1, rB2, rC12, rC12						fldd   	72(pA2), ra2	fmpyfadd,dbl	rA2, rB2, rC22, rC22	fmpyfadd,dbl	rA3, rB2, rC32, rC32						fldd   	72(pB2), rB2	fmpyfadd,dbl	rA0, rB3, rC03, rC03	fmpyfadd,dbl	rA1, rB3, rC13, rC13						fldd   	72(pA3), ra3	fmpyfadd,dbl	rA2, rB3, rC23, rC23	fmpyfadd,dbl	rA3, rB3, rC33, rC33						fldd   	72(pB3), rB3	fmpyfadd,dbl	ra0, rB0, rC00, rC00	fmpyfadd,dbl	ra1, rB0, rC10, rC10						fldd   	80(pA0), rA0	fmpyfadd,dbl	ra2, rB0, rC20, rC20	fmpyfadd,dbl	
ra3, rB0, rC30, rC30						fldd   	80(pB0), rB0	fmpyfadd,dbl	ra0, rB1, rC01, rC01	fmpyfadd,dbl	ra1, rB1, rC11, rC11						fldd   	80(pA1), rA1	fmpyfadd,dbl	ra2, rB1, rC21, rC21	fmpyfadd,dbl	ra3, rB1, rC31, rC31						fldd   	80(pB1), rB1	fmpyfadd,dbl	ra0, rB2, rC02, rC02	fmpyfadd,dbl	ra1, rB2, rC12, rC12						fldd   	80(pA2), rA2	fmpyfadd,dbl	ra2, rB2, rC22, rC22	fmpyfadd,dbl	ra3, rB2, rC32, rC32						fldd   	80(pB2), rB2	fmpyfadd,dbl	ra0, rB3, rC03, rC03	fmpyfadd,dbl	ra1, rB3, rC13, rC13						fldd   	80(pA3), rA3	fmpyfadd,dbl	ra2, rB3, rC23, rC23	fmpyfadd,dbl	ra3, rB3, rC33, rC33						fldd   	80(pB3), rB3	fmpyfadd,dbl	rA0, rB0, rC00, rC00	fmpyfadd,dbl	rA1, rB0, rC10, rC10						fldd   	88(pA0), ra0	fmpyfadd,dbl	rA2, rB0, rC20, rC20	fmpyfadd,dbl	rA3, rB0, rC30, rC30						fldd   	88(pB0), rB0	fmpyfadd,dbl	rA0, rB1, rC01, rC01	fmpyfadd,dbl	rA1, rB1, rC11, rC11						fldd   	88(pA1), ra1	fmpyfadd,dbl	rA2, rB1, rC21, rC21	fmpyfadd,dbl	rA3, rB1, rC31, rC31						fldd   	88(pB1), rB1	fmpyfadd,dbl	rA0, rB2, rC02, rC02	fmpyfadd,dbl	rA1, rB2, rC12, rC12						fldd   	88(pA2), ra2	fmpyfadd,dbl	rA2, rB2, rC22, rC22	fmpyfadd,dbl	rA3, rB2, rC32, rC32						fldd   	88(pB2), rB2	fmpyfadd,dbl	rA0, rB3, rC03, rC03	fmpyfadd,dbl	rA1, rB3, rC13, rC13						fldd   	88(pA3), ra3	fmpyfadd,dbl	rA2, rB3, rC23, rC23	fmpyfadd,dbl	rA3, rB3, rC33, rC33						fldd   	88(pB3), rB3	fmpyfadd,dbl	ra0, rB0, rC00, rC00	fmpyfadd,dbl	ra1, rB0, rC10, rC10						fldd   	96(pA0), rA0	fmpyfadd,dbl	ra2, rB0, rC20, rC20	fmpyfadd,dbl	ra3, rB0, rC30, rC30						fldd   	96(pB0), rB0	fmpyfadd,dbl	ra0, rB1, rC01, rC01	fmpyfadd,dbl	ra1, rB1, rC11, rC11						fldd   	96(pA1), rA1	fmpyfadd,dbl	ra2, rB1, rC21, rC21	fmpyfadd,dbl	ra3, rB1, rC31, rC31						fldd   	96(pB1), rB1	fmpyfadd,dbl	ra0, rB2, rC02, rC02	fmpyfadd,dbl	ra1, rB2, rC12, rC12						fldd   	96(pA2), rA2	fmpyfadd,dbl	ra2, rB2, rC22, rC22	fmpyfadd,dbl	ra3, rB2, rC32, rC32						fldd   	96(pB2), rB2	fmpyfadd,dbl	ra0, rB3, rC03, rC03	fmpyfadd,dbl	ra1, rB3, rC13, rC13						
fldd   	96(pA3), rA3	fmpyfadd,dbl	ra2, rB3, rC23, rC23	fmpyfadd,dbl	ra3, rB3, rC33, rC33						fldd   	96(pB3), rB3	fmpyfadd,dbl	rA0, rB0, rC00, rC00	fmpyfadd,dbl	rA1, rB0, rC10, rC10						fldd   	104(pA0), ra0	fmpyfadd,dbl	rA2, rB0, rC20, rC20	fmpyfadd,dbl	rA3, rB0, rC30, rC30						fldd   	104(pB0), rB0	fmpyfadd,dbl	rA0, rB1, rC01, rC01	fmpyfadd,dbl	rA1, rB1, rC11, rC11						fldd   	104(pA1), ra1	fmpyfadd,dbl	rA2, rB1, rC21, rC21	fmpyfadd,dbl	rA3, rB1, rC31, rC31						fldd   	104(pB1), rB1	fmpyfadd,dbl	rA0, rB2, rC02, rC02	fmpyfadd,dbl	rA1, rB2, rC12, rC12						fldd   	104(pA2), ra2	fmpyfadd,dbl	rA2, rB2, rC22, rC22	fmpyfadd,dbl	rA3, rB2, rC32, rC32						fldd   	104(pB2), rB2	fmpyfadd,dbl	rA0, rB3, rC03, rC03	fmpyfadd,dbl	rA1, rB3, rC13, rC13						fldd   	104(pA3), ra3	fmpyfadd,dbl	rA2, rB3, rC23, rC23	fmpyfadd,dbl	rA3, rB3, rC33, rC33						fldd   	104(pB3), rB3	fmpyfadd,dbl	ra0, rB0, rC00, rC00	fmpyfadd,dbl	ra1, rB0, rC10, rC10						fldd   	112(pA0), rA0	fmpyfadd,dbl	ra2, rB0, rC20, rC20	fmpyfadd,dbl	ra3, rB0, rC30, rC30						fldd   	112(pB0), rB0	fmpyfadd,dbl	ra0, rB1, rC01, rC01	fmpyfadd,dbl	ra1, rB1, rC11, rC11						fldd   	112(pA1), rA1	fmpyfadd,dbl	ra2, rB1, rC21, rC21	fmpyfadd,dbl	ra3, rB1, rC31, rC31						fldd   	112(pB1), rB1	fmpyfadd,dbl	ra0, rB2, rC02, rC02	fmpyfadd,dbl	ra1, rB2, rC12, rC12						fldd   	112(pA2), rA2	fmpyfadd,dbl	ra2, rB2, rC22, rC22	fmpyfadd,dbl	ra3, rB2, rC32, rC32						fldd   	112(pB2), rB2	fmpyfadd,dbl	ra0, rB3, rC03, rC03	fmpyfadd,dbl	ra1, rB3, rC13, rC13						fldd   	112(pA3), rA3	fmpyfadd,dbl	ra2, rB3, rC23, rC23	fmpyfadd,dbl	ra3, rB3, rC33, rC33						fldd   	112(pB3), rB3	fmpyfadd,dbl	rA0, rB0, rC00, rC00	fmpyfadd,dbl	rA1, rB0, rC10, rC10						fldd   	120(pA0), ra0	fmpyfadd,dbl	rA2, rB0, rC20, rC20	fmpyfadd,dbl	rA3, rB0, rC30, rC30						fldd   	120(pB0), rB0	fmpyfadd,dbl	rA0, rB1, rC01, rC01	fmpyfadd,dbl	rA1, rB1, rC11, rC11						fldd   	120(pA1), ra1	fmpyfadd,dbl	rA2, rB1, rC21, rC21	fmpyfadd,dbl	rA3, rB1, rC31, rC31						fldd   
	120(pB1), rB1	fmpyfadd,dbl	rA0, rB2, rC02, rC02	fmpyfadd,dbl	rA1, rB2, rC12, rC12						fldd   	120(pA2), ra2	fmpyfadd,dbl	rA2, rB2, rC22, rC22	fmpyfadd,dbl	rA3, rB2, rC32, rC32						fldd   	120(pB2), rB2	fmpyfadd,dbl	rA0, rB3, rC03, rC03	fmpyfadd,dbl	rA1, rB3, rC13, rC13						fldd   	120(pA3), ra3	fmpyfadd,dbl	rA2, rB3, rC23, rC23	fmpyfadd,dbl	rA3, rB3, rC33, rC33						fldd   	120(pB3), rB3	fmpyfadd,dbl	ra0, rB0, rC00, rC00	fmpyfadd,dbl	ra1, rB0, rC10, rC10						ldo	8*(3*KB+16)(pA0), pA0						ldo	8*(3*KB+16)(pA1), pA1	fmpyfadd,dbl	ra2, rB0, rC20, rC20	fmpyfadd,dbl	ra3, rB0, rC30, rC30						ldo	8*(3*KB+16)(pA2), pA2						ldo	8*(3*KB+16)(pA3), pA3	fmpyfadd,dbl	ra0, rB1, rC01, rC01	fmpyfadd,dbl	ra1, rB1, rC11, rC11						ldo	-8*(KB-16)(pB0), pB0						ldo	-8*(KB-16)(pB1), pB1	fmpyfadd,dbl	ra2, rB1, rC21, rC21	fmpyfadd,dbl	ra3, rB1, rC31, rC31	fmpyfadd,dbl	ra0, rB2, rC02, rC02	fmpyfadd,dbl	ra1, rB2, rC12, rC12						ldo	-8*(KB-16)(pB2), pB2	fmpyfadd,dbl	ra2, rB2, rC22, rC22	fmpyfadd,dbl	ra3, rB2, rC32, rC32	fmpyfadd,dbl	ra0, rB3, rC03, rC03	fmpyfadd,dbl	ra1, rB3, rC13, rC13						ldo	-8*(KB-16)(pB3), pB3	fmpyfadd,dbl	ra2, rB3, rC23, rC23	fmpyfadd,dbl	ra3, rB3, rC33, rC33;;       end drain KLOOP;;;       Write to C;	fstd	rC00,  0(pC0)	fstd	rC10,  CMUL(8)(pC0)	fstd	rC20,  CMUL(16)(pC0)	fstd	rC30, CMUL(24)(pC0)					ldo	CMUL(32)(pC0), pC0	fstd	rC01,  0(pC1)	fstd	rC11,  CMUL(8)(pC1)	fstd	rC21,  CMUL(16)(pC1)	fstd	rC31, CMUL(24)(pC1)					ldo	CMUL(32)(pC1), pC1	fstd	rC02,  0(pC2)	fstd	rC12,  CMUL(8)(pC2)	fstd	rC22,  CMUL(16)(pC2)	fstd	rC32, CMUL(24)(pC2)					ldo 	CMUL(32)(pC2), pC2	fstd	rC03,  0(pC3)	fstd	rC13,  CMUL(8)(pC3)	fstd	rC23,  CMUL(16)(pC3)	fstd	rC33, CMUL(24)(pC3);;       while (M);;;	ldo	-6(rM), rM;	cmpib,<> 0, rM, MLOOP	addib,<>	-4, rM, MLOOP					ldo	CMUL(32)(pC3), pC3	ldo	8*4*KB(pB0), pB0	ldo	8*4*KB(pB1), pB1	ldo	8*4*KB(pB2), pB2	ldo	8*4*KB(pB3), pB3	sub	pA0, incAn, pA0	sub	pA1, incAn, pA1	sub	pA2, incAn, pA2	sub	pA3, incAn, pA3	add	pC0, incCn, pC0	add	pC1, incCn, 
pC1	add	pC2, incCn, pC2;;       while (N);;	addib,<>	-4, rN, NLOOP	add	pC3, incCn, pC3/* *      Restore regs and return */	fldd	-8(%r30), %fr12	fldd	-16(%r30), %fr13	fldd	-24(%r30), %fr14	fldd	-32(%r30), %fr15	fldd	-40(%r30), %fr16	fldd	-48(%r30), %fr17	fldd	-56(%r30), %fr18	fldd	-64(%r30), %fr19	fldd	-72(%r30), %fr20	fldd	-80(%r30), %fr21	ldw	-84(%r30), %r3	ldw	-88(%r30), %r4	ldw	-92(%r30), %r5;	ldw	-96(%r30), %r6;	ldw	-100(%r30), %r7;	ldw	-104(%r30), %r8;	ldw	-108(%r30), %r9	ldw	-112(%r30), %r10	ldw	-116(%r30), %r11	ldw	-120(%r30), %r12	ldw	-124(%r30), %r13	bve (%r2)	ldo	-FSIZE(%r30), %r30	.EXIT	.PROCEND

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?