atl_dmm4x4x8_us.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,163 行 · 第 1/2 页

C
1,163
字号
	faddd	rC20, m2, rC20				fmuld	ra2, rB1, m2	faddd	rC30, m3, rC30				fmuld	ra3, rB1, m3							ldd	[pB1+48], rB1	faddd	rC01, m0, rC01				fmuld	ra0, rB2, m0	faddd	rC11, m1, rC11				fmuld	ra1, rB2, m1	faddd	rC21, m2, rC21				fmuld	ra2, rB2, m2	faddd	rC31, m3, rC31				fmuld	ra3, rB2, m3							ldd	[pB2+48], rB2	faddd	rC02, m0, rC02				fmuld	ra0, rB3, m0							ldd	[pA0+56], ra0	faddd	rC12, m1, rC12				fmuld	ra1, rB3, m1							ldd	[pA1+56], ra1	faddd	rC22, m2, rC22				fmuld	ra2, rB3, m2							ldd	[pA2+56], ra2	faddd	rC32, m3, rC32				fmuld	ra3, rB3, m3							ldd	[pB3+48], rB3	faddd	rC03, m0, rC03				fmuld	rA0, rB0, m0							ldd	[pA3+56], ra3	faddd	rC13, m1, rC13				fmuld	rA1, rB0, m1	faddd	rC23, m2, rC23				fmuld	rA2, rB0, m2	faddd	rC33, m3, rC33				fmuld	rA3, rB0, m3							ldd	[pB0+56], rB0!!	K=6 iteration!	faddd	rC00, m0, rC00				fmuld	rA0, rB1, m0	faddd	rC10, m1, rC10				fmuld	rA1, rB1, m1	faddd	rC20, m2, rC20				fmuld	rA2, rB1, m2	faddd	rC30, m3, rC30				fmuld	rA3, rB1, m3							ldd	[pB1+56], rB1	faddd	rC01, m0, rC01				fmuld	rA0, rB2, m0	faddd	rC11, m1, rC11				fmuld	rA1, rB2, m1	faddd	rC21, m2, rC21				fmuld	rA2, rB2, m2	faddd	rC31, m3, rC31				fmuld	rA3, rB2, m3							ldd	[pB2+56], rB2	faddd	rC02, m0, rC02				fmuld	rA0, rB3, m0							ldd	[pA0+64], rA0	faddd	rC12, m1, rC12				fmuld	rA1, rB3, m1							ldd	[pA1+64], rA1	faddd	rC22, m2, rC22				fmuld	rA2, rB3, m2							ldd	[pA2+64], rA2	faddd	rC32, m3, rC32				fmuld	rA3, rB3, m3							ldd	[pB3+56], rB3	faddd	rC03, m0, rC03				fmuld	ra0, rB0, m0							ldd	[pA3+64], rA3	faddd	rC13, m1, rC13				fmuld	ra1, rB0, m1	faddd	rC23, m2, rC23				fmuld	ra2, rB0, m2	faddd	rC33, m3, rC33				fmuld	ra3, rB0, m3							ldd	[pB0+64], rB0!!	K=7 iteration!	
faddd	rC00, m0, rC00				fmuld	ra0, rB1, m0							prefR1([pB0+PFD])	faddd	rC10, m1, rC10				fmuld	ra1, rB1, m1							prefR1([pB1+PFD])	faddd	rC20, m2, rC20				fmuld	ra2, rB1, m2	faddd	rC30, m3, rC30				fmuld	ra3, rB1, m3							ldd	[pB1+64], rB1	faddd	rC01, m0, rC01				fmuld	ra0, rB2, m0							add	pB1, 64, pB1	faddd	rC11, m1, rC11				fmuld	ra1, rB2, m1							prefR1([pB2+PFD])	faddd	rC21, m2, rC21				fmuld	ra2, rB2, m2							prefR1([pB3+PFD])	faddd	rC31, m3, rC31				fmuld	ra3, rB2, m3							ldd	[pB2+64], rB2	faddd	rC02, m0, rC02				fmuld	ra0, rB3, m0							ldd	[pA0+72], ra0							add	pA0, 64, pA0	faddd	rC12, m1, rC12				fmuld	ra1, rB3, m1							ldd	[pA1+72], ra1							add	pA1, 64, pA1	faddd	rC22, m2, rC22				fmuld	ra2, rB3, m2							ldd	[pA2+72], ra2							add	pA2, 64, pA2	faddd	rC32, m3, rC32				fmuld	ra3, rB3, m3							ldd	[pB3+64], rB3	faddd	rC03, m0, rC03				fmuld	rA0, rB0, m0							ldd	[pA3+72], ra3							add	pA3, 64, pA3	faddd	rC13, m1, rC13				fmuld	rA1, rB0, m1							add	pB2, 64, pB2	faddd	rC23, m2, rC23				fmuld	rA2, rB0, m2							add	pB3, 64, pB3	faddd	rC33, m3, rC33				fmuld	rA3, rB0, m3							ldd	[pB0+72], rB0!!       while(K);!        subcc   KK, 1, KK        bnz     KLOOP                                                        add     pB0, 64, pB0#endif!! Drain multiply pipe on last iteration of K-loop!#if (KB == 0)KDRAIN:#endif!!	K=0 iteration!							prefR2([pfB])	faddd	rC00, m0, rC00				fmuld	rA0, rB1, m0							prefR2([pfB+64])	faddd	rC10, m1, rC10				fmuld	rA1, rB1, m1	faddd	rC20, m2, rC20				fmuld	rA2, rB1, m2	faddd	rC30, m3, rC30				fmuld	rA3, rB1, m3							ldd	[pB1+8], rB1	faddd	rC01, m0, rC01				fmuld	rA0, rB2, m0!							
prefR2([pfB+128])	faddd	rC11, m1, rC11				fmuld	rA1, rB2, m1	faddd	rC21, m2, rC21				fmuld	rA2, rB2, m2	faddd	rC31, m3, rC31				fmuld	rA3, rB2, m3							ldd	[pB2+8], rB2	faddd	rC02, m0, rC02				fmuld	rA0, rB3, m0							ldd	[pA0+16], rA0	faddd	rC12, m1, rC12				fmuld	rA1, rB3, m1							ldd	[pA1+16], rA1	faddd	rC22, m2, rC22				fmuld	rA2, rB3, m2							ldd	[pA2+16], rA2	faddd	rC32, m3, rC32				fmuld	rA3, rB3, m3							ldd	[pB3+8], rB3	faddd	rC03, m0, rC03				fmuld	ra0, rB0, m0							ldd	[pA3+16], rA3	faddd	rC13, m1, rC13				fmuld	ra1, rB0, m1	faddd	rC23, m2, rC23				fmuld	ra2, rB0, m2	faddd	rC33, m3, rC33				fmuld	ra3, rB0, m3							ldd	[pB0+16], rB0!!	K=1 iteration!	faddd	rC00, m0, rC00				fmuld	ra0, rB1, m0	faddd	rC10, m1, rC10				fmuld	ra1, rB1, m1	faddd	rC20, m2, rC20				fmuld	ra2, rB1, m2	faddd	rC30, m3, rC30				fmuld	ra3, rB1, m3							ldd	[pB1+16], rB1	faddd	rC01, m0, rC01				fmuld	ra0, rB2, m0	faddd	rC11, m1, rC11				fmuld	ra1, rB2, m1	faddd	rC21, m2, rC21				fmuld	ra2, rB2, m2	faddd	rC31, m3, rC31				fmuld	ra3, rB2, m3							ldd	[pB2+16], rB2	faddd	rC02, m0, rC02				fmuld	ra0, rB3, m0							ldd	[pA0+24], ra0	faddd	rC12, m1, rC12				fmuld	ra1, rB3, m1							ldd	[pA1+24], ra1	faddd	rC22, m2, rC22				fmuld	ra2, rB3, m2							ldd	[pA2+24], ra2	faddd	rC32, m3, rC32				fmuld	ra3, rB3, m3							ldd	[pB3+16], rB3	faddd	rC03, m0, rC03				fmuld	rA0, rB0, m0							ldd	[pA3+24], ra3	faddd	rC13, m1, rC13				fmuld	rA1, rB0, m1	faddd	rC23, m2, rC23				fmuld	rA2, rB0, m2	faddd	rC33, m3, rC33				fmuld	rA3, rB0, m3							ldd	[pB0+24], rB0!!	K=2 iteration!	
!
! Drain steps K=2 and K=3.
!
! Pattern in each step: a faddd folds the product currently held in one
! of m0-m3 into a C accumulator, and the fmuld that follows overwrites
! that same m register with the next product.  The ldd instructions
! fetch operands at growing offsets from pA0-pA3 and pB0-pB3.
! The K=2 step multiplies from rA0-rA3 and the K=3 step from ra0-ra3;
! each step reloads the set it has just finished with and ends by
! starting the rB0 products from the other set.
! NOTE(review): the register names are presumably cpp macros defined
! earlier in the file, outside this view - confirm there.
!
        faddd   rC00, m0, rC00
        fmuld   rA0, rB1, m0
        faddd   rC10, m1, rC10
        fmuld   rA1, rB1, m1
        faddd   rC20, m2, rC20
        fmuld   rA2, rB1, m2
        faddd   rC30, m3, rC30
        fmuld   rA3, rB1, m3
        ldd     [pB1+24], rB1
        faddd   rC01, m0, rC01
        fmuld   rA0, rB2, m0
        faddd   rC11, m1, rC11
        fmuld   rA1, rB2, m1
        faddd   rC21, m2, rC21
        fmuld   rA2, rB2, m2
        faddd   rC31, m3, rC31
        fmuld   rA3, rB2, m3
        ldd     [pB2+24], rB2
        faddd   rC02, m0, rC02
        fmuld   rA0, rB3, m0
        ldd     [pA0+32], rA0
        faddd   rC12, m1, rC12
        fmuld   rA1, rB3, m1
        ldd     [pA1+32], rA1
        faddd   rC22, m2, rC22
        fmuld   rA2, rB3, m2
        ldd     [pA2+32], rA2
        faddd   rC32, m3, rC32
        fmuld   rA3, rB3, m3
        ldd     [pB3+24], rB3
        faddd   rC03, m0, rC03
        fmuld   ra0, rB0, m0
        ldd     [pA3+32], rA3
        faddd   rC13, m1, rC13
        fmuld   ra1, rB0, m1
        faddd   rC23, m2, rC23
        fmuld   ra2, rB0, m2
        faddd   rC33, m3, rC33
        fmuld   ra3, rB0, m3
        ldd     [pB0+32], rB0
!
!       K=3 iteration
!
        faddd   rC00, m0, rC00
        fmuld   ra0, rB1, m0
        faddd   rC10, m1, rC10
        fmuld   ra1, rB1, m1
        faddd   rC20, m2, rC20
        fmuld   ra2, rB1, m2
        faddd   rC30, m3, rC30
        fmuld   ra3, rB1, m3
        ldd     [pB1+32], rB1
        faddd   rC01, m0, rC01
        fmuld   ra0, rB2, m0
        faddd   rC11, m1, rC11
        fmuld   ra1, rB2, m1
        faddd   rC21, m2, rC21
        fmuld   ra2, rB2, m2
        faddd   rC31, m3, rC31
        fmuld   ra3, rB2, m3
        ldd     [pB2+32], rB2
        faddd   rC02, m0, rC02
        fmuld   ra0, rB3, m0
        ldd     [pA0+40], ra0
        faddd   rC12, m1, rC12
        fmuld   ra1, rB3, m1
        ldd     [pA1+40], ra1
        faddd   rC22, m2, rC22
        fmuld   ra2, rB3, m2
        ldd     [pA2+40], ra2
        faddd   rC32, m3, rC32
        fmuld   ra3, rB3, m3
        ldd     [pB3+32], rB3
        faddd   rC03, m0, rC03
        fmuld   rA0, rB0, m0
        ldd     [pA3+40], ra3
        faddd   rC13, m1, rC13
        fmuld   rA1, rB0, m1
        faddd   rC23, m2, rC23
        fmuld   rA2, rB0, m2
        faddd   rC33, m3, rC33
        fmuld   rA3, rB0, m3
        ldd     [pB0+40], rB0
!
!       K=4 iteration
!
!
! Drain steps K=4 and K=5.
!
! Same shape as the preceding steps: each faddd adds the product held
! in one of m0-m3 to a C accumulator and the following fmuld writes the
! next product into that m register.  The K=4 step multiplies from
! rA0-rA3 and reloads them from offset 48 of pA0-pA3; the K=5 step
! multiplies from ra0-ra3 and reloads them from offset 56.  The B
! operands rB0-rB3 are reloaded once per step from pB0-pB3.
!
        faddd   rC00, m0, rC00
        fmuld   rA0, rB1, m0
        faddd   rC10, m1, rC10
        fmuld   rA1, rB1, m1
        faddd   rC20, m2, rC20
        fmuld   rA2, rB1, m2
        faddd   rC30, m3, rC30
        fmuld   rA3, rB1, m3
        ldd     [pB1+40], rB1
        faddd   rC01, m0, rC01
        fmuld   rA0, rB2, m0
        faddd   rC11, m1, rC11
        fmuld   rA1, rB2, m1
        faddd   rC21, m2, rC21
        fmuld   rA2, rB2, m2
        faddd   rC31, m3, rC31
        fmuld   rA3, rB2, m3
        ldd     [pB2+40], rB2
        faddd   rC02, m0, rC02
        fmuld   rA0, rB3, m0
        ldd     [pA0+48], rA0
        faddd   rC12, m1, rC12
        fmuld   rA1, rB3, m1
        ldd     [pA1+48], rA1
        faddd   rC22, m2, rC22
        fmuld   rA2, rB3, m2
        ldd     [pA2+48], rA2
        faddd   rC32, m3, rC32
        fmuld   rA3, rB3, m3
        ldd     [pB3+40], rB3
        faddd   rC03, m0, rC03
        fmuld   ra0, rB0, m0
        ldd     [pA3+48], rA3
        faddd   rC13, m1, rC13
        fmuld   ra1, rB0, m1
        faddd   rC23, m2, rC23
        fmuld   ra2, rB0, m2
        faddd   rC33, m3, rC33
        fmuld   ra3, rB0, m3
        ldd     [pB0+48], rB0
!
!       K=5 iteration
!
        faddd   rC00, m0, rC00
        fmuld   ra0, rB1, m0
        faddd   rC10, m1, rC10
        fmuld   ra1, rB1, m1
        faddd   rC20, m2, rC20
        fmuld   ra2, rB1, m2
        faddd   rC30, m3, rC30
        fmuld   ra3, rB1, m3
        ldd     [pB1+48], rB1
        faddd   rC01, m0, rC01
        fmuld   ra0, rB2, m0
        faddd   rC11, m1, rC11
        fmuld   ra1, rB2, m1
        faddd   rC21, m2, rC21
        fmuld   ra2, rB2, m2
        faddd   rC31, m3, rC31
        fmuld   ra3, rB2, m3
        ldd     [pB2+48], rB2
        faddd   rC02, m0, rC02
        fmuld   ra0, rB3, m0
        ldd     [pA0+56], ra0
        faddd   rC12, m1, rC12
        fmuld   ra1, rB3, m1
        ldd     [pA1+56], ra1
        faddd   rC22, m2, rC22
        fmuld   ra2, rB3, m2
        ldd     [pA2+56], ra2
        faddd   rC32, m3, rC32
        fmuld   ra3, rB3, m3
        ldd     [pB3+48], rB3
        faddd   rC03, m0, rC03
        fmuld   rA0, rB0, m0
        ldd     [pA3+56], ra3
        faddd   rC13, m1, rC13
        fmuld   rA1, rB0, m1
        faddd   rC23, m2, rC23
        fmuld   rA2, rB0, m2
        faddd   rC33, m3, rC33
        fmuld   rA3, rB0, m3
        ldd     [pB0+56], rB0
!
!       Second to last K iteration
!
!
! Second-to-last K step of the drain.
!
! The faddd/fmuld pairs keep the same rhythm as the earlier steps:
! accumulate the product held in m0-m3, then refill that m register.
! This step issues no further loads from pA0-pA3; only rB1-rB3 are
! reloaded, from offset 56 of pB1-pB3.  pB0 is advanced by incBm here,
! and the step ends by starting the ra0-ra3 times rB0 products.
! NOTE(review): incBm is defined outside this view - confirm its value.
!
        faddd   rC00, m0, rC00
        fmuld   rA0, rB1, m0
        add     pB0, incBm, pB0
        faddd   rC10, m1, rC10
        fmuld   rA1, rB1, m1
        faddd   rC20, m2, rC20
        fmuld   rA2, rB1, m2
        faddd   rC30, m3, rC30
        fmuld   rA3, rB1, m3
        ldd     [pB1+56], rB1
        faddd   rC01, m0, rC01
        fmuld   rA0, rB2, m0
        faddd   rC11, m1, rC11
        fmuld   rA1, rB2, m1
        faddd   rC21, m2, rC21
        fmuld   rA2, rB2, m2
        faddd   rC31, m3, rC31
        fmuld   rA3, rB2, m3
        ldd     [pB2+56], rB2
        faddd   rC02, m0, rC02
        fmuld   rA0, rB3, m0
        faddd   rC12, m1, rC12
        fmuld   rA1, rB3, m1
        faddd   rC22, m2, rC22
        fmuld   rA2, rB3, m2
        faddd   rC32, m3, rC32
        fmuld   rA3, rB3, m3
        ldd     [pB3+56], rB3
        faddd   rC03, m0, rC03
        fmuld   ra0, rB0, m0
        faddd   rC13, m1, rC13
        fmuld   ra1, rB0, m1
        faddd   rC23, m2, rC23
        fmuld   ra2, rB0, m2
        faddd   rC33, m3, rC33
        fmuld   ra3, rB0, m3
!
!       Last K iteration
!
!
! Last K step of the drain.
!
! The first twelve faddd/fmuld pairs finish the products against
! rB1-rB3.  The final four faddd have no fmuld behind them, so every
! product written to m0-m3 has been accumulated when this step ends.
! No loads are issued.  The integer adds advance pB1-pB3 by incBm,
! pA0-pA3 by incAm, and pfA and pfB by 128.
! NOTE(review): prefR2 is a macro defined outside this view, presumably
! a prefetch hint on pfA - confirm against its definition.
!
        faddd   rC00, m0, rC00
        fmuld   ra0, rB1, m0
        faddd   rC10, m1, rC10
        fmuld   ra1, rB1, m1
        faddd   rC20, m2, rC20
        fmuld   ra2, rB1, m2
        faddd   rC30, m3, rC30
        fmuld   ra3, rB1, m3
        faddd   rC01, m0, rC01
        fmuld   ra0, rB2, m0
        add     pB1, incBm, pB1
        faddd   rC11, m1, rC11
        fmuld   ra1, rB2, m1
        prefR2([pfA])
        faddd   rC21, m2, rC21
        fmuld   ra2, rB2, m2
        faddd   rC31, m3, rC31
        prefR2([pfA+64])
        fmuld   ra3, rB2, m3
        faddd   rC02, m0, rC02
        fmuld   ra0, rB3, m0
        add     pB2, incBm, pB2
        faddd   rC12, m1, rC12
        fmuld   ra1, rB3, m1
        add     pA0, incAm, pA0
        faddd   rC22, m2, rC22
        fmuld   ra2, rB3, m2
        add     pA1, incAm, pA1
        faddd   rC32, m3, rC32
        fmuld   ra3, rB3, m3
        add     pA2, incAm, pA2
        faddd   rC03, m0, rC03
        add     pB3, incBm, pB3
        faddd   rC13, m1, rC13
        add     pA3, incAm, pA3
        faddd   rC23, m2, rC23
        add     pfB, 128, pfB
        faddd   rC33, m3, rC33
        add     pfA, 128, pfA
!
!       Write result back to C
!
!
! Store the sixteen accumulators.  rC0j-rC3j go to pCj at byte offsets
! 0 and the three CMUL-scaled offsets shown; pC0-pC2 are then advanced
! past the four stored elements.  pC3 is advanced after the M-loop
! branch instead.
! NOTE(review): CMUL, incCm, incCn, incAn and incBn are defined outside
! this view - confirm their values there.
!
        std     rC00, [pC0]
        std     rC10, [pC0+CMUL(8)]
        std     rC20, [pC0+CMUL(16)]
        std     rC30, [pC0+CMUL(24)]
        add     pC0, CMUL(32), pC0
        std     rC01, [pC1]
        std     rC11, [pC1+CMUL(8)]
        std     rC21, [pC1+CMUL(16)]
        std     rC31, [pC1+CMUL(24)]
        add     pC1, CMUL(32), pC1
        std     rC02, [pC2]
        std     rC12, [pC2+CMUL(8)]
        std     rC22, [pC2+CMUL(16)]
        std     rC32, [pC2+CMUL(24)]
        add     pC2, CMUL(32), pC2
        std     rC03, [pC3]
        std     rC13, [pC3+CMUL(8)]
        std     rC23, [pC3+CMUL(16)]
        std     rC33, [pC3+CMUL(24)]
!
!       while(II);
!
        subcc   II, 4, II
        bnz     MLOOP
! The add below directly follows the branch, i.e. it occupies the SPARC
! branch delay slot and runs whether or not the branch is taken.
        add     pC3, incCm, pC3
! M loop finished: move every C, A and B pointer on for the next pass
! of the N loop.
        add     pC0, incCn, pC0
        add     pC1, incCn, pC1
        add     pC2, incCn, pC2
        add     pC3, incCn, pC3
        add     pA0, incAn, pA0
        add     pA1, incAn, pA1
        add     pA2, incAn, pA2
        add     pA3, incAn, pA3
        add     pB0, incBn, pB0
        add     pB1, incBn, pB1
        add     pB2, incBn, pB2
!
!       while(N);
!
        subcc   N, 4, N
        bnz     NLOOP
! Delay slot of the N-loop branch: the last B pointer update.
        add     pB3, incBn, pB3
!
!       Restore non-scratch registers and return
!
! When ATL_USE64BITS is not defined, %g2-%g4 are reloaded from the
! stack slots at %sp+80, %sp+84 and %sp+88.  The matching stores are
! outside this view.
#ifdef ATL_USE64BITS
#else
        ld      [%sp+80], %g2
        ld      [%sp+84], %g3
        ld      [%sp+88], %g4
#endif
! restore follows ret directly, so it sits in the delay slot of ret.
        ret
        restore

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?