atl_dmm4x4x2_us.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 531 行 · 第 1/2 页

C
531
字号
!
!  For K == 1, we never enter the loop at all
!
!  NOTE(review): this is the tail of a larger routine.  The entry code, the
!  register macro definitions and the MLOOP / NLOOP labels are not visible
!  here, so the comments below describe only what these instructions show.
!
!  Visible scheme:
!    * rC00..rC33 are sixteen accumulators.  The product rAi * rBj (or
!      rai * rBj) always ends up added into rCij.
!    * m0..m3 carry products from an fmuld to the faddd that consumes the
!      same m register four additions later, so every faddd adds a product
!      that was issued during the previous group of four.
!    * Two sets of A registers (rA0..rA3 and ra0..ra3) are used alternately.
!      While one set is being multiplied the other is refilled by ldd.
!    * One trip of KLOOP advances every A and B pointer by 16 bytes and
!      loads at 8-byte steps, i.e. it covers two consecutive K steps.
!
#if (KB != 2)
!
!  The loop is compiled out completely when KB is 2.  When KB is 0 the trip
!  count is tested first and a zero count jumps straight to the drain code.
!  (presumably KB == 0 selects a run-time K; confirm against the macro
!  definitions outside this view)
!
   #if KB == 0
        subcc   Kstart, %g0, %g0
        bz      KDRAIN
        nop
   #endif
!  KK is the working trip counter; Kstart itself is left unchanged.
        mov     Kstart, KK
        .align  4
KLOOP:
!
!       First K iteration: rA0..rA3 against rB1, rB2, rB3.  Each B register
!       is reloaded from offset +8 right after its last use in this half.
!
        faddd   rC00, m0, rC00
                                fmuld   rA0, rB1, m0
        faddd   rC10, m1, rC10
                                fmuld   rA1, rB1, m1
        faddd   rC20, m2, rC20
                                fmuld   rA2, rB1, m2
        faddd   rC30, m3, rC30
                                fmuld   rA3, rB1, m3
                                                        ldd     [pB1+8], rB1
        faddd   rC01, m0, rC01
                                fmuld   rA0, rB2, m0
        faddd   rC11, m1, rC11
                                fmuld   rA1, rB2, m1
        faddd   rC21, m2, rC21
                                fmuld   rA2, rB2, m2
        faddd   rC31, m3, rC31
                                fmuld   rA3, rB2, m3
                                                        ldd     [pB2+8], rB2
!       rA0..rA3 are refilled from offset +16 as soon as their last
!       multiply of this half has been issued.
        faddd   rC02, m0, rC02
                                fmuld   rA0, rB3, m0
                                                        ldd     [pA0+16], rA0
        faddd   rC12, m1, rC12
                                fmuld   rA1, rB3, m1
                                                        ldd     [pA1+16], rA1
        faddd   rC22, m2, rC22
                                fmuld   rA2, rB3, m2
                                                        ldd     [pA2+16], rA2
        faddd   rC32, m3, rC32
                                fmuld   rA3, rB3, m3
                                                        ldd     [pB3+8], rB3
!       The rB0 column of the other A set (ra0..ra3) is started here, so
!       its products are ready when the second half begins.
        faddd   rC03, m0, rC03
                                fmuld   ra0, rB0, m0
                                                        ldd     [pA3+16], rA3
        faddd   rC13, m1, rC13
                                fmuld   ra1, rB0, m1
        faddd   rC23, m2, rC23
                                fmuld   ra2, rB0, m2
        faddd   rC33, m3, rC33
                                fmuld   ra3, rB0, m3
                                                        ldd     [pB0+16], rB0
!
!       Second K iteration: same pattern with ra0..ra3.  Loads now use
!       offsets +16 (B) and +24 (A), and each pointer is bumped by 16
!       either after its own load (pB1, pB2, pA0, pA1, pA2, pA3) or just
!       before a load from a different pointer (pB3).
!
        faddd   rC00, m0, rC00
                                fmuld   ra0, rB1, m0
        faddd   rC10, m1, rC10
                                fmuld   ra1, rB1, m1
        faddd   rC20, m2, rC20
                                fmuld   ra2, rB1, m2
        faddd   rC30, m3, rC30
                                fmuld   ra3, rB1, m3
                                                        ldd     [pB1+16], rB1
        faddd   rC01, m0, rC01
                                fmuld   ra0, rB2, m0
                                                        add     pB1, 16, pB1
        faddd   rC11, m1, rC11
                                fmuld   ra1, rB2, m1
        faddd   rC21, m2, rC21
                                fmuld   ra2, rB2, m2
        faddd   rC31, m3, rC31
                                fmuld   ra3, rB2, m3
                                                        ldd     [pB2+16], rB2
        faddd   rC02, m0, rC02
                                fmuld   ra0, rB3, m0
                                                        add     pB2, 16, pB2
                                                        ldd     [pA0+24], ra0
        faddd   rC12, m1, rC12
                                fmuld   ra1, rB3, m1
                                                        add     pA0, 16, pA0
                                                        ldd     [pA1+24], ra1
        faddd   rC22, m2, rC22
                                fmuld   ra2, rB3, m2
                                                        add     pA1, 16, pA1
                                                        ldd     [pA2+24], ra2
        faddd   rC32, m3, rC32
                                fmuld   ra3, rB3, m3
                                                        add     pA2, 16, pA2
                                                        ldd     [pB3+16], rB3
!       Start the rB0 column of the freshly loaded rA0..rA3 for the next
!       trip (or for the drain code when the loop exits).
        faddd   rC03, m0, rC03
                                fmuld   rA0, rB0, m0
                                                        add     pB3, 16, pB3
                                                        ldd     [pA3+24], ra3
        faddd   rC13, m1, rC13
                                fmuld   rA1, rB0, m1
                                                        add     pA3, 16, pA3
        faddd   rC23, m2, rC23
                                fmuld   rA2, rB0, m2
        faddd   rC33, m3, rC33
                                fmuld   rA3, rB0, m3
                                                        ldd     [pB0+24], rB0
!
!       while(K);
!
!       KK counts down by one per trip.  The add after the branch sits in
!       the branch delay slot, so pB0 is advanced on every trip, including
!       the one that falls out of the loop.
!
        subcc   KK, 1, KK
        bnz     KLOOP
                                                        add     pB0, 16, pB0
#endif
!
! Drain multiply pipe on last iteration of K-loop
!
! Same add and multiply sequence as one KLOOP trip, but no A registers are
! reloaded, only rB1..rB3 are reloaded (once, from offset +8), and the A and
! B pointers are moved by incAm / incBm instead of by 16.
! (presumably incAm / incBm reposition the pointers for the next pass of
! the M loop; their definitions are outside this view)
!
#if (KB == 0)
KDRAIN:
#endif
        faddd   rC00, m0, rC00
                                fmuld   rA0, rB1, m0
                                                        add     pB0, incBm, pB0
        faddd   rC10, m1, rC10
                                fmuld   rA1, rB1, m1
        faddd   rC20, m2, rC20
                                fmuld   rA2, rB1, m2
        faddd   rC30, m3, rC30
                                fmuld   rA3, rB1, m3
                                                        ldd     [pB1+8], rB1
        faddd   rC01, m0, rC01
                                fmuld   rA0, rB2, m0
        faddd   rC11, m1, rC11
                                fmuld   rA1, rB2, m1
        faddd   rC21, m2, rC21
                                fmuld   rA2, rB2, m2
        faddd   rC31, m3, rC31
                                fmuld   rA3, rB2, m3
                                                        ldd     [pB2+8], rB2
        faddd   rC02, m0, rC02
                                fmuld   rA0, rB3, m0
        faddd   rC12, m1, rC12
                                fmuld   rA1, rB3, m1
        faddd   rC22, m2, rC22
                                fmuld   rA2, rB3, m2
        faddd   rC32, m3, rC32
                                fmuld   rA3, rB3, m3
                                                        ldd     [pB3+8], rB3
!       rB0 is used as it stands; pB0 was already moved by incBm above.
        faddd   rC03, m0, rC03
                                fmuld   ra0, rB0, m0
        faddd   rC13, m1, rC13
                                fmuld   ra1, rB0, m1
        faddd   rC23, m2, rC23
                                fmuld   ra2, rB0, m2
        faddd   rC33, m3, rC33
                                fmuld   ra3, rB0, m3
!
!       Second K iteration
!
!       No loads at all in this half.  The slots that held loads in KLOOP
!       now hold the incAm / incBm pointer updates.
!
        faddd   rC00, m0, rC00
                                fmuld   ra0, rB1, m0
        faddd   rC10, m1, rC10
                                fmuld   ra1, rB1, m1
        faddd   rC20, m2, rC20
                                fmuld   ra2, rB1, m2
        faddd   rC30, m3, rC30
                                fmuld   ra3, rB1, m3
        faddd   rC01, m0, rC01
                                fmuld   ra0, rB2, m0
                                                        add     pB1, incBm, pB1
        faddd   rC11, m1, rC11
                                fmuld   ra1, rB2, m1
        faddd   rC21, m2, rC21
                                fmuld   ra2, rB2, m2
        faddd   rC31, m3, rC31
                                fmuld   ra3, rB2, m3
        faddd   rC02, m0, rC02
                                fmuld   ra0, rB3, m0
                                                        add     pB2, incBm, pB2
        faddd   rC12, m1, rC12
                                fmuld   ra1, rB3, m1
                                                        add     pA0, incAm, pA0
        faddd   rC22, m2, rC22
                                fmuld   ra2, rB3, m2
                                                        add     pA1, incAm, pA1
        faddd   rC32, m3, rC32
                                fmuld   ra3, rB3, m3
                                                        add     pA2, incAm, pA2
!       Last four additions have no paired multiply: they retire the four
!       products still held in m0..m3.
        faddd   rC03, m0, rC03
                                                        add     pB3, incBm, pB3
        faddd   rC13, m1, rC13
                                                        add     pA3, incAm, pA3
        faddd   rC23, m2, rC23
        faddd   rC33, m3, rC33
!
!       Write result back to C
!
!       Accumulators rC0j..rC3j are stored through pCj, four stores per
!       pointer at CMUL-scaled offsets 0, 8, 16 and 24; the pointer is then
!       advanced.
!
        std     rC00, [pC0]
        std     rC10, [pC0+CMUL(8)]
        std     rC20, [pC0+CMUL(16)]
        std     rC30, [pC0+CMUL(24)]
                                add     pC0, CMUL(32), pC0
        std     rC01, [pC1]
        std     rC11, [pC1+CMUL(8)]
        std     rC21, [pC1+CMUL(16)]
        std     rC31, [pC1+CMUL(24)]
                                add     pC1, CMUL(32), pC1
        std     rC02, [pC2]
        std     rC12, [pC2+CMUL(8)]
        std     rC22, [pC2+CMUL(16)]
        std     rC32, [pC2+CMUL(24)]
                                add     pC2, CMUL(32), pC2
        std     rC03, [pC3]
        std     rC13, [pC3+CMUL(8)]
        std     rC23, [pC3+CMUL(16)]
        std     rC33, [pC3+CMUL(24)]
!
!       while(II);
!
!       II counts down by 4 per pass.  The pC3 update is in the delay slot
!       of the branch, so it executes whether or not the branch is taken.
!       NOTE(review): pC0..pC2 advance by a CMUL-scaled 32 while pC3
!       advances by incCm; presumably the two are equal - confirm against
!       the definition of incCm, which is outside this view.
!
        subcc   II, 4, II
        bnz     MLOOP
                                add     pC3, incCm, pC3
!
!       Fall-through after the M loop: move every C, A and B pointer by its
!       per-N increment before the N loop test below.
!
	add	pC0, incCn, pC0
	add	pC1, incCn, pC1
	add	pC2, incCn, pC2
	add	pC3, incCn, pC3
	add	pA0, incAn, pA0
	add	pA1, incAn, pA1
	add	pA2, incAn, pA2
	add	pA3, incAn, pA3
	add	pB0, incBn, pB0
	add	pB1, incBn, pB1
	add	pB2, incBn, pB2
	add	pB3, incBn, pB3
!
!       while(N);
!
!       N counts down by 4 per pass; the delay slot is an explicit nop.
!
        subcc   N, 4, N
        bnz     NLOOP
        nop
!
!       Restore non-scratch registers and return
!
!       ldab is shifted right by 3 (presumably undoing a multiply by 8 made
!       in the entry code, which is not visible here - confirm).  Seven
!       registers are then reloaded from the stack at %sp+64 .. %sp+88; the
!       matching stores are outside this view.  The restore instruction
!       occupies the delay slot of ret.
!
        srl     ldab, 3, ldab
        ld      [%sp+64], %i1
        ld      [%sp+68], %i3
        ld      [%sp+72], %i4
        ld      [%sp+76], %i5
        ld      [%sp+80], %g2
        ld      [%sp+84], %g3
        ld      [%sp+88], %g4
        ret
        restore

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?