atl_dmm4x4x2_us.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 531 行 · 第 1/2 页
C
531 行
! For K == 1, we never enter the loop at all!#if (KB != 2) #if KB == 0 subcc Kstart, %g0, %g0 bz KDRAIN nop #endif mov Kstart, KK .align 4KLOOP: faddd rC00, m0, rC00 fmuld rA0, rB1, m0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+8], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+8], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+16], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+16], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+16], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+8], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+16], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+16], rB0!! Second K iteration! faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+16], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 add pB1, 16, pB1 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+16], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 add pB2, 16, pB2 ldd [pA0+24], ra0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 add pA0, 16, pA0 ldd [pA1+24], ra1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 add pA1, 16, pA1 ldd [pA2+24], ra2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 add pA2, 16, pA2 ldd [pB3+16], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 add pB3, 16, pB3 ldd [pA3+24], ra3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 add pA3, 16, pA3 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+24], rB0!! while(K);! subcc KK, 1, KK bnz KLOOP add pB0, 16, pB0#endif!! Drain multiply pipe on last iteration of K-loop!#if (KB == 0)KDRAIN:#endif faddd rC00, m0, rC00 fmuld rA0, rB1, m0 add pB0, incBm, pB0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+8], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+8], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+8], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3!! Second K iteration! faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 add pB1, incBm, pB1 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 add pB2, incBm, pB2 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 add pA0, incAm, pA0 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 add pA1, incAm, pA1 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 add pA2, incAm, pA2 faddd rC03, m0, rC03 add pB3, incBm, pB3 faddd rC13, m1, rC13 add pA3, incAm, pA3 faddd rC23, m2, rC23 faddd rC33, m3, rC33!! Write result back to C! std rC00, [pC0] std rC10, [pC0+CMUL(8)] std rC20, [pC0+CMUL(16)] std rC30, [pC0+CMUL(24)] add pC0, CMUL(32), pC0 std rC01, [pC1] std rC11, [pC1+CMUL(8)] std rC21, [pC1+CMUL(16)] std rC31, [pC1+CMUL(24)] add pC1, CMUL(32), pC1 std rC02, [pC2] std rC12, [pC2+CMUL(8)] std rC22, [pC2+CMUL(16)] std rC32, [pC2+CMUL(24)] add pC2, CMUL(32), pC2 std rC03, [pC3] std rC13, [pC3+CMUL(8)] std rC23, [pC3+CMUL(16)] std rC33, [pC3+CMUL(24)]!! while(II);! subcc II, 4, II bnz MLOOP add pC3, incCm, pC3 add pC0, incCn, pC0 add pC1, incCn, pC1 add pC2, incCn, pC2 add pC3, incCn, pC3 add pA0, incAn, pA0 add pA1, incAn, pA1 add pA2, incAn, pA2 add pA3, incAn, pA3 add pB0, incBn, pB0 add pB1, incBn, pB1 add pB2, incBn, pB2 add pB3, incBn, pB3!! while(N);! subcc N, 4, N bnz NLOOP nop!! Restore non-scratch registers and return! srl ldab, 3, ldab ld [%sp+64], %i1 ld [%sp+68], %i3 ld [%sp+72], %i4 ld [%sp+76], %i5 ld [%sp+80], %g2 ld [%sp+84], %g3 ld [%sp+88], %g4 ret restore
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?