atl_dmm4x4x8_us.c
来自「基于BLAS CLAPACK的.用过的人知道是干啥的」· C语言 代码 · 共 1,163 行 · 第 1/2 页
C
1,163 行
faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+48], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+48], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 ldd [pA0+56], ra0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 ldd [pA1+56], ra1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 ldd [pA2+56], ra2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 ldd [pB3+48], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 ldd [pA3+56], ra3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+56], rB0!! K=6 iteration! faddd rC00, m0, rC00 fmuld rA0, rB1, m0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+56], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+56], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+64], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+64], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+64], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+56], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+64], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+64], rB0!! K=7 iteration! 
faddd rC00, m0, rC00 fmuld ra0, rB1, m0 prefR1([pB0+PFD]) faddd rC10, m1, rC10 fmuld ra1, rB1, m1 prefR1([pB1+PFD]) faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+64], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 add pB1, 64, pB1 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 prefR1([pB2+PFD]) faddd rC21, m2, rC21 fmuld ra2, rB2, m2 prefR1([pB3+PFD]) faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+64], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 ldd [pA0+72], ra0 add pA0, 64, pA0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 ldd [pA1+72], ra1 add pA1, 64, pA1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 ldd [pA2+72], ra2 add pA2, 64, pA2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 ldd [pB3+64], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 ldd [pA3+72], ra3 add pA3, 64, pA3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 add pB2, 64, pB2 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 add pB3, 64, pB3 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+72], rB0!! while(K);! subcc KK, 1, KK bnz KLOOP add pB0, 64, pB0#endif!! Drain multiply pipe on last iteration of K-loop!#if (KB == 0)KDRAIN:#endif!! K=0 iteration! prefR2([pfB]) faddd rC00, m0, rC00 fmuld rA0, rB1, m0 prefR2([pfB+64]) faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+8], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0! prefR2([pfB+128]) faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+8], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+16], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+16], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+16], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+8], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+16], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+16], rB0!! K=1 iteration! 
faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+16], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+16], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 ldd [pA0+24], ra0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 ldd [pA1+24], ra1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 ldd [pA2+24], ra2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 ldd [pB3+16], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 ldd [pA3+24], ra3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+24], rB0!! K=2 iteration! faddd rC00, m0, rC00 fmuld rA0, rB1, m0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+24], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+24], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+32], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+32], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+32], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+24], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+32], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+32], rB0!! K=3 iteration! 
faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+32], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+32], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 ldd [pA0+40], ra0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 ldd [pA1+40], ra1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 ldd [pA2+40], ra2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 ldd [pB3+32], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 ldd [pA3+40], ra3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+40], rB0!! K=4 iteration! faddd rC00, m0, rC00 fmuld rA0, rB1, m0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+40], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+40], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 ldd [pA0+48], rA0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 ldd [pA1+48], rA1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 ldd [pA2+48], rA2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+40], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 ldd [pA3+48], rA3 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3 ldd [pB0+48], rB0!! K=5 iteration! 
faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 ldd [pB1+48], rB1 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 fmuld ra3, rB2, m3 ldd [pB2+48], rB2 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 ldd [pA0+56], ra0 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 ldd [pA1+56], ra1 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 ldd [pA2+56], ra2 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 ldd [pB3+48], rB3 faddd rC03, m0, rC03 fmuld rA0, rB0, m0 ldd [pA3+56], ra3 faddd rC13, m1, rC13 fmuld rA1, rB0, m1 faddd rC23, m2, rC23 fmuld rA2, rB0, m2 faddd rC33, m3, rC33 fmuld rA3, rB0, m3 ldd [pB0+56], rB0!! Second to last K iteration! faddd rC00, m0, rC00 fmuld rA0, rB1, m0 add pB0, incBm, pB0 faddd rC10, m1, rC10 fmuld rA1, rB1, m1 faddd rC20, m2, rC20 fmuld rA2, rB1, m2 faddd rC30, m3, rC30 fmuld rA3, rB1, m3 ldd [pB1+56], rB1 faddd rC01, m0, rC01 fmuld rA0, rB2, m0 faddd rC11, m1, rC11 fmuld rA1, rB2, m1 faddd rC21, m2, rC21 fmuld rA2, rB2, m2 faddd rC31, m3, rC31 fmuld rA3, rB2, m3 ldd [pB2+56], rB2 faddd rC02, m0, rC02 fmuld rA0, rB3, m0 faddd rC12, m1, rC12 fmuld rA1, rB3, m1 faddd rC22, m2, rC22 fmuld rA2, rB3, m2 faddd rC32, m3, rC32 fmuld rA3, rB3, m3 ldd [pB3+56], rB3 faddd rC03, m0, rC03 fmuld ra0, rB0, m0 faddd rC13, m1, rC13 fmuld ra1, rB0, m1 faddd rC23, m2, rC23 fmuld ra2, rB0, m2 faddd rC33, m3, rC33 fmuld ra3, rB0, m3!! Last K iteration! 
faddd rC00, m0, rC00 fmuld ra0, rB1, m0 faddd rC10, m1, rC10 fmuld ra1, rB1, m1 faddd rC20, m2, rC20 fmuld ra2, rB1, m2 faddd rC30, m3, rC30 fmuld ra3, rB1, m3 faddd rC01, m0, rC01 fmuld ra0, rB2, m0 add pB1, incBm, pB1 faddd rC11, m1, rC11 fmuld ra1, rB2, m1 prefR2([pfA]) faddd rC21, m2, rC21 fmuld ra2, rB2, m2 faddd rC31, m3, rC31 prefR2([pfA+64]) fmuld ra3, rB2, m3 faddd rC02, m0, rC02 fmuld ra0, rB3, m0 add pB2, incBm, pB2 faddd rC12, m1, rC12 fmuld ra1, rB3, m1 add pA0, incAm, pA0 faddd rC22, m2, rC22 fmuld ra2, rB3, m2 add pA1, incAm, pA1 faddd rC32, m3, rC32 fmuld ra3, rB3, m3 add pA2, incAm, pA2 faddd rC03, m0, rC03 add pB3, incBm, pB3 faddd rC13, m1, rC13 add pA3, incAm, pA3 faddd rC23, m2, rC23 add pfB, 128, pfB faddd rC33, m3, rC33 add pfA, 128, pfA!! Write result back to C! std rC00, [pC0] std rC10, [pC0+CMUL(8)] std rC20, [pC0+CMUL(16)] std rC30, [pC0+CMUL(24)] add pC0, CMUL(32), pC0 std rC01, [pC1] std rC11, [pC1+CMUL(8)] std rC21, [pC1+CMUL(16)] std rC31, [pC1+CMUL(24)] add pC1, CMUL(32), pC1 std rC02, [pC2] std rC12, [pC2+CMUL(8)] std rC22, [pC2+CMUL(16)] std rC32, [pC2+CMUL(24)] add pC2, CMUL(32), pC2 std rC03, [pC3] std rC13, [pC3+CMUL(8)] std rC23, [pC3+CMUL(16)] std rC33, [pC3+CMUL(24)]!! while(II);! subcc II, 4, II bnz MLOOP add pC3, incCm, pC3 add pC0, incCn, pC0 add pC1, incCn, pC1 add pC2, incCn, pC2 add pC3, incCn, pC3 add pA0, incAn, pA0 add pA1, incAn, pA1 add pA2, incAn, pA2 add pA3, incAn, pA3 add pB0, incBn, pB0 add pB1, incBn, pB1 add pB2, incBn, pB2!! while(N);! subcc N, 4, N bnz NLOOP add pB3, incBn, pB3!! Restore non-scratch registers and return!#ifdef ATL_USE64BITS#else ld [%sp+80], %g2 ld [%sp+84], %g3 ld [%sp+88], %g4#endif ret restore
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?