atl_dmm4x4xur3_mips.c

来自「基于 BLAS/CLAPACK 的，用过的人知道是干啥的」· C语言 代码 · 共 2,389 行 · 第 1/5 页

C
2,389
字号
#endif#if KB > 6	madd.d	rC00, rC00, rA0, rB0					prefC(CMUL(32)(pC0))	madd.d	rC10, rC10, rA1, rB0					ldc1	rz0, 40(pA0)	madd.d	rC20, rC20, rA2, rB0					prefC(CMUL(32)(pC1))	madd.d	rC30, rC30, rA3, rB0					ldc1	rB0, 32(pB0)	madd.d	rC01, rC01, rA0, rB1	madd.d	rC11, rC11, rA1, rB1					ldc1	rz1, 40(pA1)	madd.d	rC21, rC21, rA2, rB1	madd.d	rC31, rC31, rA3, rB1					ldc1	rB1, 32(pB1)	madd.d	rC02, rC02, rA0, rB2	madd.d	rC12, rC12, rA1, rB2					ldc1	rz2, 40(pA2)	madd.d	rC22, rC22, rA2, rB2	madd.d	rC32, rC32, rA3, rB2					ldc1	rB2, 32(pB2)	madd.d	rC03, rC03, rA0, rB3	madd.d	rC13, rC13, rA1, rB3					ldc1	rz3, 40(pA3)	madd.d	rC23, rC23, rA2, rB3	madd.d	rC33, rC33, rA3, rB3					ldc1	rB3, 32(pB3)	madd.d	rC00, rC00, ra0, rB0	madd.d	rC10, rC10, ra1, rB0					ldc1	rA0, 48(pA0)	madd.d	rC20, rC20, ra2, rB0	madd.d	rC30, rC30, ra3, rB0					ldc1	rB0, 40(pB0)	madd.d	rC01, rC01, ra0, rB1					prefC(CMUL(32)(pC2))	madd.d	rC11, rC11, ra1, rB1					ldc1	rA1, 48(pA1)	madd.d	rC21, rC21, ra2, rB1	madd.d	rC31, rC31, ra3, rB1					ldc1	rB1, 40(pB1)	madd.d	rC02, rC02, ra0, rB2	madd.d	rC12, rC12, ra1, rB2					ldc1	rA2, 48(pA2)	madd.d	rC22, rC22, ra2, rB2					prefC(CMUL(32)(pC3))	madd.d	rC32, rC32, ra3, rB2					ldc1	rB2, 40(pB2)	madd.d	rC03, rC03, ra0, rB3	madd.d	rC13, rC13, ra1, rB3					ldc1	rA3, 48(pA3)	madd.d	rC23, rC23, ra2, rB3	madd.d	rC33, rC33, ra3, rB3					ldc1	rB3, 40(pB3)	madd.d	rC00, rC00, rz0, rB0	madd.d	rC10, rC10, rz1, rB0					ldc1	ra0, 56(pA0)	madd.d	rC20, rC20, rz2, rB0	madd.d	rC30, rC30, rz3, rB0					ldc1	rB0, 48(pB0)	madd.d	rC01, rC01, rz0, rB1	madd.d	rC11, rC11, rz1, rB1					ldc1	ra1, 56(pA1)	madd.d	rC21, rC21, rz2, rB1	madd.d	rC31, rC31, rz3, rB1					ldc1	rB1, 48(pB1)	madd.d	rC02, rC02, rz0, rB2	madd.d	rC12, rC12, rz1, rB2					ldc1	ra2, 56(pA2)	madd.d	rC22, rC22, rz2, rB2	madd.d	rC32, rC32, rz3, rB2					ldc1	rB2, 48(pB2)	madd.d	rC03, rC03, rz0, rB3	madd.d	rC13, rC13, rz1, rB3					ldc1	ra3, 56(pA3)	madd.d	rC23, rC23, rz2, rB3	madd.d	rC33, rC33, rz3, rB3					ldc1	rB3, 48(pB3)#endif#if 
KB > 9	madd.d	rC00, rC00, rA0, rB0					prefB(KB*8*4(pB0))	madd.d	rC10, rC10, rA1, rB0					ldc1	rz0, 64(pA0)	madd.d	rC20, rC20, rA2, rB0					prefB(KB*8*4(pB1))	madd.d	rC30, rC30, rA3, rB0					ldc1	rB0, 56(pB0)	madd.d	rC01, rC01, rA0, rB1					prefB(KB*8*4(pB2))	madd.d	rC11, rC11, rA1, rB1					ldc1	rz1, 64(pA1)	madd.d	rC21, rC21, rA2, rB1					prefB(KB*8*4(pB3))	madd.d	rC31, rC31, rA3, rB1					ldc1	rB1, 56(pB1)	madd.d	rC02, rC02, rA0, rB2	madd.d	rC12, rC12, rA1, rB2					ldc1	rz2, 64(pA2)	madd.d	rC22, rC22, rA2, rB2	madd.d	rC32, rC32, rA3, rB2					ldc1	rB2, 56(pB2)	madd.d	rC03, rC03, rA0, rB3	madd.d	rC13, rC13, rA1, rB3					ldc1	rz3, 64(pA3)	madd.d	rC23, rC23, rA2, rB3	madd.d	rC33, rC33, rA3, rB3					ldc1	rB3, 56(pB3)	madd.d	rC00, rC00, ra0, rB0	madd.d	rC10, rC10, ra1, rB0					ldc1	rA0, 72(pA0)	madd.d	rC20, rC20, ra2, rB0	madd.d	rC30, rC30, ra3, rB0					ldc1	rB0, 64(pB0)	madd.d	rC01, rC01, ra0, rB1	madd.d	rC11, rC11, ra1, rB1					ldc1	rA1, 72(pA1)	madd.d	rC21, rC21, ra2, rB1	madd.d	rC31, rC31, ra3, rB1					ldc1	rB1, 64(pB1)	madd.d	rC02, rC02, ra0, rB2	madd.d	rC12, rC12, ra1, rB2					ldc1	rA2, 72(pA2)	madd.d	rC22, rC22, ra2, rB2	madd.d	rC32, rC32, ra3, rB2					ldc1	rB2, 64(pB2)	madd.d	rC03, rC03, ra0, rB3	madd.d	rC13, rC13, ra1, rB3					ldc1	rA3, 72(pA3)	madd.d	rC23, rC23, ra2, rB3	madd.d	rC33, rC33, ra3, rB3					ldc1	rB3, 64(pB3)	madd.d	rC00, rC00, rz0, rB0	madd.d	rC10, rC10, rz1, rB0					ldc1	ra0, 80(pA0)	madd.d	rC20, rC20, rz2, rB0					prefB(32+KB*8*4(pB0))	madd.d	rC30, rC30, rz3, rB0					ldc1	rB0, 72(pB0)	madd.d	rC01, rC01, rz0, rB1					prefB(32+KB*8*4(pB1))	madd.d	rC11, rC11, rz1, rB1					ldc1	ra1, 80(pA1)	madd.d	rC21, rC21, rz2, rB1					prefB(32+KB*8*4(pB2))	madd.d	rC31, rC31, rz3, rB1					ldc1	rB1, 72(pB1)	madd.d	rC02, rC02, rz0, rB2					prefB(32+KB*8*4(pB3))	madd.d	rC12, rC12, rz1, rB2					ldc1	ra2, 80(pA2)	madd.d	rC22, rC22, rz2, rB2					prefB(64+KB*8*4(pB0))	madd.d	rC32, rC32, rz3, rB2					ldc1	rB2, 72(pB2)	madd.d	rC03, rC03, rz0, rB3					prefB(64+KB*8*4(pB1))	
madd.d	rC13, rC13, rz1, rB3					ldc1	ra3, 80(pA3)	madd.d	rC23, rC23, rz2, rB3					prefB(64+KB*8*4(pB2))	madd.d	rC33, rC33, rz3, rB3					ldc1	rB3, 72(pB3)#endif#if KB > 12	madd.d	rC00, rC00, rA0, rB0					prefB(64+KB*8*4(pB3))	madd.d	rC10, rC10, rA1, rB0					ldc1	rz0, 88(pA0)	madd.d	rC20, rC20, rA2, rB0					prefB(96+KB*8*4(pB0))	madd.d	rC30, rC30, rA3, rB0					ldc1	rB0, 80(pB0)	madd.d	rC01, rC01, rA0, rB1					prefB(96+KB*8*4(pB1))	madd.d	rC11, rC11, rA1, rB1					ldc1	rz1, 88(pA1)	madd.d	rC21, rC21, rA2, rB1					prefB(96+KB*8*4(pB2))	madd.d	rC31, rC31, rA3, rB1					ldc1	rB1, 80(pB1)	madd.d	rC02, rC02, rA0, rB2					prefB(96+KB*8*4(pB3))	madd.d	rC12, rC12, rA1, rB2					ldc1	rz2, 88(pA2)	madd.d	rC22, rC22, rA2, rB2					prefB(128+KB*8*4(pB0))	madd.d	rC32, rC32, rA3, rB2					ldc1	rB2, 80(pB2)	madd.d	rC03, rC03, rA0, rB3					prefB(128+KB*8*4(pB1))	madd.d	rC13, rC13, rA1, rB3					ldc1	rz3, 88(pA3)	madd.d	rC23, rC23, rA2, rB3					prefB(128+KB*8*4(pB2))	madd.d	rC33, rC33, rA3, rB3					ldc1	rB3, 80(pB3)	madd.d	rC00, rC00, ra0, rB0	madd.d	rC10, rC10, ra1, rB0					ldc1	rA0, 96(pA0)	madd.d	rC20, rC20, ra2, rB0	madd.d	rC30, rC30, ra3, rB0					ldc1	rB0, 88(pB0)	madd.d	rC01, rC01, ra0, rB1	madd.d	rC11, rC11, ra1, rB1					ldc1	rA1, 96(pA1)	madd.d	rC21, rC21, ra2, rB1	madd.d	rC31, rC31, ra3, rB1					ldc1	rB1, 88(pB1)	madd.d	rC02, rC02, ra0, rB2	madd.d	rC12, rC12, ra1, rB2					ldc1	rA2, 96(pA2)	madd.d	rC22, rC22, ra2, rB2	madd.d	rC32, rC32, ra3, rB2					ldc1	rB2, 88(pB2)	madd.d	rC03, rC03, ra0, rB3	madd.d	rC13, rC13, ra1, rB3					ldc1	rA3, 96(pA3)	madd.d	rC23, rC23, ra2, rB3	madd.d	rC33, rC33, ra3, rB3					ldc1	rB3, 88(pB3)	madd.d	rC00, rC00, rz0, rB0	madd.d	rC10, rC10, rz1, rB0					ldc1	ra0, 104(pA0)	madd.d	rC20, rC20, rz2, rB0	madd.d	rC30, rC30, rz3, rB0					ldc1	rB0, 96(pB0)	madd.d	rC01, rC01, rz0, rB1	madd.d	rC11, rC11, rz1, rB1					ldc1	ra1, 104(pA1)	madd.d	rC21, rC21, rz2, rB1	madd.d	rC31, rC31, rz3, rB1					ldc1	rB1, 96(pB1)	madd.d	rC02, rC02, rz0, rB2	madd.d	rC12, rC12, rz1, rB2	
				ldc1	ra2, 104(pA2)
	madd.d	rC22, rC22, rz2, rB2
	madd.d	rC32, rC32, rz3, rB2
					ldc1	rB2, 96(pB2)
	madd.d	rC03, rC03, rz0, rB3
	madd.d	rC13, rC13, rz1, rB3
					ldc1	ra3, 104(pA3)
	madd.d	rC23, rC23, rz2, rB3
	madd.d	rC33, rC33, rz3, rB3
					ldc1	rB3, 96(pB3)
#endif
/*
 * Unrolled section assembled only when KB > 15.
 *
 * Each madd.d updates one accumulator: rCij = rCij + (A operand i) * rBj.
 * The A operands rotate through three register sets (rA*, ra*, rz*): while
 * one set feeds the multiplies, the ldc1 in the right-hand column reloads
 * another set from the next 8-byte offset of pA0..pA3, and rB0..rB3 are
 * reloaded from pB0..pB3 as soon as their last use in a group has issued.
 *
 * NOTE(review): the register names and prefB() are macros defined outside
 * this chunk; prefB() is presumably a prefetch hint on the B panel -- confirm
 * against its definition.
 */
#if KB > 15
	/* A operands: rA0..rA3; right column reloads rz0..rz3 and rB0..rB3 */
	madd.d	rC00, rC00, rA0, rB0
					prefB(128+KB*8*4(pB3))
	madd.d	rC10, rC10, rA1, rB0
					ldc1	rz0, 112(pA0)
	madd.d	rC20, rC20, rA2, rB0
					prefB(160+KB*8*4(pB0))
	madd.d	rC30, rC30, rA3, rB0
					ldc1	rB0, 104(pB0)
	madd.d	rC01, rC01, rA0, rB1
					prefB(160+KB*8*4(pB1))
	madd.d	rC11, rC11, rA1, rB1
					ldc1	rz1, 112(pA1)
	madd.d	rC21, rC21, rA2, rB1
					prefB(160+KB*8*4(pB2))
	madd.d	rC31, rC31, rA3, rB1
					ldc1	rB1, 104(pB1)
	madd.d	rC02, rC02, rA0, rB2
	/*
	 * Fixed: this slot repeated pB2, so offset 160 was requested twice for
	 * pB2 and never for pB3.  Every other offset group in the schedule
	 * touches pB0, pB1, pB2 and pB3 exactly once.
	 */
					prefB(160+KB*8*4(pB3))
	madd.d	rC12, rC12, rA1, rB2
					ldc1	rz2, 112(pA2)
	madd.d	rC22, rC22, rA2, rB2
					prefB(192+KB*8*4(pB0))
	madd.d	rC32, rC32, rA3, rB2
					ldc1	rB2, 104(pB2)
	madd.d	rC03, rC03, rA0, rB3
					prefB(192+KB*8*4(pB1))
	madd.d	rC13, rC13, rA1, rB3
					ldc1	rz3, 112(pA3)
	madd.d	rC23, rC23, rA2, rB3
					prefB(192+KB*8*4(pB2))
	madd.d	rC33, rC33, rA3, rB3
					ldc1	rB3, 104(pB3)
	/* A operands: ra0..ra3; right column reloads rA0..rA3 and rB0..rB3 */
	madd.d	rC00, rC00, ra0, rB0
					prefB(192+KB*8*4(pB3))
	madd.d	rC10, rC10, ra1, rB0
					ldc1	rA0, 120(pA0)
	madd.d	rC20, rC20, ra2, rB0
					prefB(224+KB*8*4(pB0))
	madd.d	rC30, rC30, ra3, rB0
					ldc1	rB0, 112(pB0)
	madd.d	rC01, rC01, ra0, rB1
					prefB(224+KB*8*4(pB1))
	madd.d	rC11, rC11, ra1, rB1
					ldc1	rA1, 120(pA1)
	madd.d	rC21, rC21, ra2, rB1
					prefB(224+KB*8*4(pB2))
	madd.d	rC31, rC31, ra3, rB1
					ldc1	rB1, 112(pB1)
	madd.d	rC02, rC02, ra0, rB2
					prefB(224+KB*8*4(pB3))
	madd.d	rC12, rC12, ra1, rB2
					ldc1	rA2, 120(pA2)
	madd.d	rC22, rC22, ra2, rB2
					prefB(256+KB*8*4(pB0))
	madd.d	rC32, rC32, ra3, rB2
					ldc1	rB2, 112(pB2)
	madd.d	rC03, rC03, ra0, rB3
					prefB(256+KB*8*4(pB1))
	madd.d	rC13, rC13, ra1, rB3
					ldc1	rA3, 120(pA3)
	madd.d	rC23, rC23, ra2, rB3
	madd.d	rC33, rC33, ra3, rB3
					ldc1	rB3, 112(pB3)
	/* A operands: rz0..rz3; the group continues past this point */
	madd.d	rC00, rC00, rz0, rB0
	madd.d	rC10, rC10, rz1, rB0
				ldc1	ra0, 128(pA0)	madd.d	rC20, rC20, rz2, rB0	madd.d	rC30, rC30, rz3, rB0					ldc1	rB0, 120(pB0)	madd.d	rC01, rC01, rz0, rB1	madd.d	rC11, rC11, rz1, rB1					ldc1	ra1, 128(pA1)	madd.d	rC21, rC21, rz2, rB1	madd.d	rC31, rC31, rz3, rB1					ldc1	rB1, 120(pB1)	madd.d	rC02, rC02, rz0, rB2	madd.d	rC12, rC12, rz1, rB2					ldc1	ra2, 128(pA2)	madd.d	rC22, rC22, rz2, rB2	madd.d	rC32, rC32, rz3, rB2					ldc1	rB2, 120(pB2)	madd.d	rC03, rC03, rz0, rB3	madd.d	rC13, rC13, rz1, rB3					ldc1	ra3, 128(pA3)	madd.d	rC23, rC23, rz2, rB3	madd.d	rC33, rC33, rz3, rB3					ldc1	rB3, 120(pB3)#endif#if KB > 18	madd.d	rC00, rC00, rA0, rB0	madd.d	rC10, rC10, rA1, rB0					ldc1	rz0, 136(pA0)	madd.d	rC20, rC20, rA2, rB0	madd.d	rC30, rC30, rA3, rB0					ldc1	rB0, 128(pB0)	madd.d	rC01, rC01, rA0, rB1	madd.d	rC11, rC11, rA1, rB1					ldc1	rz1, 136(pA1)	madd.d	rC21, rC21, rA2, rB1	madd.d	rC31, rC31, rA3, rB1					ldc1	rB1, 128(pB1)	madd.d	rC02, rC02, rA0, rB2	madd.d	rC12, rC12, rA1, rB2					ldc1	rz2, 136(pA2)	madd.d	rC22, rC22, rA2, rB2	madd.d	rC32, rC32, rA3, rB2					ldc1	rB2, 128(pB2)	madd.d	rC03, rC03, rA0, rB3	madd.d	rC13, rC13, rA1, rB3					ldc1	rz3, 136(pA3)	madd.d	rC23, rC23, rA2, rB3	madd.d	rC33, rC33, rA3, rB3					ldc1	rB3, 128(pB3)	madd.d	rC00, rC00, ra0, rB0	madd.d	rC10, rC10, ra1, rB0					ldc1	rA0, 144(pA0)	madd.d	rC20, rC20, ra2, rB0					prefB(256+KB*8*4(pB2))	madd.d	rC30, rC30, ra3, rB0					ldc1	rB0, 136(pB0)	madd.d	rC01, rC01, ra0, rB1					prefB(256+KB*8*4(pB3))	madd.d	rC11, rC11, ra1, rB1					ldc1	rA1, 144(pA1)	madd.d	rC21, rC21, ra2, rB1					prefB(288+KB*8*4(pB0))	madd.d	rC31, rC31, ra3, rB1					ldc1	rB1, 136(pB1)	madd.d	rC02, rC02, ra0, rB2					prefB(288+KB*8*4(pB1))	madd.d	rC12, rC12, ra1, rB2					ldc1	rA2, 144(pA2)	madd.d	rC22, rC22, ra2, rB2					prefB(288+KB*8*4(pB2))	madd.d	rC32, rC32, ra3, rB2					ldc1	rB2, 136(pB2)	madd.d	rC03, rC03, ra0, rB3					prefB(288+KB*8*4(pB3))	madd.d	rC13, rC13, ra1, rB3					ldc1	rA3, 144(pA3)	madd.d	rC23, rC23, ra2, rB3					prefB(320+KB*8*4(pB0))	
madd.d	rC33, rC33, ra3, rB3					ldc1	rB3, 136(pB3)	madd.d	rC00, rC00, rz0, rB0					prefB(320+KB*8*4(pB1))	madd.d	rC10, rC10, rz1, rB0					ldc1	ra0, 152(pA0)	madd.d	rC20, rC20, rz2, rB0					prefB(320+KB*8*4(pB2))	madd.d	rC30, rC30, rz3, rB0					ldc1	rB0, 144(pB0)	madd.d	rC01, rC01, rz0, rB1					prefB(320+KB*8*4(pB3))	madd.d	rC11, rC11, rz1, rB1					ldc1	ra1, 152(pA1)	madd.d	rC21, rC21, rz2, rB1					prefB(352+KB*8*4(pB0))	madd.d	rC31, rC31, rz3, rB1					ldc1	rB1, 144(pB1)	madd.d	rC02, rC02, rz0, rB2					prefB(352+KB*8*4(pB1))	madd.d	rC12, rC12, rz1, rB2					ldc1	ra2, 152(pA2)	madd.d	rC22, rC22, rz2, rB2					prefB(352+KB*8*4(pB2))	madd.d	rC32, rC32, rz3, rB2					ldc1	rB2, 144(pB2)	madd.d	rC03, rC03, rz0, rB3					prefB(352+KB*8*4(pB3))	madd.d	rC13, rC13, rz1, rB3					ldc1	ra3, 152(pA3)	madd.d	rC23, rC23, rz2, rB3					prefA(0(pfA))	madd.d	rC33, rC33, rz3, rB3					ldc1	rB3, 144(pB3)#endif#if KB > 21	madd.d	rC00, rC00, rA0, rB0	madd.d	rC10, rC10, rA1, rB0					ldc1	rz0, 160(pA0)	madd.d	rC20, rC20, rA2, rB0	madd.d	rC30, rC30, rA3, rB0					ldc1	rB0, 152(pB0)	madd.d	rC01, rC01, rA0, rB1	madd.d	rC11, rC11, rA1, rB1					ldc1	rz1, 160(pA1)	madd.d	rC21, rC21, rA2, rB1	madd.d	rC31, rC31, rA3, rB1					ldc1	rB1, 152(pB1)	madd.d	rC02, rC02, rA0, rB2	madd.d	rC12, rC12, rA1, rB2					ldc1	rz2, 160(pA2)	madd.d	rC22, rC22, rA2, rB2	madd.d	rC32, rC32, rA3, rB2					ldc1	rB2, 152(pB2)	madd.d	rC03, rC03, rA0, rB3	madd.d	rC13, rC13, rA1, rB3					ldc1	rz3, 160(pA3)	madd.d	rC23, rC23, rA2, rB3	madd.d	rC33, rC33, rA3, rB3					ldc1	rB3, 152(pB3)	madd.d	rC00, rC00, ra0, rB0	madd.d	rC10, rC10, ra1, rB0					ldc1	rA0, 168(pA0)	madd.d	rC20, rC20, ra2, rB0	madd.d	rC30, rC30, ra3, rB0					ldc1	rB0, 160(pB0)	madd.d	rC01, rC01, ra0, rB1	madd.d	rC11, rC11, ra1, rB1					ldc1	rA1, 168(pA1)	madd.d	rC21, rC21, ra2, rB1	madd.d	rC31, rC31, ra3, rB1					ldc1	rB1, 160(pB1)	madd.d	rC02, rC02, ra0, rB2	madd.d	rC12, rC12, ra1, rB2					ldc1	rA2, 168(pA2)	madd.d	rC22, rC22, ra2, rB2	madd.d	rC32, rC32, ra3, rB2					ldc1	
rB2, 160(pB2)	madd.d	rC03, rC03, ra0, rB3	madd.d	rC13, rC13, ra1, rB3					ldc1	rA3, 168(pA3)	madd.d	rC23, rC23, ra2, rB3	madd.d	rC33, rC33, ra3, rB3					ldc1	rB3, 160(pB3)	madd.d	rC00, rC00, rz0, rB0	madd.d	rC10, rC10, rz1, rB0					ldc1	ra0, 176(pA0)	madd.d	rC20, rC20, rz2, rB0					prefA(32(pfA))

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?