/*
 * atl_dmm4x4xur2_mips.c
 *
 * [Extraction note: recovered from a web code-viewer paste. Original
 *  description: "C-language code based on BLAS/CLAPACK", 2,473 lines
 *  total, this is page 1 of 5. Viewer UI chrome removed.]
 */
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2007 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */
/*
 * Compile-time guards: the K loop below is fully unrolled in KB-sized
 * steps, so KB must be a compile-time constant and may not exceed 80.
 */
#ifndef KB
   #error "This kernel requires KB be a compile-time constant!"
#endif
#if KB > 80
   #error "This kernel supports max KB of 80"
#endif
/*
 * FP register map ($f0-$f31):
 *   rCij = C accumulator for row i, column j of the 4x4 register block.
 */
#define rC00 $f0
#define rC10 $f1
#define rC20 $f2
#define rC30 $f3
#define rC01 $f4
#define rC11 $f5
#define rC21 $f6
#define rC31 $f7
#define rC02 $f8
#define rC12 $f9
#define rC22 $f10
#define rC32 $f11
#define rC03 $f12
#define rC13 $f13
#define rC23 $f14
#define rC33 $f15
/* rA0-rA3 / rB0-rB3: A and B operands for the current K iteration */
#define rA0  $f16
#define rA1  $f17
#define rA2  $f18
#define rA3  $f19
#define rB0  $f20
#define rB1  $f21
#define rB2  $f22
#define rB3  $f23
/*
 * ra0-ra3 / rb0-rb3: second operand set; the loop body alternates the
 * two sets so loads for the next K step overlap madd.d on the current
 * one (software pipelining).  NOTE(review): rB3/rb3 also double as the
 * temporary BETA register in the BETAX path below.
 */
#define ra0  $f24
#define ra1  $f25
#define ra2  $f26
#define ra3  $f27
#define rb0  $f28
#define rb1  $f29
#define rb2  $f30
#define rb3  $f31
/*
 * Integer register map.  $4-$11 arrive as arguments (M, N, K, A, lda,
 * B, ldb per the ATL_USERMM prototype documented below); the incXx
 * registers are rescaled in the prologue from element counts to byte
 * strides.  pA0-pA3 / pB0-pB3 / pC0-pC3 walk the four columns handled
 * per iteration of the M/N loops; stAm/stBn are end-of-panel sentinels;
 * pfA/pfB are prefetch cursors.
 */
#define M       $4
#define N       $5
#define K0      $6
#define pA0     $8
#define incAm   $9
#define pB0     $10
#define incBn   $11
#define pC0     $7
#define pA1     $12
#define pA2     $13
#define pA3     $14
#define pB1     $15
#define pB2     $16
#define pB3     $17
#define K       $18
#define stAm    $19
#define stBn    $20
#define incAn   $21
#define pfA     $22
#define pfB     $23
#define pC1	$24
#define pC2	$25
#define pC3	$30
#define incCn	$2
/*
 * Stack frame size: 152 bytes of saved registers, plus an extra 8-byte
 * slot at offset 152 (BETOFF) to spill BETA when it is a runtime value.
 */
#ifdef BETAX
   #define FSIZE 160
#else
   #define FSIZE 152
#endif
/* Prefetch distance in bytes.  NOTE(review): the loop body below spells
 * KB*8*8 out literally rather than using this macro -- confirm intent. */
#define PFDISTA KB*8*8
/*
 * Prefetch toggles: each prefX(mem) either emits a MIPS `pref` with the
 * given hint (6 = streamed, 5 = prepare-for-store) or expands to
 * nothing.  prefA is currently disabled (#if 0).
 */
#if 0
   #define prefA(mem) pref 6, mem
#else
   #define prefA(mem)
#endif
#if 1
   #define prefB(mem) pref 6, mem
#else
   #define prefB(mem)
#endif
#if 1
   #define prefC(mem) pref 5, mem
#else
   #define prefC(mem)
#endif
/*
 * CMUL scales a byte offset into C: complex (DCPLX) elements are
 * 16 bytes, so offsets double; real elements pass through unchanged.
 */
#ifdef DCPLX
   #define CMUL(i_) ((i_)+(i_))
#else
   #define CMUL(i_) i_
#endif
/* * save : 18,19,20,21,22,23 * $26, $27 reserved.  $0 = 0, $1 used by assembler * I think can use $31 if I save it & restore it. 
r0 = 0 * Avail: $2, $3, $12-25, $28, $30, $31 *//*void ATL_USERMM             $4           $5           $6                $f15,             $8   (const int M, const int N, const int K, const double alpha, const double *A,               $9               $10           $11               0(%sp)    const int lda, const double *B, const int ldb, const double beta,        8($sp)       16($sp)    double *C, const int ldc)*/.text.align 3.globl ATL_USERMM.ent   ATL_USERMMATL_USERMM:        .frame  $sp,FSIZE,$31        .set    noreorder        .set    nomacro        .set    noat/* *      Adjust stack and save registers */        daddiu  $sp, $sp, -FSIZE        sd      $16, 0($sp)        sd      $17, 8($sp)        sd      $18, 16($sp)        sd      $19, 24($sp)        sd      $20, 32($sp)        sd      $21, 40($sp)        sd      $22, 64($sp)        sd      $23, 72($sp)        sd      $30, 80($sp)#ifdef ATL_USE64BITS        sdc1    $f24, 88($sp)        sdc1    $f25, 96($sp)        sdc1    $f26, 104($sp)        sdc1    $f27, 112($sp)        sdc1    $f28, 120($sp)        sdc1    $f29, 128($sp)        sdc1    $f30, 136($sp)        sdc1    $f31, 144($sp)#else        sdc1    $f20, 88($sp)        sdc1    $f22, 96($sp)        sdc1    $f24, 104($sp)        sdc1    $f26, 112($sp)        sdc1    $f28, 120($sp)        sdc1    $f30, 128($sp)#endif#ifdef BETAX        ldc1    rA0, FSIZE($sp)           /* get BETA from caller's stack */        sdc1    rA0, 152($sp)             /* save BETA to my stack */   #define BETOFF 152#endif/* *      (ldc,lda, ldb, K0) * sizeof; setup column ptrs */#ifdef ATL_USE64BITS	ld	pC0, FSIZE+8($sp)	ld	incCn, FSIZE+16($sp)#else        lw      pC0, FSIZE+8($sp)        lw      incCn, FSIZE+16($sp)#endif#ifdef DCPLX	sll	incCn, incCn, 4		/* incCn = ldc*sizeof */#else	sll	incCn, incCn, 3		/* incCn = ldc*sizeof */#endif        sll     incAm, incAm, 3         /* incAm = lda*sizeof */        .set    macro        dmul    incAn, incAm, M         /* incAn = lda*M */      
  sll     incBn, incBn, 3         /* incBn = ldb*sizeof */        dmul    stBn, incBn, N          /* stBn = ldb*N */        .set    nomacro        sll     K0, K0, 3        daddu   pA1, pA0, incAm        daddu   pA2, pA1, incAm        daddu   pA3, pA2, incAm        daddu   pB1, pB0, incBn        daddu   pB2, pB1, incBn        daddu   pB3, pB2, incBn	daddu	pC1, pC0, incCn	daddu	pC2, pC1, incCn	daddu	pC3, pC2, incCn        sll     incAm, incAm, 2         /* incAm = lda*4 */        sll     incBn, incBn, 2         /* incBn = ldb*4 */        daddu   stAm, pA0, incAn        /* stAm = pA0 + lda*M */        daddu   stBn, pB0, stBn         /* stBn = pB0 + ldb*N */#ifdef DCPLX 	sll	pfA, M, 4		/* pfA = M*sizeof */#else 	sll	pfA, M, 3		/* pfA = M*sizeof */#endif        sll	incCn, incCn, 2		/* incCn = ldc*4 */	dsubu	incCn, incCn, pfA	/* incCn = ldc*4 - M */        or     pfA, stAm, $0        or     pfB, stBn, $0	daddiu	K0, K0, -8NLOOP:MLOOP:        ldc1    rB0, 0(pB0)        ldc1    rA0, 0(pA0)        ldc1    rA1, 0(pA1)        ldc1    rA2, 0(pA2)        ldc1    rA3, 0(pA3)        ldc1    rB1, 0(pB1)        ldc1    rB2, 0(pB2)#ifndef BETAX        ldc1    rB3, 0(pB3)#endif#ifdef BETA1        ldc1    rC00, 0(pC0)        ldc1    rC01, 0(pC1)        ldc1    rC10, CMUL(8)(pC0)        ldc1    rC20, CMUL(16)(pC0)        ldc1    rC30, CMUL(24)(pC0)        ldc1    rC11, CMUL(8)(pC1)        ldc1    rC21, CMUL(16)(pC1)        ldc1    rC31, CMUL(24)(pC1)        ldc1    rC02, 0(pC2)        ldc1    rC03, 0(pC3)        ldc1    rC12, CMUL(8)(pC2)        ldc1    rC22, CMUL(16)(pC2)        ldc1    rC32, CMUL(24)(pC2)   #if KB <= 2        ldc1    rC13, CMUL(8)(pC3)        ldc1    rC23, CMUL(16)(pC3)        ldc1    rC33, CMUL(24)(pC3)   #endif#elif defined(BETAX)        ldc1    rB3, BETOFF($sp)           /* load BETA */        ldc1    rC00, 0(pC0)        ldc1    rC10, CMUL(8)(pC0)        ldc1    rC20, CMUL(16)(pC0)        ldc1    rC30, CMUL(24)(pC0)        ldc1    rC01, 0(pC1)		mul.d	rC00, rC00, 
rB3        ldc1    rC11, CMUL(8)(pC1)		mul.d	rC10, rC10, rB3        ldc1    rC21, CMUL(16)(pC1)		mul.d	rC20, rC20, rB3        ldc1    rC31, CMUL(24)(pC1)		mul.d	rC30, rC30, rB3        ldc1    rC02, 0(pC2)		mul.d	rC01, rC01, rB3        ldc1    rC12, CMUL(8)(pC2)		mul.d	rC11, rC11, rB3        ldc1    rC22, CMUL(16)(pC2)		mul.d	rC21, rC21, rB3        ldc1    rC32, CMUL(24)(pC2)		mul.d	rC31, rC31, rB3        ldc1    rC03, 0(pC3)		mul.d	rC02, rC02, rB3        ldc1    rC13, CMUL(8)(pC3)		mul.d	rC12, rC12, rB3        ldc1    rC23, CMUL(16)(pC3)		mul.d	rC22, rC22, rB3        ldc1    rC33, CMUL(24)(pC3)		mul.d	rC32, rC32, rB3		mul.d	rC03, rC03, rB3		mul.d	rC13, rC13, rB3		mul.d	rC23, rC23, rB3		mul.d	rC33, rC33, rB3        ldc1    rB3, 0(pB3)#endif#if KB <= 2 && defined(BETA0)	dmtc1	$0, rC00	mov.d	rC10, rC00	mov.d	rC20, rC00	mov.d	rC30, rC00	mov.d	rC01, rC00	mov.d	rC11, rC00	mov.d	rC21, rC00	mov.d	rC31, rC00	mov.d	rC02, rC00	mov.d	rC12, rC00	mov.d	rC22, rC00	mov.d	rC32, rC00	mov.d	rC03, rC00	mov.d	rC13, rC00	mov.d	rC23, rC00	mov.d	rC33, rC00#endif	.align 3/* KLOOP: */#if KB > 2   #ifdef BETA0					ldc1	rb0, 8(pB0)	mul.d	rC00, rA0, rB0					ldc1	ra0, 8(pA0)	mul.d	rC10, rA1, rB0					prefC((pC0))	mul.d	rC20, rA2, rB0					prefC((pC1))	mul.d	rC30, rA3, rB0					prefC((pC2))	mul.d	rC01, rA0, rB1					prefC((pC3))	mul.d	rC11, rA1, rB1					prefA(KB*8*8(pA0))	mul.d	rC21, rA2, rB1					prefA(KB*8*8(pA1))	mul.d	rC31, rA3, rB1					prefA(KB*8*8(pA2))	mul.d	rC02, rA0, rB2					prefA(KB*8*8(pA3))	mul.d	rC12, rA1, rB2					ldc1	ra1, 8(pA1)	mul.d	rC22, rA2, rB2					ldc1	ra2, 8(pA2)	mul.d	rC32, rA3, rB2					ldc1	ra3, 8(pA3)	mul.d	rC03, rA0, rB3					ldc1	rb1, 8(pB1)	mul.d	rC13, rA1, rB3					ldc1	rb2, 8(pB2)	mul.d	rC23, rA2, rB3					ldc1	rb3, 8(pB3)	mul.d	rC33, rA3, rB3   #else					ldc1	rb0, 8(pB0)	madd.d	rC00, rC00, rA0, rB0					ldc1	ra0, 8(pA0)	madd.d	rC10, rC10, rA1, rB0	madd.d	rC20, rC20, rA2, rB0       #ifndef BETAX        				ldc1    rC13, CMUL(8)(pC3)       #endif	madd.d	rC30, rC30, rA3, rB0   
    #ifndef BETAX        				ldc1    rC23, CMUL(16)(pC3)       #endif	madd.d	rC01, rC01, rA0, rB1       #ifndef BETAX        				ldc1    rC33, CMUL(24)(pC3)       #endif	madd.d	rC11, rC11, rA1, rB1					prefA(KB*8*8(pA0))	madd.d	rC21, rC21, rA2, rB1					prefA(KB*8*8(pA1))	madd.d	rC31, rC31, rA3, rB1					prefA(KB*8*8(pA2))	madd.d	rC02, rC02, rA0, rB2					prefA(KB*8*8(pA3))	madd.d	rC12, rC12, rA1, rB2					ldc1	ra1, 8(pA1)	madd.d	rC22, rC22, rA2, rB2					ldc1	ra2, 8(pA2)	madd.d	rC32, rC32, rA3, rB2					ldc1	ra3, 8(pA3)	madd.d	rC03, rC03, rA0, rB3					ldc1	rb1, 8(pB1)	madd.d	rC13, rC13, rA1, rB3					ldc1	rb2, 8(pB2)	madd.d	rC23, rC23, rA2, rB3					ldc1	rb3, 8(pB3)	madd.d	rC33, rC33, rA3, rB3   #endif /* if BETA != 0 */					ldc1	rB0, 16(pB0)	madd.d	rC00, rC00, ra0, rb0					ldc1	rA0, 16(pA0)	madd.d	rC10, rC10, ra1, rb0					ldc1	rA1, 16(pA1)	madd.d	rC20, rC20, ra2, rb0					ldc1	rA2, 16(pA2)	madd.d	rC30, rC30, ra3, rb0					ldc1	rA3, 16(pA3)	madd.d	rC01, rC01, ra0, rb1					ldc1	rB1, 16(pB1)	madd.d	rC11, rC11, ra1, rb1					ldc1	rB2, 16(pB2)	madd.d	rC21, rC21, ra2, rb1					ldc1	rB3, 16(pB3)	madd.d	rC31, rC31, ra3, rb1					prefA(32+KB*8*8(pA0))	madd.d	rC02, rC02, ra0, rb2					prefA(32+KB*8*8(pA1))	madd.d	rC12, rC12, ra1, rb2					prefA(32+KB*8*8(pA2))	madd.d	rC22, rC22, ra2, rb2					prefA(32+KB*8*8(pA3))	madd.d	rC32, rC32, ra3, rb2					prefA(64+KB*8*8(pA0))	madd.d	rC03, rC03, ra0, rb3					prefA(64+KB*8*8(pA1))	madd.d	rC13, rC13, ra1, rb3					prefA(64+KB*8*8(pA2))	madd.d	rC23, rC23, ra2, rb3					prefA(64+KB*8*8(pA3))	madd.d	rC33, rC33, ra3, rb3#endif#if KB > 4					ldc1	rb0, 24(pB0)	madd.d	rC00, rC00, rA0, rB0					ldc1	ra0, 24(pA0)	madd.d	rC10, rC10, rA1, rB0					ldc1	ra1, 24(pA1)	madd.d	rC20, rC20, rA2, rB0					ldc1	ra2, 24(pA2)	madd.d	rC30, rC30, rA3, rB0					ldc1	ra3, 24(pA3)	madd.d	rC01, rC01, rA0, rB1					ldc1	rb1, 24(pB1)	madd.d	rC11, rC11, rA1, rB1					ldc1	rb2, 24(pB2)	madd.d	rC21, rC21, rA2, rB1					ldc1	rb3, 24(pB3)	madd.d	rC31, rC31, rA3, rB1					prefA(96+KB*8*8(pA0))	
madd.d	rC02, rC02, rA0, rB2					prefA(96+KB*8*8(pA1))	madd.d	rC12, rC12, rA1, rB2					prefA(96+KB*8*8(pA2))	madd.d	rC22, rC22, rA2, rB2					prefA(96+KB*8*8(pA3))	madd.d	rC32, rC32, rA3, rB2					prefA(128+KB*8*8(pA0))	madd.d	rC03, rC03, rA0, rB3					prefA(128+KB*8*8(pA1))	madd.d	rC13, rC13, rA1, rB3					prefA(128+KB*8*8(pA2))	madd.d	rC23, rC23, rA2, rB3					prefA(128+KB*8*8(pA3))	madd.d	rC33, rC33, rA3, rB3					ldc1	rB0, 32(pB0)	madd.d	rC00, rC00, ra0, rb0					ldc1	rA0, 32(pA0)	madd.d	rC10, rC10, ra1, rb0					ldc1	rA1, 32(pA1)	madd.d	rC20, rC20, ra2, rb0					ldc1	rA2, 32(pA2)	madd.d	rC30, rC30, ra3, rb0					ldc1	rA3, 32(pA3)	madd.d	rC01, rC01, ra0, rb1					ldc1	rB1, 32(pB1)	madd.d	rC11, rC11, ra1, rb1					ldc1	rB2, 32(pB2)	madd.d	rC21, rC21, ra2, rb1					ldc1	rB3, 32(pB3)	madd.d	rC31, rC31, ra3, rb1	madd.d	rC02, rC02, ra0, rb2	madd.d	rC12, rC12, ra1, rb2	madd.d	rC22, rC22, ra2, rb2	madd.d	rC32, rC32, ra3, rb2	madd.d	rC03, rC03, ra0, rb3	madd.d	rC13, rC13, ra1, rb3	madd.d	rC23, rC23, ra2, rb3	madd.d	rC33, rC33, ra3, rb3#endif#if KB > 6					ldc1	rb0, 40(pB0)	madd.d	rC00, rC00, rA0, rB0					ldc1	ra0, 40(pA0)	madd.d	rC10, rC10, rA1, rB0					ldc1	ra1, 40(pA1)	madd.d	rC20, rC20, rA2, rB0					ldc1	ra2, 40(pA2)	madd.d	rC30, rC30, rA3, rB0					ldc1	ra3, 40(pA3)	madd.d	rC01, rC01, rA0, rB1					ldc1	rb1, 40(pB1)	madd.d	rC11, rC11, rA1, rB1					ldc1	rb2, 40(pB2)	madd.d	rC21, rC21, rA2, rB1					ldc1	rb3, 40(pB3)	madd.d	rC31, rC31, rA3, rB1	madd.d	rC02, rC02, rA0, rB2

/*
 * [Extraction note: source truncated here -- pages 2-5 of the original
 *  file (the remainder of ATL_USERMM: rest of the unrolled K loop,
 *  C write-back, loop control, and epilogue) are not included in this
 *  excerpt. Viewer keyboard-shortcut help text removed.]
 */