atl_smm4x4xurx_mips.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 3,232 行 · 第 1/5 页

C
3,232
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2007 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */
/*
 * Compile-time configuration and register aliases for the 4x4 single
 * precision MIPS matmul kernel ATL_USERMM.
 *
 * KB must be supplied by the build as a preprocessor constant no larger
 * than 80; the kernel body is specialized on its value with #if tests.
 *
 * NOTE: each directive below must start on its own line.  A '#' that is not
 * the first non-blank character of a line is not a preprocessor directive,
 * and a '//' comment hides everything that follows it on the same line.
 */
#ifndef KB
   #error "This kernel requires KB be a compile-time constant!"
#endif
#if KB > 80
   #error "This kernel supports max KB of 80"
#endif
/*
 * rCij : floating point registers holding the 4x4 block of C
 *        (i = offset within a column pointer, j = which of pC0..pC3).
 */
#define rC00 $f0
#define rC10 $f1
#define rC20 $f2
#define rC30 $f3
#define rC01 $f4
#define rC11 $f5
#define rC21 $f6
#define rC31 $f7
#define rC02 $f8
#define rC12 $f9
#define rC22 $f10
#define rC32 $f11
#define rC03 $f12
#define rC13 $f13
#define rC23 $f14
#define rC33 $f15
/*
 * rA*, ra*, rE*, re0 : registers the kernel loads elements of A into.
 * rB*, rb0           : registers the kernel loads elements of B into.
 * Several names per operand let loads for later K iterations be issued
 * while earlier values are still being consumed.
 */
#define rA0  $f16
#define rA1  $f17
#define rA2  $f18
#define rA3  $f19
#define rB0  $f20
#define rB1  $f21
#define rB2  $f22
#define rB3  $f23
#define ra0  $f24
#define ra1  $f25
#define ra2  $f26
#define ra3  $f27
#define rE0  $f28
#define re0  $f29
#define rE2  $f30
#define rb0  $f31
/*
 * Integer registers.  M, N, pA0, incAm (lda), pB0 and incBn (ldb) name the
 * registers the corresponding arguments arrive in; pC0 and incCn (ldc) are
 * loaded from the caller's stack by the prologue.  The K argument register
 * ($6) is deliberately left unnamed because KB is a compile-time constant.
 */
#define M       $4
#define N       $5
// #define K0      $6
#define pA0     $8
#define incAm   $9
#define pB0     $10
#define incBn   $11
#define pC0     $7
/* pA1..pA3 / pB1..pB3 : pA0 / pB0 advanced by 1..3 leading dimensions */
#define pA1     $12
#define pA2     $13
#define pA3     $14
#define pB1     $15
#define pB2     $16
#define pB3     $17
#define K       $18
/* stAm = pA0 + lda*M and stBn = pB0 + ldb*N; incAn = lda*M (in bytes) */
#define stAm    $19
#define stBn    $20
#define incAn   $21
/* pfA / pfB : presumably prefetch pointers for A and B -- TODO confirm */
#define pfA     $22
#define pfB     $23
/* pC1..pC3 : pC0 advanced by 1..3 columns of C */
#define pC1	$24
#define pC2	$25
#define pC3	$30
#define incCn	$2
/*
 * FSIZE : bytes of stack frame reserved by the prologue.  The BETAX build
 * needs 8 more bytes because it keeps a copy of BETA at offset 152.
 */
#ifdef BETAX
   #define FSIZE 160
#else
   #define FSIZE 152
#endif
/*
 * prefB / prefC : emit a MIPS pref instruction with hint 6 / hint 5 on the
 * given memory operand.  Change the "#if 1" to "#if 0" to compile the
 * prefetches out.
 */
#if 1
   #define prefB(mem) pref 6, mem
#else
   #define prefB(mem)
#endif
#if 1
   #define prefC(mem) pref 5, mem
#else
   #define prefC(mem)
#endif
/*
 * CMUL(i_) : byte offset into C.  When SCPLX is defined the offset is
 * doubled; otherwise it is used unchanged.
 */
#ifdef SCPLX
   #define CMUL(i_) ((i_)+(i_))
#else
   #define CMUL(i_) i_
#endif
/*
 * save : 18,19,20,21,22,23
 * $26, $27 reserved.  $0 = 0, $1 used by assembler
 * I think can use $31 if I save it & restore it. 
r0 = 0 * Avail: $2, $3, $12-25, $28, $30, $31 *//*void ATL_USERMM             $4           $5           $6                $f15,             $8   (const int M, const int N, const int K, const float  alpha, const float  *A,               $9               $10           $11               0(%sp)    const int lda, const float  *B, const int ldb, const float  beta,        8($sp)       16($sp)    float  *C, const int ldc)*/.text.align 3.globl ATL_USERMM.ent   ATL_USERMMATL_USERMM:        .frame  $sp,FSIZE,$31        .set    noreorder        .set    nomacro        .set    noat/* *      Adjust stack and save registers */        daddiu  $sp, $sp, -FSIZE        sd      $16, 0($sp)        sd      $17, 8($sp)        sd      $18, 16($sp)        sd      $19, 24($sp)        sd      $20, 32($sp)        sd      $21, 40($sp)        sd      $22, 64($sp)        sd      $23, 72($sp)        sd      $30, 80($sp)#ifdef ATL_USE64BITS        sdc1    $f24, 88($sp)        sdc1    $f25, 96($sp)        sdc1    $f26, 104($sp)        sdc1    $f27, 112($sp)        sdc1    $f28, 120($sp)        sdc1    $f29, 128($sp)        sdc1    $f30, 136($sp)        sdc1    $f31, 144($sp)#else        sdc1    $f20, 88($sp)        sdc1    $f22, 96($sp)        sdc1    $f24, 104($sp)        sdc1    $f26, 112($sp)        sdc1    $f28, 120($sp)        sdc1    $f30, 128($sp)#endif#ifdef BETAX        ldc1    rA0, FSIZE($sp)           /* get BETA from caller's stack */        sdc1    rA0, 152($sp)             /* save BETA to my stack */   #define BETOFF 152#endif/* *      (ldc,lda, ldb, K0) * sizeof; setup column ptrs */#ifdef ATL_USE64BITS	ld	pC0, FSIZE+8($sp)	ld	incCn, FSIZE+16($sp)#else        lw      pC0, FSIZE+8($sp)        lw      incCn, FSIZE+16($sp)#endif#ifdef SCPLX	sll	incCn, incCn, 3		/* incCn = ldc*sizeof */#else	sll	incCn, incCn, 2		/* incCn = ldc*sizeof */#endif        sll     incAm, incAm, 2         /* incAm = lda*sizeof */        .set    macro        dmul    incAn, incAm, M         /* incAn = lda*M */      
  sll     incBn, incBn, 2         /* incBn = ldb*sizeof */        dmul    stBn, incBn, N          /* stBn = ldb*N */        .set    nomacro//        sll     K0, K0, 3        daddu   pA1, pA0, incAm        daddu   pA2, pA1, incAm        daddu   pA3, pA2, incAm        daddu   pB1, pB0, incBn        daddu   pB2, pB1, incBn        daddu   pB3, pB2, incBn	daddu	pC1, pC0, incCn	daddu	pC2, pC1, incCn	daddu	pC3, pC2, incCn        sll     incAm, incAm, 2         /* incAm = lda*4 */        sll     incBn, incBn, 2         /* incBn = ldb*4 */        daddu   stAm, pA0, incAn        /* stAm = pA0 + lda*M */        daddu   stBn, pB0, stBn         /* stBn = pB0 + ldb*N */#ifdef SCPLX 	sll	pfA, M, 3		/* pfA = M*sizeof */#else 	sll	pfA, M, 2		/* pfA = M*sizeof */#endif        sll	incCn, incCn, 2		/* incCn = ldc*4 */	dsubu	incCn, incCn, pfA	/* incCn = ldc*4 - M */        or     pfA, stAm, $0/*        or     pfB, stBn, $0 *///	daddiu	K0, K0, -8NLOOP:	daddiu	pfB, pB0, 4*4*KBMLOOP:	lwc1	rB0, 0(pB0)	lwc1	rA0, 0(pA0)#ifdef BETA1        lwc1    rC00, 0(pC0)        lwc1    rC10, CMUL(4)(pC0)        lwc1    rC20, CMUL(8)(pC0)        lwc1    rC30, CMUL(12)(pC0)        lwc1    rC01, 0(pC1)        lwc1    rC11, CMUL(4)(pC1)        lwc1    rC21, CMUL(8)(pC1)        lwc1    rC31, CMUL(12)(pC1)        lwc1    rC02, 0(pC2)#elif defined(BETAX)        lwc1    rB3, BETOFF($sp)           /* load BETA */        lwc1    rC00, 0(pC0)        lwc1    rC10, CMUL(4)(pC0)        lwc1    rC20, CMUL(8)(pC0)        lwc1    rC30, CMUL(12)(pC0)        lwc1    rC01, 0(pC1)		mul.s	rC00, rC00, rB3        lwc1    rC11, CMUL(4)(pC1)		mul.s	rC10, rC10, rB3        lwc1    rC21, CMUL(8)(pC1)		mul.s	rC20, rC20, rB3        lwc1    rC31, CMUL(12)(pC1)		mul.s	rC30, rC30, rB3        lwc1    rC02, 0(pC2)		mul.s	rC01, rC01, rB3        lwc1    rC12, CMUL(4)(pC2)		mul.s	rC11, rC11, rB3        lwc1    rC22, CMUL(8)(pC2)		mul.s	rC21, rC21, rB3        lwc1    rC32, CMUL(12)(pC2)		mul.s	rC31, rC31, rB3        lwc1    rC03, 0(pC3)		
mul.s	rC02, rC02, rB3        lwc1    rC13, CMUL(4)(pC3)		mul.s	rC12, rC12, rB3        lwc1    rC23, CMUL(8)(pC3)		mul.s	rC22, rC22, rB3        lwc1    rC33, CMUL(12)(pC3)		mul.s	rC32, rC32, rB3#endif/*	.align 3 *//* KLOOP: */	lwc1	rA1, 0(pA1)	lwc1	rA2, 0(pA2)	lwc1	rA3, 0(pA3)	lwc1	rB1, 0(pB1)	lwc1	rB2, 0(pB2)   #ifndef BETAX	lwc1	rB3, 0(pB3)   #endif   #if KB > 1	lwc1	ra0, 4(pA0)   #endif   #if KB > 1	lwc1	ra2, 4(pA2)   #endif   #if KB > 2	lwc1	rE0, 8(pA0)   #endif#if KB > 0   #ifdef BETA0	#if KB > 1					lwc1	ra1, 4(pA1)	#endif	mul.s 	rC00, rA0, rB0					daddiu  pC0, pC0, CMUL(16)	mul.s 	rC10, rA1, rB0	#if KB > 3					lwc1	re0, 12(pA0)	#endif	mul.s 	rC20, rA2, rB0					daddiu  pC1, pC1, CMUL(16)	mul.s 	rC30, rA3, rB0	#if KB > 1					lwc1	rb0, 4(pB0)	#endif	mul.s 	rC01, rA0, rB1					prefC(-CMUL(16)(pC0))	mul.s 	rC11, rA1, rB1					daddiu  pC2, pC2, CMUL(16)	mul.s 	rC21, rA2, rB1					daddiu  pC3, pC3, CMUL(16)	mul.s 	rC31, rA3, rB1	#if KB > 1					lwc1	rB1, 4(pB1)	#endif	mul.s 	rC02, rA0, rB2	mul.s 	rC12, rA1, rB2	#if KB > 2					lwc1	rE2, 8(pA2)	#endif	mul.s 	rC22, rA2, rB2					prefC(-CMUL(16)(pC1))	mul.s 	rC32, rA3, rB2	#if KB > 1					lwc1	rB2, 4(pB2)	#endif	mul.s 	rC03, rA0, rB3					prefC(-CMUL(16)(pC2))	mul.s 	rC13, rA1, rB3	#if KB > 1					lwc1	ra3, 4(pA3)	#endif	mul.s 	rC23, rA2, rB3					prefC(-CMUL(16)(pC3))	mul.s 	rC33, rA3, rB3	#if KB > 1					lwc1	rB3, 4(pB3)	#endif   #elif defined(BETAX)	#if KB > 1					lwc1	ra1, 4(pA1)	#endif	madd.s	rC00, rC00, rA0, rB0					daddiu  pC0, pC0, CMUL(16)	madd.s	rC10, rC10, rA1, rB0		mul.s	rC03, rC03, rB3	#if KB > 3					lwc1	re0, 12(pA0)	#endif	madd.s	rC20, rC20, rA2, rB0					daddiu  pC1, pC1, CMUL(16)	madd.s	rC30, rC30, rA3, rB0		mul.s	rC13, rC13, rB3	#if KB > 1					lwc1	rb0, 4(pB0)	#endif	madd.s	rC01, rC01, rA0, rB1					daddiu  pC2, pC2, CMUL(16)	madd.s	rC11, rC11, rA1, rB1					daddiu  pC3, pC3, CMUL(16)	madd.s	rC21, rC21, rA2, rB1	madd.s	rC31, rC31, rA3, rB1		mul.s	rC23, rC23, rB3	#if KB > 1					lwc1	rB1, 4(pB1)	#endif		mul.s	rC33, 
rC33, rB3					lwc1	rB3, 0(pB3)	madd.s	rC02, rC02, rA0, rB2	madd.s	rC12, rC12, rA1, rB2	madd.s	rC22, rC22, rA2, rB2	#if KB > 2					lwc1	rE2,  8(pA2)	#endif	madd.s	rC32, rC32, rA3, rB2	#if KB > 1					lwc1	rB2, 4(pB2)	#endif	madd.s	rC03, rC03, rA0, rB3	madd.s	rC13, rC13, rA1, rB3	#if KB > 1					lwc1	ra3, 4(pA3)	#endif	madd.s	rC23, rC23, rA2, rB3	madd.s	rC33, rC33, rA3, rB3	#if KB > 1					lwc1	rB3, 4(pB3)	#endif   #else  /* BETA = 1 */	#if KB > 1					lwc1	ra1, 4(pA1)	#endif	madd.s	rC00, rC00, rA0, rB0        				lwc1    rC12, CMUL(4)(pC2)	madd.s	rC10, rC10, rA1, rB0        				lwc1    rC22, CMUL(8)(pC2)	madd.s	rC20, rC20, rA2, rB0        				lwc1    rC32, CMUL(12)(pC2)	madd.s	rC30, rC30, rA3, rB0        				lwc1    rC03, 0(pC3)	madd.s	rC01, rC01, rA0, rB1        				lwc1    rC13, CMUL(4)(pC3)	madd.s	rC11, rC11, rA1, rB1        				lwc1    rC23, CMUL(8)(pC3)	madd.s	rC21, rC21, rA2, rB1	#if KB > 3					lwc1	re0, 12(pA0)	#endif	madd.s	rC31, rC31, rA3, rB1	#if KB > 1					lwc1	rb0, 4(pB0)	#endif	madd.s	rC02, rC02, rA0, rB2	#if KB > 1					lwc1	rB1, 4(pB1)	#endif	madd.s	rC12, rC12, rA1, rB2	#if KB > 2					lwc1	rE2, 8(pA2)	#endif	madd.s	rC22, rC22, rA2, rB2        				lwc1    rC33, CMUL(12)(pC3)	madd.s	rC32, rC32, rA3, rB2	#if KB > 1					lwc1	rB2, 4(pB2)	#endif	madd.s	rC03, rC03, rA0, rB3					daddiu  pC0, pC0, CMUL(16)	madd.s	rC13, rC13, rA1, rB3	#if KB > 1					lwc1	ra3, 4(pA3)	#endif	madd.s	rC23, rC23, rA2, rB3					daddiu  pC1, pC1, CMUL(16)	madd.s	rC33, rC33, rA3, rB3	#if KB > 1					lwc1	rB3, 4(pB3)	#endif   #endif /* end BETA specialization */#endif#if KB <= 2 && defined(BETA1)					daddiu  pC2, pC2, CMUL(16)					daddiu  pC3, pC3, CMUL(16)#endif	.align 3#if KB > 1	#if KB > 2					lwc1	rA1, 8(pA1)	#elif KB == 2					daddu	pA0, pA0, incAm	#endif	madd.s	rC00, rC00, ra0, rb0	#if KB == 2					daddu	pA1, pA1, incAm	#endif	madd.s	rC10, rC10, ra1, rb0	#if KB > 4					lwc1	rA0, 16(pA0)	#elif KB == 2					daddu	pA2, pA2, incAm	#endif	madd.s	rC20, rC20, ra2, rb0	#if KB == 2					daddu	pA3, 
pA3, incAm        #elif defined(BETA1)					daddiu  pC2, pC2, CMUL(16)	#endif	madd.s	rC30, rC30, ra3, rb0	#if KB > 2					lwc1	rB0, 8(pB0)	#elif KB == 2					swc1	rC00, -CMUL(16)(pC0)	#endif	madd.s	rC01, rC01, ra0, rB1	#if KB == 2					swc1	rC10, -CMUL(12)(pC0)        #elif defined(BETA1)					daddiu  pC3, pC3, CMUL(16)	#endif	madd.s	rC11, rC11, ra1, rB1	#if KB == 2					swc1	rC20, -CMUL(8)(pC0)	#endif	madd.s	rC21, rC21, ra2, rB1	#if KB == 2					swc1	rC30, -CMUL(4)(pC0)	#endif	madd.s	rC31, rC31, ra3, rB1	#if KB > 2					lwc1	rB1, 8(pB1)	#elif KB == 2					swc1	rC01, -CMUL(16)(pC1)	#endif	madd.s	rC02, rC02, ra0, rB2	#if KB == 2					swc1	rC11, -CMUL(12)(pC1)	#endif	madd.s	rC12, rC12, ra1, rB2	#if KB > 3					lwc1	rA2, 12(pA2)	#elif KB == 2					swc1	rC21, -CMUL(8)(pC1)	#endif	madd.s	rC22, rC22, ra2, rB2	#if KB == 2					swc1	rC31, -CMUL(4)(pC1)	#endif	madd.s	rC32, rC32, ra3, rB2	#if KB > 2					lwc1	rB2, 8(pB2)	#elif KB == 2					swc1	rC02, -CMUL(16)(pC2)	#endif	madd.s	rC03, rC03, ra0, rB3	#if KB == 2					swc1	rC12, -CMUL(12)(pC2)	#endif	madd.s	rC13, rC13, ra1, rB3	#if KB > 2					lwc1	rA3, 8(pA3)	#elif KB == 2					swc1	rC22, -CMUL(8)(pC2)	#endif	madd.s	rC23, rC23, ra2, rB3	#if KB == 2					swc1	rC32, -CMUL(4)(pC2)	#endif	madd.s	rC33, rC33, ra3, rB3	#if KB > 2					lwc1	rB3, 8(pB3)	#endif#endif#if KB > 2	#if KB > 3					lwc1	ra1, 12(pA1)	#elif KB == 3					daddu	pA0, pA0, incAm	#endif	madd.s	rC00, rC00, rE0, rB0	#if KB == 3					daddu	pA1, pA1, incAm	#endif	madd.s	rC10, rC10, rA1, rB0	#if KB > 5					lwc1	ra0, 20(pA0)	#elif KB == 3					daddu	pA2, pA2, incAm	#endif	madd.s	rC20, rC20, rE2, rB0	#if KB == 3					daddu	pA3, pA3, incAm	#endif	madd.s	rC30, rC30, rA3, rB0	#if KB > 3					lwc1	rb0, 12(pB0)	#elif KB == 3					swc1	rC00, -CMUL(16)(pC0)	#endif	madd.s	rC01, rC01, rE0, rB1	#if KB == 3					swc1	rC10, -CMUL(12)(pC0)	#endif	madd.s	rC11, rC11, rA1, rB1	#if KB == 3					swc1	rC20, -CMUL(8)(pC0)	#endif	madd.s	rC21, rC21, rE2, rB1	#if KB == 3					swc1	rC30, -CMUL(4)(pC0)	#endif	madd.s	rC31, 
rC31, rA3, rB1	#if KB > 3					lwc1	rB1, 12(pB1)	#elif KB == 3					swc1	rC01, -CMUL(16)(pC1)	#endif	madd.s	rC02, rC02, rE0, rB2	#if KB == 3					swc1	rC11, -CMUL(12)(pC1)	#endif	madd.s	rC12, rC12, rA1, rB2	#if KB > 4					lwc1	ra2, 16(pA2)	#elif KB == 3					swc1	rC21, -CMUL(8)(pC1)	#endif	madd.s	rC22, rC22, rE2, rB2	#if KB == 3					swc1	rC31, -CMUL(4)(pC1)	#endif	madd.s	rC32, rC32, rA3, rB2	#if KB > 3					lwc1	rB2, 12(pB2)	#elif KB == 3					swc1	rC02, -CMUL(16)(pC2)	#endif	madd.s	rC03, rC03, rE0, rB3	#if KB == 3					swc1	rC12, -CMUL(12)(pC2)	#endif	madd.s	rC13, rC13, rA1, rB3	#if KB > 3					lwc1	ra3, 12(pA3)	#elif KB == 3					swc1	rC22, -CMUL(8)(pC2)	#endif	madd.s	rC23, rC23, rE2, rB3	#if KB == 3					swc1	rC32, -CMUL(4)(pC2)	#endif	madd.s	rC33, rC33, rA3, rB3	#if KB > 3					lwc1	rB3, 12(pB3)	#endif#endif#if KB > 3	#if KB > 4					lwc1	rA1, 16(pA1)	#elif KB == 4					daddu	pA0, pA0, incAm	#endif	madd.s	rC00, rC00, re0, rb0	#if KB == 4					daddu	pA1, pA1, incAm	#elif !defined(BETA0)		prefC(16(pC0))	#endif	madd.s	rC10, rC10, ra1, rb0	#if KB > 6					lwc1	rE0, 24(pA0)	#elif KB == 4					daddu	pA2, pA2, incAm	#endif	madd.s	rC20, rC20, rA2, rb0	#if KB == 4					daddu	pA3, pA3, incAm	#elif !defined(BETA0)		prefC(16(pC1))	#endif	madd.s	rC30, rC30, ra3, rb0	#if KB > 4					lwc1	rB0, 16(pB0)	#elif KB == 4					swc1	rC00, -CMUL(16)(pC0)	#endif	madd.s	rC01, rC01, re0, rB1	#if KB == 4

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?