atl_smm4x4xurx_mips.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 3,232 行 · 第 1/5 页
C
3,232 行
/*
 * Automatically Tuned Linear Algebra Software v3.8.0
 * (C) Copyright 2007 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions, and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The name of the ATLAS group or the names of its contributers may
 * not be used to endorse or promote products derived from this
 * software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Compile-time configuration.
 * KB must already be defined when this file is preprocessed, and values
 * above 80 are rejected.  The kernel body selects its unrolled K steps
 * with "#if KB > n" tests, so KB has to be a preprocessor constant.
 */
#ifndef KB
   #error "This kernel requires KB be a compile-time constant!"
#endif
#if KB > 80
   #error "This kernel supports max KB of 80"
#endif

/*
 * Floating-point register aliases.
 *
 * rCij : element (row i, column j) of the 4x4 block of C.  The kernel
 *        accesses column j through pointer pCj at byte offsets
 *        0, 4, 8, 12 (each passed through CMUL).
 */
#define rC00 $f0
#define rC10 $f1
#define rC20 $f2
#define rC30 $f3
#define rC01 $f4
#define rC11 $f5
#define rC21 $f6
#define rC31 $f7
#define rC02 $f8
#define rC12 $f9
#define rC22 $f10
#define rC32 $f11
#define rC03 $f12
#define rC13 $f13
#define rC23 $f14
#define rC33 $f15
/*
 * rA0-rA3 and rB0-rB3 are first loaded from 0(pA0..pA3) and 0(pB0..pB3).
 * ra0-ra3, rE0, re0, rE2 and rb0 receive operands for later K steps
 * (for example ra0 <- 4(pA0), rE0 <- 8(pA0), re0 <- 12(pA0), rb0 <- 4(pB0)).
 */
#define rA0 $f16
#define rA1 $f17
#define rA2 $f18
#define rA3 $f19
#define rB0 $f20
#define rB1 $f21
#define rB2 $f22
#define rB3 $f23
#define ra0 $f24
#define ra1 $f25
#define ra2 $f26
#define ra3 $f27
#define rE0 $f28
#define re0 $f29
#define rE2 $f30
#define rb0 $f31

/*
 * Integer register aliases.
 *
 * M, N, pA0, incAm, pB0 and incBn name the registers in which the
 * prototype comment further down places the incoming arguments
 * M, N, A, lda, B and ldb ($4, $5, $8, $9, $10, $11).  The alias for
 * $6 (K0) is deliberately left commented out.
 * pC0 and incCn are loaded from the caller's stack in the prologue.
 * The remaining names are working registers initialised by the prologue.
 */
#define M $4
#define N $5
// #define K0 $6
#define pA0 $8
#define incAm $9
#define pB0 $10
#define incBn $11
#define pC0 $7
#define pA1 $12
#define pA2 $13
#define pA3 $14
#define pB1 $15
#define pB2 $16
#define pB3 $17
#define K $18
#define stAm $19
#define stBn $20
#define incAn $21
#define pfA $22
#define pfB $23
#define pC1 $24
#define pC2 $25
#define pC3 $30
#define incCn $2

/*
 * FSIZE is the stack frame size in bytes.  The BETAX build reserves eight
 * extra bytes; the prologue stores its copy of beta at offset 152.
 */
#ifdef BETAX
   #define FSIZE 160
#else
   #define FSIZE 152
#endif

/*
 * Prefetch wrappers: prefB emits "pref 6, mem" and prefC emits
 * "pref 5, mem".  Change the "#if 1" to "#if 0" to compile the
 * corresponding prefetches out entirely.
 */
#if 1
   #define prefB(mem) pref 6, mem
#else
   #define prefB(mem)
#endif
#if 1
   #define prefC(mem) pref 5, mem
#else
   #define prefC(mem)
#endif

/*
 * CMUL scales a byte offset into C: doubled when SCPLX is defined,
 * unchanged otherwise.  Its expansion is used directly as a load/store
 * displacement, e.g. CMUL(4)(pC0).
 */
#ifdef SCPLX
   #define CMUL(i_) ((i_)+(i_))
#else
   #define CMUL(i_) i_
#endif
/*
 * save : 18,19,20,21,22,23
 * $26, $27 reserved. $0 = 0, $1 used by assembler
 * I think can use $31 if I save it & restore it.
r0 = 0 * Avail: $2, $3, $12-25, $28, $30, $31 *//*void ATL_USERMM $4 $5 $6 $f15, $8 (const int M, const int N, const int K, const float alpha, const float *A, $9 $10 $11 0(%sp) const int lda, const float *B, const int ldb, const float beta, 8($sp) 16($sp) float *C, const int ldc)*/.text.align 3.globl ATL_USERMM.ent ATL_USERMMATL_USERMM: .frame $sp,FSIZE,$31 .set noreorder .set nomacro .set noat/* * Adjust stack and save registers */ daddiu $sp, $sp, -FSIZE sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 32($sp) sd $21, 40($sp) sd $22, 64($sp) sd $23, 72($sp) sd $30, 80($sp)#ifdef ATL_USE64BITS sdc1 $f24, 88($sp) sdc1 $f25, 96($sp) sdc1 $f26, 104($sp) sdc1 $f27, 112($sp) sdc1 $f28, 120($sp) sdc1 $f29, 128($sp) sdc1 $f30, 136($sp) sdc1 $f31, 144($sp)#else sdc1 $f20, 88($sp) sdc1 $f22, 96($sp) sdc1 $f24, 104($sp) sdc1 $f26, 112($sp) sdc1 $f28, 120($sp) sdc1 $f30, 128($sp)#endif#ifdef BETAX ldc1 rA0, FSIZE($sp) /* get BETA from caller's stack */ sdc1 rA0, 152($sp) /* save BETA to my stack */ #define BETOFF 152#endif/* * (ldc,lda, ldb, K0) * sizeof; setup column ptrs */#ifdef ATL_USE64BITS ld pC0, FSIZE+8($sp) ld incCn, FSIZE+16($sp)#else lw pC0, FSIZE+8($sp) lw incCn, FSIZE+16($sp)#endif#ifdef SCPLX sll incCn, incCn, 3 /* incCn = ldc*sizeof */#else sll incCn, incCn, 2 /* incCn = ldc*sizeof */#endif sll incAm, incAm, 2 /* incAm = lda*sizeof */ .set macro dmul incAn, incAm, M /* incAn = lda*M */ sll incBn, incBn, 2 /* incBn = ldb*sizeof */ dmul stBn, incBn, N /* stBn = ldb*N */ .set nomacro// sll K0, K0, 3 daddu pA1, pA0, incAm daddu pA2, pA1, incAm daddu pA3, pA2, incAm daddu pB1, pB0, incBn daddu pB2, pB1, incBn daddu pB3, pB2, incBn daddu pC1, pC0, incCn daddu pC2, pC1, incCn daddu pC3, pC2, incCn sll incAm, incAm, 2 /* incAm = lda*4 */ sll incBn, incBn, 2 /* incBn = ldb*4 */ daddu stAm, pA0, incAn /* stAm = pA0 + lda*M */ daddu stBn, pB0, stBn /* stBn = pB0 + ldb*N */#ifdef SCPLX sll pfA, M, 3 /* pfA = M*sizeof */#else sll pfA, M, 2 /* pfA = 
M*sizeof */#endif sll incCn, incCn, 2 /* incCn = ldc*4 */ dsubu incCn, incCn, pfA /* incCn = ldc*4 - M */ or pfA, stAm, $0/* or pfB, stBn, $0 */// daddiu K0, K0, -8NLOOP: daddiu pfB, pB0, 4*4*KBMLOOP: lwc1 rB0, 0(pB0) lwc1 rA0, 0(pA0)#ifdef BETA1 lwc1 rC00, 0(pC0) lwc1 rC10, CMUL(4)(pC0) lwc1 rC20, CMUL(8)(pC0) lwc1 rC30, CMUL(12)(pC0) lwc1 rC01, 0(pC1) lwc1 rC11, CMUL(4)(pC1) lwc1 rC21, CMUL(8)(pC1) lwc1 rC31, CMUL(12)(pC1) lwc1 rC02, 0(pC2)#elif defined(BETAX) lwc1 rB3, BETOFF($sp) /* load BETA */ lwc1 rC00, 0(pC0) lwc1 rC10, CMUL(4)(pC0) lwc1 rC20, CMUL(8)(pC0) lwc1 rC30, CMUL(12)(pC0) lwc1 rC01, 0(pC1) mul.s rC00, rC00, rB3 lwc1 rC11, CMUL(4)(pC1) mul.s rC10, rC10, rB3 lwc1 rC21, CMUL(8)(pC1) mul.s rC20, rC20, rB3 lwc1 rC31, CMUL(12)(pC1) mul.s rC30, rC30, rB3 lwc1 rC02, 0(pC2) mul.s rC01, rC01, rB3 lwc1 rC12, CMUL(4)(pC2) mul.s rC11, rC11, rB3 lwc1 rC22, CMUL(8)(pC2) mul.s rC21, rC21, rB3 lwc1 rC32, CMUL(12)(pC2) mul.s rC31, rC31, rB3 lwc1 rC03, 0(pC3) mul.s rC02, rC02, rB3 lwc1 rC13, CMUL(4)(pC3) mul.s rC12, rC12, rB3 lwc1 rC23, CMUL(8)(pC3) mul.s rC22, rC22, rB3 lwc1 rC33, CMUL(12)(pC3) mul.s rC32, rC32, rB3#endif/* .align 3 *//* KLOOP: */ lwc1 rA1, 0(pA1) lwc1 rA2, 0(pA2) lwc1 rA3, 0(pA3) lwc1 rB1, 0(pB1) lwc1 rB2, 0(pB2) #ifndef BETAX lwc1 rB3, 0(pB3) #endif #if KB > 1 lwc1 ra0, 4(pA0) #endif #if KB > 1 lwc1 ra2, 4(pA2) #endif #if KB > 2 lwc1 rE0, 8(pA0) #endif#if KB > 0 #ifdef BETA0 #if KB > 1 lwc1 ra1, 4(pA1) #endif mul.s rC00, rA0, rB0 daddiu pC0, pC0, CMUL(16) mul.s rC10, rA1, rB0 #if KB > 3 lwc1 re0, 12(pA0) #endif mul.s rC20, rA2, rB0 daddiu pC1, pC1, CMUL(16) mul.s rC30, rA3, rB0 #if KB > 1 lwc1 rb0, 4(pB0) #endif mul.s rC01, rA0, rB1 prefC(-CMUL(16)(pC0)) mul.s rC11, rA1, rB1 daddiu pC2, pC2, CMUL(16) mul.s rC21, rA2, rB1 daddiu pC3, pC3, CMUL(16) mul.s rC31, rA3, rB1 #if KB > 1 lwc1 rB1, 4(pB1) #endif mul.s rC02, rA0, rB2 mul.s rC12, rA1, rB2 #if KB > 2 lwc1 rE2, 8(pA2) #endif mul.s rC22, rA2, rB2 prefC(-CMUL(16)(pC1)) mul.s rC32, rA3, rB2 #if KB 
> 1 lwc1 rB2, 4(pB2) #endif mul.s rC03, rA0, rB3 prefC(-CMUL(16)(pC2)) mul.s rC13, rA1, rB3 #if KB > 1 lwc1 ra3, 4(pA3) #endif mul.s rC23, rA2, rB3 prefC(-CMUL(16)(pC3)) mul.s rC33, rA3, rB3 #if KB > 1 lwc1 rB3, 4(pB3) #endif #elif defined(BETAX) #if KB > 1 lwc1 ra1, 4(pA1) #endif madd.s rC00, rC00, rA0, rB0 daddiu pC0, pC0, CMUL(16) madd.s rC10, rC10, rA1, rB0 mul.s rC03, rC03, rB3 #if KB > 3 lwc1 re0, 12(pA0) #endif madd.s rC20, rC20, rA2, rB0 daddiu pC1, pC1, CMUL(16) madd.s rC30, rC30, rA3, rB0 mul.s rC13, rC13, rB3 #if KB > 1 lwc1 rb0, 4(pB0) #endif madd.s rC01, rC01, rA0, rB1 daddiu pC2, pC2, CMUL(16) madd.s rC11, rC11, rA1, rB1 daddiu pC3, pC3, CMUL(16) madd.s rC21, rC21, rA2, rB1 madd.s rC31, rC31, rA3, rB1 mul.s rC23, rC23, rB3 #if KB > 1 lwc1 rB1, 4(pB1) #endif mul.s rC33, rC33, rB3 lwc1 rB3, 0(pB3) madd.s rC02, rC02, rA0, rB2 madd.s rC12, rC12, rA1, rB2 madd.s rC22, rC22, rA2, rB2 #if KB > 2 lwc1 rE2, 8(pA2) #endif madd.s rC32, rC32, rA3, rB2 #if KB > 1 lwc1 rB2, 4(pB2) #endif madd.s rC03, rC03, rA0, rB3 madd.s rC13, rC13, rA1, rB3 #if KB > 1 lwc1 ra3, 4(pA3) #endif madd.s rC23, rC23, rA2, rB3 madd.s rC33, rC33, rA3, rB3 #if KB > 1 lwc1 rB3, 4(pB3) #endif #else /* BETA = 1 */ #if KB > 1 lwc1 ra1, 4(pA1) #endif madd.s rC00, rC00, rA0, rB0 lwc1 rC12, CMUL(4)(pC2) madd.s rC10, rC10, rA1, rB0 lwc1 rC22, CMUL(8)(pC2) madd.s rC20, rC20, rA2, rB0 lwc1 rC32, CMUL(12)(pC2) madd.s rC30, rC30, rA3, rB0 lwc1 rC03, 0(pC3) madd.s rC01, rC01, rA0, rB1 lwc1 rC13, CMUL(4)(pC3) madd.s rC11, rC11, rA1, rB1 lwc1 rC23, CMUL(8)(pC3) madd.s rC21, rC21, rA2, rB1 #if KB > 3 lwc1 re0, 12(pA0) #endif madd.s rC31, rC31, rA3, rB1 #if KB > 1 lwc1 rb0, 4(pB0) #endif madd.s rC02, rC02, rA0, rB2 #if KB > 1 lwc1 rB1, 4(pB1) #endif madd.s rC12, rC12, rA1, rB2 #if KB > 2 lwc1 rE2, 8(pA2) #endif madd.s rC22, rC22, rA2, rB2 lwc1 rC33, CMUL(12)(pC3) madd.s rC32, rC32, rA3, rB2 #if KB > 1 lwc1 rB2, 4(pB2) #endif madd.s rC03, rC03, rA0, rB3 daddiu pC0, pC0, CMUL(16) madd.s rC13, rC13, rA1, rB3 
#if KB > 1 lwc1 ra3, 4(pA3) #endif madd.s rC23, rC23, rA2, rB3 daddiu pC1, pC1, CMUL(16) madd.s rC33, rC33, rA3, rB3 #if KB > 1 lwc1 rB3, 4(pB3) #endif #endif /* end BETA specialization */#endif#if KB <= 2 && defined(BETA1) daddiu pC2, pC2, CMUL(16) daddiu pC3, pC3, CMUL(16)#endif .align 3#if KB > 1 #if KB > 2 lwc1 rA1, 8(pA1) #elif KB == 2 daddu pA0, pA0, incAm #endif madd.s rC00, rC00, ra0, rb0 #if KB == 2 daddu pA1, pA1, incAm #endif madd.s rC10, rC10, ra1, rb0 #if KB > 4 lwc1 rA0, 16(pA0) #elif KB == 2 daddu pA2, pA2, incAm #endif madd.s rC20, rC20, ra2, rb0 #if KB == 2 daddu pA3, pA3, incAm #elif defined(BETA1) daddiu pC2, pC2, CMUL(16) #endif madd.s rC30, rC30, ra3, rb0 #if KB > 2 lwc1 rB0, 8(pB0) #elif KB == 2 swc1 rC00, -CMUL(16)(pC0) #endif madd.s rC01, rC01, ra0, rB1 #if KB == 2 swc1 rC10, -CMUL(12)(pC0) #elif defined(BETA1) daddiu pC3, pC3, CMUL(16) #endif madd.s rC11, rC11, ra1, rB1 #if KB == 2 swc1 rC20, -CMUL(8)(pC0) #endif madd.s rC21, rC21, ra2, rB1 #if KB == 2 swc1 rC30, -CMUL(4)(pC0) #endif madd.s rC31, rC31, ra3, rB1 #if KB > 2 lwc1 rB1, 8(pB1) #elif KB == 2 swc1 rC01, -CMUL(16)(pC1) #endif madd.s rC02, rC02, ra0, rB2 #if KB == 2 swc1 rC11, -CMUL(12)(pC1) #endif madd.s rC12, rC12, ra1, rB2 #if KB > 3 lwc1 rA2, 12(pA2) #elif KB == 2 swc1 rC21, -CMUL(8)(pC1) #endif madd.s rC22, rC22, ra2, rB2 #if KB == 2 swc1 rC31, -CMUL(4)(pC1) #endif madd.s rC32, rC32, ra3, rB2 #if KB > 2 lwc1 rB2, 8(pB2) #elif KB == 2 swc1 rC02, -CMUL(16)(pC2) #endif madd.s rC03, rC03, ra0, rB3 #if KB == 2 swc1 rC12, -CMUL(12)(pC2) #endif madd.s rC13, rC13, ra1, rB3 #if KB > 2 lwc1 rA3, 8(pA3) #elif KB == 2 swc1 rC22, -CMUL(8)(pC2) #endif madd.s rC23, rC23, ra2, rB3 #if KB == 2 swc1 rC32, -CMUL(4)(pC2) #endif madd.s rC33, rC33, ra3, rB3 #if KB > 2 lwc1 rB3, 8(pB3) #endif#endif#if KB > 2 #if KB > 3 lwc1 ra1, 12(pA1) #elif KB == 3 daddu pA0, pA0, incAm #endif madd.s rC00, rC00, rE0, rB0 #if KB == 3 daddu pA1, pA1, incAm #endif madd.s rC10, rC10, rA1, rB0 #if KB > 5 lwc1 ra0, 
20(pA0) #elif KB == 3 daddu pA2, pA2, incAm #endif madd.s rC20, rC20, rE2, rB0 #if KB == 3 daddu pA3, pA3, incAm #endif madd.s rC30, rC30, rA3, rB0 #if KB > 3 lwc1 rb0, 12(pB0) #elif KB == 3 swc1 rC00, -CMUL(16)(pC0) #endif madd.s rC01, rC01, rE0, rB1 #if KB == 3 swc1 rC10, -CMUL(12)(pC0) #endif madd.s rC11, rC11, rA1, rB1 #if KB == 3 swc1 rC20, -CMUL(8)(pC0) #endif madd.s rC21, rC21, rE2, rB1 #if KB == 3 swc1 rC30, -CMUL(4)(pC0) #endif madd.s rC31, rC31, rA3, rB1 #if KB > 3 lwc1 rB1, 12(pB1) #elif KB == 3 swc1 rC01, -CMUL(16)(pC1) #endif madd.s rC02, rC02, rE0, rB2 #if KB == 3 swc1 rC11, -CMUL(12)(pC1) #endif madd.s rC12, rC12, rA1, rB2 #if KB > 4 lwc1 ra2, 16(pA2) #elif KB == 3 swc1 rC21, -CMUL(8)(pC1) #endif madd.s rC22, rC22, rE2, rB2 #if KB == 3 swc1 rC31, -CMUL(4)(pC1) #endif madd.s rC32, rC32, rA3, rB2 #if KB > 3 lwc1 rB2, 12(pB2) #elif KB == 3 swc1 rC02, -CMUL(16)(pC2) #endif madd.s rC03, rC03, rE0, rB3 #if KB == 3 swc1 rC12, -CMUL(12)(pC2) #endif madd.s rC13, rC13, rA1, rB3 #if KB > 3 lwc1 ra3, 12(pA3) #elif KB == 3 swc1 rC22, -CMUL(8)(pC2) #endif madd.s rC23, rC23, rE2, rB3 #if KB == 3 swc1 rC32, -CMUL(4)(pC2) #endif madd.s rC33, rC33, rA3, rB3 #if KB > 3 lwc1 rB3, 12(pB3) #endif#endif#if KB > 3 #if KB > 4 lwc1 rA1, 16(pA1) #elif KB == 4 daddu pA0, pA0, incAm #endif madd.s rC00, rC00, re0, rb0 #if KB == 4 daddu pA1, pA1, incAm #elif !defined(BETA0) prefC(16(pC0)) #endif madd.s rC10, rC10, ra1, rb0 #if KB > 6 lwc1 rE0, 24(pA0) #elif KB == 4 daddu pA2, pA2, incAm #endif madd.s rC20, rC20, rA2, rb0 #if KB == 4 daddu pA3, pA3, incAm #elif !defined(BETA0) prefC(16(pC1)) #endif madd.s rC30, rC30, ra3, rb0 #if KB > 4 lwc1 rB0, 16(pB0) #elif KB == 4 swc1 rC00, -CMUL(16)(pC0) #endif madd.s rC01, rC01, re0, rB1 #if KB == 4
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?