atl_dmm4x4xur2_mips.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,473 行 · 第 1/5 页
C
2,473 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2007 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */
/*
 * Compile-time configuration and register allocation for the 4x4
 * register-blocked kernel that follows.  This file is assembly that is
 * passed through the C preprocessor, so each directive must start its own
 * line.
 */

/*
 * KB must be supplied by the build (e.g. -DKB=...), and may be at most 80.
 */
#ifndef KB
   #error "This kernel requires KB be a compile-time constant!"
#endif
#if KB > 80
   #error "This kernel supports max KB of 80"
#endif

/*
 * Floating-point accumulators for the 4x4 block of C.  rCij is loaded from
 * byte offset i*8 (scaled by CMUL) off column pointer pCj, and the kernel
 * accumulates rCij += rAi * rBj (or ra_i * rb_j).
 */
#define rC00 $f0
#define rC10 $f1
#define rC20 $f2
#define rC30 $f3
#define rC01 $f4
#define rC11 $f5
#define rC21 $f6
#define rC31 $f7
#define rC02 $f8
#define rC12 $f9
#define rC22 $f10
#define rC32 $f11
#define rC03 $f12
#define rC13 $f13
#define rC23 $f14
#define rC33 $f15

/*
 * Two sets of A/B operand registers (upper-case and lower-case names).
 * The unrolled code loads one set while multiplying with the other.
 */
#define rA0 $f16
#define rA1 $f17
#define rA2 $f18
#define rA3 $f19
#define rB0 $f20
#define rB1 $f21
#define rB2 $f22
#define rB3 $f23
#define ra0 $f24
#define ra1 $f25
#define ra2 $f26
#define ra3 $f27
#define rb0 $f28
#define rb1 $f29
#define rb2 $f30
#define rb3 $f31

/*
 * Integer registers holding incoming arguments.  Per the prototype comment
 * further down in this file: $4=M, $5=N, $6=K, $8=A, $9=lda, $10=B, $11=ldb.
 * incAm/incBn start out as lda/ldb and are scaled to byte strides by the
 * prologue.  pC0 ($7) is loaded from the caller's stack by the prologue.
 */
#define M $4
#define N $5
#define K0 $6
#define pA0 $8
#define incAm $9
#define pB0 $10
#define incBn $11
#define pC0 $7

/*
 * Additional pointers into A and B; the prologue sets pA1..pA3 and
 * pB1..pB3 by repeatedly adding incAm / incBn to pA0 / pB0.
 * $16 and $17 are saved on the stack by the prologue.
 */
#define pA1 $12
#define pA2 $13
#define pA3 $14
#define pB1 $15
#define pB2 $16
#define pB3 $17

/*
 * Loop bookkeeping held in $18-$23, all of which the prologue saves.
 * stAm = pA0 + lda*M and stBn = pB0 + ldb*N (byte-scaled); pfA/pfB are
 * initialised from stAm/stBn (presumably prefetch pointers -- confirm
 * against the part of the kernel that uses them).
 */
#define K $18
#define stAm $19
#define stBn $20
#define incAn $21
#define pfA $22
#define pfB $23

/*
 * Remaining C column pointers and the C stride.  $30 is saved by the
 * prologue; $24, $25 and $2 are not.
 */
#define pC1 $24
#define pC2 $25
#define pC3 $30
#define incCn $2

/*
 * Stack frame size in bytes.  The BETAX build needs 8 extra bytes because
 * the prologue stores a copy of BETA at offset 152 of its own frame.
 */
#ifdef BETAX
   #define FSIZE 160
#else
   #define FSIZE 152
#endif

/*
 * Prefetch distance for A in bytes.  NOTE(review): the prefA() uses in the
 * visible part of the kernel spell out KB*8*8 instead of using this macro.
 */
#define PFDISTA KB*8*8

/*
 * Prefetch wrappers.  Flip the #if 0 / #if 1 to enable or disable each one;
 * a disabled wrapper expands to nothing.  As shipped: prefA is disabled,
 * prefB emits "pref 6, mem", prefC emits "pref 5, mem".
 */
#if 0
   #define prefA(mem) pref 6, mem
#else
   #define prefA(mem)
#endif
#if 1
   #define prefB(mem) pref 6, mem
#else
   #define prefB(mem)
#endif
#if 1
   #define prefC(mem) pref 5, mem
#else
   #define prefC(mem)
#endif

/*
 * CMUL scales a byte offset into C: doubled when DCPLX is defined,
 * unchanged otherwise (presumably because complex C interleaves real and
 * imaginary parts -- confirm against the caller).
 */
#ifdef DCPLX
   #define CMUL(i_) ((i_)+(i_))
#else
   #define CMUL(i_) i_
#endif
/*
 * save : 18,19,20,21,22,23
 * $26, $27 reserved.  $0 = 0, $1 used by assembler
 * I think can use $31 if I save it & restore it.
r0 = 0 * Avail: $2, $3, $12-25, $28, $30, $31 *//*void ATL_USERMM $4 $5 $6 $f15, $8 (const int M, const int N, const int K, const double alpha, const double *A, $9 $10 $11 0(%sp) const int lda, const double *B, const int ldb, const double beta, 8($sp) 16($sp) double *C, const int ldc)*/.text.align 3.globl ATL_USERMM.ent ATL_USERMMATL_USERMM: .frame $sp,FSIZE,$31 .set noreorder .set nomacro .set noat/* * Adjust stack and save registers */ daddiu $sp, $sp, -FSIZE sd $16, 0($sp) sd $17, 8($sp) sd $18, 16($sp) sd $19, 24($sp) sd $20, 32($sp) sd $21, 40($sp) sd $22, 64($sp) sd $23, 72($sp) sd $30, 80($sp)#ifdef ATL_USE64BITS sdc1 $f24, 88($sp) sdc1 $f25, 96($sp) sdc1 $f26, 104($sp) sdc1 $f27, 112($sp) sdc1 $f28, 120($sp) sdc1 $f29, 128($sp) sdc1 $f30, 136($sp) sdc1 $f31, 144($sp)#else sdc1 $f20, 88($sp) sdc1 $f22, 96($sp) sdc1 $f24, 104($sp) sdc1 $f26, 112($sp) sdc1 $f28, 120($sp) sdc1 $f30, 128($sp)#endif#ifdef BETAX ldc1 rA0, FSIZE($sp) /* get BETA from caller's stack */ sdc1 rA0, 152($sp) /* save BETA to my stack */ #define BETOFF 152#endif/* * (ldc,lda, ldb, K0) * sizeof; setup column ptrs */#ifdef ATL_USE64BITS ld pC0, FSIZE+8($sp) ld incCn, FSIZE+16($sp)#else lw pC0, FSIZE+8($sp) lw incCn, FSIZE+16($sp)#endif#ifdef DCPLX sll incCn, incCn, 4 /* incCn = ldc*sizeof */#else sll incCn, incCn, 3 /* incCn = ldc*sizeof */#endif sll incAm, incAm, 3 /* incAm = lda*sizeof */ .set macro dmul incAn, incAm, M /* incAn = lda*M */ sll incBn, incBn, 3 /* incBn = ldb*sizeof */ dmul stBn, incBn, N /* stBn = ldb*N */ .set nomacro sll K0, K0, 3 daddu pA1, pA0, incAm daddu pA2, pA1, incAm daddu pA3, pA2, incAm daddu pB1, pB0, incBn daddu pB2, pB1, incBn daddu pB3, pB2, incBn daddu pC1, pC0, incCn daddu pC2, pC1, incCn daddu pC3, pC2, incCn sll incAm, incAm, 2 /* incAm = lda*4 */ sll incBn, incBn, 2 /* incBn = ldb*4 */ daddu stAm, pA0, incAn /* stAm = pA0 + lda*M */ daddu stBn, pB0, stBn /* stBn = pB0 + ldb*N */#ifdef DCPLX sll pfA, M, 4 /* pfA = M*sizeof */#else sll pfA, M, 3 /* pfA = 
M*sizeof */#endif sll incCn, incCn, 2 /* incCn = ldc*4 */ dsubu incCn, incCn, pfA /* incCn = ldc*4 - M */ or pfA, stAm, $0 or pfB, stBn, $0 daddiu K0, K0, -8NLOOP:MLOOP: ldc1 rB0, 0(pB0) ldc1 rA0, 0(pA0) ldc1 rA1, 0(pA1) ldc1 rA2, 0(pA2) ldc1 rA3, 0(pA3) ldc1 rB1, 0(pB1) ldc1 rB2, 0(pB2)#ifndef BETAX ldc1 rB3, 0(pB3)#endif#ifdef BETA1 ldc1 rC00, 0(pC0) ldc1 rC01, 0(pC1) ldc1 rC10, CMUL(8)(pC0) ldc1 rC20, CMUL(16)(pC0) ldc1 rC30, CMUL(24)(pC0) ldc1 rC11, CMUL(8)(pC1) ldc1 rC21, CMUL(16)(pC1) ldc1 rC31, CMUL(24)(pC1) ldc1 rC02, 0(pC2) ldc1 rC03, 0(pC3) ldc1 rC12, CMUL(8)(pC2) ldc1 rC22, CMUL(16)(pC2) ldc1 rC32, CMUL(24)(pC2) #if KB <= 2 ldc1 rC13, CMUL(8)(pC3) ldc1 rC23, CMUL(16)(pC3) ldc1 rC33, CMUL(24)(pC3) #endif#elif defined(BETAX) ldc1 rB3, BETOFF($sp) /* load BETA */ ldc1 rC00, 0(pC0) ldc1 rC10, CMUL(8)(pC0) ldc1 rC20, CMUL(16)(pC0) ldc1 rC30, CMUL(24)(pC0) ldc1 rC01, 0(pC1) mul.d rC00, rC00, rB3 ldc1 rC11, CMUL(8)(pC1) mul.d rC10, rC10, rB3 ldc1 rC21, CMUL(16)(pC1) mul.d rC20, rC20, rB3 ldc1 rC31, CMUL(24)(pC1) mul.d rC30, rC30, rB3 ldc1 rC02, 0(pC2) mul.d rC01, rC01, rB3 ldc1 rC12, CMUL(8)(pC2) mul.d rC11, rC11, rB3 ldc1 rC22, CMUL(16)(pC2) mul.d rC21, rC21, rB3 ldc1 rC32, CMUL(24)(pC2) mul.d rC31, rC31, rB3 ldc1 rC03, 0(pC3) mul.d rC02, rC02, rB3 ldc1 rC13, CMUL(8)(pC3) mul.d rC12, rC12, rB3 ldc1 rC23, CMUL(16)(pC3) mul.d rC22, rC22, rB3 ldc1 rC33, CMUL(24)(pC3) mul.d rC32, rC32, rB3 mul.d rC03, rC03, rB3 mul.d rC13, rC13, rB3 mul.d rC23, rC23, rB3 mul.d rC33, rC33, rB3 ldc1 rB3, 0(pB3)#endif#if KB <= 2 && defined(BETA0) dmtc1 $0, rC00 mov.d rC10, rC00 mov.d rC20, rC00 mov.d rC30, rC00 mov.d rC01, rC00 mov.d rC11, rC00 mov.d rC21, rC00 mov.d rC31, rC00 mov.d rC02, rC00 mov.d rC12, rC00 mov.d rC22, rC00 mov.d rC32, rC00 mov.d rC03, rC00 mov.d rC13, rC00 mov.d rC23, rC00 mov.d rC33, rC00#endif .align 3/* KLOOP: */#if KB > 2 #ifdef BETA0 ldc1 rb0, 8(pB0) mul.d rC00, rA0, rB0 ldc1 ra0, 8(pA0) mul.d rC10, rA1, rB0 prefC((pC0)) mul.d rC20, rA2, rB0 prefC((pC1)) 
mul.d rC30, rA3, rB0 prefC((pC2)) mul.d rC01, rA0, rB1 prefC((pC3)) mul.d rC11, rA1, rB1 prefA(KB*8*8(pA0)) mul.d rC21, rA2, rB1 prefA(KB*8*8(pA1)) mul.d rC31, rA3, rB1 prefA(KB*8*8(pA2)) mul.d rC02, rA0, rB2 prefA(KB*8*8(pA3)) mul.d rC12, rA1, rB2 ldc1 ra1, 8(pA1) mul.d rC22, rA2, rB2 ldc1 ra2, 8(pA2) mul.d rC32, rA3, rB2 ldc1 ra3, 8(pA3) mul.d rC03, rA0, rB3 ldc1 rb1, 8(pB1) mul.d rC13, rA1, rB3 ldc1 rb2, 8(pB2) mul.d rC23, rA2, rB3 ldc1 rb3, 8(pB3) mul.d rC33, rA3, rB3 #else ldc1 rb0, 8(pB0) madd.d rC00, rC00, rA0, rB0 ldc1 ra0, 8(pA0) madd.d rC10, rC10, rA1, rB0 madd.d rC20, rC20, rA2, rB0 #ifndef BETAX ldc1 rC13, CMUL(8)(pC3) #endif madd.d rC30, rC30, rA3, rB0 #ifndef BETAX ldc1 rC23, CMUL(16)(pC3) #endif madd.d rC01, rC01, rA0, rB1 #ifndef BETAX ldc1 rC33, CMUL(24)(pC3) #endif madd.d rC11, rC11, rA1, rB1 prefA(KB*8*8(pA0)) madd.d rC21, rC21, rA2, rB1 prefA(KB*8*8(pA1)) madd.d rC31, rC31, rA3, rB1 prefA(KB*8*8(pA2)) madd.d rC02, rC02, rA0, rB2 prefA(KB*8*8(pA3)) madd.d rC12, rC12, rA1, rB2 ldc1 ra1, 8(pA1) madd.d rC22, rC22, rA2, rB2 ldc1 ra2, 8(pA2) madd.d rC32, rC32, rA3, rB2 ldc1 ra3, 8(pA3) madd.d rC03, rC03, rA0, rB3 ldc1 rb1, 8(pB1) madd.d rC13, rC13, rA1, rB3 ldc1 rb2, 8(pB2) madd.d rC23, rC23, rA2, rB3 ldc1 rb3, 8(pB3) madd.d rC33, rC33, rA3, rB3 #endif /* if BETA != 0 */ ldc1 rB0, 16(pB0) madd.d rC00, rC00, ra0, rb0 ldc1 rA0, 16(pA0) madd.d rC10, rC10, ra1, rb0 ldc1 rA1, 16(pA1) madd.d rC20, rC20, ra2, rb0 ldc1 rA2, 16(pA2) madd.d rC30, rC30, ra3, rb0 ldc1 rA3, 16(pA3) madd.d rC01, rC01, ra0, rb1 ldc1 rB1, 16(pB1) madd.d rC11, rC11, ra1, rb1 ldc1 rB2, 16(pB2) madd.d rC21, rC21, ra2, rb1 ldc1 rB3, 16(pB3) madd.d rC31, rC31, ra3, rb1 prefA(32+KB*8*8(pA0)) madd.d rC02, rC02, ra0, rb2 prefA(32+KB*8*8(pA1)) madd.d rC12, rC12, ra1, rb2 prefA(32+KB*8*8(pA2)) madd.d rC22, rC22, ra2, rb2 prefA(32+KB*8*8(pA3)) madd.d rC32, rC32, ra3, rb2 prefA(64+KB*8*8(pA0)) madd.d rC03, rC03, ra0, rb3 prefA(64+KB*8*8(pA1)) madd.d rC13, rC13, ra1, rb3 prefA(64+KB*8*8(pA2)) 
madd.d rC23, rC23, ra2, rb3 prefA(64+KB*8*8(pA3)) madd.d rC33, rC33, ra3, rb3#endif#if KB > 4 ldc1 rb0, 24(pB0) madd.d rC00, rC00, rA0, rB0 ldc1 ra0, 24(pA0) madd.d rC10, rC10, rA1, rB0 ldc1 ra1, 24(pA1) madd.d rC20, rC20, rA2, rB0 ldc1 ra2, 24(pA2) madd.d rC30, rC30, rA3, rB0 ldc1 ra3, 24(pA3) madd.d rC01, rC01, rA0, rB1 ldc1 rb1, 24(pB1) madd.d rC11, rC11, rA1, rB1 ldc1 rb2, 24(pB2) madd.d rC21, rC21, rA2, rB1 ldc1 rb3, 24(pB3) madd.d rC31, rC31, rA3, rB1 prefA(96+KB*8*8(pA0)) madd.d rC02, rC02, rA0, rB2 prefA(96+KB*8*8(pA1)) madd.d rC12, rC12, rA1, rB2 prefA(96+KB*8*8(pA2)) madd.d rC22, rC22, rA2, rB2 prefA(96+KB*8*8(pA3)) madd.d rC32, rC32, rA3, rB2 prefA(128+KB*8*8(pA0)) madd.d rC03, rC03, rA0, rB3 prefA(128+KB*8*8(pA1)) madd.d rC13, rC13, rA1, rB3 prefA(128+KB*8*8(pA2)) madd.d rC23, rC23, rA2, rB3 prefA(128+KB*8*8(pA3)) madd.d rC33, rC33, rA3, rB3 ldc1 rB0, 32(pB0) madd.d rC00, rC00, ra0, rb0 ldc1 rA0, 32(pA0) madd.d rC10, rC10, ra1, rb0 ldc1 rA1, 32(pA1) madd.d rC20, rC20, ra2, rb0 ldc1 rA2, 32(pA2) madd.d rC30, rC30, ra3, rb0 ldc1 rA3, 32(pA3) madd.d rC01, rC01, ra0, rb1 ldc1 rB1, 32(pB1) madd.d rC11, rC11, ra1, rb1 ldc1 rB2, 32(pB2) madd.d rC21, rC21, ra2, rb1 ldc1 rB3, 32(pB3) madd.d rC31, rC31, ra3, rb1 madd.d rC02, rC02, ra0, rb2 madd.d rC12, rC12, ra1, rb2 madd.d rC22, rC22, ra2, rb2 madd.d rC32, rC32, ra3, rb2 madd.d rC03, rC03, ra0, rb3 madd.d rC13, rC13, ra1, rb3 madd.d rC23, rC23, ra2, rb3 madd.d rC33, rC33, ra3, rb3#endif#if KB > 6 ldc1 rb0, 40(pB0) madd.d rC00, rC00, rA0, rB0 ldc1 ra0, 40(pA0) madd.d rC10, rC10, rA1, rB0 ldc1 ra1, 40(pA1) madd.d rC20, rC20, rA2, rB0 ldc1 ra2, 40(pA2) madd.d rC30, rC30, rA3, rB0 ldc1 ra3, 40(pA3) madd.d rC01, rC01, rA0, rB1 ldc1 rb1, 40(pB1) madd.d rC11, rC11, rA1, rB1 ldc1 rb2, 40(pB2) madd.d rC21, rC21, rA2, rB1 ldc1 rb3, 40(pB3) madd.d rC31, rC31, rA3, rB1 madd.d rC02, rC02, rA0, rB2
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?