atl_dmm4x4x80_ppc.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,991 行 · 第 1/5 页
C
2,991 行
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2007 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include "atlas_asm.h"
/*
 * NOTE: this kernel written by R. Clint Whaley, but it uses two key ideas
 * discovered by Tony Castaldo for the PowerPC970:
 * (1) Instructions must be issued in groups of 4 like inst (eg. 4 iop/ld,
 *     4 fp, etc.)
 * (2) It is effective to somewhat intermix M-loop iterations
 */

/*
 * Build-time requirements: exactly one of the three supported PPC assembler
 * dialect macros must be defined, and KB must be a non-zero constant because
 * it is used below in immediate displacements and in #if KB > n tests.
 */
#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC) && \
    !defined(ATL_AS_AIX_PPC)
   #error "This kernel requires OS X, AIX, or Linux PPC assembler!"
#endif
#if !defined(KB) || KB == 0
   #error "This kernel requires KB be a compile-time constant!"
#endif
/*
 * MB defaults to 0.  The code below emits its "cmpwi cr5, M, 0" /
 * "beq- cr5, MPEELED" test only under #if MB == 0.
 */
#ifndef MB
   #define MB 0
#endif
/*
 * CMUL scales a byte displacement into C and SHF is the shift count applied
 * to ldc and M to turn element counts into byte counts.  With DCPLX defined
 * both are doubled (displacement *2, shift 4 instead of 3).
 */
#ifdef DCPLX
   #define CMUL(i_) ((i_)*2)
   #define SHF 4
#else
   #define CMUL(i_) i_
   #define SHF 3
#endif
/*
 * The kernel body is written with one set of mnemonics.  In 64-bit builds the
 * word shift/compare mnemonics are renamed to their doubleword forms; in
 * 32-bit builds the doubleword store/load mnemonics are renamed to the word
 * forms.
 */
#ifdef ATL_USE64BITS
   #define slwi sldi
   #define srwi srdi
   #define cmpwi cmpdi
#else
   #define std stw
   #define ld lwz
#endif
/*
 * Symbolic names for the integer registers, one table per ABI:
 *   M, N        loop dimensions
 *   pA0, pB0    pointers the lfd loads of A and B are based on
 *   pC0..pC3    pointers to four consecutive columns of C (each is the
 *               previous one plus ldc*sizeof)
 *   pfA, pfB    addresses handed to dcbt and advanced by 128 afterwards
 *   incAn       M*KB*sizeof
 *   incCn       ldc*4 - M, in bytes
 *   NEG(i_)     sign applied to the r1-relative offsets of the register
 *               save area
 *   FSIZE       size of the stack frame opened with stwu (0 = none opened)
 *   BOFF        r1-relative offset of the slot where beta is stored
 */
#if defined(ATL_USE64BITS)
   #define M r3
   #define N r4
   #define pA0 r7
   #define pB0 r9
   #define pC0 r10
   #define pC1 r5
   #define pC2 r6
   #define pC3 r8
   #define pfA r11
   #define incAn r0
   #define incCn r12
   #define pfB r14
   /* negative offsets: registers are saved below r1 without a new frame */
   #define NEG(i_) -i_
/* 64 bit ABI defines red zone! */
   #define FSIZE 0
   /* saves occupy offsets 8..152 below r1, so beta takes the next slot */
   #define BOFF -160
#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)
   #define M r3
   #define N r4
   #define pA0 r8
   #define pB0 r10
   #define pC0 r6
   #define pC1 r7
   #define pC2 r9
   #define pC3 r11
   #define pfA r12
   #define incAn r0
   #define incCn r5
   #define pfB r14
   #define FSIZE 0
   #define BOFF -160
   #define NEG(i_) -i_
#else /* 32-bit linux has no red zone */
   #define M r3
   #define N r4
   #define pA0 r6
   #define pB0 r8
   #define pC0 r10
   #define pC1 r5
   #define pC2 r7
   #define pC3 r9
   #define pfA r11
   #define incAn r0
   #define incCn r12
   #define pfB r14
   /* a frame of FSIZE bytes is opened first, so save offsets are positive */
   #define NEG(i_) i_
   #define FSIZE 172
   /* expands unparenthesized; it is only used as a displacement on r1 */
   #define BOFF FSIZE-8
#endif
/*
 * Floating point register names.  rA/rB and ra/rb are two separate sets of
 * A and B operands: the unrolled K blocks alternate between them, loading
 * one set while the fmadds consume the other.
 */
#define rA0 f0
#define rA1 f1
#define rA2 f2
#define rA3 f3
#define rB0 f4
#define rB1 f5
#define rB2 f6
#define rB3 f7
#define ra0 f8
#define ra1 f9
#define ra2 f10
#define ra3 f11
#define rb0 f12
#define rb1 f13
#define rb2 f14
#define rb3 f15
/* rCij accumulates row i, column j of the 4x4 block of C */
#define rC00 f16
#define rC10 f17
#define rC20 f18
#define rC30 f19
#define rC01 f20
#define rC11 f21
#define rC21 f22
#define rC31 f23
#define rC02 f24
#define rC12 f25
#define rC22 f26
#define rC32 f27
#define rC03 f28
#define rC13 f29
#define rC23 f30
#define rC33 f31
/*
 * KBn = n*KB element offsets; multiplied by 8 they form the displacements
 * that reach the n-th following row of A / column of B.
 */
#ifndef KB1
   #define KB1 KB
#endif
/*
 * NOTE(review): in this copy of the file the line break falls inside the
 * next definition; the macro name and value (KB3 KB*3) continue on the
 * following source line.
 */
#ifndef KB2
   #define KB2 KB*2
   #define 
KB3 KB*3 #define KB4 KB*4 #define KB5 KB*5 #define KB6 KB*6 #define KB7 KB*7#endif#if 0*******************************************************************************32 bit ABIs: r3 r4 r5 r6-r7,f1void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, (r6) r8 (r7) r9 (r8) r10 (r9) 56(r1) const TYPE *A, const int lda, const TYPE *B, const int ldb, f2 68(r1) 72(r1) const TYPE beta, TYPE *C, const int ldc) (r10) 8(r1)*******************************************************************************64 bit ABIs: r3 r4 r5 r6/f1void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, r7 r8 r9 r10 const TYPE *A, const int lda, const TYPE *B, const int ldb, f2 120(r1) 128(r1) const TYPE beta, TYPE *C, const int ldc)#endif#ifdef ATL_AS_AIX_PPC .csect .text[PR] .toc .csect .text[PR] .align 3 .globl ATL_USERMM .globl Mjoin(.,ATL_USERMM) #ifdef ATL_USE64BITS .csect ATL_USERMM[DS],3ATL_USERMM: .llong Mjoin(.,ATL_USERMM) #else .csect ATL_USERMM[DS]ATL_USERMM: .long Mjoin(.,ATL_USERMM), TOC[tc0], 0 #endif .csect .text[PR]Mjoin(.,ATL_USERMM):#else.text #ifdef ATL_AS_OSX_PPC .globl Mjoin(_,ATL_USERMM)Mjoin(_,ATL_USERMM): #else #if defined(ATL_USE64BITS)/* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ .section ".opd","aw" .align 3 .globl ATL_USERMMATL_USERMM: .quad Mjoin(.,ATL_USERMM),.TOC.@tocbase,0 .previous .type Mjoin(.,ATL_USERMM),@function .text .globl Mjoin(.,ATL_USERMM)Mjoin(.,ATL_USERMM): #else .globl ATL_USERMMATL_USERMM: #endif #endif#endif/* Save regs */#if defined(ATL_GAS_LINUX_PPC) && !defined(ATL_USE64BITS) stwu r1, -FSIZE(r1)#endif stfd f14, NEG(8)(r1) stfd f15, NEG(16)(r1) stfd f16, NEG(24)(r1) stfd f17, NEG(32)(r1) stfd f18, NEG(40)(r1) stfd f19, NEG(48)(r1) stfd f20, NEG(56)(r1) stfd f21, NEG(64)(r1) stfd f22, NEG(72)(r1) stfd f23, NEG(80)(r1) stfd f24, NEG(88)(r1) stfd f25, NEG(96)(r1) stfd f26, NEG(104)(r1) stfd f27, NEG(112)(r1) stfd f28, NEG(120)(r1) stfd f29, NEG(128)(r1) stfd f30, NEG(136)(r1) stfd 
f31, NEG(144)(r1) std r14, NEG(152)(r1)#ifdef BETAX stfd f2, BOFF(r1)#elif defined(BETA0) xor pfA, pfA, pfA #ifdef ATL_USE64BITS std pfA, BOFF(r1) #else stw pfA, BOFF(r1) stw pfA, 4+BOFF(r1) #endif#endif#if defined (ATL_USE64BITS) ld pC0, 120(r1) ld incCn, 128(r1)#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC) lwz pC0, 68(r1) lwz incCn, 72(r1)#else lwz incCn, FSIZE+8(r1)#endif slwi incCn, incCn, SHF /* incCn = ldc*sizeof */ add pC1, pC0, incCn add pC2, pC1, incCn add pC3, pC2, incCn slwi pfA, M, SHF /* pfA = M*sizeof() */ slwi incCn, incCn, 2 sub incCn, incCn, pfA /* incCn = ldc*4 - M */ mulli incAn, M, KB*8 /* incAn = M*KB*sizeof() */ add pfA, pA0, incAn /* pfA = A + M*KB */ srwi M, M, 2 /* M /= 4 */ addi M, M, -1#if MB == 0 cmpwi cr5, M, 0#endif// .align 5NLOOP: addi pfB, pB0, KB4*8 mtctr M lfd rB0, 0(pB0) lfd rA0, 0(pA0) lfd rA1, KB*8(pA0) lfd rA2, KB2*8(pA0) lfd rA3, KB3*8(pA0) lfd rB1, KB*8(pB0) lfd rB2, KB2*8(pB0) lfd rB3, KB3*8(pB0)#if MB == 0 beq- cr5, MPEELED#endifMLOOP:/* Begin KLOOP */#if KB > 0 #ifdef BETA0 #if KB > 1 lfd rb0, 8(pB0) #endif #if KB > 1 lfd ra0, 8(pA0) #endif #if KB > 1 lfd ra1, KB*8+8(pA0) #endif #if KB > 1 lfd ra2, KB2*8+8(pA0) #endif fmul rC00, rA0, rB0 fmul rC10, rA1, rB0 fmul rC20, rA2, rB0 fmul rC30, rA3, rB0 #if KB > 1 lfd ra3, KB3*8+8(pA0) #endif #if KB > 1 lfd rb1, KB*8+8(pB0) #endif #if KB > 1 lfd rb2, KB2*8+8(pB0) #endif #if KB > 1 lfd rb3, KB3*8+8(pB0) #endif fmul rC01, rA0, rB1 fmul rC11, rA1, rB1 fmul rC21, rA2, rB1 fmul rC31, rA3, rB1 dcbt 0, pfA, 0 addi pfA, pfA, 128 dcbt 0, pfB, 0 addi pfB, pfB, 128 fmul rC02, rA0, rB2 fmul rC12, rA1, rB2 fmul rC22, rA2, rB2 fmul rC32, rA3, rB2 fmul rC03, rA0, rB3 fmul rC13, rA1, rB3 fmul rC23, rA2, rB3 fmul rC33, rA3, rB3 #elif defined(BETAX) lfd rb3, BOFF(r1) lfd rC00, 0(pC0) lfd rC10, CMUL(8)(pC0) lfd rC20, CMUL(16)(pC0) lfd rC30, CMUL(24)(pC0) nop nop nop lfd rC01, 0(pC1) lfd rC11, CMUL(8)(pC1) lfd rC21, CMUL(16)(pC1) lfd rC31, CMUL(24)(pC1) fmul rC00, rC00, rb3 fmul rC10, 
rC10, rb3 fmul rC20, rC20, rb3 fmul rC30, rC30, rb3 lfd rC02, 0(pC2) lfd rC12, CMUL(8)(pC2) lfd rC22, CMUL(16)(pC2) lfd rC32, CMUL(24)(pC2) fmul rC01, rC01, rb3 fmul rC11, rC11, rb3 fmul rC21, rC21, rb3 fmul rC31, rC31, rb3 lfd rC03, 0(pC3) lfd rC13, CMUL(8)(pC3) lfd rC23, CMUL(16)(pC3) lfd rC33, CMUL(24)(pC3) fmul rC02, rC02, rb3 fmul rC12, rC12, rb3 fmul rC22, rC22, rb3 fmul rC32, rC32, rb3 fmul rC03, rC03, rb3 fmul rC13, rC13, rb3 fmul rC23, rC23, rb3 fmul rC33, rC33, rb3 #if KB > 1 lfd rb0, 8(pB0) #endif #if KB > 1 lfd ra0, 8(pA0) #endif #if KB > 1 lfd ra1, KB*8+8(pA0) #endif #if KB > 1 lfd ra2, KB2*8+8(pA0) #endif fmadd rC00, rA0, rB0, rC00 fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 #if KB > 1 lfd ra3, KB3*8+8(pA0) #endif #if KB > 1 lfd rb1, KB*8+8(pB0) #endif #if KB > 1 lfd rb2, KB2*8+8(pB0) #endif #if KB > 1 lfd rb3, KB3*8+8(pB0) #endif fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 dcbt 0, pfA, 0 addi pfA, pfA, 128 dcbt 0, pfB, 0 addi pfB, pfB, 128 fmadd rC03, rA0, rB3, rC03 fmadd rC13, rA1, rB3, rC13 fmadd rC23, rA2, rB3, rC23 fmadd rC33, rA3, rB3, rC33 #else /* BETA == 1 */ lfd rC00, 0(pC0) lfd rC10, CMUL(8)(pC0) lfd rC20, CMUL(16)(pC0) lfd rC30, CMUL(24)(pC0) lfd rC01, 0(pC1) lfd rC11, CMUL(8)(pC1) lfd rC21, CMUL(16)(pC1) lfd rC31, CMUL(24)(pC1) lfd rC02, 0(pC2) lfd rC12, CMUL(8)(pC2) lfd rC22, CMUL(16)(pC2) lfd rC32, CMUL(24)(pC2) lfd rC03, 0(pC3) lfd rC13, CMUL(8)(pC3) lfd rC23, CMUL(16)(pC3) lfd rC33, CMUL(24)(pC3) fmadd rC00, rA0, rB0, rC00 fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 dcbt 0, pfA, 0 dcbt 0, pfB, 0 addi pfA, pfA, 128 addi pfB, pfB, 128 fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 #if KB > 1 lfd rb0, 8(pB0) #endif #if KB > 1 lfd ra0, 
8(pA0) #endif #if KB > 1 lfd ra1, KB*8+8(pA0) #endif #if KB > 1 lfd ra2, KB2*8+8(pA0) #endif fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 #if KB > 1 lfd ra3, KB3*8+8(pA0) #endif #if KB > 1 lfd rb1, KB*8+8(pB0) #endif #if KB > 1 lfd rb2, KB2*8+8(pB0) #endif #if KB > 1 lfd rb3, KB3*8+8(pB0) #endif fmadd rC03, rA0, rB3, rC03 fmadd rC13, rA1, rB3, rC13 fmadd rC23, rA2, rB3, rC23 fmadd rC33, rA3, rB3, rC33 #endif /* done BETA specialization */#endif /* end K=1 block */#if KB > 1 #if KB > 2 lfd rB0, 16(pB0) #endif #if KB > 2 lfd rA0, 16(pA0) #endif #if KB > 2 lfd rA1, KB*8+16(pA0) #endif #if KB > 2 lfd rA2, KB2*8+16(pA0) #endif fmadd rC00, ra0, rb0, rC00 fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 #if KB > 2 lfd rA3, KB3*8+16(pA0) #endif #if KB > 2 lfd rB1, KB*8+16(pB0) #endif #if KB > 2 lfd rB2, KB2*8+16(pB0) #endif #if KB > 2 lfd rB3, KB3*8+16(pB0) #endif fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 fmadd rC03, ra0, rb3, rC03 fmadd rC13, ra1, rb3, rC13 fmadd rC23, ra2, rb3, rC23 fmadd rC33, ra3, rb3, rC33#endif /* end K=2 block */#if KB > 2 #if KB > 3 lfd rb0, 24(pB0) #endif #if KB > 3 lfd ra0, 24(pA0) #endif #if KB > 3 lfd ra1, KB*8+24(pA0) #endif #if KB > 3 lfd ra2, KB2*8+24(pA0) #endif fmadd rC00, rA0, rB0, rC00 fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 #if KB > 3 lfd ra3, KB3*8+24(pA0) #endif #if KB > 3 lfd rb1, KB*8+24(pB0) #endif #if KB > 3 lfd rb2, KB2*8+24(pB0) #endif #if KB > 3 lfd rb3, KB3*8+24(pB0) #endif fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 addi pC0, pC0, CMUL(4)*8 /* pC0 += 4 
*/ addi pC1, pC1, CMUL(4)*8 addi pC2, pC2, CMUL(4)*8 addi pC3, pC3, CMUL(4)*8 fmadd rC03, rA0, rB3, rC03 fmadd rC13, rA1, rB3, rC13 fmadd rC23, rA2, rB3, rC23 fmadd rC33, rA3, rB3, rC33#else addi pC0, pC0, CMUL(4)*8 /* pC0 += 4 */ addi pC1, pC1, CMUL(4)*8 addi pC2, pC2, CMUL(4)*8 addi pC3, pC3, CMUL(4)*8#endif /* end K=3 block */#if KB > 3 #if KB > 4 lfd rB0, 32(pB0) #endif #if KB > 4 lfd rA0, 32(pA0) #endif #if KB > 4 lfd rA1, KB*8+32(pA0) #endif #if KB > 4 lfd rA2, KB2*8+32(pA0) #endif fmadd rC00, ra0, rb0, rC00 fmadd rC10, ra1, rb0, rC10
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?