atl_dmm4x4x32_ppc.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,609 行 · 第 1/5 页
C
2,609 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2005 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */
#include "atlas_asm.h"

/*
 * Build guard: refuse to assemble unless one of the three supported PPC
 * assembler dialects has been selected.
 */
#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC) && \
    !defined(ATL_AS_AIX_PPC)
   #error "This kernel requires OS X, AIX, or Linux PPC assembler!"
#endif

/*
 * MulByBeta(elt_, bet_): when BETAX is defined this emits an fmul that
 * scales elt_ in place by bet_; in every other case it expands to nothing.
 */
#ifdef BETAX
   #define MulByBeta(elt_, bet_) \
      fmul elt_, elt_, bet_
#else
   #define MulByBeta(elt_, bet_)
#endif

/*
 * Mjoin(pre, nam): token-paste pre and nam.  The extra my_join level lets
 * the preprocessor expand macro arguments before ## is applied.
 */
#ifndef Mjoin
   #define Mjoin(pre, nam) my_join(pre, nam)
   #define my_join(pre, nam) pre ## nam
#endif

/*
 * CMUL scales a displacement and SHF is a shift count:
 *   DCPLX defined : CMUL doubles its argument, SHF is 4
 *   otherwise     : CMUL is the identity,      SHF is 3
 * In the routine that follows, CMUL wraps the displacements used to load C
 * and SHF is the slwi count applied to ldc and M.
 */
#ifdef DCPLX
   #define CMUL(i_) ((i_)*2)
   #define SHF 4
#else
   #define CMUL(i_) i_
   #define SHF 3
#endif

/*
 * Under the Linux GAS dialect the symbolic names rN / fN are mapped to bare
 * register numbers.  r13 is deliberately left undefined (see the note below).
 */
#ifdef ATL_GAS_LINUX_PPC
   #define r0 0
   #define r1 1
   #define r2 2
   #define r3 3
   #define r4 4
   #define r5 5
   #define r6 6
   #define r7 7
   #define r8 8
   #define r9 9
   #define r10 10
   #define r11 11
   #define r12 12
/* #define r13 13 don't use r13 under linux, as it rules out .so */
   #define r14 14
   #define r15 15
   #define r16 16
   #define r17 17
   #define r18 18
   #define r19 19
   #define r20 20
   #define r21 21
   #define r22 22
   #define r23 23
   #define r24 24
   #define r25 25
   #define r26 26
   #define r27 27
   #define r28 28
   #define r29 29
   #define r30 30
   #define r31 31
   #define f0 0
   #define f1 1
   #define f2 2
   #define f3 3
   #define f4 4
   #define f5 5
   #define f6 6
   #define f7 7
   #define f8 8
   #define f9 9
   #define f10 10
   #define f11 11
   #define f12 12
   #define f13 13
   #define f14 14
   #define f15 15
   #define f16 16
   #define f17 17
   #define f18 18
   #define f19 19
   #define f20 20
   #define f21 21
   #define f22 22
   #define f23 23
   #define f24 24
   #define f25 25
   #define f26 26
   #define f27 27
   #define f28 28
   #define f29 29
   #define f30 30
   #define f31 31
#endif

/*
 * Word-size aliases so the routine body can be written once:
 *   64-bit build : the shift mnemonics slwi/srwi become sldi/srdi
 *   32-bit build : the doubleword mnemonics std/ld become stw/lwz
 */
#ifdef ATL_USE64BITS
   #define slwi sldi
   #define srwi srdi
#else
   #define std stw
   #define ld lwz
#endif

/*
 * Per-ABI assignment of the integer registers used by the routine, plus the
 * stack-offset helpers:
 *   NEG(i_) : sign applied to the register-save displacements off r1
 *   FSIZE   : frame size used by the stwu in the 32-bit Linux prologue
 *   BOFF    : displacement off r1 of the slot the routine stores beta
 *             (BETAX) or zero (BETA0) into
 * Branch 1 is the 64-bit build, branch 2 is 32-bit OS X / AIX, and the final
 * #else is what remains once the build guard above has passed (presumably
 * 32-bit Linux -- confirm against the build configuration).
 * NOTE(review): BOFF in the last branch expands to the unparenthesised
 * expression FSIZE-8; the visible uses (BOFF(r1), 4+BOFF(r1)) are unaffected.
 */
#if defined(ATL_USE64BITS)
   #define M r3
   #define N r4
   #define pA0 r7
   #define pB0 r9
   #define pC0 r10
   #define pC1 r5
   #define pC2 r6
   #define pC3 r8
   #define pfA r11
   #define incAn r0
   #define incCn r12
   #define pfB r14
   #define NEG(i_) -i_
/* 64 bit ABI defines red zone! */
   #define FSIZE 0
   #define BOFF -160
#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)
   #define M r3
   #define N r4
   #define pA0 r8
   #define pB0 r10
   #define pC0 r6
   #define pC1 r7
   #define pC2 r9
   #define pC3 r11
   #define pfA r12
   #define incAn r0
   #define incCn r5
   #define pfB r14
   #define BOFF -160
   #define FSIZE 0
   #define NEG(i_) -i_
#else
   #define M r3
   #define N r4
   #define pA0 r6
   #define pB0 r8
   #define pC0 r10
   #define pC1 r5
   #define pC2 r7
   #define pC3 r9
   #define pfA r11
   #define incAn r0
   #define incCn r12
   #define pfB r14
   #define NEG(i_) i_
   #define FSIZE 172
   #define BOFF FSIZE-8
#endif

/*
 * Floating point register aliases.  f0-f7 (rA*, rB*) and f8-f15 (ra*, rb*)
 * are two sets of A/B operand registers; f16-f31 (rCij) are the sixteen
 * registers that the fmadd sequence in the routine accumulates into.
 */
#define rA0 f0
#define rA1 f1
#define rA2 f2
#define rA3 f3
#define rB0 f4
#define rB1 f5
#define rB2 f6
#define rB3 f7
#define ra0 f8
#define ra1 f9
#define ra2 f10
#define ra3 f11
#define rb0 f12
#define rb1 f13
#define rb2 f14
#define rb3 f15
#define rC00 f16
#define rC10 f17
#define rC20 f18
#define rC30 f19
#define rC01 f20
#define rC11 f21
#define rC21 f22
#define rC31 f23
#define rC02 f24
#define rC12 f25
#define rC22 f26
#define rC32 f27
#define rC03 f28
#define rC13 f29
#define rC23 f30
#define rC33 f31

/*
 * KBn = n * KB, with KB0 forced to 0.  KB itself is not defined in the
 * visible part of this file; presumably it is supplied by the build.
 * NOTE(review): KB2..KB7 expand without parentheses; the visible uses are of
 * the form KBn*8 or c+KBn*8, for which the expansion is still n*KB*8.
 */
#ifdef KB0
   #undef KB0
#endif
#define KB0 0
#ifndef KB1
   #define KB1 KB
#endif
#ifndef KB2
   #define KB2 KB*2
   #define KB3 KB*3
   #define KB4 KB*4
   #define KB5 KB*5
   #define KB6 KB*6
   #define KB7 KB*7
#endif

/*
 * The #if 0 region below is never assembled.  It records, per ABI, the
 * register or stack slot annotated against each ATL_USERMM argument.
 */
#if 0
*******************************************************************************
32 bit ABIs:
                         r3           r4           r5             r6-r7,f1
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
                (r6)       r8  (r7)    r9  (r8)      r10  (r9) 56(r1)
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                             f2   68(r1)          72(r1)
                const TYPE beta, TYPE *C, const int ldc)
                                 (r10)            8(r1)
*******************************************************************************
64 bit ABIs:
                         r3           r4           r5             r6/f1
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
                           r7             r8             r9            r10
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                             f2   120(r1)        128(r1)
                const TYPE beta, TYPE *C, const int ldc)
#endif

/*
 * Entry-point declaration starts here; the AIX variant comes first and the
 * remaining directives and labels continue on the following lines.
 */
#ifdef ATL_AS_AIX_PPC
   .csect .text[PR]
   .toc
   .csect .text[PR]
   .align 3
.globl ATL_USERMM .globl Mjoin(.,ATL_USERMM) #ifdef ATL_USE64BITS .csect ATL_USERMM[DS],3ATL_USERMM: .llong Mjoin(.,ATL_USERMM) #else .csect ATL_USERMM[DS]ATL_USERMM: .long Mjoin(.,ATL_USERMM), TOC[tc0], 0 #endif .csect .text[PR]Mjoin(.,ATL_USERMM):#else.text #ifdef ATL_AS_OSX_PPC .globl Mjoin(_,ATL_USERMM)Mjoin(_,ATL_USERMM): #else #if defined(ATL_USE64BITS)/* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ .section ".opd","aw" .globl ATL_USERMM .align 3ATL_USERMM: .quad Mjoin(.,ATL_USERMM),.TOC.@tocbase,0 .previous .type Mjoin(.,ATL_USERMM),@function .text .globl Mjoin(.,ATL_USERMM)Mjoin(.,ATL_USERMM): #else .globl ATL_USERMMATL_USERMM: #endif #endif#endif/* Save regs */#if defined(ATL_GAS_LINUX_PPC) && !defined(ATL_USE64BITS) stwu r1, -FSIZE(r1)#endif stfd f14, NEG(8)(r1) stfd f15, NEG(16)(r1) stfd f16, NEG(24)(r1) stfd f17, NEG(32)(r1) stfd f18, NEG(40)(r1) stfd f19, NEG(48)(r1) stfd f20, NEG(56)(r1) stfd f21, NEG(64)(r1) stfd f22, NEG(72)(r1) stfd f23, NEG(80)(r1) stfd f24, NEG(88)(r1) stfd f25, NEG(96)(r1) stfd f26, NEG(104)(r1) stfd f27, NEG(112)(r1) stfd f28, NEG(120)(r1) stfd f29, NEG(128)(r1) stfd f30, NEG(136)(r1) stfd f31, NEG(144)(r1) std r14, NEG(152)(r1)#ifdef BETAX stfd f2, BOFF(r1)#elif defined(BETA0) xor pfA, pfA, pfA #ifdef ATL_USE64BITS std pfA, BOFF(r1) #else stw pfA, BOFF(r1) stw pfA, 4+BOFF(r1) #endif#endif#ifdef ATL_USE64BITS ld pC0, 120(r1) ld incCn, 128(r1)#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC) lwz pC0, 68(r1) lwz incCn, 72(r1)#else lwz incCn, FSIZE+8(r1)#endif slwi incCn, incCn, SHF /* incCn = ldc*sizeof */ add pC1, pC0, incCn add pC2, pC1, incCn add pC3, pC2, incCn slwi pfA, M, SHF /* pfA = M*sizeof() */ slwi incCn, incCn, 2 sub incCn, incCn, pfA /* incCn = ldc*4 - M */ mulli incAn, M, KB*8 /* incAn = M*KB*sizeof() */ add pfA, pA0, incAn /* pfA = A + M*KB */ srwi M, M, 2 /* M /= 4 */NLOOP: addi pfB, pB0, KB4*8 mtctr MMLOOP:#ifdef BETA0 lfd rC00, BOFF(r1) fmr rC10, rC00 fmr rC20, rC00 fmr rC30, 
rC00 fmr rC01, rC00 fmr rC11, rC00 fmr rC21, rC00 fmr rC31, rC00 fmr rC02, rC00 fmr rC12, rC00 fmr rC22, rC00 fmr rC32, rC00 fmr rC03, rC00 fmr rC13, rC00 fmr rC23, rC00 fmr rC33, rC00#else #ifdef BETAX lfd rb3, BOFF(r1) #endif lfd rC00, 0(pC0) MulByBeta(rC00, rb3) lfd rC10, CMUL(8)(pC0) MulByBeta(rC10, rb3) lfd rC20, CMUL(16)(pC0) MulByBeta(rC20, rb3) lfd rC30, CMUL(24)(pC0) MulByBeta(rC30, rb3) lfd rC01, 0(pC1) MulByBeta(rC01, rb3) lfd rC11, CMUL(8)(pC1) MulByBeta(rC11, rb3) lfd rC21, CMUL(16)(pC1) MulByBeta(rC21, rb3) lfd rC31, CMUL(24)(pC1) MulByBeta(rC31, rb3) lfd rC02, 0(pC2) MulByBeta(rC02, rb3) lfd rC12, CMUL(8)(pC2) MulByBeta(rC12, rb3) lfd rC22, CMUL(16)(pC2) MulByBeta(rC22, rb3) lfd rC32, CMUL(24)(pC2) MulByBeta(rC32, rb3) lfd rC03, 0(pC3) MulByBeta(rC03, rb3) lfd rC13, CMUL(8)(pC3) MulByBeta(rC13, rb3) lfd rC23, CMUL(16)(pC3) MulByBeta(rC23, rb3) lfd rC33, CMUL(24)(pC3) MulByBeta(rC33, rb3)#endif/* * Unrolled K loop */ lfd rA0, 0(pA0) lfd rA1, KB*8(pA0) lfd rA2, KB2*8(pA0) lfd rA3, KB3*8(pA0) lfd rB0, 0(pB0) lfd rB1, KB*8(pB0) lfd rB2, KB2*8(pB0) lfd rB3, KB3*8(pB0)#if KB > 1 lfd ra0, 8(pA0) lfd ra1, 8+KB*8(pA0) lfd ra2, 8+KB2*8(pA0) lfd ra3, 8+KB3*8(pA0) lfd rb0, 8(pB0) lfd rb1, 8+KB*8(pB0) lfd rb2, 8+KB2*8(pB0)#endif#if KB > 2 fmadd rC00, rA0, rB0, rC00 lfd rb3, 8+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 dcbt 0, pfB, 0 addi pfB, pfB, 128 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 16+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 16+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 16+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 16+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 16+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 16+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 16+KB3*8(pA0)#endif#if KB > 3 fmadd rC00, ra0, rb0, rC00 lfd rB3, 16+KB3*8(pB0) fmadd rC10, ra1, rb0, 
rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 24+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 24+KB1*8(pB0) fmadd rC02, ra0, rb2, rC02 fmadd rC12, ra1, rb2, rC12 fmadd rC22, ra2, rb2, rC22 fmadd rC32, ra3, rb2, rC32 lfd rb2, 24+KB2*8(pB0) fmadd rC03, ra0, rb3, rC03 lfd ra0, 24+KB0*8(pA0) fmadd rC13, ra1, rb3, rC13 lfd ra1, 24+KB1*8(pA0) fmadd rC23, ra2, rb3, rC23 lfd ra2, 24+KB2*8(pA0) fmadd rC33, ra3, rb3, rC33 lfd ra3, 24+KB3*8(pA0)#endif#if KB > 4 fmadd rC00, rA0, rB0, rC00 lfd rb3, 24+KB3*8(pB0) fmadd rC10, rA1, rB0, rC10 fmadd rC20, rA2, rB0, rC20 fmadd rC30, rA3, rB0, rC30 lfd rB0, 32+KB0*8(pB0) fmadd rC01, rA0, rB1, rC01 fmadd rC11, rA1, rB1, rC11 fmadd rC21, rA2, rB1, rC21 fmadd rC31, rA3, rB1, rC31 lfd rB1, 32+KB1*8(pB0) fmadd rC02, rA0, rB2, rC02 fmadd rC12, rA1, rB2, rC12 fmadd rC22, rA2, rB2, rC22 fmadd rC32, rA3, rB2, rC32 lfd rB2, 32+KB2*8(pB0) fmadd rC03, rA0, rB3, rC03 lfd rA0, 32+KB0*8(pA0) fmadd rC13, rA1, rB3, rC13 lfd rA1, 32+KB1*8(pA0) fmadd rC23, rA2, rB3, rC23 lfd rA2, 32+KB2*8(pA0) fmadd rC33, rA3, rB3, rC33 lfd rA3, 32+KB3*8(pA0)#endif#if KB > 5 fmadd rC00, ra0, rb0, rC00 lfd rB3, 32+KB3*8(pB0) fmadd rC10, ra1, rb0, rC10 fmadd rC20, ra2, rb0, rC20 fmadd rC30, ra3, rb0, rC30 lfd rb0, 40+KB0*8(pB0) fmadd rC01, ra0, rb1, rC01 fmadd rC11, ra1, rb1, rC11 fmadd rC21, ra2, rb1, rC21 fmadd rC31, ra3, rb1, rC31 lfd rb1, 40+KB1*8(pB0)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?