atl_dmm4x4x80_ppc.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,991 行 · 第 1/5 页

C
2,991
字号
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2007 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include "atlas_asm.h"
/*
 * NOTE: this kernel written by R. Clint Whaley, but it uses two key ideas
 * discovered by Tony Castaldo for the PowerPC970:
 * (1) Instructions must be issued in groups of 4 like inst (eg. 4 iop/ld
 *     4 fp, etc.)
 * (2) It is effective to somewhat intermix M-loop iterations
 */

/*
 * Build-time requirements: one of the three supported PowerPC assembler
 * dialect macros must be defined, and KB must be defined and nonzero.
 * KB is used directly inside instruction displacements and immediates
 * (e.g. KB*8(pA0)), which is why it has to be known at preprocessing time.
 */
#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC) && \
    !defined(ATL_AS_AIX_PPC)
   #error "This kernel requires OS X, AIX, or Linux PPC assembler!"
#endif
#if !defined(KB) || KB == 0
   #error "This kernel requires KB be a compile-time constant!"
#endif
/*
 * MB defaults to 0 when the build does not supply it; the code below tests
 * "MB == 0" to decide whether to emit a compare/branch on the M count.
 */
#ifndef MB
   #define MB 0
#endif
/*
 * CMUL(i_) scales an offset into C and SHF is the shift count used to turn
 * an element count into a byte count: with DCPLX defined the offset is
 * doubled and the shift is 4, otherwise the offset is unchanged and the
 * shift is 3.
 */
#ifdef DCPLX
   #define CMUL(i_) ((i_)*2)
   #define SHF  4
#else
   #define CMUL(i_) i_
   #define SHF  3
#endif
/*
 * Word-size neutral mnemonics: with ATL_USE64BITS the word shift/compare
 * mnemonics are rewritten to their doubleword forms; without it the
 * doubleword store/load mnemonics are rewritten to the word forms.  This
 * lets the instruction stream below be written once for both cases.
 */
#ifdef ATL_USE64BITS
   #define slwi         sldi
   #define srwi         srdi
   #define cmpwi        cmpdi
#else
   #define std  stw
   #define ld   lwz
#endif
/*
 * Integer register assignments, one set per ABI variant.
 *   M, N        : loop extents
 *   pA0, pB0    : pointers into A and B
 *   pC0..pC3    : pointers to four consecutive columns of C (each is the
 *                 previous one plus incCn in the setup code)
 *   pfA, pfB    : addresses handed to dcbt and advanced by 128 each time
 *   incAn,incCn : byte increments computed in the setup code
 *   NEG(i_)     : sign applied to the register save-area offsets off r1
 *   FSIZE       : frame size passed to stwu (only used on 32-bit Linux)
 *   BOFF        : offset off r1 of the 8-byte slot where beta is stored
 */
#if defined(ATL_USE64BITS)
   #define M       r3
   #define N       r4
   #define pA0     r7
   #define pB0     r9
   #define pC0     r10
   #define pC1     r5
   #define pC2     r6
   #define pC3     r8
   #define pfA     r11
   #define incAn   r0
   #define incCn   r12
   #define pfB     r14
   #define NEG(i_) -i_  /* 64 bit ABI defines red zone! */
   #define FSIZE     0
   #define BOFF    -160
#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)
   /* 32-bit OS X / AIX: saves also go to negative offsets off r1 */
   #define M       r3
   #define N       r4
   #define pA0     r8
   #define pB0     r10
   #define pC0     r6
   #define pC1     r7
   #define pC2     r9
   #define pC3     r11
   #define pfA     r12
   #define incAn   r0
   #define incCn   r5
   #define pfB     r14
   #define FSIZE     0
   #define BOFF    -160
   #define NEG(i_) -i_
#else  /* 32-bit linux has no red zone */
   /* saves go to positive offsets inside a frame of FSIZE bytes */
   #define M       r3
   #define N       r4
   #define pA0     r6
   #define pB0     r8
   #define pC0     r10
   #define pC1     r5
   #define pC2     r7
   #define pC3     r9
   #define pfA     r11
   #define incAn   r0
   #define incCn   r12
   #define pfB     r14
   #define NEG(i_) i_
   #define FSIZE   172
   #define BOFF    FSIZE-8
#endif
/*
 * Floating point register names.  rA0-rA3/rB0-rB3 and ra0-ra3/rb0-rb3 are
 * two sets of A/B operand registers; the unrolled K steps below alternate
 * between them, loading one set while multiplying with the other.
 */
#define rA0     f0
#define rA1     f1
#define rA2     f2
#define rA3     f3
#define rB0     f4
#define rB1     f5
#define rB2     f6
#define rB3     f7
#define ra0     f8
#define ra1     f9
#define ra2     f10
#define ra3     f11
#define rb0     f12
#define rb1     f13
#define rb2     f14
#define rb3     f15
/* rC<i><j> accumulates products of A operand i with B operand j */
#define rC00    f16
#define rC10    f17
#define rC20    f18
#define rC30    f19
#define rC01    f20
#define rC11    f21
#define rC21    f22
#define rC31    f23
#define rC02    f24
#define rC12    f25
#define rC22    f26
#define rC32    f27
#define rC03    f28
#define rC13    f29
#define rC23    f30
#define rC33    f31
/*
 * Multiples of KB used in address displacements.  Each group can be
 * overridden by predefining KB1 / KB2; note that KB3..KB7 are only defined
 * here when KB2 was not predefined.
 */
#ifndef KB1
   #define KB1     KB
#endif
#ifndef KB2
   #define KB2     KB*2
   #define KB3     KB*3
   #define KB4     KB*4
   #define KB5     KB*5
   #define KB6     KB*6
   #define KB7     KB*7
#endif
/*
 * The "#if 0" text below is never assembled; it records where each argument
 * of ATL_USERMM arrives.  NOTE(review): in the 32-bit table the entries in
 * parentheses appear to be the 32-bit Linux locations (they match the
 * pA0/pB0/pC0 choices in the Linux branch above) and the unparenthesized
 * ones the OS X / AIX locations -- confirm against the ABI documents.
 */
#if 0
*******************************************************************************
32 bit ABIs:
                         r3           r4           r5          r6-r7,f1
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
                (r6)       r8  (r7)       r9  (r8)      r10  (r9)   56(r1)
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                             f2   68(r1)          72(r1)
                const TYPE beta, TYPE *C, const int ldc)
                                  (r10)    8(r1)
*******************************************************************************
64 bit ABIs:
                         r3           r4           r5             r6/f1
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
                           r7             r8             r9            r10
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                             f2   120(r1)        128(r1)
                const TYPE beta, TYPE *C, const int ldc)
#endif
#ifdef ATL_AS_AIX_PPC
        .csect .text[PR]
        .toc
        .csect .text[PR]
        .align 3
        .globl ATL_USERMM
        .globl Mjoin(.,ATL_USERMM)
   #ifdef ATL_USE64BITS
        .csect ATL_USERMM[DS],3
ATL_USERMM:
        .llong Mjoin(.,ATL_USERMM)
   #else
        .csect ATL_USERMM[DS]
ATL_USERMM:
        .long Mjoin(.,ATL_USERMM), TOC[tc0], 0
   #endif
        .csect 
.text[PR]Mjoin(.,ATL_USERMM):#else.text   #ifdef ATL_AS_OSX_PPC	.globl  Mjoin(_,ATL_USERMM)Mjoin(_,ATL_USERMM):   #else      #if defined(ATL_USE64BITS)/* *      Official Program Descripter section, seg fault w/o it on Linux/PPC64 */        .section        ".opd","aw"        .align  3	.globl  ATL_USERMMATL_USERMM:        .quad   Mjoin(.,ATL_USERMM),.TOC.@tocbase,0        .previous        .type   Mjoin(.,ATL_USERMM),@function        .text	.globl  Mjoin(.,ATL_USERMM)Mjoin(.,ATL_USERMM):      #else	.globl  ATL_USERMMATL_USERMM:      #endif   #endif#endif/*      Save regs */#if defined(ATL_GAS_LINUX_PPC) && !defined(ATL_USE64BITS)        stwu    r1, -FSIZE(r1)#endif        stfd    f14, NEG(8)(r1)        stfd    f15, NEG(16)(r1)        stfd    f16, NEG(24)(r1)        stfd    f17, NEG(32)(r1)        stfd    f18, NEG(40)(r1)        stfd    f19, NEG(48)(r1)        stfd    f20, NEG(56)(r1)        stfd    f21, NEG(64)(r1)        stfd    f22, NEG(72)(r1)        stfd    f23, NEG(80)(r1)        stfd    f24, NEG(88)(r1)        stfd    f25, NEG(96)(r1)        stfd    f26, NEG(104)(r1)        stfd    f27, NEG(112)(r1)        stfd    f28, NEG(120)(r1)        stfd    f29, NEG(128)(r1)        stfd    f30, NEG(136)(r1)        stfd    f31, NEG(144)(r1)        std     r14, NEG(152)(r1)#ifdef BETAX        stfd    f2, BOFF(r1)#elif defined(BETA0)        xor     pfA, pfA, pfA   #ifdef ATL_USE64BITS        std     pfA, BOFF(r1)   #else        stw     pfA, BOFF(r1)        stw     pfA, 4+BOFF(r1)   #endif#endif#if defined (ATL_USE64BITS)        ld      pC0, 120(r1)        ld      incCn, 128(r1)#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)        lwz     pC0, 68(r1)        lwz     incCn,  72(r1)#else        lwz     incCn,  FSIZE+8(r1)#endif        slwi    incCn, incCn, SHF       /* incCn = ldc*sizeof */        add     pC1, pC0, incCn        add     pC2, pC1, incCn        add     pC3, pC2, incCn        slwi    pfA, M, SHF             /* pfA = M*sizeof() */        slwi    incCn, incCn, 
2        sub     incCn, incCn, pfA       /* incCn = ldc*4 - M */        mulli   incAn, M, KB*8          /* incAn = M*KB*sizeof() */        add     pfA, pA0, incAn         /* pfA = A + M*KB */        srwi    M, M, 2                 /* M /= 4 */	addi	M, M, -1#if MB == 0        cmpwi   cr5, M, 0#endif//	.align 5NLOOP:        addi    pfB, pB0, KB4*8        mtctr   M	lfd	rB0, 0(pB0)	lfd	rA0, 0(pA0)	lfd	rA1, KB*8(pA0)	lfd	rA2, KB2*8(pA0)	lfd	rA3, KB3*8(pA0)	lfd	rB1, KB*8(pB0)	lfd	rB2, KB2*8(pB0)	lfd	rB3, KB3*8(pB0)#if MB == 0        beq-    cr5, MPEELED#endifMLOOP:/* Begin KLOOP */#if KB > 0   #ifdef BETA0      #if KB > 1		lfd	rb0, 8(pB0)      #endif      #if KB > 1		lfd	ra0, 8(pA0)      #endif      #if KB > 1		lfd	ra1, KB*8+8(pA0)      #endif      #if KB > 1		lfd	ra2, KB2*8+8(pA0)      #endif	fmul 	rC00, rA0, rB0	fmul 	rC10, rA1, rB0	fmul 	rC20, rA2, rB0	fmul 	rC30, rA3, rB0      #if KB > 1		lfd	ra3, KB3*8+8(pA0)      #endif      #if KB > 1		lfd	rb1, KB*8+8(pB0)      #endif      #if KB > 1		lfd	rb2, KB2*8+8(pB0)      #endif      #if KB > 1		lfd	rb3, KB3*8+8(pB0)      #endif	fmul 	rC01, rA0, rB1	fmul 	rC11, rA1, rB1	fmul 	rC21, rA2, rB1	fmul 	rC31, rA3, rB1		dcbt	0, pfA, 0        	addi    pfA, pfA, 128		dcbt	0, pfB, 0        	addi    pfB, pfB, 128	fmul 	rC02, rA0, rB2	fmul 	rC12, rA1, rB2	fmul 	rC22, rA2, rB2	fmul 	rC32, rA3, rB2	fmul 	rC03, rA0, rB3	fmul 	rC13, rA1, rB3	fmul 	rC23, rA2, rB3	fmul 	rC33, rA3, rB3   #elif defined(BETAX)	lfd	rb3, BOFF(r1)        lfd     rC00, 0(pC0)        lfd     rC10, CMUL(8)(pC0)        lfd     rC20, CMUL(16)(pC0)        lfd     rC30, CMUL(24)(pC0)	nop	nop	nop        lfd     rC01, 0(pC1)        lfd     rC11, CMUL(8)(pC1)        lfd     rC21, CMUL(16)(pC1)        lfd     rC31, CMUL(24)(pC1)	fmul	rC00, rC00, rb3	fmul	rC10, rC10, rb3	fmul	rC20, rC20, rb3	fmul	rC30, rC30, rb3        lfd     rC02, 0(pC2)        lfd     rC12, CMUL(8)(pC2)        lfd     rC22, CMUL(16)(pC2)        lfd     rC32, CMUL(24)(pC2)	fmul	rC01, rC01, rb3	fmul	rC11, 
rC11, rb3	fmul	rC21, rC21, rb3	fmul	rC31, rC31, rb3        lfd     rC03, 0(pC3)        lfd     rC13, CMUL(8)(pC3)        lfd     rC23, CMUL(16)(pC3)        lfd     rC33, CMUL(24)(pC3)	fmul	rC02, rC02, rb3	fmul	rC12, rC12, rb3	fmul	rC22, rC22, rb3	fmul	rC32, rC32, rb3	fmul	rC03, rC03, rb3	fmul	rC13, rC13, rb3	fmul	rC23, rC23, rb3	fmul	rC33, rC33, rb3      #if KB > 1		lfd	rb0, 8(pB0)      #endif      #if KB > 1		lfd	ra0, 8(pA0)      #endif      #if KB > 1		lfd	ra1, KB*8+8(pA0)      #endif      #if KB > 1		lfd	ra2, KB2*8+8(pA0)      #endif	fmadd	rC00, rA0, rB0, rC00	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31      #if KB > 1		lfd	ra3, KB3*8+8(pA0)      #endif      #if KB > 1		lfd	rb1, KB*8+8(pB0)      #endif      #if KB > 1		lfd	rb2, KB2*8+8(pB0)      #endif      #if KB > 1		lfd	rb3, KB3*8+8(pB0)      #endif	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32		dcbt	0, pfA, 0        	addi    pfA, pfA, 128		dcbt	0, pfB, 0        	addi    pfB, pfB, 128	fmadd	rC03, rA0, rB3, rC03	fmadd	rC13, rA1, rB3, rC13	fmadd	rC23, rA2, rB3, rC23	fmadd	rC33, rA3, rB3, rC33   #else  /* BETA == 1 */        lfd     rC00, 0(pC0)        lfd     rC10, CMUL(8)(pC0)        lfd     rC20, CMUL(16)(pC0)        lfd     rC30, CMUL(24)(pC0)        lfd     rC01, 0(pC1)        lfd     rC11, CMUL(8)(pC1)        lfd     rC21, CMUL(16)(pC1)        lfd     rC31, CMUL(24)(pC1)        lfd     rC02, 0(pC2)        lfd     rC12, CMUL(8)(pC2)        lfd     rC22, CMUL(16)(pC2)        lfd     rC32, CMUL(24)(pC2)        	lfd     rC03, 0(pC3)        	lfd     rC13, CMUL(8)(pC3)        	lfd     rC23, CMUL(16)(pC3)        	lfd     rC33, CMUL(24)(pC3)	fmadd	rC00, rA0, rB0, rC00	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30		dcbt	0, pfA, 0		dcbt	0, pfB, 0        	addi    pfA, pfA, 128    
    	addi    pfB, pfB, 128	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31      #if KB > 1		lfd	rb0, 8(pB0)      #endif      #if KB > 1		lfd	ra0, 8(pA0)      #endif      #if KB > 1		lfd	ra1, KB*8+8(pA0)      #endif      #if KB > 1		lfd	ra2, KB2*8+8(pA0)      #endif	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32      #if KB > 1		lfd	ra3, KB3*8+8(pA0)      #endif      #if KB > 1		lfd	rb1, KB*8+8(pB0)      #endif      #if KB > 1		lfd	rb2, KB2*8+8(pB0)      #endif      #if KB > 1		lfd	rb3, KB3*8+8(pB0)      #endif	fmadd	rC03, rA0, rB3, rC03	fmadd	rC13, rA1, rB3, rC13	fmadd	rC23, rA2, rB3, rC23	fmadd	rC33, rA3, rB3, rC33   #endif /* done BETA specialization */#endif  /* end K=1 block */#if KB > 1   #if KB > 2		lfd	rB0, 16(pB0)   #endif   #if KB > 2		lfd	rA0, 16(pA0)   #endif   #if KB > 2		lfd	rA1, KB*8+16(pA0)   #endif   #if KB > 2		lfd	rA2, KB2*8+16(pA0)   #endif	fmadd	rC00, ra0, rb0, rC00	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30   #if KB > 2		lfd	rA3, KB3*8+16(pA0)   #endif   #if KB > 2		lfd	rB1, KB*8+16(pB0)   #endif   #if KB > 2		lfd	rB2, KB2*8+16(pB0)   #endif   #if KB > 2		lfd	rB3, KB3*8+16(pB0)   #endif	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	fmadd	rC03, ra0, rb3, rC03	fmadd	rC13, ra1, rb3, rC13	fmadd	rC23, ra2, rb3, rC23	fmadd	rC33, ra3, rb3, rC33#endif  /* end K=2 block */#if KB > 2   #if KB > 3		lfd	rb0, 24(pB0)   #endif   #if KB > 3		lfd	ra0, 24(pA0)   #endif   #if KB > 3		lfd	ra1, KB*8+24(pA0)   #endif   #if KB > 3		lfd	ra2, KB2*8+24(pA0)   #endif	fmadd	rC00, rA0, rB0, rC00	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30   #if KB > 3		lfd	ra3, KB3*8+24(pA0)   #endif   #if KB > 3		lfd	rb1, 
KB*8+24(pB0)   #endif   #if KB > 3		lfd	rb2, KB2*8+24(pB0)   #endif   #if KB > 3		lfd	rb3, KB3*8+24(pB0)   #endif	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32        	addi    pC0, pC0, CMUL(4)*8     /* pC0 += 4 */        	addi    pC1, pC1, CMUL(4)*8        	addi    pC2, pC2, CMUL(4)*8        	addi    pC3, pC3, CMUL(4)*8	fmadd	rC03, rA0, rB3, rC03	fmadd	rC13, rA1, rB3, rC13	fmadd	rC23, rA2, rB3, rC23	fmadd	rC33, rA3, rB3, rC33#else        	addi    pC0, pC0, CMUL(4)*8     /* pC0 += 4 */        	addi    pC1, pC1, CMUL(4)*8        	addi    pC2, pC2, CMUL(4)*8        	addi    pC3, pC3, CMUL(4)*8#endif  /* end K=3 block */#if KB > 3   #if KB > 4		lfd	rB0, 32(pB0)   #endif   #if KB > 4		lfd	rA0, 32(pA0)   #endif   #if KB > 4		lfd	rA1, KB*8+32(pA0)   #endif   #if KB > 4		lfd	rA2, KB2*8+32(pA0)   #endif	fmadd	rC00, ra0, rb0, rC00	fmadd	rC10, ra1, rb0, rC10

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?