atl_dmm4x4x32_ppc.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,609 行 · 第 1/5 页

C
2,609
字号
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2005 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include "atlas_asm.h"

/*
 * Refuse to build unless one of the three supported assembler dialects
 * (OS X, Linux/GAS or AIX) has been selected.
 */
#if !(defined(ATL_AS_OSX_PPC) || defined(ATL_GAS_LINUX_PPC) || \
      defined(ATL_AS_AIX_PPC))
#error "This kernel requires OS X, AIX, or Linux PPC assembler!"
#endif

/*
 * MulByBeta(elt_, bet_): multiplies register elt_ in place by register bet_
 * when BETAX is defined; in every other build it expands to nothing.
 */
#if defined(BETAX)
#define MulByBeta(elt_, bet_) fmul      elt_, elt_, bet_
#else
#define MulByBeta(elt_, bet_)
#endif

/*
 * Mjoin(pre, nam): pastes pre and nam into one token.  The extra my_join
 * level lets both arguments be macro-expanded before they are pasted.
 * Skipped when Mjoin is already defined.
 */
#if !defined(Mjoin)
#define my_join(pre, nam) pre ## nam
#define Mjoin(pre, nam) my_join(pre, nam)
#endif

/*
 * CMUL(i_) doubles an offset when DCPLX is defined and passes it through
 * unchanged otherwise.  SHF is the left-shift count (4 with DCPLX, else 3)
 * that the kernel uses to turn an element count into a byte count.
 */
#if defined(DCPLX)
#define CMUL(i_) ((i_)*2)
#define SHF  4
#else
#define CMUL(i_) i_
#define SHF  3
#endif

/*
 * Under ATL_GAS_LINUX_PPC the symbolic register names are mapped to plain
 * register numbers.
 */
#if defined(ATL_GAS_LINUX_PPC)
#define r0    0
#define r1    1
#define r2    2
#define r3    3
#define r4    4
#define r5    5
#define r6    6
#define r7    7
#define r8    8
#define r9    9
#define r10  10
#define r11  11
#define r12  12
/* r13 is left without an alias: don't use r13 under linux, as it rules out .so */
#define r14  14
#define r15  15
#define r16  16
#define r17  17
#define r18  18
#define r19  19
#define r20  20
#define r21  21
#define r22  22
#define r23  23
#define r24  24
#define r25  25
#define r26  26
#define r27  27
#define r28  28
#define r29  29
#define r30  30
#define r31  31
#define f0    0
#define f1    1
#define f2    2
#define f3    3
#define f4    4
#define f5    5
#define f6    6
#define f7    7
#define f8    8
#define f9    9
#define f10  10
#define f11  11
#define f12  12
#define f13  13
#define f14  14
#define f15  15
#define f16  16
#define f17  17
#define f18  18
#define f19  19
#define f20  20
#define f21  21
#define f22  22
#define f23  23
#define f24  24
#define f25  25
#define f26  26
#define f27  27
#define f28  28
#define f29  29
#define f30  30
#define f31  31
#endif

/*
 * Width-neutral mnemonics: 64-bit builds turn the word shifts into
 * doubleword shifts; 32-bit builds turn the doubleword store/load into
 * their word forms.
 */
#if defined(ATL_USE64BITS)
#define slwi         sldi
#define srwi         srdi
#else
#define std  stw
#define ld   lwz
#endif

/*
 * Integer register roles, one table per calling convention.
 *   NEG(i_) : sign given to the r1-relative offsets of the register save
 *             slots (negative = below the stack pointer).
 *   FSIZE   : bytes of stack frame the kernel allocates itself.
 *   BOFF    : r1-relative offset of the 8-byte slot that holds beta.
 */
#if defined(ATL_USE64BITS)
/* 64-bit ABIs */
#define M       r3
#define N       r4
#define pA0     r7
#define pB0     r9
#define pC0     r10
#define pC1     r5
#define pC2     r6
#define pC3     r8
#define pfA     r11
#define incAn   r0
#define incCn   r12
#define pfB     r14
#define NEG(i_) -i_  /* 64 bit ABI defines red zone! */
#define FSIZE     0
#define BOFF    -160
#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)
/* 32-bit OS X and AIX */
#define M       r3
#define N       r4
#define pA0     r8
#define pB0     r10
#define pC0     r6
#define pC1     r7
#define pC2     r9
#define pC3     r11
#define pfA     r12
#define incAn   r0
#define incCn   r5
#define pfB     r14
#define BOFF    -160
#define FSIZE    0
#define NEG(i_) -i_
#else
/* remaining case: 32-bit build with neither the OS X nor the AIX assembler */
#define M       r3
#define N       r4
#define pA0     r6
#define pB0     r8
#define pC0     r10
#define pC1     r5
#define pC2     r7
#define pC3     r9
#define pfA     r11
#define incAn   r0
#define incCn   r12
#define pfB     r14
#define NEG(i_) i_
#define FSIZE   172
#define BOFF    FSIZE-8
#endif

/*
 * Floating point register roles.
 *   rA0-rA3 / rB0-rB3 : one set of A and B operands
 *   ra0-ra3 / rb0-rb3 : a second set of A and B operands
 *   rCij              : accumulator for (A operand i) * (B operand j)
 */
#define rA0     f0
#define rA1     f1
#define rA2     f2
#define rA3     f3
#define rB0     f4
#define rB1     f5
#define rB2     f6
#define rB3     f7
#define ra0     f8
#define ra1     f9
#define ra2     f10
#define ra3     f11
#define rb0     f12
#define rb1     f13
#define rb2     f14
#define rb3     f15
#define rC00    f16
#define rC10    f17
#define rC20    f18
#define rC30    f19
#define rC01    f20
#define rC11    f21
#define rC21    f22
#define rC31    f23
#define rC02    f24
#define rC12    f25
#define rC22    f26
#define rC32    f27
#define rC03    f28
#define rC13    f29
#define rC23    f30
#define rC33    f31

/*
 * KBn is n*KB.  KB0 is always forced to 0; KB1 and the KB2..KB7 group are
 * only supplied when not already defined.  KB itself is not defined in
 * this part of the file and has to come from outside it.
 */
#undef KB0
#define KB0      0
#if !defined(KB1)
#define KB1     KB
#endif
#if !defined(KB2)
#define KB2     KB*2
#define KB3     KB*3
#define KB4     KB*4
#define KB5     KB*5
#define KB6     KB*6
#define KB7     KB*7
#endif

/*
 * The disabled section that starts here records where each ATL_USERMM
 * argument arrives under the 32 bit and 64 bit ABIs.
 */
#if 0
*******************************************************************************
32 bit ABIs:
                         r3           r4           r5          r6-r7,f1
void 
ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,                (r6)       r8  (r7)       r9  (r8)      r10  (r9)   56(r1)                const TYPE *A, const int lda, const TYPE *B, const int ldb,                             f2   68(r1)          72(r1)                const TYPE beta, TYPE *C, const int ldc)                                  (r10)    8(r1)*******************************************************************************64 bit ABIs:                         r3           r4           r5             r6/f1void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,                           r7             r8             r9            r10                const TYPE *A, const int lda, const TYPE *B, const int ldb,                             f2   120(r1)        128(r1)                const TYPE beta, TYPE *C, const int ldc)#endif#ifdef ATL_AS_AIX_PPC        .csect .text[PR]        .toc        .csect .text[PR]        .align 3        .globl ATL_USERMM        .globl Mjoin(.,ATL_USERMM)   #ifdef ATL_USE64BITS        .csect ATL_USERMM[DS],3ATL_USERMM:        .llong Mjoin(.,ATL_USERMM)   #else        .csect ATL_USERMM[DS]ATL_USERMM:        .long Mjoin(.,ATL_USERMM), TOC[tc0], 0   #endif        .csect .text[PR]Mjoin(.,ATL_USERMM):#else.text   #ifdef ATL_AS_OSX_PPC	.globl  Mjoin(_,ATL_USERMM)Mjoin(_,ATL_USERMM):   #else      #if defined(ATL_USE64BITS)/* *      Official Program Descripter section, seg fault w/o it on Linux/PPC64 */        .section        ".opd","aw"	.globl  ATL_USERMM        .align  3ATL_USERMM:        .quad   Mjoin(.,ATL_USERMM),.TOC.@tocbase,0        .previous        .type   Mjoin(.,ATL_USERMM),@function        .text	.globl  Mjoin(.,ATL_USERMM)Mjoin(.,ATL_USERMM):      #else	.globl  ATL_USERMMATL_USERMM:      #endif   #endif#endif/*      Save regs */#if defined(ATL_GAS_LINUX_PPC) && !defined(ATL_USE64BITS)        stwu    r1, -FSIZE(r1)#endif        stfd    f14, NEG(8)(r1)        stfd    f15, NEG(16)(r1)        stfd    
f16, NEG(24)(r1)        stfd    f17, NEG(32)(r1)        stfd    f18, NEG(40)(r1)        stfd    f19, NEG(48)(r1)        stfd    f20, NEG(56)(r1)        stfd    f21, NEG(64)(r1)        stfd    f22, NEG(72)(r1)        stfd    f23, NEG(80)(r1)        stfd    f24, NEG(88)(r1)        stfd    f25, NEG(96)(r1)        stfd    f26, NEG(104)(r1)        stfd    f27, NEG(112)(r1)        stfd    f28, NEG(120)(r1)        stfd    f29, NEG(128)(r1)        stfd    f30, NEG(136)(r1)        stfd    f31, NEG(144)(r1)        std     r14, NEG(152)(r1)#ifdef BETAX        stfd    f2, BOFF(r1)#elif defined(BETA0)        xor     pfA, pfA, pfA   #ifdef ATL_USE64BITS        std     pfA, BOFF(r1)   #else        stw     pfA, BOFF(r1)        stw     pfA, 4+BOFF(r1)   #endif#endif#ifdef ATL_USE64BITS        ld      pC0, 120(r1)        ld      incCn, 128(r1)#elif defined(ATL_AS_OSX_PPC) || defined(ATL_AS_AIX_PPC)        lwz     pC0, 68(r1)        lwz     incCn,  72(r1)#else        lwz     incCn,  FSIZE+8(r1)#endif        slwi    incCn, incCn, SHF       /* incCn = ldc*sizeof */        add     pC1, pC0, incCn        add     pC2, pC1, incCn        add     pC3, pC2, incCn        slwi    pfA, M, SHF             /* pfA = M*sizeof() */        slwi    incCn, incCn, 2        sub     incCn, incCn, pfA       /* incCn = ldc*4 - M */        mulli   incAn, M, KB*8          /* incAn = M*KB*sizeof() */        add     pfA, pA0, incAn         /* pfA = A + M*KB */        srwi    M, M, 2                 /* M /= 4 */NLOOP:        addi    pfB, pB0, KB4*8        mtctr   MMLOOP:#ifdef BETA0        lfd     rC00, BOFF(r1)        fmr     rC10, rC00        fmr     rC20, rC00        fmr     rC30, rC00        fmr     rC01, rC00        fmr     rC11, rC00        fmr     rC21, rC00        fmr     rC31, rC00        fmr     rC02, rC00        fmr     rC12, rC00        fmr     rC22, rC00        fmr     rC32, rC00        fmr     rC03, rC00        fmr     rC13, rC00        fmr     rC23, rC00        fmr     rC33, rC00#else   #ifdef 
BETAX        lfd     rb3, BOFF(r1)   #endif        lfd     rC00, 0(pC0)        MulByBeta(rC00, rb3)        lfd     rC10, CMUL(8)(pC0)        MulByBeta(rC10, rb3)        lfd     rC20, CMUL(16)(pC0)        MulByBeta(rC20, rb3)        lfd     rC30, CMUL(24)(pC0)        MulByBeta(rC30, rb3)        lfd     rC01, 0(pC1)        MulByBeta(rC01, rb3)        lfd     rC11, CMUL(8)(pC1)        MulByBeta(rC11, rb3)        lfd     rC21, CMUL(16)(pC1)        MulByBeta(rC21, rb3)        lfd     rC31, CMUL(24)(pC1)        MulByBeta(rC31, rb3)        lfd     rC02, 0(pC2)        MulByBeta(rC02, rb3)        lfd     rC12, CMUL(8)(pC2)        MulByBeta(rC12, rb3)        lfd     rC22, CMUL(16)(pC2)        MulByBeta(rC22, rb3)        lfd     rC32, CMUL(24)(pC2)        MulByBeta(rC32, rb3)        lfd     rC03, 0(pC3)        MulByBeta(rC03, rb3)        lfd     rC13, CMUL(8)(pC3)        MulByBeta(rC13, rb3)        lfd     rC23, CMUL(16)(pC3)        MulByBeta(rC23, rb3)        lfd     rC33, CMUL(24)(pC3)        MulByBeta(rC33, rb3)#endif/* *      Unrolled K loop */        lfd     rA0, 0(pA0)        lfd     rA1, KB*8(pA0)        lfd     rA2, KB2*8(pA0)        lfd     rA3, KB3*8(pA0)        lfd     rB0, 0(pB0)        lfd     rB1, KB*8(pB0)        lfd     rB2, KB2*8(pB0)        lfd     rB3, KB3*8(pB0)#if KB > 1        lfd     ra0, 8(pA0)        lfd     ra1, 8+KB*8(pA0)        lfd     ra2, 8+KB2*8(pA0)        lfd     ra3, 8+KB3*8(pA0)        lfd     rb0, 8(pB0)        lfd     rb1, 8+KB*8(pB0)        lfd     rb2, 8+KB2*8(pB0)#endif#if KB > 2	fmadd	rC00, rA0, rB0, rC00        lfd     rb3, 8+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10                dcbt    0, pfB, 0                addi    pfB, pfB, 128	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 16+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 16+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, 
rB2, rC32	lfd	rB2, 16+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 16+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 16+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 16+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 16+KB3*8(pA0)#endif#if KB > 3	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 16+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 24+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 24+KB1*8(pB0)	fmadd	rC02, ra0, rb2, rC02	fmadd	rC12, ra1, rb2, rC12	fmadd	rC22, ra2, rb2, rC22	fmadd	rC32, ra3, rb2, rC32	lfd	rb2, 24+KB2*8(pB0)	fmadd	rC03, ra0, rb3, rC03	lfd	ra0, 24+KB0*8(pA0)	fmadd	rC13, ra1, rb3, rC13	lfd	ra1, 24+KB1*8(pA0)	fmadd	rC23, ra2, rb3, rC23	lfd	ra2, 24+KB2*8(pA0)	fmadd	rC33, ra3, rb3, rC33	lfd	ra3, 24+KB3*8(pA0)#endif#if KB > 4	fmadd	rC00, rA0, rB0, rC00	lfd	rb3, 24+KB3*8(pB0)	fmadd	rC10, rA1, rB0, rC10	fmadd	rC20, rA2, rB0, rC20	fmadd	rC30, rA3, rB0, rC30	lfd	rB0, 32+KB0*8(pB0)	fmadd	rC01, rA0, rB1, rC01	fmadd	rC11, rA1, rB1, rC11	fmadd	rC21, rA2, rB1, rC21	fmadd	rC31, rA3, rB1, rC31	lfd	rB1, 32+KB1*8(pB0)	fmadd	rC02, rA0, rB2, rC02	fmadd	rC12, rA1, rB2, rC12	fmadd	rC22, rA2, rB2, rC22	fmadd	rC32, rA3, rB2, rC32	lfd	rB2, 32+KB2*8(pB0)	fmadd	rC03, rA0, rB3, rC03	lfd	rA0, 32+KB0*8(pA0)	fmadd	rC13, rA1, rB3, rC13	lfd	rA1, 32+KB1*8(pA0)	fmadd	rC23, rA2, rB3, rC23	lfd	rA2, 32+KB2*8(pA0)	fmadd	rC33, rA3, rB3, rC33	lfd	rA3, 32+KB3*8(pA0)#endif#if KB > 5	fmadd	rC00, ra0, rb0, rC00	lfd	rB3, 32+KB3*8(pB0)	fmadd	rC10, ra1, rb0, rC10	fmadd	rC20, ra2, rb0, rC20	fmadd	rC30, ra3, rb0, rC30	lfd	rb0, 40+KB0*8(pB0)	fmadd	rC01, ra0, rb1, rC01	fmadd	rC11, ra1, rb1, rC11	fmadd	rC21, ra2, rb1, rC21	fmadd	rC31, ra3, rb1, rC31	lfd	rb1, 40+KB1*8(pB0)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?