atl_smm4x4x128_av.c

From the collection 「基于Blas CLapck的.用过的人知道是干啥的」 ("based on BLAS/CLAPACK — anyone who has used it knows what it's for") · C source · 3,072 lines total · page 1 of 5

/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2007 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributors may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include "atlas_asm.h"

#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC)
   #error "This kernel requires OS X or Linux PPC assembler!"
#endif
#if !defined(KB) || KB == 0
   #error "This kernel requires KB be a compile-time constant!"
#endif
#ifndef MB
   #define MB 0
#endif

#ifdef DCPLX
   #define CMUL(i_) ((i_)*2)
   #define SHF  3
#else
   #define CMUL(i_) i_
   #define SHF  2
#endif

#ifdef ATL_USE64BITS
   #define slwi         sldi
   #define srwi         srdi
   #define cmpwi        cmpdi
#else
   #define std  stw
   #define ld   lwz
#endif

#ifdef ATL_AS_OSX_PPC
   #define M       r3
   #define N       r4
   #define pA0     r7
   #define pB0     r9
   #define pC0     r6
   #define ldc     r8
   #define ldc2    r10
   #define ldc3    r11
   #define pfA     r12
   #define incAn   r0
   #define incCn   r5
#elif defined(ATL_USE64BITS)
   #define M       r3
   #define N       r4
   #define pA0     r7
   #define pB0     r9
   #define pC0     r10
   #define ldc     r5
   #define ldc2    r6
   #define ldc3    r8
   #define pfA     r11
   #define incAn   r0
   #define incCn   r12
#else
   #define M       r3
   #define N       r4
   #define pA0     r6
   #define pB0     r8
   #define pC0     r10
   #define ldc     r5
   #define ldc2    r7
   #define ldc3    r9
   #define pfA     r11
   #define incAn   r0
   #define incCn   r12
#endif
#define pfB     r14
#define k0      r15
#define k1      r16
#define k2      r17
#define k3      r18
#define pBETA   r19
/*
 * These next 7 defines only used by unaligned-C kernel
 */
#define nxtC0   r20
#define nxtC1   r21
#define nxtC2   r22
#define nxtC3   r23
#define off4    r24
#define off8    r25
#define off12   r26

#if defined(ATL_USE64BITS)
   #define FSIZE   320
   #define FST     48
#else
   #define FSIZE   304
   #define FST     32
#endif
#define BOFF    FSIZE-16

#define vA0     v0
#define vA1     v1
#define vA2     v2
#define vA3     v3
#define vB0     v4
#define vB1     v5
#define vB2     v6
#define vB3     v7
#define va0     v8
#define va1     v9
#define va2     v10
#define va3     v11
#define vb0     v12
#define vb1     v13
#define vb2     v14
#define vb3     v15
#define vC00    v16
#define vC10    v17
#define vC20    v18
#define vC30    v19
#define vC01    v20
#define vC11    v21
#define vC21    v22
#define vC31    v23
#define vC02    v24
#define vC12    v25
#define vC22    v26
#define vC32    v27
#define vC03    v28
#define vC13    v29
#define vC23    v30
#define vC33    v31

#ifndef KB1
   #define KB1     KB
#endif
#ifndef KB2
   #define KB2     KB*2
   #define KB3     KB*3
   #define KB4     KB*4
   #define KB5     KB*5
   #define KB6     KB*6
   #define KB7     KB*7
#endif
#if 0
*******************************************************************************
32 bit ABIs: (linux in parentheses)
                         r3           r4           r5             r6,f1
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
                (r6)       r7  (r7)       r8  (r8)       r9  (r9)      r10
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                             f2   68(r1)          72(r1)
                const TYPE beta, TYPE *C, const int ldc)
*******************************************************************************
64 bit ABIs:
                         r3           r4           r5             r6/f1
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
                           r7             r8             r9            r10
                const TYPE *A, const int lda, const TYPE *B, const int ldb,
                             f2   120(r1)        128(r1)
                const TYPE beta, TYPE *C, const int ldc)
#endif
.text
#ifdef ATL_AS_OSX_PPC
        .globl  Mjoin(_,ATL_USERMM)
Mjoin(_,ATL_USERMM):
#else
   #if defined(ATL_USE64BITS)
/*
 *      Official Program Descriptor section; seg fault w/o it on Linux/PPC64
 */
        .section        ".opd","aw"
        .align  2
        .globl  ATL_USERMM
        .align  3
ATL_USERMM:
        .quad   Mjoin(.,ATL_USERMM),.TOC.@tocbase,0
        .previous
        .type   Mjoin(.,ATL_USERMM),@function
        .text
        .globl  Mjoin(.,ATL_USERMM)
Mjoin(.,ATL_USERMM):
   #else
        .globl  ATL_USERMM
ATL_USERMM:
   #endif
#endif
/*
 *      If C is unaligned or ldc would cause misalignment, run unaligned
 *      kernel instead
 */
#if defined (ATL_USE64BITS)
        ld      r10, 120(r1)
        ld      r5, 128(r1)
#elif defined(ATL_AS_OSX_PPC)
        lwz     r10, 60(r1)
        lwz     r5,  64(r1)
#else
        lwz     r5,  8(r1)
#endif
        andi.   r0, r10, 0xF    /* looking for 1s in last 4 bits of ptr */
        andi.   r5, r5, 0x3     /* ldc will be scaled by sizeof, so only last 2 bits */
        or.     r5, r5, r0      /* either one got 1s? */
        bne     UNALIGNED_C
/*      Save regs */
#if defined(ATL_USE64BITS)
        stdu    r1, -FSIZE(r1)
#else
        stwu    r1, -FSIZE(r1)
#endif
        std     r14, FST(r1)
        std     r15, FST+8(r1)
        std     r16, FST+16(r1)
        std     r17, FST+24(r1)
        std     r18, FST+32(r1)
        std     r19, FST+40(r1)
        std     r20, FST+48(r1)
        mfspr   r14, VRsave
        std     r14, FST+56(r1)
        li      r14, FST+64
        stvx    v20, r1, r14
        addi    r14, r14, 16
        stvx    v21, r1, r14
        addi    r14, r14, 16
        stvx    v22, r1, r14
        addi    r14, r14, 16
        stvx    v23, r1, r14
        addi    r14, r14, 16
        stvx    v24, r1, r14
        addi    r14, r14, 16
        stvx    v25, r1, r14
        addi    r14, r14, 16
        stvx    v26, r1, r14
        addi    r14, r14, 16
        stvx    v27, r1, r14
        addi    r14, r14, 16
        stvx    v28, r1, r14
        addi    r14, r14, 16
        stvx    v29, r1, r14
        addi    r14, r14, 16
        stvx    v30, r1, r14
        addi    r14, r14, 16
        stvx    v31, r1, r14
        vxor    v0, v0, v0      /* zero v0 */
        mtvscr  v0              /* force IEEE compliance */
#ifdef BETAX
        addi    pBETA, r1, BOFF
        stfs    f2, BOFF(r1)
        lvewx   v0, 0, pBETA
        vspltw  v0, v0, 0
        stvx    v0, 0, pBETA
#endif
        eqv     r0, r0, r0      /* all 1s */
        mtspr   VRsave, r0      /* signal we use all vector regs */
#if defined (ATL_USE64BITS)
        ld      pC0, FSIZE+120(r1)
        ld      ldc, FSIZE+128(r1)
#elif defined(ATL_AS_OSX_PPC)
        lwz     pC0, FSIZE+60(r1)
        lwz     ldc,  FSIZE+64(r1)
#else
        lwz     ldc,  FSIZE+8(r1)
#endif
        slwi    ldc, ldc, SHF           /* ldc = ldc*sizeof */
        add     ldc2, ldc, ldc
        add     ldc3, ldc2, ldc
        slwi    pfA, M, SHF             /* pfA = M*sizeof() */
        slwi    incCn, ldc, 2
        sub     incCn, incCn, pfA       /* incCn = ldc*4 - M */
        mulli   incAn, M, KB*4          /* incAn = M*KB*sizeof() */
        add     pfA, pA0, incAn         /* pfA = A + M*KB */
        srwi    M, M, 2                 /* M /= 4 */
        addi    M, M, -1
// pA0 = pA0 - incAn + KB4*4 = pA0 - (incAn - KB*4)
        mr      k1, incAn
        addi    incAn, k1, -KB4*4
        addi    incCn, incCn, 16
#if MB == 0
        cmpwi   cr5, M, 0
#endif
//	.align 5
NLOOP:
        addi    pfB, pB0, KB4*4
        mtctr   M
#if MB == 0
        beq-    cr5, MPEELED
#endif
        xor     k0, k0, k0
        li      k1, KB*4
        li      k2, 2*KB*4
        li      k3, 3*KB*4
        lvx     vB0, 0, pB0
        lvx     vA0, 0, pA0
        lvx     vA1, pA0, k1
        lvx     vA2, pA0, k2
        lvx     vA3, pA0, k3
        lvx     vB1, pB0, k1
        lvx     vB2, pB0, k2
        lvx     vB3, pB0, k3
        vxor    vC33, vC33, vC33
#if KB > 4
        addi    k0, k0, 16
        addi    k1, k1, 16
        addi    k2, k2, 16
        addi    k3, k3, 16
        lvx     vb0, pB0, k0
        lvx     va0, pA0, k0
        lvx     va1, pA0, k1
        lvx     va2, pA0, k2
        lvx     va3, pA0, k3
        lvx     vb1, pB0, k1
        lvx     vb2, pB0, k2
        lvx     vb3, pB0, k3
#endif
#if MB == 0 || MB > 4
MLOOP:
/* Begin KLOOP */
#if KB > 0
      #if KB > 8
        addi    k0, k0, 16
      #endif
        vmaddfp vC00, vA0, vB0, vC33
      #if KB > 8
        addi    k1, k1, 16
      #endif
        vmaddfp vC10, vA1, vB0, vC33
      #if KB > 8
        addi    k2, k2, 16
      #endif
        vmaddfp vC20, vA2, vB0, vC33
      #if KB > 8
        addi    k3, k3, 16
      #endif
        vmaddfp vC30, vA3, vB0, vC33
      #if KB > 8
        lvx     vB0, pB0, k0
      #endif
        vmaddfp vC01, vA0, vB1, vC33
        vmaddfp vC11, vA1, vB1, vC33
                        dcbt    0, pfA, 0
        vmaddfp vC21, vA2, vB1, vC33
                        addi    pfA, pfA, 64
        vmaddfp vC31, vA3, vB1, vC33
      #if KB > 8
        lvx     vB1, pB0, k1
      #endif
        vmaddfp vC02, vA0, vB2, vC33
        vmaddfp vC12, vA1, vB2, vC33
        vmaddfp vC22, vA2, vB2, vC33
        vmaddfp vC32, vA3, vB2, vC33
      #if KB > 8
        lvx     vB2, pB0, k2
      #endif
        vmaddfp vC03, vA0, vB3, vC33
      #if KB > 8
        lvx     vA0, pA0, k0
      #endif
        vmaddfp vC13, vA1, vB3, vC33
      #if KB > 8
        lvx     vA1, pA0, k1
      #endif
        vmaddfp vC23, vA2, vB3, vC33
      #if KB > 8
        lvx     vA2, pA0, k2
      #endif
        vmaddfp vC33, vA3, vB3, vC33
      #if KB > 8
        lvx     vA3, pA0, k3
      #endif
      #if KB > 8
        lvx     vB3, pB0, k3
      #endif
#endif  /* end K=0 block */
#if KB > 4
   #if KB > 12
        addi    k0, k0, 16
   #endif
        vmaddfp vC00, va0, vb0, vC00
   #if KB > 12
        addi    k1, k1, 16
   #endif
        vmaddfp vC10, va1, vb0, vC10
   #if KB > 12
        addi    k2, k2, 16
   #endif
        vmaddfp vC20, va2, vb0, vC20
   #if KB > 12
        addi    k3, k3, 16
   #endif
        vmaddfp vC30, va3, vb0, vC30
   #if KB > 12
        lvx     vb0, pB0, k0
   #endif
        vmaddfp vC01, va0, vb1, vC01
        vmaddfp vC11, va1, vb1, vC11
        vmaddfp vC21, va2, vb1, vC21
        vmaddfp vC31, va3, vb1, vC31
   #if KB > 12
        lvx     vb1, pB0, k1
   #endif
        vmaddfp vC02, va0, vb2, vC02
        vmaddfp vC12, va1, vb2, vC12
        vmaddfp vC22, va2, vb2, vC22
        vmaddfp vC32, va3, vb2, vC32
   #if KB > 12
        lvx     vb2, pB0, k2
   #endif
        vmaddfp vC03, va0, vb3, vC03
   #if KB > 12
        lvx     va0, pA0, k0
   #endif
        vmaddfp vC13, va1, vb3, vC13
   #if KB > 12
        lvx     va1, pA0, k1
   #endif
        vmaddfp vC23, va2, vb3, vC23
   #if KB > 12
        lvx     va2, pA0, k2
   #endif
        vmaddfp vC33, va3, vb3, vC33
   #if KB > 12
        lvx     va3, pA0, k3
   #endif
   #if KB > 12
        lvx     vb3, pB0, k3
   #endif
#endif  /* end K=4 block */
#if KB > 8
   #if KB > 16
        addi    k0, k0, 16
   #endif
        vmaddfp vC00, vA0, vB0, vC00
   #if KB > 16
        addi    k1, k1, 16
   #endif
        vmaddfp vC10, vA1, vB0, vC10
   #if KB > 16
        addi    k2, k2, 16
   #endif
        vmaddfp vC20, vA2, vB0, vC20
   #if KB > 16
        addi    k3, k3, 16
   #endif
        vmaddfp vC30, vA3, vB0, vC30
   #if KB > 16
        lvx     vB0, pB0, k0
   #endif
        vmaddfp vC01, vA0, vB1, vC01
        vmaddfp vC11, vA1, vB1, vC11
        vmaddfp vC21, vA2, vB1, vC21
        vmaddfp vC31, vA3, vB1, vC31
   #if KB > 16
        lvx     vB1, pB0, k1
   #endif
        vmaddfp vC02, vA0, vB2, vC02
        vmaddfp vC12, vA1, vB2, vC12
        vmaddfp vC22, vA2, vB2, vC22
        vmaddfp vC32, vA3, vB2, vC32
   #if KB > 16
        lvx     vB2, pB0, k2
   #endif
        vmaddfp vC03, vA0, vB3, vC03
   #if KB > 16
        lvx     vA0, pA0, k0
   #endif
        vmaddfp vC13, vA1, vB3, vC13
   #if KB > 16
        lvx     vA1, pA0, k1
   #endif
        vmaddfp vC23, vA2, vB3, vC23
   #if KB > 16
        lvx     vA2, pA0, k2
   #endif
        vmaddfp vC33, vA3, vB3, vC33
   #if KB > 16
        lvx     vA3, pA0, k3
   #endif
   #if KB > 16
        lvx     vB3, pB0, k3
   #endif
#endif  /* end K=8 block */
#if KB > 12
   #if KB > 20
        addi    k0, k0, 16
   #endif
        vmaddfp vC00, va0, vb0, vC00
   #if KB > 20
        addi    k1, k1, 16
   #endif
        vmaddfp vC10, va1, vb0, vC10
   #if KB > 20
        addi    k2, k2, 16
   #endif
        vmaddfp vC20, va2, vb0, vC20
   #if KB > 20
        addi    k3, k3, 16
   #endif
        vmaddfp vC30, va3, vb0, vC30
   #if KB > 20
        lvx     vb0, pB0, k0
   #endif
        vmaddfp vC01, va0, vb1, vC01
        vmaddfp vC11, va1, vb1, vC11
        vmaddfp vC21, va2, vb1, vC21
        vmaddfp vC31, va3, vb1, vC31
   #if KB > 20
        lvx     vb1, pB0, k1
   #endif
        vmaddfp vC02, va0, vb2, vC02
        vmaddfp vC12, va1, vb2, vC12
        vmaddfp vC22, va2, vb2, vC22
        vmaddfp vC32, va3, vb2, vC32
   #if KB > 20
        lvx     vb2, pB0, k2
   #endif
        vmaddfp vC03, va0, vb3, vC03
   #if KB > 20
        lvx     va0, pA0, k0
   #endif
        vmaddfp vC13, va1, vb3, vC13
   #if KB > 20
        lvx     va1, pA0, k1
   #endif
        vmaddfp vC23, va2, vb3, vC23
   #if KB > 20
        lvx     va2, pA0, k2
   #endif
        vmaddfp vC33, va3, vb3, vC33
   #if KB > 20
        lvx     va3, pA0, k3
   #endif
   #if KB > 20
        lvx     vb3, pB0, k3
   #endif
#endif  /* end K=12 block */
#if KB > 16
   #if KB > 24
        addi    k0, k0, 16
   #endif
        vmaddfp vC00, vA0, vB0, vC00
   #if KB > 24
        addi    k1, k1, 16
   #endif
        vmaddfp vC10, vA1, vB0, vC10
   #if KB > 24
        addi    k2, k2, 16
   #endif
        vmaddfp vC20, vA2, vB0, vC20
   #if KB > 24
        addi    k3, k3, 16
   #endif
        vmaddfp vC30, vA3, vB0, vC30
   #if KB > 24
        lvx     vB0, pB0, k0
   #endif
        vmaddfp vC01, vA0, vB1, vC01
        vmaddfp vC11, vA1, vB1, vC11
        vmaddfp vC21, vA2, vB1, vC21
        vmaddfp vC31, vA3, vB1, vC31
   #if KB > 24
        lvx     vB1, pB0, k1
   #endif
        vmaddfp vC02, vA0, vB2, vC02
        vmaddfp vC12, vA1, vB2, vC12
        vmaddfp vC22, vA2, vB2, vC22
        vmaddfp vC32, vA3, vB2, vC32
   #if KB > 24
        lvx     vB2, pB0, k2
   #endif
        vmaddfp vC03, vA0, vB3, vC03
   #if KB > 24
        lvx     vA0, pA0, k0
   #endif
        vmaddfp vC13, vA1, vB3, vC13
   #if KB > 24
        lvx     vA1, pA0, k1
   #endif
        vmaddfp vC23, vA2, vB3, vC23
   #if KB > 24
        lvx     vA2, pA0, k2
   #endif
        vmaddfp vC33, vA3, vB3, vC33
   #if KB > 24
        lvx     vA3, pA0, k3
   #endif
   #if KB > 24
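The listing breaks off here (page 1 of 5). For orientation, the ABI comment in
the source gives the kernel's C prototype. Below is a minimal scalar sketch of
the operation this file computes, assuming ATLAS's usual mm-kernel convention
(A stored transposed as K x M with lda = KB, B stored K x N with ldb = KB, C
column-major M x N with leading dimension ldc); ref_usermm is a hypothetical
name used only for illustration. Real ATLAS kernels are compiled separately per
beta case (BETA0/BETA1/BETAX, as the BETAX block above suggests), whereas this
sketch handles alpha and beta generically:

/* Hypothetical scalar reference for the 4x4 AltiVec kernel above,
 * under the assumed ATLAS kernel data layout. */
static void ref_usermm(int M, int N, int K, float alpha,
                       const float *A, int lda, const float *B, int ldb,
                       float beta, float *C, int ldc)
{
    for (int j = 0; j < N; j++)             /* columns of C */
        for (int i = 0; i < M; i++) {       /* rows of C */
            float dot = 0.0f;
            for (int k = 0; k < K; k++)     /* K-dimension dot product */
                dot += A[i*lda + k] * B[j*ldb + k];
            C[j*ldc + i] = alpha*dot + beta*C[j*ldc + i];
        }
}

The assembly computes the same result 4 rows by 4 columns at a time, with the
K loop unrolled in chunks of 4 vector elements and the next A/B vectors loaded
(lvx) while the current vmaddfp fused multiply-adds are in flight.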

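The entry code's alignment test (the two andi. instructions) picks between the
fast path and UNALIGNED_C: the C pointer must sit on a 16-byte boundary, and
ldc must keep every column 16-byte aligned once scaled by sizeof(float), i.e.
its low 2 bits must be clear. A sketch of the same predicate in C, with
use_aligned_kernel a hypothetical helper name, not part of ATLAS:

#include <stdint.h>

/* Mirrors the kernel's entry test: take the aligned path only if the C
 * pointer is 16-byte aligned (ptr & 0xF == 0) and ldc*4 bytes is a
 * multiple of 16 (ldc & 0x3 == 0) -- matching andi. r0,r10,0xF and
 * andi. r5,r5,0x3 followed by or. / bne UNALIGNED_C. */
static int use_aligned_kernel(const float *C, int ldc)
{
    return (((uintptr_t)C & 0xF) | ((unsigned)ldc & 0x3)) == 0;
}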