/*
 * atl_smm4x4x128_av.c -- ATLAS 4x4 AltiVec single-precision GEMM kernel.
 * (Recovered from a web code-viewer extract: page 1 of 5, ~3,072 lines total;
 *  the remainder of the file is not present in this chunk.)
 */
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2007 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#include "atlas_asm.h"#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC) #error "This kernel requires OS X or Linux PPC assembler!"#endif#if !defined(KB) || KB == 0 #error "This kernel requires KB be a compile-time constant!"#endif#ifndef MB #define MB 0#endif#ifdef DCPLX #define CMUL(i_) ((i_)*2) #define SHF 3#else #define CMUL(i_) i_ #define SHF 2#endif#ifdef ATL_USE64BITS #define slwi sldi #define srwi srdi #define cmpwi cmpdi#else #define std stw #define ld lwz#endif#ifdef ATL_AS_OSX_PPC #define M r3 #define N r4 #define pA0 r7 #define pB0 r9 #define pC0 r6 #define ldc r8 #define ldc2 r10 #define ldc3 r11 #define pfA r12 #define incAn r0 #define incCn r5#elif defined(ATL_USE64BITS) #define M r3 #define N r4 #define pA0 r7 #define pB0 r9 #define pC0 r10 #define ldc r5 #define ldc2 r6 #define ldc3 r8 #define pfA r11 #define incAn r0 #define incCn r12#else #define M r3 #define N r4 #define pA0 r6 #define pB0 r8 #define pC0 r10 #define ldc r5 #define ldc2 r7 #define ldc3 r9 #define pfA r11 #define incAn r0 #define incCn r12#endif#define pfB r14#define k0 r15#define k1 r16#define k2 r17#define k3 r18#define pBETA r19/* * These next 7 defines only used by unaligned-C kernel */#define nxtC0 r20#define nxtC1 r21#define nxtC2 r22#define nxtC3 r23#define off4 r24#define off8 r25#define off12 r26#if defined(ATL_USE64BITS) #define FSIZE 320 #define FST 48#else #define FSIZE 304 #define FST 32#endif#define BOFF FSIZE-16#define vA0 v0#define vA1 v1#define vA2 v2#define vA3 v3#define vB0 v4#define vB1 v5#define vB2 v6#define vB3 v7#define va0 v8#define va1 v9#define va2 v10#define va3 v11#define vb0 v12#define vb1 v13#define vb2 v14#define vb3 v15#define vC00 v16#define vC10 v17#define vC20 v18#define vC30 v19#define vC01 v20#define vC11 v21#define vC21 v22#define vC31 v23#define vC02 v24#define vC12 v25#define vC22 v26#define vC32 v27#define vC03 v28#define vC13 v29#define vC23 v30#define vC33 v31#ifndef KB1 #define KB1 KB#endif#ifndef KB2 #define KB2 KB*2 
#define KB3 KB*3 #define KB4 KB*4 #define KB5 KB*5 #define KB6 KB*6 #define KB7 KB*7#endif#if 0*******************************************************************************32 bit ABIs: (linux in parenthesis) r3 r4 r5 r6,f1void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, (r6) r7 (r7) r8 (r8) r9 (r9) r10 const TYPE *A, const int lda, const TYPE *B, const int ldb, f2 68(r1) 72(r1) const TYPE beta, TYPE *C, const int ldc) (r10) 8(r1)*******************************************************************************64 bit ABIs: r3 r4 r5 r6/f1void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, r7 r8 r9 r10 const TYPE *A, const int lda, const TYPE *B, const int ldb, f2 120(r1) 128(r1) const TYPE beta, TYPE *C, const int ldc)#endif.text#ifdef ATL_AS_OSX_PPC .globl Mjoin(_,ATL_USERMM)Mjoin(_,ATL_USERMM):#else #if defined(ATL_USE64BITS)/* * Official Program Descripter section, seg fault w/o it on Linux/PPC64 */ .section ".opd","aw" .align 2 .globl ATL_USERMM .align 3ATL_USERMM: .quad Mjoin(.,ATL_USERMM),.TOC.@tocbase,0 .previous .type Mjoin(.,ATL_USERMM),@function .text .globl Mjoin(.,ATL_USERMM)Mjoin(.,ATL_USERMM): #else .globl ATL_USERMMATL_USERMM: #endif#endif/* * If C is unaligned or ldc would cause misalignment, run unaligned * kernel instead */#if defined (ATL_USE64BITS) ld r10, 120(r1) ld r5, 128(r1)#elif defined(ATL_AS_OSX_PPC) lwz r10, 60(r1) lwz r5, 64(r1)#else lwz r5, 8(r1)#endif andi. r0, r10, 0xF /* looking for 1s in last 4 digits of ptr */ andi. r5, r5, 0x3 /* ldc will be *sizeof, so only last 2 dig */ or. r5, r5, r0 /* either one got 1s? 
*/ bne UNALIGNED_C/* Save regs */#if defined(ATL_USE64BITS) stdu r1, -FSIZE(r1)#else stwu r1, -FSIZE(r1)#endif std r14, FST(r1) std r15, FST+8(r1) std r16, FST+16(r1) std r17, FST+24(r1) std r18, FST+32(r1) std r19, FST+40(r1) std r20, FST+48(r1) mfspr r14, VRsave std r14, FST+56(r1) li r14, FST+64 stvx v20, r1, r14 addi r14, r14, 16 stvx v21, r1, r14 addi r14, r14, 16 stvx v22, r1, r14 addi r14, r14, 16 stvx v23, r1, r14 addi r14, r14, 16 stvx v24, r1, r14 addi r14, r14, 16 stvx v25, r1, r14 addi r14, r14, 16 stvx v26, r1, r14 addi r14, r14, 16 stvx v27, r1, r14 addi r14, r14, 16 stvx v28, r1, r14 addi r14, r14, 16 stvx v29, r1, r14 addi r14, r14, 16 stvx v30, r1, r14 addi r14, r14, 16 stvx v31, r1, r14 vxor v0, v0, v0 /* zero v0 */ mtvscr v0 /* force IEEE compliance */#ifdef BETAX addi pBETA, r1, BOFF stfs f2, BOFF(r1) lvewx v0, 0, pBETA vspltw v0, v0, 0 stvx v0, 0, pBETA#endif eqv r0, r0, r0 /* all 1s */ mtspr VRsave, r0 /* signal we use all vector regs */#if defined (ATL_USE64BITS) ld pC0, FSIZE+120(r1) ld ldc, FSIZE+128(r1)#elif defined(ATL_AS_OSX_PPC) lwz pC0, FSIZE+60(r1) lwz ldc, FSIZE+64(r1)#else lwz ldc, FSIZE+8(r1)#endif slwi ldc, ldc, SHF /* ldc = ldc*sizeof */ add ldc2, ldc, ldc add ldc3, ldc2, ldc slwi pfA, M, SHF /* pfA = M*sizeof() */ slwi incCn, ldc, 2 sub incCn, incCn, pfA /* incCn = ldc*4 - M */ mulli incAn, M, KB*4 /* incAn = M*KB*sizeof() */ add pfA, pA0, incAn /* pfA = A + M*KB */ srwi M, M, 2 /* M /= 4 */ addi M, M, -1// pA0 = pA0 - incAn + KB4*4 = pA0 -(incAn - KB*4) mr k1, incAn addi incAn, k1, -KB4*4 addi incCn, incCn, 16#if MB == 0 cmpwi cr5, M, 0#endif// .align 5NLOOP: addi pfB, pB0, KB4*4 mtctr M#if MB == 0 beq- cr5, MPEELED#endif xor k0, k0, k0 li k1, KB*4 li k2, 2*KB*4 li k3, 3*KB*4 lvx vB0, 0, pB0 lvx vA0, 0, pA0 lvx vA1, pA0, k1 lvx vA2, pA0, k2 lvx vA3, pA0, k3 lvx vB1, pB0, k1 lvx vB2, pB0, k2 lvx vB3, pB0, k3 vxor vC33, vC33, vC33#if KB > 4 addi k0, k0, 16 addi k1, k1, 16 addi k2, k2, 16 addi k3, k3, 16 lvx vb0, pB0, k0 lvx va0, 
pA0, k0 lvx va1, pA0, k1 lvx va2, pA0, k2 lvx va3, pA0, k3 lvx vb1, pB0, k1 lvx vb2, pB0, k2 lvx vb3, pB0, k3#endif#if MB == 0 || MB > 4MLOOP:/* Begin KLOOP */#if KB > 0 #if KB > 8 addi k0, k0, 16 #endif vmaddfp vC00, vA0, vB0, vC33 #if KB > 8 addi k1, k1, 16 #endif vmaddfp vC10, vA1, vB0, vC33 #if KB > 8 addi k2, k2, 16 #endif vmaddfp vC20, vA2, vB0, vC33 #if KB > 8 addi k3, k3, 16 #endif vmaddfp vC30, vA3, vB0, vC33 #if KB > 8 lvx vB0, pB0, k0 #endif vmaddfp vC01, vA0, vB1, vC33 vmaddfp vC11, vA1, vB1, vC33 dcbt 0, pfA, 0 vmaddfp vC21, vA2, vB1, vC33 addi pfA, pfA, 64 vmaddfp vC31, vA3, vB1, vC33 #if KB > 8 lvx vB1, pB0, k1 #endif vmaddfp vC02, vA0, vB2, vC33 vmaddfp vC12, vA1, vB2, vC33 vmaddfp vC22, vA2, vB2, vC33 vmaddfp vC32, vA3, vB2, vC33 #if KB > 8 lvx vB2, pB0, k2 #endif vmaddfp vC03, vA0, vB3, vC33 #if KB > 8 lvx vA0, pA0, k0 #endif vmaddfp vC13, vA1, vB3, vC33 #if KB > 8 lvx vA1, pA0, k1 #endif vmaddfp vC23, vA2, vB3, vC33 #if KB > 8 lvx vA2, pA0, k2 #endif vmaddfp vC33, vA3, vB3, vC33 #if KB > 8 lvx vA3, pA0, k3 #endif #if KB > 8 lvx vB3, pB0, k3 #endif#endif /* end K=0 block */#if KB > 4 #if KB > 12 addi k0, k0, 16 #endif vmaddfp vC00, va0, vb0, vC00 #if KB > 12 addi k1, k1, 16 #endif vmaddfp vC10, va1, vb0, vC10 #if KB > 12 addi k2, k2, 16 #endif vmaddfp vC20, va2, vb0, vC20 #if KB > 12 addi k3, k3, 16 #endif vmaddfp vC30, va3, vb0, vC30 #if KB > 12 lvx vb0, pB0, k0 #endif vmaddfp vC01, va0, vb1, vC01 vmaddfp vC11, va1, vb1, vC11 vmaddfp vC21, va2, vb1, vC21 vmaddfp vC31, va3, vb1, vC31 #if KB > 12 lvx vb1, pB0, k1 #endif vmaddfp vC02, va0, vb2, vC02 vmaddfp vC12, va1, vb2, vC12 vmaddfp vC22, va2, vb2, vC22 vmaddfp vC32, va3, vb2, vC32 #if KB > 12 lvx vb2, pB0, k2 #endif vmaddfp vC03, va0, vb3, vC03 #if KB > 12 lvx va0, pA0, k0 #endif vmaddfp vC13, va1, vb3, vC13 #if KB > 12 lvx va1, pA0, k1 #endif vmaddfp vC23, va2, vb3, vC23 #if KB > 12 lvx va2, pA0, k2 #endif vmaddfp vC33, va3, vb3, vC33 #if KB > 12 lvx va3, pA0, k3 #endif #if KB > 12 lvx vb3, 
pB0, k3 #endif#endif /* end K=4 block */#if KB > 8 #if KB > 16 addi k0, k0, 16 #endif vmaddfp vC00, vA0, vB0, vC00 #if KB > 16 addi k1, k1, 16 #endif vmaddfp vC10, vA1, vB0, vC10 #if KB > 16 addi k2, k2, 16 #endif vmaddfp vC20, vA2, vB0, vC20 #if KB > 16 addi k3, k3, 16 #endif vmaddfp vC30, vA3, vB0, vC30 #if KB > 16 lvx vB0, pB0, k0 #endif vmaddfp vC01, vA0, vB1, vC01 vmaddfp vC11, vA1, vB1, vC11 vmaddfp vC21, vA2, vB1, vC21 vmaddfp vC31, vA3, vB1, vC31 #if KB > 16 lvx vB1, pB0, k1 #endif vmaddfp vC02, vA0, vB2, vC02 vmaddfp vC12, vA1, vB2, vC12 vmaddfp vC22, vA2, vB2, vC22 vmaddfp vC32, vA3, vB2, vC32 #if KB > 16 lvx vB2, pB0, k2 #endif vmaddfp vC03, vA0, vB3, vC03 #if KB > 16 lvx vA0, pA0, k0 #endif vmaddfp vC13, vA1, vB3, vC13 #if KB > 16 lvx vA1, pA0, k1 #endif vmaddfp vC23, vA2, vB3, vC23 #if KB > 16 lvx vA2, pA0, k2 #endif vmaddfp vC33, vA3, vB3, vC33 #if KB > 16 lvx vA3, pA0, k3 #endif #if KB > 16 lvx vB3, pB0, k3 #endif#endif /* end K=8 block */#if KB > 12 #if KB > 20 addi k0, k0, 16 #endif vmaddfp vC00, va0, vb0, vC00 #if KB > 20 addi k1, k1, 16 #endif vmaddfp vC10, va1, vb0, vC10 #if KB > 20 addi k2, k2, 16 #endif vmaddfp vC20, va2, vb0, vC20 #if KB > 20 addi k3, k3, 16 #endif vmaddfp vC30, va3, vb0, vC30 #if KB > 20 lvx vb0, pB0, k0 #endif vmaddfp vC01, va0, vb1, vC01 vmaddfp vC11, va1, vb1, vC11 vmaddfp vC21, va2, vb1, vC21 vmaddfp vC31, va3, vb1, vC31 #if KB > 20 lvx vb1, pB0, k1 #endif vmaddfp vC02, va0, vb2, vC02 vmaddfp vC12, va1, vb2, vC12 vmaddfp vC22, va2, vb2, vC22 vmaddfp vC32, va3, vb2, vC32 #if KB > 20 lvx vb2, pB0, k2 #endif vmaddfp vC03, va0, vb3, vC03 #if KB > 20 lvx va0, pA0, k0 #endif vmaddfp vC13, va1, vb3, vC13 #if KB > 20 lvx va1, pA0, k1 #endif vmaddfp vC23, va2, vb3, vC23 #if KB > 20 lvx va2, pA0, k2 #endif vmaddfp vC33, va3, vb3, vC33 #if KB > 20 lvx va3, pA0, k3 #endif #if KB > 20 lvx vb3, pB0, k3 #endif#endif /* end K=12 block */#if KB > 16 #if KB > 24 addi k0, k0, 16 #endif vmaddfp vC00, vA0, vB0, vC00 #if KB > 24 addi k1, k1, 
16 #endif vmaddfp vC10, vA1, vB0, vC10 #if KB > 24 addi k2, k2, 16 #endif vmaddfp vC20, vA2, vB0, vC20 #if KB > 24 addi k3, k3, 16 #endif vmaddfp vC30, vA3, vB0, vC30 #if KB > 24 lvx vB0, pB0, k0 #endif vmaddfp vC01, vA0, vB1, vC01 vmaddfp vC11, vA1, vB1, vC11 vmaddfp vC21, vA2, vB1, vC21 vmaddfp vC31, vA3, vB1, vC31 #if KB > 24 lvx vB1, pB0, k1 #endif vmaddfp vC02, vA0, vB2, vC02 vmaddfp vC12, vA1, vB2, vC12 vmaddfp vC22, vA2, vB2, vC22 vmaddfp vC32, vA3, vB2, vC32 #if KB > 24 lvx vB2, pB0, k2 #endif vmaddfp vC03, vA0, vB3, vC03 #if KB > 24 lvx vA0, pA0, k0 #endif vmaddfp vC13, vA1, vB3, vC13 #if KB > 24 lvx vA1, pA0, k1 #endif vmaddfp vC23, vA2, vB3, vC23 #if KB > 24 lvx vA2, pA0, k2 #endif vmaddfp vC33, vA3, vB3, vC33 #if KB > 24 lvx vA3, pA0, k3 #endif #if KB > 24
/* (removed: non-code keyboard-shortcut help text injected by the code-viewer web page) */