atl_smm14x1x84_sse.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,566 行 · 第 1/5 页
C
2,566 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#ifndef ATL_GAS_x8664 #error "This kernel requires x86-64 assembly!"#endif#ifdef ATL_OS_SunOS #define ATL_DIV_NUM MB #define ATL_DIV_DEN 14#endif#include "atlas_asm.h"/* Compile-time checks: this kernel is only valid for MB = NB = KB = 84 */#if !defined(NB) || (NB == 0) #error "NB must be a compile-time constant!"#endif#if NB != MB || NB != KB #error "For this kernel, MB = NB = KB required!"#endif#if (NB != 84) #error "NB must be 84!"#endif#if (NB/14)*14 != NB #error "NB must be multiple of 14!"#endif/* CMUL(x) expands to x when SREAL is defined and to 2*x otherwise */#ifdef SREAL #define CMUL(arg_) arg_#else #define CMUL(arg_) 2*arg_#endif/* * Integer register roles are given by these defines. * pA5 and pB0 are aliases for pA and pB. */#define pA %rcx#define pA10 %rbx#define ldab %rbp#define mldab %rdx#define mldab5 %rax#define pB %rdi#define pC %rsi#define incCn %r10#define stM %r9#define stN %r11#define pfA %r8#define pA5 pA#define pB0 pB/* rax (later reused as mldab5) is the scratch register for the 32/64 conversion of ldc *//* NBso = NB*4; NB<k>so = k*NBso, built up by repeated addition */#define NBso (NB*4)#define NBNBso (NB*NB*4)#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NB3so+NB3so)#define NB7so (NB3so+NB4so)#define NB8so (NB4so+NB4so)#define NB9so (NB4so+NB5so)#define NB10so (NB5so+NB5so)#define NB11so (NB6so+NB5so)#define NB12so (NB7so+NB5so)#define NB13so (NB8so+NB5so)#define NB14so (NB9so+NB5so)/* * XMM register roles are given by these defines: * rA0 and rB0 receive loaded operands, rC0-rC13 are the 14 accumulators */#define rA0 %xmm0#define rB0 %xmm1#define rC0 %xmm2#define rC1 %xmm3#define rC2 %xmm4#define rC3 %xmm5#define rC4 %xmm6#define rC5 %xmm7#define rC6 %xmm8#define rC7 %xmm9#define rC8 %xmm10#define rC9 %xmm11#define rC10 %xmm12#define rC11 %xmm13#define rC12 %xmm14#define rC13 %xmm15/* * Prefetch defines: pref2 = prefetcht1, prefB = prefetcht0, prefC = prefetchw. * The #else branch defines all three as empty. */#if 1#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetcht0 mem#define prefC(mem) prefetchw mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif .textALIGN4.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* * Save callee-saved iregs rbp and rbx in the slots at -8 and -16 below %rsp * (no adjustment of %rsp is made here) */ movq %rbp, -8(%rsp) movq %rbx, -16(%rsp)/* movq %r12, -32(%rsp) *//* movq %r13, -40(%rsp) *//* BETAX builds: store the low float of %xmm1 at BOF, BOF+4, BOF+8 and BOF+12 from %rsp */#ifdef BETAX #define BOF -56 movss %xmm1, BOF(%rsp) movss 
%xmm1, BOF+4(%rsp) movss %xmm1, BOF+8(%rsp) movss %xmm1, BOF+12(%rsp)#endif/* * pA already comes in the right reg (%rcx). * Load pC from 16(%rsp) and pB from %r9, issuing prefetches at offsets 0 and 64 of each. */ movq 16(%rsp), pC prefC((pC)) prefC(64(pC)) movq %r9, pB prefB((pB)) prefB(64(pB))/* * pfA = pA + NBNBso */ movq $NBNBso, pfA addq pA5, pfA prefB(128(pB))/* * Sign-extend the 32-bit ldc at 24(%rsp) to 64 bits, then set * incCn = (ldc - (MB-14)) << 2 when SREAL is defined, << 3 otherwise. * SREAL builds also copy the unscaled ldc into stM. */ movl 24(%rsp), %eax cltq movq %rax, incCn#ifdef SREAL movq %rax, stM subq $MB-14, incCn shl $2, incCn#else subq $MB-14, incCn shl $3, incCn prefC(128(pC)) prefC(192(pC))#endif/* * Bias pA and pB by 120 bytes (the loads below all carry a -120 displacement term), * then set ldab = KB*4, mldab = -KB*4, mldab5 = -5*KB*4, * pA5 += 5*KB*4, pA10 = pA5 + 5*KB*4 and stN = NB. */ addq $120, pA5 addq $120, pB0 movq $KB*4, ldab movq $-KB*5*4, mldab5 movq $-KB*4, mldab subq mldab5, pA5 lea KB*4(pA5, ldab,4), pA10 movq $NB, stN/* NOTE(review): both jnz branches below target UNLOOP, which is also the fall-through label */#ifdef SREAL test $1, stM jnz UNLOOP test $15, pC jnz UNLOOP#endifUNLOOP:/* stM = ATL_DivAns-1 if ATL_DivAns is defined, else MB/14-1; presumably the M-loop counter - TODO confirm */ #ifdef ATL_DivAns movq $ATL_DivAns-1, stM #else movq $MB/14-1, stM #endifUMLOOP:/* * K step at byte offset 0: rC[i] = A[row i] * B for i = 0..13, where, given the * register values set above, the 14 A addresses are spaced ldab bytes apart. * B is loaded once into rC13, which is multiplied in place last. * NOTE(review): no load from pC or multiply by beta happens in this step, * although the comment here used to read: rC[0-13] = pC[0-13] * beta */ prefC((pC)) ALIGN16/*UKLOOP: */ movaps 0-120(pA10,mldab5,2), rC0 movaps 0-120(pB0), rC13 mulps rC13, rC0 movaps 0-120(pA5, mldab,4), rC1 mulps rC13, rC1 movaps 0-120(pA10, mldab,8), rC2 mulps rC13, rC2 movaps 0-120(pA5, mldab,2), rC3 mulps rC13, rC3 movaps 0-120(pA5, mldab), rC4 mulps rC13, rC4 movaps 0-120(pA5), rC5 mulps rC13, rC5 movaps 0-120(pA5, ldab), rC6 mulps rC13, rC6 movaps 0-120(pA5, ldab,2), rC7 mulps rC13, rC7 movaps 0-120(pA10, mldab,2), rC8 mulps rC13, rC8 movaps 0-120(pA5,ldab,4), rC9 mulps rC13, rC9 movaps 0-120(pA10), rC10 mulps rC13, rC10 movaps 0-120(pA10,ldab), rC11 mulps rC13, rC11 movaps 0-120(pA10,ldab,2), rC12 mulps rC13, rC12 mulps 0-120(pA5,ldab,8), rC13/* K step at byte offset 16: rC[i] += A[row i] * B, with rA0 and rB0 as temporaries */ movaps 16-120(pA10,mldab5,2), rA0 movaps 16-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 16-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 16-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 16-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 16-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 16-120(pA5), rA0 mulps rB0, rA0 addps 
rA0, rC5 movaps 16-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 16-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 16-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 16-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 16-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 16-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 16-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 16-120(pA5,ldab,8), rB0 addps rB0, rC13 movaps 32-120(pA10,mldab5,2), rA0 movaps 32-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 32-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 32-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 32-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 32-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 32-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 32-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 32-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 32-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 32-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 32-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 32-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 32-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 32-120(pA5,ldab,8), rB0 addps rB0, rC13 movaps 48-120(pA10,mldab5,2), rA0 movaps 48-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 48-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 48-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 48-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 48-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 48-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 48-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 48-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 48-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 48-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 
48-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 48-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 48-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 48-120(pA5,ldab,8), rB0 addps rB0, rC13 movaps 64-120(pA10,mldab5,2), rA0 movaps 64-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 64-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 64-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 64-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 64-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 64-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 64-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 64-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 64-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 64-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 64-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 64-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 64-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 64-120(pA5,ldab,8), rB0 addps rB0, rC13 movaps 80-120(pA10,mldab5,2), rA0 movaps 80-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 80-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 80-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 80-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 80-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 80-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 80-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 80-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 80-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 80-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 80-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 80-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 80-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 80-120(pA5,ldab,8), rB0 addps rB0, rC13 movaps 96-120(pA10,mldab5,2), rA0 movaps 
96-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 96-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 96-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 96-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 96-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 96-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 96-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 96-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 96-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 96-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 96-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 96-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 96-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 96-120(pA5,ldab,8), rB0 addps rB0, rC13 movaps 112-120(pA10,mldab5,2), rA0 movaps 112-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 112-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 112-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 112-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 112-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 112-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 112-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 112-120(pA5, ldab,2), rA0 mulps rB0, rA0
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?