atl_smm14x1x84_ssecu.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,730 行 · 第 1/5 页
C
2,730 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#ifndef ATL_GAS_x8664 #error "This kernel requires x86-64 assembly!"#endif#ifdef ATL_OS_SunOS #define ATL_DIV_NUM MB #define ATL_DIV_DEN 14#endif#include "atlas_asm.h"#ifndef NB #define NB 0#endif#ifndef MB #define MB 0#endif#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if (MB/14)*14 != MB #error "MB must be multiple of 14!"#endif#ifdef SREAL #define CMUL(arg_) arg_#else #define CMUL(arg_) 2*arg_#endif/* * Integer register usage shown be these defines */#define pA %rcx#define pA10 %rbx#define ldab %rbp#define mldab %rdx#define mldab5 %rax#define pB %rdi#define pC %rsi#define incCn %r10#define stM %r9#define stN %r11#define pfA %r8#define pA5 pA#define pB0 pB#if MB == 0 #define stM0 %r12 #define incAm %r13#endif/* rax used in 32/64 conversion */#define NBso (KB*4)#define MBKBso (MB*KB*4)#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NB3so+NB3so)#define NB7so (NB3so+NB4so)#define NB8so (NB4so+NB4so)#define NB9so (NB4so+NB5so)#define NB10so (NB5so+NB5so)#define NB11so (NB6so+NB5so)#define NB12so (NB7so+NB5so)#define NB13so (NB8so+NB5so)#define NB14so (NB9so+NB5so)/* * SSE2 register usage shown be these defines */#define rA0 %xmm0#define rB0 %xmm1#define rC0 %xmm2#define rC1 %xmm3#define rC2 %xmm4#define rC3 %xmm5#define rC4 %xmm6#define rC5 %xmm7#define rC6 %xmm8#define rC7 %xmm9#define rC8 %xmm10#define rC9 %xmm11#define rC10 %xmm12#define rC11 %xmm13#define rC12 %xmm14#define rC13 %xmm15/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetcht0 mem#define prefC(mem) prefetchw mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif#if MB != 0 #define incAm $MBKBso-NB14so+176#endif .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* * Save callee-saved iregs */ movq %rbp, -8(%rsp) movq %rbx, -16(%rsp)#if MB == 0 movq %r12, -32(%rsp) movq %r13, -40(%rsp)#endif#ifdef BETAX 
#define BOF -56 movss %xmm1, BOF(%rsp) movss %xmm1, BOF+4(%rsp) movss %xmm1, BOF+8(%rsp) movss %xmm1, BOF+12(%rsp)#endif/* * pA already comes in right reg * Initialize pB = B; pC = C; NBso = NB * sizeof; */ movq %rsi, stN movq %rdi, %rax movq 16(%rsp), pC prefC((pC)) prefC(64(pC)) movq %r9, pB prefB((pB)) prefB(64(pB)) movq %rax, stM/* * stM = pA + NBNBso; stN = pB + NBNBso; */#if MB == 0 movq stM, pfA imulq $NBso, pfA prefB(128(pB)) movq pfA, incAm addq pA5, pfA addq $176-NB14so, incAm#else movq $MBKBso, pfA addq pA5, pfA prefB(128(pB))#endif/* * convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof */ movl 24(%rsp), %eax cltq movq %rax, incCn subq stM, incCn addq $14, incCn#ifdef SREAL shl $2, incCn#else shl $3, incCn prefC(128(pC)) prefC(192(pC))#endif/* * Find M/14 if MB is not set */#if MB == 0 cmp $84, stM jne MB_LT84/* movq $84/14, stM */ movq $6, stMMBFOUND: subq $1, stM movq stM, stM0#endif addq $120, pA5 addq $120, pB0 movq $KB*4, ldab movq $-KB*5*4, mldab5 movq $-KB*4, mldab subq mldab5, pA5 lea KB*4(pA5, ldab,4), pA10/* movq $NB, stN */UNLOOP:#if MB == 0 movq stM0, stM cmp $0, stM je MLAST#else #ifdef ATL_DivAns movq $ATL_DivAns-1, stM #else movq $MB/14-1, stM #endif#endif#if MB == 0 || MB > 14UMLOOP:/* * rC[0-13] = pC[0-13] * beta */ ALIGN16/*UKLOOP: */#ifdef BETA1 movaps 0-120(pA10,mldab5,2), rC0 movaps 0-120(pB0), rB0 mulps rB0, rC0 addss (pC), rC0 movaps 0-120(pA5, mldab,4), rC1 mulps rB0, rC1 addss CMUL(4)(pC), rC1 movaps 0-120(pA10, mldab,8), rC2 mulps rB0, rC2 addss CMUL(8)(pC), rC2 movaps 0-120(pA5, mldab,2), rC3 mulps rB0, rC3 addss CMUL(12)(pC), rC3 movaps 0-120(pA5, mldab), rC4 mulps rB0, rC4 addss CMUL(16)(pC), rC4 movaps 0-120(pA5), rC5 mulps rB0, rC5 addss CMUL(20)(pC), rC5 movaps 0-120(pA5, ldab), rC6 mulps rB0, rC6 addss CMUL(24)(pC), rC6 movaps 0-120(pA5, ldab,2), rC7 mulps rB0, rC7 addss CMUL(28)(pC), rC7 movaps 0-120(pA10, mldab,2), rC8 mulps rB0, rC8 addss CMUL(32)(pC), rC8 movaps 0-120(pA5,ldab,4), rC9 mulps rB0, rC9 addss 
CMUL(36)(pC), rC9 movaps 0-120(pA10), rC10 mulps rB0, rC10 addss CMUL(40)(pC), rC10 movaps 0-120(pA10,ldab), rC11 mulps rB0, rC11 addss CMUL(44)(pC), rC11 movaps 0-120(pA10,ldab,2), rC12 mulps rB0, rC12 addss CMUL(48)(pC), rC12 movaps 0-120(pA5,ldab,8), rC13 mulps rB0, rC13 addss CMUL(52)(pC), rC13#else movaps 0-120(pA10,mldab5,2), rC0 movaps 0-120(pB0), rC13 mulps rC13, rC0 movaps 0-120(pA5, mldab,4), rC1 mulps rC13, rC1 movaps 0-120(pA10, mldab,8), rC2 mulps rC13, rC2 movaps 0-120(pA5, mldab,2), rC3 mulps rC13, rC3 movaps 0-120(pA5, mldab), rC4 mulps rC13, rC4 movaps 0-120(pA5), rC5 mulps rC13, rC5 movaps 0-120(pA5, ldab), rC6 mulps rC13, rC6 movaps 0-120(pA5, ldab,2), rC7 mulps rC13, rC7 movaps 0-120(pA10, mldab,2), rC8 mulps rC13, rC8 movaps 0-120(pA5,ldab,4), rC9 mulps rC13, rC9 movaps 0-120(pA10), rC10 mulps rC13, rC10 movaps 0-120(pA10,ldab), rC11 mulps rC13, rC11 movaps 0-120(pA10,ldab,2), rC12 mulps rC13, rC12 mulps 0-120(pA5,ldab,8), rC13#endif#if KB > 4 movaps 16-120(pA10,mldab5,2), rA0 movaps 16-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 16-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 16-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 16-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 16-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 16-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 16-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 16-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 16-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 16-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 16-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 16-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 16-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 16-120(pA5,ldab,8), rB0 addps rB0, rC13#endif#if KB > 8 movaps 32-120(pA10,mldab5,2), rA0 movaps 32-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 32-120(pA5, mldab,4), rA0 mulps rB0, rA0 
addps rA0, rC1 movaps 32-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 32-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 32-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 32-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 32-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 32-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 32-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 32-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 32-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 32-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 32-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 32-120(pA5,ldab,8), rB0 addps rB0, rC13#endif#if KB > 12 movaps 48-120(pA10,mldab5,2), rA0 movaps 48-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 48-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 48-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 48-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 48-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 48-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 48-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 48-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 48-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 48-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 48-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 48-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 48-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 48-120(pA5,ldab,8), rB0 addps rB0, rC13#endif#if KB > 16 movaps 64-120(pA10,mldab5,2), rA0 movaps 64-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 64-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 64-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 64-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 64-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 64-120(pA5), rA0 mulps 
rB0, rA0 addps rA0, rC5 movaps 64-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 64-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 64-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 64-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 64-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 64-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 64-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 64-120(pA5,ldab,8), rB0 addps rB0, rC13#endif#if KB > 20 movaps 80-120(pA10,mldab5,2), rA0 movaps 80-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 80-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 80-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 80-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 80-120(pA5, mldab), rA0 mulps rB0, rA0 addps rA0, rC4 movaps 80-120(pA5), rA0 mulps rB0, rA0 addps rA0, rC5 movaps 80-120(pA5, ldab), rA0 mulps rB0, rA0 addps rA0, rC6 movaps 80-120(pA5, ldab,2), rA0 mulps rB0, rA0 addps rA0, rC7 movaps 80-120(pA10, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC8 movaps 80-120(pA5,ldab,4), rA0 mulps rB0, rA0 addps rA0, rC9 movaps 80-120(pA10), rA0 mulps rB0, rA0 addps rA0, rC10 movaps 80-120(pA10,ldab), rA0 mulps rB0, rA0 addps rA0, rC11 movaps 80-120(pA10,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC12 mulps 80-120(pA5,ldab,8), rB0 addps rB0, rC13#endif#if KB > 24 movaps 96-120(pA10,mldab5,2), rA0 movaps 96-120(pB0), rB0 mulps rB0, rA0 addps rA0, rC0 movaps 96-120(pA5, mldab,4), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 96-120(pA10, mldab,8), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 96-120(pA5, mldab,2), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 96-120(pA5, mldab), rA0
⌨️ 快捷键说明
复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?