atl_smm6x1x80_sse.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,344 行 · 第 1/2 页
C
1,344 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2004 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"#if !defined(ATL_GAS_x8632) && !defined(ATL_GAS_x8664) #error "This kernel requires x86 gas 32 or 64 bit assembler!"#endif#ifdef SCPLX #define CMUL(i_) ((i_)+(i_))#else #define CMUL(i_) i_#endif#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if KB > 80 #error "Max KB is 80!"#endif#if !defined(NB) #define NB 0#endif#if !defined(MB) #define MB 0#endif#if (MB/6)*6 != MB #error "MB must be multiple of 6!"#endif#if MB <= 6 #define PFAINC 64#else #define PFAINC ((MB*4+MB/6-2)/(MB/6-1))#endif#if PFAINC < -6400 #undef PFAINC #define PFAINC 64#endif#ifdef ATL_GAS_x8664 #define movL movl #define movl movq #define subl subq #define addl addq #define movb movq #define subb subq#endif/* * Integer register usage shown be these defines */#ifdef ATL_GAS_x8664 #define pC %r10 #define pA %rcx #define pB %r9 #define incCn %rax #define stM %rdi #define stN %rsi #define ldab %r8 #define pA3 %rdx #define pfA %r11 #define M_m %rbx #define incAn_m %rbp #define incCn_m incCn#else #define pC %esi #define pA %ecx #define pB %edi #define incCn %eax #define stM %bl #define stN %bh #define ldab %edx #define pA3 %ebp #ifdef BETAX #define COFF 36 #else #define COFF 16 #endif #define M_m COFF(%esp) #define incAn_m COFF+4(%esp) #define incCn_m COFF+8(%esp) #define pfA incCn#endif#define pA0 pA#define pB0 pB#define rC0 %xmm0#define rC1 %xmm1#define rC2 %xmm2#define rC3 %xmm3#define rC4 %xmm4#define rC5 %xmm5#define rA0 %xmm6#define rB0 %xmm7#ifdef ATL_GAS_x8664 #define rbeta %xmm8#else #define rbeta rA0#endif#define NBso (KB*4)#if MB != 0 #define MBKBso (MB*KB*4)#endif#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so (NB6so+NBso)#define NB8so (NB6so+NB2so)#define NB9so (NB6so+NB3so)#define NB10so (NB6so+NB4so)#define NB11so (NB6so+NB5so)/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetchnta mem#define prefC(mem) prefetcht0 mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif/*offset rdi/4 rsi/8 rdx/12 xmm0/16 *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, *offset rcx/ 20 r8/24 r9/28 8/ 32 * const TYPE *A, const int lda, const TYPE *B, const int ldb, *offset xmm1/36 16/40 24/44 * const TYPE beta, TYPE *C, const int ldc) */ .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):#ifdef ATL_GAS_x8664 movq %rbx, -8(%rsp) movq %rbp, -16(%rsp) #ifdef BETAX movapd %xmm1, rbeta #endif movq 16(%rsp), pC mov 24(%rsp), %eax /* incCn = ldc */ cltq #if MB == 0 movq stM, M_m #endif movq stM, incAn_m subq $6, incAn_m imul $NBso, incAn_m#else/* * Save callee-saved iregs; Save old stack pointer in eax, * so we can adjust for BETA alignment */ movl %esp, %eax #ifdef BETAX subl $48, %esp shr $4, %esp shl $4, %esp movl %ebp, 32(%esp) movl %ebx, 28(%esp) movl %esi, 24(%esp) movl %edi, 20(%esp) movl %eax, 16(%esp) movss 36(%eax), rC0 movaps rC0, (%esp) #define BETAOFF 0 #else subl $28, %esp movl %ebp, 12(%esp) movl %ebx, 8(%esp) movl %esi, 4(%esp) movl %edi, (%esp) #endif/* * Initialize pA = A; pB = B; pC = C; */ #if MB == 0 movl 4(%eax), %ebx movl %ebx, M_m imul $NBso, %ebx subl $NB6so, %ebx movl %ebx, incAn_m #endif movl 20(%eax), pA movl 28(%eax), pB movl 40(%eax), pC #if NB == 0 movb 8(%eax), stN #else movb $NB, stN #endif movl 44(%eax), incCn#endif addl $120, pA addl $120, pB/* * Set incCn = (ldc - NB)*sizeof */#if MB == 0 subl M_m, incCn addl $6, incCn#else subl $MB-6, incCn#endif #ifdef SCPLX shl $3, incCn #else shl $2, incCn #endif #ifndef ATL_GAS_x8664 movl incCn, incCn_m #endif movl $NBso, ldab movl pA0, pA3 addl $NB3so, pA3 movl pA0, pfA#if MB == 0 addl $NB6so-120, pfA addl incAn_m, pfA#else addl $MBKBso-120, pfA#endifUNLOOP:#if MB == 0 movb M_m, stM subb $6, stM jz ULMLOOP#else movb $MB-6, stM#endif#if MB != 6 ALIGN16UMLOOP:#ifdef BETA0 xorps rC0, rC0 xorps rC1, rC1 xorps rC2, rC2 xorps rC3, rC3 xorps rC4, rC4 xorps rC5, rC5#else movss (pC), rC0 movss CMUL(4)(pC), rC1 movss CMUL(8)(pC), rC2 movss CMUL(12)(pC), rC3 movss CMUL(16)(pC), rC4 movss CMUL(20)(pC), rC5 #ifdef BETAX #ifndef ATL_GAS_x8664 movss (%esp), rbeta #endif mulss rbeta, rC0 mulss rbeta, rC1 mulss rbeta, rC2 mulss rbeta, rC3 mulss rbeta, rC4 mulss rbeta, rC5 #endif#endif/* * Completely unrolled K-loop */ ALIGN16 movaps 0-120(pB0), rB0 movaps 0-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 0-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 0-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 0-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 0-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 0-120(pA3,ldab,2), rB0 addps rB0, rC5#if KB > 4 movaps 16-120(pB0), rB0 movaps 16-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 16-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 16-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 16-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 16-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 16-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 8 movaps 32-120(pB0), rB0 movaps 32-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 32-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 32-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 32-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 32-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 32-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 12 movaps 48-120(pB0), rB0 movaps 48-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 48-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 48-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 48-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 48-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 48-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 16 movaps 64-120(pB0), rB0 movaps 64-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 64-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 64-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 64-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 64-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 64-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 20 movaps 80-120(pB0), rB0 movaps 80-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 80-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 80-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 80-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 80-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 80-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 24 movaps 96-120(pB0), rB0 movaps 96-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 96-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 96-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 96-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 96-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 96-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 28 movaps 112-120(pB0), rB0 movaps 112-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 112-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 112-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 112-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 112-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 112-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 32 movaps 128-120(pB0), rB0 movaps 128-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 128-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 128-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 128-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 128-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 128-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 36 movaps 144-120(pB0), rB0 movaps 144-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 144-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 144-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 144-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 144-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 144-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 40 movaps 160-120(pB0), rB0 movaps 160-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 160-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 160-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 160-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 160-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 160-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 44 movaps 176-120(pB0), rB0 movaps 176-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 176-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 176-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 176-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 176-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 176-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 48 movaps 192-120(pB0), rB0 movaps 192-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 192-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 192-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 192-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 192-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 192-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 52 movaps 208-120(pB0), rB0 movaps 208-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 208-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 208-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 208-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 208-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 208-120(pA3,ldab,2), rB0 addps rB0, rC5#endif pref2((pfA)) addl $PFAINC, pfA#if KB > 56 movaps 224-120(pB0), rB0 movaps 224-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 224-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 224-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 224-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 224-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 224-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 60 movaps 240-120(pB0), rB0 movaps 240-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 240-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 240-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 240-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 240-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 240-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 64 movaps 256-120(pB0), rB0 movaps 256-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 256-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 256-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 256-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 256-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 256-120(pA3,ldab,2), rB0 addps rB0, rC5#endif#if KB > 68 movaps 272-120(pB0), rB0 movaps 272-120(pA0), rA0 mulps rB0, rA0 addps rA0, rC0 movaps 272-120(pA0,ldab), rA0 mulps rB0, rA0 addps rA0, rC1 movaps 272-120(pA0,ldab,2), rA0 mulps rB0, rA0 addps rA0, rC2 movaps 272-120(pA3), rA0 mulps rB0, rA0 addps rA0, rC3 movaps 272-120(pA3,ldab), rA0 mulps rB0, rA0 addps rA0, rC4 mulps 272-120(pA3,ldab,2), rB0 addps rB0, rC5
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?