atl_dmm1x6x72_sse2.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,724 行 · 第 1/3 页
C
1,724 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#ifdef ATL_OS_SunOS #define ATL_DIV_NUM NB #define ATL_DIV_DEN 6#endif#include "atlas_asm.h"#ifndef ATL_GAS_x8632 #error "This kernel requires x86-32 assembly!"#endif#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if (KB != 72) #error "KB must be 72!"#endif#if NB != KB #error "NB must equal KB!"#endif#if (NB/6)*6 != NB #error "NB must be evenly divisable by 6!"#endif#if !defined(MB) #define MB 0#endif/* * Integer register usage shown be these defines */#define pC %esi#define pA %ecx#define pB %edi#define pfA %eax#define stM %bl#define stN %bh#define ldab %edx#define pA3 %ebp#define pA0 pA#define pB0 pB#define ldc ldab#define rC0 %xmm0#define rC1 %xmm1#define rC2 %xmm2#define rC3 %xmm3#define rC4 %xmm4#define rC5 %xmm5#define rA0 %xmm6#define rB0 %xmm7#define NBso (KB*8)#define NBNBso (KB*KB*8)#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so (NB6so+NBso)#define NB8so (NB6so+NB2so)#define NB9so (NB6so+NB3so)#define NB10so (NB6so+NB4so)#define NB11so (NB6so+NB5so)/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetcht1 mem#define prefC(mem) prefetcht0 mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif/*offsets 4 8 12 16 *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, *offsets 24 28 32 36 * const TYPE *A, const int lda, const TYPE *B, const int ldb, * offsets 40 48 52 * const TYPE beta, TYPE *C, const int ldc) */ .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* * Save callee-saved iregs; Save old stack pointer in eax, * so we can adjust for BETA alignment */ movl %esp, %eax #ifdef BETAX subl $44, %esp shr $4, %esp shl $4, %esp movl %ebp, 32(%esp) movl %ebx, 28(%esp) movl %esi, 24(%esp) movl %edi, 20(%esp) movl %eax, 16(%esp) movlpd 40(%eax), rC0 unpcklpd rC0, rC0 movapd rC0, (%esp) #define COFF 36 #define BETAOFF 0 #else subl $24, %esp movl %ebp, 12(%esp) movl %ebx, 8(%esp) movl %esi, 4(%esp) movl %edi, (%esp) #define COFF 16 #endif/* * Initialize pA = A; pB = B; pC = C; */ movl 32(%eax), pA movl 24(%eax), pB movl 48(%eax), pC movb 4(%eax), stN addl $120, pA addl $120, pB/* * Set ldc = ldc*sizeof */ movl 52(%eax), ldc #ifdef DCPLX shl $4, ldc #else shl $3, ldc #endif movl ldc, COFF(%esp) movl pA0, pA3 addl $NB3so, pA3 movl $NBNBso-120, pfA addl pA0, pfA movl pC, COFF+4(%esp)NLOOP: #ifdef ATL_DivAns mov $ATL_DivAns-1, stM #else mov $NB/6-1, stM #endifMLOOP:#ifdef BETA0 xorpd rC0, rC0 xorpd rC1, rC1 xorpd rC2, rC2 xorpd rC3, rC3 xorpd rC4, rC4 xorpd rC5, rC5#else movsd (pC), rC0 movsd (pC,ldc), rC1 movsd (pC,ldc,2), rC2 movsd (pC,ldc,4), rC4 addl ldc, pC movsd (pC,ldc,2), rC3 movsd (pC,ldc,4), rC5 subl ldc, pC #ifdef BETAX movlpd (%esp), rA0 mulsd rA0, rC0 mulsd rA0, rC1 mulsd rA0, rC2 mulsd rA0, rC3 mulsd rA0, rC4 mulsd rA0, rC5 #endif#endif movl $NBso, ldab movapd 0-120(pB0), rB0 movapd 0-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 0-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 0-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 0-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 0-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 0-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 16-120(pB0), rB0 movapd 16-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 16-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 16-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 16-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 16-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 16-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 32-120(pB0), rB0 movapd 32-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 32-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 32-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 32-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 32-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 32-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 48-120(pB0), rB0 movapd 48-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 48-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 48-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 48-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 48-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 48-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 64-120(pB0), rB0 movapd 64-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 64-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 64-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 64-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 64-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 64-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 80-120(pB0), rB0 movapd 80-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 80-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 80-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 80-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 80-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 80-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 96-120(pB0), rB0 movapd 96-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 96-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 96-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 96-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 96-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 96-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 112-120(pB0), rB0 movapd 112-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 112-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 112-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 112-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 112-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 112-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 128-120(pB0), rB0 movapd 128-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 128-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 128-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 128-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 128-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 128-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 144-120(pB0), rB0 movapd 144-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 144-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 144-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 144-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 144-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 144-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 160-120(pB0), rB0 movapd 160-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 160-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 160-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 160-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 160-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 160-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 176-120(pB0), rB0 movapd 176-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 176-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 176-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 176-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 176-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 176-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 192-120(pB0), rB0 movapd 192-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 192-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 192-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 192-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 192-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 192-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 208-120(pB0), rB0 movapd 208-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 208-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 208-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 208-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 208-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 208-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 224-120(pB0), rB0 movapd 224-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 224-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 224-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 224-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 224-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 224-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 240-120(pB0), rB0 movapd 240-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 240-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 240-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 240-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 240-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 240-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 256-120(pB0), rB0 movapd 256-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 256-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 256-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 256-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 256-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 256-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 272-120(pB0), rB0 movapd 272-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 272-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 272-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 272-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 272-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 272-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 288-120(pB0), rB0 movapd 288-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 288-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 288-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 288-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 288-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 288-120(pA3,ldab,2), rB0 addpd rB0, rC5 movapd 304-120(pB0), rB0 movapd 304-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 304-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 304-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 304-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 304-120(pA3,ldab), rA0
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?