atl_dmm6x1x72_sse2_k.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,063 行 · 第 1/2 页
C
1,063 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"#ifndef ATL_GAS_x8632 #error "This kernel requires gas x86-32 assembler!"#endif#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if !defined(NB) #define NB 0#endif#if !defined(MB) #define MB 0#endif#if (MB/6)*6 != MB #error "MB must be multiple of 6!"#endif/* * Integer register usage shown be these defines */#define pC %esi#define pA %ecx#define pB %edi#define incCn %eax#define stM %bl#define stN %bh#define ldab %edx#define pA3 %ebp#define pA0 pA#define pB0 pB#define pfA incCn#define rC0 %xmm0#define rC1 %xmm1#define rC2 %xmm2#define rC3 %xmm3#define rC4 %xmm4#define rC5 %xmm5#define rA0 %xmm6#define rB0 %xmm7#define NBso (KB*8)#if MB != 0 #define MBKBso (MB*KB*8)#endif#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so (NB6so+NBso)#define NB8so (NB6so+NB2so)#define NB9so (NB6so+NB3so)#define NB10so (NB6so+NB4so)#define NB11so (NB6so+NB5so)/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetcht1 mem#define prefC(mem) prefetcht0 mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif/*offset 4 8 12 16 *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, *offset 24 28 32 36 * const TYPE *A, const int lda, const TYPE *B, const int ldb, *offset 40 48 52 * const TYPE beta, TYPE *C, const int ldc) */ .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* * Save callee-saved iregs; Save old stack pointer in eax, * so we can adjust for BETA alignment */ movl %esp, %eax #ifdef BETAX subl $48, %esp shr $4, %esp shl $4, %esp movl %ebp, 32(%esp) movl %ebx, 28(%esp) movl %esi, 24(%esp) movl %edi, 20(%esp) movl %eax, 16(%esp) movlpd 40(%eax), rC0 unpcklpd rC0, rC0 movapd rC0, (%esp) #define COFF 36 #define BETAOFF 0 #else subl $28, %esp movl %ebp, 12(%esp) movl %ebx, 8(%esp) movl %esi, 4(%esp) movl %edi, (%esp) #define COFF 16 #endif/* * Initialize pA = A; pB = B; pC = C; */#if MB == 0 movl 4(%eax), %ebx movl %ebx, COFF+4(%esp) imul $NBso, %ebx movl %ebx, COFF+8(%esp)#endif movl 24(%eax), pA movl 32(%eax), pB movl 48(%eax), pC#if NB == 0 movb 8(%eax), stN#else movb $NB, stN#endif addl $120, pA addl $120, pB/* * Set incCn = (ldc - NB)*sizeof */ movl 52(%eax), incCn#if MB == 0 subl COFF+4(%esp), incCn#else subl $MB, incCn#endif #ifdef DCPLX shl $4, incCn #else shl $3, incCn #endif movl incCn, COFF(%esp) movl $NBso, ldab movl pA0, pA3 addl $NB3so, pA3 movl pA0, pfA#if MB == 0 subl $120, pfA addl COFF+8(%esp), pfA#else addl $MBKBso-120, pfA#endifUNLOOP:#if MB == 0 movb COFF+4(%esp), stM#else movb $MB, stM#endifUMLOOP:#ifdef BETA0 xorpd rC0, rC0 xorpd rC1, rC1 xorpd rC2, rC2 xorpd rC3, rC3 xorpd rC4, rC4 xorpd rC5, rC5#else #ifdef DCPLX movsd (pC), rC0 movsd 16(pC), rC1 movsd 32(pC), rC2 movsd 48(pC), rC3 movsd 64(pC), rC4 movsd 80(pC), rC5 #else movsd (pC), rC0 movsd 8(pC), rC1 movsd 16(pC), rC2 movsd 24(pC), rC3 movsd 32(pC), rC4 movsd 40(pC), rC5 #endif #ifdef BETAX movlpd (%esp), rA0 mulsd rA0, rC0 mulsd rA0, rC1 mulsd rA0, rC2 mulsd rA0, rC3 mulsd rA0, rC4 mulsd rA0, rC5 #endif#endif movapd 0-120(pB0), rB0 movapd 0-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 0-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 0-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 0-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 0-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 0-120(pA3,ldab,2), rB0 addpd rB0, rC5#if KB > 2 movapd 16-120(pB0), rB0 movapd 16-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 16-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 16-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 16-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 16-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 16-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 4 movapd 32-120(pB0), rB0 movapd 32-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 32-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 32-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 32-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 32-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 32-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 6 movapd 48-120(pB0), rB0 movapd 48-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 48-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 48-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 48-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 48-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 48-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 8 movapd 64-120(pB0), rB0 movapd 64-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 64-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 64-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 64-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 64-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 64-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 10 movapd 80-120(pB0), rB0 movapd 80-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 80-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 80-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 80-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 80-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 80-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 12 movapd 96-120(pB0), rB0 movapd 96-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 96-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 96-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 96-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 96-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 96-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 14 movapd 112-120(pB0), rB0 movapd 112-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 112-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 112-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 112-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 112-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 112-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 16 movapd 128-120(pB0), rB0 movapd 128-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 128-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 128-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 128-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 128-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 128-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 18 movapd 144-120(pB0), rB0 movapd 144-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 144-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 144-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 144-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 144-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 144-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 20 movapd 160-120(pB0), rB0 movapd 160-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 160-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 160-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 160-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 160-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 160-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 22 movapd 176-120(pB0), rB0 movapd 176-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 176-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 176-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 176-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 176-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 176-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 24 movapd 192-120(pB0), rB0 movapd 192-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 192-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 192-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 192-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 192-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 192-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 26 movapd 208-120(pB0), rB0 movapd 208-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 208-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 208-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 208-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 208-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 208-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif#if KB > 28 movapd 224-120(pB0), rB0 movapd 224-120(pA0), rA0 mulpd rB0, rA0 addpd rA0, rC0 movapd 224-120(pA0,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 224-120(pA0,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 224-120(pA3), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 224-120(pA3,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 mulpd 224-120(pA3,ldab,2), rB0 addpd rB0, rC5#endif
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?