atl_dmm14x1x56_sse2pabc_k.c
来自「基于 BLAS、CLAPACK 的,用过的人知道是干啥的」· C语言 代码 · 共 3,047 行 · 第 1/5 页
C
3,047 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2002 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#ifndef ATL_GAS_x8664 #error "This kernel requires x86-64 assembly!"#endif#ifdef ATL_OS_SunOS #define ATL_DIV_NUM MB #define ATL_DIV_DEN 14#endif#include "atlas_asm.h"#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if ((KB/2)*2 == KB)#if NB != MB #error "For this kernel, MB = NB required!"#endif#if (MB/14)*14 != MB #error "MB must be multiple of 14!"#endif#ifdef DREAL #define CMUL(arg_) arg_#else #define CMUL(arg_) 2*arg_#endif/* * Integer register usage shown be these defines */#define pA %rcx#define pA10 %rbx#define ldab %rbp#define mldab %rdx#define mldab5 %rax#define pB %rdi#define pC %rsi#define incCn %r10#define stM %r9#define stN %r11#define pfA %r8#define pA5 pA#define pB0 pB/* rax used in 32/64 conversion */#define NBso (KB*8)#define MBKBso (MB*KB*8)#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NB3so+NB3so)#define NB7so (NB3so+NB4so)#define NB8so (NB4so+NB4so)#define NB9so (NB4so+NB5so)#define NB10so (NB5so+NB5so)#define NB11so (NB6so+NB5so)#define NB12so (NB7so+NB5so)#define NB13so (NB8so+NB5so)#define NB14so (NB9so+NB5so)/* * SSE2 register usage shown be these defines */#define rA0 %xmm0#define rB0 %xmm1#define rC0 %xmm2#define rC1 %xmm3#define rC2 %xmm4#define rC3 %xmm5#define rC4 %xmm6#define rC5 %xmm7#define rC6 %xmm8#define rC7 %xmm9#define rC8 %xmm10#define rC9 %xmm11#define rC10 %xmm12#define rC11 %xmm13#define rC12 %xmm14#define rC13 %xmm15/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetcht0 mem#define prefC(mem) prefetchw mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* * Save callee-saved iregs */ movq %rbp, -8(%rsp) movq %rbx, -16(%rsp)/* movq %r12, -32(%rsp) *//* movq %r13, -40(%rsp) */#ifdef BETAX #define BOF -24 movlpd %xmm1, BOF(%rsp)#endif/* * pA already comes in right reg * 
Initialize pB = B; pC = C; */ movq 16(%rsp), pC prefC((pC)) prefC(64(pC)) movq %r9, pB prefB((pB)) prefB(64(pB))/* * setup prefetch ptr for next blk of A */ movq $MBKBso, pfA addq pA5, pfA prefB(128(pB))/* * convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof */ movl 24(%rsp), %eax cltq movq %rax, incCn#ifdef DREAL subq $MB-14, incCn shl $3, incCn#else subq $(MB-14), incCn shl $4, incCn prefC(128(pC)) prefC(192(pC))#endif addq $120, pA5 addq $120, pB0 movq $KB*8, ldab movq $-KB*5*8, mldab5 movq $-KB*8, mldab subq mldab5, pA5 lea KB*8(pA5, ldab,4), pA10 movq $NB, stNUNLOOP: #ifdef ATL_DivAns movq $ATL_DivAns-1, stM #else movq $MB/14-1, stM #endifUMLOOP:/* * rC[0-13] = pC[0-13] * beta */ ALIGN16/*UKLOOP: */#ifdef BETA1 movapd 0-120(pA10,mldab5,2), rC0 movapd 0-120(pB0), rB0 mulpd rB0, rC0 addsd (pC), rC0 movapd 0-120(pA5, mldab,4), rC1 mulpd rB0, rC1 addsd CMUL(8)(pC), rC1 movapd 0-120(pA10, mldab,8), rC2 mulpd rB0, rC2 addsd CMUL(16)(pC), rC2 movapd 0-120(pA5, mldab,2), rC3 mulpd rB0, rC3 addsd CMUL(24)(pC), rC3 movapd 0-120(pA5, mldab), rC4 mulpd rB0, rC4 addsd CMUL(32)(pC), rC4 movapd 0-120(pA5), rC5 mulpd rB0, rC5 addsd CMUL(40)(pC), rC5 movapd 0-120(pA5, ldab), rC6 mulpd rB0, rC6 addsd CMUL(48)(pC), rC6 movapd 0-120(pA5, ldab,2), rC7 mulpd rB0, rC7 addsd CMUL(56)(pC), rC7 movapd 0-120(pA10, mldab,2), rC8 mulpd rB0, rC8 addsd CMUL(64)(pC), rC8 movapd 0-120(pA5,ldab,4), rC9 mulpd rB0, rC9 addsd CMUL(72)(pC), rC9 movapd 0-120(pA10), rC10 mulpd rB0, rC10 addsd CMUL(80)(pC), rC10 movapd 0-120(pA10,ldab), rC11 mulpd rB0, rC11 addsd CMUL(88)(pC), rC11 movapd 0-120(pA10,ldab,2), rC12 mulpd rB0, rC12 addsd CMUL(96)(pC), rC12 movapd 0-120(pA5,ldab,8), rC13 mulpd rB0, rC13 addsd CMUL(104)(pC), rC13#elif defined(BETA0) movapd 0-120(pA10,mldab5,2), rC0 movapd 0-120(pB0), rC13 mulpd rC13, rC0 movapd 0-120(pA5, mldab,4), rC1 mulpd rC13, rC1 movapd 0-120(pA10, mldab,8), rC2 mulpd rC13, rC2 movapd 0-120(pA5, mldab,2), rC3 mulpd rC13, rC3 movapd 0-120(pA5, mldab), rC4 
mulpd rC13, rC4 movapd 0-120(pA5), rC5 mulpd rC13, rC5 movapd 0-120(pA5, ldab), rC6 mulpd rC13, rC6 movapd 0-120(pA5, ldab,2), rC7 mulpd rC13, rC7 movapd 0-120(pA10, mldab,2), rC8 mulpd rC13, rC8 movapd 0-120(pA5,ldab,4), rC9 mulpd rC13, rC9 movapd 0-120(pA10), rC10 mulpd rC13, rC10 movapd 0-120(pA10,ldab), rC11 mulpd rC13, rC11 movapd 0-120(pA10,ldab,2), rC12 mulpd rC13, rC12 mulpd 0-120(pA5,ldab,8), rC13#else movsd BOF(%rsp), rC0 movapd rC0, rC1 movapd rC0, rC2 movapd rC0, rC3 movapd rC0, rC4 movapd rC0, rC5 movapd rC0, rC6 movapd rC0, rC7 movapd rC0, rC8 movapd rC0, rC9 movapd rC0, rC10 movapd rC0, rC11 movapd rC0, rC12 movapd rC0, rC13 mulsd (pC), rC0 mulsd CMUL(8)(pC), rC1 mulsd CMUL(16)(pC), rC2 mulsd CMUL(24)(pC), rC3 mulsd CMUL(32)(pC), rC4 mulsd CMUL(40)(pC), rC5 mulsd CMUL(48)(pC), rC6 mulsd CMUL(56)(pC), rC7 mulsd CMUL(64)(pC), rC8 mulsd CMUL(72)(pC), rC9 mulsd CMUL(80)(pC), rC10 mulsd CMUL(88)(pC), rC11 mulsd CMUL(96)(pC), rC12 mulsd CMUL(104)(pC), rC13 movapd 0-120(pA10,mldab5,2), rA0 movapd 0-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 0-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 0-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 0-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 0-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 0-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 0-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 0-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 0-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 0-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 0-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 0-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 0-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 0-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif#if KB > 2 movapd 16-120(pA10,mldab5,2), rA0 movapd 16-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 16-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 
movapd 16-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 16-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 16-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 16-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 16-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 16-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 16-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 16-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 16-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 16-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 16-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 16-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif#if KB > 4 movapd 32-120(pA10,mldab5,2), rA0 movapd 32-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 32-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 32-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 32-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 32-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 32-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 32-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 32-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 32-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 32-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 32-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 32-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 32-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 32-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif#if KB > 6 movapd 48-120(pA10,mldab5,2), rA0 movapd 48-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 48-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 48-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 48-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 48-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 48-120(pA5), rA0 mulpd rB0, rA0 addpd 
rA0, rC5 movapd 48-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 48-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 48-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 48-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 48-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 48-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 48-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 48-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif#ifndef DREAL pref2((pfA)) pref2(64(pfA))#endif#if KB > 8 movapd 64-120(pA10,mldab5,2), rA0 movapd 64-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 64-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 64-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 64-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 64-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 64-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 64-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 64-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 64-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 64-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 64-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 64-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 64-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 64-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif#if KB > 10 movapd 80-120(pA10,mldab5,2), rA0 movapd 80-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 80-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 80-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 80-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 80-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 80-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 80-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 80-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 80-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd 
rA0, rC8 movapd 80-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 80-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 80-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 80-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 80-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif#if KB > 12 movapd 96-120(pA10,mldab5,2), rA0 movapd 96-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 96-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 96-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 96-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 96-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 96-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 96-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 96-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 96-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 96-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 96-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 96-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 96-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 96-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif
⌨️ 快捷键说明
复制代码:Ctrl + C
搜索代码:Ctrl + F
全屏模式:F11
增大字号:Ctrl + =
减小字号:Ctrl + -
显示快捷键:?