atl_dmm2x1x40_5pabc.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 3,473 行 · 第 1/4 页
C
3,473 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2001 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include "atlas_asm.h"#ifdef ATL_GAS_x8632 #define movq movl #define addq addl #define subq subl #define rsp esp#elif defined(ATL_GAS_x8664) #error "This kernel not debugged under x8664 yet"#elif !defined(ATL_GAS_x8664) #error "This kernel requires a gas x86 assembler!"#endif#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if (KB != 40) #error "KB must be 40!"#endif#if (MB/2)*2 != MB #error "MB must be multiple of 2!"#endif#if defined(NB) && NB == 0 #undef NB#endif#if defined(MB) && MB == 0 #undef MB#endif/* * Integer register usage shown be these defines */#ifdef ATL_GAS_x8632 #define pC %esi #define pA %ecx #define pB %edi #define incCn %eax #define stM %edx #define stN %ebx #define pfA %ebp#else #define pC %rbx #define pA %rcx #define pB %rdx #define incCn %rax #define stM %rdi #define stN %rsi #define pfA %rbp #define pA0 %r11 /* rax used in 32/64 conversion */#endif#ifdef DCPLX #define incCm 32 #define OFF 16#else #define incCm 16 #define OFF 8#endif#define NBso (KB*8)#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so (NB6so+NBso)#define NB8so (NB6so+NB2so)#define NB9so (NB6so+NB3so)#define NB10so (NB6so+NB4so)#define NB11so (NB6so+NB5so)/* * Prefetch defines */#ifdef ATL_SSE1 #define pref2(mem) prefetcht1 mem #define prefA(mem) prefetcht0 mem #define prefB(mem) prefetcht0 mem #define prefC(mem) prefetcht0 mem#else #define pref2(mem) #define prefA(mem) #define prefB(mem) #define prefC(mem)#endif/* %rdi/4 %rsi/8 %rdx/12 %xmm0/16 *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, * %rcx/24 %r8/28 %r9/32 8/36 * const TYPE *A, const int lda, const TYPE *B, const int ldb, * %xmm1/40 16/48 24/52 * const TYPE beta, TYPE *C, const int ldc) */ .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):#ifdef ATL_GAS_x8632/* * Save callee-saved iregs */ subl $24, %esp movl %ebp, 20(%esp) movl %ebx, 16(%esp) movl %esi, 12(%esp) movl %edi, 8(%esp) #ifdef BETAX fldl 64(%esp) fstpl (%esp) #define BETAOFF 0 #endif/* * Initialize stM = (M-4)*KB, stN = N*KB * Set incCn = (ldc - M)*sizeof */ movl 28(%esp), stM movl 76(%esp), incCn subl stM, incCn addl $2, incCn #ifdef DCPLX shl $4, incCn #else shl $3, incCn #endif subl $4, stM imull $NBso, stM movl 32(%esp), stN imull $NBso, stN/* * Initialize pA = A; pB = B; pC = C; */ movl 48(%esp), pA movl 56(%esp), pB movl 72(%esp), pC/* prefC((pC)) *//* prefC(64(pC)) *//* * stM = pA + NBNB-4*NB; pfA = pA+NBNB; stN = pB + NBNB; */ addl pA, stM movl stM, pfA addl $NB4so, pfA addl pB, stN#else/* * Save callee-saved iregs */ movq %rbp, -8(%rsp) movq %rbx, -16(%rsp) #ifdef BETAX movsd %xmm1, -24(%rsp) #define BETAOFF -24 #endif/* * pA already comes in right reg * Initialize pB = B; pC = C; */ movq pA, pA0 movq %r9, pB movq 16(%rsp), pC/* prefC((pC)) *//* prefC(64(pC)) *//* * stM = pA + NBNBso; stN = pB + NBNBso; */#ifndef MB imul $KB*8, stM addq $-4*KB*8, stM#else movq $MB*KB*8-4*KB*8, stM#endif addq pA, stM movq stM, pfA addq $4*KB*8, pfA#ifndef NB imul $KB*8, stN#else movq $NB*KB*8, stN#endif addq pB, stN/* * convert ldc to 64 bits, and then set incCn = (ldc - NB)*sizeof */ movl 24(%rsp), %eax cltq/* prefB((pB)) *//* prefB(64(pB)) *//* movq %rax, incCn *//* subq $NB, incCn */ shl $3, incCn addq $16, incCn#endif/* * Unroll the 1st iterations of N-loop so we can prefetch A */ #ifndef MB cmp stM, pA je MLOOP_DRAIN_UN #endif#if !defined(MB) || MB > 4MLOOP_UN: fldl (pB) fldl (pA) fmul %st(1),%st fldl 320(pA) fmulp %st,%st(2) fldl 8(pB) fldl 8(pA) fmul %st(1),%st fldl 328(pA) fmulp %st,%st(2) fldl 16(pB) fldl 16(pA) fmul %st(1),%st #if defined(BETA0) || defined (BETAX) prefC((pC)) fldz #else fldl (pC) #endif faddp %st,%st(5) fldl 336(pA) fmulp %st,%st(2) fldl 24(pB) #if defined(BETA0) || defined (BETAX) fldz #else fldl OFF(pC) #endif faddp %st,%st(7) fldl 24(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 344(pA) fmulp %st,%st(1) fldl 32(pB) fxch %st(7) faddp %st,%st(5) fldl 32(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 352(pA) fmulp %st,%st(7) fldl 40(pB) fxch %st(5) faddp %st,%st(3) fldl 40(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 360(pA) fmulp %st,%st(5) fldl 48(pB) fxch %st(3) faddp %st,%st(1) fldl 48(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 368(pA) fmulp %st,%st(3) fldl 56(pB) fxch %st(1) faddp %st,%st(7) fldl 56(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 376(pA) fmulp %st,%st(1) fldl 64(pB) fxch %st(7) faddp %st,%st(5) fldl 64(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 384(pA) fmulp %st,%st(7) fldl 72(pB) fxch %st(5) prefA(NB2so(pA)) prefA(NB3so(pA)) faddp %st,%st(3) fldl 72(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 392(pA) fmulp %st,%st(5) fldl 80(pB) fxch %st(3) faddp %st,%st(1) fldl 80(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 400(pA) fmulp %st,%st(3) fldl 88(pB) fxch %st(1) faddp %st,%st(7) fldl 88(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 408(pA) fmulp %st,%st(1) fldl 96(pB) fxch %st(7) faddp %st,%st(5) fldl 96(pA) fmul %st(7),%st fxch %st(6) prefA(32+NB2so(pA)) prefA(32+NB3so(pA)) faddp %st,%st(4) fldl 416(pA) fmulp %st,%st(7) fldl 104(pB) fxch %st(5) faddp %st,%st(3) fldl 104(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 424(pA) fmulp %st,%st(5) fldl 112(pB) fxch %st(3) faddp %st,%st(1) fldl 112(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 432(pA) fmulp %st,%st(3) fldl 120(pB) fxch %st(1) faddp %st,%st(7) fldl 120(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 440(pA) fmulp %st,%st(1) fldl 128(pB) fxch %st(7) prefA(64+NB2so(pA)) prefA(64+NB3so(pA)) faddp %st,%st(5) fldl 128(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 448(pA) fmulp %st,%st(7) fldl 136(pB) fxch %st(5) faddp %st,%st(3) fldl 136(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 456(pA) fmulp %st,%st(5) fldl 144(pB) fxch %st(3) faddp %st,%st(1) fldl 144(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 464(pA) fmulp %st,%st(3) fldl 152(pB) fxch %st(1) faddp %st,%st(7) fldl 152(pA) fmul %st(1),%st fxch %st(4) prefA(96+NB2so(pA)) prefA(96+NB3so(pA)) faddp %st,%st(2) fldl 472(pA) fmulp %st,%st(1) fldl 160(pB) fxch %st(7) faddp %st,%st(5) fldl 160(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 480(pA) fmulp %st,%st(7) fldl 168(pB) fxch %st(5) faddp %st,%st(3) fldl 168(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 488(pA) fmulp %st,%st(5) fldl 176(pB) fxch %st(3) faddp %st,%st(1) fldl 176(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 496(pA) fmulp %st,%st(3) fldl 184(pB) fxch %st(1) prefA(128+NB2so(pA)) prefA(128+NB3so(pA)) faddp %st,%st(7) fldl 184(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 504(pA) fmulp %st,%st(1) fldl 192(pB) fxch %st(7) faddp %st,%st(5) fldl 192(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 512(pA) fmulp %st,%st(7) fldl 200(pB) fxch %st(5) faddp %st,%st(3) fldl 200(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 520(pA) fmulp %st,%st(5) fldl 208(pB) fxch %st(3) faddp %st,%st(1) fldl 208(pA) fmul %st(3),%st fxch %st(2) prefA(160+NB2so(pA)) prefA(160+NB3so(pA)) faddp %st,%st(6) fldl 528(pA) fmulp %st,%st(3) fldl 216(pB) fxch %st(1) faddp %st,%st(7) fldl 216(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 536(pA) fmulp %st,%st(1) fldl 224(pB) fxch %st(7) faddp %st,%st(5) fldl 224(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 544(pA) fmulp %st,%st(7) fldl 232(pB) fxch %st(5) faddp %st,%st(3) fldl 232(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 552(pA) fmulp %st,%st(5) fldl 240(pB) fxch %st(3) prefA(192+NB2so(pA)) prefA(192+NB3so(pA)) faddp %st,%st(1) fldl 240(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 560(pA) fmulp %st,%st(3) fldl 248(pB) fxch %st(1) faddp %st,%st(7) fldl 248(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 568(pA) fmulp %st,%st(1) fldl 256(pB) fxch %st(7) faddp %st,%st(5) fldl 256(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 576(pA) fmulp %st,%st(7) fldl 264(pB) fxch %st(5) faddp %st,%st(3) fldl 264(pA) fmul %st(5),%st fxch %st(6) prefA(224+NB2so(pA)) prefA(224+NB3so(pA)) faddp %st,%st(4) fldl 584(pA) fmulp %st,%st(5) fldl 272(pB) fxch %st(3) faddp %st,%st(1) fldl 272(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 592(pA) fmulp %st,%st(3) fldl 280(pB) fxch %st(1) faddp %st,%st(7) fldl 280(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 600(pA) fmulp %st,%st(1) fldl 288(pB) fxch %st(7) faddp %st,%st(5) fldl 288(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4) fldl 608(pA) fmulp %st,%st(7) fldl 296(pB) fxch %st(5) prefA(256+NB2so(pA)) prefA(256+NB3so(pA)) faddp %st,%st(3) fldl 296(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 616(pA) fmulp %st,%st(5) fldl 304(pB) fxch %st(3) faddp %st,%st(1) fldl 304(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 624(pA) fmulp %st,%st(3) fldl 312(pB) fxch %st(1) prefC(32(pC)) faddp %st,%st(7) fldl 312(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 632(pA) fmulp %st,%st(1) fxch %st(6) faddp %st,%st(4) faddp %st,%st(2) faddp %st,%st(2) faddp %st,%st(2) prefA(288+NB2so(pA)) prefA(288+NB3so(pA)) faddp %st,%st(2)/* * While (pB != stK); *//* cmp pB, stK *//* jne KLOOP *//* * Write results back to C */ #ifdef BETAX fldl (pC) fldl OFF(pC) fldl BETAOFF(%rsp) fmul %st, %st(1) fmulp %st, %st(2) faddp %st, %st(3) faddp %st, %st(1) #endif fstpl (pC) fstpl OFF(pC)/* * pC += 2; pA += 2*NB; pB -= NB; */ addq $incCm, pC addq $NB2so, pA fldl (pB) fldl (pA) fmul %st(1),%st fldl 320(pA) fmulp %st,%st(2) fldl 8(pB) fldl 8(pA) fmul %st(1),%st fldl 328(pA) fmulp %st,%st(2) fldl 16(pB) fldl 16(pA) fmul %st(1),%st #if defined(BETA0) || defined (BETAX) fldz #else fldl (pC) #endif faddp %st,%st(5) fldl 336(pA) fmulp %st,%st(2) fldl 24(pB) #if defined(BETA0) || defined (BETAX) fldz #else fldl OFF(pC) #endif faddp %st,%st(7) fldl 24(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 344(pA) fmulp %st,%st(1) fldl 32(pB) fxch %st(7) faddp %st,%st(5) fldl 32(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 352(pA) fmulp %st,%st(7) fldl 40(pB) fxch %st(5) faddp %st,%st(3) fldl 40(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 360(pA) fmulp %st,%st(5) fldl 48(pB) fxch %st(3) faddp %st,%st(1) fldl 48(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 368(pA) fmulp %st,%st(3) fldl 56(pB) fxch %st(1) faddp %st,%st(7) fldl 56(pA) fmul %st(1),%st fxch %st(4) faddp %st,%st(2) fldl 376(pA) fmulp %st,%st(1) fldl 64(pB) fxch %st(7) faddp %st,%st(5) fldl 64(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 384(pA) fmulp %st,%st(7) fldl 72(pB) fxch %st(5) prefA(NB2so(pA)) prefA(NB3so(pA)) faddp %st,%st(3) fldl 72(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 392(pA) fmulp %st,%st(5) fldl 80(pB) fxch %st(3) faddp %st,%st(1) fldl 80(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 400(pA) fmulp %st,%st(3) fldl 88(pB) fxch %st(1) faddp %st,%st(7) fldl 88(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 408(pA) fmulp %st,%st(1) fldl 96(pB) fxch %st(7) faddp %st,%st(5) fldl 96(pA) fmul %st(7),%st fxch %st(6) prefA(32+NB2so(pA)) prefA(32+NB3so(pA)) faddp %st,%st(4) fldl 416(pA) fmulp %st,%st(7) fldl 104(pB) fxch %st(5) faddp %st,%st(3) fldl 104(pA) fmul %st(5),%st fxch %st(4) faddp %st,%st(2) fldl 424(pA) fmulp %st,%st(5) fldl 112(pB) fxch %st(3) faddp %st,%st(1) fldl 112(pA) fmul %st(3),%st fxch %st(2) faddp %st,%st(6) fldl 432(pA) fmulp %st,%st(3) fldl 120(pB) fxch %st(1) faddp %st,%st(7) fldl 120(pA) fmul %st(1),%st fxch %st(6) faddp %st,%st(4) fldl 440(pA) fmulp %st,%st(1) fldl 128(pB) fxch %st(7) prefA(64+NB2so(pA)) prefA(64+NB3so(pA)) faddp %st,%st(5) fldl 128(pA) fmul %st(7),%st fxch %st(4) faddp %st,%st(2) fldl 448(pA) fmulp %st,%st(7) fldl 136(pB) fxch %st(5) faddp %st,%st(3) fldl 136(pA) fmul %st(5),%st fxch %st(2) faddp %st,%st(6) fldl 456(pA) fmulp %st,%st(5) fldl 144(pB) fxch %st(3) faddp %st,%st(1) fldl 144(pA) fmul %st(3),%st fxch %st(6) faddp %st,%st(4) fldl 464(pA) fmulp %st,%st(3) fldl 152(pB) fxch %st(1) faddp %st,%st(7) fldl 152(pA) fmul %st(1),%st fxch %st(4) prefA(96+NB2so(pA)) prefA(96+NB3so(pA)) faddp %st,%st(2) fldl 472(pA) fmulp %st,%st(1) fldl 160(pB) fxch %st(7) faddp %st,%st(5) fldl 160(pA) fmul %st(7),%st fxch %st(2) faddp %st,%st(6) fldl 480(pA) fmulp %st,%st(7) fldl 168(pB) fxch %st(5) faddp %st,%st(3) fldl 168(pA) fmul %st(5),%st fxch %st(6) faddp %st,%st(4) fldl 488(pA) fmulp %st,%st(5) fldl 176(pB) fxch %st(3) faddp %st,%st(1) fldl 176(pA) fmul %st(3),%st fxch %st(4) faddp %st,%st(2) fldl 496(pA) fmulp %st,%st(3) fldl 184(pB) fxch %st(1) prefA(128+NB2so(pA)) prefA(128+NB3so(pA)) faddp %st,%st(7) fldl 184(pA) fmul %st(1),%st fxch %st(2) faddp %st,%st(6) fldl 504(pA) fmulp %st,%st(1) fldl 192(pB) fxch %st(7) faddp %st,%st(5) fldl 192(pA) fmul %st(7),%st fxch %st(6) faddp %st,%st(4)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?