atl_dmm1x14x56_sse2pabc.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 2,923 行 · 第 1/5 页
C
2,923 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2003 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#ifndef ATL_GAS_x8664 #error "This kernel requires x86-64 assembly!"#endif#include "atlas_asm.h"#if !defined(MB) #define MB 0#endif#if !defined(NB) #define NB 0#endif#if !defined(KB) || (KB == 0) #error "KB must be a compile-time constant!"#endif#if (KB != 56) #error "KB must be 56!"#endif#if (NB/14)*14 != NB #error "NB must be multiple of 14!"#endif/* * Integer register usage shown be these defines */#define pA %rcx#define pA10 %rbx#define ldab %rbp#define mldab %rdx#define mldab5 %rax#define pB %rdi#define pC %rsi#define stM %r9#define stN %r11#define pfA %r8#define pA5 pA#define pB0 pB#define pAS %r13#define ldc %r10#define mldc %r14#define ldc3 %r15/* rax used in 32/64 conversion */#define NBso (KB*8)#define MBKBso (MB*KB*8)#define NB2so (NBso+NBso)#define NB3so (NBso+NBso+NBso)#define NB4so (NBso+NBso+NBso+NBso)#define NB5so (NBso+NBso+NBso+NBso+NBso)#define NB6so (NB3so+NB3so)#define NB7so (NB3so+NB4so)#define NB8so (NB4so+NB4so)#define NB9so (NB4so+NB5so)#define NB10so (NB5so+NB5so)#define NB11so (NB6so+NB5so)#define NB12so (NB7so+NB5so)#define NB13so (NB8so+NB5so)#define NB14so (NB9so+NB5so)/* * SSE2 register usage shown be these defines */#define rA0 %xmm0#define rB0 %xmm1#define rC0 %xmm2#define rC1 %xmm3#define rC2 %xmm4#define rC3 %xmm5#define rC4 %xmm6#define rC5 %xmm7#define rC6 %xmm8#define rC7 %xmm9#define rC8 %xmm10#define rC9 %xmm11#define rC10 %xmm12#define rC11 %xmm13#define rC12 %xmm14#define rC13 %xmm15/* * Prefetch defines */#if 1#define pref2(mem) prefetcht1 mem#define prefB(mem) prefetcht0 mem#define prefC(mem) prefetchw mem#else#define pref2(mem)#define prefB(mem)#define prefC(mem)#endif .text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* * Save callee-saved iregs */ movq %rbp, -8(%rsp) movq %rbx, -16(%rsp) movq %r12, -32(%rsp) movq %r13, -40(%rsp) movq %r14, -48(%rsp) movq %r15, -56(%rsp)#define SOFF -64#ifdef BETAX #define BOF -24 movlpd %xmm1, BOF(%rsp)#endif/* * pA already comes in right reg; load stN, * Initialize pB 
= B; pC = C; NBso = NB * sizeof; */ movq %rsi, stN movq %rdi, %r12 movq %r9, pB prefB((pB)) prefB(64(pB)) movq 16(%rsp), pC prefC((pC))/* * stM = M/14; stN = N */#if MB != 0 movq $MB, stM#else movq %r12, stM#endif/* * convert ldc to 64 bits, and mul by size */ movl 24(%rsp), %eax cltq movq %rax, ldc#ifdef DREAL shl $3, ldc#else shl $4, ldc#endif/* * At this point, pA5 has pA, pB0 has pB, stN has N, stM has M; swap * them so that we can reverse loops */ movq pA5, mldc movq pB0, pA5 movq mldc, pB0 prefB(128(pB)) movq stN, mldc movq stM, stN movq mldc, stM/* * pfA = pA + M*KBso */ movq stM, pfA imulq $NBso, pfA addq pA5, pfA/* * Calculate and store incCn = sizeof*(N*ldc - 1) */ movq stM, mldc imulq ldc, mldc#ifdef DREAL subq $8, mldc#else subq $16, mldc#endif movq mldc, SOFF(%rsp)/* * mldc = -ldc; ldc3 = ldc*3, pC = pC + ldc*2 */ movq ldc, mldc neg mldc lea (ldc,ldc,2), ldc3 lea (pC, ldc,2), pC addq $120, pA5 addq $120, pB0 movq $KB*8, ldab movq $-KB*5*8, mldab5 movq $-KB*8, mldab subq mldab5, pA5 lea KB*8(pA5, ldab,4), pA10 movq pA10, pAS movq stM, %r12UNLOOP:#if NB == 0 movq %r12, stM sub $14, stM jz UMLOOPCU#else movq $NB-14, stM#endif#if NB != 14UMLOOP:/* * rC[0-13] = pC[0-13] * beta */ ALIGN16/*UKLOOP: */#ifdef BETA1 movapd 0-120(pA10,mldab5,2), rC0 movapd 0-120(pB0), rB0 mulpd rB0, rC0 addsd (pC,mldc,2), rC0 movapd 0-120(pA5, mldab,4), rC1 mulpd rB0, rC1 addsd (pC,mldc), rC1 movapd 0-120(pA10, mldab,8), rC2 mulpd rB0, rC2 addsd (pC), rC2 movapd 0-120(pA5, mldab,2), rC3 mulpd rB0, rC3 addsd (pC,ldc), rC3 movapd 0-120(pA5, mldab), rC4 mulpd rB0, rC4 addsd (pC,ldc,2), rC4 movapd 0-120(pA5), rC5 mulpd rB0, rC5 addsd (pC,ldc3), rC5 movapd 0-120(pA5, ldab), rC6 mulpd rB0, rC6 addsd (pC,ldc,4), rC6 addq ldc, pC movapd 0-120(pA5, ldab,2), rC7 lea (pC,ldc3,2), pC mulpd rB0, rC7 addsd (pC,mldc,2), rC7 movapd 0-120(pA10, mldab,2), rC8 mulpd rB0, rC8 addsd (pC,mldc), rC8 movapd 0-120(pA5,ldab,4), rC9 mulpd rB0, rC9 addsd (pC), rC9 movapd 0-120(pA10), rC10 mulpd rB0, rC10 
addsd (pC,ldc), rC10 movapd 0-120(pA10,ldab), rC11 mulpd rB0, rC11 addsd (pC,ldc,2), rC11 movapd 0-120(pA10,ldab,2), rC12 mulpd rB0, rC12 addsd (pC,ldc3), rC12 movapd 0-120(pA5,ldab,8), rC13 mulpd rB0, rC13 addsd (pC,ldc,4), rC13#elif defined(BETA0) movapd 0-120(pA10,mldab5,2), rC0 movapd 0-120(pB0), rC13 mulpd rC13, rC0 movapd 0-120(pA5, mldab,4), rC1 mulpd rC13, rC1 movapd 0-120(pA10, mldab,8), rC2 mulpd rC13, rC2 movapd 0-120(pA5, mldab,2), rC3 mulpd rC13, rC3 movapd 0-120(pA5, mldab), rC4 mulpd rC13, rC4 movapd 0-120(pA5), rC5 mulpd rC13, rC5 movapd 0-120(pA5, ldab), rC6 mulpd rC13, rC6 movapd 0-120(pA5, ldab,2), rC7 mulpd rC13, rC7 movapd 0-120(pA10, mldab,2), rC8 mulpd rC13, rC8 movapd 0-120(pA5,ldab,4), rC9 mulpd rC13, rC9 movapd 0-120(pA10), rC10 mulpd rC13, rC10 movapd 0-120(pA10,ldab), rC11 mulpd rC13, rC11 movapd 0-120(pA10,ldab,2), rC12 mulpd rC13, rC12 mulpd 0-120(pA5,ldab,8), rC13#else movsd BOF(%rsp), rC0 movapd rC0, rC1 movapd rC0, rC2 movapd rC0, rC3 movapd rC0, rC4 movapd rC0, rC5 movapd rC0, rC6 movapd rC0, rC7 movapd rC0, rC8 movapd rC0, rC9 movapd rC0, rC10 movapd rC0, rC11 movapd rC0, rC12 movapd rC0, rC13 mulsd (pC,mldc,2), rC0 mulsd (pC,mldc), rC1 mulsd (pC), rC2 mulsd (pC,ldc), rC3 mulsd (pC,ldc,2), rC4 mulsd (pC,ldc3), rC5 mulsd (pC,ldc,4), rC6 add ldc, pC lea (pC,ldc3,2), pC mulsd (pC,mldc,2), rC7 mulsd (pC,mldc), rC8 mulsd (pC), rC9 mulsd (pC,ldc), rC10 mulsd (pC,ldc,2), rC11 mulsd (pC,ldc3), rC12 mulsd (pC,ldc,4), rC13 movapd 0-120(pA10,mldab5,2), rA0 movapd 0-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 lea (pC,mldc,8), pC movapd 0-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 addq ldc, pC movapd 0-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 0-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 0-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 0-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 0-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 0-120(pA5, ldab,2), rA0 mulpd rB0, rA0 
addpd rA0, rC7 movapd 0-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 0-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 0-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 0-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 0-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 0-120(pA5,ldab,8), rB0 addpd rB0, rC13#endif movapd 16-120(pA10,mldab5,2), rA0 movapd 16-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0#ifdef BETA1 lea (pC,mldc,8), pC#endif movapd 16-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1#ifdef BETA1 addq ldc, pC#endif movapd 16-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 16-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 16-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 16-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 16-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 16-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 16-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 16-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 16-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 16-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 16-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 16-120(pA5,ldab,8), rB0 addpd rB0, rC13 movapd 32-120(pA10,mldab5,2), rA0 movapd 32-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 32-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 32-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 32-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 32-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 32-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 32-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 32-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 32-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 32-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 32-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 
movapd 32-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 32-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 32-120(pA5,ldab,8), rB0 addpd rB0, rC13 movapd 48-120(pA10,mldab5,2), rA0 movapd 48-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 48-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 48-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 48-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 48-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 48-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 48-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 48-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 48-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 48-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 48-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 48-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 48-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 48-120(pA5,ldab,8), rB0 addpd rB0, rC13#ifndef DREAL pref2((pfA)) pref2(64(pfA))#endif movapd 64-120(pA10,mldab5,2), rA0 movapd 64-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 64-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 64-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 64-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 64-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 64-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5 movapd 64-120(pA5, ldab), rA0 mulpd rB0, rA0 addpd rA0, rC6 movapd 64-120(pA5, ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC7 movapd 64-120(pA10, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC8 movapd 64-120(pA5,ldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC9 movapd 64-120(pA10), rA0 mulpd rB0, rA0 addpd rA0, rC10 movapd 64-120(pA10,ldab), rA0 mulpd rB0, rA0 addpd rA0, rC11 movapd 64-120(pA10,ldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC12 mulpd 64-120(pA5,ldab,8), rB0 addpd rB0, rC13 movapd 80-120(pA10,mldab5,2), rA0 movapd 
80-120(pB0), rB0 mulpd rB0, rA0 addpd rA0, rC0 movapd 80-120(pA5, mldab,4), rA0 mulpd rB0, rA0 addpd rA0, rC1 movapd 80-120(pA10, mldab,8), rA0 mulpd rB0, rA0 addpd rA0, rC2 movapd 80-120(pA5, mldab,2), rA0 mulpd rB0, rA0 addpd rA0, rC3 movapd 80-120(pA5, mldab), rA0 mulpd rB0, rA0 addpd rA0, rC4 movapd 80-120(pA5), rA0 mulpd rB0, rA0 addpd rA0, rC5
⌨️ 快捷键说明
复制代码:Ctrl + C
搜索代码:Ctrl + F
全屏模式:F11
增大字号:Ctrl + =
减小字号:Ctrl + -
显示快捷键:?