atl_dmm6x1x60_sse2_32.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,718 行 · 第 1/3 页
C
1,718 行
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2004 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * Overview (derived from the code below):
 *   GAS x86 (32- or 64-bit) SSE2 double precision kernel with K fixed at 60.
 *   Every 16-byte step of the unrolled K loop multiplies one B vector (pB0)
 *   by six A vectors located at pA0 + i*ldab, i = 0..5 (pA3 = pA0 + 3*ldab),
 *   and accumulates the packed products in rC0..rC5.
 *   pA0 and pB0 are biased by +120 bytes, which is why every K-loop operand
 *   carries a "-120" in its displacement.
 *   This file must be run through the C preprocessor: each directive, label
 *   and instruction has to sit on its own line.
 */
#ifdef ATL_OS_SunOS
   #define ATL_DIV_NUM MB
   #define ATL_DIV_DEN 6
#endif
#include "atlas_asm.h"

/*
 * On x86-64 the 32-bit mnemonics/stack register used by the shared code are
 * remapped to their 64-bit forms, so "movl" below assembles as "movq" there.
 */
#ifdef ATL_GAS_x8664
   #define movl movq
   #define addl addq
   #define subl subq
   #define esp rsp
   #define X8664
#elif defined(ATL_GAS_x8632)
   #define X8632
#else
   #error "This kernel requires a gas x86 assembler!"
#endif
#ifndef MB
   #ifdef X8664
      #define MB 0
   #else
      #error "MB must be compile-time constant!"
   #endif
#endif
#ifndef NB
   #define NB 0
#endif
#if !defined(KB) || KB != 60
   #error "KB must be compile-time constant of 60!"
#endif

/*
 * Integer register assignment
 */
#ifdef ATL_GAS_x8632
   #define pC0     %esi
   #define pA0     %ecx
   #define pA3     %eax
   #define pB0     %edi
   #define ldab    %edx
   #define pfA     %ebp
   #define stN     %bh
   #define stM     %bl
#else
   #define pC0     %rsi
   #define pA0     %rcx
   #define pA3     %rax
   #define pB0     %rbx
   #define ldab    %rdx
   #define pfA     %rbp
   #define stM     %rdi
   #define stN     %r8
   #define MM      %r9
   #define MBKB    %r10
   #define incCn   %r11
   #define rBETA   %xmm8
   #define rc0     %xmm9
   #define rc1     %xmm10
   #define rc2     %xmm11
#endif
/*
 * SSE register assignment: six accumulators plus one A and one B operand
 */
#define rC0     %xmm0
#define rC1     %xmm1
#define rC2     %xmm2
#define rC3     %xmm3
#define rC4     %xmm4
#define rC5     %xmm5
#define rA0     %xmm6
#define rB0     %xmm7
#define NB6so   (6*KB*8)
/*
 * Prefetch defines
 */
#if 1
   #define pref2(mem) prefetcht1 mem
   #define prefB(mem) prefetcht0 mem
   #define prefC(mem) prefetchw mem
#else
   #define pref2(mem)
   #define prefB(mem)
   #define prefC(mem)
#endif
/*
 * CMUL doubles a byte offset into C when DCPLX is defined
 */
#ifdef DCPLX
   #define CMUL(arg_) 2*arg_
#else
   #define CMUL(arg_) arg_
#endif
/*
 * Argument locations, written as <x86-64 location>/<x86-32 stack offset>:
 *
 *                    %rdi/4       %rsi/8       %rdx/12          %xmm0/16
 * void ATL_AUSERMM(const int M, const int N, const int K, const TYPE alpha,
 *                       %rcx/24         %r8/28         %r9/32           36
 *                  const TYPE *A, const int lda, const TYPE *B, const int ldb,
 *                       %xmm1/40    16/48          24/52
 *                  const TYPE beta, TYPE *C, const int ldc)
 */
        .text
.global ATL_asmdecor(ATL_USERMM)
ALIGN16
ATL_asmdecor(ATL_USERMM):
#ifdef X8632
/*
 *      Reserve 28 bytes of stack and save the integer registers used below;
 *      all argument offsets that follow are therefore shifted by 28
 */
        subl    $28, %esp
        movl    %ebp, 24(%esp)
        movl    %ebx, 20(%esp)
        movl    %esi, 16(%esp)
        movl    %edi, 12(%esp)
/*
 *      Store incCn = (ldc-NB)*sizeof and BETA to stack
 */
        movl    80(%esp), %eax
        movb    %al, stM
        subl    $MB, %eax
   #ifdef DCPLX
        shl     $4, %eax
   #else
        shl     $3, %eax
   #endif
        movl    %eax, 8(%esp)
   #ifdef BETAX
        fldl    68(%esp)
        fstpl   (%esp)
      #define BETAOFF 0
   #endif
/*
 *      Initialize pA = A;  pB = B; pC = C;
 */
        movl    76(%esp), pC0
        prefC((pC0))
        prefC(64(pC0))
        movl    52(%esp), pA0
        movl    60(%esp), pB0
        prefB((pB0))
        prefB(64(pB0))
        addl    $120, pA0
        addl    $120, pB0
/*
 *      ldab = K * 8;
 */
        movl    40(%esp), ldab
        shl     $3, ldab
/*
 *      pfA = pA + NBNB  (the -120 undoes the bias already added to pA0)
 */
        movl    pA0, pfA
        addl    $MB*KB*8-120, pfA
        movb    36(%esp), stN
#else
/*
 *      Save callee-saved iregs (stored below %rsp without adjusting it)
 */
        movq    %rbp, -8(%rsp)
        movq    %rbx, -16(%rsp)
/*
 *      Initialize beta: duplicate the scalar into both halves of rBETA
 */
        unpcklpd        %xmm1, %xmm1
        movapd  %xmm1, rBETA
/*
 *      On entry, pA0 is correct, ldab has K, stM has M, pC0 has stN; MM has pB0
 *      Set ldab = K*8, stN=N, and MBKB = M*K*8
 */
        shlq    $3, ldab
        movq    pC0, stN
   #if MB == 0
        movq    stM, MBKB
        imulq   ldab, MBKB
   #else
        movq    $MB*KB*8, MBKB
   #endif
/*
 *      Initialize incCn = (ldc-MB)*sizeof; pC0 = C; pB0 = B, pfA = pA0+MBKB
 *      ldc is fetched with a 64-bit load (movl is remapped to movq), then
 *      sign-extended from its low 32 bits by the shl/sar pair.
 *      NOTE(review): this scaling tests TCPLX while every other complex
 *      switch in this file tests DCPLX -- confirm TCPLX is defined for
 *      complex builds of this kernel.
 */
        movl    24(%rsp), %rsi
        movq    %rsi, incCn
        shlq    $32, incCn
        sarq    $32, incCn
        subq    stM, incCn
   #ifdef TCPLX
        shlq    $4, incCn
   #else
        shlq    $3, incCn
   #endif
        movq    16(%rsp), pC0
        movq    MM, pB0
        movq    stM, MM
        subq    $6, MM
        movq    pA0, pfA
        addq    MBKB, pfA
/*
 *      Bias ptrs to max 1-byte range
 */
        addq    $120, pA0
        addq    $120, pB0
/*
 *      MBKB henceforth used to increment pA0, so subtract NB6so+240 from it
 *      NOTE(review): the instruction subtracts (NB6so-240), which does not
 *      match the wording above -- verify against the loop-end pointer updates.
 */
        subq    $NB6so-240, MBKB
#endif
        prefB((pA0))
        prefB(64(pA0))
        ALIGN8
NLOOP:
/*
 *      pA3 = pA0 + 3*ldab
 */
        lea     0(pA0, ldab,2), pA3
        addl    ldab, pA3
#if MB == 0 || MB > 6
   #ifdef X8632
      #ifdef ATL_DivAns
        movb    $ATL_DivAns-1, stM
      #else
        movb    $MB/6-1, stM
      #endif
   #else
        movq    MM, stM
      #if MB == 0
        cmp     $0, stM
        je      MLOOPCU
      #endif
   #endif
        ALIGN8
MLOOP:
#ifdef BETAX
   #ifdef X8632
/*
 *      32-bit: start the accumulators as beta * C
 */
        movsd   (pC0), rC0
        movsd   CMUL(8)(pC0), rC1
        movsd   CMUL(16)(pC0), rC2
        movsd   CMUL(24)(pC0), rC3
        movsd   CMUL(32)(pC0), rC4
        movsd   CMUL(40)(pC0), rC5
        movlpd  BETAOFF(%esp), rA0
        mulsd   rA0, rC0
        mulsd   rA0, rC1
        mulsd   rA0, rC2
        mulsd   rA0, rC3
        mulsd   rA0, rC4
        mulsd   rA0, rC5
   #else
/*
 *      64-bit: keep beta * C in rc0..rc2 (two C values per register)
 */
      #ifdef DCPLX
        movlpd  (pC0), rc0
        movhpd  16(pC0), rc0
        movlpd  32(pC0), rc1
        movhpd  48(pC0), rc1
/*
 *      The third pair (byte offsets 64 and 80) belongs in rc2.  It was
 *      previously loaded into rc1, which overwrote the second pair and left
 *      rc2 uninitialized for the multiply by rBETA below.
 */
        movlpd  64(pC0), rc2
        movhpd  80(pC0), rc2
      #else
        movupd  (pC0), rc0
        movupd  16(pC0), rc1
        movupd  32(pC0), rc2
      #endif
        mulpd   rBETA, rc0
        mulpd   rBETA, rc1
        mulpd   rBETA, rc2
   #endif
        ALIGN16
#endif
/*KLOOP */
/*
 * First K step (byte offset 0): how the accumulators are started depends on
 * the BETA case selected at build time
 */
#ifdef BETA1
        movapd  0-120(pA0), rC0
        movapd  0-120(pB0), rB0
        mulpd   rB0, rC0
        addsd   (pC0), rC0
        movapd  0-120(pA0,ldab), rC1
        mulpd   rB0, rC1
        addsd   CMUL(8)(pC0), rC1
        movapd  0-120(pA0,ldab,2), rC2
        mulpd   rB0, rC2
        addsd   CMUL(16)(pC0), rC2
        movapd  0-120(pA3), rC3
        mulpd   rB0, rC3
        addsd   CMUL(24)(pC0), rC3
        movapd  0-120(pA3,ldab), rC4
        mulpd   rB0, rC4
        addsd   CMUL(32)(pC0), rC4
        movapd  0-120(pA3,ldab,2), rC5
        mulpd   rB0, rC5
        addsd   CMUL(40)(pC0), rC5
#elif defined(BETA0) || defined(X8664)
        movapd  0-120(pA0), rC0
        movapd  0-120(pB0), rB0
        mulpd   rB0, rC0
        movapd  0-120(pA0,ldab), rC1
        mulpd   rB0, rC1
        movapd  0-120(pA0,ldab,2), rC2
        mulpd   rB0, rC2
        movapd  0-120(pA3), rC3
        mulpd   rB0, rC3
        movapd  0-120(pA3,ldab), rC4
        mulpd   rB0, rC4
        movapd  0-120(pA3,ldab,2), rC5
        mulpd   rB0, rC5
#else
        movapd  0-120(pA0), rA0
        movapd  0-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  0-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  0-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  0-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  0-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   0-120(pA3,ldab,2), rB0
        addpd   rB0, rC5
#endif

/*
 * Remaining K steps, fully unrolled in 16-byte increments
 */
        movapd  16-120(pA0), rA0
        movapd  16-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  16-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  16-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  16-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  16-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   16-120(pA3,ldab,2), rB0
        addpd   rB0, rC5
        prefC((pC0))
        prefC(64(pC0))

        movapd  32-120(pA0), rA0
        movapd  32-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  32-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  32-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  32-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  32-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   32-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  48-120(pA0), rA0
        movapd  48-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  48-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  48-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  48-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  48-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   48-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  64-120(pA0), rA0
        movapd  64-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  64-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  64-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  64-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  64-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   64-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  80-120(pA0), rA0
        movapd  80-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  80-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  80-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  80-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  80-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   80-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  96-120(pA0), rA0
        movapd  96-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  96-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  96-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  96-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  96-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   96-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  112-120(pA0), rA0
        movapd  112-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  112-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  112-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  112-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  112-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   112-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  128-120(pA0), rA0
        movapd  128-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  128-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  128-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  128-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  128-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   128-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  144-120(pA0), rA0
        movapd  144-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  144-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  144-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  144-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  144-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   144-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  160-120(pA0), rA0
        movapd  160-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  160-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  160-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  160-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  160-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   160-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  176-120(pA0), rA0
        movapd  176-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  176-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  176-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  176-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  176-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   176-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  192-120(pA0), rA0
        movapd  192-120(pB0), rB0
        mulpd   rB0, rA0
        addpd   rA0, rC0
        movapd  192-120(pA0,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC1
        movapd  192-120(pA0,ldab,2), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC2
        movapd  192-120(pA3), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC3
        movapd  192-120(pA3,ldab), rA0
        mulpd   rB0, rA0
        addpd   rA0, rC4
        mulpd   192-120(pA3,ldab,2), rB0
        addpd   rB0, rC5

        movapd  208-120(pA0), rA0
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?