📄 atl_dmm4x1x90_x87.c
字号:
/*
 *             Automatically Tuned Linear Algebra Software v3.8.0
 *                    (C) Copyright 2006 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * Syntax : GNU as, AT&T operand order (op src, dst), run through the C
 *          preprocessor before assembly.
 * Targets: 32-bit x86 (ATL_GAS_x8632) and x86-64 (ATL_GAS_x8664); the same
 *          instruction text serves both through the macros defined below.
 */
#include "atlas_asm.h"

#if !defined(ATL_GAS_x8664) && !defined(ATL_GAS_x8632)
   #error "This kernel requires x86 assembly!"
#endif
#if !defined(KB) || (KB == 0)
   #error "KB must be a compile-time constant!"
#endif
#if KB > 90
   #error "KB can at most be 90!"
#endif
/*
 * CMUL scales a count by 2 when DCPLX is defined and leaves it unchanged
 * otherwise.
 */
#ifdef DCPLX
   #define CMUL(arg_) 2*arg_
#else
   #define CMUL(arg_) arg_
#endif

/*
 * 32-bit build: map the 64-bit mnemonics and %rsp used in the code text onto
 * their 32-bit forms, and lay out the STKSIZE-byte frame allocated on entry:
 *     0..12(%esp)  saved %ebp, %ebx, %esi, %edi
 *    iCOFF = 16    incCn
 *    iAOFF = 20    incAn
 *    JOFF  = 24    NN
 *    MOFF  = 28    MM0
 *    IOFF  = 32    MM
 * BETAOFF is the offset of the caller-pushed beta argument once the frame
 * has been allocated.
 * The offsets are deliberately left unparenthesized: they are only ever used
 * as displacements of the form OFF(%esp).
 */
#ifdef ATL_GAS_x8632
   #define movq movl
   #define addq addl
   #define subq subl
   #define shrq shrl
   #define rsp  esp
   #define STKSIZE 36
   #define IOFF    STKSIZE-4
   #define MOFF    IOFF-4
   #define JOFF    MOFF-4
   #define iAOFF   JOFF-4
   #define iCOFF   iAOFF-4
   #define BETAOFF STKSIZE+40
#endif
/*
 * Integer register usage shown by these defines.
 * In the 32-bit build pAE/pBE alias pA0/pB0, and incAn, incCn, MM, NN and
 * MM0 live in the stack frame rather than in registers.
 */
#ifdef ATL_GAS_x8632
   #define pA0     %ecx
   #define lda     %ebx
   #define lda3    %ebp
   #define pAE     pA0
   #define pB0     %eax
   #define pC0     %esi
   #define pBE     pB0
   #define ldb     %edi
   #define pfA     %edx
   #define incAn   iAOFF(%esp)
   #define incCn   iCOFF(%esp)
   #define MM      IOFF(%esp)
   #define NN      JOFF(%esp)
   #define MM0     MOFF(%esp)
#else
   #define pA0     %rcx
   #define lda     %rbx
   #define lda3    %rbp
   #define pAE     %rdi
   #define pB0     %rax
   #define pC0     %rsi
   #define pBE     %rdx
   #define incAn   %r8
   #define incCn   %r9
   #define ldb     %r10
   #define MM      %r11
   #define NN      %r12
   #define pfA     %r13
   #define MM0     %r14
#endif
/*
 * void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
 *                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
 *                 const TYPE beta, TYPE *C, const int ldc)
 *
 * Location of each argument on entry, before any stack adjustment
 * (x86-64 location / 32-bit offset from %esp):
 *    M     %rdi      /  4
 *    N     %rsi      /  8
 *    K     %rdx      / 12    (never read here; KB is a compile-time constant)
 *    alpha %xmm0     / 16
 *    A     %rcx      / 24
 *    lda   %r8       / 28
 *    B     %r9       / 32
 *    ldb   8(%rsp)   / 36
 *    beta  %xmm1     / 40
 *    C     16(%rsp)  / 48
 *    ldc   24(%rsp)  / 52
 *
 * Several x86-64 argument registers are reused for other roles (pAE=%rdi,
 * pC0=%rsi, incAn=%r8, incCn=%r9), so each incoming value is copied out
 * before its register is overwritten.
 */
        .text
.global ATL_asmdecor(ATL_USERMM)
ALIGN16
ATL_asmdecor(ATL_USERMM):
/*
 *      Save callee-saved iregs.
 *      32-bit: allocate the STKSIZE-byte frame and store into its low slots.
 *      64-bit: store below %rsp without moving it.
 *      NOTE(review): the 64-bit scheme is only safe if nothing is pushed or
 *      called before these values are restored; the rest of the routine is
 *      not visible here -- confirm.
 */
#ifdef ATL_GAS_x8632
        sub     $STKSIZE, %esp
        movl    %ebp, (%esp)
        movl    %ebx, 4(%esp)
        movl    %esi, 8(%esp)
        movl    %edi, 12(%esp)
#else
        movq    %rbp, -8(%rsp)
        movq    %rbx, -16(%rsp)
        movq    %r12, -24(%rsp)
        movq    %r13, -32(%rsp)
        movq    %r14, -40(%rsp)
                                        prefetcht0      (pA0)
/*      movq    %r15, -48(%rsp) */
#endif
/*
 *      Setup input parameters.
 *      The prefetches interleaved with the setup are hints only.  Those
 *      indexed by lda are issued while lda is still an element count (it is
 *      scaled to bytes further down).
 */
#ifdef ATL_GAS_x8632
        movl    STKSIZE+4(%esp), lda3           /* M, staged through lda3 */
        movl    lda3, MM0
        movl    STKSIZE+8(%esp), lda3           /* N, staged through lda3 */
        movl    lda3, NN
        movl    STKSIZE+24(%esp), pA0
                                        prefetcht0      (pA0)
        movl    STKSIZE+28(%esp), lda
                                        prefetcht0      (pA0,lda)
        movl    STKSIZE+32(%esp), pB0
                                        prefetcht0      (pB0)
        movl    STKSIZE+36(%esp), ldb
                                        prefetcht0      (pA0,lda,2)
        movl    STKSIZE+48(%esp), pC0
                                        prefetcht0      KB*8(pA0,lda,2)
/*
 *      incCn = (ldc - M)*sizeof
 */
        movl    STKSIZE+52(%esp), lda3
        subl    MM0, lda3
   #ifdef DCPLX
        shl     $4, lda3
   #else
        shl     $3, lda3
   #endif
        movl    lda3, incCn
/*
 *      pA0 += 128; pB0 += 128
 *      (written as a subtraction because -128 fits an 8-bit immediate while
 *      +128 does not)
 */
        sub     $-128, pA0
        sub     $-128, pB0
                                        prefetcht0      -64(pB0)
/*
 *      lda *= sizeof; ldb *= sizeof; lda3 = lda*3
 */
        shl     $3, lda
                                        prefetcht0      (pB0)
        shl     $3, ldb
                                        prefetcht0      64(pB0)
        lea     (lda,lda,2), lda3
/*
 *      pfA = A + lda*M; incAn = lda*M
 *      (the -128 cancels the bias added to pA0 above)
 */
        movl    MM0, pfA
                                        prefetcht0      128(pB0)
        imull   lda, pfA
                                        prefetcht0      192(pB0)
                                        prefetcht0      256(pB0)
        movl    pfA, incAn
        lea     -128(pA0, pfA), pfA
                                        prefetcht0      320(pB0)
        shrl    $2, MM0                         /* MM0 = MM0 / mu */
#else
   #ifdef BETAX
      #define BETAOFF -48
        movlpd  %xmm1, BETAOFF(%rsp)            /* spill beta below saved regs */
   #endif
/*
 *      M, N and lda are C ints: only the low 32 bits of their registers are
 *      defined by the calling convention, so sign-extend them to 64 bits
 *      before they take part in 64-bit address arithmetic.  The ints passed
 *      on the stack (ldb, ldc) are widened the same way with movslq.
 */
        movslq  %edi, MM0
        movslq  %esi, NN
        movslq  %r8d, lda
                                        prefetcht0      (pA0,lda)
        movq    %r9, pB0
                                        prefetcht0      (pB0)
        movslq  8(%rsp), ldb
                                        prefetcht0      (pA0,lda,2)
        movq    16(%rsp), pC0
        movslq  24(%rsp), incCn
                                        prefetcht0      KB*8(pA0,lda,2)
/*
 *      incCn = (ldc-M)*sizeof
 */
        sub     MM0, incCn
#ifdef DCPLX
        shl     $4, incCn
#else
        shl     $3, incCn
#endif
/*
 *      pA0 += 128; pB0 += 128
 *      (written as a subtraction because -128 fits an 8-bit immediate while
 *      +128 does not)
 */
        sub     $-128, pA0
        sub     $-128, pB0
                                        prefetcht0      -64(pB0)
/*
 *      lda = lda*sizeof; lda3 = lda*3
 */
        shl     $3, lda
                                        prefetcht0      (pB0)
        lea     (lda,lda,2), lda3
/*
 *      ldb = ldb*sizeof
 */
        shl     $3, ldb
                                        prefetcht0      64(pB0)
/*
 *      pfA = A + lda*M; incAn = lda*M
 *      (the -128 cancels the bias added to pA0 above; the pfB computation
 *      is disabled)
 */
        movq    lda, pfA
                                        prefetcht0      128(pB0)
        imulq   MM0, pfA
/*                                      prefetcht0      192(pB0) */
/*                                      prefetcht0      256(pB0) */
        movq    pfA, incAn
/*      movq    ldb, pfB */
/*      imulq   NN, pfB */
        lea     -128(pA0, pfA), pfA
/*                                      prefetcht0      320(pB0) */
/*      lea     -128-(MB-8)*KB*8(pA0, pfA), pfA */
/*
 *      For KB > 32 only: pAE = pA0 + KB*8 - 128 and pBE = pB0 + KB*8 - 128,
 *      i.e. KB*8 bytes past the original A and B pointers (pA0 and pB0 are
 *      already biased by +128).  The lda/ldb based variants are disabled.
 */
   #if KB > 32
/*      lea     -128(pA0,lda), pAE */
/*      lea     -128(pB0,ldb), pBE */
        lea     KB*8-128(pA0), pAE
        lea     KB*8-128(pB0), pBE
   #endif
/*
 *      MM0 = MM0/mu
 */
        shr     $2, MM0
#endif
ALIGN16
NLOOP:
/*
 *      Reload the working counter MM from MM0.  The 32-bit build has to
 *      stage the copy through lda3 (both are memory operands) and then
 *      rebuilds lda3 = lda*3.
 */
#ifdef ATL_GAS_x8632
        movl    MM0, lda3
        movl    lda3, MM
        lea     (lda,lda,2), lda3
#else
        movq    MM0, MM
#endif
                                        prefetcht0      -128(pB0,ldb,2)
                                        prefetcht0      -64(pB0,ldb,2)
                                        prefetcht0      (pB0,ldb,2)
MLOOP:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -