⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 atl_dmm4x1x90_x87.c

📁 基于 BLAS、CLAPACK 的。用过的人知道是干啥的
💻 C
📖 第 1 页 / 共 5 页
字号:
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2006 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#include "atlas_asm.h"#if !defined(ATL_GAS_x8664) && !defined(ATL_GAS_x8632)   #error "This kernel requires x86 assembly!"#endif#if !defined(KB) || (KB == 0)   #error "KB must be a compile-time constant!"#endif#if KB > 90   #error "KB can at most be 90!"#endif#ifdef DCPLX   #define CMUL(arg_) 2*arg_#else   #define CMUL(arg_) arg_#endif#ifdef ATL_GAS_x8632   #define movq movl   #define addq addl   #define subq subl   #define shrq shrl   #define rsp  esp   #define STKSIZE 36   #define IOFF    STKSIZE-4   #define MOFF    IOFF-4   #define JOFF    MOFF-4   #define iAOFF    JOFF-4   #define iCOFF    iAOFF-4   #define BETAOFF  STKSIZE+40#endif/* *Integer register usage shown by these defines */#ifdef ATL_GAS_x8632   #define pA0     %ecx   #define lda     %ebx   #define lda3    %ebp   #define pAE     pA0   #define pB0     %eax   #define pC0     %esi   #define pBE     pB0   #define ldb     %edi   #define pfA     %edx   #define incAn   iAOFF(%esp)   #define incCn   iCOFF(%esp)   #define MM      IOFF(%esp)   #define NN      JOFF(%esp)   #define MM0     MOFF(%esp)#else   #define pA0     %rcx   #define lda     %rbx   #define lda3    %rbp   #define pAE     %rdi   #define pB0     %rax   #define pC0     %rsi   #define pBE     %rdx   #define incAn   %r8   #define incCn   %r9   #define ldb     %r10   #define MM      %r11   #define NN      %r12   #define pfA     %r13   #define MM0     %r14#endif/*                      %rdi/4       %rsi/8       %rdx/12          %xmm0/16 void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,                       %rcx/24         %r8/28         %r9/32           8/36                 const TYPE *A, const int lda, const TYPE *B, const int ldb,                        %xmm1/40    16/48          24/52                 const TYPE beta, TYPE *C, const int ldc)*/        .text.global ATL_asmdecor(ATL_USERMM)ALIGN16ATL_asmdecor(ATL_USERMM):/* *      Save callee-saved iregs */#ifdef ATL_GAS_x8632        sub     $STKSIZE, %esp        movl    
%ebp, (%esp)        movl    %ebx, 4(%esp)        movl    %esi, 8(%esp)        movl    %edi, 12(%esp)#else        movq    %rbp, -8(%rsp)        movq    %rbx, -16(%rsp)        movq    %r12, -24(%rsp)        movq    %r13, -32(%rsp)        movq    %r14, -40(%rsp)                                        prefetcht0 (pA0)/*        movq    %r15, -48(%rsp) */#endif/* *      Setup input parameters */#ifdef ATL_GAS_x8632        movl    STKSIZE+4(%esp), lda3        movl    lda3, MM0        movl    STKSIZE+8(%esp), lda3        movl    lda3, NN        movl    STKSIZE+24(%esp), pA0                                prefetcht0 (pA0)        movl    STKSIZE+28(%esp), lda                                prefetcht0 (pA0,lda)        movl    STKSIZE+32(%esp), pB0                                prefetcht0 (pB0)        movl    STKSIZE+36(%esp), ldb                                prefetcht0 (pA0,lda,2)        movl    STKSIZE+48(%esp), pC0                                prefetcht0 KB*8(pA0,lda,2)/* *      incCn = (ldc - M)*sizeof */        movl    STKSIZE+52(%esp), lda3        subl    MM0, lda3   #ifdef DCPLX        shl     $4, lda3   #else        shl     $3, lda3   #endif        movl    lda3, incCn/* *      pA0 += 128; pB0 += 128 */        sub     $-128, pA0        sub     $-128, pB0                                prefetcht0 -64(pB0)/* *      lda *= sizeof; ldb *= sizeof; lda3 = lda*3 */        shl     $3, lda                                prefetcht0 (pB0)        shl     $3, ldb                                prefetcht0 64(pB0)        lea     (lda,lda,2), lda3/* *      pfA = A + lda*M; incAn = lda*M */        movl    MM0, pfA                                prefetcht0 128(pB0)        imull   lda, pfA                                prefetcht0 192(pB0)                                prefetcht0 256(pB0)        movl    pfA, incAn        lea     -128(pA0, pfA), pfA                                prefetcht0 320(pB0)        shrl    $2, MM0            /* MM0 = MM0 / mu */#else   #ifdef BETAX      
#define BETAOFF -48        movlpd  %xmm1, BETAOFF(%rsp)   #endif        movq    %rdi, MM0        movq    %rsi, NN        movq    %r8, lda                                        prefetcht0      (pA0,lda)        movq    %r9, pB0                                        prefetcht0      (pB0)        movslq  8(%rsp), ldb                                        prefetcht0      (pA0,lda,2)        movq    16(%rsp), pC0        movslq  24(%rsp), incCn                                        prefetcht0      KB*8(pA0,lda,2)/* *      incCn = (ldc-M)*sizeof */        sub     MM0, incCn#ifdef DCPLX        shl     $4, incCn#else        shl     $3, incCn#endif/* *      pA0 += 128; pB0 += 128 */        sub     $-128, pA0        sub     $-128, pB0                                        prefetcht0      -64(pB0)/* *      lda = lda*sizeof;  lda3 = lda*3 */        shl     $3, lda                                                prefetcht0      (pB0)        lea     (lda,lda,2), lda3/* *      ldb = ldb*sizeof */        shl     $3, ldb                                                prefetcht0      64(pB0)/* *      pfA = A + lda*M ; incAn = lda*M, pfB = B + ldb*N */        movq    lda, pfA                                                prefetcht0      128(pB0)        imulq   MM0, pfA/*                                                prefetcht0      192(pB0) *//*                                                prefetcht0      256(pB0) */        movq    pfA, incAn/*        movq    ldb, pfB *//*        imulq   NN, pfB */        lea     -128(pA0, pfA), pfA/*                                                prefetcht0      320(pB0) *//*       lea     -128-(MB-8)*KB*8(pA0, pfA), pfA *//* *      pAE (pointer to end of column of A) = pA + lda */   #if KB > 32/*        lea     -128(pA0,lda), pAE *//*        lea     -128(pB0,ldb), pBE */        lea     KB*8-128(pA0), pAE        lea     KB*8-128(pB0), pBE   #endif/* *      MM0 = MM0/mu */        shr     $2, MM0#endifALIGN16NLOOP:#ifdef ATL_GAS_x8632        movl  
  MM0, lda3        movl    lda3, MM        lea     (lda,lda,2), lda3#else        movq    MM0, MM#endif        prefetcht0      -128(pB0,ldb,2)        prefetcht0      -64(pB0,ldb,2)        prefetcht0      (pB0,ldb,2)MLOOP:

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -