atl_dmm_julian_gas_30.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,904 行 · 第 1/5 页
C
1,904 行
/*
 * Automatically Tuned Linear Algebra Software v3.8.0
 * (C) Copyright 2001 Julian Ruhe
 *
 * Code contributers : Julian Ruhe, Peter Soendergaard
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions, and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The name of the ATLAS group or the names of its contributers may
 * not be used to endorse or promote products derived from this
 * software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */
/****************************************************************************/
/* This routine originally written in NASM by Julian Ruhe. See */
/* ATLAS/tune/blas/gemm/CASES/objs/ATL_dJIK30x30x30TN30x30x0_a1_b1.asm */
/* for the original. Translated into gnu inline assembly (to simplify the */
/* ATLAS build process) and extended to all precisions by */
/* Peter Soendergaard. Edited by Clint Whaley so it runs on any x87 proc */
/* (got rid of prefetch and FEMMS for certain archs) */
/****************************************************************************/
#include "atlas_misc.h"

#ifndef ATL_GAS_x8632
   #error "This kernel requires gas x86-32 assembler!"
#endif

/*
 * Select the prefetch mnemonics for the target.  NOFEMMS is defined on
 * every branch, so the "#ifndef NOFEMMS" femms instruction in the kernel
 * is never emitted.
 */
#if defined(ATL_3DNow)
   #define PREFETCH prefetch
   #define PREFETCHW prefetchw
   #define USE_PREFETCH
   #define NOFEMMS
#elif defined(ATL_SSE1) || defined (ATL_SSE2)
   #define PREFETCH prefetchnta
   #define PREFETCHW prefetchnta
   #define USE_PREFETCH
   #define NOFEMMS
#else
   #define NOFEMMS
#endif

#if !defined(MB) || !defined(KB) || MB != 30 || KB != 30
   #error "MB and KB must be 30!!"
#endif

#define ASM __asm__ __volatile__

/* Since the code only works for NB==30 this is hardcoded. */
#define PNB "30"

#if defined(DREAL) || defined(DCPLX)
/* Element size in bytes, as an assembler-expression string. */
#define DS "8"
/* These macros define memory-register operations for doubles. */
#define ADD "faddl"
#define MUL "fmull"
#define LOAD "fldl"
#define STO "fstpl"
/*
 * These macros indicate where the parameters are placed on the stack.
 * They are %ebp-relative; the kernel sets %ebp up itself with
 * "push %ebp; mov %esp,%ebp".  The values presumably assume 4-byte
 * int/pointer and 8-byte double arguments pushed in declaration order
 * -- confirm against the target ABI.
 */
#define Nstack "0xc(%%ebp)"
#define Astack "0x1c(%%ebp)"
#define Bstack "0x24(%%ebp)"
#define BETAstack "0x2c(%%ebp)"
#define Cstack "0x34(%%ebp)"
#define LDCstack "0x38(%%ebp)"
#ifdef BETAX
/* This variable is placed here to make it go into the data segment
   instead of the stack */
static double locbeta;
#endif
/* Define DSC as the length of a complex number (stride in C). */
#ifdef DCPLX
#define DSC "16"
#else
#define DSC DS
#endif
#endif

#if defined(SREAL) || defined(SCPLX)
/* Element size in bytes, as an assembler-expression string. */
#define DS "4"
/* These macros define memory-register operations for floats. */
#define ADD "fadds"
#define MUL "fmuls"
#define LOAD "flds"
#define STO "fstps"
/*
 * These macros indicate where the parameters are placed on the stack
 * (%ebp-relative, same scheme as the double-precision case but with
 * 4-byte float scalars).
 */
#define Nstack "12(%%ebp)"
#define Astack "24(%%ebp)"
#define Bstack "32(%%ebp)"
#define BETAstack "40(%%ebp)"
#define Cstack "44(%%ebp)"
#define LDCstack "48(%%ebp)"
#ifdef BETAX
static float locbeta;
#endif
/* Define DSC as the length of a complex number (stride in C). */
#ifdef SCPLX
#define DSC "8"
#else
#define DSC DS
#endif
#endif

/*
 * CPTRn is the displacement string used with (%ecx) to address the n-th
 * element of the current column of C.
 *
 * For DCPLX the displacements step by 2*DS and repeat in groups
 * (CPTR0..11, CPTR12..23, CPTR24..29).  NOTE(review): the code that
 * re-bases %ecx between groups is not visible in this part of the file.
 */
#ifdef DCPLX
#define CPTR0 " -15*" DS
#define CPTR1 " -13*" DS
#define CPTR2 " -11*" DS
#define CPTR3 " -9*" DS
#define CPTR4 " -7*" DS
#define CPTR5 " -5*" DS
#define CPTR6 " -3*" DS
#define CPTR7 " -1*" DS
#define CPTR8 " 1*" DS
#define CPTR9 " 3*" DS
#define CPTR10 " 5*" DS
#define CPTR11 " 7*" DS
#define CPTR12 " -15*" DS
#define CPTR13 " -13*" DS
#define CPTR14 " -11*" DS
#define CPTR15 " -9*" DS
#define CPTR16 " -7*" DS
#define CPTR17 " -5*" DS
#define CPTR18 " -3*" DS
#define CPTR19 " -1*" DS
#define CPTR20 " 1*" DS
#define CPTR21 " 3*" DS
#define CPTR22 " 5*" DS
#define CPTR23 " 7*" DS
#define CPTR24 " -15*" DS
#define CPTR25 " -13*" DS
#define CPTR26 " -11*" DS
#define CPTR27 " -9*" DS
#define CPTR28 " -7*" DS
#define CPTR29 " -5*" DS
#else
/* Displacements -15*DSC .. 14*DSC around a C pointer biased by 15 elements. */
#define CPTR0 " -15*" DSC
#define CPTR1 " -14*" DSC
#define CPTR2 " -13*" DSC
#define CPTR3 " -12*" DSC
#define CPTR4 " -11*" DSC
#define CPTR5 " -10*" DSC
#define CPTR6 " -9*" DSC
#define CPTR7 " -8*" DSC
#define CPTR8 " -7*" DSC
#define CPTR9 " -6*" DSC
#define CPTR10 " -5*" DSC
#define CPTR11 " -4*" DSC
#define CPTR12 " -3*" DSC
#define CPTR13 " -2*" DSC
#define CPTR14 " -1*" DSC
#define CPTR15 " "
#define CPTR16 " " DSC
#define CPTR17 " 2*" DSC
#define CPTR18 " 3*" DSC
#define CPTR19 " 4*" DSC
#define CPTR20 " 5*" DSC
#define CPTR21 " 6*" DSC
#define CPTR22 " 7*" DSC
#define CPTR23 " 8*" DSC
#define CPTR24 " 9*" DSC
#define CPTR25 " 10*" DSC
#define CPTR26 " 11*" DSC
#define CPTR27 " 12*" DSC
#define CPTR28 " 13*" DSC
#define CPTR29 " 14*" DSC
#endif

/* Size in bytes of one 30-element column, as an assembler expression. */
#define COLSIZE PNB "*" DS

/*
 * Kernel entry point.  The signature is selected by precision; the body
 * is written entirely in inline assembly and reads its arguments through
 * the *stack macros above rather than through the C parameter names.
 */
#if defined(DREAL) || defined(DCPLX)
void ATL_USERMM(const int M, const int N, const int K, const double alpha,
                const double *A, const int lda, const double *B,
                const int ldb, const double beta, double *C, const int ldc)
#endif
#if defined(SREAL) || defined(SCPLX)
void ATL_USERMM(const int M, const int N, const int K, const float alpha,
                const float *A, const int lda, const float *B,
                const int ldb, const float beta, float *C, const int ldc)
#endif
{
   ASM
("push %%ebp"::); ASM ("mov %%esp,%%ebp"::); ASM ("push %%ebx"::); ASM ("push %%esi"::); ASM ("push %%edi"::); #ifndef NOFEMMS ASM ("femms "::); #endif /* Don't quite know why this computation is usefull, but it is used for the prefetching. It does not give the correct result for N cleanup. */ ASM ("mov $0x0,%%eax"::); ASM ("push %%eax"::); ASM ("mov " Astack ",%%eax"::); ASM ("add $" DS "*" PNB "*" PNB ",%%eax"::); ASM ("mov " Bstack ",%%ebx"::); ASM ("sub %%ebx,%%eax"::); ASM ("push %%eax"::); /* Load ldc onto stack and scale it. The scale factor must be 8 or lower because of limitiations in the intel adressing modes, so therefore it is multiplied by two for the complex case, as just scaling with 16 for DCPLX wouldn't work. Funny, eh! */ ASM ("mov " LDCstack ",%%eax"::); ASM ("lea 0x0(,%%eax," DS "),%%eax"::);#if defined(DCPLX) || defined(SCPLX) ASM ("shl $1,%%eax"::);#endif ASM ("push %%eax"::); ASM ("mov " Nstack ", %%eax"::); ASM ("push %%eax"::); /* Load &A, &B and &C into registers. */ ASM ("mov " Astack ",%%eax"::); ASM ("mov " Bstack ",%%ebx"::); ASM ("mov " Cstack ",%%ecx"::); /* Add offsets for small adresses. 
*/#ifdef DCPLX ASM ("add $15*" DS ",%%ecx"::);#else ASM ("add $15*" DSC ",%%ecx"::);#endif ASM ("add $15*" DS ",%%ebx"::);#ifdef BETA0 ASM ("add $5*" COLSIZE "+15*" DS ",%%eax"::); ASM ("mov $6*" COLSIZE ",%%edx"::);#elif defined(BETA1) ASM ("add $5*" COLSIZE ",%%eax"::); ASM ("mov $6*" COLSIZE "-15*" DS ",%%edx"::);#else ASM ("add $5*" COLSIZE "+15*" DS ",%%eax"::); ASM ("mov $6*" COLSIZE ",%%edx"::); ASM (LOAD " " BETAstack::); /* Initialise locbeta with beta */ ASM (STO " %0":"=m" (((locbeta))):);#endif ASM ("push %%eax"::); ASM ("push %%ebp"::); ASM ("mov $-1*" COLSIZE ",%%edi"::); ASM ("mov $-3*" COLSIZE ",%%esi"::); ASM ("mov $-5*" COLSIZE ",%%ebp"::); ASM (".align 16"::); ASM ("loopj_:"::);#ifdef BETA0 ASM (LOAD " -15*" DS "(%%eax,%%ebp,1)"::); ASM (MUL " -15*" DS "(%%ebx)"::); ASM ("nop "::); ASM (LOAD " -15*" DS "(%%eax,%%edi,4)"::); ASM (MUL " -15*" DS "(%%ebx)"::); ASM ("nop "::); ASM (LOAD " -15*" DS "(%%eax,%%esi,1)"::); ASM (MUL " -15*" DS "(%%ebx)"::); ASM ("nop "::); ASM (LOAD " -14*" DS "(%%ebx)"::); ASM (LOAD " -15*" DS "(%%eax,%%edi,1)"::); ASM ("nop "::); ASM (MUL " -15*" DS "(%%ebx)"::); ASM (LOAD " -15*" DS "(%%eax)"::); ASM ("mov %%edx,%%edx"::); ASM (MUL " -15*" DS "(%%ebx)"::); ASM (LOAD " -15*" DS "(%%eax,%%edi,2)"::); ASM ("nop "::); ASM ("repz"::); ASM (MUL " -15*" DS "(%%ebx)"::); ASM ("fxch %%st(3)"::); ASM ("mov %%edx,%%edx"::);#elif defined(BETA1) ASM (LOAD " -15*" DS "(%%ebx)"::); ASM (LOAD " (%%eax,%%edi,4)"::); ASM ("fmul %%st(1),%%st"::); ASM (ADD CPTR1 "(%%ecx)"::); ASM (LOAD " (%%eax,%%esi,1)"::); ASM ("fmul %%st(2),%%st"::); ASM (ADD CPTR2 "(%%ecx)"::); ASM (LOAD " (%%eax,%%ebp,1)"::); ASM ("fmul %%st(3),%%st"::); ASM (ADD CPTR0 "(%%ecx)"::); ASM ("fxch %%st(3)"::); ASM (LOAD " (%%eax,%%edi,1)"::); ASM ("repz"::); ASM ("fmul %%st(1),%%st"::); ASM (ADD CPTR4 "(%%ecx)"::); ASM (LOAD " (%%eax)"::); ASM ("fmul %%st(2),%%st"::); ASM (ADD CPTR5 "(%%ecx)"::); ASM (LOAD " (%%eax,%%edi,2)"::); ASM ("repz"::); ASM ("fmulp 
%%st,%%st(3)"::); ASM (LOAD CPTR3 "(%%ecx)"::); ASM ("faddp %%st,%%st(3)"::); ASM (LOAD " -14*" DS "(%%ebx)"::); ASM ("add $15*" DS ",%%eax"::); ASM ("mov %%edx,%%edx"::);#else ASM (LOAD CPTR0 "(%%ecx)"::); ASM (LOAD CPTR1 "(%%ecx)"::); ASM ("mov %%edx,%%edx"::); ASM (LOAD " %0"::"m" (((locbeta)))); ASM ("fmul %%st,%%st(2)"::); ASM ("fmul %%st,%%st(1)"::); ASM (LOAD " -15*" DS "(%%ebx)"::); ASM (LOAD CPTR4 "(%%ecx)"::); ASM ("repz"::); ASM ("fmul %%st(2),%%st"::); ASM (LOAD CPTR5 "(%%ecx)"::); ASM ("fmul %%st(3),%%st"::); ASM (LOAD CPTR3 "(%%ecx)"::); ASM ("fmul %%st(4),%%st"::); ASM (LOAD CPTR2 "(%%ecx)"::); ASM ("repz"::); ASM ("fmulp %%st,%%st(5)"::); ASM ("repz"::); ASM ("fxch %%st(3)"::); ASM ("mov %%edx,%%edx"::); ASM (LOAD " -15*" DS "(%%eax,%%ebp,1)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(7)"::); ASM (LOAD " -15*" DS "(%%eax,%%edi,4)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(6)"::); ASM (LOAD " -15*" DS "(%%eax,%%esi,1)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(5)"::); ASM (LOAD " -15*" DS "(%%eax,%%edi,2)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(4)"::); ASM (LOAD " -15*" DS "(%%eax,%%edi,1)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(3)"::); ASM (MUL " -15*" DS "(%%eax)"::); ASM ("faddp %%st,%%st(1)"::); ASM (LOAD " -14*" DS "(%%ebx)"::);#endif ASM (LOAD " -14*" DS "(%%eax,%%ebp,1)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(7)"::); ASM (LOAD " -14*" DS "(%%eax,%%edi,4)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(6)"::); ASM (LOAD " -14*" DS "(%%eax,%%esi,1)"::); ASM ("fmul %%st(1),%%st"::); ASM ("faddp %%st,%%st(5)"::); ASM (LOAD " -14*" DS "(%%eax,%%edi,2)"::); ASM ("fmul %%st(1),%%st"::);
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?