atl_dmm6x1x60_sse2.c
来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,489 行 · 第 1/3 页
C
1,489 行
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2002 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */
#include "atlas_asm.h"

/*
 * Build-time guards: refuse to assemble unless the gas x86-32 dialect is
 * selected and NB is the compile-time constant 60 (which is a multiple of 6).
 */
#ifndef ATL_GAS_x8632
   #error "This kernel requires gas x86-32 assembler!"
#endif
#if !defined(NB) || (NB == 0)
   #error "NB must be a compile-time constant!"
#endif
#if (NB != 60)
   #error "NB must be 60!"
#endif
#if (NB/6)*6 != NB
   #error "NB must be multiple of 6!"
#endif

/*
 * Integer register usage shown by these defines.
 *   pC, pA, pB : loaded below from the C, A and B arguments.
 *   incCn      : loaded from the 52(%eax) argument slot by the code that
 *                follows this setup section.
 *   stM        : set below to pA + NBNBso - NB6so.
 *   stN        : set to pB + NBNBso by the instructions right after this
 *                setup section.
 *   pfA        : cleared to zero below.
 */
#define pC      %esi
#define pA      %ecx
#define pB      %edi
#define incCn   %eax
#define stM     %edx
#define stN     %ebx
#define pfA     %ebp

/*
 * SSE register usage.
 *   rC0-rC5 : the six registers that the unrolled multiply-add sequence
 *             accumulates into (destinations of addpd).
 *   rA0     : scratch register that receives each load from A and the
 *             product with B.
 *   rA1     : defined here; not referenced in the visible part of the file.
 */
#define rC0     %xmm0
#define rC1     %xmm1
#define rC2     %xmm2
#define rC3     %xmm3
#define rC4     %xmm4
#define rC5     %xmm5
#define rA0     %xmm6
#define rA1     %xmm7

/*
 * Byte offsets expressed in terms of NB.
 *   NBso   = NB*8     (presumably 8 is the element size in bytes; the kernel
 *                      uses packed-double instructions -- confirm)
 *   NBNBso = NB*NB*8
 *   NBkso  = k * NBso for k = 2..11, written as sums of smaller offsets.
 */
#define NBso    (NB*8)
#define NBNBso  (NB*NB*8)
#define NB2so   (NBso+NBso)
#define NB3so   (NBso+NBso+NBso)
#define NB4so   (NBso+NBso+NBso+NBso)
#define NB5so   (NBso+NBso+NBso+NBso+NBso)
#define NB6so   (NBso+NBso+NBso+NBso+NBso+NBso)
#define NB7so   (NB6so+NBso)
#define NB8so   (NB6so+NB2so)
#define NB9so   (NB6so+NB3so)
#define NB10so  (NB6so+NB4so)
#define NB11so  (NB6so+NB5so)

/*
 * Prefetch defines: each macro expands to one prefetch instruction applied
 * to the given memory operand (prefetcht1, prefetcht0 and prefetchw
 * respectively).
 */
#define pref2(mem) prefetcht1 mem
#define prefB(mem) prefetcht0 mem
#define prefC(mem) prefetchw mem

/*
 *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
 *                const TYPE *A, const int lda, const TYPE *B, const int ldb,
 *                const TYPE beta, TYPE *C, const int ldc)
 *
 * NOTE(review): every argument access below is an offset from the entry
 * value of %esp, which is copied into %eax first.  The offsets used
 * (24 = A, 32 = B, 40 = beta, 48 = C, 52 = ldc) line up with this prototype
 * if all arguments are passed on the stack and TYPE occupies 8 bytes --
 * confirm against the calling convention in use.
 */
        .text
.global ATL_asmdecor(ATL_USERMM)
ATL_asmdecor(ATL_USERMM):
/*
 *      Save callee-saved iregs; Save old stack pointer in eax,
 *      so we can adjust for BETA alignment
 */
        movl    %esp, %eax
   #ifdef BETAX
        /*
         * Reserve 36 bytes, then clear the low four bits of %esp so that
         * (%esp) is a multiple of 16; the movapd store below uses (%esp)
         * as its memory operand.
         */
        subl    $36, %esp
        shr     $4, %esp
        shl     $4, %esp
        /* Save the four integer registers and the entry %esp (held in %eax). */
        movl    %ebp, 32(%esp)
        movl    %ebx, 28(%esp)
        movl    %esi, 24(%esp)
        movl    %edi, 20(%esp)
        movl    %eax, 16(%esp)
        /*
         * Load the 8-byte value at 40(%eax) (beta, per the prototype),
         * duplicate it into both halves of rC0 and park the pair at (%esp).
         */
        movlpd  40(%eax), rC0
        unpcklpd        rC0, rC0
        movapd  rC0, (%esp)
        #define BETAOFF 0
   #else
        /* No stored beta pair needed: reserve 16 bytes and save the four registers. */
        subl    $16, %esp
        movl    %ebp, 12(%esp)
        movl    %ebx, 8(%esp)
        movl    %esi, 4(%esp)
        movl    %edi, (%esp)
   #endif
/*
 *      Initialize pA = A; pB = B; pC = C;
 *      The first two prefC uses touch (pC) and 64(pC) right after pC is loaded.
 */
        movl    48(%eax), pC
        prefC((pC))
        prefC(64(pC))
        movl    24(%eax), pA
        movl    32(%eax), pB
/*
 *      stM = pA + NBNB-6*NB;  stN = pB + NBNB (set by the instructions that
 *      follow this section).
 *      pfA is cleared to zero by the xor below; the earlier wording of this
 *      comment ("pfA = pA+NBNB") does not match the instruction.
 */
        movl    $NBNBso-NB6so, stM
        addl    pA, stM
        xor     pfA, pfA
movl $NBNBso, stN addl pB, stN/* * Set incCn = (ldc - NB + 6)*sizeof */ movl 52(%eax), incCn subl $MB-6, incCn #ifdef DCPLX shl $4, incCn #else shl $3, incCn #endifNLOOP:MLOOP:#ifdef BETA0 xorpd rC0, rC0 xorpd rC1, rC1 xorpd rC2, rC2 xorpd rC3, rC3 xorpd rC4, rC4 xorpd rC5, rC5#else #ifdef DCPLX movsd (pC), rC0 movsd 16(pC), rC1 movsd 32(pC), rC2 movsd 48(pC), rC3 movsd 64(pC), rC4 movsd 80(pC), rC5 #else movsd (pC), rC0 movsd 8(pC), rC1 movsd 16(pC), rC2 movsd 24(pC), rC3 movsd 32(pC), rC4 movsd 40(pC), rC5 #endif #ifdef BETAX movlpd (%esp), rA0 mulsd rA0, rC0 mulsd rA0, rC1 mulsd rA0, rC2 mulsd rA0, rC3 mulsd rA0, rC4 mulsd rA0, rC5 #endif#endif movapd 0(pA), rA0 mulpd 0(pB), rA0 addpd rA0, rC0 movapd 0+NBso(pA), rA0 mulpd 0(pB), rA0 addpd rA0, rC1 movapd 0+NB2so(pA), rA0 mulpd 0(pB), rA0 addpd rA0, rC2 movapd 0+NB3so(pA), rA0 mulpd 0(pB), rA0 addpd rA0, rC3 movapd 0+NB4so(pA), rA0 mulpd 0(pB), rA0 addpd rA0, rC4 movapd 0+NB5so(pA), rA0 mulpd 0(pB), rA0 addpd rA0, rC5 movapd 16(pA), rA0 mulpd 16(pB), rA0 addpd rA0, rC0 movapd 16+NBso(pA), rA0 mulpd 16(pB), rA0 addpd rA0, rC1 movapd 16+NB2so(pA), rA0 mulpd 16(pB), rA0 addpd rA0, rC2 movapd 16+NB3so(pA), rA0 mulpd 16(pB), rA0 addpd rA0, rC3 movapd 16+NB4so(pA), rA0 mulpd 16(pB), rA0 addpd rA0, rC4 movapd 16+NB5so(pA), rA0 mulpd 16(pB), rA0 addpd rA0, rC5 movapd 32(pA), rA0 mulpd 32(pB), rA0 addpd rA0, rC0 movapd 32+NBso(pA), rA0 mulpd 32(pB), rA0 addpd rA0, rC1 movapd 32+NB2so(pA), rA0 mulpd 32(pB), rA0 addpd rA0, rC2 movapd 32+NB3so(pA), rA0 mulpd 32(pB), rA0 addpd rA0, rC3 movapd 32+NB4so(pA), rA0 mulpd 32(pB), rA0 addpd rA0, rC4 movapd 32+NB5so(pA), rA0 mulpd 32(pB), rA0 addpd rA0, rC5 movapd 48(pA), rA0 mulpd 48(pB), rA0 addpd rA0, rC0 movapd 48+NBso(pA), rA0 mulpd 48(pB), rA0 addpd rA0, rC1 movapd 48+NB2so(pA), rA0 mulpd 48(pB), rA0 addpd rA0, rC2 movapd 48+NB3so(pA), rA0 mulpd 48(pB), rA0 addpd rA0, rC3 movapd 48+NB4so(pA), rA0 mulpd 48(pB), rA0 addpd rA0, rC4 movapd 48+NB5so(pA), rA0 mulpd 48(pB), rA0 addpd 
rA0, rC5 movapd 64(pA), rA0 mulpd 64(pB), rA0 addpd rA0, rC0 movapd 64+NBso(pA), rA0 mulpd 64(pB), rA0 addpd rA0, rC1 movapd 64+NB2so(pA), rA0 mulpd 64(pB), rA0 addpd rA0, rC2 movapd 64+NB3so(pA), rA0 mulpd 64(pB), rA0 addpd rA0, rC3 movapd 64+NB4so(pA), rA0 mulpd 64(pB), rA0 addpd rA0, rC4 movapd 64+NB5so(pA), rA0 mulpd 64(pB), rA0 addpd rA0, rC5 movapd 80(pA), rA0 mulpd 80(pB), rA0 addpd rA0, rC0 movapd 80+NBso(pA), rA0 mulpd 80(pB), rA0 addpd rA0, rC1 movapd 80+NB2so(pA), rA0 mulpd 80(pB), rA0 addpd rA0, rC2 movapd 80+NB3so(pA), rA0 mulpd 80(pB), rA0 addpd rA0, rC3 movapd 80+NB4so(pA), rA0 mulpd 80(pB), rA0 addpd rA0, rC4 movapd 80+NB5so(pA), rA0 mulpd 80(pB), rA0 addpd rA0, rC5 movapd 96(pA), rA0 mulpd 96(pB), rA0 addpd rA0, rC0 movapd 96+NBso(pA), rA0 mulpd 96(pB), rA0 addpd rA0, rC1 movapd 96+NB2so(pA), rA0 mulpd 96(pB), rA0 addpd rA0, rC2 movapd 96+NB3so(pA), rA0 mulpd 96(pB), rA0 addpd rA0, rC3 movapd 96+NB4so(pA), rA0 mulpd 96(pB), rA0 addpd rA0, rC4 movapd 96+NB5so(pA), rA0 mulpd 96(pB), rA0 addpd rA0, rC5 movapd 112(pA), rA0 mulpd 112(pB), rA0 addpd rA0, rC0 movapd 112+NBso(pA), rA0 mulpd 112(pB), rA0 addpd rA0, rC1 movapd 112+NB2so(pA), rA0 mulpd 112(pB), rA0 addpd rA0, rC2 movapd 112+NB3so(pA), rA0 mulpd 112(pB), rA0 addpd rA0, rC3 movapd 112+NB4so(pA), rA0 mulpd 112(pB), rA0 addpd rA0, rC4 movapd 112+NB5so(pA), rA0 mulpd 112(pB), rA0 addpd rA0, rC5 movapd 128(pA), rA0 mulpd 128(pB), rA0 addpd rA0, rC0 movapd 128+NBso(pA), rA0 mulpd 128(pB), rA0 addpd rA0, rC1 movapd 128+NB2so(pA), rA0 mulpd 128(pB), rA0 addpd rA0, rC2 movapd 128+NB3so(pA), rA0 mulpd 128(pB), rA0 addpd rA0, rC3 movapd 128+NB4so(pA), rA0 mulpd 128(pB), rA0 addpd rA0, rC4 movapd 128+NB5so(pA), rA0 mulpd 128(pB), rA0 addpd rA0, rC5 movapd 144(pA), rA0 mulpd 144(pB), rA0 addpd rA0, rC0 movapd 144+NBso(pA), rA0 mulpd 144(pB), rA0 addpd rA0, rC1 movapd 144+NB2so(pA), rA0 mulpd 144(pB), rA0 addpd rA0, rC2 movapd 144+NB3so(pA), rA0 mulpd 144(pB), rA0 addpd rA0, rC3 movapd 144+NB4so(pA), rA0 
mulpd 144(pB), rA0 addpd rA0, rC4 movapd 144+NB5so(pA), rA0 mulpd 144(pB), rA0 addpd rA0, rC5 movapd 160(pA), rA0 mulpd 160(pB), rA0 addpd rA0, rC0 movapd 160+NBso(pA), rA0 mulpd 160(pB), rA0 addpd rA0, rC1 movapd 160+NB2so(pA), rA0 mulpd 160(pB), rA0 addpd rA0, rC2 movapd 160+NB3so(pA), rA0 mulpd 160(pB), rA0 addpd rA0, rC3 movapd 160+NB4so(pA), rA0 mulpd 160(pB), rA0 addpd rA0, rC4 movapd 160+NB5so(pA), rA0 mulpd 160(pB), rA0 addpd rA0, rC5 movapd 176(pA), rA0 mulpd 176(pB), rA0 addpd rA0, rC0 movapd 176+NBso(pA), rA0 mulpd 176(pB), rA0 addpd rA0, rC1 movapd 176+NB2so(pA), rA0 mulpd 176(pB), rA0 addpd rA0, rC2 movapd 176+NB3so(pA), rA0 mulpd 176(pB), rA0 addpd rA0, rC3 movapd 176+NB4so(pA), rA0 mulpd 176(pB), rA0 addpd rA0, rC4 movapd 176+NB5so(pA), rA0 mulpd 176(pB), rA0 addpd rA0, rC5 movapd 192(pA), rA0 mulpd 192(pB), rA0 addpd rA0, rC0 movapd 192+NBso(pA), rA0 mulpd 192(pB), rA0 addpd rA0, rC1 movapd 192+NB2so(pA), rA0 mulpd 192(pB), rA0 addpd rA0, rC2 movapd 192+NB3so(pA), rA0 mulpd 192(pB), rA0 addpd rA0, rC3 movapd 192+NB4so(pA), rA0 mulpd 192(pB), rA0 addpd rA0, rC4 movapd 192+NB5so(pA), rA0 mulpd 192(pB), rA0 addpd rA0, rC5 movapd 208(pA), rA0 mulpd 208(pB), rA0 addpd rA0, rC0 movapd 208+NBso(pA), rA0 mulpd 208(pB), rA0 addpd rA0, rC1 movapd 208+NB2so(pA), rA0 mulpd 208(pB), rA0 addpd rA0, rC2 movapd 208+NB3so(pA), rA0 mulpd 208(pB), rA0 addpd rA0, rC3 movapd 208+NB4so(pA), rA0 mulpd 208(pB), rA0 addpd rA0, rC4 movapd 208+NB5so(pA), rA0 mulpd 208(pB), rA0 addpd rA0, rC5 movapd 224(pA), rA0 mulpd 224(pB), rA0 addpd rA0, rC0 movapd 224+NBso(pA), rA0 mulpd 224(pB), rA0 addpd rA0, rC1 movapd 224+NB2so(pA), rA0 mulpd 224(pB), rA0 addpd rA0, rC2 movapd 224+NB3so(pA), rA0 mulpd 224(pB), rA0 addpd rA0, rC3 movapd 224+NB4so(pA), rA0 mulpd 224(pB), rA0 addpd rA0, rC4 movapd 224+NB5so(pA), rA0 mulpd 224(pB), rA0 addpd rA0, rC5 movapd 240(pA), rA0 mulpd 240(pB), rA0 addpd rA0, rC0 movapd 240+NBso(pA), rA0 mulpd 240(pB), rA0 addpd rA0, rC1 movapd 240+NB2so(pA), rA0 
mulpd 240(pB), rA0 addpd rA0, rC2 movapd 240+NB3so(pA), rA0 mulpd 240(pB), rA0 addpd rA0, rC3 movapd 240+NB4so(pA), rA0 mulpd 240(pB), rA0 addpd rA0, rC4 movapd 240+NB5so(pA), rA0 mulpd 240(pB), rA0 addpd rA0, rC5 movapd 256(pA), rA0 mulpd 256(pB), rA0 addpd rA0, rC0 movapd 256+NBso(pA), rA0 mulpd 256(pB), rA0 addpd rA0, rC1 movapd 256+NB2so(pA), rA0 mulpd 256(pB), rA0 addpd rA0, rC2 movapd 256+NB3so(pA), rA0 mulpd 256(pB), rA0 addpd rA0, rC3 movapd 256+NB4so(pA), rA0 mulpd 256(pB), rA0 addpd rA0, rC4 movapd 256+NB5so(pA), rA0
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?