atl_dmm6x1x60_sse2.c

来自「基于Blas CLapck的.用过的人知道是干啥的」· C语言 代码 · 共 1,489 行 · 第 1/3 页

C
1,489
字号
/* *             Automatically Tuned Linear Algebra Software v3.8.0 *                    (C) Copyright 2002 R. Clint Whaley * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: *   1. Redistributions of source code must retain the above copyright *      notice, this list of conditions and the following disclaimer. *   2. Redistributions in binary form must reproduce the above copyright *      notice, this list of conditions, and the following disclaimer in the *      documentation and/or other materials provided with the distribution. *   3. The name of the ATLAS group or the names of its contributers may *      not be used to endorse or promote products derived from this *      software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* */#include "atlas_asm.h"#ifndef ATL_GAS_x8632   #error "This kernel requires gas x86-32 assembler!"#endif#if !defined(NB) || (NB == 0)   #error "NB must be a compile-time constant!"#endif#if (NB != 60)   #error "NB must be 60!"#endif#if (NB/6)*6 != NB   #error "NB must be multiple of 6!"#endif/* * Integer register usage shown by these defines */#define pC      %esi#define pA      %ecx#define pB      %edi#define incCn   %eax#define stM     %edx#define stN     %ebx#define pfA     %ebp#define rC0	%xmm0#define rC1	%xmm1#define rC2	%xmm2#define rC3	%xmm3#define rC4	%xmm4#define rC5	%xmm5#define rA0	%xmm6#define rA1	%xmm7#define NBso	(NB*8)#define NBNBso  (NB*NB*8)#define NB2so   (NBso+NBso)#define NB3so   (NBso+NBso+NBso)#define NB4so   (NBso+NBso+NBso+NBso)#define NB5so   (NBso+NBso+NBso+NBso+NBso)#define NB6so   (NBso+NBso+NBso+NBso+NBso+NBso)#define NB7so   (NB6so+NBso)#define NB8so   (NB6so+NB2so)#define NB9so   (NB6so+NB3so)#define NB10so   (NB6so+NB4so)#define NB11so   (NB6so+NB5so)/* * Prefetch defines */#define pref2(mem) prefetcht1	mem#define prefB(mem) prefetcht0	mem#define prefC(mem) prefetchw	mem/* *void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha, *                const TYPE *A, const int lda, const TYPE *B, const int ldb, *                const TYPE beta, TYPE *C, const int ldc) */	.text.global ATL_asmdecor(ATL_USERMM)ATL_asmdecor(ATL_USERMM):/* *      Save callee-saved iregs; Save old stack pointer in eax, *      so we can adjust for BETA alignment */	movl %esp, %eax   #ifdef BETAX	subl	$36, %esp	shr	$4, %esp	shl	$4, %esp	movl	%ebp, 32(%esp)	movl	%ebx, 28(%esp)	movl	%esi, 24(%esp)	movl	%edi, 20(%esp)	movl	%eax, 16(%esp)	movlpd	40(%eax), rC0	unpcklpd	rC0, rC0	movapd	rC0, (%esp)      #define BETAOFF 0   #else	subl	$16, %esp	movl	%ebp, 12(%esp)	movl	%ebx,  8(%esp)	movl	%esi,  4(%esp)	movl	%edi,   (%esp)   #endif/* *      Initialize pA = A;  pB = B; pC = C; */	movl	48(%eax), pC			prefC((pC))			prefC(64(pC))	movl	24(%eax), pA	movl	
32(%eax), pB/* *      stM = pA + NBNB-6*NB;  pfA = pA+NBNB;  stN = pB + NBNB; */	movl	$NBNBso-NB6so, stM	addl	pA, stM	xor	pfA, pfA	movl	$NBNBso, stN	addl	pB, stN/* *      Set incCn = (ldc - NB + 6)*sizeof */	movl	52(%eax), incCn	subl	$MB-6, incCn   #ifdef DCPLX	shl	$4, incCn   #else	shl	$3, incCn   #endifNLOOP:MLOOP:#ifdef BETA0	xorpd	rC0, rC0	xorpd	rC1, rC1	xorpd	rC2, rC2	xorpd	rC3, rC3	xorpd	rC4, rC4	xorpd	rC5, rC5#else   #ifdef DCPLX	movsd	(pC), rC0	movsd	16(pC), rC1	movsd	32(pC), rC2	movsd	48(pC), rC3	movsd	64(pC), rC4	movsd	80(pC), rC5   #else	movsd	(pC), rC0	movsd	8(pC), rC1	movsd	16(pC), rC2	movsd	24(pC), rC3	movsd	32(pC), rC4	movsd	40(pC), rC5   #endif   #ifdef BETAX	movlpd	(%esp), rA0	mulsd	rA0, rC0	mulsd	rA0, rC1	mulsd	rA0, rC2	mulsd	rA0, rC3	mulsd	rA0, rC4	mulsd	rA0, rC5   #endif#endif	movapd	0(pA), rA0	mulpd	0(pB), rA0	addpd	rA0, rC0	movapd	0+NBso(pA), rA0	mulpd	0(pB), rA0	addpd	rA0, rC1	movapd	0+NB2so(pA), rA0	mulpd	0(pB), rA0	addpd	rA0, rC2	movapd	0+NB3so(pA), rA0	mulpd	0(pB), rA0	addpd	rA0, rC3	movapd	0+NB4so(pA), rA0	mulpd	0(pB), rA0	addpd	rA0, rC4	movapd	0+NB5so(pA), rA0	mulpd	0(pB), rA0	addpd	rA0, rC5	movapd	16(pA), rA0	mulpd	16(pB), rA0	addpd	rA0, rC0	movapd	16+NBso(pA), rA0	mulpd	16(pB), rA0	addpd	rA0, rC1	movapd	16+NB2so(pA), rA0	mulpd	16(pB), rA0	addpd	rA0, rC2	movapd	16+NB3so(pA), rA0	mulpd	16(pB), rA0	addpd	rA0, rC3	movapd	16+NB4so(pA), rA0	mulpd	16(pB), rA0	addpd	rA0, rC4	movapd	16+NB5so(pA), rA0	mulpd	16(pB), rA0	addpd	rA0, rC5	movapd	32(pA), rA0	mulpd	32(pB), rA0	addpd	rA0, rC0	movapd	32+NBso(pA), rA0	mulpd	32(pB), rA0	addpd	rA0, rC1	movapd	32+NB2so(pA), rA0	mulpd	32(pB), rA0	addpd	rA0, rC2	movapd	32+NB3so(pA), rA0	mulpd	32(pB), rA0	addpd	rA0, rC3	movapd	32+NB4so(pA), rA0	mulpd	32(pB), rA0	addpd	rA0, rC4	movapd	32+NB5so(pA), rA0	mulpd	32(pB), rA0	addpd	rA0, rC5	movapd	48(pA), rA0	mulpd	48(pB), rA0	addpd	rA0, rC0	movapd	48+NBso(pA), rA0	mulpd	48(pB), rA0	addpd	rA0, rC1	movapd	48+NB2so(pA), rA0	mulpd	48(pB), rA0	addpd	rA0, rC2	movapd	
48+NB3so(pA), rA0	mulpd	48(pB), rA0	addpd	rA0, rC3	movapd	48+NB4so(pA), rA0	mulpd	48(pB), rA0	addpd	rA0, rC4	movapd	48+NB5so(pA), rA0	mulpd	48(pB), rA0	addpd	rA0, rC5	movapd	64(pA), rA0	mulpd	64(pB), rA0	addpd	rA0, rC0	movapd	64+NBso(pA), rA0	mulpd	64(pB), rA0	addpd	rA0, rC1	movapd	64+NB2so(pA), rA0	mulpd	64(pB), rA0	addpd	rA0, rC2	movapd	64+NB3so(pA), rA0	mulpd	64(pB), rA0	addpd	rA0, rC3	movapd	64+NB4so(pA), rA0	mulpd	64(pB), rA0	addpd	rA0, rC4	movapd	64+NB5so(pA), rA0	mulpd	64(pB), rA0	addpd	rA0, rC5	movapd	80(pA), rA0	mulpd	80(pB), rA0	addpd	rA0, rC0	movapd	80+NBso(pA), rA0	mulpd	80(pB), rA0	addpd	rA0, rC1	movapd	80+NB2so(pA), rA0	mulpd	80(pB), rA0	addpd	rA0, rC2	movapd	80+NB3so(pA), rA0	mulpd	80(pB), rA0	addpd	rA0, rC3	movapd	80+NB4so(pA), rA0	mulpd	80(pB), rA0	addpd	rA0, rC4	movapd	80+NB5so(pA), rA0	mulpd	80(pB), rA0	addpd	rA0, rC5	movapd	96(pA), rA0	mulpd	96(pB), rA0	addpd	rA0, rC0	movapd	96+NBso(pA), rA0	mulpd	96(pB), rA0	addpd	rA0, rC1	movapd	96+NB2so(pA), rA0	mulpd	96(pB), rA0	addpd	rA0, rC2	movapd	96+NB3so(pA), rA0	mulpd	96(pB), rA0	addpd	rA0, rC3	movapd	96+NB4so(pA), rA0	mulpd	96(pB), rA0	addpd	rA0, rC4	movapd	96+NB5so(pA), rA0	mulpd	96(pB), rA0	addpd	rA0, rC5	movapd	112(pA), rA0	mulpd	112(pB), rA0	addpd	rA0, rC0	movapd	112+NBso(pA), rA0	mulpd	112(pB), rA0	addpd	rA0, rC1	movapd	112+NB2so(pA), rA0	mulpd	112(pB), rA0	addpd	rA0, rC2	movapd	112+NB3so(pA), rA0	mulpd	112(pB), rA0	addpd	rA0, rC3	movapd	112+NB4so(pA), rA0	mulpd	112(pB), rA0	addpd	rA0, rC4	movapd	112+NB5so(pA), rA0	mulpd	112(pB), rA0	addpd	rA0, rC5	movapd	128(pA), rA0	mulpd	128(pB), rA0	addpd	rA0, rC0	movapd	128+NBso(pA), rA0	mulpd	128(pB), rA0	addpd	rA0, rC1	movapd	128+NB2so(pA), rA0	mulpd	128(pB), rA0	addpd	rA0, rC2	movapd	128+NB3so(pA), rA0	mulpd	128(pB), rA0	addpd	rA0, rC3	movapd	128+NB4so(pA), rA0	mulpd	128(pB), rA0	addpd	rA0, rC4	movapd	128+NB5so(pA), rA0	mulpd	128(pB), rA0	addpd	rA0, rC5	movapd	144(pA), rA0	mulpd	144(pB), rA0	addpd	rA0, rC0	movapd	144+NBso(pA), rA0	mulpd	144(pB), rA0	addpd	
rA0, rC1	movapd	144+NB2so(pA), rA0	mulpd	144(pB), rA0	addpd	rA0, rC2	movapd	144+NB3so(pA), rA0	mulpd	144(pB), rA0	addpd	rA0, rC3	movapd	144+NB4so(pA), rA0	mulpd	144(pB), rA0	addpd	rA0, rC4	movapd	144+NB5so(pA), rA0	mulpd	144(pB), rA0	addpd	rA0, rC5	movapd	160(pA), rA0	mulpd	160(pB), rA0	addpd	rA0, rC0	movapd	160+NBso(pA), rA0	mulpd	160(pB), rA0	addpd	rA0, rC1	movapd	160+NB2so(pA), rA0	mulpd	160(pB), rA0	addpd	rA0, rC2	movapd	160+NB3so(pA), rA0	mulpd	160(pB), rA0	addpd	rA0, rC3	movapd	160+NB4so(pA), rA0	mulpd	160(pB), rA0	addpd	rA0, rC4	movapd	160+NB5so(pA), rA0	mulpd	160(pB), rA0	addpd	rA0, rC5	movapd	176(pA), rA0	mulpd	176(pB), rA0	addpd	rA0, rC0	movapd	176+NBso(pA), rA0	mulpd	176(pB), rA0	addpd	rA0, rC1	movapd	176+NB2so(pA), rA0	mulpd	176(pB), rA0	addpd	rA0, rC2	movapd	176+NB3so(pA), rA0	mulpd	176(pB), rA0	addpd	rA0, rC3	movapd	176+NB4so(pA), rA0	mulpd	176(pB), rA0	addpd	rA0, rC4	movapd	176+NB5so(pA), rA0	mulpd	176(pB), rA0	addpd	rA0, rC5	movapd	192(pA), rA0	mulpd	192(pB), rA0	addpd	rA0, rC0	movapd	192+NBso(pA), rA0	mulpd	192(pB), rA0	addpd	rA0, rC1	movapd	192+NB2so(pA), rA0	mulpd	192(pB), rA0	addpd	rA0, rC2	movapd	192+NB3so(pA), rA0	mulpd	192(pB), rA0	addpd	rA0, rC3	movapd	192+NB4so(pA), rA0	mulpd	192(pB), rA0	addpd	rA0, rC4	movapd	192+NB5so(pA), rA0	mulpd	192(pB), rA0	addpd	rA0, rC5	movapd	208(pA), rA0	mulpd	208(pB), rA0	addpd	rA0, rC0	movapd	208+NBso(pA), rA0	mulpd	208(pB), rA0	addpd	rA0, rC1	movapd	208+NB2so(pA), rA0	mulpd	208(pB), rA0	addpd	rA0, rC2	movapd	208+NB3so(pA), rA0	mulpd	208(pB), rA0	addpd	rA0, rC3	movapd	208+NB4so(pA), rA0	mulpd	208(pB), rA0	addpd	rA0, rC4	movapd	208+NB5so(pA), rA0	mulpd	208(pB), rA0	addpd	rA0, rC5	movapd	224(pA), rA0	mulpd	224(pB), rA0	addpd	rA0, rC0	movapd	224+NBso(pA), rA0	mulpd	224(pB), rA0	addpd	rA0, rC1	movapd	224+NB2so(pA), rA0	mulpd	224(pB), rA0	addpd	rA0, rC2	movapd	224+NB3so(pA), rA0	mulpd	224(pB), rA0	addpd	rA0, rC3	movapd	224+NB4so(pA), rA0	mulpd	224(pB), rA0	addpd	rA0, rC4	movapd	224+NB5so(pA), rA0	mulpd	224(pB), rA0	
addpd	rA0, rC5	movapd	240(pA), rA0	mulpd	240(pB), rA0	addpd	rA0, rC0	movapd	240+NBso(pA), rA0	mulpd	240(pB), rA0	addpd	rA0, rC1	movapd	240+NB2so(pA), rA0	mulpd	240(pB), rA0	addpd	rA0, rC2	movapd	240+NB3so(pA), rA0	mulpd	240(pB), rA0	addpd	rA0, rC3	movapd	240+NB4so(pA), rA0	mulpd	240(pB), rA0	addpd	rA0, rC4	movapd	240+NB5so(pA), rA0	mulpd	240(pB), rA0	addpd	rA0, rC5	movapd	256(pA), rA0	mulpd	256(pB), rA0	addpd	rA0, rC0	movapd	256+NBso(pA), rA0	mulpd	256(pB), rA0	addpd	rA0, rC1	movapd	256+NB2so(pA), rA0	mulpd	256(pB), rA0	addpd	rA0, rC2	movapd	256+NB3so(pA), rA0	mulpd	256(pB), rA0	addpd	rA0, rC3	movapd	256+NB4so(pA), rA0	mulpd	256(pB), rA0	addpd	rA0, rC4	movapd	256+NB5so(pA), rA0

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?