zgemm_kernel_1x2_3dnow.s (from the "Optimized GotoBLAS libraries" collection; page 1 of 2, so the listing below ends mid-kernel)
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	 0

#define OLD_M	 4 + STACK + ARGS(%esi)
#define OLD_N	 8 + STACK + ARGS(%esi)
#define OLD_K	12 + STACK + ARGS(%esi)
#define OLD_ALPHA_R	16 + STACK + ARGS(%esi)
#define OLD_ALPHA_I	20 + STACK + ARGS(%esi)
#define OLD_A	24 + STACK + ARGS(%esi)
#define OLD_B	28 + STACK + ARGS(%esi)
#define OLD_C	32 + STACK + ARGS(%esi)
#define OLD_LDC	36 + STACK + ARGS(%esi)
#define OLD_OFFSET 40 + STACK + ARGS(%esi)

#define GAMMA_R  0(%esp)
#define GAMMA_I  8(%esp)
#define ALPHA	16(%esp)
#define K	24(%esp)
#define N	28(%esp)
#define M	32(%esp)
#define A	36(%esp)
#define C	40(%esp)
#define J	44(%esp)
#define OLD_STACK 48(%esp)
#define OFFSET	52(%esp)
#define KK	56(%esp)
#define KKK	60(%esp)
#define BUFFER 128(%esp)

#define AA	%edx
#define BB	%ecx

#define PREFETCHSIZE (16 * 2 + 6)

#define LOCAL_BUFFER_SIZE  GEMM_Q * GEMM_UNROLL_N * COMPSIZE * 16

#define AOFFSET -32
#define BOFFSET 128

/*
  A hint of scheduling is received from following URL

https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11
*/

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	%esp, %esi	# save old stack

	subl	$128 + LOCAL_BUFFER_SIZE, %esp
	movl	OLD_M, %ebx
	andl	$-1024, %esp	# align stack

#ifdef WINDOWS_ABI
#if LOCAL_BUFFER_SIZE > 12288
	movl	$0,  4096 * 3(%esp)
#endif
#if LOCAL_BUFFER_SIZE >  8192
	movl	$0,  4096 * 2(%esp)
#endif
#if LOCAL_BUFFER_SIZE >  4096
	movl	$0,  4096 * 1(%esp)
#endif
	movl	$0,  4096 * 0(%esp)
#endif

	movl	OLD_N, %eax
	movl	OLD_K, %ecx
	movl	OLD_A, %edx

	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K

	subl	$AOFFSET * SIZE, %edx
	movl	%edx, A

	movl	%esi, OLD_STACK

	testl	%ebx, %ebx
	jle	.L999

	movl	OLD_B, %edi
	movl	OLD_C, %ebx

	EMMS

	movd	OLD_ALPHA_R, %mm0
	movd	OLD_ALPHA_I, %mm1

	movd	%mm0, 0 + ALPHA
	movd	%mm1, 4 + ALPHA

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	movl	$0x3f800000,  0 + GAMMA_R
	movl	$0x3f800000,  4 + GAMMA_R
	movl	$0xbf800000,  0 + GAMMA_I
	movl	$0x3f800000,  4 + GAMMA_I
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	movl	$0x3f800000,  0 + GAMMA_R
	movl	$0x3f800000,  4 + GAMMA_R
	movl	$0x3f800000,  0 + GAMMA_I
	movl	$0xbf800000,  4 + GAMMA_I
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	movl	$0x3f800000,  0 + GAMMA_R
	movl	$0xbf800000,  4 + GAMMA_R
	movl	$0x3f800000,  0 + GAMMA_I
	movl	$0x3f800000,  4 + GAMMA_I
#else
	movl	$0x3f800000,  0 + GAMMA_R
	movl	$0xbf800000,  4 + GAMMA_R
	movl	$0xbf800000,  0 + GAMMA_I
	movl	$0xbf800000,  4 + GAMMA_I
#endif

	movl	%ebx, C
	movl	OLD_LDC, %ebp
	leal	(, %ebp, SIZE * 2), %ebp

#ifdef TRMMKERNEL
	movl	OLD_OFFSET, %eax
	movl	%eax, OFFSET
#ifndef LEFT
	negl	%eax
	movl	%eax, KK
#endif
#endif

	movl	N, %eax
	sarl	$1, %eax
	movl	%eax, J			# j = n
	jle	.L20
	ALIGN_4

.L01:
/* Copying to Sub Buffer */
	leal	BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	K,  %eax
	sarl	$2, %eax
	jle	.L03
	ALIGN_4

.L02:
	movd	 0 * SIZE(%edi), %mm0
	movd	 1 * SIZE(%edi), %mm1
	movd	 2 * SIZE(%edi), %mm2
	movd	 3 * SIZE(%edi), %mm3
	movd	 4 * SIZE(%edi), %mm4
	movd	 5 * SIZE(%edi), %mm5
	movd	 6 * SIZE(%edi), %mm6
	movd	 7 * SIZE(%edi), %mm7

	prefetchnta	72 * SIZE(%edi)

	punpckldq %mm0, %mm0
	punpckldq %mm1, %mm1
	punpckldq %mm2, %mm2
	punpckldq %mm3, %mm3
	punpckldq %mm4, %mm4
	punpckldq %mm5, %mm5
	punpckldq %mm6, %mm6
	punpckldq %mm7, %mm7

	movq	%mm0,  0 * SIZE(BB)
	movq	%mm1,  2 * SIZE(BB)
	movq	%mm2,  4 * SIZE(BB)
	movq	%mm3,  6 * SIZE(BB)
	movq	%mm4,  8 * SIZE(BB)
	movq	%mm5, 10 * SIZE(BB)
	movq	%mm6, 12 * SIZE(BB)
	movq	%mm7, 14 * SIZE(BB)

	movd	 8 * SIZE(%edi), %mm0
	movd	 9 * SIZE(%edi), %mm1
	movd	10 * SIZE(%edi), %mm2
	movd	11 * SIZE(%edi), %mm3
	movd	12 * SIZE(%edi), %mm4
	movd	13 * SIZE(%edi), %mm5
	movd	14 * SIZE(%edi), %mm6
	movd	15 * SIZE(%edi), %mm7

	punpckldq %mm0, %mm0
	punpckldq %mm1, %mm1
	punpckldq %mm2, %mm2
	punpckldq %mm3, %mm3
	punpckldq %mm4, %mm4
	punpckldq %mm5, %mm5
	punpckldq %mm6, %mm6
	punpckldq %mm7, %mm7

	movq	%mm0, 16 * SIZE(BB)
	movq	%mm1, 18 * SIZE(BB)
	movq	%mm2, 20 * SIZE(BB)
	movq	%mm3, 22 * SIZE(BB)
	movq	%mm4, 24 * SIZE(BB)
	movq	%mm5, 26 * SIZE(BB)
	movq	%mm6, 28 * SIZE(BB)
	movq	%mm7, 30 * SIZE(BB)

	addl	$16 * SIZE, %edi
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L02
	ALIGN_4

.L03:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L10
	ALIGN_4

.L04:
	movd	 0 * SIZE(%edi), %mm0
	movd	 1 * SIZE(%edi), %mm1
	movd	 2 * SIZE(%edi), %mm2
	movd	 3 * SIZE(%edi), %mm3

	punpckldq %mm0, %mm0
	punpckldq %mm1, %mm1
	punpckldq %mm2, %mm2
	punpckldq %mm3, %mm3

	movq	%mm0,  0 * SIZE(BB)
	movq	%mm1,  2 * SIZE(BB)
	movq	%mm2,  4 * SIZE(BB)
	movq	%mm3,  6 * SIZE(BB)

	addl	$4 * SIZE, %edi
	addl	$8 * SIZE, BB
	decl	%eax
	jne	.L04
	ALIGN_4

.L10:
	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a
	movl	M,  %ebx
	ALIGN_4

.L11:
	leal	- BOFFSET * SIZE + BUFFER, BB

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#else
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 8), BB
#endif

	movq	(  0 + AOFFSET) * SIZE(AA), %mm0
	pxor	%mm4, %mm4
	movq	( 16 + AOFFSET) * SIZE(AA), %mm1
	pxor	%mm5, %mm5
	PADDING movq	(  0 + BOFFSET) * SIZE(BB), %mm2
	pxor	%mm6, %mm6
	PADDING movq	( 16 + BOFFSET) * SIZE(BB), %mm3
	pxor	%mm7, %mm7

	prefetchw 2 * SIZE(%esi)
	prefetchw 2 * SIZE(%esi, %ebp)

#ifndef TRMMKERNEL
	movl	K,  %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$4, %eax
	je	.L15
	ALIGN_4

.L12:
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	(  2 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING prefetch	(PREFETCHSIZE +  0) * SIZE(AA)

	PADDING movq	(  8 + BOFFSET) * SIZE(BB), %mm2
	pfmul	(  6 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  2 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 10 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 12 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 32 + BOFFSET) * SIZE(BB), %mm2

	pfmul	( 14 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  4 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 18 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 20 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 24 + BOFFSET) * SIZE(BB), %mm3

	pfmul	( 22 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  6 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 26 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 28 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 48 + BOFFSET) * SIZE(BB), %mm3

	pfmul	( 30 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  8 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 34 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 36 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 40 + BOFFSET) * SIZE(BB), %mm2

	pfmul	( 38 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 10 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 42 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 44 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 64 + BOFFSET) * SIZE(BB), %mm2

	pfmul	( 46 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 12 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 50 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 52 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 56 + BOFFSET) * SIZE(BB), %mm3

	pfmul	( 54 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 14 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 58 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 60 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 80 + BOFFSET) * SIZE(BB), %mm3

	pfmul	( 62 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 32 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 66 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 68 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 72 + BOFFSET) * SIZE(BB), %mm2

	pfmul	( 70 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 18 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 74 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 76 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 96 + BOFFSET) * SIZE(BB), %mm2

	pfmul	( 78 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 20 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 82 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 84 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 88 + BOFFSET) * SIZE(BB), %mm3

	pfmul	( 86 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 22 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 90 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 92 + BOFFSET) * SIZE(BB), %mm3

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	(112 + BOFFSET) * SIZE(BB), %mm3

	pfmul	( 94 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 24 + AOFFSET) * SIZE(AA), %mm1
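A note on the .L02/.L04 packing loops above: each 4-byte scalar of B is loaded with movd and duplicated into both halves of a 64-bit MMX register with punpckldq before being stored to the on-stack BUFFER. A single pfmul in the inner loop can then apply one scalar of B to a packed (real, imaginary) pair of A. A minimal C sketch of that layout transformation, assuming 4-byte float elements (SIZE = 4, which the 1.0f bit patterns 0x3f800000 in the GAMMA constants suggest) and hypothetical names pack_b/FLOAT:

	#include <stddef.h>

	typedef float FLOAT;   /* assumed element type; SIZE = 4 in the kernel */

	/* Duplicate every scalar of the current 2-column panel of B, as the
	 * movd + punpckldq + movq sequence in .L02/.L04 does.  Each k step
	 * consumes 2 complex values = 4 scalars, and each scalar x becomes
	 * the adjacent pair [x, x] in the packed buffer. */
	static void pack_b(const FLOAT *b, FLOAT *buffer, size_t k)
	{
	    for (size_t i = 0; i < 4 * k; i++) {
	        buffer[2 * i + 0] = b[i];   /* low dword of the quadword  */
	        buffer[2 * i + 1] = b[i];   /* high dword (punpckldq)     */
	    }
	}

The assembly does the same work 16 scalars at a time (.L02) with a 4-scalar remainder loop (.L04), and issues prefetchnta 72 * SIZE ahead of the read pointer so the streaming loads stay off the critical path.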

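For orientation, the computation the complete kernel carries out is a 1x2-blocked complex GEMM update, C += alpha * op(A) * op(B), where the NN/NR/RN/... build variants decide whether A and/or B are conjugated. The GAMMA_R/GAMMA_I pairs initialized above are ±1.0f sign masks (0x3f800000 / 0xbf800000) selecting those variants; they are not referenced in the visible loop body, so the signs are evidently applied when the %mm4..%mm7 accumulators are combined and written back, which happens on the missing second page. A scalar reference sketch, assuming single-precision complex elements (3DNow! pfmul/pfadd operate only on packed single-precision floats) and an illustrative signature; the conj_a/conj_b flags stand in for the preprocessor variants and are not GotoBLAS API:

	#include <complex.h>

	/* Reference semantics of the kernel: C += alpha * op(A) * op(B) for
	 * column-major complex matrices, with op() an optional conjugation.
	 * The real kernel produces this result 1 row x 2 columns at a time
	 * from the packed buffers, accumulating into %mm4..%mm7. */
	static void gemm_ref(int m, int n, int k, float _Complex alpha,
	                     const float _Complex *a, int lda,
	                     const float _Complex *b, int ldb,
	                     float _Complex *c, int ldc,
	                     int conj_a, int conj_b)
	{
	    for (int j = 0; j < n; j++)
	        for (int i = 0; i < m; i++) {
	            float _Complex s = 0.0f;
	            for (int l = 0; l < k; l++) {
	                float _Complex x = a[i + l * lda];
	                float _Complex y = b[l + j * ldb];
	                if (conj_a) x = conjf(x);   /* RN/RT/CN/CT variants */
	                if (conj_b) y = conjf(y);   /* NR/NC/TR/TC variants */
	                s += x * y;
	            }
	            c[i + j * ldc] += alpha * s;
	        }
	}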