/* symv_u_sse.s */
/*********************************************************************/
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Single-precision symmetric matrix-vector multiply kernel, SSE
 * ("upper" variant, per the file name) -- presumably y += alpha*A*x;
 * TODO confirm against the BLAS driver that calls it.
 *
 * NOTE(review): this copy is an incomplete web capture.  The unrolled
 * .L12 loop below ends mid-stream, and the jump targets .L14, .L20 and
 * .L999 referenced here are not present in this copy.  Only line
 * structure and comments were restored; every instruction token is
 * exactly as captured.
 */

#define ASSEMBLER
#include "common.h"

/* Per-CPU prefetch instruction selection and lookahead distance. */
#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENRYN
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps		/* movlps is faster than movsd on Opteron */
#endif

#ifdef BARCELONA
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

/* Argument mapping per ABI (ARGn come from common.h). */
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_INCY	 8 + STACKSIZE(%rsp)
#define OLD_BUFFER	16 + STACKSIZE(%rsp)

#define M		ARG1
#define A		ARG2
#define LDA		ARG3
#define X		ARG4
#define INCX		ARG5
#define Y		ARG6
#define INCY		%r10
#define BUFFER		%r11

#else

#define STACKSIZE	256

#define OLD_X		40 + STACKSIZE(%rsp)
#define OLD_INCX	48 + STACKSIZE(%rsp)
#define OLD_Y		56 + STACKSIZE(%rsp)
#define OLD_INCY	64 + STACKSIZE(%rsp)
#define OLD_BUFFER	72 + STACKSIZE(%rsp)

#define M		ARG1
#define A		ARG3
#define LDA		ARG4
#define X		%rdi
#define INCX		%rsi
#define Y		%rdx
#define INCY		%r10
#define BUFFER		%r11

#endif

/* Symbolic register names.  Note the deliberate aliases: TEMP/I share
 * %rax, ALPHA/atemp1 share %xmm0, and NEW_Y reuses the X register once
 * the original x has been consumed. */
#define TEMP	%rax
#define I	%rax
#define IS	%r12
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define atemp1	%xmm0
#define atemp2	%xmm1
#define atemp3	%xmm2
#define atemp4	%xmm3

#define xsum1	%xmm4
#define xsum2	%xmm5
#define xsum3	%xmm6
#define xsum4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define yy1	%xmm10
#define xt1	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define a4	%xmm15

	PROLOGUE
	PROFCODE

	/* Save callee-saved GPRs (and, on Windows, xmm6-15 + home args). */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_X,    X
	movq	OLD_INCX, INCX
	movq	OLD_Y,    Y
	movaps	%xmm1, %xmm0		/* alpha arrives in %xmm1 on Win64 */
#endif

	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert element strides to byte strides. */
	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999			/* nothing to do for m <= 0 */

	shufps	$0, ALPHA, ALPHA	/* broadcast alpha to all 4 lanes */

	/* Stage 1: copy alpha * x into BUFFER, 8 elements per iteration. */
	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Remainder of the alpha*x copy: m % 8 single elements. */
	movq	M,  %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	%xmm1, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y		/* NEW_Y aliases the X register */

	/* Round XX up to the next 512-byte boundary; the y working copy
	 * goes there when it has to be packed. */
	addq	$512, XX
	andq	$-512, XX

	/* Stage 2: if y is not contiguous, copy it into the buffer too. */
	cmpq	$SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	%xmm0, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	/* Stage 3: main computation.  IS is the current column-block
	 * offset; each .L11 pass handles 4 columns (A advances by 4*LDA).
	 * Blocks of fewer than 4 columns go to .L20 (not in this copy). */
	xorq	IS, IS			# is = 0

	cmpq	$4, M
	jl	.L20
	ALIGN_3

.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A		/* A += 4 columns for next pass */

	/* atemp1..4 = the 4 scaled-x values for this column block. */
	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	/* xsum1..4 accumulate the dot products for the 4 columns. */
	pxor	xsum1, xsum1
	pxor	xsum2, xsum2
	pxor	xsum3, xsum3
	pxor	xsum4, xsum4

	movaps	0 * SIZE(NEW_X), xtemp1
	movaps	4 * SIZE(NEW_X), xtemp2

	/* Prime a1..a4 with the first 4 rows of each of the 4 columns. */
	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1
	movsd	0 * SIZE(A1, LDA, 1), a2
	movhps	2 * SIZE(A1, LDA, 1), a2
	movsd	0 * SIZE(A2), a3
	movhps	2 * SIZE(A2), a3
	movsd	0 * SIZE(A2, LDA, 1), a4
	movhps	2 * SIZE(A2, LDA, 1), a4

	movsd	0 * SIZE(NEW_Y), yy1
	movhps	2 * SIZE(NEW_Y), yy1

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$4, I			/* 16 rows per .L12 iteration */
	jle	.L14
	ALIGN_3

.L12:
	/* Each 4-instruction cluster does, for one column c:
	 *   xsum_c += A(:,c) .* x      (dot-product contribution)
	 *   yy1    += atemp_c * A(:,c) (axpy contribution)
	 * then reloads a_c for the next 4 rows. */
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	/* Write back rows 0-3 of y, reload rows 4-7. */
	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 8 * SIZE(YY)
	movhps	yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	/* NOTE(review): capture truncated here -- the rest of the .L12
	 * iteration, the loop tail, .L14/.L20 cleanup paths and the
	 * EPILOGUE are missing from this copy. */
/* NOTE(review): the original web capture ends mid-way through the .L12
 * loop above; the remainder of the kernel (loop tail, the .L14/.L20
 * small-block paths, the .L999 exit and the register-restore epilogue)
 * is missing from this copy and must be recovered from the upstream
 * Goto BLAS / OpenBLAS sources before this file can assemble. */