📄 zsymv_u_sse.s
/*********************************************************************/
/*                                                                   */
/*  Optimized BLAS libraries                                         */
/*      By Kazushige Goto <kgoto@tacc.utexas.edu>                    */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENRYN
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
#define movsd		movlpd
#endif

#ifdef BARCELONA
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 14)
#endif

#define STACKSIZE	80

#define OLD_INCY	 8 + STACKSIZE(%rsp)
#define OLD_BUFFER	16 + STACKSIZE(%rsp)

#define M	ARG1
#define A	ARG2
#define LDA	ARG3
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6

#define INCY	%r10
#define BUFFER	%r11

#define TEMP	%rax
#define I	%rax
#define IS	%r12
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA_R	%xmm0
#define ALPHA_I	%xmm1

#define xsum1	%xmm0
#define xsum2	%xmm1
#define xsum3	%xmm2
#define xsum4	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define a1	%xmm10
#define a2	%xmm11
#define a3	%xmm12

#define yy1	%xmm13
#define xt1	%xmm14
#define xt2	%xmm15

#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c; movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c; movhpd	a##b, c
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	testq	M, M
	jle	.L999

	/* build an odd-lane sign mask in %xmm2 and pack alpha as
	   ALPHA_R = {ar, -ai, ar, -ai}, ALPHA_I = {ai, ar, ai, ar} */
	pcmpeqb	%xmm3, %xmm3
	xorpd	%xmm2, %xmm2
	pslld	$31, %xmm3
	unpckhps %xmm3, %xmm2

	shufps	$0, ALPHA_R, ALPHA_R
	shufps	$0, ALPHA_I, ALPHA_I

	movaps	ALPHA_I, %xmm3
	unpcklps ALPHA_R, ALPHA_I
	unpcklps %xmm3, ALPHA_R
	pxor	%xmm2, ALPHA_R

	movq	BUFFER, XX

	movq	M, %rax
	sarq	$2, %rax
	jle	.L02
	ALIGN_3

	/* scale x by alpha and store each element in the two pre-shuffled
	   forms the inner kernel needs, four elements per iteration */
.L01:
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm6
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4
	movsldup %xmm6, %xmm5
	movshdup %xmm6, %xmm6

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4
	mulps	ALPHA_I, %xmm5
	mulps	ALPHA_R, %xmm6

	addps	%xmm4, %xmm3
	addps	%xmm6, %xmm5

	movaps	%xmm3,  4 * SIZE(XX)
	movaps	%xmm5, 12 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	shufps	$0xb1, %xmm5, %xmm5

	pxor	%xmm2, %xmm3
	pxor	%xmm2, %xmm5

	movaps	%xmm3,  0 * SIZE(XX)
	movaps	%xmm5,  8 * SIZE(XX)

	subq	$-16 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

	/* same, for two remaining elements */
.L02:
	testq	$2, M
	jle	.L03

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4
	addps	%xmm4, %xmm3

	movaps	%xmm3, 4 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movaps	%xmm3, 0 * SIZE(XX)

	subq	$-8 * SIZE, XX
	ALIGN_3

	/* same, for one remaining element */
.L03:
	testq	$1, M
	jle	.L05

	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X

	movsldup %xmm4, %xmm3
	movshdup %xmm4, %xmm4

	mulps	ALPHA_I, %xmm3
	mulps	ALPHA_R, %xmm4
	addps	%xmm4, %xmm3

	movlps	%xmm3, 2 * SIZE(XX)

	shufps	$0xb1, %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movlps	%xmm3, 0 * SIZE(XX)

	subq	$-4 * SIZE, XX
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	/* advance the buffer pointer to the next 512-byte boundary;
	   if y is strided, gather it there as NEW_Y */
	addq	$512, XX
	andq	$-512, XX

	cmpq	$2 * SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

	/* copy four y elements per iteration into the contiguous buffer */
.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhps	0 * SIZE(YY), %xmm1
	addq	INCY, YY

	movaps	%xmm0, 0 * SIZE(XX)
	movaps	%xmm1, 8 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

	/* copy the remaining y elements one at a time */
.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movlps	%xmm0, 0 * SIZE(XX)
	addq	$2 * SIZE, XX

	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	xorq	IS, IS		# is = 0

	cmpq	$2, M
	jl	.L20
	ALIGN_3

	/* main loop over column pairs: A1 and A2 point at columns is and is+1 */
.L11:
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	leaq	(, IS, 4), I

	movsd	0 * SIZE(NEW_X, I, SIZE), atemp2
	movhps	4 * SIZE(NEW_X, I, SIZE), atemp2
	movsd	2 * SIZE(NEW_X, I, SIZE), atemp4
	movhps	6 * SIZE(NEW_X, I, SIZE), atemp4

	pshufd	$0xcc, atemp2, atemp1
	pshufd	$0x99, atemp2, atemp2
	pshufd	$0xcc, atemp4, atemp3
	pshufd	$0x99, atemp4, atemp4

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2
	pxor	xsum3, xsum3
	pxor	xsum4, xsum4

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$2, I
	jle	.L15
	ALIGN_3

	/* inner loop over the rows above the diagonal block,
	   four complex elements per iteration */
.L12:
	HALT

	subq	$-16 * SIZE, XX
	addq	$  8 * SIZE, YY
	addq	$  8 * SIZE, A1
	addq	$  8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

	/* two leftover rows above the diagonal block */
.L15:
	testq	$2, IS
	jle	.L18

	movsd	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movaps	0 * SIZE(XX), xtemp1
	movaps	4 * SIZE(XX), xtemp2

	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1
	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum1
	addps	xt2, xsum2
	pshufd	$0xb1, a1, xt2
	mulps	atemp1, a1
	mulps	atemp2, xt2
	addps	a1,  yy1
	addps	xt2, yy1

	movsd	0 * SIZE(A2), a1
	movhps	2 * SIZE(A2), a1
	movaps	xtemp1, xt1
	movaps	xtemp2, xt2
	mulps	a1, xt1
	mulps	a1, xt2
	addps	xt1, xsum3
	addps	xt2, xsum4
	pshufd	$0xb1, a1, xt2
	mulps	atemp1, a1
	mulps	atemp2, xt2
	addps	a1,  yy1
	addps	xt2, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

	/* 2x2 diagonal block: reduce the column sums and add them into y */
.L18:
	leaq	(, IS, 4), I

	movaps	0 * SIZE(NEW_X, I, SIZE), atemp1
	movaps	4 * SIZE(NEW_X, I, SIZE), atemp2

	movlps	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movsd	0 * SIZE(A1), a1
	movhps	0 * SIZE(A2), a1
	movaps	a1, a2
	mulps	atemp1, a1
	mulps	atemp2, a2
	addps	a1, xsum1
	addps	a2, xsum2

	movsd	0 * SIZE(A2), a1
	movhps	2 * SIZE(A2), a1
	movaps	a1, a2
	mulps	atemp1, a1
	mulps	atemp2, a2
	addps	a1, xsum3
	addps	a2, xsum4

	haddps	xsum2, xsum1
	haddps	xsum4, xsum3
	haddps	xsum3, xsum1

	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jle	.L11
	ALIGN_3

.L20:
	testq	$1, M
	jle	.L990

	/* if y was gathered into the buffer, scatter the result back with stride INCY */
.L990:
	cmpq	$2 * SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	movaps	0 * SIZE(NEW_Y), %xmm0
	movaps	4 * SIZE(NEW_Y), %xmm1

	movlps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movlps	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhps	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movlps	0 * SIZE(NEW_Y), %xmm0
	addq	$2 * SIZE, NEW_Y

	movlps	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L998
	ALIGN_3

	/* restore callee-saved registers and return */
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE
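For readers following the assembly: the kernel implements the upper-triangular SYMV update y := alpha * A * x + y on single-precision complex data (packed `ps` instructions, addresses scaled by ZBASE_SHIFT), where A is complex symmetric and only its upper triangle is referenced ("_u"). The strided x and y vectors are first gathered into aligned buffers, and the result is scattered back at the end. Below is a minimal scalar reference sketch of that update, not a translation of the kernel; the function name, the incx/incy handling, and the assumption that any beta-scaling of y happens elsewhere are mine.

#include <complex.h>
#include <stddef.h>

/* Scalar reference for the update the kernel computes:
 *   y := alpha * A * x + y
 * A is complex symmetric (A == A^T, no conjugation), stored column-major
 * with leading dimension lda, and only its upper triangle is read.
 * Names and argument conventions here are illustrative only. */
static void csymv_u_ref(size_t m, float complex alpha,
                        const float complex *a, size_t lda,
                        const float complex *x, size_t incx,
                        float complex *y, size_t incy)
{
    for (size_t j = 0; j < m; j++) {
        float complex xj  = alpha * x[j * incx];   /* alpha * x[j]              */
        float complex sum = 0.0f;                  /* row-j partial dot product */

        for (size_t i = 0; i < j; i++) {           /* strictly upper triangle   */
            y[i * incy] += a[i + j * lda] * xj;           /* A[i][j] * x[j]      */
            sum         += a[i + j * lda] * x[i * incx];  /* A[j][i] * x[i]      */
        }
        y[j * incy] += a[j + j * lda] * xj + alpha * sum; /* diagonal + row sum  */
    }
}

Each column j is visited once: its off-diagonal entries feed both y[i] (directly) and y[j] (via the symmetric row dot product), which is the same two-sided use of a column that the xsum/yy registers make in the loops above.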
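The x-scaling loop at .L01 carries out the complex multiply alpha * x with the usual SSE real/imaginary-broadcast trick (movsldup/movshdup, a $0xb1 re/im swap, and a sign mask folded into a pre-negated copy of alpha so plain addps suffices). A hedged intrinsics sketch of the same idea, written with addsubps instead of the pre-negated alpha, assuming an SSE3-capable compiler; the function name is illustrative:

#include <pmmintrin.h>   /* SSE3: _mm_addsub_ps, _mm_moveldup_ps, _mm_movehdup_ps */

/* Multiply two packed single-precision complex values x = {xr0, xi0, xr1, xi1}
 * by the scalar alpha = ar + ai*i. */
static inline __m128 cmul_alpha_ps(__m128 x, float ar, float ai)
{
    __m128 re = _mm_set1_ps(ar);             /* {ar, ar, ar, ar}           */
    __m128 im = _mm_set1_ps(ai);             /* {ai, ai, ai, ai}           */
    __m128 t1 = _mm_mul_ps(re, x);           /* {ar*xr, ar*xi, ...}        */
    __m128 xs = _mm_shuffle_ps(x, x, 0xb1);  /* swap re/im: {xi, xr, ...}  */
    __m128 t2 = _mm_mul_ps(im, xs);          /* {ai*xi, ai*xr, ...}        */
    return _mm_addsub_ps(t1, t2);            /* {ar*xr - ai*xi, ar*xi + ai*xr, ...} */
}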