/* File: zsymv_l_sse2.s */
/* NOTE(review): the two lines above this comment in the scraped page were
   webpage viewer chrome (a filename banner and a Chinese "字号:" / "font size:"
   label), not part of the assembly source. Rendered here as a comment so the
   file remains assemblable. */
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. 
*//*********************************************************************/#define ASSEMBLER#include "common.h"#ifdef CORE2#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 24)#endif#ifdef PENRYN#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 24)#endif#ifdef PENTIUM4#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 28)#endif#ifdef OPTERON#define PREFETCH prefetch#define PREFETCHW prefetchw#define PREFETCHSIZE (16 * 12)#define movsd movlpd#endif#ifdef BARCELONA#define PREFETCH prefetch#define PREFETCHW prefetchw#define PREFETCHSIZE (16 * 16)#endif#ifdef GENERIC#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 12)#endif#ifndef WINDOWS_ABI#define STACKSIZE 80 #define OLD_INCY 8 + STACKSIZE(%rsp)#define OLD_BUFFER 16 + STACKSIZE(%rsp)#define M ARG1#define A ARG2#define LDA ARG3 #define X ARG4#define INCX ARG5 #define Y ARG6#define INCY %r10#define BUFFER %r11#else#define STACKSIZE 256 #define OLD_LDA 40 + STACKSIZE(%rsp)#define OLD_X 48 + STACKSIZE(%rsp)#define OLD_INCX 56 + STACKSIZE(%rsp)#define OLD_Y 64 + STACKSIZE(%rsp)#define OLD_INCY 72 + STACKSIZE(%rsp)#define OLD_BUFFER 80 + STACKSIZE(%rsp)#define M ARG1#define A ARG4#define LDA ARG3#define X %rdi#define INCX %rsi #define Y %rdx#define INCY %r10#define BUFFER %r11#endif#define TEMP %rax#define I %rax#define IS %r12#define A1 %rbx#define A2 %rbp#define XX %r13#define YY %r14#define NEW_X BUFFER#define NEW_Y X#define ALPHA_R %xmm0#define ALPHA_I %xmm1#define xtemp1 %xmm0#define xtemp2 %xmm1#define xtemp3 %xmm2#define xtemp4 %xmm3#define atemp1 %xmm4#define atemp2 %xmm5#define atemp3 %xmm6#define atemp4 %xmm7#define xsum1 %xmm8#define xsum2 %xmm9#define yy1 %xmm10#define yy2 %xmm11#define a1 %xmm12#define a2 %xmm13#define a3 %xmm14#define xt1 %xmm15#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA)#define MOVDDUP(a, b, c) movddup a(b), c#define MOVDDUP2(a, b, c) 
movddup a##b, c#else#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c#endif#ifndef HEMV#define ADD addpd#else#define ADD subpd#endif PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp)#ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_LDA, LDA movq OLD_X, X movq OLD_INCX, INCX movq OLD_Y, Y movaps %xmm1, %xmm0 movaps %xmm2, %xmm1#endif movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY salq $ZBASE_SHIFT, LDA testq M, M jle .L999 pcmpeqb %xmm2, %xmm2 xorpd %xmm3, %xmm3 psllq $63, %xmm2 unpcklpd %xmm3, %xmm2 unpcklpd ALPHA_I, ALPHA_R unpcklpd ALPHA_R, ALPHA_I xorpd %xmm2, ALPHA_I movq BUFFER, XX movq M, %rax sarq $2, %rax jle .L02 ALIGN_3.L01: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) movapd %xmm5, 4 * SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd %xmm5, 6 * SIZE(XX) MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X MOVDDUP(0 * SIZE, X, %xmm5) MOVDDUP(1 * SIZE, X, %xmm6) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 mulpd ALPHA_R, %xmm5 mulpd ALPHA_I, %xmm6 addpd %xmm4, %xmm3 addpd %xmm6, %xmm5 movapd %xmm3, 8 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 10 * SIZE(XX) movapd %xmm5, 12 * SIZE(XX) SHUFPD_1 %xmm5, %xmm5 pxor %xmm2, %xmm5 movapd 
%xmm5, 14 * SIZE(XX) subq $-16 * SIZE, XX decq %rax jg .L01 ALIGN_3.L02: movq M, %rax andq $3, %rax jle .L05 ALIGN_3.L03: MOVDDUP(0 * SIZE, X, %xmm3) MOVDDUP(1 * SIZE, X, %xmm4) addq INCX, X mulpd ALPHA_R, %xmm3 mulpd ALPHA_I, %xmm4 addpd %xmm4, %xmm3 movapd %xmm3, 0 * SIZE(XX) SHUFPD_1 %xmm3, %xmm3 pxor %xmm2, %xmm3 movapd %xmm3, 2 * SIZE(XX) addq $4 * SIZE, XX decq %rax jg .L03 ALIGN_3.L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $2 * SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $2, %rax jle .L07 ALIGN_3.L06: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), %xmm1 movhpd 1 * SIZE(YY), %xmm1 addq INCY, YY movsd 0 * SIZE(YY), %xmm2 movhpd 1 * SIZE(YY), %xmm2 addq INCY, YY movsd 0 * SIZE(YY), %xmm3 movhpd 1 * SIZE(YY), %xmm3 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) movapd %xmm1, 2 * SIZE(XX) movapd %xmm2, 4 * SIZE(XX) movapd %xmm3, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3.L07: movq M, %rax andq $3, %rax jle .L10 ALIGN_3.L08: movsd 0 * SIZE(YY), %xmm0 movhpd 1 * SIZE(YY), %xmm0 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) addq $2 * SIZE, XX decq %rax jg .L08 ALIGN_3.L10: xorq IS, IS # is = 0 cmpq $2, M jl .L20 ALIGN_3.L11: movq A, A1 leaq (A, LDA, 1), A2 leaq 4 * SIZE(A, LDA, 2), A leaq (, IS, SIZE), I leaq 0 * SIZE(NEW_X, I, 4), XX leaq 4 * SIZE(NEW_Y, I, 2), YY movapd 0 * SIZE(XX), atemp1 movapd 2 * SIZE(XX), atemp2 movapd 4 * SIZE(XX), atemp3 movapd 6 * SIZE(XX), atemp4 MOVDDUP(0 * SIZE, A1, xsum1) MOVDDUP(2 * SIZE, A1, xsum2) mulpd atemp1, xsum1 mulpd atemp1, xsum2#ifndef HEMV MOVDDUP(1 * SIZE, A1, a1) MOVDDUP(3 * SIZE, A1, a2) mulpd atemp2, a1 mulpd atemp2, a2 addpd a1, xsum1 addpd a2, xsum2#else MOVDDUP(3 * SIZE, A1, a2) mulpd atemp2, a2 addpd a2, xsum2#endif MOVDDUP(2 * SIZE, A1, a1) MOVDDUP(2 * SIZE, A2, a2) mulpd atemp3, a1 mulpd atemp3, a2 addpd a1, xsum1 addpd a2, xsum2#ifndef HEMV MOVDDUP(3 * SIZE, A1, a1) MOVDDUP(3 * SIZE, A2, a2) mulpd atemp4, 
a1 mulpd atemp4, a2 addpd a1, xsum1 addpd a2, xsum2#else MOVDDUP(3 * SIZE, A1, a1) mulpd atemp4, a1 subpd a1, xsum1#endif MOVDDUP(4 * SIZE, A1, a1) MOVDDUP(6 * SIZE, A2, a2) movsd 0 * SIZE(YY), yy1 movhpd 1 * SIZE(YY), yy1 movsd 2 * SIZE(YY), yy2 movhpd 3 * SIZE(YY), yy2 movapd 8 * SIZE(XX), xtemp1 movapd 10 * SIZE(XX), xtemp2 movapd 12 * SIZE(XX), xtemp3
/* NOTE(review): everything below this point in the scraped page was webpage
   viewer chrome (keyboard-shortcut help, originally in Chinese), not part of
   the assembly source. Preserved in translation as a comment:
     Keyboard shortcuts:
       Copy code ........ Ctrl + C
       Search code ...... Ctrl + F
       Fullscreen mode .. F11
       Toggle theme ..... Ctrl + Shift + D
       Show shortcuts ... ?
       Increase font .... Ctrl + =
       Decrease font .... Ctrl + -
   The assembly source itself is truncated above (the main update loop and
   EPILOGUE of the zsymv kernel are missing from this capture). */