📄 symv_u_sse2.s
字号:
/*********************************************************************/
/*                                                                   */
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*//*********************************************************************/#define ASSEMBLER#include "common.h"#ifdef CORE2#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 12)#endif#ifdef PENRYN#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 12)#endif#ifdef PENTIUM4#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 20)#endif#ifdef OPTERON#define PREFETCH prefetch#define PREFETCHW prefetchw#define PREFETCHSIZE (16 * 8)#define movsd movlpd#endif#ifdef BARCELONA#define PREFETCH prefetch#define PREFETCHW prefetchw#define PREFETCHSIZE (16 * 16)#endif#ifdef GENERIC#define PREFETCH prefetcht0#define PREFETCHW prefetcht0#define PREFETCHSIZE (16 * 20)#endif#ifndef WINDOWS_ABI#define STACKSIZE 80 #define OLD_INCY 8 + STACKSIZE(%rsp)#define OLD_BUFFER 16 + STACKSIZE(%rsp)#define M ARG1#define A ARG2#define LDA ARG3 #define X ARG4#define INCX ARG5 #define Y ARG6#define INCY %r10#define BUFFER %r11#else#define STACKSIZE 256 #define OLD_X 40 + STACKSIZE(%rsp)#define OLD_INCX 48 + STACKSIZE(%rsp)#define OLD_Y 56 + STACKSIZE(%rsp)#define OLD_INCY 64 + STACKSIZE(%rsp)#define OLD_BUFFER 72 + STACKSIZE(%rsp)#define M ARG1#define A ARG3#define LDA ARG4#define X %rdi#define INCX %rsi #define Y %rdx#define INCY %r10#define BUFFER %r11#endif#define TEMP %rax#define I %rax#define IS %r12#define A1 %rbx#define A2 %rbp#define XX %r13#define YY %r14#define NEW_X BUFFER#define NEW_Y X#define ALPHA %xmm0#define xtemp1 %xmm0#define xtemp2 %xmm1#define yy1 %xmm2#define yy2 %xmm3#define atemp1 %xmm4#define atemp2 %xmm5#define atemp3 %xmm6#define atemp4 %xmm7#define xsum1 %xmm8#define xsum2 %xmm9#define xsum3 %xmm10#define xsum4 %xmm11#define a1 %xmm12#define a2 %xmm13#define a3 %xmm14#define xt1 %xmm15 PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp)#ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq 
%rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq OLD_X, X movq OLD_INCX, INCX movq OLD_Y, Y movaps %xmm1, %xmm0#endif movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA testq M, M jle .L999 unpcklpd ALPHA, ALPHA movq BUFFER, XX movq M, %rax sarq $3, %rax jle .L02 ALIGN_3.L01: movsd 0 * SIZE(X), %xmm1 addq INCX, X movhpd 0 * SIZE(X), %xmm1 addq INCX, X movsd 0 * SIZE(X), %xmm2 addq INCX, X movhpd 0 * SIZE(X), %xmm2 addq INCX, X movsd 0 * SIZE(X), %xmm3 addq INCX, X movhpd 0 * SIZE(X), %xmm3 addq INCX, X movsd 0 * SIZE(X), %xmm4 addq INCX, X movhpd 0 * SIZE(X), %xmm4 addq INCX, X mulpd ALPHA, %xmm1 mulpd ALPHA, %xmm2 mulpd ALPHA, %xmm3 mulpd ALPHA, %xmm4 movapd %xmm1, 0 * SIZE(XX) movapd %xmm2, 2 * SIZE(XX) movapd %xmm3, 4 * SIZE(XX) movapd %xmm4, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L01 ALIGN_3.L02: movq M, %rax andq $7, %rax jle .L05 ALIGN_3.L03: movsd 0 * SIZE(X), %xmm1 addq INCX, X mulsd ALPHA, %xmm1 movlpd %xmm1, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L03 ALIGN_3.L05: /* now we don't need original X */ movq Y, NEW_Y addq $512, XX andq $-512, XX cmpq $SIZE, INCY je .L10 movq Y, YY movq XX, NEW_Y movq M, %rax sarq $3, %rax jle .L07 ALIGN_3.L06: movsd 0 * SIZE(YY), %xmm0 addq INCY, YY movhpd 0 * SIZE(YY), %xmm0 addq INCY, YY movsd 0 * SIZE(YY), %xmm1 addq INCY, YY movhpd 0 * SIZE(YY), %xmm1 addq INCY, YY movsd 0 * SIZE(YY), %xmm2 addq INCY, YY movhpd 0 * SIZE(YY), %xmm2 addq INCY, YY movsd 0 * SIZE(YY), %xmm3 addq INCY, YY movhpd 0 * SIZE(YY), %xmm3 addq INCY, YY movapd %xmm0, 0 * SIZE(XX) movapd %xmm1, 2 * SIZE(XX) movapd %xmm2, 4 * SIZE(XX) movapd %xmm3, 6 * SIZE(XX) addq $8 * SIZE, XX decq %rax jg .L06 ALIGN_3.L07: movq M, %rax andq $7, %rax jle .L10 ALIGN_3.L08: movsd 0 
* SIZE(YY), %xmm0 addq INCY, YY movsd %xmm0, 0 * SIZE(XX) addq $1 * SIZE, XX decq %rax jg .L08 ALIGN_3.L10: xorq IS, IS # is = 0 cmpq $4, M jl .L20 ALIGN_3.L11: movq A, A1 leaq (A, LDA, 2), A2 leaq (A, LDA, 4), A#ifdef HAVE_SSE3 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 movddup 2 * SIZE(NEW_X, IS, SIZE), atemp3 movddup 3 * SIZE(NEW_X, IS, SIZE), atemp4#else movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 movsd 2 * SIZE(NEW_X, IS, SIZE), atemp3 movhpd 2 * SIZE(NEW_X, IS, SIZE), atemp3 movsd 3 * SIZE(NEW_X, IS, SIZE), atemp4 movhpd 3 * SIZE(NEW_X, IS, SIZE), atemp4#endif pxor xsum1, xsum1 pxor xsum2, xsum2 pxor xsum3, xsum3 pxor xsum4, xsum4 movapd 0 * SIZE(NEW_X), xtemp1 movapd 2 * SIZE(NEW_X), xtemp2 movsd 0 * SIZE(A1), a1 movhpd 1 * SIZE(A1), a1 movsd 2 * SIZE(A1), a2 movhpd 3 * SIZE(A1), a2 movsd 0 * SIZE(A1, LDA, 1), a3 movhpd 1 * SIZE(A1, LDA, 1), a3 movsd 0 * SIZE(NEW_Y), yy1 movhpd 1 * SIZE(NEW_Y), yy1 movsd 2 * SIZE(NEW_Y), yy2 movhpd 3 * SIZE(NEW_Y), yy2 movq NEW_X, XX movq NEW_Y, YY movq IS, I sarq $3, I jle .L15 ALIGN_3.L12: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy1 movsd 2 * SIZE(A1, LDA, 1), a1 movhpd 3 * SIZE(A1, LDA, 1), a1 PREFETCH PREFETCHSIZE(A1) movapd xtemp2, xt1 mulpd a2, xt1 mulpd atemp1, a2 addpd xt1, xsum1 addpd a2, yy2 movsd 0 * SIZE(A2), a2 movhpd 1 * SIZE(A2), a2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp2, a3 addpd xt1, xsum2 addpd a3, yy1 movsd 2 * SIZE(A2), a3 movhpd 3 * SIZE(A2), a3#if !defined(CORE2) && !defined(PENRYN) PREFETCH PREFETCHSIZE(XX)#endif movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp2, a1 addpd xt1, xsum2 addpd a1, yy2 movsd 0 * SIZE(A2, LDA, 1), a1 movhpd 1 * SIZE(A2, LDA, 1), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp3, a2 addpd xt1, xsum3 addpd a2, yy1 movsd 2 * SIZE(A2, LDA, 1), a2 movhpd 3 * SIZE(A2, LDA, 1), a2 
PREFETCH PREFETCHSIZE(A1, LDA, 1) movapd xtemp2, xt1 mulpd a3, xt1 mulpd atemp3, a3 addpd xt1, xsum3 addpd a3, yy2 movsd 4 * SIZE(A1), a3 movhpd 5 * SIZE(A1), a3 movapd xtemp1, xt1 movapd 4 * SIZE(XX), xtemp1 mulpd a1, xt1 mulpd atemp4, a1 addpd xt1, xsum4 addpd a1, yy1 movsd 6 * SIZE(A1), a1 movhpd 7 * SIZE(A1), a1 movapd xtemp2, xt1 movapd 6 * SIZE(XX), xtemp2 mulpd a2, xt1 mulpd atemp4, a2 addpd xt1, xsum4 addpd a2, yy2 movsd 4 * SIZE(A1, LDA, 1), a2 movhpd 5 * SIZE(A1, LDA, 1), a2 movsd yy1, 0 * SIZE(YY) movhpd yy1, 1 * SIZE(YY) movsd 4 * SIZE(YY), yy1 movhpd 5 * SIZE(YY), yy1 movsd yy2, 2 * SIZE(YY) movhpd yy2, 3 * SIZE(YY) movsd 6 * SIZE(YY), yy2 movhpd 7 * SIZE(YY), yy2 movapd xtemp1, xt1 mulpd a3, xt1 mulpd atemp1, a3 addpd xt1, xsum1 addpd a3, yy1 movsd 6 * SIZE(A1, LDA, 1), a3 movhpd 7 * SIZE(A1, LDA, 1), a3 PREFETCH PREFETCHSIZE(A2) movapd xtemp2, xt1 mulpd a1, xt1 mulpd atemp1, a1 addpd xt1, xsum1 addpd a1, yy2 movsd 4 * SIZE(A2), a1 movhpd 5 * SIZE(A2), a1 movapd xtemp1, xt1 mulpd a2, xt1 mulpd atemp2, a2 addpd xt1, xsum2 addpd a2, yy1 movsd 6 * SIZE(A2), a2 movhpd 7 * SIZE(A2), a2#if !defined(CORE2) && !defined(PENRYN) PREFETCHW PREFETCHSIZE(YY)#endif movapd xtemp2, xt1 mulpd a3, xt1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -