/* symv_u_sse.s */
/*********************************************************************/
/* Optimized BLAS libraries                                          */
/* By Kazushige Goto <kgoto@tacc.utexas.edu>                         */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Single-precision symmetric matrix-vector multiply kernel, SSE
 * ("upper" variant, per the file name) -- presumably y += alpha*A*x;
 * TODO confirm against the BLAS driver that calls it.
 *
 * NOTE(review): this copy is an incomplete web capture.  The unrolled
 * .L12 loop below ends mid-stream, and the jump targets .L14, .L20 and
 * .L999 referenced here are not present in this copy.  Only line
 * structure and comments were restored; every instruction token is
 * exactly as captured.
 */

#define ASSEMBLER
#include "common.h"

/* Per-CPU prefetch instruction selection and lookahead distance. */
#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENRYN
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps		/* movlps is faster than movsd on Opteron */
#endif

#ifdef BARCELONA
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

/* Argument mapping per ABI (ARGn come from common.h). */
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_INCY	 8 + STACKSIZE(%rsp)
#define OLD_BUFFER	16 + STACKSIZE(%rsp)

#define M		ARG1
#define A		ARG2
#define LDA		ARG3
#define X		ARG4
#define INCX		ARG5
#define Y		ARG6
#define INCY		%r10
#define BUFFER		%r11

#else

#define STACKSIZE	256

#define OLD_X		40 + STACKSIZE(%rsp)
#define OLD_INCX	48 + STACKSIZE(%rsp)
#define OLD_Y		56 + STACKSIZE(%rsp)
#define OLD_INCY	64 + STACKSIZE(%rsp)
#define OLD_BUFFER	72 + STACKSIZE(%rsp)

#define M		ARG1
#define A		ARG3
#define LDA		ARG4
#define X		%rdi
#define INCX		%rsi
#define Y		%rdx
#define INCY		%r10
#define BUFFER		%r11

#endif

/* Symbolic register names.  Note the deliberate aliases: TEMP/I share
 * %rax, ALPHA/atemp1 share %xmm0, and NEW_Y reuses the X register once
 * the original x has been consumed. */
#define TEMP	%rax
#define I	%rax
#define IS	%r12
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define atemp1	%xmm0
#define atemp2	%xmm1
#define atemp3	%xmm2
#define atemp4	%xmm3

#define xsum1	%xmm4
#define xsum2	%xmm5
#define xsum3	%xmm6
#define xsum4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define yy1	%xmm10
#define xt1	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define a4	%xmm15

	PROLOGUE
	PROFCODE

	/* Save callee-saved GPRs (and, on Windows, xmm6-15 + home args). */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_X,    X
	movq	OLD_INCX, INCX
	movq	OLD_Y,    Y
	movaps	%xmm1, %xmm0		/* alpha arrives in %xmm1 on Win64 */
#endif

	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert element strides to byte strides. */
	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999			/* nothing to do for m <= 0 */

	shufps	$0, ALPHA, ALPHA	/* broadcast alpha to all 4 lanes */

	/* Stage 1: copy alpha * x into BUFFER, 8 elements per iteration. */
	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Remainder of the alpha*x copy: m % 8 single elements. */
	movq	M,  %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	%xmm1, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y		/* NEW_Y aliases the X register */

	/* Round XX up to the next 512-byte boundary; the y working copy
	 * goes there when it has to be packed. */
	addq	$512, XX
	andq	$-512, XX

	/* Stage 2: if y is not contiguous, copy it into the buffer too. */
	cmpq	$SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	%xmm0, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	/* Stage 3: main computation.  IS is the current column-block
	 * offset; each .L11 pass handles 4 columns (A advances by 4*LDA).
	 * Blocks of fewer than 4 columns go to .L20 (not in this copy). */
	xorq	IS, IS			# is = 0

	cmpq	$4, M
	jl	.L20
	ALIGN_3

.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A		/* A += 4 columns for next pass */

	/* atemp1..4 = the 4 scaled-x values for this column block. */
	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	/* xsum1..4 accumulate the dot products for the 4 columns. */
	pxor	xsum1, xsum1
	pxor	xsum2, xsum2
	pxor	xsum3, xsum3
	pxor	xsum4, xsum4

	movaps	0 * SIZE(NEW_X), xtemp1
	movaps	4 * SIZE(NEW_X), xtemp2

	/* Prime a1..a4 with the first 4 rows of each of the 4 columns. */
	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1
	movsd	0 * SIZE(A1, LDA, 1), a2
	movhps	2 * SIZE(A1, LDA, 1), a2
	movsd	0 * SIZE(A2), a3
	movhps	2 * SIZE(A2), a3
	movsd	0 * SIZE(A2, LDA, 1), a4
	movhps	2 * SIZE(A2, LDA, 1), a4

	movsd	0 * SIZE(NEW_Y), yy1
	movhps	2 * SIZE(NEW_Y), yy1

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$4, I			/* 16 rows per .L12 iteration */
	jle	.L14
	ALIGN_3

.L12:
	/* Each 4-instruction cluster does, for one column c:
	 *   xsum_c += A(:,c) .* x      (dot-product contribution)
	 *   yy1    += atemp_c * A(:,c) (axpy contribution)
	 * then reloads a_c for the next 4 rows. */
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	/* Write back rows 0-3 of y, reload rows 4-7. */
	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 8 * SIZE(YY)
	movhps	yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	/* NOTE(review): capture truncated here -- the rest of the .L12
	 * iteration, the loop tail, .L14/.L20 cleanup paths and the
	 * EPILOGUE are missing from this copy. */
/* NOTE(review): the original web capture ends mid-way through the .L12
 * loop above; the remainder of the kernel (loop tail, the .L14/.L20
 * small-block paths, the .L999 exit and the register-restore epilogue)
 * is missing from this copy and must be recovered from the upstream
 * Goto BLAS / OpenBLAS sources before this file can assemble. */