📄 zaxpy_sse.s
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define YY	%r11
#define ALPHA	%xmm15

	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	8(%rsp), INCY
#else
	movq	40(%rsp), INCY
#endif
#else
	movaps	%xmm3, %xmm0
	movss	40(%rsp), %xmm1
	movq	48(%rsp), X
	movq	56(%rsp), INCX
	movq	64(%rsp), Y
	movq	72(%rsp), INCY
#endif

	SAVEREGISTERS

/* Splat alpha across two multipliers:
 *   xmm14 = ( ar, ai, ar, ai )  -- multiplies the duplicated real parts of x
 *   xmm15 = (-ai, ar, -ai, ar ) -- multiplies the duplicated imaginary parts
 * With CONJ defined, the signs change so that y += alpha * conj(x). */
#ifndef CONJ
	pshufd	$0, %xmm0, %xmm14
	pshufd	$0, %xmm1, %xmm15

	pxor	%xmm13, %xmm13
	subps	%xmm15, %xmm13

	unpcklps %xmm14, %xmm13
	unpcklps %xmm15, %xmm14
	movaps	%xmm13, %xmm15
#else
	pshufd	$0, %xmm0, %xmm14
	pshufd	$0, %xmm1, %xmm15

	pxor	%xmm13, %xmm13
	subps	%xmm14, %xmm13

	unpcklps %xmm15, %xmm14
	unpcklps %xmm13, %xmm15
#endif

	/* Convert strides from complex elements to bytes */
	leaq	(, INCX, SIZE * 2), INCX
	leaq	(, INCY, SIZE * 2), INCY

	/* Fall back to the general strided code unless both strides are 1 */
	cmpq	$2 * SIZE, INCX
	jne	.L50
	cmpq	$2 * SIZE, INCY
	jne	.L50

	/* Peel one element if X is misaligned by half a 16-byte vector */
	testq	$2 * SIZE, X
	je	.L05

	movsd	0 * SIZE(X), %xmm1
	movsd	0 * SIZE(Y), %xmm8

	movsldup %xmm1, %xmm0
	movshdup %xmm1, %xmm1

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movsd	%xmm8, 0 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	decq	M
	jle	.L999
	ALIGN_2

.L05:
	testq	$1 * SIZE, X
	jne	.L30
	testq	$3 * SIZE, Y
	jne	.L20

/* Both X and Y aligned: main loop, unrolled by 8 complex elements */
	movq	M, %rax
	sarq	$3, %rax
	jle	.L15
	ALIGN_3

.L12:
	movsldup  0 * SIZE(X), %xmm0
	movsldup  4 * SIZE(X), %xmm2
	movsldup  8 * SIZE(X), %xmm4
	movsldup 12 * SIZE(X), %xmm6

	movshdup  0 * SIZE(X), %xmm1
	movshdup  4 * SIZE(X), %xmm3
	movshdup  8 * SIZE(X), %xmm5
	movshdup 12 * SIZE(X), %xmm7

	movaps	 0 * SIZE(Y), %xmm8
	movaps	 4 * SIZE(Y), %xmm9
	movaps	 8 * SIZE(Y), %xmm10
	movaps	12 * SIZE(Y), %xmm11

	mulps	%xmm14, %xmm0
	mulps	%xmm14, %xmm2
	mulps	%xmm14, %xmm4
	mulps	%xmm14, %xmm6

	mulps	%xmm15, %xmm1
	mulps	%xmm15, %xmm3
	mulps	%xmm15, %xmm5
	mulps	%xmm15, %xmm7

	addps	%xmm0, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm4, %xmm10
	addps	%xmm6, %xmm11

	addps	%xmm1, %xmm8
	addps	%xmm3, %xmm9
	addps	%xmm5, %xmm10
	addps	%xmm7, %xmm11

	movaps	%xmm8,   0 * SIZE(Y)
	movaps	%xmm9,   4 * SIZE(Y)
	movaps	%xmm10,  8 * SIZE(Y)
	movaps	%xmm11, 12 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	decq	%rax
	jg	.L12
	ALIGN_3

/* Remainder: 4, 2, then 1 complex element */
.L15:
	testq	$4, M
	jle	.L16

	movsldup 0 * SIZE(X), %xmm0
	movshdup 0 * SIZE(X), %xmm1
	movsldup 4 * SIZE(X), %xmm2
	movshdup 4 * SIZE(X), %xmm3

	movaps	0 * SIZE(Y), %xmm8
	movaps	4 * SIZE(Y), %xmm9

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	mulps	%xmm14, %xmm2
	mulps	%xmm15, %xmm3

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm3, %xmm9

	movaps	%xmm8, 0 * SIZE(Y)
	movaps	%xmm9, 4 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_2

.L16:
	testq	$2, M
	jle	.L17

	movsldup 0 * SIZE(X), %xmm0
	movshdup 0 * SIZE(X), %xmm1

	movaps	0 * SIZE(Y), %xmm8

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movaps	%xmm8, 0 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_2

.L17:
	testq	$1, M
	jle	.L999

	movsldup 0 * SIZE(X), %xmm0
	movshdup 0 * SIZE(X), %xmm1

	movaps	0 * SIZE(Y), %xmm8

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movsd	%xmm8, 0 * SIZE(Y)
	jmp	.L999
	ALIGN_3

/* X aligned, Y unaligned: unaligned loads/stores on Y */
.L20:
	movq	M, %rax
	sarq	$3, %rax
	jle	.L25
	ALIGN_3

.L22:
	movsldup  0 * SIZE(X), %xmm0
	movsldup  4 * SIZE(X), %xmm2
	movsldup  8 * SIZE(X), %xmm4
	movsldup 12 * SIZE(X), %xmm6

	movshdup  0 * SIZE(X), %xmm1
	movshdup  4 * SIZE(X), %xmm3
	movshdup  8 * SIZE(X), %xmm5
	movshdup 12 * SIZE(X), %xmm7

	movsd	 0 * SIZE(Y), %xmm8
	movhps	 2 * SIZE(Y), %xmm8
	movsd	 4 * SIZE(Y), %xmm9
	movhps	 6 * SIZE(Y), %xmm9
	movsd	 8 * SIZE(Y), %xmm10
	movhps	10 * SIZE(Y), %xmm10
	movsd	12 * SIZE(Y), %xmm11
	movhps	14 * SIZE(Y), %xmm11

	mulps	%xmm14, %xmm0
	mulps	%xmm14, %xmm2
	mulps	%xmm14, %xmm4
	mulps	%xmm14, %xmm6

	mulps	%xmm15, %xmm1
	mulps	%xmm15, %xmm3
	mulps	%xmm15, %xmm5
	mulps	%xmm15, %xmm7

	addps	%xmm0, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm4, %xmm10
	addps	%xmm6, %xmm11

	addps	%xmm1, %xmm8
	addps	%xmm3, %xmm9
	addps	%xmm5, %xmm10
	addps	%xmm7, %xmm11

	movlpd	%xmm8,   0 * SIZE(Y)
	movhpd	%xmm8,   2 * SIZE(Y)
	movlpd	%xmm9,   4 * SIZE(Y)
	movhpd	%xmm9,   6 * SIZE(Y)
	movlpd	%xmm10,  8 * SIZE(Y)
	movhpd	%xmm10, 10 * SIZE(Y)
	movlpd	%xmm11, 12 * SIZE(Y)
	movhpd	%xmm11, 14 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	decq	%rax
	jg	.L22
	ALIGN_3

.L25:
	testq	$4, M
	jle	.L26

	movsldup 0 * SIZE(X), %xmm0
	movshdup 0 * SIZE(X), %xmm1
	movsldup 4 * SIZE(X), %xmm2
	movshdup 4 * SIZE(X), %xmm3

	movsd	0 * SIZE(Y), %xmm8
	movhps	2 * SIZE(Y), %xmm8
	movsd	4 * SIZE(Y), %xmm9
	movhps	6 * SIZE(Y), %xmm9

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	mulps	%xmm14, %xmm2
	mulps	%xmm15, %xmm3

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm3, %xmm9

	movlpd	%xmm8, 0 * SIZE(Y)
	movhpd	%xmm8, 2 * SIZE(Y)
	movlpd	%xmm9, 4 * SIZE(Y)
	movhpd	%xmm9, 6 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_2

.L26:
	testq	$2, M
	jle	.L27

	movsldup 0 * SIZE(X), %xmm0
	movshdup 0 * SIZE(X), %xmm1

	movsd	0 * SIZE(Y), %xmm8
	movhps	2 * SIZE(Y), %xmm8

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movlpd	%xmm8, 0 * SIZE(Y)
	movhpd	%xmm8, 2 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_2

.L27:
	testq	$1, M
	jle	.L999

	movsldup 0 * SIZE(X), %xmm0
	movshdup 0 * SIZE(X), %xmm1

	movsd	0 * SIZE(Y), %xmm8

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movsd	%xmm8, 0 * SIZE(Y)
	jmp	.L999
	ALIGN_3

/* X cannot be vector-aligned: unaligned loads on both vectors */
.L30:
	movq	M, %rax
	sarq	$3, %rax
	jle	.L35
	ALIGN_3

.L32:
	movsd	 0 * SIZE(X), %xmm0
	movhps	 2 * SIZE(X), %xmm0
	movsd	 4 * SIZE(X), %xmm2
	movhps	 6 * SIZE(X), %xmm2
	movsd	 8 * SIZE(X), %xmm4
	movhps	10 * SIZE(X), %xmm4
	movsd	12 * SIZE(X), %xmm6
	movhps	14 * SIZE(X), %xmm6

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0
	movshdup %xmm2, %xmm3
	movsldup %xmm2, %xmm2
	movshdup %xmm4, %xmm5
	movsldup %xmm4, %xmm4
	movshdup %xmm6, %xmm7
	movsldup %xmm6, %xmm6

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	mulps	%xmm14, %xmm2
	mulps	%xmm15, %xmm3
	mulps	%xmm14, %xmm4
	mulps	%xmm15, %xmm5
	mulps	%xmm14, %xmm6
	mulps	%xmm15, %xmm7

	movsd	 0 * SIZE(Y), %xmm8
	movhps	 2 * SIZE(Y), %xmm8
	movsd	 4 * SIZE(Y), %xmm9
	movhps	 6 * SIZE(Y), %xmm9
	movsd	 8 * SIZE(Y), %xmm10
	movhps	10 * SIZE(Y), %xmm10
	movsd	12 * SIZE(Y), %xmm11
	movhps	14 * SIZE(Y), %xmm11

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm3, %xmm9
	addps	%xmm4, %xmm10
	addps	%xmm5, %xmm10
	addps	%xmm6, %xmm11
	addps	%xmm7, %xmm11

	movsd	%xmm8,   0 * SIZE(Y)
	movhps	%xmm8,   2 * SIZE(Y)
	movsd	%xmm9,   4 * SIZE(Y)
	movhps	%xmm9,   6 * SIZE(Y)
	movsd	%xmm10,  8 * SIZE(Y)
	movhps	%xmm10, 10 * SIZE(Y)
	movsd	%xmm11, 12 * SIZE(Y)
	movhps	%xmm11, 14 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	decq	%rax
	jg	.L32
	ALIGN_3

.L35:
	testq	$4, M
	jle	.L36

	movsd	0 * SIZE(X), %xmm0
	movhps	2 * SIZE(X), %xmm0
	movsd	4 * SIZE(X), %xmm2
	movhps	6 * SIZE(X), %xmm2

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0
	movshdup %xmm2, %xmm3
	movsldup %xmm2, %xmm2

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	mulps	%xmm14, %xmm2
	mulps	%xmm15, %xmm3

	movsd	0 * SIZE(Y), %xmm8
	movhps	2 * SIZE(Y), %xmm8
	movsd	4 * SIZE(Y), %xmm9
	movhps	6 * SIZE(Y), %xmm9

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm3, %xmm9

	movsd	%xmm8, 0 * SIZE(Y)
	movhps	%xmm8, 2 * SIZE(Y)
	movsd	%xmm9, 4 * SIZE(Y)
	movhps	%xmm9, 6 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L36:
	testq	$2, M
	jle	.L37

	movsd	0 * SIZE(X), %xmm0
	movhps	2 * SIZE(X), %xmm0

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1

	movsd	0 * SIZE(Y), %xmm8
	movhps	2 * SIZE(Y), %xmm8

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movsd	%xmm8, 0 * SIZE(Y)
	movhps	%xmm8, 2 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L37:
	testq	$1, M
	jle	.L999

	movsd	0 * SIZE(X), %xmm0

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1

	movsd	0 * SIZE(Y), %xmm8

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movsd	%xmm8, 0 * SIZE(Y)
	jmp	.L999
	ALIGN_3

/* General strided path; YY trails Y as the store pointer */
.L50:
	movq	Y, YY
	movq	M, %rax
	sarq	$3, %rax
	jle	.L55
	ALIGN_3

.L52:
	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm2
	addq	INCX, X
	movhps	(X), %xmm2
	addq	INCX, X
	movsd	(X), %xmm4
	addq	INCX, X
	movhps	(X), %xmm4
	addq	INCX, X
	movsd	(X), %xmm6
	addq	INCX, X
	movhps	(X), %xmm6
	addq	INCX, X

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0
	movshdup %xmm2, %xmm3
	movsldup %xmm2, %xmm2
	movshdup %xmm4, %xmm5
	movsldup %xmm4, %xmm4
	movshdup %xmm6, %xmm7
	movsldup %xmm6, %xmm6

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	mulps	%xmm14, %xmm2
	mulps	%xmm15, %xmm3
	mulps	%xmm14, %xmm4
	mulps	%xmm15, %xmm5
	mulps	%xmm14, %xmm6
	mulps	%xmm15, %xmm7

	movsd	(Y), %xmm8
	addq	INCY, Y
	movhps	(Y), %xmm8
	addq	INCY, Y
	movsd	(Y), %xmm9
	addq	INCY, Y
	movhps	(Y), %xmm9
	addq	INCY, Y
	movsd	(Y), %xmm10
	addq	INCY, Y
	movhps	(Y), %xmm10
	addq	INCY, Y
	movsd	(Y), %xmm11
	addq	INCY, Y
	movhps	(Y), %xmm11
	addq	INCY, Y

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm3, %xmm9
	addps	%xmm4, %xmm10
	addps	%xmm5, %xmm10
	addps	%xmm6, %xmm11
	addps	%xmm7, %xmm11

	movsd	%xmm8, (YY)
	addq	INCY, YY
	movhps	%xmm8, (YY)
	addq	INCY, YY
	movsd	%xmm9, (YY)
	addq	INCY, YY
	movhps	%xmm9, (YY)
	addq	INCY, YY
	movsd	%xmm10, (YY)
	addq	INCY, YY
	movhps	%xmm10, (YY)
	addq	INCY, YY
	movsd	%xmm11, (YY)
	addq	INCY, YY
	movhps	%xmm11, (YY)
	addq	INCY, YY

	decq	%rax
	jg	.L52
	ALIGN_3

.L55:
	testq	$4, M
	jle	.L56

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm2
	addq	INCX, X
	movhps	(X), %xmm2
	addq	INCX, X

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0
	movshdup %xmm2, %xmm3
	movsldup %xmm2, %xmm2

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1
	mulps	%xmm14, %xmm2
	mulps	%xmm15, %xmm3

	movsd	(Y), %xmm8
	addq	INCY, Y
	movhps	(Y), %xmm8
	addq	INCY, Y
	movsd	(Y), %xmm9
	addq	INCY, Y
	movhps	(Y), %xmm9
	addq	INCY, Y

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8
	addps	%xmm2, %xmm9
	addps	%xmm3, %xmm9

	movsd	%xmm8, (YY)
	addq	INCY, YY
	movhps	%xmm8, (YY)
	addq	INCY, YY
	movsd	%xmm9, (YY)
	addq	INCY, YY
	movhps	%xmm9, (YY)
	addq	INCY, YY
	ALIGN_3

.L56:
	testq	$2, M
	jle	.L57

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1

	movsd	(Y), %xmm8
	addq	INCY, Y
	movhps	(Y), %xmm8
	addq	INCY, Y

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movsd	%xmm8, (YY)
	addq	INCY, YY
	movhps	%xmm8, (YY)
	addq	INCY, YY
	ALIGN_3

.L57:
	testq	$1, M
	jle	.L999

	movsd	(X), %xmm0

	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0

	mulps	%xmm14, %xmm0
	mulps	%xmm15, %xmm1

	movsd	(Y), %xmm8

	addps	%xmm0, %xmm8
	addps	%xmm1, %xmm8

	movsd	%xmm8, (Y)
	ALIGN_3

.L999:
	xorq	%rax, %rax

	RESTOREREGISTERS
	ret

	EPILOGUE
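Note: the kernel above performs the single-precision complex AXPY update, y <- alpha*x + y (or y <- alpha*conj(x) + y when CONJ is defined), with strides given in complex elements. For reference, a minimal scalar C sketch of the same semantics; the name caxpy_ref and its exact signature are illustrative, not the kernel's real entry point:

#include <stddef.h>

/* Reference semantics: y[i] += alpha * x[i], or alpha * conj(x[i])
 * when `conj` is nonzero.  Each complex element is two consecutive
 * floats (re, im); incx/incy are strides in complex elements. */
static void caxpy_ref(size_t n, float ar, float ai,
                      const float *x, ptrdiff_t incx,
                      float *y, ptrdiff_t incy, int conj)
{
    for (size_t i = 0; i < n; i++) {
        float xr = x[0];
        float xi = conj ? -x[1] : x[1];
        y[0] += ar * xr - ai * xi;   /* real part */
        y[1] += ai * xr + ar * xi;   /* imaginary part */
        x += 2 * incx;
        y += 2 * incy;
    }
}

The vectorized paths compute exactly these two fused terms: xmm14 scales the duplicated real parts of x (movsldup) and xmm15 the duplicated imaginary parts (movshdup), so each pair of addps instructions accumulates one complex multiply-add into y.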