axpy_sse2_opteron.s
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*             By Kazushige Goto <kgoto@tacc.utexas.edu>             */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M       ARG1
#define X       ARG4
#define INCX    ARG5
#define Y       ARG6
#define INCY    ARG2
#else
#define M       ARG1
#define X       ARG2
#define INCX    ARG3
#define Y       ARG4
#define INCY    %r10
#endif

#define YY      %r11
#define ALPHA   %xmm15

#define PREFETCH        prefetch
#define PREFETCHW       prefetchw

#ifdef BARCELONA
#define PREFETCHSIZE (8 * 8 + 4)
#else
#define PREFETCHSIZE (8 * 8 + 4)
#endif

/* axpy kernel: y += alpha * x, double precision, SSE2, tuned for Opteron */

        PROLOGUE
        PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
        movq    8(%rsp), INCY
#else
        movq    24(%rsp), INCY
#endif
        movaps  %xmm0, ALPHA
#else
        movaps  %xmm3, ALPHA

        movq    40(%rsp), X
        movq    48(%rsp), INCX
        movq    56(%rsp), Y
        movq    64(%rsp), INCY
#endif

        SAVEREGISTERS

        unpcklpd ALPHA, ALPHA

        leaq    (, INCX, SIZE), INCX
        leaq    (, INCY, SIZE), INCY

/* both strides must be one element wide to use the vectorized paths */
        cmpq    $SIZE, INCX
        jne     .L40
        cmpq    $SIZE, INCY
        jne     .L40

        subq    $-16 * SIZE, X
        subq    $-16 * SIZE, Y

/* peel one element so that Y becomes 16-byte aligned */
        testq   $SIZE, Y
        je      .L10

        movsd   -16 * SIZE(X), %xmm0
        mulsd   ALPHA, %xmm0
        addsd   -16 * SIZE(Y), %xmm0
        movlpd  %xmm0, -16 * SIZE(Y)

        addq    $1 * SIZE, X
        addq    $1 * SIZE, Y
        decq    M
        jle     .L19
        ALIGN_3

.L10:
/* Y is aligned; choose between aligned and unaligned loads from X */
        testq   $SIZE, X
        jne     .L20

        movq    M,  %rax
        sarq    $4, %rax
        jle     .L13

        movapd  -16 * SIZE(X), %xmm0
        movapd  -14 * SIZE(X), %xmm1
        movapd  -12 * SIZE(X), %xmm2
        movapd  -10 * SIZE(X), %xmm3

        movapd   -8 * SIZE(X), %xmm4
        movapd   -6 * SIZE(X), %xmm5
        movapd   -4 * SIZE(X), %xmm6
        movapd   -2 * SIZE(X), %xmm7

        mulpd   ALPHA, %xmm0
        mulpd   ALPHA, %xmm1
        mulpd   ALPHA, %xmm2
        mulpd   ALPHA, %xmm3

        decq    %rax
        jle     .L12
        ALIGN_3

.L11:
/* main loop: 16 elements per iteration, aligned loads from X */
        addpd   -16 * SIZE(Y), %xmm0
        mulpd   ALPHA, %xmm4
        PREFETCH (PREFETCHSIZE + 0) * SIZE(X)
        addpd   -14 * SIZE(Y), %xmm1
        mulpd   ALPHA, %xmm5
        addpd   -12 * SIZE(Y), %xmm2
        mulpd   ALPHA, %xmm6
        addpd   -10 * SIZE(Y), %xmm3
        mulpd   ALPHA, %xmm7

        PREFETCHW (PREFETCHSIZE + 0) * SIZE(Y)
        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)
        movapd  %xmm2, -12 * SIZE(Y)
        movapd  %xmm3, -10 * SIZE(Y)

        movapd   0 * SIZE(X), %xmm0
        movapd   2 * SIZE(X), %xmm1
        movapd   4 * SIZE(X), %xmm2
        movapd   6 * SIZE(X), %xmm3

        addpd    -8 * SIZE(Y), %xmm4
        mulpd   ALPHA, %xmm0
        PREFETCH (PREFETCHSIZE + 8) * SIZE(X)
        addpd    -6 * SIZE(Y), %xmm5
        mulpd   ALPHA, %xmm1
        addpd    -4 * SIZE(Y), %xmm6
        mulpd   ALPHA, %xmm2
        addpd    -2 * SIZE(Y), %xmm7
        mulpd   ALPHA, %xmm3

        PREFETCHW (PREFETCHSIZE + 8) * SIZE(Y)
        movapd  %xmm4,  -8 * SIZE(Y)
        movapd  %xmm5,  -6 * SIZE(Y)
        movapd  %xmm6,  -4 * SIZE(Y)
        movapd  %xmm7,  -2 * SIZE(Y)

        movapd   8 * SIZE(X), %xmm4
        movapd  10 * SIZE(X), %xmm5
        movapd  12 * SIZE(X), %xmm6
        movapd  14 * SIZE(X), %xmm7

        subq    $-16 * SIZE, X
        subq    $-16 * SIZE, Y

        decq    %rax
        jg      .L11
        ALIGN_3

.L12:
        addpd   -16 * SIZE(Y), %xmm0
        mulpd   ALPHA, %xmm4
        addpd   -14 * SIZE(Y), %xmm1
        mulpd   ALPHA, %xmm5
        addpd   -12 * SIZE(Y), %xmm2
        mulpd   ALPHA, %xmm6
        addpd   -10 * SIZE(Y), %xmm3
        mulpd   ALPHA, %xmm7

        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)
        movapd  %xmm2, -12 * SIZE(Y)
        movapd  %xmm3, -10 * SIZE(Y)

        addpd    -8 * SIZE(Y), %xmm4
        addpd    -6 * SIZE(Y), %xmm5
        addpd    -4 * SIZE(Y), %xmm6
        addpd    -2 * SIZE(Y), %xmm7

        movapd  %xmm4,  -8 * SIZE(Y)
        movapd  %xmm5,  -6 * SIZE(Y)
        movapd  %xmm6,  -4 * SIZE(Y)
        movapd  %xmm7,  -2 * SIZE(Y)

        subq    $-16 * SIZE, X
        subq    $-16 * SIZE, Y
        ALIGN_3

.L13:
/* remainder of the aligned path: 8, 4, 2, then 1 element */
        testq   $15, M
        jle     .L19

        testq   $8, M
        jle     .L14

        movapd  -16 * SIZE(X), %xmm0
        movapd  -14 * SIZE(X), %xmm1
        movapd  -12 * SIZE(X), %xmm2
        movapd  -10 * SIZE(X), %xmm3

        mulpd   ALPHA, %xmm0
        mulpd   ALPHA, %xmm1
        mulpd   ALPHA, %xmm2
        mulpd   ALPHA, %xmm3

        addpd   -16 * SIZE(Y), %xmm0
        addpd   -14 * SIZE(Y), %xmm1
        addpd   -12 * SIZE(Y), %xmm2
        addpd   -10 * SIZE(Y), %xmm3

        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)
        movapd  %xmm2, -12 * SIZE(Y)
        movapd  %xmm3, -10 * SIZE(Y)

        addq    $8 * SIZE, X
        addq    $8 * SIZE, Y
        ALIGN_3

.L14:
        testq   $4, M
        jle     .L15

        movapd  -16 * SIZE(X), %xmm0
        movapd  -14 * SIZE(X), %xmm1

        mulpd   ALPHA, %xmm0
        mulpd   ALPHA, %xmm1

        addpd   -16 * SIZE(Y), %xmm0
        addpd   -14 * SIZE(Y), %xmm1

        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)

        addq    $4 * SIZE, X
        addq    $4 * SIZE, Y
        ALIGN_3

.L15:
        testq   $2, M
        jle     .L16

        movapd  -16 * SIZE(X), %xmm0
        mulpd   ALPHA, %xmm0
        addpd   -16 * SIZE(Y), %xmm0
        movapd  %xmm0, -16 * SIZE(Y)

        addq    $2 * SIZE, X
        addq    $2 * SIZE, Y
        ALIGN_3

.L16:
        testq   $1, M
        jle     .L19

        movlpd  -16 * SIZE(X), %xmm0
        mulsd   ALPHA, %xmm0
        addsd   -16 * SIZE(Y), %xmm0
        movlpd  %xmm0, -16 * SIZE(Y)
        addq    $SIZE, Y
        ALIGN_3

.L19:
        xorq    %rax, %rax

        RESTOREREGISTERS

        ret
        ALIGN_3

.L20:
/* X is misaligned relative to Y: load X with movlpd/movhpd pairs */
        movq    M,  %rax
        sarq    $4, %rax
        jle     .L23

        movlpd  -16 * SIZE(X), %xmm0
        movhpd  -15 * SIZE(X), %xmm0
        movlpd  -14 * SIZE(X), %xmm1
        movhpd  -13 * SIZE(X), %xmm1
        movlpd  -12 * SIZE(X), %xmm2
        movhpd  -11 * SIZE(X), %xmm2
        movlpd  -10 * SIZE(X), %xmm3
        movhpd   -9 * SIZE(X), %xmm3

        movlpd   -8 * SIZE(X), %xmm4
        movhpd   -7 * SIZE(X), %xmm4
        movlpd   -6 * SIZE(X), %xmm5
        movhpd   -5 * SIZE(X), %xmm5
        movlpd   -4 * SIZE(X), %xmm6
        movhpd   -3 * SIZE(X), %xmm6
        movlpd   -2 * SIZE(X), %xmm7
        movhpd   -1 * SIZE(X), %xmm7

        mulpd   ALPHA, %xmm0
        mulpd   ALPHA, %xmm1
        mulpd   ALPHA, %xmm2
        mulpd   ALPHA, %xmm3

        decq    %rax
        jle     .L22
        ALIGN_3

.L21:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(X)

        addpd   -16 * SIZE(Y), %xmm0
        mulpd   ALPHA, %xmm4
        addpd   -14 * SIZE(Y), %xmm1
        mulpd   ALPHA, %xmm5
        addpd   -12 * SIZE(Y), %xmm2
        mulpd   ALPHA, %xmm6
        addpd   -10 * SIZE(Y), %xmm3
        mulpd   ALPHA, %xmm7

        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)
        movapd  %xmm2, -12 * SIZE(Y)
        movapd  %xmm3, -10 * SIZE(Y)

        PREFETCHW (PREFETCHSIZE + 0) * SIZE(Y)

        movlpd   0 * SIZE(X), %xmm0
        movhpd   1 * SIZE(X), %xmm0
        movlpd   2 * SIZE(X), %xmm1
        movhpd   3 * SIZE(X), %xmm1
        movlpd   4 * SIZE(X), %xmm2
        movhpd   5 * SIZE(X), %xmm2
        movlpd   6 * SIZE(X), %xmm3
        movhpd   7 * SIZE(X), %xmm3

        PREFETCH (PREFETCHSIZE + 8) * SIZE(X)

        addpd    -8 * SIZE(Y), %xmm4
        mulpd   ALPHA, %xmm0
        addpd    -6 * SIZE(Y), %xmm5
        mulpd   ALPHA, %xmm1
        addpd    -4 * SIZE(Y), %xmm6
        mulpd   ALPHA, %xmm2
        addpd    -2 * SIZE(Y), %xmm7
        mulpd   ALPHA, %xmm3

        movapd  %xmm4,  -8 * SIZE(Y)
        movapd  %xmm5,  -6 * SIZE(Y)
        movapd  %xmm6,  -4 * SIZE(Y)
        movapd  %xmm7,  -2 * SIZE(Y)

        PREFETCHW (PREFETCHSIZE + 8) * SIZE(Y)

        movlpd   8 * SIZE(X), %xmm4
        movhpd   9 * SIZE(X), %xmm4
        movlpd  10 * SIZE(X), %xmm5
        movhpd  11 * SIZE(X), %xmm5
        movlpd  12 * SIZE(X), %xmm6
        movhpd  13 * SIZE(X), %xmm6
        movlpd  14 * SIZE(X), %xmm7
        movhpd  15 * SIZE(X), %xmm7

        addq    $16 * SIZE, Y
        addq    $16 * SIZE, X
        decq    %rax
        jg      .L21
        ALIGN_3

.L22:
        addpd   -16 * SIZE(Y), %xmm0
        mulpd   ALPHA, %xmm4
        addpd   -14 * SIZE(Y), %xmm1
        mulpd   ALPHA, %xmm5
        addpd   -12 * SIZE(Y), %xmm2
        mulpd   ALPHA, %xmm6
        addpd   -10 * SIZE(Y), %xmm3
        mulpd   ALPHA, %xmm7

        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)
        movapd  %xmm2, -12 * SIZE(Y)
        movapd  %xmm3, -10 * SIZE(Y)

        addpd    -8 * SIZE(Y), %xmm4
        addpd    -6 * SIZE(Y), %xmm5
        addpd    -4 * SIZE(Y), %xmm6
        addpd    -2 * SIZE(Y), %xmm7

        movapd  %xmm4,  -8 * SIZE(Y)
        movapd  %xmm5,  -6 * SIZE(Y)
        movapd  %xmm6,  -4 * SIZE(Y)
        movapd  %xmm7,  -2 * SIZE(Y)

        addq    $16 * SIZE, X
        addq    $16 * SIZE, Y
        ALIGN_3

.L23:
        movq    M, %rax
        andq    $8, %rax
        jle     .L24
        ALIGN_3

        movlpd  -16 * SIZE(X), %xmm0
        movhpd  -15 * SIZE(X), %xmm0
        movlpd  -14 * SIZE(X), %xmm1
        movhpd  -13 * SIZE(X), %xmm1
        movlpd  -12 * SIZE(X), %xmm2
        movhpd  -11 * SIZE(X), %xmm2
        movlpd  -10 * SIZE(X), %xmm3
        movhpd   -9 * SIZE(X), %xmm3

        mulpd   ALPHA, %xmm0
        mulpd   ALPHA, %xmm1
        mulpd   ALPHA, %xmm2
        mulpd   ALPHA, %xmm3

        addpd   -16 * SIZE(Y), %xmm0
        addpd   -14 * SIZE(Y), %xmm1
        addpd   -12 * SIZE(Y), %xmm2
        addpd   -10 * SIZE(Y), %xmm3

        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)
        movapd  %xmm2, -12 * SIZE(Y)
        movapd  %xmm3, -10 * SIZE(Y)

        addq    $8 * SIZE, X
        addq    $8 * SIZE, Y
        ALIGN_3

.L24:
        movq    M, %rax
        andq    $4, %rax
        jle     .L25
        ALIGN_3

        movlpd  -16 * SIZE(X), %xmm0
        movhpd  -15 * SIZE(X), %xmm0
        movlpd  -14 * SIZE(X), %xmm1
        movhpd  -13 * SIZE(X), %xmm1

        mulpd   ALPHA, %xmm0
        mulpd   ALPHA, %xmm1

        addpd   -16 * SIZE(Y), %xmm0
        addpd   -14 * SIZE(Y), %xmm1

        movapd  %xmm0, -16 * SIZE(Y)
        movapd  %xmm1, -14 * SIZE(Y)

        addq    $4 * SIZE, X
        addq    $4 * SIZE, Y
        ALIGN_3

.L25:
        movq    M, %rax
        andq    $2, %rax
        jle     .L26
        ALIGN_3

        movlpd  -16 * SIZE(X), %xmm0
        movhpd  -15 * SIZE(X), %xmm0
        mulpd   ALPHA, %xmm0
        addpd   -16 * SIZE(Y), %xmm0
        movapd  %xmm0, -16 * SIZE(Y)

        addq    $2 * SIZE, X
        addq    $2 * SIZE, Y
        ALIGN_3

.L26:
        movq    M, %rax
        andq    $1, %rax
        jle     .L29
        ALIGN_3

        movlpd  -16 * SIZE(X), %xmm0
        mulsd   ALPHA, %xmm0
        addsd   -16 * SIZE(Y), %xmm0
        movlpd  %xmm0, -16 * SIZE(Y)
        addq    $SIZE, Y
        ALIGN_3

.L29:
        xorq    %rax, %rax

        RESTOREREGISTERS

        ret
        ALIGN_3

.L40:
/* general strided path: 8 elements per iteration, then a scalar tail */
        movq    Y, YY
        movq    M, %rax
        sarq    $3, %rax
        jle     .L45
        ALIGN_3

.L41:
        movlpd  0 * SIZE(X), %xmm0
        addq    INCX, X
        movhpd  0 * SIZE(X), %xmm0
        addq    INCX, X
        mulpd   ALPHA, %xmm0

        movlpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        movhpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        addpd   %xmm6, %xmm0

        movlpd  0 * SIZE(X), %xmm1
        addq    INCX, X
        movhpd  0 * SIZE(X), %xmm1
        addq    INCX, X
        mulpd   ALPHA, %xmm1

        movlpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        movhpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        addpd   %xmm6, %xmm1

        movlpd  0 * SIZE(X), %xmm2
        addq    INCX, X
        movhpd  0 * SIZE(X), %xmm2
        addq    INCX, X
        mulpd   ALPHA, %xmm2

        movlpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        movhpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        addpd   %xmm6, %xmm2

        movlpd  0 * SIZE(X), %xmm3
        addq    INCX, X
        movhpd  0 * SIZE(X), %xmm3
        addq    INCX, X
        mulpd   ALPHA, %xmm3

        movlpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        movhpd  0 * SIZE(YY), %xmm6
        addq    INCY, YY
        addpd   %xmm6, %xmm3

        movlpd  %xmm0, 0 * SIZE(Y)
        addq    INCY, Y
        movhpd  %xmm0, 0 * SIZE(Y)
        addq    INCY, Y
        movlpd  %xmm1, 0 * SIZE(Y)
        addq    INCY, Y
        movhpd  %xmm1, 0 * SIZE(Y)
        addq    INCY, Y
        movlpd  %xmm2, 0 * SIZE(Y)
        addq    INCY, Y
        movhpd  %xmm2, 0 * SIZE(Y)
        addq    INCY, Y
        movlpd  %xmm3, 0 * SIZE(Y)
        addq    INCY, Y
        movhpd  %xmm3, 0 * SIZE(Y)
        addq    INCY, Y

        decq    %rax
        jg      .L41
        ALIGN_3

.L45:
        movq    M, %rax
        andq    $7, %rax
        jle     .L47
        ALIGN_3

.L46:
        movlpd  (X), %xmm0
        addq    INCX, X
        mulsd   %xmm15, %xmm0
        addsd   (Y), %xmm0
        movlpd  %xmm0, (Y)
        addq    INCY, Y

        decq    %rax
        jg      .L46
        ALIGN_3

.L47:
        xorq    %rax, %rax

        RESTOREREGISTERS

        ret

        EPILOGUE
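For reference, the operation this kernel computes is the double-precision AXPY update y <- alpha*x + y over vectors with arbitrary element strides. The C sketch below is not part of the original source; it is a minimal scalar equivalent (the function name daxpy_ref and all variable names are illustrative) showing the semantics that the aligned, unaligned, and strided SSE2 paths above optimize.

#include <stdio.h>

/* Scalar reference for y := alpha * x + y (AXPY semantics).
 * n    - number of elements
 * incx - stride, in elements, between consecutive entries of x
 * incy - stride, in elements, between consecutive entries of y
 * This is a hypothetical helper for illustration, not the kernel's entry point. */
static void daxpy_ref(long n, double alpha,
                      const double *x, long incx,
                      double *y, long incy)
{
    for (long i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}

int main(void)
{
    double x[4] = {1.0, 2.0, 3.0, 4.0};
    double y[4] = {10.0, 20.0, 30.0, 40.0};

    /* unit-stride case, as handled by the fast paths of the kernel */
    daxpy_ref(4, 0.5, x, 1, y, 1);   /* y becomes {10.5, 21.0, 31.5, 42.0} */

    for (int i = 0; i < 4; i++)
        printf("%g\n", y[i]);
    return 0;
}

The assembly splits this single loop into three specializations: when both increments equal one element it uses packed 16-byte loads/stores (with a peeled element to align Y), falling back to movlpd/movhpd pairs when X is misaligned relative to Y, and otherwise walks both vectors with explicit pointer increments as in the scalar loop above.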