gemv_t_sse2.s
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define P 8192

#ifdef PENTIUM4
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchnta
#define PREFETCHSIZE	32
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 5)
#endif

#ifdef PENRYN
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 5)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 5)
#endif

#define STACKSIZE	128

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define BUFFER		32 + STACKSIZE(%rsp)

#define NLDA	48(%rsp)
#define J	56(%rsp)

#define M	%rdi
#define N	%rsi
#define A	%rcx
#define LDA	%r8
#define X	%r9
#define INCX	%rdx
#define Y	%rbp
#define INCY	%r10

#define TEMP	%rax
#define I	%rax
#define MIN_M	%rbx
#define IS	%r11

#define AO1	%r12
#define AO2	%r13
#define AO3	%rdi
#define AO4	%r11

#define BO	%r14
#define CO	%r15

#define ALPHA	%xmm15

#ifdef OPTERON
#define movsd	movlpd
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	/* Save callee-saved registers */
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	/* Fetch the stack-passed arguments */
	movq	OLD_INCX, INCX
	movq	OLD_Y,    Y
	movq	OLD_INCY, INCY

	/* Convert element strides to byte strides */
	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	movapd	%xmm0, ALPHA

	/* Quick return if M <= 0 or N <= 0 */
	testq	M, M
	jle	.L999
	testq	N, N
	jle	.L999

	movq	N, TEMP
	imulq	LDA, TEMP
	movq	$P, BO
	subq	TEMP, BO
	leaq	(, BO,  SIZE), BO
	leaq	(, LDA, SIZE), LDA

	xor	IS, IS

	movq	BO, NLDA

	/* If LDA or A is not 16-byte aligned, use the alternate kernels */
	testq	$SIZE, LDA
	jne	.L100

	testq	$SIZE, A
	jne	.L50
	ALIGN_3

.L10:
	/* MIN_M = min(M - IS, P): rows handled in the current strip */
	movq	$P, TEMP
	movq	M, MIN_M
	subq	IS, MIN_M
	cmpq	TEMP, MIN_M
	cmovg	TEMP, MIN_M

	movq	BUFFER, BO

	movq	MIN_M, I
	sarq	$3, I
	jle	.L15
	ALIGN_3

.L12:
	/* Pack 8 elements of the strided vector x into the
	   contiguous, 16-byte aligned buffer */
	movsd	(X), %xmm0
	addq	INCX, X
	movhpd	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm2
	addq	INCX, X
	movhpd	(X), %xmm2
	addq	INCX, X
	movsd	(X), %xmm4
	addq	INCX, X
	movhpd	(X), %xmm4
	addq	INCX, X
	movsd	(X), %xmm6
	addq	INCX, X
	movhpd	(X), %xmm6
	addq	INCX, X

	movapd	%xmm0, 0 * SIZE(BO)
	movapd	%xmm2, 2 * SIZE(BO)
	movapd	%xmm4, 4 * SIZE(BO)
	movapd	%xmm6, 6 * SIZE(BO)

	addq	$8 * SIZE, BO
	decq	I
	jg	.L12
	ALIGN_3

.L15:
	/* Pack the remaining (MIN_M mod 8) elements one at a time */
	movq	MIN_M, I
	andq	$7, I
	jle	.L20
	ALIGN_2

.L16:
	movsd	(X), %xmm0
	addq	INCX, X
	movsd	%xmm0, 0 * SIZE(BO)
	addq	$SIZE, BO
	decq	I
	jg	.L16
	ALIGN_3

.L20:
	movq	Y, CO

	movq	N, J
	sarq	$2, J
	jle	.L30
	ALIGN_3

.L21:
	/* Process 4 columns per iteration: AO1, AO2, AO1 + 2*LDA and
	   AO2 + 2*LDA point at the current columns; xmm0-xmm3
	   accumulate the 4 partial dot products */
	movq	A, AO1
	leaq	(A, LDA, 1), AO2
	leaq	(A, LDA, 4), A

	movq	BUFFER, BO

	movapd	0 * SIZE(BO), %xmm12
	pxor	%xmm0, %xmm0
	movapd	2 * SIZE(BO), %xmm13
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

#if defined(CORE2) || defined(PENRYN)
	PREFETCHW	4 * SIZE(CO)
#endif

	movq	MIN_M, I
	sarq	$4, I
	jle	.L24

	/* Preload the first 4 rows of each column (software pipelining) */
	movapd	0 * SIZE(AO1), %xmm4
	movapd	0 * SIZE(AO2), %xmm5
	movapd	0 * SIZE(AO1, LDA, 2), %xmm6
	movapd	0 * SIZE(AO2, LDA, 2), %xmm7
	movapd	2 * SIZE(AO1), %xmm8
	movapd	2 * SIZE(AO2), %xmm9
	movapd	2 * SIZE(AO1, LDA, 2), %xmm10
	movapd	2 * SIZE(AO2, LDA, 2), %xmm11

	decq	I
	jle	.L23
	ALIGN_3

.L22:
	/* Unrolled main loop: 16 rows per iteration, 2 rows per
	   packed multiply-add, loads issued ahead of use */
	PREFETCH	PREFETCHSIZE * SIZE(AO1)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	4 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	4 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	4 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	4 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	4 * SIZE(AO2, LDA, 2), %xmm7

#if defined(CORE2) || defined(PENRYN)
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(AO1)
#endif

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	6 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	6 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	6 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	6 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	6 * SIZE(AO2, LDA, 2), %xmm11

	PREFETCH	PREFETCHSIZE * SIZE(AO2)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	8 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	8 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	8 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	8 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	8 * SIZE(AO2, LDA, 2), %xmm7

#if defined(CORE2) || defined(PENRYN)
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(AO2)
#endif

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	10 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	10 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	10 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	10 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	10 * SIZE(AO2, LDA, 2), %xmm11

	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA, 2)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	12 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	12 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	12 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	12 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	12 * SIZE(AO2, LDA, 2), %xmm7

#if defined(CORE2) || defined(PENRYN)
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(AO1, LDA, 2)
#endif

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	14 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	14 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	14 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	14 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	14 * SIZE(AO2, LDA, 2), %xmm11

	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA, 2)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	16 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	16 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	16 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	16 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	16 * SIZE(AO2, LDA, 2), %xmm7

#if defined(CORE2) || defined(PENRYN)
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(AO2, LDA, 2)
#endif

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	18 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	18 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	18 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	18 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	18 * SIZE(AO2, LDA, 2), %xmm11

	subq	$-16 * SIZE, AO1
	subq	$-16 * SIZE, AO2
	subq	$-16 * SIZE, BO

	decq	I
	jg	.L22
	ALIGN_3

.L23:
	/* Final 16-row block of the pipelined loop */
	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	4 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	4 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	4 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	4 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	4 * SIZE(AO2, LDA, 2), %xmm7

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	6 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	6 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	6 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	6 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	6 * SIZE(AO2, LDA, 2), %xmm11

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	8 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	8 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	8 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	8 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	8 * SIZE(AO2, LDA, 2), %xmm7

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	10 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	10 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	10 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	10 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	10 * SIZE(AO2, LDA, 2), %xmm11

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	12 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	12 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	12 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	12 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	12 * SIZE(AO2, LDA, 2), %xmm7

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	14 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	14 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	14 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	14 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	14 * SIZE(AO2, LDA, 2), %xmm11

	mulpd	%xmm12, %xmm4
	subq	$-16 * SIZE, AO1
	addpd	%xmm4,  %xmm0
	subq	$-16 * SIZE, AO2
	mulpd	%xmm12, %xmm5
	subq	$-16 * SIZE, BO
	addpd	%xmm5,  %xmm1
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	mulpd	%xmm12, %xmm7
	movapd	0 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm13, %xmm11
	movapd	2 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3

.L24:
	/* Remainder: 8 rows */
	movq	MIN_M, I
	andq	$8, I
	je	.L25

	movapd	0 * SIZE(AO1), %xmm4
	movapd	0 * SIZE(AO2), %xmm5
	movapd	0 * SIZE(AO1, LDA, 2), %xmm6
	movapd	0 * SIZE(AO2, LDA, 2), %xmm7
	movapd	2 * SIZE(AO1), %xmm8
	movapd	2 * SIZE(AO2), %xmm9
	movapd	2 * SIZE(AO1, LDA, 2), %xmm10
	movapd	2 * SIZE(AO2, LDA, 2), %xmm11

	mulpd	%xmm12, %xmm4
	addpd	%xmm4,  %xmm0
	movapd	4 * SIZE(AO1), %xmm4
	mulpd	%xmm12, %xmm5
	addpd	%xmm5,  %xmm1
	movapd	4 * SIZE(AO2), %xmm5
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	movapd	4 * SIZE(AO1, LDA, 2), %xmm6
	mulpd	%xmm12, %xmm7
	movapd	4 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3
	movapd	4 * SIZE(AO2, LDA, 2), %xmm7

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	movapd	6 * SIZE(AO1), %xmm8
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	movapd	6 * SIZE(AO2), %xmm9
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	movapd	6 * SIZE(AO1, LDA, 2), %xmm10
	mulpd	%xmm13, %xmm11
	movapd	6 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3
	movapd	6 * SIZE(AO2, LDA, 2), %xmm11

	mulpd	%xmm12, %xmm4
	subq	$-8 * SIZE, AO1
	addpd	%xmm4,  %xmm0
	subq	$-8 * SIZE, AO2
	mulpd	%xmm12, %xmm5
	subq	$-8 * SIZE, BO
	addpd	%xmm5,  %xmm1
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	mulpd	%xmm12, %xmm7
	movapd	0 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm13, %xmm11
	movapd	2 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3

.L25:
	/* Remainder: 4 rows */
	movq	MIN_M, I
	andq	$4, I
	je	.L26

	movapd	0 * SIZE(AO1), %xmm4
	movapd	0 * SIZE(AO2), %xmm5
	movapd	0 * SIZE(AO1, LDA, 2), %xmm6
	movapd	0 * SIZE(AO2, LDA, 2), %xmm7
	movapd	2 * SIZE(AO1), %xmm8
	movapd	2 * SIZE(AO2), %xmm9
	movapd	2 * SIZE(AO1, LDA, 2), %xmm10
	movapd	2 * SIZE(AO2, LDA, 2), %xmm11

	mulpd	%xmm12, %xmm4
	subq	$-4 * SIZE, AO1
	addpd	%xmm4,  %xmm0
	subq	$-4 * SIZE, AO2
	mulpd	%xmm12, %xmm5
	subq	$-4 * SIZE, BO
	addpd	%xmm5,  %xmm1
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	mulpd	%xmm12, %xmm7
	movapd	0 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3

	mulpd	%xmm13, %xmm8
	addpd	%xmm8,  %xmm0
	mulpd	%xmm13, %xmm9
	addpd	%xmm9,  %xmm1
	mulpd	%xmm13, %xmm10
	addpd	%xmm10, %xmm2
	mulpd	%xmm13, %xmm11
	movapd	2 * SIZE(BO), %xmm13
	addpd	%xmm11, %xmm3

.L26:
	/* Remainder: 2 rows */
	movq	MIN_M, I
	andq	$2, I
	je	.L27

	movapd	0 * SIZE(AO1), %xmm4
	movapd	0 * SIZE(AO2), %xmm5
	movapd	0 * SIZE(AO1, LDA, 2), %xmm6
	movapd	0 * SIZE(AO2, LDA, 2), %xmm7

	mulpd	%xmm12, %xmm4
	subq	$-2 * SIZE, AO1
	addpd	%xmm4,  %xmm0
	subq	$-2 * SIZE, AO2
	mulpd	%xmm12, %xmm5
	subq	$-2 * SIZE, BO
	addpd	%xmm5,  %xmm1
	mulpd	%xmm12, %xmm6
	addpd	%xmm6,  %xmm2
	mulpd	%xmm12, %xmm7
	movapd	0 * SIZE(BO), %xmm12
	addpd	%xmm7,  %xmm3

.L27:
	/* Remainder: 1 row (scalar) */
	movq	MIN_M, I
	andq	$1, I
	je	.L29

	movsd	0 * SIZE(AO1), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0

	movsd	0 * SIZE(AO2), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm1

	movsd	0 * SIZE(AO1, LDA, 2), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm2

	movsd	0 * SIZE(AO2, LDA, 2), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm3

.L29:
	/* Horizontal add of each 2-wide accumulator, scale by alpha,
	   then accumulate the 4 results into y (stride INCY) */
#ifdef HAVE_SSE3
	haddpd	%xmm0, %xmm0
	haddpd	%xmm1, %xmm1
	haddpd	%xmm2, %xmm2
	haddpd	%xmm3, %xmm3
#else
	movapd	%xmm0, %xmm9
	movapd	%xmm1, %xmm10
	movapd	%xmm2, %xmm11
	movapd	%xmm3, %xmm12

	unpckhpd %xmm0, %xmm0
	unpckhpd %xmm1, %xmm1
	unpckhpd %xmm2, %xmm2
	unpckhpd %xmm3, %xmm3

	addsd	%xmm9,  %xmm0
	addsd	%xmm10, %xmm1
	addsd	%xmm11, %xmm2
	addsd	%xmm12, %xmm3
#endif

	mulsd	ALPHA, %xmm0
	mulsd	ALPHA, %xmm1
	mulsd	ALPHA, %xmm2
	mulsd	ALPHA, %xmm3

	movq	CO, TEMP
	addsd	(TEMP), %xmm0
	addq	INCY, TEMP
	addsd	(TEMP), %xmm1
	addq	INCY, TEMP
	addsd	(TEMP), %xmm2
	addq	INCY, TEMP
	addsd	(TEMP), %xmm3

	movsd	%xmm0, (CO)
	addq	INCY, CO
	movsd	%xmm1, (CO)
	addq	INCY, CO
	movsd	%xmm2, (CO)
	addq	INCY, CO
	movsd	%xmm3, (CO)
	addq	INCY, CO

	decq	J
	jg	.L21
	ALIGN_3

.L30:
	movq	N, J
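
/* For orientation, below is a minimal C sketch of the operation this
   kernel implements, assuming the standard BLAS transposed-GEMV
   semantics suggested by the file name: y := alpha * A^T * x + y, with
   column-major A and positive element strides incx/incy. The function
   name dgemv_t_ref and its exact signature are illustrative only, not
   part of this source. */

#if 0
#include <stddef.h>

/* Reference model: one dot product of a column of A against x per
 * output element, scaled by alpha and accumulated into strided y. */
static void dgemv_t_ref(size_t m, size_t n, double alpha,
                        const double *a, size_t lda,
                        const double *x, size_t incx,
                        double *y, size_t incy)
{
    for (size_t j = 0; j < n; j++) {          /* one column of A per y element */
        double sum = 0.0;
        for (size_t i = 0; i < m; i++)
            sum += a[i + j * lda] * x[i * incx];
        y[j * incy] += alpha * sum;           /* scale and accumulate into y */
    }
}
#endif

/* The assembly above gets its speed from two ideas the C model makes
   explicit: x is packed once per strip into a contiguous 16-byte
   aligned buffer (the .L12/.L16 loops), and four column dot products
   are then carried in parallel with packed SSE2 multiply-adds
   (the .L22 loop). */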