📄 zrot.s
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Plane rotation kernel (Alpha assembly, run through the C preprocessor).
 * The exported symbol is created by the PROLOGUE macro, which is not
 * defined in this file.
 *
 * For every scalar pair (x, y) taken from X and Y the code computes
 *     new_x = C * x + S * y
 *     new_y = C * y - S * x
 * and stores both results back in place.  Two consecutive scalars are
 * handled per element and both increments are doubled on entry, which is
 * consistent with interleaved (re, im) complex data and real C and S
 * -- TODO confirm against the C prototype.
 *
 * Inputs as read by the code:
 *   $16  N       element count; N <= 0 returns immediately
 *   $17  X       first vector
 *   $18  INCX    increment of X in elements (doubled on entry)
 *   $19  Y       second vector
 *   $20  INCY    increment of Y in elements (doubled on entry)
 *   $f21         C (copied to $f10)
 *   0($sp)       S (loaded into $f11)
 * Output:  $0 is cleared before every ret.
 * Scratch: $21, $23, $24, $f10-$f19, $f21-$f28.  $sp is never adjusted.
 *
 * LD, ST, MUL, ADD, SUB, SIZE, SXADDQ, PROLOGUE, PROFCODE and EPILOGUE
 * are not defined here; presumably they come from common.h and select the
 * precision-specific opcodes and the scalar size -- verify there.
 *
 * Layout: one statement per line.  Every cpp directive, assembler
 * directive and label needs its own line; the instruction order below is
 * deliberate (software pipelined) and must not be rearranged casually.
 */

#define ASSEMBLER
#include "common.h"
#include "version.h"

/* Integer argument registers. */
#define N	$16
#define X	$17
#define INCX	$18
#define Y	$19
#define INCY	$20
/* Loop counter. */
#define I	$21

/* Store pointers for the strided path.  $23 and $24 are also used as   */
/* plain temporaries for the stride test before XX / YY become live.    */
#define XX	$23
#define YY	$24

/* Rotation coefficients. */
#define C	$f10
#define S	$f11

/* Look-ahead distance, in scalars, of the lds touches in the main loop. */
#define PREFETCH_SIZE 80

	PROLOGUE
	PROFCODE
	.frame	$sp, 0, $26, 0

#ifndef PROFILE
	.prologue 0
#else
	.prologue 1
#endif

	fmov	$f21, C			/* C arrives in $f21 */
	LD	S, 0($sp)		/* S arrives on the stack */

	addq	INCX, INCX, INCX	/* INCX *= 2: two scalars per element */
	addq	INCY, INCY, INCY	/* INCY *= 2 */

	cmpeq	INCX, 2, $23		/* $23 = (doubled INCX == 2) */
	cmpeq	INCY, 2, $24		/* $24 = (doubled INCY == 2) */
	ble	N, $L998		/* nothing to do for N <= 0 */

	and	$23, $24, $23		/* both vectors contiguous? */
	beq	$23, $L50		/* no: take the strided path */

/* ------------------------------------------------------------------ */
/* Contiguous path: 4 elements (8 scalars of X and of Y) per block.    */
/* ------------------------------------------------------------------ */
	sra	N, 2, I			/* I = N / 4 full blocks */
	ble	I, $L15

	/* Pipeline fill: load scalars 0..3 of both vectors. */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	LD	$f15, 1*SIZE(Y)

	LD	$f16, 2*SIZE(X)
	LD	$f17, 2*SIZE(Y)
	LD	$f18, 3*SIZE(X)
	LD	$f19, 3*SIZE(Y)

	MUL	C, $f12, $f21		/* C * x[0] */
	unop
	MUL	S, $f13, $f22		/* S * y[0] */
	MUL	C, $f13, $f23		/* C * y[0] */

	LD	$f13, 4*SIZE(Y)
	MUL	S, $f12, $f24		/* S * x[0] */
	LD	$f12, 4*SIZE(X)
	MUL	C, $f14, $f25		/* C * x[1] */

	lda	I, -1(I)
	MUL	S, $f15, $f26		/* S * y[1] */
	ADD	$f21, $f22, $f22	/* new x[0] */
	MUL	C, $f15, $f27		/* C * y[1] */

	LD	$f15, 5*SIZE(Y)
	MUL	S, $f14, $f28		/* S * x[1] */
	SUB	$f23, $f24, $f24	/* new y[0] */
	ble	I, $L13			/* only one block: go drain it */
	.align 4

/*
 * Steady-state loop.  State on entry (offsets relative to current X, Y):
 *   $f22 = new x[0], $f24 = new y[0]            ready to be stored
 *   $f25 = C*x[1], $f26 = S*y[1],
 *   $f27 = C*y[1], $f28 = S*x[1]                awaiting ADD / SUB
 *   $f16 = x[2], $f17 = y[2], $f18 = x[3], $f19 = y[3]
 *   $f12 = x[4], $f13 = y[4], $f15 = y[5]       already loaded
 * Each pass stores scalars 0..7, advances X and Y by 8 scalars and leaves
 * the same state for the next block.  Loads with offsets 8..13 read into
 * the following block, which exists because the loop is only entered or
 * repeated while I > 0.
 * The lds into $f31 are loads whose result is never used in this file;
 * NOTE(review): presumably prefetch hints, $f31 being the Alpha zero
 * register.  The unop fillers presumably keep the four-instruction groups
 * aligned for issue -- do not remove without measuring.
 */
$L12:
	MUL	C, $f16, $f21		/* C * x[2] */
	lds	$f31, (PREFETCH_SIZE) * SIZE(X)
	unop
	LD	$f14, 5*SIZE(X)

	ST	$f22, 0*SIZE(X)
	MUL	S, $f17, $f22		/* S * y[2] */
	unop
	ADD	$f25, $f26, $f26	/* new x[1] */

	MUL	C, $f17, $f23		/* C * y[2] */
	lds	$f31, (PREFETCH_SIZE) * SIZE(Y)
	unop
	LD	$f17, 6*SIZE(Y)

	ST	$f24, 0*SIZE(Y)
	MUL	S, $f16, $f24		/* S * x[2] */
	unop
	SUB	$f27, $f28, $f28	/* new y[1] */

	MUL	C, $f18, $f25		/* C * x[3] */
	LD	$f16, 6*SIZE(X)
	unop
	unop

	ST	$f26, 1*SIZE(X)
	MUL	S, $f19, $f26		/* S * y[3] */
	unop
	ADD	$f21, $f22, $f22	/* new x[2] */

	MUL	C, $f19, $f27		/* C * y[3] */
	unop
	unop
	LD	$f19, 7*SIZE(Y)

	ST	$f28, 1*SIZE(Y)
	MUL	S, $f18, $f28		/* S * x[3] */
	unop
	SUB	$f23, $f24, $f24	/* new y[2] */

	MUL	C, $f12, $f21		/* C * x[4] */
	LD	$f18, 7*SIZE(X)
	unop
	unop

	ST	$f22, 2*SIZE(X)
	unop
	MUL	S, $f13, $f22		/* S * y[4] */
	ADD	$f25, $f26, $f26	/* new x[3] */

	MUL	C, $f13, $f23		/* C * y[4] */
	LD	$f13, 8*SIZE(Y)
	unop
	unop

	ST	$f24, 2*SIZE(Y)
	MUL	S, $f12, $f24		/* S * x[4] */
	unop
	SUB	$f27, $f28, $f28	/* new y[3] */

	MUL	C, $f14, $f25		/* C * x[5] */
	LD	$f12, 8*SIZE(X)
	unop
	unop

	ST	$f26, 3*SIZE(X)
	MUL	S, $f15, $f26		/* S * y[5] */
	unop
	ADD	$f21, $f22, $f22	/* new x[4] */

	MUL	C, $f15, $f27		/* C * y[5] */
	LD	$f15, 9*SIZE(Y)
	unop
	unop

	ST	$f28, 3*SIZE(Y)
	MUL	S, $f14, $f28		/* S * x[5] */
	unop
	SUB	$f23, $f24, $f24	/* new y[4] */

	MUL	C, $f16, $f21		/* C * x[6] */
	LD	$f14, 9*SIZE(X)
	unop
	unop

	ST	$f22, 4*SIZE(X)
	MUL	S, $f17, $f22		/* S * y[6] */
	unop
	ADD	$f25, $f26, $f26	/* new x[5] */

	MUL	C, $f17, $f23		/* C * y[6] */
	LD	$f17, 10*SIZE(Y)
	unop
	unop

	ST	$f24, 4*SIZE(Y)
	MUL	S, $f16, $f24		/* S * x[6] */
	unop
	SUB	$f27, $f28, $f28	/* new y[5] */

	MUL	C, $f18, $f25		/* C * x[7] */
	LD	$f16, 10*SIZE(X)
	unop
	unop

	ST	$f26, 5*SIZE(X)
	MUL	S, $f19, $f26		/* S * y[7] */
	unop
	ADD	$f21, $f22, $f22	/* new x[6] */

	MUL	C, $f19, $f27		/* C * y[7] */
	LD	$f19, 11*SIZE(Y)
	unop
	unop

	ST	$f28, 5*SIZE(Y)
	MUL	S, $f18, $f28		/* S * x[7] */
	lda	I, -1(I)		/* one block consumed */
	SUB	$f23, $f24, $f24	/* new y[6] */

	MUL	C, $f12, $f21		/* C * x[8]: first scalar of next block */
	LD	$f18, 11*SIZE(X)
	unop
	unop

	ST	$f22, 6*SIZE(X)
	MUL	S, $f13, $f22		/* S * y[8] */
	unop
	ADD	$f25, $f26, $f26	/* new x[7] */

	MUL	C, $f13, $f23		/* C * y[8] */
	LD	$f13, 12*SIZE(Y)
	lda	X, 8*SIZE(X)		/* X now points at the next block */
	unop

	ST	$f24, 6*SIZE(Y)		/* Y not advanced yet */
	MUL	S, $f12, $f24		/* S * x[8] */
	unop
	SUB	$f27, $f28, $f28	/* new y[7] */

	MUL	C, $f14, $f25		/* C * x[9] */
	LD	$f12, 4*SIZE(X)		/* offset 12 of the previous base */
	lda	Y, 8*SIZE(Y)		/* Y now points at the next block */
	unop

	ST	$f26, -1*SIZE(X)	/* scalar 7 of the block just finished */
	MUL	S, $f15, $f26		/* S * y[9] */
	unop
	ADD	$f21, $f22, $f22	/* new x[0] of the next block */

	MUL	C, $f15, $f27		/* C * y[9] */
	LD	$f15, 5*SIZE(Y)		/* offset 13 of the previous base */
	unop
	unop

	ST	$f28, -1*SIZE(Y)	/* scalar 7 of the block just finished */
	MUL	S, $f14, $f28		/* S * x[9] */
	SUB	$f23, $f24, $f24	/* new y[0] of the next block */
	bgt	I, $L12
	.align 4

/*
 * Pipeline drain: finishes the last full block.  Same entry state as
 * $L12, but only scalars 5..7 are still loaded, so nothing past the
 * current block is read.
 */
$L13:
	MUL	C, $f16, $f21		/* C * x[2] */
	LD	$f14, 5*SIZE(X)
	unop
	unop

	ST	$f22, 0*SIZE(X)
	MUL	S, $f17, $f22		/* S * y[2] */
	unop
	ADD	$f25, $f26, $f26	/* new x[1] */

	MUL	C, $f17, $f23		/* C * y[2] */
	unop
	unop
	LD	$f17, 6*SIZE(Y)

	ST	$f24, 0*SIZE(Y)
	MUL	S, $f16, $f24		/* S * x[2] */
	LD	$f16, 6*SIZE(X)
	SUB	$f27, $f28, $f28	/* new y[1] */

	MUL	C, $f18, $f25		/* C * x[3] */
	unop
	unop
	unop

	ST	$f26, 1*SIZE(X)
	MUL	S, $f19, $f26		/* S * y[3] */
	unop
	ADD	$f21, $f22, $f22	/* new x[2] */

	MUL	C, $f19, $f27		/* C * y[3] */
	unop
	unop
	LD	$f19, 7*SIZE(Y)

	ST	$f28, 1*SIZE(Y)
	MUL	S, $f18, $f28		/* S * x[3] */
	LD	$f18, 7*SIZE(X)
	SUB	$f23, $f24, $f24	/* new y[2] */

	MUL	C, $f12, $f21		/* C * x[4] */
	unop
	unop
	unop

	ST	$f22, 2*SIZE(X)
	unop
	MUL	S, $f13, $f22		/* S * y[4] */
	ADD	$f25, $f26, $f26	/* new x[3] */

	MUL	C, $f13, $f23		/* C * y[4] */
	unop
	unop
	unop

	ST	$f24, 2*SIZE(Y)
	MUL	S, $f12, $f24		/* S * x[4] */
	unop
	SUB	$f27, $f28, $f28	/* new y[3] */

	MUL	C, $f14, $f25		/* C * x[5] */
	unop
	unop
	unop

	ST	$f26, 3*SIZE(X)
	MUL	S, $f15, $f26		/* S * y[5] */
	unop
	ADD	$f21, $f22, $f22	/* new x[4] */

	MUL	C, $f15, $f27		/* C * y[5] */
	unop
	unop
	unop

	ST	$f28, 3*SIZE(Y)
	MUL	S, $f14, $f28		/* S * x[5] */
	unop
	SUB	$f23, $f24, $f24	/* new y[4] */

	MUL	C, $f16, $f21		/* C * x[6] */
	unop
	unop
	unop

	ST	$f22, 4*SIZE(X)
	MUL	S, $f17, $f22		/* S * y[6] */
	unop
	ADD	$f25, $f26, $f26	/* new x[5] */

	MUL	C, $f17, $f23		/* C * y[6] */
	unop
	unop
	unop

	ST	$f24, 4*SIZE(Y)
	MUL	S, $f16, $f24		/* S * x[6] */
	unop
	SUB	$f27, $f28, $f28	/* new y[5] */

	MUL	C, $f18, $f25		/* C * x[7] */
	unop
	unop
	unop

	ST	$f26, 5*SIZE(X)
	MUL	S, $f19, $f26		/* S * y[7] */
	unop
	ADD	$f21, $f22, $f22	/* new x[6] */

	MUL	C, $f19, $f27		/* C * y[7] */
	unop
	unop
	unop

	ST	$f28, 5*SIZE(Y)
	MUL	S, $f18, $f28		/* S * x[7] */
	unop
	SUB	$f23, $f24, $f24	/* new y[6] */

	ST	$f22, 6*SIZE(X)
	ADD	$f25, $f26, $f26	/* new x[7] */
	ST	$f24, 6*SIZE(Y)
	SUB	$f27, $f28, $f28	/* new y[7] */

	ST	$f26, 7*SIZE(X)
	lda	X, 8*SIZE(X)		/* step past the finished block */
	ST	$f28, 7*SIZE(Y)		/* Y still points at the block */
	lda	Y, 8*SIZE(Y)
	.align 4

/* Contiguous tail: the remaining N mod 4 elements, one per pass. */
$L15:
	and	N, 3, I
	ble	I, $L998
	.align 4

$L16:
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	LD	$f15, 1*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22	/* new x[0] */
	SUB	$f23, $f24, $f24	/* new y[0] */

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26	/* new x[1] */
	SUB	$f27, $f28, $f28	/* new y[1] */

	ST	$f22, 0*SIZE(X)
	ST	$f24, 0*SIZE(Y)
	lda	I, -1(I)

	ST	$f26, 1*SIZE(X)
	lda	X, 2 * SIZE(X)		/* advance after the last store to X */
	ST	$f28, 1*SIZE(Y)
	lda	Y, 2 * SIZE(Y)

	bgt	I, $L16
	.align 4

$L998:
	clr	$0
	ret
	.align 4

/* ------------------------------------------------------------------ */
/* Strided path.  X / Y are the load pointers and run one element      */
/* ahead of the store pointers XX / YY; the body is unrolled 4 times.  */
/* SXADDQ presumably scales the doubled increment by the scalar size   */
/* and adds it to the pointer -- see its definition in common.h.       */
/* ------------------------------------------------------------------ */
$L50:
	mov	X, XX
	mov	Y, YY

	sra	N, 2, I			/* I = N / 4 unrolled passes */
	ble	I, $L55
	.align 4

$L51:
	/* element 0 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	/* element 1 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	/* element 2 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	/* element 3 of 4 */
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15, 1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22, 0*SIZE(XX)
	ST	$f24, 0*SIZE(YY)
	ST	$f26, 1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28, 1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	lda	I, -1(I)
	bgt	I, $L51
	.align 4

/* Strided tail: the remaining N mod 4 elements.  Loads and stores both */
/* go through X / Y here; XX / YY are no longer used.                   */
$L55:
	and	N, 3, I
	ble	I, $L999
	.align 4

$L56:
	LD	$f12, 0*SIZE(X)
	LD	$f13, 0*SIZE(Y)
	LD	$f14, 1*SIZE(X)
	LD	$f15, 1*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22	/* new x[0] */
	SUB	$f23, $f24, $f24	/* new y[0] */

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26	/* new x[1] */
	SUB	$f27, $f28, $f28	/* new y[1] */

	ST	$f22, 0*SIZE(X)
	ST	$f24, 0*SIZE(Y)
	lda	I, -1(I)

	ST	$f26, 1*SIZE(X)
	ST	$f28, 1*SIZE(Y)
	SXADDQ	INCX, X, X		/* advance only after both stores */
	SXADDQ	INCY, Y, Y

	bgt	I, $L56
	.align 4

$L999:
	clr	$0
	ret
	EPILOGUE
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -