⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zrot.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * In-place plane rotation of two vectors.
 *
 * Every logical element occupies two consecutive SIZE-wide values
 * (offsets 0 and 1). For each of those values the code computes
 *
 *     x_new = C * x + S * y
 *     y_new = C * y - S * x
 *
 * and writes x_new back through X and y_new back through Y.
 * NOTE(review): the two values per element are presumably the real and
 * imaginary parts of a complex number (the file is named zrot.s), with
 * C and S both real scalars - confirm against the caller.
 *
 * Inputs, as read by the code below:
 *   $16   (N)     element count; N <= 0 returns without touching X or Y
 *   $17   (X)     address of the first vector
 *   $18   (INCX)  step of X; doubled on entry
 *   $19   (Y)     address of the second vector
 *   $20   (INCY)  step of Y; doubled on entry
 *   $f21          C, copied into $f10
 *   0($sp)        S, loaded into $f11
 *
 * Result: $0 is cleared before every return.
 *
 * Registers written: $0, $17-$21, $23, $24, $f10-$f19, $f21-$f28
 * ($f31 is the target of the lds instructions in the main loop).
 *
 * LD, ST, MUL, ADD, SUB, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE
 * are not defined in this file; presumably they come from common.h.
 */

#define ASSEMBLER
#include "common.h"
#include "version.h"

/* Integer registers. */
#define N	$16
#define	X	$17
#define INCX	$18
#define Y	$19
#define INCY	$20
/* Loop counter. */
#define I	$21
/* Store pointers for the strided path. $23/$24 are also used under   */
/* their raw names on entry to hold the two stride-compare results.   */
#define XX	$23
#define YY	$24

/* Floating-point registers holding the two rotation coefficients. */
#define C	$f10
#define S	$f11

/* Distance, in SIZE-wide values, used by the lds instructions in $L12. */
#define PREFETCH_SIZE 80

	PROLOGUE
	PROFCODE
	.frame	$sp, 0, $26, 0

#ifndef PROFILE
	.prologue 0
#else
	.prologue 1
#endif

	/* Fetch the coefficients before $f21 is reused as a temporary. */
	fmov	$f21,   C
	LD	S, 0($sp)

	/* Double both steps: each element is two SIZE-wide values. */
	addq	INCX, INCX, INCX
	addq	INCY, INCY, INCY

	/* $23 = (INCX == 2), $24 = (INCY == 2), i.e. caller passed 1. */
	cmpeq	INCX, 2,  $23
	cmpeq	INCY, 2,  $24
	ble	N,  $L998

	/* Anything other than both steps == 2 takes the strided path. */
	and	$23, $24, $23
	beq	$23, $L50

	/* ---- Contiguous path ------------------------------------------ */
	/* I = N / 4: number of 4-element blocks. */
	sra	N, 2, I
	ble	I, $L15

	/* Pipeline fill: load the first values and start the multiplies   */
	/* so that $L12 / $L13 can begin by storing finished results.      */
	LD	$f12,   0*SIZE(X)
	LD	$f13,   0*SIZE(Y)
	LD	$f14,   1*SIZE(X)
	LD	$f15,   1*SIZE(Y)

	LD	$f16,   2*SIZE(X)
	LD	$f17,   2*SIZE(Y)
	LD	$f18,   3*SIZE(X)
	LD	$f19,   3*SIZE(Y)

	MUL	C, $f12, $f21
	unop
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23

	LD	$f13,   4*SIZE(Y)
	MUL	S, $f12, $f24
	LD	$f12,   4*SIZE(X)
	MUL	C, $f14, $f25

	lda	I, -1(I)
	MUL	S, $f15, $f26
	ADD	$f21, $f22, $f22
	MUL	C, $f15, $f27

	LD	$f15,   5*SIZE(Y)
	MUL	S, $f14, $f28
	SUB	$f23, $f24, $f24
	/* Only one block: skip the steady-state loop and drain. */
	ble	I, $L13
	.align 4

	/*
	 * Steady-state loop: one iteration finishes 4 elements (8 values of
	 * each vector) and advances X and Y by 8*SIZE. Loads of upcoming
	 * values are interleaved with the multiplies, adds and stores of
	 * earlier ones, and input registers are reloaded right after their
	 * last use, so the instruction order must not be changed.
	 * unop is a no-op; presumably it pads issue slots - TODO confirm.
	 * The lds into $f31 load a value that is never read; presumably they
	 * act as prefetches PREFETCH_SIZE values ahead - TODO confirm.
	 */
$L12:
	MUL	C, $f16, $f21
	lds	$f31, (PREFETCH_SIZE) * SIZE(X)
	unop
	LD	$f14,   5*SIZE(X)

	ST	$f22,   0*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	lds	$f31, (PREFETCH_SIZE) * SIZE(Y)
	unop
	LD	$f17,   6*SIZE(Y)

	ST	$f24,   0*SIZE(Y)
	MUL	S, $f16, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	LD	$f16,   6*SIZE(X)
	unop
	unop

	ST	$f26,   1*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	unop
	unop
	LD	$f19,   7*SIZE(Y)

	ST	$f28,   1*SIZE(Y)
	MUL	S, $f18, $f28
	unop
	SUB	$f23, $f24, $f24

	MUL	C, $f12, $f21
	LD	$f18,   7*SIZE(X)
	unop
	unop

	ST	$f22,   2*SIZE(X)
	unop
	MUL	S, $f13, $f22
	ADD	$f25, $f26, $f26

	MUL	C, $f13, $f23
	LD	$f13,   8*SIZE(Y)
	unop
	unop

	ST	$f24,   2*SIZE(Y)
	MUL	S, $f12, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f14, $f25
	LD	$f12,   8*SIZE(X)
	unop
	unop

	ST	$f26,   3*SIZE(X)
	MUL	S, $f15, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f15, $f27
	LD	$f15,   9*SIZE(Y)
	unop
	unop

	ST	$f28,   3*SIZE(Y)
	MUL	S, $f14, $f28
	unop
	SUB	$f23, $f24, $f24

	MUL	C, $f16, $f21
	LD	$f14,   9*SIZE(X)
	unop
	unop

	ST	$f22,   4*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	LD	$f17,  10*SIZE(Y)
	unop
	unop

	ST	$f24,   4*SIZE(Y)
	MUL	S, $f16, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	LD	$f16,  10*SIZE(X)
	unop
	unop

	ST	$f26,   5*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	LD	$f19,  11*SIZE(Y)
	unop
	unop

	ST	$f28,   5*SIZE(Y)
	MUL	S, $f18, $f28
	lda	I, -1(I)
	SUB	$f23, $f24, $f24

	MUL	C, $f12, $f21
	LD	$f18,  11*SIZE(X)
	unop
	unop

	ST	$f22,   6*SIZE(X)
	MUL	S, $f13, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f13, $f23
	LD	$f13,  12*SIZE(Y)
	/* X is bumped here: X offsets below are relative to the new X,    */
	/* so 4*SIZE(X) is old offset 12 and -1*SIZE(X) is old offset 7.   */
	lda	X,   8*SIZE(X)
	unop

	/* Y has not been bumped yet: this is still old offset 6. */
	ST	$f24,   6*SIZE(Y)
	MUL	S, $f12, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f14, $f25
	LD	$f12,   4*SIZE(X)
	/* Y is bumped here: Y offsets below are relative to the new Y. */
	lda	Y,   8*SIZE(Y)
	unop

	ST	$f26,  -1*SIZE(X)
	MUL	S, $f15, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f15, $f27
	LD	$f15,   5*SIZE(Y)
	unop
	unop

	ST	$f28,  -1*SIZE(Y)
	MUL	S, $f14, $f28
	SUB	$f23, $f24, $f24
	bgt	I, $L12
	.align 4

	/*
	 * Pipeline drain: finish the last 4-element block. Same schedule as
	 * $L12 but only the loads that stay inside this block (offsets 5, 6
	 * and 7) are issued; the slots of the dropped loads hold unop.
	 */
$L13:
	MUL	C, $f16, $f21
	LD	$f14,   5*SIZE(X)
	unop
	unop

	ST	$f22,   0*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	unop
	unop
	LD	$f17,   6*SIZE(Y)

	ST	$f24,   0*SIZE(Y)
	MUL	S, $f16, $f24
	LD	$f16,   6*SIZE(X)
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	unop
	unop
	unop

	ST	$f26,   1*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	unop
	unop
	LD	$f19,   7*SIZE(Y)

	ST	$f28,   1*SIZE(Y)
	MUL	S, $f18, $f28
	LD	$f18,   7*SIZE(X)
	SUB	$f23, $f24, $f24

	MUL	C, $f12, $f21
	unop
	unop
	unop

	ST	$f22,   2*SIZE(X)
	unop
	MUL	S, $f13, $f22
	ADD	$f25, $f26, $f26

	MUL	C, $f13, $f23
	unop
	unop
	unop

	ST	$f24,   2*SIZE(Y)
	MUL	S, $f12, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f14, $f25
	unop
	unop
	unop

	ST	$f26,   3*SIZE(X)
	MUL	S, $f15, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f15, $f27
	unop
	unop
	unop

	ST	$f28,   3*SIZE(Y)
	MUL	S, $f14, $f28
	unop
	SUB	$f23, $f24, $f24

	MUL	C, $f16, $f21
	unop
	unop
	unop

	ST	$f22,   4*SIZE(X)
	MUL	S, $f17, $f22
	unop
	ADD	$f25, $f26, $f26

	MUL	C, $f17, $f23
	unop
	unop
	unop

	ST	$f24,   4*SIZE(Y)
	MUL	S, $f16, $f24
	unop
	SUB	$f27, $f28, $f28

	MUL	C, $f18, $f25
	unop
	unop
	unop

	ST	$f26,   5*SIZE(X)
	MUL	S, $f19, $f26
	unop
	ADD	$f21, $f22, $f22

	MUL	C, $f19, $f27
	unop
	unop
	unop

	ST	$f28,   5*SIZE(Y)
	MUL	S, $f18, $f28
	unop
	SUB	$f23, $f24, $f24

	ST	$f22,   6*SIZE(X)
	ADD	$f25, $f26, $f26
	ST	$f24,   6*SIZE(Y)
	SUB	$f27, $f28, $f28

	ST	$f26,   7*SIZE(X)
	lda	X,   8*SIZE(X)
	ST	$f28,   7*SIZE(Y)
	lda	Y,   8*SIZE(Y)
	.align 4

	/* Contiguous remainder: I = N mod 4 leftover elements. */
$L15:
	and	N, 3, I
	ble	I, $L998
	.align 4

	/* One element (two values of each vector) per iteration. */
$L16:
	LD	$f12,   0*SIZE(X)
	LD	$f13,   0*SIZE(Y)
	LD	$f14,   1*SIZE(X)
	LD	$f15,   1*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22,   0*SIZE(X)
	ST	$f24,   0*SIZE(Y)
	lda	I, -1(I)

	ST	$f26,   1*SIZE(X)
	lda	X, 2 * SIZE(X)
	ST	$f28,   1*SIZE(Y)
	lda	Y, 2 * SIZE(Y)

	bgt	I, $L16
	.align 4

	/* Exit for N <= 0 and for the contiguous path: return 0. */
$L998:
	clr	$0
	ret
	.align 4

	/* ---- Strided path --------------------------------------------- */
	/* X/Y are the load pointers; XX/YY trail them as store pointers.  */
	/* SXADDQ is presumably a scaled add (ptr += step * SIZE) - its    */
	/* definition is not visible here, TODO confirm.                   */
$L50:
	mov	X, XX
	mov	Y, YY

	/* I = N / 4: the loop body below is unrolled four times. */
	sra	N, 2, I
	ble	I, $L55
	.align 4

$L51:
	/* Unrolled copy 1 of 4. */
	LD	$f12,   0*SIZE(X)
	LD	$f13,   0*SIZE(Y)
	LD	$f14,   1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15,   1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22,   0*SIZE(XX)
	ST	$f24,   0*SIZE(YY)
	ST	$f26,   1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28,   1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	/* Unrolled copy 2 of 4. */
	LD	$f12,   0*SIZE(X)
	LD	$f13,   0*SIZE(Y)
	LD	$f14,   1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15,   1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22,   0*SIZE(XX)
	ST	$f24,   0*SIZE(YY)
	ST	$f26,   1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28,   1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	/* Unrolled copy 3 of 4. */
	LD	$f12,   0*SIZE(X)
	LD	$f13,   0*SIZE(Y)
	LD	$f14,   1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15,   1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22,   0*SIZE(XX)
	ST	$f24,   0*SIZE(YY)
	ST	$f26,   1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28,   1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	/* Unrolled copy 4 of 4. */
	LD	$f12,   0*SIZE(X)
	LD	$f13,   0*SIZE(Y)
	LD	$f14,   1*SIZE(X)
	SXADDQ	INCX, X, X
	LD	$f15,   1*SIZE(Y)
	SXADDQ	INCY, Y, Y

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22,   0*SIZE(XX)
	ST	$f24,   0*SIZE(YY)
	ST	$f26,   1*SIZE(XX)
	SXADDQ	INCX, XX, XX
	ST	$f28,   1*SIZE(YY)
	SXADDQ	INCY, YY, YY

	lda	I, -1(I)
	bgt	I, $L51
	.align 4

	/* Strided remainder: I = N mod 4 leftover elements. */
$L55:
	and	N, 3, I
	ble	I, $L999
	.align 4

	/* One element per iteration; loads and stores both go through     */
	/* X/Y here, which are advanced only after the stores.             */
$L56:
	LD	$f12,   0*SIZE(X)
	LD	$f13,   0*SIZE(Y)
	LD	$f14,   1*SIZE(X)
	LD	$f15,   1*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	MUL	C, $f14, $f25
	MUL	S, $f15, $f26
	MUL	C, $f15, $f27
	MUL	S, $f14, $f28

	ADD	$f25, $f26, $f26
	SUB	$f27, $f28, $f28

	ST	$f22,   0*SIZE(X)
	ST	$f24,   0*SIZE(Y)
	lda	I, -1(I)

	ST	$f26,   1*SIZE(X)
	ST	$f28,   1*SIZE(Y)
	SXADDQ	INCX, X, X
	SXADDQ	INCY, Y, Y

	bgt	I, $L56
	.align 4

	/* Exit for the strided path: return 0. */
$L999:
	clr	$0
	ret
	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -