⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 rot.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * rot.s -- plane rotation kernel (SPARC).
 *
 * For each of N element pairs (x, y) taken from the vectors X and Y:
 *
 *     x <- C * x + S * y
 *     y <- C * y - S * x
 *
 * Both vectors are updated in place.
 *
 * Inputs (after the register window save):
 *     N    = %i0   element count; the routine returns at once if N <= 0
 *     X    = %i1   address of the first X element
 *     INCX = %i2   X stride in elements (shifted left by BASE_SHIFT below)
 *     Y    = %i3   address of the first Y element
 *     INCY = %i4   Y stride in elements (shifted left by BASE_SHIFT below)
 *     C, S         fetched from the stack (32-bit build) or from FP
 *                  argument registers (__64BIT__ build), see below
 *
 * Two code paths:
 *     - both byte strides equal SIZE: software-pipelined loop unrolled by
 *       8 (.LL11/.LL12) followed by a one-element remainder loop (.LL16);
 *     - otherwise: strided loop unrolled by 8 (.LL51) followed by a
 *       one-element remainder loop (.LL56).
 *
 * LDF, STF, FMUL, FADD, FSUB, FMOV, SIZE, BASE_SHIFT, PROLOGUE, SAVESP
 * and EPILOGUE are not defined in this file; presumably they come from
 * common.h and select the single/double precision forms -- confirm there.
 *
 * NOTE(review): the instruction order is hand-scheduled and relies on
 * SPARC branch delay slots (the instruction after every branch executes
 * as part of the branch).  Do not reorder instructions casually.
 */

#define ASSEMBLER
#include "common.h"

/* Integer argument registers and the loop counter. */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
#define I	%i5

/* Trailing store pointers used only by the strided path. */
#define XX	%l0
#define YY	%l1

/*
 * Floating-point working registers:
 *     a1..a8  elements loaded from X
 *     b1..b8  elements loaded from Y
 *     c1..c8  products
 *     t1..t4  sums/differences waiting to be stored
 */
#ifdef DOUBLE
#define a1	%f4
#define a2	%f6
#define a3	%f8
#define a4	%f10
#define a5	%f12
#define a6	%f14
#define a7	%f16
#define a8	%f18
#define b1	%f20
#define b2	%f22
#define b3	%f24
#define b4	%f26
#define b5	%f28
#define b6	%f30
#define b7	%f32
#define b8	%f34

#define c1	%f36
#define c2	%f38
#define c3	%f40
#define c4	%f42
#define c5	%f44
#define c6	%f46
#define c7	%f48
#define c8	%f50

#define t1	%f52
#define t2	%f54
#define t3	%f56
#define t4	%f58
#else
#define a1	%f2
#define a2	%f3
#define a3	%f4
#define a4	%f5
#define a5	%f6
#define a6	%f7
#define a7	%f8
#define a8	%f9
#define b1	%f10
#define b2	%f11
#define b3	%f12
#define b4	%f13
#define b5	%f14
#define b6	%f15
#define b7	%f16
#define b8	%f17

#define c1	%f18
#define c2	%f19
#define c3	%f20
#define c4	%f21
#define c5	%f22
#define c6	%f23
#define c7	%f24
#define c8	%f25

#define t1	%f26
#define t2	%f27
#define t3	%f28
#define t4	%f29
#endif

/* Rotation coefficients. */
#ifdef DOUBLE
#define C	%f0
#define S	%f2
#else
#define C	%f0
#define S	%f1
#endif

	PROLOGUE
	SAVESP

/*
 * Fetch C and S.
 *
 * 32-bit build: %i5 is spilled to [%fp + 88] and C is then loaded from
 * that address; S is loaded from the stack just above it.  The spill
 * must happen before %i5 is reused as the loop counter I.
 * (Presumably the 32-bit ABI delivers the leading word of the sixth
 * argument in %i5 and the rest on the stack -- confirm against the ABI.)
 *
 * 64-bit build: C and S are copied out of FP registers
 * (presumably the FP argument slots of the 64-bit ABI -- confirm).
 */
#ifndef __64BIT__

#ifdef DOUBLE
	st	%i5, [%fp + 88]

	LDF	[%fp + 88], C
	LDF	[%fp + 96], S
#else
	st	%i5, [%fp + 88]

	LDF	[%fp + 88], C
	LDF	[%fp + 92], S
#endif

#else

#ifdef DOUBLE
	FMOV	%f10, C
	FMOV	%f12, S
#else
	FMOV	%f11, C
	FMOV	%f13, S
#endif

#endif

	/* Nothing to do for N <= 0.  The first sll is in the delay slot of
	   the branch, so it executes on both paths. */
	cmp	N, 0
	ble	.LL19
	sll	INCX, BASE_SHIFT, INCX
	sll	INCY, BASE_SHIFT, INCY

	/* Take the strided path unless both scaled strides equal SIZE. */
	cmp	INCX, SIZE
	bne	.LL50
	nop

	cmp	INCY, SIZE
	bne	.LL50
	nop

/* ------------------------------------------------------------------ */
/* Unit-stride path                                                    */
/* ------------------------------------------------------------------ */

	/* I = N / 8 = number of full groups of eight elements. */
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Pipeline prologue: load the first group of eight pairs. */
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  1 * SIZE], b2
	LDF	[X +  2 * SIZE], a3
	LDF	[Y +  2 * SIZE], b3
	LDF	[X +  3 * SIZE], a4
	LDF	[Y +  3 * SIZE], b4

	LDF	[X +  4 * SIZE], a5
	LDF	[Y +  4 * SIZE], b5
	LDF	[X +  5 * SIZE], a6
	LDF	[Y +  5 * SIZE], b6
	LDF	[X +  6 * SIZE], a7
	LDF	[Y +  6 * SIZE], b7
	LDF	[X +  7 * SIZE], a8
	LDF	[Y +  7 * SIZE], b8

	/* Start elements 0 and 1 and already fetch elements 8 and 9
	   (the first two of the following group) into a1/b1, a2/b2.

	   NOTE(review): these two loads, and the loads of elements 16
	   and 17 on every pass of .LL11, are issued before the trip
	   count is known to allow another group; .LL12 never consumes
	   a1/b1/a2/b2.  The path therefore reads up to two elements
	   beyond the last full group (beyond element N-1 when N mod 8
	   is 0 or 1).  Confirm that callers tolerate this over-read. */
	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	LDF	[Y +  8 * SIZE], b1
	FMUL	S, a1, c4
	LDF	[X +  8 * SIZE], a1

	FMUL	C, a2, c5
	FMUL	S, b2, c6
	FADD	c1, c2, t1

	FMUL	C, b2, c7
	LDF	[Y +  9 * SIZE], b2
	FMUL	S, a2, c8
	LDF	[X +  9 * SIZE], a2
	FSUB	c3, c4, t2

	/* Only one group: skip the steady-state loop and drain. */
	addcc	I, -1, I
	ble,pt	%icc, .LL12
	nop

#define PREFETCHSIZE 64

/*
 * Steady-state loop.  On entry t1/t2 hold the finished results for
 * element 0 and c5..c8 hold the products for element 1 of the current
 * group.  Each pass stores the eight results of the current group,
 * loads elements 10..17 (relative to the pointers at loop entry) and
 * starts elements 0 and 1 of the next group.
 */
.LL11:
	FMUL	C, a3, c1
	nop
	prefetch [Y  + PREFETCHSIZE * SIZE], 1
	nop

	FMUL	S, b3, c2
	STF	t1, [X +  0 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b3, c3
	LDF	[Y + 10 * SIZE], b3
	nop
	nop

	FMUL	S, a3, c4
	STF	t2, [Y +  0 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a4, c5
	LDF	[X + 10 * SIZE], a3
	nop
	nop

	FMUL	S, b4, c6
	STF	t3, [X +  1 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b4, c7
	LDF	[Y + 11 * SIZE], b4
	nop
	nop

	FMUL	S, a4, c8
	STF	t4, [Y +  1 * SIZE]
	FSUB	c3, c4, t2
	nop

	FMUL	C, a5, c1
	LDF	[X + 11 * SIZE], a4
	nop
	nop

	FMUL	S, b5, c2
	STF	t1, [X +  2 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b5, c3
	LDF	[Y + 12 * SIZE], b5
	nop
	nop

	FMUL	S, a5, c4
	STF	t2, [Y +  2 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a6, c5
	LDF	[X + 12 * SIZE], a5
	nop
	nop

	FMUL	S, b6, c6
	STF	t3, [X +  3 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b6, c7
	LDF	[Y + 13 * SIZE], b6
	nop
	nop

	FMUL	S, a6, c8
	STF	t4, [Y +  3 * SIZE]
	FSUB	c3, c4, t2
	nop

	FMUL	C, a7, c1
	LDF	[X + 13 * SIZE], a6
	nop
	nop

	FMUL	S, b7, c2
	STF	t1, [X +  4 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b7, c3
	LDF	[Y + 14 * SIZE], b7
	nop
	nop

	FMUL	S, a7, c4
	STF	t2, [Y +  4 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a8, c5
	LDF	[X + 14 * SIZE], a7
	nop
	nop

	FMUL	S, b8, c6
	STF	t3, [X +  5 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b8, c7
	LDF	[Y + 15 * SIZE], b8
	nop
	nop

	FMUL	S, a8, c8
	STF	t4, [Y +  5 * SIZE]
	FSUB	c3, c4, t2
	nop

	/* From here on a1/b1 and a2/b2 belong to the next group.  The
	   addcc sets the condition codes tested by the bg at the bottom;
	   the plain add instructions in between leave them untouched. */
	FMUL	C, a1, c1
	LDF	[X + 15 * SIZE], a8
	addcc	I, -1, I
	nop

	FMUL	S, b1, c2
	STF	t1, [X +  6 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b1, c3
	LDF	[Y + 16 * SIZE], b1
	nop
	nop

	FMUL	S, a1, c4
	STF	t2, [Y +  6 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a2, c5
	LDF	[X + 16 * SIZE], a1
	add	Y, 8 * SIZE, Y
	nop

	FMUL	S, b2, c6
	STF	t3, [X +  7 * SIZE]
	FADD	c1, c2, t1
	nop

	/* Y has already been advanced: Y + 9 is element 17 of the old
	   group, and Y - 1 below is its element 7. */
	FMUL	C, b2, c7
	LDF	[Y +  9 * SIZE], b2
	add	X, 8 * SIZE, X
	nop

	FMUL	S, a2, c8
	STF	t4, [Y -  1 * SIZE]
	FSUB	c3, c4, t2
	nop

	/* The load in the delay slot uses the already advanced X. */
	bg,pt	%icc, .LL11
	LDF	[X +  9 * SIZE], a2

/*
 * Pipeline drain: finish the last full group.  No further loads are
 * issued; only a3..a8 / b3..b8 and the in-flight c and t registers are
 * consumed.
 */
.LL12:
	FMUL	C, a3, c1
	FMUL	S, b3, c2
	STF	t1, [X +  0 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b3, c3
	FMUL	S, a3, c4
	STF	t2, [Y +  0 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a4, c5
	FMUL	S, b4, c6
	STF	t3, [X +  1 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b4, c7
	FMUL	S, a4, c8
	STF	t4, [Y +  1 * SIZE]
	FSUB	c3, c4, t2

	FMUL	C, a5, c1
	FMUL	S, b5, c2
	STF	t1, [X +  2 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b5, c3
	FMUL	S, a5, c4
	STF	t2, [Y +  2 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a6, c5
	FMUL	S, b6, c6
	STF	t3, [X +  3 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b6, c7
	FMUL	S, a6, c8
	STF	t4, [Y +  3 * SIZE]
	FSUB	c3, c4, t2

	FMUL	C, a7, c1
	FMUL	S, b7, c2
	STF	t1, [X +  4 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b7, c3
	FMUL	S, a7, c4
	STF	t2, [Y +  4 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a8, c5
	FMUL	S, b8, c6
	STF	t3, [X +  5 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b8, c7
	FMUL	S, a8, c8
	STF	t4, [Y +  5 * SIZE]
	FSUB	c3, c4, t2

	FADD	c5, c6, t3
	STF	t1, [X +  6 * SIZE]

	FSUB	c7, c8, t4
	STF	t2, [Y +  6 * SIZE]

	STF	t3, [X +  7 * SIZE]
	STF	t4, [Y +  7 * SIZE]

	add	X, 8 * SIZE, X
	add	Y, 8 * SIZE, Y

/* Unit-stride remainder: I = N mod 8 elements, one per pass. */
.LL15:
	andcc	N, 7, I
	nop
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	add	X, 1 * SIZE, X
	LDF	[Y + 0 * SIZE], b1
	add	Y, 1 * SIZE, Y

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FADD	c1, c2, c2
	addcc	I, -1, I
	FSUB	c3, c4, c4
	nop

	/* X and Y were advanced right after the loads, hence the -1. */
	STF	c2, [X - 1 * SIZE]
	STF	c4, [Y - 1 * SIZE]
	bg,pt	%icc, .LL16
	nop

.LL19:
	return	%i7 + 8
	nop

/* ------------------------------------------------------------------ */
/* Strided path                                                        */
/* ------------------------------------------------------------------ */

/*
 * X and Y run ahead while a group of eight pairs is loaded; XX and YY
 * trail behind and are used to store the results to the same addresses.
 */
.LL50:
	mov	X, XX
	mov	Y, YY

	/* I = N / 8 = number of full groups of eight elements. */
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

.LL51:
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b1
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b2
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b3
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b4
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b5
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b6
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a7
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b7
	add	Y, INCY, Y
	LDF	[X +  0 * SIZE], a8
	add	X, INCX, X
	LDF	[Y +  0 * SIZE], b8
	add	Y, INCY, Y

	/* Element 1 of the group. */
	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FADD	c1, c2, t1
	FSUB	c3, c4, t2

	STF	t1, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t2, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Element 2. */
	FMUL	C, a2, c5
	FMUL	S, b2, c6
	FMUL	C, b2, c7
	FMUL	S, a2, c8

	FADD	c5, c6, t3
	FSUB	c7, c8, t4

	STF	t3, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t4, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Element 3. */
	FMUL	C, a3, c1
	FMUL	S, b3, c2
	FMUL	C, b3, c3
	FMUL	S, a3, c4

	FADD	c1, c2, t1
	FSUB	c3, c4, t2

	STF	t1, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t2, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Element 4. */
	FMUL	C, a4, c5
	FMUL	S, b4, c6
	FMUL	C, b4, c7
	FMUL	S, a4, c8

	FADD	c5, c6, t3
	FSUB	c7, c8, t4

	STF	t3, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t4, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Element 5. */
	FMUL	C, a5, c1
	FMUL	S, b5, c2
	FMUL	C, b5, c3
	FMUL	S, a5, c4

	FADD	c1, c2, t1
	FSUB	c3, c4, t2

	STF	t1, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t2, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Element 6. */
	FMUL	C, a6, c5
	FMUL	S, b6, c6
	FMUL	C, b6, c7
	FMUL	S, a6, c8

	FADD	c5, c6, t3
	FSUB	c7, c8, t4

	STF	t3, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t4, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Element 7. */
	FMUL	C, a7, c1
	FMUL	S, b7, c2
	FMUL	C, b7, c3
	FMUL	S, a7, c4

	FADD	c1, c2, t1
	FSUB	c3, c4, t2

	STF	t1, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t2, [YY +  0 * SIZE]
	add	YY, INCY, YY

	/* Element 8. */
	FMUL	C, a8, c5
	FMUL	S, b8, c6
	FMUL	C, b8, c7
	FMUL	S, a8, c8

	FADD	c5, c6, t3
	FSUB	c7, c8, t4

	STF	t3, [XX +  0 * SIZE]
	add	XX, INCX, XX
	STF	t4, [YY +  0 * SIZE]
	add	YY, INCY, YY

	addcc	I, -1, I
	bg,pt	%icc, .LL51
	nop

/* Strided remainder: I = N mod 8 elements, one per pass.  X and Y
   already point at the first remaining element, so XX/YY are not
   needed here. */
.LL55:
	andcc	N, 7, I
	nop
	ble	%icc, .LL59
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FADD	c1, c2, c2
	FSUB	c3, c4, c4

	STF	c2, [X +  0 * SIZE]
	add	X, INCX, X
	STF	c4, [Y +  0 * SIZE]
	addcc	I, -1, I

	/* The Y advance sits in the branch delay slot. */
	bg	%icc, .LL56
	add	Y, INCY, Y

.LL59:
	return	%i7 + 8
	nop

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -