⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zaxpy.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Complex AXPY kernel for SPARC.
 *
 * For each of N complex elements (real part at offset 0 * SIZE,
 * imaginary part at offset 1 * SIZE) the code computes
 *
 *   CONJ not defined:  y_re += alpha_r * x_re - alpha_i * x_im
 *                      y_im += alpha_r * x_im + alpha_i * x_re
 *   CONJ defined:      y_re += alpha_r * x_re + alpha_i * x_im
 *                      y_im += alpha_i * x_re - alpha_r * x_im
 *
 * Two code paths exist: a unit-stride path (.LL11 - .LL19) taken when
 * both scaled strides equal 2 * SIZE, and a general strided path
 * (.LL50 - .LL59).  Each path handles four complex elements per
 * iteration of its main loop and then finishes the remaining N & 3
 * elements one at a time.
 *
 * Both exits clear the return register, so the routine returns 0.
 *
 * PROLOGUE, EPILOGUE, SAVESP, STACK_START, SIZE, ZBASE_SHIFT and the
 * LDF / STF / FMUL / FADD / FSUB / FMOV mnemonics are not defined in
 * this file; presumably they come from common.h and select the
 * single- or double-precision instruction forms.
 */

#define ASSEMBLER
#include "common.h"

/* Integer register assignment.  It differs between the 32-bit DOUBLE
   build and the other builds because the prologue below fetches the
   arguments from different places in each case. */
#if defined(DOUBLE) && !defined(__64BIT__)
#define N	%i0
#define X	%i1
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
#define I	%i5
#else
#define N	%i0
#define X	%i5
#define INCX	%i1
#define Y	%i2
#define INCY	%i3
#define I	%i4
#endif

/* Store pointer into y for the strided path; the load pointer Y runs
   ahead of it inside the pipelined loop. */
#define YY	%l1

/* Floating-point register assignment:
     a1..a8  four elements of x (re, im pairs)
     b1..b8  four elements of y (re, im pairs)
     t1..t4  products
     c1..c8  results waiting to be stored */
#ifdef DOUBLE
#define a1	%f0
#define a2	%f2
#define a3	%f4
#define a4	%f6
#define a5	%f8
#define a6	%f10
#define a7	%f12
#define a8	%f14

#define b1	%f16
#define b2	%f18
#define b3	%f20
#define b4	%f22
#define b5	%f24
#define b6	%f26
#define b7	%f28
#define b8	%f30

#define t1	%f32
#define t2	%f34
#define t3	%f36
#define	t4	%f38

#define c1	%f40
#define c2	%f42
#define c3	%f44
#define c4	%f46
#define c5	%f48
#define c6	%f50
#define c7	%f52
#define c8	%f54

#define ALPHA_R	%f60
#define ALPHA_I	%f62
#else
#define a1	%f0
#define a2	%f1
#define a3	%f2
#define a4	%f3
#define a5	%f4
#define a6	%f5
#define a7	%f6
#define a8	%f7

#define b1	%f8
#define b2	%f9
#define b3	%f10
#define b4	%f11
#define b5	%f12
#define b6	%f13
#define b7	%f14
#define b8	%f15

#define t1	%f16
#define t2	%f17
#define t3	%f18
#define	t4	%f19

#define c1	%f20
#define c2	%f21
#define c3	%f22
#define c4	%f23
#define c5	%f24
#define c6	%f25
#define c7	%f26
#define c8	%f27

#define ALPHA_R	%f30
#define ALPHA_I	%f31
#endif

/* ADD1 combines the alpha_i * x_im term into the real part and ADD2
   combines the alpha_r * x_im term into the imaginary part; swapping
   them is what implements the conjugated variant. */
#ifndef CONJ
#define ADD1	FSUB
#define ADD2	FADD
#else
#define ADD1	FADD
#define ADD2	FSUB
#endif

	PROLOGUE
	SAVESP

/* ------------------------------------------------------------------ */
/* Argument fetch                                                     */
/* ------------------------------------------------------------------ */
#ifndef __64BIT__
#ifdef DOUBLE
	/* Spill the integer registers holding alpha to the stack so it
	   can be reloaded into floating-point registers below.  These
	   stores must precede the loads of Y and INCY, which reuse
	   %i3 and %i4. */
	st	%i3, [%fp + STACK_START + 16]
	st	%i4, [%fp + STACK_START + 20]
	st	%i5, [%fp + STACK_START + 24]

	ld	[%fp+ STACK_START + 32], X
	ld	[%fp+ STACK_START + 36], INCX
	ld	[%fp+ STACK_START + 40], Y
	ld	[%fp+ STACK_START + 44], INCY

	/* NOTE(review): the second ldd also reads the word at
	   STACK_START + 28, which this routine does not write;
	   presumably the caller placed the other half of alpha_i
	   there - confirm against the calling convention. */
	ldd	[%fp + STACK_START + 16], ALPHA_R
	ldd	[%fp + STACK_START + 24], ALPHA_I
#else
	/* Single precision: alpha_r and alpha_i are one word each. */
	st	%i3, [%fp + STACK_START + 16]
	st	%i4, [%fp + STACK_START + 20]

	ld	[%fp+ STACK_START + 28], INCX
	ld	[%fp+ STACK_START + 32], Y
	ld	[%fp+ STACK_START + 36], INCY

	ld	[%fp + STACK_START + 16], ALPHA_R
	ld	[%fp + STACK_START + 20], ALPHA_I
#endif
#else
	/* 64-bit build: the remaining integer arguments are read from
	   the stack and alpha is copied from floating-point registers. */
	ldx	[%fp +  STACK_START + 56], INCX
	ldx	[%fp +  STACK_START + 64], Y
	ldx	[%fp +  STACK_START + 72], INCY
#ifdef DOUBLE
	FMOV	%f6, ALPHA_R
	FMOV	%f8, ALPHA_I
#else
	FMOV	%f7, ALPHA_R
	FMOV	%f9, ALPHA_I
#endif
#endif

	/* Scale both strides by ZBASE_SHIFT (presumably element count
	   to byte offset; a stride of one element is compared against
	   2 * SIZE right below). */
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	/* Anything other than unit stride in both vectors goes to the
	   general strided path. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop
	cmp	INCY, 2 * SIZE
	bne	.LL50
	nop

/* ------------------------------------------------------------------ */
/* Unit-stride path                                                   */
/* ------------------------------------------------------------------ */
	/* I = N / 4: number of four-element blocks. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Pipeline prologue: load the first block of x and y. */
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], b1
	LDF	[Y +  1 * SIZE], b2
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4
	LDF	[Y +  2 * SIZE], b3
	LDF	[Y +  3 * SIZE], b4
	LDF	[X +  4 * SIZE], a5
	LDF	[X +  5 * SIZE], a6
	LDF	[Y +  4 * SIZE], b5
	LDF	[Y +  5 * SIZE], b6
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8
	LDF	[Y +  6 * SIZE], b7
	LDF	[Y +  7 * SIZE], b8

	/* Start the arithmetic for the first two elements. */
	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_R, a3, t3
	FMUL	ALPHA_R, a4, t4

	FADD	b1, t1, c1
	FMUL	ALPHA_I, a2, t1
	ADD2	b2, t2, c2
	FMUL	ALPHA_I, a1, t2

	/* With a single block there is nothing to overlap with; go
	   straight to the drain code. */
	deccc	I
	ble,pt	%icc, .LL12
	nop

#ifdef DOUBLE
#define PREFETCHSIZE  54
#else
#define PREFETCHSIZE 108
#endif

/* Main loop: finishes and stores the current block (offsets 0..7)
   while loading the next block (offsets 8..15) and starting its
   arithmetic.  Every register is reloaded only after its last use
   for the current block, so the instruction order matters. */
.LL11:
	FADD	b3, t3, c3
	prefetch [Y  + PREFETCHSIZE * SIZE], 0
	FMUL	ALPHA_I, a4, t3
	prefetch [X  + PREFETCHSIZE * SIZE], 0

	ADD2	b4, t4, c4
	LDF	[Y +  8 * SIZE], b1
	FMUL	ALPHA_I, a3, t4
	LDF	[X +  9 * SIZE], a2

	ADD1	c1, t1, c1
	LDF	[Y +  9 * SIZE], b2
	FMUL	ALPHA_R, a5, t1
	LDF	[X +  8 * SIZE], a1

	FADD	c2, t2, c2
	LDF	[Y + 10 * SIZE], b3
	FMUL	ALPHA_R, a6, t2
	LDF	[X + 11 * SIZE], a4

	ADD1	c3, t3, c3
	STF	c1, [Y +  0 * SIZE]
	FMUL	ALPHA_R, a7, t3
	LDF	[Y + 11 * SIZE], b4

	FADD	c4, t4, c4
	STF	c2, [Y +  1 * SIZE]
	FMUL	ALPHA_R, a8, t4
	LDF	[X + 10 * SIZE], a3

	FADD	b5, t1, c5
	STF	c3, [Y +  2 * SIZE]
	FMUL	ALPHA_I, a6, t1

	ADD2	b6, t2, c6
	STF	c4, [Y +  3 * SIZE]
	FMUL	ALPHA_I, a5, t2

	FADD	b7, t3, c7
	LDF	[Y + 12 * SIZE], b5
	FMUL	ALPHA_I, a8, t3
	LDF	[X + 13 * SIZE], a6

	ADD2	b8, t4, c8
	LDF	[Y + 13 * SIZE], b6
	FMUL	ALPHA_I, a7, t4
	LDF	[X + 12 * SIZE], a5

	ADD1	c5, t1, c5
	LDF	[Y + 14 * SIZE], b7
	FMUL	ALPHA_R, a1, t1
	LDF	[X + 15 * SIZE], a8

	FADD	c6, t2, c6
	LDF	[Y + 15 * SIZE], b8
	FMUL	ALPHA_R, a2, t2
	LDF	[X + 14 * SIZE], a7

	ADD1	c7, t3, c7
	STF	c5, [Y +  4 * SIZE]
	FMUL	ALPHA_R, a3, t3
	add	X, 8 * SIZE, X

	FADD	c8, t4, c8
	STF	c6, [Y +  5 * SIZE]
	FMUL	ALPHA_R, a4, t4
	deccc	I

	FADD	b1, t1, c1
	STF	c7, [Y +  6 * SIZE]
	FMUL	ALPHA_I, a2, t1

	ADD2	b2, t2, c2
	STF	c8, [Y +  7 * SIZE]
	FMUL	ALPHA_I, a1, t2

	/* Y is advanced in the delay slot, after the last store. */
	bg,pt	%icc, .LL11
	add	Y, 8 * SIZE, Y

/* Pipeline drain: finish and store the last loaded block. */
.LL12:
	FADD	b3, t3, c3
	FMUL	ALPHA_I, a4, t3
	ADD2	b4, t4, c4
	FMUL	ALPHA_I, a3, t4

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2

	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	FADD	b5, t1, c5
	FMUL	ALPHA_I, a6, t1
	ADD2	b6, t2, c6
	FMUL	ALPHA_I, a5, t2

	FADD	b7, t3, c7
	FMUL	ALPHA_I, a8, t3
	ADD2	b8, t4, c8
	FMUL	ALPHA_I, a7, t4

	ADD1	c5, t1, c5
	FADD	c6, t2, c6
	ADD1	c7, t3, c7
	FADD	c8, t4, c8

	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]
	STF	c3, [Y +  2 * SIZE]
	STF	c4, [Y +  3 * SIZE]
	STF	c5, [Y +  4 * SIZE]
	STF	c6, [Y +  5 * SIZE]
	STF	c7, [Y +  6 * SIZE]
	STF	c8, [Y +  7 * SIZE]

	add	X, 8 * SIZE, X
	add	Y, 8 * SIZE, Y

/* Remainder: I = N & 3 elements, one per iteration. */
.LL15:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], b1
	LDF	[Y +  1 * SIZE], b2

	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_I, a2, t3
	FMUL	ALPHA_I, a1, t4

	/* The counter update and compare are interleaved with the
	   floating-point adds; the condition codes set by cmp are
	   consumed by the bg below. */
	FADD	b1, t1, b1
	add	I, -1, I
	ADD2	b2, t2, b2
	cmp	I, 0
	ADD1	b1, t3, c1
	FADD	b2, t4, c2

	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]

	add	Y, 2 * SIZE, Y
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X

/* Unit-stride exit.  The delay slot of return clears %o0 so that this
   path returns 0 exactly like the strided exit at .LL59.  It used to
   be "clr %g0", which has no effect because writes to %g0 are
   discarded, leaving the return register unmodified. */
.LL19:
	return	%i7 + 8
	clr	%o0

/* ------------------------------------------------------------------ */
/* General strided path                                               */
/* ------------------------------------------------------------------ */
.LL50:
	/* I = N / 4.  The delay slot always executes, so YY is
	   initialised to Y whether or not the branch is taken. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	mov	Y, YY

	/* Pipeline prologue: load four elements of x and y, stepping
	   X and Y by their strides, and start the alpha_r products.
	   The counter decrement and compare are interleaved; their
	   condition codes feed the ble at the end of this group. */
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1
	LDF	[X +  1 * SIZE], a2
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b2
	add	Y, INCY, Y

	LDF	[X +  0 * SIZE], a3
	LDF	[Y +  0 * SIZE], b3
	LDF	[X +  1 * SIZE], a4
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b4
	add	Y, INCY, Y

	LDF	[X +  0 * SIZE], a5
	add	I, -1, I
	LDF	[Y +  0 * SIZE], b5
	LDF	[X +  1 * SIZE], a6
	cmp	I, 0
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b6
	add	Y, INCY, Y

	LDF	[X +  0 * SIZE], a7
	FMUL	ALPHA_R, a1, t1
	LDF	[Y +  0 * SIZE], b7
	FMUL	ALPHA_R, a2, t2
	LDF	[X +  1 * SIZE], a8
	FMUL	ALPHA_R, a3, t3
	add	X, INCX, X
	LDF	[Y +  1 * SIZE], b8
	FMUL	ALPHA_R, a4, t4

	ble,pt	%icc, .LL52
	add	Y, INCY, Y

/* Main strided loop: computes and stores four elements through YY
   while loading the next four through X and Y.  Only c1..c4 are used
   here; they are reused for the second pair of elements. */
.LL51:
	FADD	b1, t1, c1
	LDF	[Y +  0 * SIZE], b1
	FMUL	ALPHA_I, a2, t1
	LDF	[X +  1 * SIZE], a2
	ADD2	b2, t2, c2
	LDF	[Y +  1 * SIZE], b2
	add	Y, INCY, Y
	FMUL	ALPHA_I, a1, t2
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X

	FADD	b3, t3, c3
	LDF	[Y +  0 * SIZE], b3
	FMUL	ALPHA_I, a4, t3
	LDF	[X +  1 * SIZE], a4
	ADD2	b4, t4, c4
	LDF	[Y +  1 * SIZE], b4
	add	Y, INCY, Y
	FMUL	ALPHA_I, a3, t4
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2
	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	STF	c1, [YY +  0 * SIZE]
	FADD	b5, t1, c1
	FMUL	ALPHA_I, a6, t1
	STF	c2, [YY +  1 * SIZE]
	ADD2	b6, t2, c2
	FMUL	ALPHA_I, a5, t2
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	FADD	b7, t3, c3
	FMUL	ALPHA_I, a8, t3
	STF	c4, [YY +  1 * SIZE]
	ADD2	b8, t4, c4
	FMUL	ALPHA_I, a7, t4
	add	YY, INCY, YY

	LDF	[X +  0 * SIZE], a5
	ADD1	c1, t1, c1
	LDF	[Y +  0 * SIZE], b5
	FMUL	ALPHA_R, a1, t1
	LDF	[X +  1 * SIZE], a6
	add	X, INCX, X
	FADD	c2, t2, c2
	LDF	[Y +  1 * SIZE], b6
	add	Y, INCY, Y
	FMUL	ALPHA_R, a2, t2
	LDF	[X +  0 * SIZE], a7
	ADD1	c3, t3, c3
	LDF	[Y +  0 * SIZE], b7
	FMUL	ALPHA_R, a3, t3
	LDF	[X +  1 * SIZE], a8
	add	X, INCX, X
	FADD	c4, t4, c4
	LDF	[Y +  1 * SIZE], b8
	add	Y, INCY, Y
	FMUL	ALPHA_R, a4, t4

	STF	c1, [YY +  0 * SIZE]
	add	I, -1, I
	STF	c2, [YY +  1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	cmp	I, 0
	STF	c4, [YY +  1 * SIZE]

	bg,pt	%icc, .LL51
	add	YY, INCY, YY

/* Pipeline drain for the strided path: finish and store the last four
   loaded elements through YY. */
.LL52:
	FADD	b1, t1, c1
	FMUL	ALPHA_I, a2, t1
	ADD2	b2, t2, c2
	FMUL	ALPHA_I, a1, t2

	FADD	b3, t3, c3
	FMUL	ALPHA_I, a4, t3
	ADD2	b4, t4, c4
	FMUL	ALPHA_I, a3, t4

	ADD1	c1, t1, c1
	FMUL	ALPHA_R, a5, t1
	FADD	c2, t2, c2
	FMUL	ALPHA_R, a6, t2
	ADD1	c3, t3, c3
	FMUL	ALPHA_R, a7, t3
	FADD	c4, t4, c4
	FMUL	ALPHA_R, a8, t4

	STF	c1, [YY +  0 * SIZE]
	STF	c2, [YY +  1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	STF	c4, [YY +  1 * SIZE]
	add	YY, INCY, YY

	FADD	b5, t1, c1
	FMUL	ALPHA_I, a6, t1
	ADD2	b6, t2, c2
	FMUL	ALPHA_I, a5, t2
	FADD	b7, t3, c3
	FMUL	ALPHA_I, a8, t3
	ADD2	b8, t4, c4
	FMUL	ALPHA_I, a7, t4

	ADD1	c1, t1, c1
	FADD	c2, t2, c2
	ADD1	c3, t3, c3
	FADD	c4, t4, c4

	STF	c1, [YY +  0 * SIZE]
	STF	c2, [YY +  1 * SIZE]
	add	YY, INCY, YY
	STF	c3, [YY +  0 * SIZE]
	STF	c4, [YY +  1 * SIZE]
	add	YY, INCY, YY

/* Remainder for the strided path: I = N & 3 elements, one per
   iteration, loading and storing through Y (YY is no longer used). */
.LL55:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  0 * SIZE], b1
	LDF	[Y +  1 * SIZE], b2

	FMUL	ALPHA_R, a1, t1
	FMUL	ALPHA_R, a2, t2
	FMUL	ALPHA_I, a2, t3
	FMUL	ALPHA_I, a1, t4

	FADD	b1, t1, b1
	ADD2	b2, t2, b2
	ADD1	b1, t3, c1
	FADD	b2, t4, c2

	add	I, -1, I
	cmp	I, 0
	STF	c1, [Y +  0 * SIZE]
	STF	c2, [Y +  1 * SIZE]

	add	Y, INCY, Y
	bg,pt	%icc, .LL56
	add	X, INCX, X

/* Strided exit: return 0 (%o0 is cleared in the delay slot). */
.LL59:
	return	%i7 + 8
	clr	%o0

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -