⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zscal.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Complex vector scale, in place:  x[k] := alpha * x[k]  for N elements.
 *
 * Each element is a (real, imaginary) pair stored at [X + 0 * SIZE] and
 * [X + 1 * SIZE]; consecutive elements are INCX apart (INCX is shifted
 * left by ZBASE_SHIFT on entry and then used as an address increment).
 *
 * Four code paths, selected at run time:
 *   alpha == 0, contiguous  (.LL11 / .LL16)   : store FZERO over x
 *   alpha == 0, strided     (.LL51 / .LL56)   : store FZERO over x
 *   alpha != 0, contiguous  (.LL111 / .LL116) : software-pipelined multiply
 *   alpha != 0, strided     (.LL151 / .LL156) : 4-way unrolled multiply
 * "contiguous" means the scaled INCX equals 2 * SIZE.
 *
 * Every path ends with "return %i7 + 8" with "clr %o0" in the delay
 * slot, i.e. the routine returns 0.
 *
 * NOTE(review): PROLOGUE, SAVESP, EPILOGUE, LDF, STF, FMOV, FCMP, FMUL,
 * FADD, FSUB, SIZE, ZBASE_SHIFT and STACK_START come from common.h and
 * are not visible here; comments below assume SIZE is the byte size of
 * one real value and that SAVESP opens a register window - confirm.
 *
 * All conditional branches have a delay slot; the instruction that
 * follows a branch is part of the branch's block, not of the fall-through.
 */

#define ASSEMBLER
#include "common.h"

/* ---- integer registers ------------------------------------------- */
#define N	%i0		/* element count */

#if defined(DOUBLE) && !defined(__64BIT__)
#define X	%i3		/* reloaded from the stack below */
#define INCX	%i4
#else
#define X	%i5
#define INCX	%i3
#endif

#define I	%i1		/* loop counter */
#define XX	%i2		/* strided path: store pointer; contiguous path: tail count */

/* ---- floating-point registers ------------------------------------ */
/* c1-c8: loaded values; t1-t8: products; s1-s8 (aliases of c9-c16):  */
/* results waiting to be stored.                                      */
#ifdef DOUBLE
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define c5	%f8
#define c6	%f10
#define c7	%f12
#define c8	%f14

#define t1	%f16
#define t2	%f18
#define t3	%f20
#define t4	%f22
#define t5	%f24
#define t6	%f26
#define t7	%f28
#define t8	%f30

#define c9	%f32
#define c10	%f34
#define c11	%f36
#define c12	%f38
#define c13	%f40
#define c14	%f42
#define c15	%f44
#define c16	%f46

#define s1	%f32
#define s2	%f34
#define s3	%f36
#define s4	%f38
#define s5	%f40
#define s6	%f42
#define s7	%f44
#define s8	%f46

#define FZERO	%f48
#define ALPHA_R	%f50
#define ALPHA_I	%f52
#else
#define c1	%f0
#define c2	%f1
#define c3	%f2
#define c4	%f3
#define c5	%f4
#define c6	%f5
#define c7	%f6
#define c8	%f7

#define c9	%f8
#define c10	%f9
#define c11	%f10
#define c12	%f11
#define c13	%f12
#define c14	%f13
#define c15	%f14
#define c16	%f15

#define s1	%f8
#define s2	%f9
#define s3	%f10
#define s4	%f11
#define s5	%f12
#define s6	%f13
#define s7	%f14
#define s8	%f15

#define t1	%f16
#define t2	%f17
#define t3	%f18
#define t4	%f19
#define t5	%f20
#define t6	%f21
#define t7	%f22
#define t8	%f23

#define FZERO	%f24
#define ALPHA_R	%f25
#define ALPHA_I	%f26
#endif

#define PREFETCHSIZE 128

	PROLOGUE
	SAVESP

/* ---- argument marshalling ---------------------------------------- */
/* Goal: ALPHA_R / ALPHA_I / FZERO in FP registers, X and INCX in     */
/* integer registers.  A zero is materialised by storing %g0 to the   */
/* stack and loading it back as a floating-point value.               */
#ifndef __64BIT__
#ifdef DOUBLE
	/* alpha arrives in integer registers %i3..%i5; spill them so    */
	/* they can be reloaded with LDF.  These stores must precede the */
	/* loads of X / INCX, which reuse %i3 / %i4.                     */
	st	%g0, [%fp + STACK_START +  8]
	st	%g0, [%fp + STACK_START + 12]
	st	%i3, [%fp + STACK_START + 16]
	st	%i4, [%fp + STACK_START + 20]
	st	%i5, [%fp + STACK_START + 24]

	ld	[%fp+ STACK_START + 32], X
	ld	[%fp+ STACK_START + 36], INCX
#else
	st	%g0, [%fp + STACK_START +  8]
	st	%i3, [%fp + STACK_START + 16]
	st	%i4, [%fp + STACK_START + 24]
	ld	[%fp+  STACK_START + 28], INCX
#endif
	LDF	[%fp + STACK_START +  8], FZERO
	LDF	[%fp + STACK_START + 16], ALPHA_R
	LDF	[%fp + STACK_START + 24], ALPHA_I
#else
#ifdef DOUBLE
	stx	%g0, [%fp + STACK_START + 32]
	FMOV	%f6, ALPHA_R
	FMOV	%f8, ALPHA_I
#else
	st	%g0, [%fp + STACK_START + 32]
	FMOV	%f7, ALPHA_R
	FMOV	%f9, ALPHA_I
#endif
	ldx	[%fp + STACK_START + 56], INCX
	LDF	[%fp + STACK_START + 32], FZERO
#endif

/* ---- dispatch ----------------------------------------------------- */
	FCMP	ALPHA_R, FZERO
	fbne	.LL100
	sll	INCX, ZBASE_SHIFT, INCX	/* delay slot: runs on both outcomes */

	FCMP	ALPHA_I, FZERO
	fbne	.LL100
	nop

	/* alpha == 0 from here on */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

/* ---- alpha == 0, contiguous --------------------------------------- */
	sra	N, 2, I			/* groups of 4 elements */
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

.LL11:
	prefetch [X  + PREFETCHSIZE * SIZE], 0

	STF	FZERO, [X +  0 * SIZE]
	add	I, -1, I
	STF	FZERO, [X +  1 * SIZE]
	cmp	I, 0
	STF	FZERO, [X +  2 * SIZE]
	STF	FZERO, [X +  3 * SIZE]
	STF	FZERO, [X +  4 * SIZE]
	STF	FZERO, [X +  5 * SIZE]
	add	X, 8 * SIZE, X
	STF	FZERO, [X -  2 * SIZE]	/* X already advanced: slots 6 and 7 */
	bg,pt	%icc, .LL11
	STF	FZERO, [X -  1 * SIZE]

.LL15:
	and	N, 3, I			/* leftover elements */
	cmp	I,  0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	STF	FZERO, [X +  0 * SIZE]
	STF	FZERO, [X +  1 * SIZE]

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL16
	add	X, 2 * SIZE, X

.LL19:
	return	%i7 + 8
	clr	%o0

/* ---- alpha == 0, strided ------------------------------------------ */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

.LL51:
	STF	FZERO, [X +  0 * SIZE]
	add	I, -1, I
	STF	FZERO, [X +  1 * SIZE]
	add	X, INCX, X
	STF	FZERO, [X +  0 * SIZE]
	cmp	I, 0
	STF	FZERO, [X +  1 * SIZE]
	add	X, INCX, X
	STF	FZERO, [X +  0 * SIZE]
	STF	FZERO, [X +  1 * SIZE]
	add	X, INCX, X
	STF	FZERO, [X +  0 * SIZE]
	STF	FZERO, [X +  1 * SIZE]
	bg,pt	%icc, .LL51
	add	X, INCX, X

.LL55:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	STF	FZERO, [X +  0 * SIZE]
	add	I, -1, I
	STF	FZERO, [X +  1 * SIZE]
	cmp	I, 0
	bg,pt	%icc, .LL56
	add	X, INCX, X

.LL59:
	return	%i7 + 8
	clr	%o0

/* ---- alpha != 0, contiguous --------------------------------------- */
/* The section between here and .LL115 is software-pipelined: while   */
/* one group of 4 elements is multiplied and stored, the first two    */
/* elements of the next group are already being loaded.  Running it   */
/* for I groups therefore reads 8 * I + 4 values, i.e. 4 * I + 2      */
/* elements.  I is chosen as floor((N - 2) / 4) so that 4 * I + 2 <= N */
/* and no load goes past the end of x.  The remaining N - 4 * I       */
/* elements (N itself when I <= 0) are kept in XX and processed one   */
/* at a time at .LL116, which computes exactly the same expressions.  */
.LL100:
	cmp	INCX, 2 * SIZE
	bne	.LL150
	sub	N, 2, I			/* delay slot; .LL150 recomputes I itself */

	sra	I, 2, I			/* I = floor((N - 2) / 4) */
	cmp	I, 0
	ble,pn	%icc, .LL115
	mov	N, XX			/* delay slot: tail = N if nothing is pipelined */

	sll	I, 2, XX
	sub	N, XX, XX		/* tail = N - 4 * I */

	/* pipeline fill: first group, then start on it while peeking ahead */
	LDF	[X +  0 * SIZE], c1
	LDF	[X +  1 * SIZE], c2
	LDF	[X +  2 * SIZE], c3
	LDF	[X +  3 * SIZE], c4
	LDF	[X +  4 * SIZE], c5
	LDF	[X +  5 * SIZE], c6
	LDF	[X +  6 * SIZE], c7
	LDF	[X +  7 * SIZE], c8

	FMUL	ALPHA_R, c1, t1
	FMUL	ALPHA_I, c2, t3

	FMUL	ALPHA_I, c1, t2
	LDF	[X +  8 * SIZE], c1
	FMUL	ALPHA_R, c2, t4
	LDF	[X +  9 * SIZE], c2

	FMUL	ALPHA_R, c3, t5
	deccc	I			/* sets %icc for the ble below */
	FMUL	ALPHA_I, c4, t7
	FSUB	t1,  t3,  s1		/* real: ar*xr - ai*xi */

	FMUL	ALPHA_I, c3, t6
	LDF	[X + 10 * SIZE], c3
	FMUL	ALPHA_R, c4, t8
	LDF	[X + 11 * SIZE], c4
	FADD	t4,  t2,  s2		/* imag: ar*xi + ai*xr */

	ble,pn	%icc, .LL112
	nop

.LL111:
	prefetch [X  + PREFETCHSIZE * SIZE], 0

	FMUL	ALPHA_R, c5, t1
	FMUL	ALPHA_I, c6, t3
	FSUB	t5,  t7,  s3
	STF	s1, [X +  0 * SIZE]

	FMUL	ALPHA_I, c5, t2
	LDF	[X + 12 * SIZE], c5
	FMUL	ALPHA_R, c6, t4
	LDF	[X + 13 * SIZE], c6

	FADD	t8,  t6,  s4
	STF	s2, [X +  1 * SIZE]

	FMUL	ALPHA_R, c7, t5
	FMUL	ALPHA_I, c8, t7
	FSUB	t1,  t3,  s5
	STF	s3, [X +  2 * SIZE]

	FMUL	ALPHA_I, c7, t6
	LDF	[X + 14 * SIZE], c7
	FMUL	ALPHA_R, c8, t8
	LDF	[X + 15 * SIZE], c8

	FADD	t4,  t2,  s6
	STF	s4, [X +  3 * SIZE]

	FMUL	ALPHA_R, c1, t1
	FMUL	ALPHA_I, c2, t3
	FSUB	t5,  t7,  s7
	STF	s5, [X +  4 * SIZE]

	FMUL	ALPHA_I, c1, t2
	LDF	[X + 16 * SIZE], c1
	FMUL	ALPHA_R, c2, t4
	LDF	[X + 17 * SIZE], c2

	FADD	t8,  t6,  s8
	STF	s6, [X +  5 * SIZE]

	FMUL	ALPHA_R, c3, t5
	deccc	I
	FMUL	ALPHA_I, c4, t7
	FSUB	t1,  t3,  s1
	STF	s7, [X +  6 * SIZE]

	FMUL	ALPHA_I, c3, t6
	LDF	[X + 18 * SIZE], c3
	FMUL	ALPHA_R, c4, t8
	LDF	[X + 19 * SIZE], c4

	FADD	t4,  t2,  s2
	STF	s8, [X +  7 * SIZE]

	bg,pt	%icc, .LL111
	add	X, 8 * SIZE, X

	/* pipeline drain: finish the group whose first half is in s1/s2, */
	/* t5-t8 and whose second half is in c5-c8; no further loads.     */
.LL112:
	FMUL	ALPHA_R, c5, t1
	FMUL	ALPHA_I, c6, t3
	FSUB	t5,  t7,  s3
	STF	s1, [X +  0 * SIZE]

	FMUL	ALPHA_I, c5, t2
	FMUL	ALPHA_R, c6, t4
	FADD	t8,  t6,  s4
	STF	s2, [X +  1 * SIZE]

	FMUL	ALPHA_R, c7, t5
	FMUL	ALPHA_I, c8, t7
	FSUB	t1,  t3,  s5
	STF	s3, [X +  2 * SIZE]

	FMUL	ALPHA_I, c7, t6
	FMUL	ALPHA_R, c8, t8
	FADD	t4,  t2,  s6
	STF	s4, [X +  3 * SIZE]

	FSUB	t5,  t7,  s7
	FADD	t8,  t6,  s8

	STF	s5, [X +  4 * SIZE]
	STF	s6, [X +  5 * SIZE]
	STF	s7, [X +  6 * SIZE]
	STF	s8, [X +  7 * SIZE]
	add	X, 8 * SIZE, X

.LL115:
	mov	XX, I			/* elements not covered by the pipelined section */
	cmp	I,  0
	ble,a,pn %icc, .LL119
	nop

.LL116:
	LDF	[X +  0 * SIZE], c1
	LDF	[X +  1 * SIZE], c2

	FMUL	ALPHA_R, c1, c3
	FMUL	ALPHA_I, c1, c4
	FMUL	ALPHA_I, c2, c1
	FMUL	ALPHA_R, c2, c2

	FSUB	c3, c1, c1
	FADD	c2, c4, c2

	STF	c1, [X +  0 * SIZE]
	STF	c2, [X +  1 * SIZE]

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL116
	add	X, 2 * SIZE, X

.LL119:
	return	%i7 + 8
	clr	%o0

/* ---- alpha != 0, strided ------------------------------------------ */
/* X walks ahead as the load pointer, XX follows as the store pointer; */
/* both advance by 4 * INCX per iteration, so X is correct for the     */
/* leftover loop at .LL156.                                            */
.LL150:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL155
	mov	X, XX			/* delay slot */

.LL151:
	LDF	[X +  0 * SIZE], c1
	LDF	[X +  1 * SIZE], c2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], c3
	FMUL	ALPHA_R, c1, c9
	LDF	[X +  1 * SIZE], c4
	FMUL	ALPHA_I, c1, c10
	add	X, INCX, X
	LDF	[X +  0 * SIZE], c5
	FMUL	ALPHA_I, c2, c1
	LDF	[X +  1 * SIZE], c6
	FMUL	ALPHA_R, c2, c2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], c7
	FMUL	ALPHA_R, c3, c11
	LDF	[X +  1 * SIZE], c8
	FMUL	ALPHA_I, c3, c12
	add	X, INCX, X

	FMUL	ALPHA_I, c4, c3
	FMUL	ALPHA_R, c4, c4

	FMUL	ALPHA_R, c5, c13
	FMUL	ALPHA_I, c5, c14
	FMUL	ALPHA_I, c6, c5
	FMUL	ALPHA_R, c6, c6

	FMUL	ALPHA_R, c7, c15
	FSUB	c9,  c1,  c1
	FMUL	ALPHA_I, c7, c16
	FADD	c2,  c10, c2
	FMUL	ALPHA_I, c8, c7
	FSUB	c11, c3,  c3
	FMUL	ALPHA_R, c8, c8
	FADD	c4,  c12, c4

	STF	c1, [XX +  0 * SIZE]
	FSUB	c13, c5,  c5
	add	I, -1, I
	STF	c2, [XX +  1 * SIZE]
	FADD	c6,  c14, c6
	add	XX, INCX, XX
	STF	c3, [XX +  0 * SIZE]
	FSUB	c15, c7,  c7
	cmp	I, 0
	STF	c4, [XX +  1 * SIZE]
	FADD	c8,  c16, c8
	add	XX, INCX, XX
	STF	c5, [XX +  0 * SIZE]
	STF	c6, [XX +  1 * SIZE]
	add	XX, INCX, XX
	STF	c7, [XX +  0 * SIZE]
	STF	c8, [XX +  1 * SIZE]
	bg,pt	%icc, .LL151
	add	XX, INCX, XX

.LL155:
	and	N, 3, I
	cmp	I,  0
	ble,a,pn %icc, .LL159
	nop

.LL156:
	LDF	[X +  0 * SIZE], c1
	LDF	[X +  1 * SIZE], c2

	FMUL	ALPHA_R, c1, c3
	FMUL	ALPHA_I, c1, c4
	FMUL	ALPHA_I, c2, c1
	FMUL	ALPHA_R, c2, c2

	FSUB	c3, c1, c1
	FADD	c2, c4, c2

	STF	c1, [X +  0 * SIZE]
	STF	c2, [X +  1 * SIZE]

	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL156
	add	X, INCX, X

.LL159:
	return	%i7 + 8
	clr	%o0

	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -