⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemv_t.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#define ASSEMBLER#include "common.h"#define P 4000#define M	%i0#define N	%i1#define A	%i5#define LDA	%i2#define X	%i3#define INCX	%i4	#define Y	%l0#define INCY	%l1#define BUFFER	%l2#define I	%l3#define IS	%l4#define J	%l5#define MIN_M	%l6#define XP	%l7#define A1	%o0#define A2	%o1#define A3	%o2#define A4	%o3#define X1	%o4#define Y1	%o5#define PNLDA	%g1#define Y2	%o7	/* Danger? */#ifdef DOUBLE#define t1	%f0#define	t2 	%f2#define t3	%f4#define	t4 	%f6#define c1	%f8#define c2	%f10#define c3	%f12#define c4	%f14#define c5	%f16#define c6	%f18#define c7	%f20#define c8	%f22#define c9	%f24#define c10	%f26#define c11	%f28#define c12	%f30#define c13	%f32#define c14	%f34#define c15	%f36#define c16	%f38#define a1	%f40#define a2	%f42#define a3	%f44#define a4	%f46#define a5	%f48#define a6	%f50#define a7	%f52#define a8	%f54#define b1	%f56#define b2	%f58#define b3	%f60#define b4	%f62#else#define t1	%f0#define	t2 	%f1#define t3	%f2#define	t4 	%f3#define c1	%f4#define c2	%f5#define c3	%f6#define c4	%f7#define c5	%f8#define c6	%f9#define c7	%f10#define c8	%f11#define c9	%f12#define c10	%f13#define c11	%f14#define c12	%f15#define c13	%f16#define c14	%f17#define c15	%f18#define c16	%f19#define a1	%f20#define a2	%f21#define a3	%f22#define a4	%f23#define a5	%f24#define a6	%f25#define a7	%f26#define a8	%f27#define b1	%f28#define b2	%f29#define b3	%f30#define b4	%f31#endif#ifndef __64BIT__#define FZERO	[%fp + STACK_START +  8]#define ALPHA_R	[%fp + STACK_START + 16]#ifndef DOUBLE#define ALPHA_I	[%fp + STACK_START + 20]#else#define ALPHA_I	[%fp + STACK_START + 24]#endif#else#define FZERO	[%fp + STACK_START + 24]#define ALPHA_R	[%fp + STACK_START + 32]#define ALPHA_I	[%fp + STACK_START + 40]#endif#ifdef DOUBLE#define PREFETCHSIZE 18#else#define PREFETCHSIZE 36#endif	PROLOGUE	SAVESP	nop#ifndef __64BIT__#ifdef DOUBLE	st	%g0, [%fp + STACK_START +  8]	st	%g0, [%fp + STACK_START + 12]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA_R */	st	%i4, [%fp + STACK_START + 20]	st	%i5, [%fp + STACK_START + 24]   /* ALPHA_I */	ld	[%fp + STACK_START + 32], A	ld	[%fp + STACK_START + 36], LDA	ld	[%fp + STACK_START + 40], X	ld	[%fp + STACK_START + 44], INCX	ld	[%fp + STACK_START + 48], Y	ld	[%fp + STACK_START + 52], INCY	ld	[%fp + STACK_START + 56], BUFFER#else	st	%g0, [%fp + STACK_START +  8]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA_R */	st	%i4, [%fp + STACK_START + 20]   /* ALPHA_I */	ld	[%fp + STACK_START + 28], LDA	ld	[%fp + STACK_START + 32], X	ld	[%fp + STACK_START + 36], INCX	ld	[%fp + STACK_START + 40], Y	ld	[%fp + STACK_START + 44], INCY	ld	[%fp + STACK_START + 48], BUFFER#endif#else#ifdef DOUBLE	stx	%g0, FZERO	std	%f6, ALPHA_R	std	%f8, ALPHA_I#else	st	%g0, FZERO	st	%f7, ALPHA_R	st	%f9, ALPHA_I#endif	ldx	[%fp+  STACK_START + 56], LDA	ldx	[%fp+  STACK_START + 64], X	ldx	[%fp+  STACK_START + 72], INCX	ldx	[%fp+  STACK_START + 80], Y	ldx	[%fp+  STACK_START + 88], INCY	ldx	[%fp+  STACK_START + 96], BUFFER#endif	clr	IS	mov	P, I	sll	LDA, ZBASE_SHIFT, LDA	sll	I, ZBASE_SHIFT, I	smul	LDA, N, PNLDA	sll	INCX, ZBASE_SHIFT, INCX	sll	INCY, ZBASE_SHIFT, INCY	sub	I, PNLDA, PNLDA.LL10:	sll	IS, ZBASE_SHIFT, I	sub	M, IS, MIN_M	mov	P, J	cmp	MIN_M, J	nop	movg	%icc, J, MIN_M	nop	cmp	INCX, 2 * SIZE	beq	.LL100	add	X, I, XP	sra	MIN_M, 2, I	mov	BUFFER, XP	cmp	I, 0	ble,pn	%icc, .LL15	mov	BUFFER, Y1.LL11:	LDF	[X + 0 * SIZE], a1	LDF	[X + 1 * SIZE], a2	add	X, INCX, X	LDF	[X + 0 * SIZE], a3	LDF	[X + 1 * SIZE], a4	add	X, INCX, X	LDF	[X + 0 * SIZE], a5	LDF	[X + 1 * SIZE], a6	add	X, INCX, X	LDF	[X + 0 * SIZE], a7	LDF	[X + 1 * SIZE], a8	add	X, INCX, X	STF	a1, [Y1 + 0 * SIZE]	add	I, -1, I	STF	a2, [Y1 + 1 * SIZE]	cmp	I, 0	STF	a3, [Y1 + 2 * SIZE]	STF	a4, [Y1 + 3 * SIZE]	STF	a5, [Y1 + 4 * SIZE]	STF	a6, [Y1 + 5 * SIZE]	STF	a7, [Y1 + 6 * SIZE]	STF	a8, [Y1 + 7 * SIZE]	bg,pn	%icc, .LL11	add	Y1, 8 * SIZE, Y1.LL15:	and	MIN_M, 3, I	cmp	I, 0	ble,pn	%icc, .LL100	nop.LL16:	LDF	[X + 0 * SIZE], a1	LDF	[X + 1 * SIZE], a2	add	X, INCX, X	add	I, -1, I	cmp	I, 0	nop	STF	a1, [Y1 + 0 * SIZE]	STF	a2, [Y1 + 1 * SIZE]	bg,pn	%icc, .LL16	add	Y1, 2 * SIZE, Y1.LL100:	sra	N, 2, J	cmp	J, 0	ble	%icc, .LL200	mov	Y, Y1.LL110:	LDF	FZERO, t1	FMOV	t1, c1	sra	MIN_M, 2, I	FMOV	t1, c2	add	A,  LDA, A2	FMOV	t1, c3	mov	A,  A1	FMOV	t1, c4	add	A2, LDA, A3	FMOV	t1, c5	FMOV	t1, c6	FMOV	t1, c7	FMOV	t1, c8	FMOV	t1, c9	FMOV	t1, c10	FMOV	t1, c11	FMOV	t1, c12	FMOV	t1, c13	FMOV	t1, c14	FMOV	t1, c15	FMOV	t1, c16	add	A3, LDA, A4	FMOV	t1, t2	mov	XP, X1	FMOV	t1, t3	add	A4, LDA, A	cmp	I, 0	ble	%icc, .LL115	FMOV	t1, t4	LDF	[A1 + 0 * SIZE], a1	nop	LDF	[A1 + 1 * SIZE], a2	add	A1, 2 * SIZE, A1	LDF	[A2 + 0 * SIZE], a3	LDF	[A2 + 1 * SIZE], a4	add	A2, 2 * SIZE, A2	LDF	[A3 + 0 * SIZE], a5	LDF	[A3 + 1 * SIZE], a6	add	A3, 2 * SIZE, A3	LDF	[A4 + 0 * SIZE], a7	LDF	[A4 + 1 * SIZE], a8	add	A4, 2 * SIZE, A4	LDF	[X1 + 0 * SIZE], b1	nop	LDF	[X1 + 1 * SIZE], b2	nop	LDF	[X1 + 2 * SIZE], b3	add	X1, 4 * SIZE, X1	deccc	 I	ble	 .LL112	prefetch [Y1 + 7 * SIZE], 2#ifndef XCONJ#define FADDX	FADD#else#define FADDX	FSUB#endif.LL111:	FADD	c13, t1, c13	prefetch [A1 + PREFETCHSIZE * SIZE], 1	FMUL	a1, b1, t1	nop	FADDX	c14, t2, c14	nop	FMUL	a1, b2, t2	LDF	[A1 + 0 * SIZE], a1	FADD	c15, t3, c15	nop	FMUL	a2, b1, t3	LDF	[X1 - 1 * SIZE], b4	FADD	c16, t4, c16	nop	FMUL	a2, b2, t4	LDF	[A1 + 1 * SIZE], a2	FADD	c1, t1, c1	nop	FMUL	a3, b1, t1	nop	FADDX	c2, t2, c2	nop	FMUL	a3, b2, t2	LDF	[A2 + 0 * SIZE], a3	FADD	c3, t3, c3	nop	FMUL	a4, b1, t3	nop	FADD	c4, t4, c4	nop	FMUL	a4, b2, t4	LDF	[A2 + 1 * SIZE], a4	FADD	c5, t1, c5	nop	FMUL	a5, b1, t1	nop	FADDX	c6, t2, c6	nop	FMUL	a5, b2, t2	LDF	[A3 + 0 * SIZE], a5	FADD	c7, t3, c7	nop	FMUL	a6, b1, t3	nop	FADD	c8, t4, c8	nop	FMUL	a6, b2, t4	LDF	[A3 + 1 * SIZE], a6	FADD	c9, t1, c9	nop	FMUL	a7, b1, t1	nop	FADDX	c10, t2, c10	nop	FMUL	a7, b2, t2	LDF	[A4 + 0 * SIZE], a7	FADD	c11, t3, c11	nop	FMUL	a8, b1, t3	LDF	[X1 + 0 * SIZE], b1	FADD	c12, t4, c12	nop	FMUL	a8, b2, t4	LDF	[A4 + 1 * SIZE], a8	FADD	c13, t1, c13	nop	FMUL	a1, b3, t1	prefetch [A2 + PREFETCHSIZE * SIZE], 1	FADDX	c14, t2, c14	nop	FMUL	a1, b4, t2	LDF	[A1 + 2 * SIZE], a1	FADD	c15, t3, c15	nop	FMUL	a2, b3, t3	LDF	[X1 + 1 * SIZE], b2	FADD	c16, t4, c16	nop	FMUL	a2, b4, t4	LDF	[A1 + 3 * SIZE], a2	FADD	c1, t1, c1	nop	FMUL	a3, b3, t1	nop	FADDX	c2, t2, c2	nop	FMUL	a3, b4, t2	LDF	[A2 + 2 * SIZE], a3	FADD	c3, t3, c3	nop	FMUL	a4, b3, t3	nop	FADD	c4, t4, c4	nop	FMUL	a4, b4, t4	LDF	[A2 + 3 * SIZE], a4	FADD	c5, t1, c5	nop	FMUL	a5, b3, t1	nop	FADDX	c6, t2, c6	nop	FMUL	a5, b4, t2	LDF	[A3 + 2 * SIZE], a5	FADD	c7, t3, c7	nop	FMUL	a6, b3, t3	nop	FADD	c8, t4, c8	nop	FMUL	a6, b4, t4	LDF	[A3 + 3 * SIZE], a6	FADD	c9, t1, c9	nop	FMUL	a7, b3, t1	nop	FADDX	c10, t2, c10	nop	FMUL	a7, b4, t2	LDF	[A4 + 2 * SIZE], a7	FADD	c11, t3, c11	nop	FMUL	a8, b3, t3	LDF	[X1 + 2 * SIZE], b3	FADD	c12, t4, c12	nop	FMUL	a8, b4, t4	LDF	[A4 + 3 * SIZE], a8	FADD	c13, t1, c13	prefetch [A3 + PREFETCHSIZE * SIZE], 1	FMUL	a1, b1, t1	nop	FADDX	c14, t2, c14	nop	FMUL	a1, b2, t2	LDF	[A1 + 4 * SIZE], a1	FADD	c15, t3, c15	nop	FMUL	a2, b1, t3	LDF	[X1 + 3 * SIZE], b4	FADD	c16, t4, c16	nop	FMUL	a2, b2, t4	LDF	[A1 + 5 * SIZE], a2	FADD	c1, t1, c1	nop	FMUL	a3, b1, t1	nop	FADDX	c2, t2, c2	nop	FMUL	a3, b2, t2	LDF	[A2 + 4 * SIZE], a3	FADD	c3, t3, c3	nop	FMUL	a4, b1, t3	nop	FADD	c4, t4, c4	nop	FMUL	a4, b2, t4	LDF	[A2 + 5 * SIZE], a4	FADD	c5, t1, c5	nop	FMUL	a5, b1, t1	nop	FADDX	c6, t2, c6	nop	FMUL	a5, b2, t2	LDF	[A3 + 4 * SIZE], a5	FADD	c7, t3, c7	deccc	I	FMUL	a6, b1, t3	nop	FADD	c8, t4, c8	nop	FMUL	a6, b2, t4	LDF	[A3 + 5 * SIZE], a6	FADD	c9, t1, c9	nop	FMUL	a7, b1, t1	nop	FADDX	c10, t2, c10	nop	FMUL	a7, b2, t2	LDF	[A4 + 4 * SIZE], a7	FADD	c11, t3, c11	nop	FMUL	a8, b1, t3	LDF	[X1 + 4 * SIZE], b1	FADD	c12, t4, c12	nop	FMUL	a8, b2, t4	LDF	[A4 + 5 * SIZE], a8	FADD	c13, t1, c13	prefetch [A4 + PREFETCHSIZE * SIZE], 1	FMUL	a1, b3, t1	nop	FADDX	c14, t2, c14	nop	FMUL	a1, b4, t2	LDF	[A1 + 6 * SIZE], a1	FADD	c15, t3, c15	nop	FMUL	a2, b3, t3	LDF	[X1 + 5 * SIZE], b2	FADD	c16, t4, c16	nop	FMUL	a2, b4, t4	LDF	[A1 + 7 * SIZE], a2	FADD	c1, t1, c1	add	A1, 8 * SIZE, A1	FMUL	a3, b3, t1	nop	FADDX	c2, t2, c2	nop	FMUL	a3, b4, t2	LDF	[A2 + 6 * SIZE], a3	FADD	c3, t3, c3	nop	FMUL	a4, b3, t3	nop	FADD	c4, t4, c4	nop	FMUL	a4, b4, t4	LDF	[A2 + 7 * SIZE], a4	FADD	c5, t1, c5	add	A2, 8 * SIZE, A2	FMUL	a5, b3, t1	nop	FADDX	c6, t2, c6	nop	FMUL	a5, b4, t2	LDF	[A3 + 6 * SIZE], a5	FADD	c7, t3, c7	add	A4, 8 * SIZE, A4	FMUL	a6, b3, t3	nop	FADD	c8, t4, c8	nop	FMUL	a6, b4, t4	LDF	[A3 + 7 * SIZE], a6	FADD	c9, t1, c9	add	A3, 8 * SIZE, A3	FMUL	a7, b3, t1	nop	FADDX	c10, t2, c10	add	X1, 8 * SIZE, X1	FMUL	a7, b4, t2	LDF	[A4 - 2 * SIZE], a7	FADD	c11, t3, c11	nop	FMUL	a8, b3, t3	LDF	[X1 - 2 * SIZE], b3	FADD	c12, t4, c12	FMUL	a8, b4, t4	bg,pn	%icc, .LL111	LDF	[A4 - 1 * SIZE], a8.LL112:	FADD	c13, t1, c13	nop	FMUL	a1, b1, t1	LDF	[X1 - 1 * SIZE], b4	FADDX	c14, t2, c14	nop	FMUL	a1, b2, t2	LDF	[A1 + 0 * SIZE], a1	FADD	c15, t3, c15	nop	FMUL	a2, b1, t3	LDF	[X1 - 1 * SIZE], b4	FADD	c16, t4, c16	nop	FMUL	a2, b2, t4	LDF	[A1 + 1 * SIZE], a2	FADD	c1, t1, c1	nop	FMUL	a3, b1, t1	nop	FADDX	c2, t2, c2	nop	FMUL	a3, b2, t2	LDF	[A2 + 0 * SIZE], a3	FADD	c3, t3, c3	nop	FMUL	a4, b1, t3	nop	FADD	c4, t4, c4	nop	FMUL	a4, b2, t4	LDF	[A2 + 1 * SIZE], a4	FADD	c5, t1, c5	nop	FMUL	a5, b1, t1	nop	FADDX	c6, t2, c6	nop	FMUL	a5, b2, t2	LDF	[A3 + 0 * SIZE], a5	FADD	c7, t3, c7	nop	FMUL	a6, b1, t3	nop	FADD	c8, t4, c8	nop	FMUL	a6, b2, t4	LDF	[A3 + 1 * SIZE], a6	FADD	c9, t1, c9	nop	FMUL	a7, b1, t1	nop	FADDX	c10, t2, c10	nop	FMUL	a7, b2, t2	LDF	[A4 + 0 * SIZE], a7	FADD	c11, t3, c11	nop	FMUL	a8, b1, t3	LDF	[X1 + 0 * SIZE], b1	FADD	c12, t4, c12	nop	FMUL	a8, b2, t4	LDF	[A4 + 1 * SIZE], a8	FADD	c13, t1, c13	nop	FMUL	a1, b3, t1	LDF	[X1 + 1 * SIZE], b2	FADDX	c14, t2, c14	nop	FMUL	a1, b4, t2	LDF	[A1 + 2 * SIZE], a1	FADD	c15, t3, c15	nop	FMUL	a2, b3, t3	nop	FADD	c16, t4, c16	nop	FMUL	a2, b4, t4	LDF	[A1 + 3 * SIZE], a2	FADD	c1, t1, c1	nop	FMUL	a3, b3, t1	nop	FADDX	c2, t2, c2	nop	FMUL	a3, b4, t2	LDF	[A2 + 2 * SIZE], a3	FADD	c3, t3, c3	nop	FMUL	a4, b3, t3	nop	FADD	c4, t4, c4	nop	FMUL	a4, b4, t4	LDF	[A2 + 3 * SIZE], a4	FADD	c5, t1, c5	nop	FMUL	a5, b3, t1	nop	FADDX	c6, t2, c6	nop	FMUL	a5, b4, t2	LDF	[A3 + 2 * SIZE], a5	FADD	c7, t3, c7	nop	FMUL	a6, b3, t3	nop	FADD	c8, t4, c8	nop	FMUL	a6, b4, t4	LDF	[A3 + 3 * SIZE], a6	FADD	c9, t1, c9	nop	FMUL	a7, b3, t1	nop	FADDX	c10, t2, c10	nop	FMUL	a7, b4, t2	LDF	[A4 + 2 * SIZE], a7	FADD	c11, t3, c11	nop	FMUL	a8, b3, t3	LDF	[X1 + 2 * SIZE], b3	FADD	c12, t4, c12	nop	FMUL	a8, b4, t4	LDF	[A4 + 3 * SIZE], a8	FADD	c13, t1, c13	nop	FMUL	a1, b1, t1	LDF	[X1 + 3 * SIZE], b4	FADDX	c14, t2, c14	add	X1, 4 * SIZE, X1	FMUL	a1, b2, t2	LDF	[A1 + 4 * SIZE], a1	FADD	c15, t3, c15	nop	FMUL	a2, b1, t3	nop	FADD	c16, t4, c16	nop	FMUL	a2, b2, t4	LDF	[A1 + 5 * SIZE], a2	FADD	c1, t1, c1	add	A1, 6 * SIZE, A1	FMUL	a3, b1, t1	nop	FADDX	c2, t2, c2	nop	FMUL	a3, b2, t2	LDF	[A2 + 4 * SIZE], a3	FADD	c3, t3, c3	nop	FMUL	a4, b1, t3	nop	FADD	c4, t4, c4	nop	FMUL	a4, b2, t4	LDF	[A2 + 5 * SIZE], a4	FADD	c5, t1, c5

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -