⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_n.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#define ASSEMBLER#include "common.h"#define M	%i0#define N	%i1#if defined(DOUBLE) && !defined(__64BIT__)#define A	%i5#define LDA	%i2#define X	%i3#define INCX	%i4	#else#define A	%i4#define LDA	%i5#define X	%i2#define INCX	%i3	#endif#define Y	%l0#define INCY	%l1#define BUFFER	%l2#define I	%l3#define J	%l5#define A1	%o0#define A2	%o1#define A3	%o2#define A4	%o3#define Y1	%l4#define YY	%l6#ifdef DOUBLE#define t1	%f0#define	t2 	%f2#define t3	%f4#define	t4 	%f6#define y1	%f8#define y2	%f10#define y3	%f12#define y4	%f14#define y5	%f16#define y6	%f18#define y7	%f20#define y8	%f22#define a1	%f24#define a2	%f26#define a3	%f28#define a4	%f30#define a5	%f32#define a6	%f34#define a7	%f36#define a8	%f38#define a9	%f40#define a10	%f42#define a11	%f44#define a12	%f46#define a13	%f48#define a14	%f50#define a15	%f52#define a16	%f54#define x1	%f56#define x2	%f58#define x3	%f60#define x4	%f62#define FZERO	%f52#define ALPHA	%f54#else#define t1	%f0#define	t2 	%f1#define t3	%f2#define	t4 	%f3#define y1	%f4#define y2	%f5#define y3	%f6#define y4	%f7#define y5	%f8#define y6	%f9#define y7	%f10#define y8	%f11#define a1	%f12#define a2	%f13#define a3	%f14#define a4	%f15#define a5	%f16#define a6	%f17#define a7	%f18#define a8	%f19#define a9	%f20#define a10	%f21#define a11	%f22#define a12	%f23#define a13	%f24#define a14	%f25#define a15	%f26#define a16	%f27#define x1	%f28#define x2	%f29#define x3	%f30#define x4	%f31#define FZERO	%f26#define ALPHA	%f27#endif#ifndef __64BIT__#define STACK_FZERO	[%fp + STACK_START +  8]#define STACK_ALPHA	[%fp + STACK_START + 16]#else#define STACK_FZERO	[%fp + STACK_START + 32]#define STACK_ALPHA	[%fp + STACK_START + 40]#endif	PROLOGUE	SAVESP	nop#ifndef __64BIT__#ifdef DOUBLE	st	%g0, [%fp + STACK_START +  8]	st	%g0, [%fp + STACK_START + 12]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA */	st	%i4, [%fp + STACK_START + 20]	ld	[%fp + STACK_START + 28], LDA	nop	ld	[%fp + STACK_START + 32], X	nop	ld	[%fp + STACK_START + 36], INCX	nop	ld	[%fp + STACK_START + 40], Y	ld	[%fp + STACK_START + 44], INCY	ld	[%fp + STACK_START + 48], BUFFER#else	st	%g0, [%fp + STACK_START +  8]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA */	ld	[%fp + STACK_START + 28], X	nop	ld	[%fp + STACK_START + 32], INCX	nop	ld	[%fp + STACK_START + 36], Y	ld	[%fp + STACK_START + 40], INCY	ld	[%fp + STACK_START + 44], BUFFER#endif	LDF	[%fp + STACK_START +  8], FZERO	LDF	[%fp + STACK_START + 16], ALPHA#else#ifdef DOUBLE	stx	%g0, [%fp + STACK_START + 32]	FMOV	%f6, ALPHA	nop	ldd	[%fp + STACK_START + 32], FZERO#else	st	%g0, [%fp + STACK_START + 32]	FMOV	%f7, ALPHA	nop	ld	[%fp + STACK_START + 32], FZERO#endif	ldx	[%fp+  STACK_START + 56], X	ldx	[%fp+  STACK_START + 64], INCX	ldx	[%fp+  STACK_START + 72], Y	ldx	[%fp+  STACK_START + 80], INCY	ldx	[%fp+  STACK_START + 88], BUFFER#endif	sll	LDA, BASE_SHIFT, LDA#ifdef DOUBLE	STF	%f6, STACK_ALPHA#else	STF	%f7, STACK_ALPHA#endif	cmp	M, 0	ble	%icc, .LL999	sll	INCX, BASE_SHIFT, INCX	cmp	N, 0	ble	%icc, .LL999	sll	INCY, BASE_SHIFT, INCY	cmp	INCY, SIZE	be	%icc, .LL10	mov	Y, YY	add	M, 7, J	sra	J, 3, J	mov	BUFFER, YY	mov	BUFFER, Y1.LL01:	STF	FZERO, [Y1 +  0 * SIZE]	STF	FZERO, [Y1 +  1 * SIZE]	STF	FZERO, [Y1 +  2 * SIZE]	STF	FZERO, [Y1 +  3 * SIZE]	STF	FZERO, [Y1 +  4 * SIZE]	STF	FZERO, [Y1 +  5 * SIZE]	STF	FZERO, [Y1 +  6 * SIZE]	deccc	J	STF	FZERO, [Y1 +  7 * SIZE]	bg,pn	%icc, .LL01	add	Y1, 8 * SIZE, Y1	.LL10:	sra	N, 2, J	cmp	J, 0	ble,pn	%icc, .LL20	nop.LL11:	mov	YY, Y1	mov	A,  A1	add	A,  LDA, A2	add	A2, LDA, A3	add	A3, LDA, A4	add	A4, LDA, A	LDF	STACK_ALPHA, ALPHA	LDF	[X], x1	add	X, INCX, X	LDF	[X], x2	add	X, INCX, X	LDF	[X], x3	add	X, INCX, X	LDF	[X], x4	add	X, INCX, X	FMUL	ALPHA, x1, x1	FMUL	ALPHA, x2, x2	FMUL	ALPHA, x3, x3	FMUL	ALPHA, x4, x4	sra	M, 3, I	cmp	I, 0	ble,pn	%icc, .LL16	nop	LDF	[A1 + 0 * SIZE], a1	LDF	[A1 + 1 * SIZE], a2	LDF	[A1 + 2 * SIZE], a3	LDF	[A1 + 3 * SIZE], a4	LDF	[A1 + 4 * SIZE], a5	LDF	[A1 + 5 * SIZE], a6	LDF	[A1 + 6 * SIZE], a7	LDF	[A1 + 7 * SIZE], a8	LDF	[A2 + 0 * SIZE], a9	LDF	[A2 + 1 * SIZE], a10	LDF	[A2 + 2 * SIZE], a11	LDF	[A2 + 3 * SIZE], a12	LDF	[A2 + 4 * SIZE], a13	LDF	[A2 + 5 * SIZE], a14	LDF	[A2 + 6 * SIZE], a15	LDF	[A2 + 7 * SIZE], a16	FMUL	a1,  x1, t1	LDF	[A3 + 0 * SIZE], a1	FMUL	a2,  x1, t2	LDF	[A3 + 1 * SIZE], a2	FMUL	a3,  x1, t3	LDF	[A3 + 2 * SIZE], a3	FMUL	a4,  x1, t4	LDF	[A3 + 3 * SIZE], a4	deccc	I	ble,pn	%icc, .LL13	nop	nop	nop	nop#ifdef DOUBLE#define PREFETCHSIZE 20#else#define PREFETCHSIZE 40#endif.LL12:	LDF	[Y1 +  0 * SIZE], y1	LDF	[Y1 +  1 * SIZE], y2	LDF	[Y1 +  2 * SIZE], y3	LDF	[Y1 +  3 * SIZE], y4	LDF	[Y1 +  4 * SIZE], y5	LDF	[Y1 +  5 * SIZE], y6	LDF	[Y1 +  6 * SIZE], y7	LDF	[Y1 +  7 * SIZE], y8	FADD	y1,  t1, y1	prefetch  [A1 +  PREFETCHSIZE * SIZE], 1	FMUL	a5,  x1, t1	LDF	[A3 + 4 * SIZE], a5	FADD	y2,  t2, y2	nop	FMUL	a6,  x1, t2	LDF	[A3 + 5 * SIZE], a6	FADD	y3,  t3, y3	nop	FMUL	a7,  x1, t3	LDF	[A3 + 6 * SIZE], a7	FADD	y4,  t4, y4	nop	FMUL	a8,  x1, t4	LDF	[A3 + 7 * SIZE], a8	FADD	y5,  t1, y5	nop	FMUL	a9,  x2, t1	LDF	[A4 +  0 * SIZE], a9	FADD	y6,  t2, y6	nop	FMUL	a10, x2, t2	LDF	[A4 +  1 * SIZE], a10	FADD	y7,  t3, y7	nop	FMUL	a11, x2, t3	LDF	[A4 +  2 * SIZE], a11	FADD	y8,  t4, y8	nop	FMUL	a12, x2, t4	LDF	[A4 +  3 * SIZE], a12	FADD	y1,  t1, y1	prefetch  [A2 +  PREFETCHSIZE * SIZE], 1	FMUL	a13, x2, t1	LDF	[A4 +  4 * SIZE], a13	FADD	y2,  t2, y2	nop	FMUL	a14, x2, t2	LDF	[A4 +  5 * SIZE], a14	FADD	y3,  t3, y3	nop	FMUL	a15, x2, t3	LDF	[A4 +  6 * SIZE], a15	FADD	y4,  t4, y4	nop	FMUL	a16, x2, t4	LDF	[A4 +  7 * SIZE], a16	FADD	y5,  t1, y5	nop	FMUL	a1,  x3, t1	LDF	[A1 +  8 * SIZE], a1	FADD	y6,  t2, y6	nop	FMUL	a2,  x3, t2	LDF	[A1 +  9 * SIZE], a2	FADD	y7,  t3, y7	nop	FMUL	a3,  x3, t3	LDF	[A1 + 10 * SIZE], a3	FADD	y8,  t4, y8	nop	FMUL	a4,  x3, t4	LDF	[A1 + 11 * SIZE], a4	FADD	y1,  t1, y1	prefetch  [A3 +  PREFETCHSIZE * SIZE], 1	FMUL	a5,  x3, t1	LDF	[A1 + 12 * SIZE], a5	FADD	y2,  t2, y2	nop	FMUL	a6,  x3, t2	LDF	[A1 + 13 * SIZE], a6	FADD	y3,  t3, y3	nop	FMUL	a7,  x3, t3	LDF	[A1 + 14 * SIZE], a7	FADD	y4,  t4, y4	nop	FMUL	a8,  x3, t4	LDF	[A1 + 15 * SIZE], a8	FADD	y5,  t1, y5	nop	FMUL	a9,  x4, t1	LDF	[A2 +  8 * SIZE], a9	FADD	y6,  t2, y6	nop	FMUL	a10, x4, t2	LDF	[A2 +  9 * SIZE], a10	FADD	y7,  t3, y7	nop	FMUL	a11, x4, t3	LDF	[A2 + 10 * SIZE], a11	FADD	y8,  t4, y8	nop	FMUL	a12, x4, t4	LDF	[A2 + 11 * SIZE], a12	FADD	y1,  t1, y1	prefetch  [A4 +  PREFETCHSIZE * SIZE], 1	FMUL	a13, x4, t1	LDF	[A2 + 12 * SIZE], a13	FADD	y2,  t2, y2	add	A3, 8 * SIZE, A3	FMUL	a14, x4, t2	LDF	[A2 + 13 * SIZE], a14	FADD	y3,  t3, y3	add	Y1, 8 * SIZE, Y1	FMUL	a15, x4, t3	LDF	[A2 + 14 * SIZE], a15	FADD	y4,  t4, y4	deccc	I	FMUL	a16, x4, t4	LDF	[A2 + 15 * SIZE], a16	FADD	y5,  t1, y5	add	A1, 8 * SIZE, A1	FMUL	a1,  x1, t1	LDF	[A3 +  0 * SIZE], a1	FADD	y6,  t2, y6	add	A2, 8 * SIZE, A2	FMUL	a2,  x1, t2	LDF	[A3 +  1 * SIZE], a2	FADD	y7,  t3, y7	add	A4, 8 * SIZE, A4	FMUL	a3,  x1, t3	LDF	[A3 +  2 * SIZE], a3	FADD	y8,  t4, y8	nop	FMUL	a4,  x1, t4	LDF	[A3 +  3 * SIZE], a4	STF	y1, [Y1 - 8 * SIZE]	STF	y2, [Y1 - 7 * SIZE]	STF	y3, [Y1 - 6 * SIZE]	STF	y4, [Y1 - 5 * SIZE]	STF	y5, [Y1 - 4 * SIZE]	STF	y6, [Y1 - 3 * SIZE]	STF	y7, [Y1 - 2 * SIZE]	bg,pn	%icc, .LL12	STF	y8, [Y1 - 1 * SIZE].LL13:	LDF	[Y1 +  0 * SIZE], y1	LDF	[Y1 +  1 * SIZE], y2	LDF	[Y1 +  2 * SIZE], y3	LDF	[Y1 +  3 * SIZE], y4	LDF	[Y1 +  4 * SIZE], y5	LDF	[Y1 +  5 * SIZE], y6	LDF	[Y1 +  6 * SIZE], y7	LDF	[Y1 +  7 * SIZE], y8	FADD	y1,  t1, y1	FMUL	a5,  x1, t1	LDF	[A3 + 0 * SIZE], a1	FADD	y2,  t2, y2	FMUL	a6,  x1, t2	LDF	[A3 + 1 * SIZE], a2	FADD	y3,  t3, y3	FMUL	a7,  x1, t3	LDF	[A3 + 2 * SIZE], a3	FADD	y4,  t4, y4	FMUL	a8,  x1, t4	LDF	[A3 + 3 * SIZE], a4	FADD	y5,  t1, y5	FMUL	a9,  x2, t1	LDF	[A3 + 4 * SIZE], a5	FADD	y6,  t2, y6	FMUL	a10, x2, t2	LDF	[A3 + 5 * SIZE], a6	FADD	y7,  t3, y7	FMUL	a11, x2, t3	LDF	[A3 + 6 * SIZE], a7	FADD	y8,  t4, y8	FMUL	a12, x2, t4	LDF	[A3 + 7 * SIZE], a8	FADD	y1,  t1, y1	FMUL	a13, x2, t1	LDF	[A4 + 0 * SIZE], a9	FADD	y2,  t2, y2	FMUL	a14, x2, t2	LDF	[A4 + 1 * SIZE], a10	FADD	y3,  t3, y3	FMUL	a15, x2, t3	LDF	[A4 + 2 * SIZE], a11	FADD	y4,  t4, y4	FMUL	a16, x2, t4	LDF	[A4 + 3 * SIZE], a12	FADD	y5,  t1, y5	FMUL	a1,  x3, t1	LDF	[A4 + 4 * SIZE], a13	FADD	y6,  t2, y6	FMUL	a2,  x3, t2	LDF	[A4 + 5 * SIZE], a14	FADD	y7,  t3, y7	FMUL	a3,  x3, t3	LDF	[A4 + 6 * SIZE], a15	FADD	y8,  t4, y8	FMUL	a4,  x3, t4	LDF	[A4 + 7 * SIZE], a16	FADD	y1,  t1, y1	FMUL	a5,  x3, t1	FADD	y2,  t2, y2	FMUL	a6,  x3, t2	FADD	y3,  t3, y3	FMUL	a7,  x3, t3	FADD	y4,  t4, y4	FMUL	a8,  x3, t4	FADD	y5,  t1, y5	FMUL	a9,  x4, t1	FADD	y6,  t2, y6	FMUL	a10, x4, t2	FADD	y7,  t3, y7	FMUL	a11, x4, t3	FADD	y8,  t4, y8	FMUL	a12, x4, t4	FADD	y1,  t1, y1	FMUL	a13, x4, t1	FADD	y2,  t2, y2	FMUL	a14, x4, t2	FADD	y3,  t3, y3	FMUL	a15, x4, t3	FADD	y4,  t4, y4	FMUL	a16, x4, t4	add	A4, 8 * SIZE, A4	STF	y1, [Y1 + 0 * SIZE]	FADD	y5,  t1, y5	STF	y2, [Y1 + 1 * SIZE]	FADD	y6,  t2, y6	STF	y3, [Y1 + 2 * SIZE]	FADD	y7,  t3, y7	STF	y4, [Y1 + 3 * SIZE]	FADD	y8,  t4, y8	STF	y5, [Y1 + 4 * SIZE]	add	A1, 8 * SIZE, A1	STF	y6, [Y1 + 5 * SIZE]	add	A2, 8 * SIZE, A2	STF	y7, [Y1 + 6 * SIZE]	add	A3, 8 * SIZE, A3	STF	y8, [Y1 + 7 * SIZE]	add	Y1, 8 * SIZE, Y1.LL16:	andcc	M, 4, I	ble,pn	%icc, .LL17	nop	LDF	[A1 + 0 * SIZE], a1	LDF	[A1 + 1 * SIZE], a2	LDF	[A1 + 2 * SIZE], a3	LDF	[A1 + 3 * SIZE], a4	LDF	[A2 + 0 * SIZE], a5	LDF	[A2 + 1 * SIZE], a6	LDF	[A2 + 2 * SIZE], a7	LDF	[A2 + 3 * SIZE], a8	LDF	[A3 + 0 * SIZE], a9	LDF	[A3 + 1 * SIZE], a10	LDF	[A3 + 2 * SIZE], a11	LDF	[A3 + 3 * SIZE], a12	LDF	[A4 + 0 * SIZE], a13	LDF	[A4 + 1 * SIZE], a14	LDF	[A4 + 2 * SIZE], a15	LDF	[A4 + 3 * SIZE], a16	LDF	[Y1 + 0 * SIZE], y1	add	A1, 4 * SIZE, A1	LDF	[Y1 + 1 * SIZE], y2	add	A2, 4 * SIZE, A2	LDF	[Y1 + 2 * SIZE], y3	add	A3, 4 * SIZE, A3	LDF	[Y1 + 3 * SIZE], y4	add	A4, 4 * SIZE, A4	FMUL	a1,  x1, t1	FMUL	a2,  x1, t2	FMUL	a3,  x1, t3	FMUL	a4,  x1, t4	FADD	y1,  t1, y1	FMUL	a5,  x2, t1	FADD	y2,  t2, y2	FMUL	a6,  x2, t2	FADD	y3,  t3, y3	FMUL	a7,  x2, t3	FADD	y4,  t4, y4	FMUL	a8,  x2, t4	FADD	y1,  t1, y1	FMUL	a9,  x3, t1	FADD	y2,  t2, y2	FMUL	a10, x3, t2	FADD	y3,  t3, y3	FMUL	a11, x3, t3	FADD	y4,  t4, y4	FMUL	a12, x3, t4	FADD	y1,  t1, y1	FMUL	a13, x4, t1	FADD	y2,  t2, y2	FMUL	a14, x4, t2	FADD	y3,  t3, y3	FMUL	a15, x4, t3	FADD	y4,  t4, y4	FMUL	a16, x4, t4	FADD	y1,  t1, y1	FADD	y2,  t2, y2	FADD	y3,  t3, y3	FADD	y4,  t4, y4	STF	y1, [Y1 + 0 * SIZE]	STF	y2, [Y1 + 1 * SIZE]	STF	y3, [Y1 + 2 * SIZE]	STF	y4, [Y1 + 3 * SIZE]	add	Y1, 4 * SIZE, Y1.LL17:	andcc	M, 2, I	ble,pn	%icc, .LL18	nop	LDF	[A1 + 0 * SIZE], a1	LDF	[A2 + 0 * SIZE], a2	LDF	[A3 + 0 * SIZE], a3	LDF	[A4 + 0 * SIZE], a4	LDF	[Y1 + 0 * SIZE], y1	LDF	[A1 + 1 * SIZE], a5

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -