⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 qgemv_n.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#define ASSEMBLER#include "common.h"#define SP	r12#define M	r32#define N	r33#ifndef XDOUBLE#define A	r36#define LDA	r37#define X	r38#define INCX	r39#define Y	r34#define INCY	r35#else#define A	r38#define LDA	r39#define X	r34#define INCX	r35#define Y	r36#define INCY	r37#endif#define BUFFER	r11#define I	r14#define J	r15#define AO1	r16#define AO2	r17#define AO3	r18#define AO4	r19#define AO5	r20#define AO6	r21#define AO7	r22#define AO8	r23#define YLD1	r24#define YLD2	r25#define YST1	r26#define YST2	r27#define II	r28#define YY	r29#define ARLC	r30#define PR	r31	#define LDA7M8	r8#define PREA	r9#define PREB	r10#define ALPHA1	f8#define ALPHA2	f9#define ALPHA3	f10#define ALPHA4	f11#define ALPHA5	f12#define ALPHA6	f13#define ALPHA7	f14#define ALPHA8	f15#define RPREFETCHSIZE	( 8 * 1 +  6)#define WPREFETCHSIZE	( 8 * 1 +  6)#define RPREFETCH	lfetch.nt1#define WPREFETCH	lfetch.excl.nt1#define ALPHA	f6	PROLOGUE	.prologue	PROFCODE	{ .mmi	mov	ARLC  = ar.lc	}	;;	mov	PR = pr	adds	r14 = 16, SP	adds	r15 = 24, SP	adds	r16 = 32, SP	.body	;;	#ifdef XDOUBLE	ld8	X      = [r14], 16	ld8	INCX   = [r15], 16	;;#endif	ld8	Y      = [r14], 16	ld8	INCY   = [r15], 16	;;	ld8	BUFFER = [r14]	;;	mov	ALPHA = f8	cmp.ge	p7, p0 = 0, M	cmp.ge	p6, p0 = 0, N	;;	shladd	INCX = INCX, BASE_SHIFT, r0	shladd	LDA  = LDA,  BASE_SHIFT, r0	shladd	INCY = INCY, BASE_SHIFT, r0	;;	(p7) br.cond.dpnt .L999	(p6) br.cond.dpnt .L999	;;	sub	I = A, Y	mov	YY = Y	;;	cmp.eq	p10, p0 = SIZE, INCY	(p10) br.cond.dptk .L10	;;	shr	J = M, 3	mov	YY = BUFFER	;;	(p8) adds  YY = SIZE, BUFFER	;;	mov	ar.lc = J	mov	YST1 = YY	adds	YST2 = 4 * SIZE, YY	;;.L02:	STFD	[YST1] = f0, 1 * SIZE	STFD	[YST2] = f0, 1 * SIZE	;;	STFD	[YST1] = f0, 1 * SIZE	STFD	[YST2] = f0, 1 * SIZE	;;	STFD	[YST1] = f0, 1 * SIZE	STFD	[YST2] = f0, 1 * SIZE	;;	STFD	[YST1] = f0, 5 * SIZE	STFD	[YST2] = f0, 5 * SIZE	br.cloop.sptk.few .L02	;;.L10:	shr	J   = N, 3	;;	cmp.eq	p6, p0 = r0, J	(p6) br.cond.dpnt .L20	;;	.align 16.L11:	shladd	LDA7M8 = LDA, 3, r0	;;	sub	LDA7M8 = LDA, LDA7M8		;;	adds	LDA7M8 = 8 * SIZE, LDA7M8	;;		mov	YLD1 = YY	mov	YST1 = YY	adds	YLD2 = 1 * SIZE, YY	adds	YST2 = 1 * SIZE, YY	;;	LDFD	ALPHA1 = [X], INCX	;;	LDFD	ALPHA2 = [X], INCX	;;	LDFD	ALPHA3 = [X], INCX	;;	LDFD	ALPHA4 = [X], INCX	;;	LDFD	ALPHA5 = [X], INCX	;;	LDFD	ALPHA6 = [X], INCX	;;	LDFD	ALPHA7 = [X], INCX	;;	LDFD	ALPHA8 = [X], INCX	;;	FMPY	ALPHA1 = ALPHA, ALPHA1	FMPY	ALPHA2 = ALPHA, ALPHA2	FMPY	ALPHA3 = ALPHA, ALPHA3	FMPY	ALPHA4 = ALPHA, ALPHA4	FMPY	ALPHA5 = ALPHA, ALPHA5	FMPY	ALPHA6 = ALPHA, ALPHA6	;;	mov	AO1 = A	adds	AO2 = 1 * SIZE, A	adds	AO3 = 2 * SIZE, A	adds	AO4 = 3 * SIZE, A	adds	AO5 = 4 * SIZE, A	adds	AO6 = 5 * SIZE, A	adds	AO7 = 6 * SIZE, A	adds	AO8 = 7 * SIZE, A	shladd	A   = LDA, 3, A	;;	shr	I = M, 3	mov	pr.rot= 0	;;	cmp.eq	p16, p0 = r0, r0	;;	adds	I = -1, I	adds J = -1, J	;;	adds	PREB = (WPREFETCHSIZE) * SIZE, YY	;;	cmp.lt p7, p8 = r0, J	tbit.nz	p13, p11 = M, 2	mov	ar.ec= 2	;;	FMPY	ALPHA7 = ALPHA, ALPHA7	;;	{ .mfi	and	II = 7, M	FMPY	ALPHA8 = ALPHA, ALPHA8	mov	ar.lc = I	}	{ .mib	cmp.eq	p6, p0 = -1, I	tbit.nz	p14, p12 = M, 1	(p6) br.cond.dpnt .L15	}	;;	.align 16.L12:	{ .mmf	(p17) LDFD	f93  = [AO5], LDA7M8	(p17) LDFD	f94  = [AO6], LDA7M8	(p17) FMA	f101 = ALPHA1,  f33, f101	}	{ .mmf	(p17) LDFD	f95  = [AO7], LDA7M8	(p17) LDFD	f96  = [AO8], LDA7M8	(p17) FMA	f104 = ALPHA1,  f34, f104	}	;;	{ .mmf	(p16) LDFD	f32  = [AO1]	(p16) LDFD	f33  = [AO2], LDA	(p17) FMA	f107 = ALPHA1,  f35, f107	}	{ .mmf	(p16) LDFD	f34  = [AO3], LDA	(p16) LDFD	f35  = [AO4], LDA	(p17) FMA	f110 = ALPHA1,  f36, f110	}	;;	{ .mmf	(p16) LDFD	f100  = [YLD1], 2 * SIZE	(p16) LDFD	f103  = [YLD2], 2 * SIZE	(p17) FMA	f113 = ALPHA1,  f37, f113	}	{ .mmf	(p16) adds	PREA = (RPREFETCHSIZE) * SIZE, AO1	(p16) add	AO1 = AO1, LDA	(p17) FMA	f116 = ALPHA1,  f38, f116	}	;;	{ .mmf	(p18) STFD	[YST1] = f102, 2 * SIZE	(p18) STFD	[YST2] = f105, 2 * SIZE	(p17) FMA	f119 = ALPHA1,  f39, f119	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f122 = ALPHA1,  f40, f122	}	;;	{ .mmf	(p16) LDFD	f36  = [AO5], LDA	(p16) LDFD	f37  = [AO6], LDA	(p17) FMA	f101 = ALPHA2,  f41, f101	}	{ .mmf	(p16) LDFD	f38  = [AO7], LDA	(p16) LDFD	f39  = [AO8], LDA	(p17) FMA	f104 = ALPHA2,  f42, f104	}	;;	{ .mmf	(p16) LDFD	f40  = [AO1], LDA	(p16) LDFD	f41  = [AO2], LDA	(p17) FMA	f107 = ALPHA2,  f43, f107	}	{ .mmf	(p16) LDFD	f42  = [AO3], LDA	(p16) LDFD	f43  = [AO4], LDA	(p17) FMA	f110 = ALPHA2,  f44, f110	}	;;	{ .mmf	(p16) LDFD	f106  = [YLD1], 2 * SIZE	(p16) LDFD	f109  = [YLD2], 2 * SIZE	(p17) FMA	f113 = ALPHA2,  f45, f113	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f116 = ALPHA2,  f46, f116	}	;;	{ .mmf	(p18) STFD	[YST1] = f108, 2 * SIZE	(p18) STFD	[YST2] = f111, 2 * SIZE	(p17) FMA	f119 = ALPHA2,  f47, f119	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f122 = ALPHA2,  f48, f122	}	;;	{ .mmf	(p16) LDFD	f44  = [AO5], LDA	(p16) LDFD	f45  = [AO6], LDA	(p17) FMA	f101 = ALPHA3,  f49, f101	}	{ .mmf	(p16) LDFD	f46  = [AO7], LDA	(p16) LDFD	f47  = [AO8], LDA	(p17) FMA	f104 = ALPHA3,  f50, f104	}	;;	{ .mmf	(p16) LDFD	f48  = [AO1], LDA	(p16) LDFD	f49  = [AO2], LDA	(p17) FMA	f107 = ALPHA3,  f51, f107	}	{ .mmf	(p16) LDFD	f50  = [AO3], LDA	(p16) LDFD	f51  = [AO4], LDA	(p17) FMA	f110 = ALPHA3,  f52, f110	}	;;	{ .mmf	(p16) LDFD	f112 = [YLD1], 2 * SIZE	(p16) LDFD	f115 = [YLD2], 2 * SIZE	(p17) FMA	f113 = ALPHA3,  f53, f113	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f116 = ALPHA3,  f54, f116	}	;;	{ .mmf	(p18) STFD	[YST1] = f114, 2 * SIZE	(p18) STFD	[YST2] = f117, 2 * SIZE	(p17) FMA	f119 = ALPHA3,  f55, f119	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f122 = ALPHA3,  f56, f122	}	;;	{ .mmf	(p16) LDFD	f52  = [AO5], LDA	(p16) LDFD	f53  = [AO6], LDA	(p17) FMA	f101 = ALPHA4,  f57, f101	}	{ .mmf	(p16) LDFD	f54  = [AO7], LDA	(p16) LDFD	f55  = [AO8], LDA	(p17) FMA	f104 = ALPHA4,  f58, f104	}	;;	{ .mmf	(p16) LDFD	f56  = [AO1], LDA	(p16) LDFD	f57  = [AO2], LDA	(p17) FMA	f107 = ALPHA4,  f59, f107	}	{ .mmf	(p16) LDFD	f58  = [AO3], LDA	(p16) LDFD	f59  = [AO4], LDA	(p17) FMA	f110 = ALPHA4,  f60, f110	}	;;	{ .mmf	(p16) LDFD	f118 = [YLD1], 2 * SIZE	(p16) LDFD	f121 = [YLD2], 2 * SIZE	(p17) FMA	f113 = ALPHA4,  f61, f113	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f116 = ALPHA4,  f62, f116	}	;;	{ .mmf	(p18) STFD	[YST1] = f120, 2 * SIZE	(p18) STFD	[YST2] = f123, 2 * SIZE	(p17) FMA	f119 = ALPHA4,  f63, f119	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f122 = ALPHA4,  f64, f122	}	;;	{ .mmf	(p16) LDFD	f60  = [AO5], LDA	(p16) LDFD	f61  = [AO6], LDA	(p17) FMA	f101 = ALPHA5,  f65, f101	}	{ .mmf	(p16) LDFD	f62  = [AO7], LDA	(p16) LDFD	f63  = [AO8], LDA	(p17) FMA	f104 = ALPHA5,  f66, f104	}	;;	{ .mmf	(p16) LDFD	f64  = [AO1], LDA	(p16) LDFD	f65  = [AO2], LDA	(p17) FMA	f107 = ALPHA5,  f67, f107	}	{ .mmf	(p16) LDFD	f66  = [AO3], LDA	(p16) LDFD	f67  = [AO4], LDA	(p17) FMA	f110 = ALPHA5,  f68, f110	}	;;	{ .mmf	(p16) WPREFETCH [PREB], 8 * SIZE	nop   __LINE__	(p17) FMA	f113 = ALPHA5,  f69, f113	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f116 = ALPHA5,  f70, f116	}	;;	{ .mmf	(p16) RPREFETCH [PREA]	nop   __LINE__	(p17) FMA	f119 = ALPHA5,  f71, f119	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f122 = ALPHA5,  f72, f122	}	;;	{ .mmf	(p16) LDFD	f68  = [AO5], LDA	(p16) LDFD	f69  = [AO6], LDA	(p17) FMA	f101 = ALPHA6,  f73, f101	}	{ .mmf	(p16) LDFD	f70  = [AO7], LDA	(p16) LDFD	f71  = [AO8], LDA	(p17) FMA	f104 = ALPHA6,  f74, f104	}	;;	{ .mmf	(p16) LDFD	f72  = [AO1], LDA	(p16) LDFD	f73  = [AO2], LDA	(p17) FMA	f107 = ALPHA6,  f75, f107	}	{ .mmf	(p16) LDFD	f74  = [AO3], LDA	(p16) LDFD	f75  = [AO4], LDA	(p17) FMA	f110 = ALPHA6,  f76, f110	}	;;	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f113 = ALPHA6,  f77, f113	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f116 = ALPHA6,  f78, f116	}	;;	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f119 = ALPHA6,  f79, f119	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f122 = ALPHA6,  f80, f122	}	;;	{ .mmf	(p16) LDFD	f76  = [AO5], LDA	(p16) LDFD	f77  = [AO6], LDA	(p17) FMA	f101 = ALPHA7,  f81, f101	}	{ .mmf	(p16) LDFD	f78  = [AO7], LDA	(p16) LDFD	f79  = [AO8], LDA	(p17) FMA	f104 = ALPHA7,  f82, f104	}	;;	{ .mmf	(p16) LDFD	f80  = [AO1], LDA	(p16) LDFD	f81  = [AO2], LDA	(p17) FMA	f107 = ALPHA7,  f83, f107	}	{ .mmf	(p16) LDFD	f82  = [AO3], LDA	(p16) LDFD	f83  = [AO4], LDA	(p17) FMA	f110 = ALPHA7,  f84, f110	}	;;	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f113 = ALPHA7,  f85, f113	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f116 = ALPHA7,  f86, f116	}	;;	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f119 = ALPHA7,  f87, f119	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p17) FMA	f122 = ALPHA7,  f88, f122	}	;; 	{ .mmf	(p16) LDFD	f84  = [AO5], LDA	(p16) LDFD	f85  = [AO6], LDA	(p17) FMA	f101 = ALPHA8,  f89, f101	}	{ .mmf	(p16) LDFD	f86  = [AO7], LDA	(p16) LDFD	f87  = [AO8], LDA	(p17) FMA	f104 = ALPHA8,  f90, f104	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -