⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemv_n.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         
*/
/*********************************************************************/

/*
 * Preamble and prologue opening of this IA-64 kernel.
 *
 * The #define block below gives symbolic names to the general and
 * floating-point registers used by the routine and selects, from the
 * CONJ / XCONJ build flags, which multiply-add macro each of ADD1..ADD4
 * expands to.  The prologue that follows allocates the register frame,
 * saves ar.pfs / ar.lc / pr, spills f16-f23 to the stack and starts
 * loading the arguments that are passed in memory.
 */

#define ASSEMBLER
#include "common.h"

/* r12 is used as the stack pointer: the prologue lowers it by 8 * 16 bytes. */
#define SP	r12

/*
 * Arguments.  M, N, A, LDA and X are read directly from r32/r33/r37/r38/r39,
 * which lie inside the 8 input registers requested by alloc.  INCX, Y, INCY
 * and BUFFER are filled by ld8 from [SP+16], [SP+24], [SP+32] and [SP+40]
 * (addresses taken before SP is lowered), so whatever arrived in r34-r36 is
 * overwritten -- TODO confirm against the C prototype of the routine.
 */
#define M	r32
#define N	r33
#define A	r37
#define LDA	r38
#define X	r39
#define INCX	r34
#define Y	r35
#define INCY	r36
#define BUFFER	r11

/*
 * Loop counters.  NOTE: r14 and r15 are also used under their raw names in
 * the prologue as temporary addresses for the stack-argument loads.
 */
#define I	r14
#define J	r15

/*
 * AO1..AO8 are the pointers used to load from A; the setup code derives them
 * from A in steps of LDA.  r16 and r17 double as prologue temporaries.
 */
#define AO1	r16
#define AO2	r17
#define AO3	r18
#define AO4	r19
#define AO5	r20
#define AO6	r21
#define AO7	r22
#define AO8	r23

/* Load (YLD*) and store (YST*) pointers into the y work area; the second of
   each pair is set 4 * SIZE beyond the first. */
#define YLD1	r24
#define YLD2	r25
#define YST1	r26
#define YST2	r27

/* YY is set to Y, or to BUFFER when INCY != 2 * SIZE.
   XX is set to X + 2 * INCX, a second read pointer into x. */
#define YY	r28
#define XX	r9

/* Prefetch pointers, one per AO1..AO8, kept in the local registers
   loc0-loc7 requested by alloc. */
#define RPRE1	loc0
#define RPRE2	loc1
#define RPRE3	loc2
#define RPRE4	loc3
#define RPRE5	loc4
#define RPRE6	loc5
#define RPRE7	loc6
#define RPRE8	loc7

/* Post-increment strides for walking x: INCXM1 is set to INCX - SIZE and
   INCX3M1 to 2 * INCX + INCXM1. */
#define INCXM1	r2
#define INCX3M1	r3

/* AO9..AO16 are set to AO1..AO8 + 4 * SIZE (second load stream per AOn);
   they live in loc8-loc15. */
#define AO9	loc8
#define AO10	loc9
#define AO11	loc10
#define AO12	loc11
#define AO13	loc12
#define AO14	loc13
#define AO15	loc14
#define AO16	loc15

/* Prefetch pointer for the y work area.  r8 (like r9 / XX above) is first
   used as a spill address in the prologue. */
#define PREB	r8

/* Homes for the saved ar.lc, predicate registers and ar.pfs. */
#define ARLC	r29
#define PR	r30
#define ARPFS	r31

/* Prefetch distance; it is multiplied by SIZE where it is used. */
#ifdef DOUBLE
#define RPREFETCH	(16 * 2 +  8)
#else
#define RPREFETCH	(16 * 2 + 16)
#endif
#define PREFETCH	lfetch.nt1

/* alpha is copied from f8 / f9 into these two registers after the prologue. */
#define ALPHA_R	f6
#define ALPHA_I	f7

/*
 * Operation selection per conjugation variant.  ADD1/ADD2 are used when the
 * alpha * x products are formed, ADD3/ADD4 inside the main loop.  FMA and
 * FNMA come from common.h -- presumably multiply-add and negated
 * multiply-add; confirm there.
 */
#if   !defined(CONJ) && !defined(XCONJ)
#define ADD1	     FNMA
#define ADD2	     FMA
#define ADD3	     FNMA
#define ADD4	     FMA
#elif  defined(CONJ) && !defined(XCONJ)
#define ADD1	     FNMA
#define ADD2	     FMA
#define ADD3	     FMA
#define ADD4	     FNMA
#elif !defined(CONJ) &&  defined(XCONJ)
#define ADD1	     FMA
#define ADD2	     FNMA
#define ADD3	     FNMA
#define ADD4	     FMA
#else
#define ADD1	     FMA
#define ADD2	     FNMA
#define ADD3	     FMA
#define ADD4	     FNMA
#endif

	/* PROLOGUE and PROFCODE are macros from common.h (presumably symbol /
	   entry setup and optional profiling code -- confirm there). */
	PROLOGUE
	.prologue
	PROFCODE
	{ .mmi
	/* Unwind annotation: ar.pfs is kept in ARPFS. */
	.save	ar.pfs, ARPFS
	/* Register frame: 8 inputs, 16 locals, 0 outputs, 0 rotating. */
	alloc	ARPFS = ar.pfs, 8, 16, 0, 0
	/* Keep the caller's loop count; ar.lc is rewritten by the loops below. */
	mov	ARLC  = ar.lc
	}
	;;
	/* Keep the predicate registers; pr.rot is cleared later on. */
	mov	PR = pr
	/* Addresses of the four memory-passed arguments, relative to the
	   incoming SP (r14-r17 are free until I / J / AO1 / AO2 are set up). */
	adds	r14 = 16, SP
	adds	r15 = 24, SP
	adds	r16 = 32, SP
	adds	r17 = 40, SP
	;;
	/* Open an 8 * 16 byte area below SP.  r8 and r9 start at its first and
	   second 16-byte slot and each advance by 32 per store, so together
	   they fill all eight slots. */
	adds	r8 = -8 * 16, SP
	adds	r9 = -7 * 16, SP
	adds	SP = -8 * 16, SP
	;;
	/* Save f16-f23: the setup code later overwrites them with alpha * x
	   terms. */
	stf.spill  [r8] = f16, 32
	stf.spill  [r9] = f17, 32
	;;
	stf.spill  [r8] = f18, 32
	stf.spill  [r9] = f19, 32
	;;
	stf.spill  [r8] = f20, 32
	stf.spill  [r9] = f21, 32
	;;
	stf.spill  [r8] = f22
	stf.spill  [r9] = f23
	;;
	/* First of the memory-passed arguments; the loads of Y, INCY and
	   BUFFER follow immediately. */
	ld8	INCX   = [r14]
ld8	Y      = [r15]	ld8	INCY   = [r16]	ld8	BUFFER = [r17]	.body	;;	cmp.ge	p7, p0 = 0, M	cmp.ge	p6, p0 = 0, N	mov	ALPHA_R = f8	shladd	INCX = INCX, ZBASE_SHIFT, r0	shladd	LDA  = LDA,  ZBASE_SHIFT, r0	mov	ALPHA_I = f9	;;	shladd	INCY = INCY, ZBASE_SHIFT, r0	tbit.nz	p8, p0 = A,   BASE_SHIFT	(p7) br.cond.dpnt .L999	;;	shladd	XX = INCX, 1, X	adds	INCXM1  = -SIZE, INCX	(p6) br.cond.dpnt .L999	;;	shladd	INCX3M1 = INCX, 1, INCXM1	cmp.eq	p10, p11 = 2 * SIZE, INCY	mov	YY = Y	;;	(p11) mov YY = BUFFER	mov	YST1 = BUFFER	shr	J = M, 2	;;	{ .mib	adds	YST2 = 4 * SIZE, BUFFER	mov	ar.lc = J	(p10) br.cond.dptk .L10	}	;;.L02:	STFD	[YST1] = f0, 1 * SIZE	STFD	[YST2] = f0, 1 * SIZE	;;	STFD	[YST1] = f0, 1 * SIZE	STFD	[YST2] = f0, 1 * SIZE	;;	STFD	[YST1] = f0, 1 * SIZE	STFD	[YST2] = f0, 1 * SIZE	;;	STFD	[YST1] = f0, 5 * SIZE	STFD	[YST2] = f0, 5 * SIZE	br.cloop.sptk.few .L02	;;.L10:	{ .mmi	mov	AO1 = A	nop	__LINE__	shr	J   = N, 3	}	;;	{ .mmb	add	AO2 = LDA, A	cmp.eq	p6, p0 = r0, J	(p6) br.cond.dpnt .L20	}	;;	.align 16.L11:	LDFD	f32 = [X], SIZE	LDFD	f36 = [XX], SIZE	mov	pr.rot= 0	;;	LDFD	f33 = [X], INCXM1	LDFD	f37 = [XX], INCXM1	mov	YLD1 = YY	;;	LDFD	f34 = [X], SIZE	LDFD	f38 = [XX], SIZE	adds	YLD2 = 4 * SIZE, YY	;;	LDFD	f35 = [X], INCX3M1	LDFD	f39 = [XX], INCX3M1	mov	YST1 = YY	;;	LDFD	f40 = [X], SIZE	LDFD	f44 = [XX], SIZE	adds	YST2 = 4 * SIZE, YY	;;	LDFD	f41 = [X], INCXM1	LDFD	f45 = [XX], INCXM1	shr	I = M, 2	;;	LDFD	f42 = [X], SIZE	LDFD	f46 = [XX], SIZE	mov	AO1 = A	;;	LDFD	f43 = [X], INCX3M1	LDFD	f47 = [XX], INCX3M1	add	AO2 = LDA, A	;;	shladd	AO3 = LDA, 1, A	FMPY	f8  = ALPHA_R, f32	mov	ar.ec= 2	shladd	AO4 = LDA, 1, AO2	FMPY	f9  = ALPHA_I, f32	;;	shladd	AO5 = LDA, 1, AO3	FMPY	f10 = ALPHA_R, f34	shladd	AO6 = LDA, 1, AO4	FMPY	f11 = ALPHA_I, f34	;;	FMPY	f12 = ALPHA_R, f36	shladd	AO7 = LDA, 1, AO5	FMPY	f13 = ALPHA_I, f36	shladd	AO8 = LDA, 1, AO6	FMPY	f14 = ALPHA_R, f38	;;	adds	PREB   = RPREFETCH * SIZE, YLD1	FMPY	f15 = ALPHA_I, f38	adds	RPRE1  = RPREFETCH * SIZE, AO1	FMPY	f16 = ALPHA_R, f40	adds	RPRE2  
= (RPREFETCH + 8) * SIZE, AO2	FMPY	f17 = ALPHA_I, f40	adds	RPRE3  = RPREFETCH * SIZE, AO3	FMPY	f18 = ALPHA_R, f42	adds	RPRE4  = (RPREFETCH + 8) * SIZE, AO4	FMPY	f19 = ALPHA_I, f42	adds	RPRE5  = RPREFETCH * SIZE, AO5	FMPY	f20 = ALPHA_R, f44	adds	RPRE6  = (RPREFETCH + 8) * SIZE, AO6	FMPY	f21 = ALPHA_I, f44	adds	RPRE7  = RPREFETCH * SIZE, AO7	FMPY	f22 = ALPHA_R, f46	adds	RPRE8  = (RPREFETCH + 8) * SIZE, AO8	FMPY	f23 = ALPHA_I, f46	;;	ADD1	f8  = ALPHA_I, f33, f8	tbit.nz	p14, p0 = M, 1	ADD2	f9  = ALPHA_R, f33, f9	shladd	A   = LDA, 3, A	ADD1	f10 = ALPHA_I, f35, f10	adds	AO9  = 4 * SIZE, AO1	ADD2	f11 = ALPHA_R, f35, f11	adds	AO10 = 4 * SIZE, AO2	ADD1	f12 = ALPHA_I, f37, f12	adds	AO11 = 4 * SIZE, AO3	ADD2	f13 = ALPHA_R, f37, f13	adds	AO12 = 4 * SIZE, AO4	ADD1	f14 = ALPHA_I, f39, f14	adds	AO13 = 4 * SIZE, AO5	ADD2	f15 = ALPHA_R, f39, f15	adds	AO14 = 4 * SIZE, AO6	ADD1	f16 = ALPHA_I, f41, f16	adds	AO15 = 4 * SIZE, AO7	ADD2	f17 = ALPHA_R, f41, f17	adds	AO16 = 4 * SIZE, AO8	ADD1	f18 = ALPHA_I, f43, f18	cmp.eq	p6, p0 = 0, I	ADD2	f19 = ALPHA_R, f43, f19	cmp.eq	p16, p0 = r0, r0	ADD1	f20 = ALPHA_I, f45, f20	adds	I = -1, I	ADD2	f21 = ALPHA_R, f45, f21	;;	{ .mfi	nop	__LINE__	ADD1	f22 = ALPHA_I, f47, f22	mov	ar.lc = I	}	{ .mfb	nop	__LINE__	ADD2	f23 = ALPHA_R, f47, f23	(p6) br.cond.dpnt .L15	}	;;	.align 16.L12:	{ .mfi	(p17) LDFD	f89  = [AO8], 1 * SIZE	(p17) FMA	f101 = f8,   f33, f101	(p16) tbit.nz.unc	p12, p13 = I, 0	}	{ .mfi	(p17) LDFD	f93  = [AO16], 1 * SIZE	(p17) FMA	f113 = f8,   f37, f113	}	;;	{ .mfi	(p17) LDFD	f90  = [AO8], 1 * SIZE	(p17) FMA	f104 = f9,   f33, f104	(p16) adds	I = -1, I	}	{ .mfi	(p17) LDFD	f94  = [AO16], 1 * SIZE	(p17) FMA	f116 = f9,   f37, f116	}	;;	{ .mfi	(p17) LDFD	f91  = [AO8], 1 * SIZE	(p17) FMA	f107 = f8,   f35, f107	}	{ .mfi	(p17) LDFD	f95  = [AO16], 1 * SIZE	(p17) FMA	f119 = f8,   f39, f119	}	;;	{ .mfi	(p17) LDFD	f92  = [AO8], 5 * SIZE	(p17) FMA	f110 = f9,   f35, f110	}	{ .mfi	(p17) LDFD	f96  = [AO16], 5 * SIZE	(p17) FMA	f122 = f9,   f39, f122	}	;;	{ .mfi	
(p12) lfetch.excl.nt2	[PREB],  16 * SIZE	(p17) ADD3	f101 = f9,   f34, f101	}	{ .mfi	(p17) ADD3	f113 = f9,   f38, f113	}	;;	{ .mfi	(p16) LDFD	f100 = [YLD1], 1 * SIZE	(p17) ADD4	f104 = f8,   f34, f104	}	{ .mfi	(p16) LDFD	f112 = [YLD2], 1 * SIZE	(p17) ADD4	f116 = f8,   f38, f116	}	;;	{ .mfi	(p16) LDFD	f103 = [YLD1], 1 * SIZE	(p17) ADD3	f107 = f9,   f36, f107	}	{ .mfi	(p16) LDFD	f115 = [YLD2], 1 * SIZE	(p17) ADD3	f119 = f9,   f40, f119	}	;;	{ .mfi	(p12) PREFETCH	[RPRE1], 16 * SIZE	(p17) ADD4	f110 = f8,   f36, f110	}	{ .mfi	(p17) ADD4	f122 = f8,   f40, f122	}	;;	{ .mfi	(p16) LDFD	f32  = [AO1], 1 * SIZE	(p17) FMA	f101 = f10,   f41, f101	}	{ .mfi	(p16) LDFD	f36  = [AO9], 1 * SIZE	(p17) FMA	f113 = f10,   f45, f113	}	;;	{ .mfi	(p16) LDFD	f33  = [AO1], 1 * SIZE	(p17) FMA	f104 = f11,   f41, f104	}	{ .mfi	(p16) LDFD	f37  = [AO9], 1 * SIZE	(p17) FMA	f116 = f11,   f45, f116	}	;;	{ .mfi	(p16) LDFD	f34  = [AO1], 1 * SIZE	(p17) FMA	f107 = f10,   f43, f107	}	{ .mfi	(p16) LDFD	f38  = [AO9], 1 * SIZE	(p17) FMA	f119 = f10,   f47, f119	}	;;	{ .mfi	(p16) LDFD	f35  = [AO1], 5 * SIZE	(p17) FMA	f110 = f11,   f43, f110	}	{ .mfi	(p16) LDFD	f39  = [AO9], 5 * SIZE	(p17) FMA	f122 = f11,   f47, f122	}	;;	{ .mfi	(p17) ADD3	f101 = f11,   f42, f101	}	{ .mfi	(p17) ADD3	f113 = f11,   f46, f113	}	;;	{ .mfi	(p16) LDFD	f106 = [YLD1], 1 * SIZE	(p17) ADD4	f104 = f10,   f42, f104	}	{ .mfi	(p16) LDFD	f118 = [YLD2], 1 * SIZE	(p17) ADD4	f116 = f10,   f46, f116	}	;;	{ .mfi	(p16) LDFD	f109 = [YLD1], 5 * SIZE	(p17) ADD3	f107 = f11,   f44, f107	}	{ .mfi 	(p16) LDFD	f121 = [YLD2], 5 * SIZE	(p17) ADD3	f119 = f11,   f48, f119	}	;;	{ .mfi	(p13) PREFETCH	[RPRE2], 16 * SIZE	(p17) ADD4	f110 = f10,   f44, f110	}	{ .mfi	(p17) ADD4	f122 = f10,   f48, f122	}	;;	{ .mfi	(p16) LDFD	f40  = [AO2], 1 * SIZE	(p17) FMA	f101 = f12,   f49, f101	}	{ .mfi	(p16) LDFD	f44  = [AO10], 1 * SIZE	(p17) FMA	f113 = f12,   f53, f113	}	;;	{ .mfi	(p16) LDFD	f41  = [AO2], 1 * SIZE	(p17) FMA	f104 = f13,   f49, f104	}	{ .mfi	(p16) LDFD	f45  = [AO10], 
1 * SIZE	(p17) FMA	f116 = f13,   f53, f116	}	;;	{ .mfi	(p16) LDFD	f42  = [AO2], 1 * SIZE	(p17) FMA	f107 = f12,   f51, f107	}	{ .mfi	(p16) LDFD	f46  = [AO10], 1 * SIZE	(p17) FMA	f119 = f12,   f55, f119	}	;;	{ .mfi	(p16) LDFD	f43  = [AO2], 5 * SIZE	(p17) FMA	f110 = f13,   f51, f110	}	{ .mfi	(p16) LDFD	f47  = [AO10], 5 * SIZE	(p17) FMA	f122 = f13,   f55, f122	}	;;	{ .mfi	(p17) ADD3	f101 = f13,   f50, f101	}	{ .mfi	(p17) ADD3	f113 = f13,   f54, f113	}	;;	{ .mfi	(p17) ADD4	f104 = f12,   f50, f104	}	{ .mfi	(p17) ADD4	f116 = f12,   f54, f116	}	;;	{ .mfi	(p17) ADD3	f107 = f13,   f52, f107	}	{ .mfi	(p17) ADD3	f119 = f13,   f56, f119	}	;;	{ .mfi	(p12) PREFETCH	[RPRE3], 16 * SIZE	(p17) ADD4	f110 = f12,   f52, f110	}	{ .mfi	(p17) ADD4	f122 = f12,   f56, f122	}	;;	{ .mfi	(p16) LDFD	f48  = [AO3], 1 * SIZE	(p17) FMA	f101 = f14,   f57, f101	}	{ .mfi	(p16) LDFD	f52  = [AO11], 1 * SIZE	(p17) FMA	f113 = f14,   f61, f113	}	;;	{ .mfi	(p16) LDFD	f49  = [AO3], 1 * SIZE	(p17) FMA	f104 = f15,   f57, f104	}	{ .mfi	(p16) LDFD	f53  = [AO11], 1 * SIZE	(p17) FMA	f116 = f15,   f61, f116	}	;;	{ .mfi	(p16) LDFD	f50  = [AO3], 1 * SIZE	(p17) FMA	f107 = f14,   f59, f107	}	{ .mfi	(p16) LDFD	f54  = [AO11], 1 * SIZE	(p17) FMA	f119 = f14,   f63, f119	}	;;	{ .mfi	(p16) LDFD	f51  = [AO3], 5 * SIZE	(p17) FMA	f110 = f15,   f59, f110	}	{ .mfi	(p16) LDFD	f55  = [AO11], 5 * SIZE	(p17) FMA	f122 = f15,   f63, f122	}	;;	{ .mfi	(p17) ADD3	f101 = f15,   f58, f101	}	{ .mfi	(p17) ADD3	f113 = f15,   f62, f113	}	;;	{ .mfi	(p17) ADD4	f104 = f14,   f58, f104	}	{ .mfi	(p17) ADD4	f116 = f14,   f62, f116	}	;;	{ .mfi	(p17) ADD3	f107 = f15,   f60, f107	}	{ .mfi	(p17) ADD3	f119 = f15,   f64, f119	}	;;	{ .mfi	(p13) PREFETCH	[RPRE4], 16 * SIZE	(p17) ADD4	f110 = f14,   f60, f110	}	{ .mfi	(p17) ADD4	f122 = f14,   f64, f122	}	;;	{ .mfi	(p16) LDFD	f56  = [AO4], 1 * SIZE	(p17) FMA	f101 = f16,   f65, f101	}	{ .mfi	(p16) LDFD	f60  = [AO12], 1 * SIZE	(p17) FMA	f113 = f16,   f69, f113	}	;;	{ .mfi	(p16) LDFD	f57  = [AO4], 1 * SIZE	(p17) 
FMA	f104 = f17,   f65, f104	}	{ .mfi	(p16) LDFD	f61  = [AO12], 1 * SIZE	(p17) FMA	f116 = f17,   f69, f116	}	;;	{ .mmf	(p18) STFD	[YST1] = f102, 1 * SIZE	(p18) STFD	[YST2] = f114, 1 * SIZE	(p17) FMA	f107 = f16,   f67, f107	}	{ .mmf	(p16) LDFD	f58  = [AO4], 1 * SIZE	(p16) LDFD	f62  = [AO12], 1 * SIZE	(p17) FMA	f119 = f16,   f71, f119	}	;;	{ .mmf	(p18) STFD	[YST1] = f105, 1 * SIZE	(p18) STFD	[YST2] = f117, 1 * SIZE	(p17) FMA	f110 = f17,   f67, f110	}	{ .mmf	(p16) LDFD	f59  = [AO4], 5 * SIZE	(p16) LDFD	f63  = [AO12], 5 * SIZE	(p17) FMA	f122 = f17,   f71, f122	}	;;	{ .mfi	(p17) ADD3	f101 = f17,   f66, f101	}	{ .mfi	(p17) ADD3	f113 = f17,   f70, f113	}	;;	{ .mfi	(p17) ADD4	f104 = f16,   f66, f104	}	{ .mfi	(p17) ADD4	f116 = f16,   f70, f116	}	;;	{ .mfi	(p17) ADD3	f107 = f17,   f68, f107	}	{ .mfi	(p17) ADD3	f119 = f17,   f72, f119	}	;;	{ .mfi	(p12) PREFETCH	[RPRE5], 16 * SIZE	(p17) ADD4	f110 = f16,   f68, f110	}	{ .mfi	(p17) ADD4	f122 = f16,   f72, f122	}	;;	{ .mfi	(p16) LDFD	f64  = [AO5], 1 * SIZE	(p17) FMA	f101 = f18,   f73, f101	}	{ .mfi	(p16) LDFD	f68  = [AO13], 1 * SIZE	(p17) FMA	f113 = f18,   f77, f113	}	;;	{ .mfi	(p16) LDFD	f65  = [AO5], 1 * SIZE	(p17) FMA	f104 = f19,   f73, f104	}	{ .mfi	(p16) LDFD	f69  = [AO13], 1 * SIZE	(p17) FMA	f116 = f19,   f77, f116	}	;;	{ .mmf	(p18) STFD	[YST1] = f108, 1 * SIZE	(p18) STFD	[YST2] = f120, 1 * SIZE	(p17) FMA	f107 = f18,   f75, f107	}	{ .mmf	(p16) LDFD	f66  = [AO5], 1 * SIZE	(p16) LDFD	f70  = [AO13], 1 * SIZE	(p17) FMA	f119 = f18,   f79, f119	}	;;	{ .mmf	(p18) STFD	[YST1] = f111, 5 * SIZE	(p18) STFD	[YST2] = f123, 5 * SIZE	(p17) FMA	f110 = f19,   f75, f110	}	{ .mmf	(p16) LDFD	f67  = [AO5], 5 * SIZE	(p16) LDFD	f71  = [AO13], 5 * SIZE	(p17) FMA	f122 = f19,   f79, f122	}	;;	{ .mfi	(p17) ADD3	f101 = f19,   f74, f101	}	{ .mfi	(p17) ADD3	f113 = f19,   f78, f113	}	;;	{ .mfi	(p17) ADD4	f104 = f18,   f74, f104	}	{ .mfi	(p17) ADD4	f116 = f18,   f78, f116	}	;;	{ .mfi	(p17) ADD3	f107 = f19,   f76, f107	}	{ .mfi	(p17) ADD3	f119 = f19,   f80, 
f119	}	;;	{ .mfi	(p13) PREFETCH	[RPRE6], 16 * SIZE	(p17) ADD4	f110 = f18,   f76, f110	}	{ .mfi	(p17) ADD4	f122 = f18,   f80, f122	}	;;	{ .mfi	(p16) LDFD	f72  = [AO6], 1 * SIZE	(p17) FMA	f101 = f20,   f81, f101	}	{ .mfi	(p16) LDFD	f76  = [AO14], 1 * SIZE	(p17) FMA	f113 = f20,   f85, f113	}	;;	{ .mfi	(p16) LDFD	f73  = [AO6], 1 * SIZE	(p17) FMA	f104 = f21,   f81, f104	}	{ .mfi	(p16) LDFD	f77  = [AO14], 1 * SIZE	(p17) FMA	f116 = f21,   f85, f116	}	;;	{ .mfi	(p16) LDFD	f74  = [AO6], 1 * SIZE	(p17) FMA	f107 = f20,   f83, f107	}	{ .mfi	(p16) LDFD	f78  = [AO14], 1 * SIZE	(p17) FMA	f119 = f20,   f87, f119	}	;;	{ .mfi	(p16) LDFD	f75  = [AO6], 5 * SIZE	(p17) FMA	f110 = f21,   f83, f110	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -