⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemv_t.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Upper bound applied to MIN_M at the top of .LIs_loop. */
#define P	2048

/* Stack pointer. */
#define SP	r12

/* Read directly as input registers (the alloc below declares 8 inputs). */
#define M	r32
#define N	r33
#define A	r37
#define LDA	r38
#define X	r39
/* Not read as inputs: these are loaded with ld8 from SP+16, SP+24, SP+32 */
/* and SP+40 (entry value of SP) right after the prologue.               */
#define INCX	r34
#define Y	r35
#define INCY	r36
#define BUFFER	r11

/* Loop bookkeeping. r14-r17 first hold the stack-argument addresses in */
/* the prologue and are reused under these names afterwards.            */
#define MIN_M	r14
#define I	r15
#define J	r16
#define IS	r17

/* Address registers: in .L11, AO1 = A, AO2 = A + LDA, AO3 = A + 2*LDA, */
/* and so on up to AO8. AO1/AO2 also serve as store pointers into       */
/* BUFFER while X is being copied.                                      */
#define AO1	r18
#define AO2	r19
#define AO3	r20
#define AO4	r21
#define AO5	r22
#define AO6	r23
#define AO7	r24
#define AO8	r25
/* Second read pointer (X + 2*INCX during the copy, BUFFER in .L11). */
#define BO	r26
/* Set to P - (N * LDA) before the main loop, using the unscaled LDA. */
#define LDAP	r27
/* INCY - SIZE, computed after INCY has been scaled by ZBASE_SHIFT. */
#define INCYM1	r28

/* Local registers loc0-loc7: prefetch addresses running ahead of AO1-AO8. */
#define RPRE1	loc0
#define RPRE2	loc1
#define RPRE3	loc2
#define RPRE4	loc3
#define RPRE5	loc4
#define RPRE6	loc5
#define RPRE7	loc6
#define RPRE8	loc7
/* NOTE(review): AO21-AO81 are not used in the portion of the routine */
/* visible here; their role should be confirmed further down.         */
#define AO21	loc8
#define AO41	loc9
#define AO61	loc10
#define AO81	loc11
/* Load (CLD*) and store (CST*) pointers into Y; the *2 pointer starts */
/* 2 * INCY beyond the *1 pointer.                                      */
#define CLD1	loc12
#define CLD2	loc13
#define CST1	loc14
#define CST2	loc15

/* r8 / r9 are shared: spill pointers in the prologue, then OFFSET   */
/* (INCX - SIZE) and INCX3M1 (3 * INCX - SIZE) during the X copy,    */
/* then PREB / WPRE as prefetch addresses in the inner loop.         */
#define PREB	r8
#define WPRE	r9
#define OFFSET	PREB
#define INCX3M1	WPRE
/* 3 * INCY - SIZE (computed as 2 * INCY + INCYM1). */
#define INCY3M1	r10

/* Copies of ar.lc, the predicate registers and ar.pfs taken on entry. */
#define ARLC	r29
#define PR	r30
#define ARPFS	r31

/* Prefetch distance, in units of SIZE, added to AO1-AO8 and BO. */
#ifdef DOUBLE
#define RPREFETCH	(16 * 2 +  8)
#else
#define RPREFETCH	(16 * 2 + 16)
#endif
#define PREFETCH	lfetch.nt1

/* Hold copies of f8 / f9 taken right after the prologue. */
#define ALPHA_R	f6
#define ALPHA_I	f7

/* ADD1-ADD4 are the four multiply-accumulate forms used in the .L16    */
/* inner loop; CONJ / XCONJ select which of them subtract (FNMA)        */
/* instead of add (FMA).                                                */
#if   !defined(CONJ) && !defined(XCONJ)
#define ADD1	     FMA
#define ADD2	     FMA
#define ADD3	     FNMA
#define ADD4	     FMA
#elif  defined(CONJ) && !defined(XCONJ)
#define ADD1	     FMA
#define ADD2	     FMA
#define ADD3	     FMA
#define ADD4	     FNMA
#elif !defined(CONJ) &&  defined(XCONJ)
#define ADD1	     FMA
#define ADD2	     FNMA
#define ADD3	     FMA
#define ADD4	     FMA
#else
#define ADD1	     FMA
#define ADD2	     FNMA
#define ADD3	     FNMA
#define ADD4	     FNMA
#endif

	PROLOGUE
	.prologue
	PROFCODE
	/* Register frame: 8 inputs, 16 locals (loc0-loc15), 0 outputs,     */
	/* 0 rotating. The previous ar.pfs goes to ARPFS, ar.lc to ARLC.    */
	{ .mmi
	.save	ar.pfs, ARPFS
	alloc	ARPFS = ar.pfs, 8, 16, 0, 0
	mov	ARLC  = ar.lc
	}
	;;
	/* Copy the predicate registers into PR, and form the addresses     */
	/* SP+16 .. SP+40 that the ld8s after the spills read from.         */
	mov	PR = pr
	adds	r14 = 16, SP
	adds	r15 = 24, SP
	adds	r16 = 32, SP
	adds	r17 = 40, SP
	;;
	/* Reserve 8 * 16 bytes below SP. r8 / r9 start 16 bytes apart and  */
	/* each advances by 32, so together they fill eight 16-byte slots.  */
	adds	r8 = -8 * 16, SP
	adds	r9 = -7 * 16, SP
	adds	SP = -8 * 16, SP
	;;
	/* Spill f16-f23 into the reserved area. */
	stf.spill  [r8] = f16, 32
	stf.spill  [r9] = f17, 32
	;;
	stf.spill  [r8] = f18, 32
	stf.spill  [r9] = f19, 32
	;;
	stf.spill  [r8] = f20, 32
	stf.spill  [r9] = f21, 32
	;;
	stf.spill  [r8] = f22
stf.spill  [r9] = f23	;;		ld8	INCX   = [r14]	ld8	Y      = [r15]	ld8	INCY   = [r16]	ld8	BUFFER = [r17]	;;	setf.sig f11 = LDA	setf.sig f10 = N	;;	mov	ALPHA_R = f8	mov	ALPHA_I = f9	mov	IS = 0	cmp.ge	p7, p0 = 0, M	cmp.ge	p6, p0 = 0, N	;;	shladd	INCX = INCX, ZBASE_SHIFT, r0	shladd	LDA  = LDA,  ZBASE_SHIFT, r0	.body	;;	shladd	INCY = INCY, ZBASE_SHIFT, r0	xmpy.l f10 = f10, f11	mov	r2 = P	;;	adds	INCYM1 = - SIZE, INCY	;;	shladd	INCY3M1  = INCY, 1, INCYM1	;;	getf.sig LDAP = f10	tbit.nz	p8, p0 = A,   BASE_SHIFT	;;	sub	LDAP = r2, LDAP	(p7) br.cond.dpnt .L999	(p6) br.cond.dpnt .L999	.align 16	;;.LIs_loop:	{ .mmi	sub	MIN_M = M, IS	adds	OFFSET = -SIZE, INCX	mov	pr.rot= 0	}	{ .mmi	mov	AO1 = BUFFER	shladd	BO  = INCX, 1, X	adds	AO2 = 4 * SIZE, BUFFER	}	;;	{ .mmi	cmp.le	p6, p0 = r2, MIN_M	shladd	INCX3M1  = INCX, 1, INCX	mov	ar.ec= 5	}	;;	{ .mmi	(p6) mov MIN_M = P	adds	INCX3M1 = -SIZE, INCX3M1	cmp.eq	p16, p0 = r0, r0	}	;;	{ .mii	shr	I = MIN_M, 3	tbit.nz	p13, p0 = MIN_M, 2	}	;;	{ .mmi	adds I = -1, I	}	;;	{ .mib	cmp.gt	p6, p0 = 0, I	mov	ar.lc = I	(p6) br.cond.dpnt .L05	}	;;	.align 16.L01:	(p20) STFD	[AO1] = f36, SIZE	(p20) STFD	[AO2] = f56, SIZE	(p16) LDFD	f32 = [X], SIZE	(p16) LDFD	f52 = [BO], SIZE	;;	(p20) STFD	[AO1] = f41, SIZE	(p20) STFD	[AO2] = f61, SIZE	(p16) LDFD	f37 = [X], OFFSET	(p16) LDFD	f57 = [BO], OFFSET	;;	(p20) STFD	[AO1] = f46, SIZE	(p20) STFD	[AO2] = f66, SIZE	(p16) LDFD	f42 = [X], SIZE	(p16) LDFD	f62 = [BO], SIZE	;;	(p20) STFD	[AO1] = f51, 5 * SIZE	(p20) STFD	[AO2] = f71, 5 * SIZE	(p16) LDFD	f47 = [X], INCX3M1	(p16) LDFD	f67 = [BO], INCX3M1	;;	(p20) STFD	[AO1] = f76, SIZE	(p20) STFD	[AO2] = f96, SIZE	(p16) LDFD	f72 = [X], SIZE	(p16) LDFD	f92 = [BO], SIZE	;;	(p20) STFD	[AO1] = f81, SIZE	(p20) STFD	[AO2] = f101, SIZE	(p16) LDFD	f77 = [X], OFFSET	(p16) LDFD	f97 = [BO], OFFSET	;;	(p20) STFD	[AO1] = f86, SIZE	(p20) STFD	[AO2] = f106, SIZE	(p16) LDFD	f82 = [X], SIZE	(p16) LDFD	f102 = [BO], SIZE	;;	(p20) STFD	[AO1] = f91, 5 * SIZE	(p20) STFD	[AO2] = f111, 5 * SIZE	(p16) LDFD	
f87 = [X], INCX3M1	(p16) LDFD	f107 = [BO], INCX3M1	br.ctop.sptk.few .L01	;;	.align 16.L05:	{ .mmi	(p13) LDFD f32 = [X],  SIZE	(p13) LDFD f36 = [BO],  SIZE	tbit.nz	p14, p0 = MIN_M, 1	}	;;	{ .mmi	(p13) LDFD f33 = [X],  OFFSET	(p13) LDFD f37 = [BO],  OFFSET	tbit.nz	p15, p0 = MIN_M, 0	}	;;	{ .mmb	(p13) LDFD f34 = [X],  SIZE	(p13) LDFD f38 = [BO],  SIZE	}	;;	{ .mmi	(p13) LDFD f35 = [X],  INCX3M1	(p13) LDFD f39 = [BO],  INCX3M1	}	;;	{ .mmi	(p14) LDFD f40 = [X],  SIZE	}	;;	(p14) LDFD f41 = [X],  OFFSET	(p13) STFD [AO1] = f32, SIZE	tbit.nz	p8, p0 = A,   BASE_SHIFT	;;	(p14) LDFD f42 = [X],  SIZE	(p13) STFD [AO2] = f36, SIZE	;;	(p14) LDFD f43 = [X],  OFFSET	(p13) STFD [AO1] = f33, SIZE	;;	(p15) LDFD f44 = [X],  SIZE	(p13) STFD [AO2] = f37, SIZE	;;	(p15) LDFD f45 = [X],  OFFSET	(p13) STFD [AO1] = f34, SIZE	(p13) STFD [AO2] = f38, SIZE	;;	(p13) STFD [AO1] = f35, 5 * SIZE	(p13) STFD [AO2] = f39, 5 * SIZE	;;	(p14) STFD [AO1] = f40, SIZE	;;	(p14) STFD [AO1] = f41, SIZE	;;	(p14) STFD [AO1] = f42, SIZE	;;	(p14) STFD [AO1] = f43, SIZE	;;	(p15) STFD [AO1] = f44, SIZE	;;	(p15) STFD [AO1] = f45, SIZE	(p8) br.cond.dpnt .L100	;;	.align 16.L10:	{ .mmi	mov	CLD1  = Y	shladd	CLD2  = INCY, 1, Y	shr	J   = N, 3	}	;;	{ .mmb	mov	CST1  = Y	cmp.eq	p6, p0 = r0, J	(p6) br.cond.dpnt .L20	}	;;	.align 16.L11:	{ .mfi	mov	AO1 = A	mov	f8  = f0	mov	pr.rot= 0	}	{ .mfi	add	AO2 = LDA, A	mov	f10 = f0	mov	BO  = BUFFER	}	;;	{ .mmf	shladd	AO3 = LDA, 1, A	shladd	AO4 = LDA, 1, AO2	mov	f12 = f0	}	{ .mmf	adds	RPRE1  = (RPREFETCH +  0) * SIZE, AO1	adds	RPRE2  = (RPREFETCH +  2) * SIZE, AO2	mov	f14 = f0	}	;;	{ .mmf	shladd	AO5 = LDA, 1, AO3	shladd	AO6 = LDA, 1, AO4	mov	f16 = f0	}	{ .mmf	adds	RPRE3  = (RPREFETCH +  4) * SIZE, AO3	adds	RPRE4  = (RPREFETCH +  6) * SIZE, AO4	mov	f18 = f0	}	;;	{ .mmf	shladd	AO7 = LDA, 1, AO5	shladd	AO8 = LDA, 1, AO6	mov	f20 = f0	}	{ .mmf	adds	RPRE5  = (RPREFETCH +  8) * SIZE, AO5	adds	RPRE6  = (RPREFETCH + 10) * SIZE, AO6	mov	f22 = f0	}	;;	{ .mfi	shladd	A   = LDA, 3, A	mov	f9  = f0	mov	ar.ec= 
5	}	{ .mmf	adds	RPRE7  = (RPREFETCH + 12) * SIZE, AO7	adds	RPRE8  = (RPREFETCH + 14) * SIZE, AO8	mov	f11 = f0	}	;;	{ .mmf	adds	WPRE = 16 * SIZE, CLD1	adds	PREB   = RPREFETCH * SIZE, BO	mov	f13 = f0	}	{ .mmf	adds	I = -1, MIN_M	cmp.eq	p16, p0 = r0, r0	mov	f15 = f0	}	;;	{ .mfi	cmp.eq  p12, p0 = r0, r0	mov	f17 = f0	mov	ar.lc = I	}	{ .mmf	nop	__LINE__	nop	__LINE__	mov	f19 = f0	}	;;	{ .mmf	lfetch.excl.nt1	[WPRE]	nop	__LINE__	mov	f21 = f0	}	{ .mmf	mov	I = 0	nop	__LINE__	mov	f23 = f0	}	;;	.align 16.L16:	{ .mmf	(p12) PREFETCH [RPRE1], 16 * SIZE	(p16) LDFPD	f32,  f37  = [AO1], 2 * SIZE	(p20) ADD1	f8  = f116, f36, f8	}	{ .mmf	(p16) cmp.eq.unc p13, p0 = 1, I	nop   __LINE__	(p20) ADD2	f9  = f121, f36, f9	}	;;	{ .mmf	(p13) PREFETCH [PREB], 16 * SIZE	(p16) LDFPD	f112, f117 = [BO], 2 * SIZE	(p20) ADD1	f10 = f116, f46, f10	}	{ .mmf	(p16) cmp.eq.unc p14, p0 = 2, I	(p16) cmp.eq.unc p15, p0 = 3, I	(p20) ADD2	f11 = f121, f46, f11	}	;;	{ .mmf	(p16) LDFPD	f42,  f47  = [AO2], 2 * SIZE	nop   __LINE__	(p20) ADD1	f12 = f116, f56, f12	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f13 = f121, f56, f13	}	;;	{ .mmf	(p13) PREFETCH [RPRE2], 16 * SIZE	nop   __LINE__	(p20) ADD1	f14 = f116, f66, f14	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f15 = f121, f66, f15	}	;;	{ .mmf	(p16) LDFPD	f52,  f57  = [AO3], 2 * SIZE	nop   __LINE__	(p20) ADD3	f8  = f121, f41, f8	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD4	f9  = f116, f41, f9	}	;;	{ .mmf	(p14) PREFETCH [RPRE3], 16 * SIZE	nop   __LINE__	(p20) ADD3	f10 = f121, f51, f10	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD4	f11 = f116, f51, f11	}	;;	{ .mmf	(p16) LDFPD	f62,  f67  = [AO4], 2 * SIZE	nop   __LINE__	(p20) ADD3	f12 = f121, f61, f12	}	{ .mmf	(p16) cmp.eq.unc p12, p0 = 4, I	(p16) cmp.eq.unc p13, p0 = 5, I	(p20) ADD4	f13 = f116, f61, f13	}	;;	{ .mmf	(p15) PREFETCH [RPRE4], 16 * SIZE	nop   __LINE__	(p20) ADD3	f14 = f121, f71, f14	}	{ .mmf	(p16) cmp.eq.unc p14, p0 = 6, I	(p16) cmp.eq.unc p15, p0 = 7, I	(p20) ADD4	f15 = f116, f71, f15	}	;;	
{ .mmf	(p16) LDFPD	f72,  f77  = [AO5], 2 * SIZE	nop   __LINE__	(p20) ADD1	f16 = f116, f76, f16	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f17 = f121, f76, f17	}	;;	{ .mmf	(p12) PREFETCH [RPRE5], 16 * SIZE	nop   __LINE__	(p20) ADD1	f18 = f116, f86, f18	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f19 = f121, f86, f19	}	;;	{ .mmf	(p16) LDFPD	f82,  f87  = [AO6], 2 * SIZE	nop   __LINE__	(p20) ADD1	f20 = f116, f96, f20	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f21 = f121, f96, f21	}	;;	{ .mmf	(p13) PREFETCH [RPRE6], 16 * SIZE	nop   __LINE__	(p20) ADD1	f22 = f116, f106, f22	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD2	f23 = f121, f106, f23	}	;;	{ .mmf	(p16) LDFPD	f92,  f97  = [AO7], 2 * SIZE	nop   __LINE__	(p20) ADD3	f16 = f121, f81, f16	}	{ .mmf	nop   __LINE__	nop   __LINE__	(p20) ADD4	f17 = f116, f81, f17	}	;;	{ .mmf	(p14) PREFETCH [RPRE7], 16 * SIZE	nop   __LINE__	(p20) ADD3	f18 = f121, f91, f18	}	{ .mmf	nop   __LINE__	(p16) adds I = 1, I	(p20) ADD4	f19 = f116, f91, f19	}	;;	{ .mmf	(p16) LDFPD	f102, f107 = [AO8], 2 * SIZE	nop   __LINE__	(p20) ADD3	f20 = f121, f101, f20	}	{ .mmf	(p15) mov I = 0	nop   __LINE__	(p20) ADD4	f21 = f116, f101, f21	}	;;	{ .mmf	(p15) PREFETCH [RPRE8], 16 * SIZE	nop   __LINE__	(p20) ADD3	f22 = f121, f111, f22	}	{ .mfb	(p16) cmp.eq.unc p12, p0 = 0, I	(p20) ADD4	f23 = f116, f111, f23	br.ctop.sptk.few .L16	}	;;.L18:	LDFD	f32 = [CLD1], SIZE	LDFD	f36 = [CLD2], SIZE	shladd	CST2  = INCY, 1, CST1	;;	LDFD	f33 = [CLD1], INCYM1	LDFD	f37 = [CLD2], INCYM1	;;	LDFD	f34 = [CLD1], SIZE	LDFD	f38 = [CLD2], SIZE	;;	LDFD	f35 = [CLD1], INCY3M1	LDFD	f39 = [CLD2], INCY3M1	;;	LDFD	f40 = [CLD1], SIZE	LDFD	f44 = [CLD2], SIZE	;;	LDFD	f41 = [CLD1], INCYM1	LDFD	f45 = [CLD2], INCYM1	;;	LDFD	f42 = [CLD1], SIZE	LDFD	f46 = [CLD2], SIZE	;;	LDFD	f43 = [CLD1], INCY3M1	LDFD	f47 = [CLD2], INCY3M1	;;	FMA	f32 = ALPHA_R, f8,  f32	FMA	f36 = ALPHA_R, f12, f36	FMA	f33 = ALPHA_I, f8,  f33	FMA	f37 = ALPHA_I, f12, f37	FMA	f34 = ALPHA_R, f10, f34	FMA	f38 = ALPHA_R, 
f14, f38	FMA	f35 = ALPHA_I, f10, f35	FMA	f39 = ALPHA_I, f14, f39	;;	FNMA	f32 = ALPHA_I, f9,  f32	FNMA	f36 = ALPHA_I, f13, f36	FMA	f33 = ALPHA_R, f9,  f33	FMA	f37 = ALPHA_R, f13, f37	FNMA	f34 = ALPHA_I, f11, f34	FNMA	f38 = ALPHA_I, f15, f38	FMA	f35 = ALPHA_R, f11, f35	FMA	f39 = ALPHA_R, f15, f39	;;	FMA	f40 = ALPHA_R, f16, f40	FMA	f44 = ALPHA_R, f20, f44	FMA	f41 = ALPHA_I, f16, f41	FMA	f45 = ALPHA_I, f20, f45	FMA	f42 = ALPHA_R, f18, f42	FMA	f46 = ALPHA_R, f22, f46	FMA	f43 = ALPHA_I, f18, f43	FMA	f47 = ALPHA_I, f22, f47	;;	{ .mmf	STFD [CST1] = f32, SIZE	STFD [CST2] = f36, SIZE	FNMA	f40 = ALPHA_I, f17, f40	}	{ .mmf	nop	__LINE__	nop	__LINE__	FNMA	f44 = ALPHA_I, f21, f44	}	;;	{ .mmf	STFD [CST1] = f33

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -