⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemv_t.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         
*//*********************************************************************/#define ASSEMBLER#include "common.h"#define P	4096#define SP	r12#define M	r32#define N	r33#define A	r36#define LDA	r37#define X	r38#define INCX	r39#define Y	r34#define INCY	r35#define BUFFER	r11#define MIN_M	r14#define I	r15#define J	r16#define IS	r17#define AO1	r18#define AO2	r19#define AO3	r20#define AO4	r21#define AO5	r22#define AO6	r23#define AO7	r24#define AO8	r25#define BO	r26#define LDAP	r27#define RPRE1	loc0#define RPRE2	loc1#define RPRE3	loc2#define RPRE4	loc3#define RPRE5	loc4#define RPRE6	loc5#define RPRE7	loc6#define RPRE8	loc7#define AO21	loc8#define AO41	loc9#define AO61	loc10#define AO81	loc11	#define PREB	r8#define WPRE	r9#define OFFSET	PREB#define CO	r10#define ARLC	r29#define PR	r30#define ARPFS	r31	#ifdef DOUBLE#define RPREFETCH	(16 * 3 +  8)#else#define RPREFETCH	(16 * 3 + 16)#endif#define PREFETCH	lfetch.nt1#define ALPHA	f6	PROLOGUE	.prologue	PROFCODE	{ .mmi	.save	ar.pfs, ARPFS	alloc	ARPFS = ar.pfs, 8, 16, 8, 0	setf.sig f11 = LDA	mov	ARLC  = ar.lc	}	{ .mmi	adds	r15 = 24, SP	adds	r16 = 32, SP	adds	r14 = 16, SP	}	;;	{ .mmi	setf.sig f10 = N	ld8	Y      = [r14]	mov	PR = pr	}	{ .mmi	ld8	INCY   = [r15]	adds	r8 = -8 * 16, SP	adds	r9 = -7 * 16, SP	}	;;	{ .mmi	stf.spill  [r8] = f16, 32	stf.spill  [r9] = f17, 32	adds	SP = -8 * 16, SP	}	;;	{ .mmf	stf.spill  [r8] = f18, 32	stf.spill  [r9] = f19, 32	mov	ALPHA = f8	}	;;	{ .mmi	stf.spill  [r8] = f20, 32	stf.spill  [r9] = f21, 32	mov	IS = 0	}	;;	{ .mmf	stf.spill  [r8] = f22	stf.spill  [r9] = f23	xmpy.l f10 = f10, f11	}	.body	;;		;;	{ .mmi	ld8	BUFFER = [r16]	cmp.ge	p7, p0 = r0, M	cmp.ge	p6, p0 = r0, N	}	;;	{ .mmi	shladd	INCX = INCX, BASE_SHIFT, r0	shladd	LDA  = LDA, BASE_SHIFT, r0	shladd	INCY = INCY, BASE_SHIFT, r0	}	;;	{ .mmi	getf.sig LDAP = f10	mov	r2 = P	tbit.nz	p8, p0 = A,   BASE_SHIFT	}	{ .mmi	nop	__LINE__	nop	__LINE__	tbit.nz	p9, p0 = LDA, BASE_SHIFT	}	;;	{ .mbb	sub	LDAP = r2, LDAP	(p7) br.cond.dpnt .L999	(p6) br.cond.dpnt .L999	}	
.align 16	;;.LIs_loop:	{ .mmi	sub	MIN_M = M, IS	(p8) LDFD f32 = [X],  INCX	mov	pr.rot= 0	}	{ .mmi	mov	AO1 = BUFFER	adds	AO2 = 4 * SIZE, BUFFER	}	;;	cmp.le	p6, p0 = r2, MIN_M	;;	(p6) mov MIN_M = P	;;	(p8) adds MIN_M = -1, MIN_M	;;	{ .mmi	shladd	OFFSET = INCX, 2, INCX	shladd	BO  = INCX, 2, X	shr	I = MIN_M, 3	}	;;	{ .mmi	adds I = -1, I	cmp.eq	p16, p0 = r0, r0	mov	ar.ec= 5	}	;;	{ .mmi	(p8) STFD [AO1] = f32, 2 * SIZE	(p8) adds	AO2 = 6 * SIZE, BUFFER	mov	ar.lc = I	}	{ .mib	cmp.gt	p6, p0 = 0, I	tbit.nz	p13, p0 = MIN_M, 2	(p6) br.cond.dpnt .L05	}	;;	.align 16.L01:	(p20) STFD [AO1] = f36,  SIZE	(p20) STFD [AO2] = f56,  SIZE	(p16) LDFD f32 = [X],  INCX	(p16) LDFD f52 = [BO], INCX	;;	(p20) STFD [AO1] = f41,  SIZE	(p20) STFD [AO2] = f61,  SIZE	(p16) LDFD f37 = [X],  INCX	(p16) LDFD f57 = [BO], INCX	;;	(p20) STFD [AO1] = f46,  SIZE	(p20) STFD [AO2] = f66,  SIZE	(p16) LDFD f42 = [X],  INCX	(p16) LDFD f62 = [BO], INCX	;;	(p20) STFD [AO1] = f51,  5 * SIZE	(p20) STFD [AO2] = f71,  5 * SIZE	(p16) LDFD f47 = [X],  OFFSET	(p16) LDFD f67 = [BO], OFFSET	br.ctop.sptk.few .L01	;;	.align 16.L05:	(p13) LDFD f32 = [X],  INCX	tbit.nz	p14, p0 = MIN_M, 1	;;	(p13) LDFD f33 = [X],  INCX	tbit.nz	p15, p0 = MIN_M, 0	;;	(p13) LDFD f34 = [X],  INCX	;;	(p13) LDFD f35 = [X],  INCX	;;	(p14) LDFD f36 = [X],  INCX	;;	(p13) STFD [AO1] = f32, SIZE	(p14) LDFD f37 = [X],  INCX	;;	(p13) STFD [AO1] = f33, SIZE	(p15) LDFD f38 = [X],  INCX	;;	(p13) STFD [AO1] = f34, SIZE	;;	(p13) STFD [AO1] = f35, SIZE	;;	(p14) STFD [AO1] = f36, SIZE	;;	(p14) STFD [AO1] = f37, SIZE	;;	(p15) STFD [AO1] = f38, SIZE	(p9) br.cond.dpnt .L100	;;	.align 16.L10:	{ .mmi	mov	CO  = Y	nop	__LINE__	shr	J   = N, 3	}	;;	{ .mib	nop	__LINE__	cmp.eq	p6, p0 = r0, J	(p6) br.cond.dpnt .L20	}	;;	.align 16.L11:	{ .mfi	mov	AO1 = A	mov	f8  = f0	mov	pr.rot= 0	}	{ .mfi	add	AO2 = LDA, A	mov	f10 = f0	shr	I = MIN_M, 4	}	;;	{ .mmf	shladd	AO3 = LDA, 1, A	shladd	AO4 = LDA, 1, AO2	mov	f12 = f0	}	{ .mmf	(p8) LDFD f32 = [AO1], SIZE	(p8) LDFD f33 = [AO2], SIZE	mov	
f14 = f0	}	;;	{ .mmf	shladd	AO5 = LDA, 1, AO3	shladd	AO6 = LDA, 1, AO4	mov	f16 = f0	}	{ .mmf	(p8) LDFD f34 = [AO3], SIZE	(p8) LDFD f35 = [AO4], SIZE	mov	f18 = f0	}	;;	{ .mmf	shladd	AO7 = LDA, 1, AO5	shladd	AO8 = LDA, 1, AO6	mov	f20 = f0	}	{ .mmf	(p8) LDFD f36 = [AO5], SIZE	(p8) LDFD f37 = [AO6], SIZE	mov	f22 = f0	}	;;	{ .mfi	(p8) LDFD f38 = [AO7], SIZE	mov	f9  = f0	mov	ar.ec= 2	}	{ .mmf	(p8) LDFD f39 = [AO8], SIZE	mov	BO  = BUFFER	mov	f11 = f0	}	;;	{ .mmf	(p8) LDFD f40 = [BO], 2 * SIZE	cmp.eq	p6, p0 = 0, I	mov	f13 = f0	}	{ .mmf	shladd	A   = LDA, 3, A	cmp.eq	p16, p0 = r0, r0	mov	f15 = f0	}	;;	{ .mmf	add	I = I, I	nop	__LINE__	mov	f17 = f0	}	{ .mmf	adds	RPRE1  = RPREFETCH * SIZE, AO1	adds	RPRE2  = (RPREFETCH + 8) * SIZE, AO2	mov	f19 = f0	}	;;	{ .mmf	adds	I = -1, I	nop	__LINE__	mov	f21 = f0	}	{ .mmf	adds	RPRE3  = RPREFETCH * SIZE, AO3	adds	RPRE4  = (RPREFETCH + 8) * SIZE, AO4	mov	f23 = f0	}	;;	{ .mmf	nop	__LINE__	nop	__LINE__	(p8) FMPY	f8  = f40, f32	}	{ .mmf	adds	RPRE5  = RPREFETCH * SIZE, AO5	adds	RPRE6  = (RPREFETCH + 8) * SIZE, AO6	(p8) FMPY	f10 = f40, f33	}	;;	{ .mmf	nop	__LINE__	nop	__LINE__	(p8) FMPY	f12 = f40, f34	}	{ .mmf	adds	RPRE7  = RPREFETCH * SIZE, AO7	adds	RPRE8  = (RPREFETCH + 8) * SIZE, AO8	(p8) FMPY	f14 = f40, f35	}	;;	{ .mfi	nop	__LINE__	(p8) FMPY	f16 = f40, f36	mov	ar.lc = I	}	{ .mmf	adds	WPRE = 8 * SIZE, CO	adds	PREB  = RPREFETCH * SIZE, BO	(p8) FMPY	f18 = f40, f37	}	;;	{ .mmf	lfetch.excl.nt1	[WPRE]	nop	__LINE__	(p8) FMPY	f20 = f40, f38	}	{ .mfb	nop	__LINE__	(p8) FMPY	f22 = f40, f39	(p6) br.cond.dpnt .L15	}	;;	.align 16.L12:	{ .mfi	(p17) LDFPD	f95, f96 = [AO8], 2 * SIZE	(p17) FMA	f8  = f104, f33, f8	(p16) tbit.nz.unc p14, p15 = I, 0	}	{ .mfi	(p17) LDFPD	f110, f111 = [BO], 2 * SIZE	(p17) FMA	f9  = f105, f34, f9	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f32, f33 = [AO1], 2 * SIZE	(p17) FMA	f10 = f104, f35, f10	nop	__LINE__	}	{ .mfi	(p14) PREFETCH [RPRE1], 16 * SIZE	(p17) FMA	f11 = f105, f36, f11	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f34, f35 = [AO2], 2 * 
SIZE	(p17) FMA	f12 = f104, f37, f12	nop	__LINE__	}	{ .mfi	(p15) PREFETCH [RPRE2], 16 * SIZE	(p17) FMA	f13 = f105, f38, f13	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f36, f37 = [AO3], 2 * SIZE	(p17) FMA	f14 = f104, f39, f14	nop	__LINE__	}	{ .mfi	(p14) PREFETCH [RPRE3], 16 * SIZE	(p17) FMA	f15 = f105, f40, f15	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f38, f39 = [AO4], 2 * SIZE	(p17) FMA	f16 = f104, f41, f16	nop	__LINE__	}	{ .mfi	(p15) PREFETCH [RPRE4], 16 * SIZE	(p17) FMA	f17 = f105, f42, f17	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f40, f41 = [AO5], 2 * SIZE	(p17) FMA	f18 = f104, f43, f18	nop	__LINE__	}	{ .mfi	(p14) PREFETCH [RPRE5], 16 * SIZE	(p17) FMA	f19 = f105, f44, f19	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f42, f43 = [AO6], 2 * SIZE	(p17) FMA	f20 = f104, f45, f20	nop	__LINE__	}	{ .mfi	(p15) PREFETCH [RPRE6], 16 * SIZE	(p17) FMA	f21 = f105, f46, f21	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f44, f45 = [AO7], 2 * SIZE	(p17) FMA	f22 = f104, f47, f22	nop	__LINE__	}	{ .mfi	(p14) PREFETCH [RPRE7], 16 * SIZE	(p17) FMA	f23 = f105, f48, f23	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f46, f47 = [AO8], 2 * SIZE	(p17) FMA	f8  = f106, f49, f8	nop	__LINE__	}	{ .mfi	(p15) PREFETCH [RPRE8], 16 * SIZE	(p17) FMA	f9  = f107, f50, f9	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f48, f49 = [AO1], 2 * SIZE	(p17) FMA	f10 = f106, f51, f10	nop	__LINE__	}	{ .mfi	(p14) PREFETCH [PREB], 16 * SIZE	(p17) FMA	f11 = f107, f52, f11	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f50, f51 = [AO2], 2 * SIZE	(p17) FMA	f12 = f106, f53, f12	nop	__LINE__	}	{ .mfi	(p16) LDFPD	f103, f104 = [BO], 2 * SIZE	(p17) FMA	f13 = f107, f54, f13	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f52, f53 = [AO3], 2 * SIZE	(p17) FMA	f14 = f106, f55, f14	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f15 = f107, f56, f15	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f54, f55 = [AO4], 2 * SIZE	(p17) FMA	f16 = f106, f57, f16	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f17 = f107, f58, f17	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f56, f57 = [AO5], 2 * SIZE	(p17) FMA	f18 = f106, f59, f18	
nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f19 = f107, f60, f19	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f58, f59 = [AO6], 2 * SIZE	(p17) FMA	f20 = f106, f61, f20	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f21 = f107, f62, f21	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f60, f61 = [AO7], 2 * SIZE	(p17) FMA	f22 = f106, f63, f22	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f23 = f107, f64, f23	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f62, f63 = [AO8], 2 * SIZE	(p17) FMA	f8  = f108, f65, f8	nop	__LINE__	}	{ .mfi	(p16) LDFPD	f105, f106 = [BO], 2 * SIZE	(p17) FMA	f9  = f109, f66, f9	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f64, f65 = [AO1], 2 * SIZE	(p17) FMA	f10 = f108, f67, f10	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f11 = f109, f68, f11	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f66, f67 = [AO2], 2 * SIZE	(p17) FMA	f12 = f108, f69, f12	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f13 = f109, f70, f13 	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f68, f69 = [AO3], 2 * SIZE	(p17) FMA	f14 = f108, f71, f14	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f15 = f109, f72, f15	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f70, f71 = [AO4], 2 * SIZE	(p17) FMA	f16 = f108, f73, f16	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f17 = f109, f74, f17	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f72, f73 = [AO5], 2 * SIZE	(p17) FMA	f18 = f108, f75, f18	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f19 = f109, f76, f19	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f74, f75 = [AO6], 2 * SIZE	(p17) FMA	f20 = f108, f77, f20	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f21 = f109, f78, f21	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f76, f77 = [AO7], 2 * SIZE	(p17) FMA	f22 = f108, f79, f22	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f23 = f109, f80, f23	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f107, f108 = [BO], 2 * SIZE	(p17) FMA	f8  = f110, f81, f8	nop	__LINE__	}	{ .mfi	(p16) LDFPD	f78, f79 = [AO8], 2 * SIZE	(p17) FMA	f9  = f111, f82, f9	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f80, f81 = [AO1], 2 * SIZE	(p17) FMA	f10 = f110, f83, f10	nop	__LINE__	}	{ 
.mfi	nop	__LINE__	(p17) FMA	f11 = f111, f84, f11	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f82, f83 = [AO2], 2 * SIZE	(p17) FMA	f12 = f110, f85, f12	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f13 = f111, f86, f13	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f84, f85 = [AO3], 2 * SIZE	(p17) FMA	f14 = f110, f87, f14	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f15 = f111, f88, f15	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f86, f87 = [AO4], 2 * SIZE	(p17) FMA	f16 = f110, f89, f16	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f17 = f111, f90, f17	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f88, f89 = [AO5], 2 * SIZE	(p17) FMA	f18 = f110, f91, f18	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f19 = f111, f92, f19	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f90, f91 = [AO6], 2 * SIZE	(p17) FMA	f20 = f110, f93, f20	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p17) FMA	f21 = f111, f94, f21	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f92, f93 = [AO7], 2 * SIZE	(p17) FMA	f22 = f110, f95, f22	nop	__LINE__	}	{ .mfb	adds	I = -1, I	(p17) FMA	f23 = f111, f96, f23	br.ctop.sptk.few .L12	}	;;	.align 16.L15:	and	I = 15, MIN_M	mov	pr.rot= 0	;;	cmp.eq	p6,  p0 = 0, I	cmp.eq	p16, p15 = r0, r0	;;	adds	I = 1, I	;;	shr	I = I, 1	;;	adds	I = -1, I	;;	mov	ar.lc = I	mov	ar.ec= 3	and	I = 15, MIN_M	(p6) br.cond.dpnt .L18	;;	.align 16.L16:	{ .mfi	(p16) LDFPD	f104, f107 = [BO], 2 * SIZE	(p18) FMA	f8  = f106, f34, f8	nop	__LINE__	}	{ .mfi	(p16) LDFPD	f32,  f35  = [AO1], 2 * SIZE	(p15) FMA	f9  = f109, f37, f9	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f38,  f41  = [AO2], 2 * SIZE	(p18) FMA	f10 = f106, f40, f10	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p15) FMA	f11 = f109, f43, f11	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f44,  f47  = [AO3], 2 * SIZE	(p18) FMA	f12 = f106, f46, f12	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p15) FMA	f13 = f109, f49, f13	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f50,  f53  = [AO4], 2 * SIZE	(p18) FMA	f14 = f106, f52, f14	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p15) FMA	f15 = f109, f55, f15	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f56,  f59  = [AO5], 2 
* SIZE	(p18) FMA	f16 = f106, f58, f16	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p15) FMA	f17 = f109, f61, f17	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f62,  f65  = [AO6], 2 * SIZE	(p18) FMA	f18 = f106, f64, f18	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p15) FMA	f19 = f109, f67, f19	(p17) adds	I = -2, I	}	;;	{ .mfi	(p16) LDFPD	f68,  f71  = [AO7], 2 * SIZE	(p18) FMA	f20 = f106, f70, f20	nop	__LINE__	}	{ .mfi	nop	__LINE__	(p15) FMA	f21 = f109, f73, f21	nop	__LINE__	}	;;	{ .mfi	(p16) LDFPD	f74,  f77  = [AO8], 2 * SIZE	(p15) FMA	f23 = f109, f79, f23	(p17) cmp.ne.unc p15, p0 = -1, I	}	{ .mfb	nop	__LINE__	(p18) FMA	f22 = f106, f76, f22	br.ctop.sptk.few .L16	}	;;.L18:	{ .mmf	mov	AO1 = CO	LDFD	f32 = [CO], INCY	FADD	f8  = f8,  f9	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -