⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_beta.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * gemm_beta (IA-64 / Itanium, GNU as syntax, run through the C preprocessor)
 *
 * Scales an M x N column-major block in place:
 *
 *     for (j = 0; j < N; j++)
 *         for (i = 0; i < M; i++)
 *             C[i + j * LDC] = (BETA == 0) ? 0 : BETA * C[i + j * LDC];
 *
 * The BETA == 0 case is a separate store-only path (.L60 - .L99) that
 * never reads C.  Every other BETA (including an unordered compare) takes
 * the load/multiply/store path (.L100 - .L199).
 *
 * Inputs as used by this code:
 *     r32 (M)      rows
 *     r33 (N)      columns; the routine returns immediately when N <= 0
 *     f8  (BETA)   scale factor
 *     [r12 + 16]   pointer to C   ([r12 + 32] when XDOUBLE is defined)
 *     [r12 + 24]   LDC            ([r12 + 40] when XDOUBLE is defined)
 *
 * Each column is processed as M / 16 unrolled blocks of 16 elements,
 * followed by a tail of 8, 4, 2 and 1 elements selected by bits 3..0 of M.
 *
 * NOTE(review): only N <= 0 is rejected.  M is not range-checked; callers
 * are presumably expected to pass M >= 0 -- confirm.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, STFD, LDFD, FMPY, SIZE and BASE_SHIFT are
 * not defined here; they are expected to come from common.h (presumably
 * precision-dependent store / load / multiply macros, the element size in
 * bytes and its log2 -- confirm against common.h).
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distance, in elements, ahead of the current column pointer. */
#define PREFETCHSIZE 140

/* Read (or zero-fill) pointers: CO1 -> element 0, CO2 -> element 4 of the
   current 8-element half block, CO3 -> start of the 2/1-element tail.   */
#define CO1	r14
#define CO2	r15
#define CO3	r16
/* Write pointers of the scaling path, laid out like CO1/CO2/CO3. */
#define DO1	r17
#define DO2	r18
#define DO3	r19

#define I	r22		/* (M >> 4) - 1, loaded into ar.lc          */
#define I_AND_15 r23		/* M & 15, number of tail elements          */
#define PRE1	r24		/* prefetch pointer                         */

#define PR	r30		/* copy of the predicate registers          */
#define ARLC	r31		/* copy of ar.lc on entry                   */

#define M	r32
#define N	r33
#define C	r34		/* current column; advanced by LDC per column */
#define LDC	r35		/* column stride after the shladd below     */
#define J	r36		/* columns left to process                  */

#define BETA	f8

	PROLOGUE
	.prologue
	PROFCODE

	/* Form the stack addresses of C and LDC and save ar.lc. */
	{ .mmi
#ifndef XDOUBLE
	adds	CO1 = 16, r12
	adds	CO2 = 24, r12
#else
	adds	CO1 = 32, r12
	adds	CO2 = 40, r12
#endif
	.save	ar.lc, ARLC
	mov	ARLC = ar.lc
	}
	/* p6  = (N <= 0)     -> nothing to do, return.
	   p15 = !(BETA == 0) -> selects the scaling path below. */
	{ .mfb
	cmp.ge	p6, p0 = 0, N
	fcmp.eq	p0, p15 = BETA, f0
	(p6) br.ret.sptk.many b0
	}
	;;
	.body
	{ .mmi
	ld8	C = [CO1], 8
	ld8	LDC = [CO2]
	mov	PR = pr
	}
	{ .mmi
	mov	J = N
	shr	I = M, 4
	}
	;;
	/* LDC <<= BASE_SHIFT (presumably elements -> bytes); I = M / 16 - 1. */
	{ .mmb
	shladd LDC = LDC, BASE_SHIFT, r0
	adds	I = -1, I
	(p15) br.cond.dpnt .L100		// if (beta != 0) goto L100
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* BETA == 0: fill the block with f0 (reads as +0.0 on IA-64).        */
/* ------------------------------------------------------------------ */

/* Per-column setup for the zero-fill path. */
.L60:
	{ .mmi
	mov	CO1 = C
	mov	CO3 = C
	add	CO2 = 4 * SIZE, C
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	C = C, LDC
	tbit.nz	p12, p0 = M, 3
	}
	;;
	{ .mmi
	and	I_AND_15 = 15, M
	mov	ar.lc = I
	}
	/* I < 0 means M < 16: skip the unrolled loop. */
	{ .mib
	cmp.gt	p8, p0 = 0, I
	(p8) br.cond.dpnt .L80
	}
	;;
	.align 32

/* Zero 16 elements per iteration.  CO1 covers elements 0-3 and 8-11,
   CO2 covers 4-7 and 12-15; the "5 * SIZE" post-increments hop over the
   four elements owned by the other pointer. */
.L70:
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	{ .mmi
	lfetch.excl.nt1	[PRE1]
	nop.m 0
	adds	PRE1 = 16 * SIZE, PRE1
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	adds	CO3 = 16 * SIZE, CO3
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmb
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	br.cloop.sptk.few .L70
	}
	;;
	.align 32

/* Tail of the zero-fill path: p12/p13/p14/p15 = bits 3/2/1/0 of M select
   8/4/2/1 further elements.  The 8- and 4-element pieces go through
   CO1/CO2, the 2- and 1-element pieces through CO3, which is first
   advanced past whatever the 8- and 4-element pieces cover. */
.L80:
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p13, p0 = M, 2
	}
	/* One column consumed; leave early when there is no tail at all. */
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_15
	adds	J = -1, J
	(p9) br.cond.dptk .L99
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p14, p0 = M, 1
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 5 * SIZE
	(p12) STFD [CO2] = f0
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	tbit.nz	p15, p0 = M, 0
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p15) STFD [CO3] = f0
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0
	}
	;;
	.align 32

/* Next column, or restore ar.lc and return.  This path writes only
   p6, p8, p9 and p12-p15, and returns without reloading pr from PR. */
.L99:
	{ .mib
	cmp.lt	p6, p0 = 0, J
	mov ar.lc = ARLC
	}
	{ .mbb
	(p6) br.cond.dptk .L60
	br.ret.sptk.many b0
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* BETA != 0: C = BETA * C, using a software-pipelined loop.          */
/* ------------------------------------------------------------------ */

/* Per-column setup: clear the rotating predicates, point the read
   (CO*) and write (DO*) pointers at the column, arm the pipeline with
   an epilogue count of 6 and p16 = 1. */
.L100:
	{ .mmi
	mov	CO1 = C
	mov	CO3 = C
	mov	pr.rot = 0
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	CO2 = 4 * SIZE, C
	mov	DO1 = C
	}
	;;
	{ .mmi
	mov	ar.ec = 6
	}
	{ .mmi
	adds	DO2 = 4 * SIZE, C
	mov	DO3 = C
	add	C = C, LDC
	}
	;;
	{ .mmi
	and	I_AND_15 = 15, M
	cmp.eq	p16, p0 = r0, r0
	mov	ar.lc = I
	}
	/* I < 0 means M < 16: skip the pipelined loop. */
	{ .mib
	cmp.gt	p8, p0 = 0, I
	tbit.nz	p12, p0 = M, 3
	(p8) br.cond.dpnt .L180
	}
	;;
	.align 32

/* Pipelined body, 16 elements per iteration, three active stages:
     p16  load 16 elements into the rotating registers f32, f38, ... f122
     p20  multiply elements 0-7 (the p16 loads, seen 4 rotations later
          as f36, f42, ... f78) into f6, f7, f10-f15
     p21  store elements 0-7, multiply elements 8-15 (seen 5 rotations
          later as f85, f91, ... f127) into the same f6, f7, f10-f15,
          then store them
   f6, f7 and f10-f15 are reused for both halves: each store sits in the
   same instruction group as the FMPY that overwrites its source, so the
   store reads the previous value. */
.L170:
	{ .mmf
	(p21) STFD [DO1] = f6, 1 * SIZE
	(p21) STFD [DO2] = f7, 1 * SIZE
	(p21) FMPY f6  = BETA, f85
	}
	{ .mmf
	(p16) lfetch.excl.nt1	[PRE1]
	(p16) adds CO3 = 16 * SIZE, CO3
	(p21) FMPY f7  = BETA, f91
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f10, 1 * SIZE
	(p21) STFD [DO2] = f11, 1 * SIZE
	(p21) FMPY f10 = BETA, f97
	}
	{ .mmf
	(p16) LDFD f32 = [CO1], 1 * SIZE
	(p16) LDFD f38 = [CO2], 1 * SIZE
	(p21) FMPY f11 = BETA, f103
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f12, 1 * SIZE
	(p21) STFD [DO2] = f13, 1 * SIZE
	(p21) FMPY f12 = BETA, f109
	}
	{ .mmf
	(p16) LDFD f44 = [CO1], 1 * SIZE
	(p16) LDFD f50 = [CO2], 1 * SIZE
	(p21) FMPY f13 = BETA, f115
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f14, 5 * SIZE
	(p21) STFD [DO2] = f15, 5 * SIZE
	(p21) FMPY f14 = BETA, f121
	}
	{ .mmf
	(p16) LDFD f56 = [CO1], 1 * SIZE
	(p16) LDFD f62 = [CO2], 1 * SIZE
	(p21) FMPY f15 = BETA, f127
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f6, 1 * SIZE
	(p21) STFD [DO2] = f7, 1 * SIZE
	(p20) FMPY f6  = BETA, f36
	}
	{ .mmf
	(p16) LDFD f68 = [CO1], 5 * SIZE
	(p16) LDFD f74 = [CO2], 5 * SIZE
	(p20) FMPY f7  = BETA, f42
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f10, 1 * SIZE
	(p21) STFD [DO2] = f11, 1 * SIZE
	(p20) FMPY f10 = BETA, f48
	}
	{ .mmf
	(p16) LDFD f80 = [CO1], 1 * SIZE
	(p16) LDFD f86 = [CO2], 1 * SIZE
	(p20) FMPY f11 = BETA, f54
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f12, 1 * SIZE
	(p21) STFD [DO2] = f13, 1 * SIZE
	(p20) FMPY f12 = BETA, f60
	}
	{ .mmf
	(p16) LDFD f92 = [CO1], 1 * SIZE
	(p16) LDFD f98 = [CO2], 1 * SIZE
	(p20) FMPY f13 = BETA, f66
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f14, 5 * SIZE
	(p21) STFD [DO2] = f15, 5 * SIZE
	(p20) FMPY f14 = BETA, f72
	}
	{ .mmf
	(p16) LDFD f104 = [CO1], 1 * SIZE
	(p16) LDFD f110 = [CO2], 1 * SIZE
	(p20) FMPY f15 = BETA, f78
	}
	;;
	{ .mmi
	(p16) LDFD f116 = [CO1], 5 * SIZE
	(p16) LDFD f122 = [CO2], 5 * SIZE
	adds	PRE1 = 16 * SIZE, PRE1
	}
	{ .mmb
	(p16) adds DO3 = 16 * SIZE, DO3
	nop.m 0
	br.ctop.sptk.few .L170
	}
	;;
	.align 32

/* Tail of the scaling path: same 8/4/2/1 selection by p12/p13/p14/p15 as
   in the zero-fill tail.  All loads are issued first (CO1/CO2/CO3), then
   the products, then the stores through DO1/DO2/DO3. */
.L180:
	{ .mmi
	(p12) LDFD f32 = [CO1], 1 * SIZE
	(p12) LDFD f36 = [CO2], 1 * SIZE
	tbit.nz	p13, p0 = M, 2
	}
	/* One column consumed; leave early when there is no tail at all. */
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_15
	adds	J = -1, J
	(p9) br.cond.dptk .L199
	}
	;;
	{ .mmi
	(p12) LDFD f33 = [CO1], 1 * SIZE
	(p12) LDFD f37 = [CO2], 1 * SIZE
	tbit.nz	p14, p0 = M, 1
	}
	;;
	{ .mmi
	(p12) LDFD f34 = [CO1], 1 * SIZE
	(p12) LDFD f38 = [CO2], 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) LDFD f35 = [CO1], 5 * SIZE
	(p12) LDFD f39 = [CO2]
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	{ .mmi
	(p13) LDFD f40 = [CO1], 1 * SIZE
	(p14) LDFD f44 = [CO3], 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFD f41 = [CO1], 1 * SIZE
	(p14) LDFD f45 = [CO3], 1 * SIZE
	tbit.nz	p15, p0 = M, 0
	}
	;;
	{ .mmf
	(p13) LDFD f42 = [CO1], 1 * SIZE
	(p15) LDFD f46 = [CO3]
	(p12) FMPY f32 = BETA, f32
	}
	{ .mmf
	(p12) FMPY f36 = BETA, f36
	}
	;;
	{ .mmf
	(p13) LDFD f43 = [CO1]
	(p12) FMPY f33 = BETA, f33
	}
	{ .mmf
	(p12) FMPY f37 = BETA, f37
	}
	;;
	(p12) FMPY f34 = BETA, f34
	(p12) FMPY f38 = BETA, f38
	(p12) FMPY f35 = BETA, f35
	(p12) FMPY f39 = BETA, f39
	;;
	{ .mmf
	(p12) STFD [DO1] = f32, 1 * SIZE
	(p12) STFD [DO2] = f36, 1 * SIZE
	(p13) FMPY f40 = BETA, f40
	}
	{ .mmf
	(p12) adds DO3 = 8 * SIZE, DO3
	(p14) FMPY f44 = BETA, f44
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f33, 1 * SIZE
	(p12) STFD [DO2] = f37, 1 * SIZE
	(p13) FMPY f41 = BETA, f41
	}
	{ .mmf
	(p13) adds DO3 = 4 * SIZE, DO3
	(p14) FMPY f45 = BETA, f45
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f34, 1 * SIZE
	(p12) STFD [DO2] = f38, 1 * SIZE
	(p13) FMPY f42 = BETA, f42
	}
	{ .mmf
	(p15) FMPY f46 = BETA, f46
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f35, 5 * SIZE
	(p12) STFD [DO2] = f39
	(p13) FMPY f43 = BETA, f43
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f40, 1 * SIZE
	(p14) STFD [DO3] = f44, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f41, 1 * SIZE
	(p14) STFD [DO3] = f45, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f42, 1 * SIZE
	(p15) STFD [DO3] = f46
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f43
	}
	;;
	.align 32

/* Next column, or restore ar.lc and the predicates (this path changed
   the rotating predicates via pr.rot and br.ctop) and return. */
.L199:
	{ .mib
	cmp.lt	p6, p0 = 0, J
	mov ar.lc = ARLC
	(p6) br.cond.dptk .L100
	}
	;;
	{ .mib
	mov	pr = PR, -1
	br.ret.sptk.many b0
	}
	;;
	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -