⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm_beta.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * zgemm_beta (IA-64, GNU as syntax, run through the C preprocessor)
 *
 * Scales an M x N matrix of complex values in place by the complex
 * scalar beta = BETA_R + i*BETA_I, one column per outer pass.  Elements
 * are stored as consecutive (re, im) pairs of SIZE bytes each.
 *
 *   beta == 0 : every element is overwritten with f0 (+0.0); C is not read.
 *   otherwise : re1 = BETA_R*re - BETA_I*im
 *               im1 = BETA_R*im + BETA_I*re
 *
 * Inputs as consumed below:
 *   r32 (M), r33 (N)       row and column counts; returns at once if
 *                          either is <= 0
 *   f8 (BETA_R), f9 (BETA_I)
 *   [r12 + 24]             pointer to C      (r12 is the stack pointer)
 *   [r12 + 32]             LDC; shifted left by ZBASE_SHIFT and used as
 *                          the byte stride between columns (presumably
 *                          LDC arrives in complex elements -- the shift
 *                          amount comes from common.h)
 *
 * Each column is handled as M >> 3 passes of 8 complex elements (16
 * scalars), then a tail of 4 / 2 / 1 elements selected by bits 2 / 1 / 0
 * of M.
 *
 * State saved and restored:
 *   ar.lc -> ARLC, restored on both exit paths after it has been changed.
 *   pr    -> PR, restored on the scaling path, which uses the rotating
 *            predicates p16-p21.  The zero-fill path only writes
 *            predicates in p6-p15 (scratch under the IA-64 software
 *            conventions), so it returns without restoring pr.
 *
 * STFD / LDFD / FMPY / FMA / FNMA, SIZE, ZBASE_SHIFT, PROLOGUE, PROFCODE
 * and EPILOGUE are macros supplied by common.h.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distance, in units of SIZE, ahead of the start of each column. */
#define PREFETCHSIZE 74

/* Read / zero-fill cursors. */
#define CO1	r14
#define CO2	r15
#define CO3	r16
/* Write-back cursors (scaling path only). */
#define DO1	r17
#define DO2	r18
#define DO3	r19

/* (M >> 3) - 1: value loaded into ar.lc for the 8-element main loops. */
#define I	r22
/* M & 7: elements left over for the tail once the main loop is done. */
#define I_AND_7	r23
#define PRE1	r24

/* Saved predicate file and saved loop counter. */
#define PR	r30
#define ARLC	r31

#define M	r32
#define N	r33
#define C	r34
#define LDC	r35
/* Columns still to process. */
#define J	r36

#define BETA_R	f8
#define BETA_I	f9

	PROLOGUE
	.prologue
	PROFCODE

	/* Address the two stack-passed arguments, save ar.lc, and return
	   immediately when N <= 0.  p14 := (BETA_R != 0). */
	{ .mmi
	adds	CO1 = 24, r12
	adds	CO2 = 32, r12
	.save	ar.lc, ARLC
	mov	ARLC = ar.lc
	}
	{ .mfb
	cmp.ge	p6, p0 = 0, N
	fcmp.eq	p0, p14 = BETA_R, f0
	(p6) br.ret.sptk.many b0
	}
	;;
	.body
	/* Load C and LDC, save pr.  p15 := (BETA_I != 0). */
	{ .mmi
	ld8	C = [CO1], 8
	ld8	LDC = [CO2]
	mov	PR = pr
	}
	{ .mfi
	mov	J = N
	fcmp.eq	p0, p15 = BETA_I, f0
	shr	I = M, 3
	}
	;;
	/* Return when M <= 0; ar.lc and the rotating predicates are still
	   untouched at this point, so nothing needs restoring. */
	{ .mmb
	cmp.ge	p6, p0 = 0, M
	adds	I = -1, I
	(p6) br.ret.sptk.many b0
	}
	;;
	/* Column stride; take the scaling path if either part of beta is
	   non-zero, otherwise fall through to the zero-fill path. */
	{ .mbb
	shladd LDC = LDC, ZBASE_SHIFT, r0
	(p14) br.cond.dpnt .L100
	(p15) br.cond.dpnt .L100
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* beta == 0: zero-fill path, one column per pass through .L60.        */
/* ------------------------------------------------------------------ */
.L60:
	/* CO1 covers scalars 0-3 and 8-11 of each 16-scalar group, CO2
	   covers 4-7 and 12-15, CO3 tracks the start of the unprocessed
	   part for the 1-element tail.  C is advanced to the next column. */
	{ .mmi
	mov	CO1 = C
	mov	CO3 = C
	add	CO2 = 4 * SIZE, C
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	C = C, LDC
	tbit.nz	p12, p0 = M, 2
	}
	;;
	/* The main loop consumes 8 elements per pass (I = (M >> 3) - 1), so
	   the leftover count is M & 7.  I < 0 means M < 8: skip the loop. */
	{ .mmi
	and	I_AND_7 = 7, M
	mov	ar.lc = I
	}
	{ .mib
	cmp.gt	p8, p0 = 0, I
	(p8) br.cond.dpnt .L80
	}
	;;
	.align 32

	/* Main zero-fill loop: 16 stores of f0 per pass. */
.L70:
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	{ .mmi
	lfetch.excl.nt1	[PRE1], 16 * SIZE
	nop.m 0
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	adds	CO3 = 16 * SIZE, CO3
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	/* Post-increment by 5 hops each cursor over the 4 scalars the
	   other cursor has just written. */
	{ .mmi
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmb
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	br.cloop.sptk.few .L70
	}
	;;
	.align 32

	/* Zero-fill tail.  p12 / p13 / p14 select 4 / 2 / 1 remaining
	   elements.  When M & 7 == 0 all three are false, so the stores in
	   the first bundle are predicated off and the branch skips the rest. */
.L80:
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p13, p0 = M, 1
	}
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_7
	adds	J = -1, J
	(p9) br.cond.dptk .L99
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p14, p0 = M, 0
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 5 * SIZE
	(p12) STFD [CO2] = f0
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	/* 2-element block through CO1, 1-element block through CO3. */
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0
	}
	;;
	.align 32

	/* Next column, or restore ar.lc and return. */
.L99:
	{ .mib
	cmp.lt	p6, p0 = 0, J
	mov ar.lc = ARLC
	}
	{ .mbb
	(p6) br.cond.dptk .L60
	br.ret.sptk.many b0
	}
	;;
	.align 32

/* ------------------------------------------------------------------ */
/* beta != 0: scaling path, one column per pass through .L100.         */
/* ------------------------------------------------------------------ */
.L100:
	/* COx are the read cursors and DOx the matching write cursors.
	   Clear the rotating predicates before priming the pipeline. */
	{ .mmi
	mov	CO1 = C
	mov	CO3 = C
	mov	pr.rot = 0
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	CO2 = 4 * SIZE, C
	mov	DO1 = C
	}
	;;
	/* Six pipeline stages, p16 .. p21. */
	{ .mmi
	mov	ar.ec = 6
	}
	{ .mmi
	adds	DO2 = 4 * SIZE, C
	mov	DO3 = C
	add	C = C, LDC
	}
	;;
	/* Leftover count is M & 7 (8 elements per main-loop pass); p16 := 1
	   starts the first stage.  I < 0 means M < 8: skip the loop. */
	{ .mmi
	and	I_AND_7 = 7, M
	cmp.eq	p16, p0 = r0, r0
	mov	ar.lc = I
	}
	{ .mib
	cmp.gt	p8, p0 = 0, I
	tbit.nz	p12, p0 = M, 2
	(p8) br.cond.dpnt .L180
	}
	;;
	.align 32

	/*
	 * Software-pipelined main loop: 8 complex elements per pass.
	 *
	 *   stage p16 : load 16 scalars through CO1 into f32, f38, ..., f122
	 *               and advance the tail cursors CO2 / CO3 / DO2 / DO3.
	 *   stage p20 : first half of the arithmetic.  After four rotations
	 *               a value loaded into f(32+6k) is named f(36+6k).
	 *   stage p21 : remaining arithmetic and the 16 stores through DO1.
	 *               After five rotations the value is named f(37+6k).
	 *
	 * f6, f7 and f10-f15 are not rotating registers; they hold the
	 * BETA_I * re products.  f10 and f11 are written in stage p20 and
	 * read in stage p21 of the following pass, so the instruction order
	 * here must not be changed.
	 */
.L170:
	{ .mmf
	(p21) STFD [DO1] = f37, 1 * SIZE
	(p16) lfetch.excl.nt1	[PRE1], 16 * SIZE
	(p21) FNMA f61  = BETA_I, f67, f61
	}
	{ .mmf
	(p16) LDFD f32  = [CO1], 1 * SIZE
	(p16) adds CO2 = 16 * SIZE, CO2
	(p21) FMPY f12  = BETA_I, f85
	}
	;;
	{ .mfi
	(p21) STFD [DO1] = f43, 1 * SIZE
	(p21) FMA  f67  = BETA_R, f67, f10
	(p16) adds CO3 = 16 * SIZE, CO3
	}
	{ .mfi
	(p16) LDFD f38  = [CO1], 1 * SIZE
	(p21) FMPY f85  = BETA_R, f85
	(p16) adds DO2 = 16 * SIZE, DO2
	}
	;;
	{ .mfi
	(p21) STFD [DO1] = f49, 1 * SIZE
	(p21) FNMA f73  = BETA_I, f79, f73
	(p16) adds DO3 = 16 * SIZE, DO3
	}
	{ .mfi
	(p16) LDFD f44  = [CO1], 1 * SIZE
	(p21) FMPY f13  = BETA_I, f97
	nop.i 0
	}
	;;
	(p21) STFD [DO1] = f55, 1 * SIZE
	(p21) FMA  f79  = BETA_R, f79, f11
	(p16) LDFD f50  = [CO1], 1 * SIZE
	(p21) FMPY f97  = BETA_R, f97
	;;
	(p21) STFD [DO1] = f61, 1 * SIZE
	(p21) FNMA f85  = BETA_I, f91,  f85
	(p16) LDFD f56  = [CO1], 1 * SIZE
	(p21) FMPY f14  = BETA_I, f109
	;;
	(p21) STFD [DO1] = f67, 1 * SIZE
	(p21) FMA  f91  = BETA_R, f91,  f12
	(p16) LDFD f62  = [CO1], 1 * SIZE
	(p21) FMPY f109 = BETA_R, f109
	;;
	(p21) STFD [DO1] = f73, 1 * SIZE
	(p21) FNMA f97  = BETA_I, f103, f97
	(p16) LDFD f68  = [CO1], 1 * SIZE
	(p21) FMPY f15  = BETA_I, f121
	;;
	(p21) STFD [DO1] = f79, 1 * SIZE
	(p21) FMA  f103 = BETA_R, f103, f13
	(p16) LDFD f74  = [CO1], 1 * SIZE
	(p21) FMPY f121 = BETA_R, f121
	;;
	(p21) STFD [DO1] = f85,  1 * SIZE
	(p21) FNMA f109 = BETA_I, f115, f109
	(p16) LDFD f80  = [CO1], 1 * SIZE
	(p20) FMPY f6  = BETA_I, f36
	;;
	(p21) STFD [DO1] = f91,  1 * SIZE
	(p21) FMA  f115 = BETA_R, f115, f14
	(p16) LDFD f86  = [CO1], 1 * SIZE
	(p20) FMPY f36 = BETA_R, f36
	;;
	(p21) STFD [DO1] = f97,  1 * SIZE
	(p21) FNMA f121 = BETA_I, f127, f121
	(p16) LDFD f92  = [CO1], 1 * SIZE
	(p20) FMPY f7  = BETA_I, f48
	;;
	(p21) STFD [DO1] = f103, 1 * SIZE
	(p21) FMA  f127 = BETA_R, f127, f15
	(p16) LDFD f98  = [CO1], 1 * SIZE
	(p20) FMPY f48 = BETA_R, f48
	;;
	(p21) STFD [DO1] = f109, 1 * SIZE
	(p20) FNMA f36  = BETA_I, f42, f36
	(p16) LDFD f104 = [CO1], 1 * SIZE
	(p20) FMPY f10 = BETA_I, f60
	;;
	(p21) STFD [DO1] = f115, 1 * SIZE
	(p20) FMA  f42  = BETA_R, f42, f6
	(p16) LDFD f110 = [CO1], 1 * SIZE
	(p20) FMPY f60 = BETA_R, f60
	;;
	(p21) STFD [DO1] = f121, 1 * SIZE
	(p20) FNMA f48  = BETA_I, f54, f48
	(p16) LDFD f116 = [CO1], 1 * SIZE
	(p20) FMPY f11 = BETA_I, f72
	;;
	(p21) STFD [DO1] = f127, 1 * SIZE
	(p20) FMA  f54  = BETA_R, f54, f7
	(p16) LDFD f122 = [CO1], 1 * SIZE
	(p20) FMPY f72 = BETA_R, f72
	br.ctop.sptk.few .L170
	;;
	.align 32

	/* Scaling tail: load the 4 / 2 / 1 leftover elements (p12 / p13 /
	   p14), scale them, store them back.  When M & 7 == 0 all three
	   predicates are false, so the loads in the first bundle are
	   predicated off and the branch skips the rest. */
.L180:
	{ .mmi
	(p12) LDFD f32 = [CO1], 1 * SIZE
	(p12) LDFD f36 = [CO2], 1 * SIZE
	tbit.nz	p13, p0 = M, 1
	}
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_7
	adds	J = -1, J
	(p9) br.cond.dptk .L199
	}
	;;
	{ .mmi
	(p12) LDFD f33 = [CO1], 1 * SIZE
	(p12) LDFD f37 = [CO2], 1 * SIZE
	tbit.nz	p14, p0 = M, 0
	}
	;;
	{ .mmi
	(p12) LDFD f34 = [CO1], 1 * SIZE
	(p12) LDFD f38 = [CO2], 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) LDFD f35 = [CO1], 5 * SIZE
	(p12) LDFD f39 = [CO2]
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	{ .mmi
	(p13) LDFD f40 = [CO1], 1 * SIZE
	(p14) LDFD f44 = [CO3], 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFD f41 = [CO1], 1 * SIZE
	(p14) LDFD f45 = [CO3], 1 * SIZE
	}
	;;
	{ .mmf
	(p13) LDFD f42 = [CO1], 1 * SIZE
	}
	;;
	{ .mmf
	(p13) LDFD f43 = [CO1]
	}
	;;
	/* 4-element block.  First group: t = BETA_I*re and re = BETA_R*re.
	   Each even register is read for t and overwritten in the same
	   instruction group, so both multiplies use the loaded value. */
	(p12) FMPY f80 = BETA_I, f32
	(p12) FMPY f32 = BETA_R, f32
	(p12) FMPY f81 = BETA_I, f34
	(p12) FMPY f34 = BETA_R, f34
	(p12) FMPY f82 = BETA_I, f36
	(p12) FMPY f36 = BETA_R, f36
	(p12) FMPY f83 = BETA_I, f38
	(p12) FMPY f38 = BETA_R, f38
	;;
	/* Second group: re = re - BETA_I*im and im = BETA_R*im + t. */
	(p12) FNMA f32 = BETA_I, f33, f32
	(p12) FMA  f33 = BETA_R, f33, f80
	(p12) FNMA f34 = BETA_I, f35, f34
	(p12) FMA  f35 = BETA_R, f35, f81
	(p12) FNMA f36 = BETA_I, f37, f36
	(p12) FMA  f37 = BETA_R, f37, f82
	(p12) FNMA f38 = BETA_I, f39, f38
	(p12) FMA  f39 = BETA_R, f39, f83
	;;
	/* Same two steps for the 2-element (p13) and 1-element (p14) blocks. */
	(p13) FMPY f84 = BETA_I, f40
	(p13) FMPY f40 = BETA_R, f40
	(p13) FMPY f85 = BETA_I, f42
	(p13) FMPY f42 = BETA_R, f42
	(p14) FMPY f86 = BETA_I, f44
	(p14) FMPY f44 = BETA_R, f44
	;;
	(p13) FNMA f40 = BETA_I, f41, f40
	(p13) FMA  f41 = BETA_R, f41, f84
	(p13) FNMA f42 = BETA_I, f43, f42
	(p13) FMA  f43 = BETA_R, f43, f85
	(p14) FNMA f44 = BETA_I, f45, f44
	(p14) FMA  f45 = BETA_R, f45, f86
	;;
	/* Write back through DO1 / DO2 / DO3, mirroring the load cursors. */
	{ .mmf
	(p12) STFD [DO1] = f32, 1 * SIZE
	(p12) STFD [DO2] = f36, 1 * SIZE
	}
	{ .mmf
	(p12) adds DO3 = 8 * SIZE, DO3
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f33, 1 * SIZE
	(p12) STFD [DO2] = f37, 1 * SIZE
	}
	{ .mmf
	(p13) adds DO3 = 4 * SIZE, DO3
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f34, 1 * SIZE
	(p12) STFD [DO2] = f38, 1 * SIZE
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f35, 5 * SIZE
	(p12) STFD [DO2] = f39
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f40, 1 * SIZE
	(p14) STFD [DO3] = f44, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f41, 1 * SIZE
	(p14) STFD [DO3] = f45, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f42, 1 * SIZE
	;;
	(p13) STFD [DO1] = f43
	}
	;;
	.align 32

	/* Next column, or restore ar.lc and the predicate file and return. */
.L199:
	{ .mib
	cmp.lt	p6, p0 = 0, J
	mov ar.lc = ARLC
	(p6) br.cond.dptk .L100
	}
	;;
	{ .mib
	mov	pr = PR, -1
	br.ret.sptk.many b0
	}
	;;
	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -