⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm_tcopy.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * IA-64 copy kernel: repacks the source operand A into the buffer B
 * as contiguous tiles.
 *
 * Arguments, as named by the defines below (presumably the stacked input
 * registers in0-in4; TODO confirm against PROLOGUE in common.h):
 *   M   (r32)  number of LDA-strided vectors of A to process
 *   N   (r33)  number of element pairs per vector; one pair is 2 * SIZE
 *              bytes (presumably one complex value - TODO confirm)
 *   A   (r34)  source pointer
 *   LDA (r35)  source stride; scaled by ZBASE_SHIFT on entry
 *   B   (r36)  destination pointer
 *
 * Structure:
 *   .L11/.L12/.L15  four vectors per pass, M >> 2 passes
 *   .L20/.L22/.L25  two vectors, taken when bit 1 of M is set
 *   .L30/.L32/.L35  one vector, taken when bit 0 of M is set
 * In each part a software-pipelined loop handles N >> 2 groups of four
 * pairs; every group is written as one contiguous tile at B1, and B1
 * then steps by LDB.  The tail of each part writes two pairs per vector
 * to BO2 when bit 1 of N is set (p10) and one pair per vector to BO3
 * when bit 0 of N is set (p11).
 *
 * Destination regions computed in the prologue:
 *   BO2 = B + ((M * (N & -4)) << ZBASE_SHIFT)
 *   BO3 = B + ((M * (N & -2)) << ZBASE_SHIFT)
 *
 * ar.lc and the predicate registers are saved in ARLC / PR on entry and
 * restored at .L999.  SIZE, BASE_SHIFT, ZBASE_SHIFT, LDF8, STF8_NTA,
 * PROLOGUE, PROFCODE and EPILOGUE are not defined in this file; they
 * presumably come from common.h.
 *
 * NOTE(review): COUNT (r26) is tested in .L12 and .L22 but is only
 * cleared at the end of .L15 and .L25, so the first time either loop is
 * entered it holds whatever r26 contained on entry.  In the code shown
 * here COUNT only gates the PREA adjustment (prefetch address), not the
 * data that is copied - confirm this is intentional.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distances, in units of SIZE, used when PREA / PREB are set up. */
#define PREFETCHSIZE   24
#define WPREFETCHSIZE  48

/* Load / store mnemonics used for every element moved (from common.h). */
#define LD	LDF8
#define ST	STF8_NTA

/* Prefetch pointers: PREA walks the source, PREB the destination. */
#define PREA	r2
#define PREB	r3

/* I: inner trip count (N >> 2, minus one for ar.lc).  J: passes of four vectors left. */
#define I	r14
#define J	r15

/* A1-A4: the source vectors of the current pass; A5-A8 = A1-A4 + 4 * SIZE. */
#define A1	r16
#define A2	r17
#define A3	r18
#define A4	r19
#define A5	r20
#define A6	r21
#define A7	r22
#define A8	r23
/* B1 / B2: destination cursors, kept 4 * SIZE apart. */
#define B1	r24
#define B2	r25

/* COUNT: iteration counter whose bit 0 gates the PREA adjustment.  TEMP: scratch. */
#define COUNT	r26
#define TEMP	r27

/* BO2 / BO3: destination cursors for the N & 2 and N & 1 tails.  LDB: tile stride in B. */
#define BO2	r28
#define BO3	r29
#define LDB	r8

/* Saved copies of ar.lc and pr. */
#define ARLC	r30
#define PR	r31

/* Arguments. */
#define M	r32
#define N	r33
#define A	r34
#define LDA	r35
#define B	r36

	PROLOGUE
	.prologue
	PROFCODE

	.body
	/* Save ar.lc and pr; start computing M * (N & -4) and M * (N & -2)
	   through the FP multiplier.  r8 is only a temporary here: it is
	   overwritten as LDB two groups later, after f33 has been set. */
	{ .mmi
	setf.sig f32 = M
	and	r8  = -4, N
	mov	ARLC  = ar.lc
	}
	;;
	{ .mmi
	setf.sig f33  = r8
	and	r9  = -2, N
	mov	PR = pr
	}
	;;
	/* LDA = LDA << ZBASE_SHIFT;  LDB = M << (BASE_SHIFT + 3). */
	{ .mmi
	setf.sig f34  = r9
	shladd	LDA = LDA, ZBASE_SHIFT, r0
	shl	LDB = M, BASE_SHIFT + 3
	}
	;;
	/* f33 = M * (N & -4), f34 = M * (N & -2);  J = M >> 2. */
	{ .mfi
	nop	 __LINE__
	xmpy.l	f33  = f32, f33
	shr	J = M, 2
	}
	{ .mfi
	nop	 __LINE__
	xmpy.l	f34  = f32, f34
	nop	 __LINE__
	}
	;;
	{ .mmb
	getf.sig BO2 = f33
	getf.sig BO3 = f34
	nop	 __LINE__
	}
	;;
	/* BO2 / BO3 become pointers into B.  p10 = bit 1 of N, p11 = bit 0
	   of N; both stay live for the whole routine. */
	{ .mmi
	shladd	BO2 = BO2, ZBASE_SHIFT, B
	shladd	BO3 = BO3, ZBASE_SHIFT, B
	tbit.nz p10, p0 =N, 1
	}
	/* No full pass of four vectors: go straight to the two-vector part. */
	{ .mib
	cmp.eq	p6, p0 = 0, J
	tbit.nz p11, p0 =N, 0
	(p6)	br.cond.dpnt .L20
	}
	;;
	.align 32

/* ---- Four vectors per pass ------------------------------------------ */
.L11:
	/* A1 = A, A2 = A + LDA, A3 = A + 2 * LDA, A4 = A + 3 * LDA.
	   Clear the rotating predicates and set I = N >> 2. */
	{ .mmi
	mov	A1 = A
	add	A2 = A, LDA
	mov	pr.rot = 0
	}
	{ .mmi
	shladd A3 = LDA, 1, A
	mov    B1 = B
	shr    I  = N, 2
	}
	;;
	/* p16 = 1 starts the first pipeline stage; ar.ec = 3 stages. */
	{ .mmi
	shladd	A4 = LDA, 1, A2
	cmp.eq	p16,p0 = r0, r0
	mov	ar.ec = 3
	}
	/* p6 = (I == 0) is taken from I before it is decremented for ar.lc. */
	{ .mmi
	cmp.eq	p6,p0 = 0,I
	adds	I =-1, I
	adds	J =-1, J
	}
	;;
	/* Advance A by four vectors for the next pass; set up the second
	   pointer of each vector and the source prefetch pointer. */
	{ .mmi
	shladd	A = LDA, 2, A
	adds	A5 = 4 * SIZE, A1
	adds	A6 = 4 * SIZE, A2
	}
	{ .mmi
	adds	A7 = 4 * SIZE, A3
	adds	A8 = 4 * SIZE, A4
	adds	PREA = PREFETCHSIZE * SIZE,A1
	}
	;;
	{ .mmb
	adds   B2 = 4 * SIZE, B
	adds	PREB = WPREFETCHSIZE * SIZE, B
	nop	__LINE__
	}
	/* B moves on by one 32-element tile for the next pass.  Skip the
	   pipelined loop when N >> 2 is zero. */
	{ .mib
	adds   B  = 32 * SIZE, B
	mov	ar.lc = I
	(p6) br.cond.dpnt.few .L15
	}
	;;

/* Pipelined loop: stage p16 loads 8 elements from each of the four
   vectors, stage p18 stores the values loaded two iterations earlier
   (register fN loaded under p16 is read as fN+2 under p18).  One
   iteration fills a 32-element tile: B1 writes offsets 0-3, 8-11, 16-19,
   24-27 and B2 writes 4-7, 12-15, 20-23, 28-31. */
.L12:
	{ .mmb
	(p16)	lfetch.nt1	[PREA], LDA
	(p16)	lfetch.excl.nt1	[PREB], LDB
	nop	__LINE__
	}
	{ .mmb
	nop	__LINE__
	nop	__LINE__
	nop	__LINE__
	}
	;;
	/* Vector 1 (A1 / A5) -> tile offsets 0-7. */
	{ .mmb
	(p18)	ST	[B1] = f34, SIZE
	(p18)	ST	[B2] = f37, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f32 = [A1], SIZE
	(p16)	LD	f35 = [A5], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f40, SIZE
	(p18)	ST	[B2] = f43, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f38 = [A1], SIZE
	(p16)	LD	f41 = [A5], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f46,  SIZE
	(p18)	ST	[B2] = f49,  SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f44 = [A1], SIZE
	(p16)	LD	f47 = [A5], SIZE
	nop	__LINE__
	}
	;;
	/* p7 = bit 0 of COUNT; used further down to adjust PREA. */
	{ .mmi
	(p18)	ST	[B1] = f52,  5 * SIZE
	(p18)	ST	[B2] = f55,  5 * SIZE
	tbit.z	p0,p7 = COUNT,0
	}
	{ .mmb
	(p16)	LD	f50 = [A1], 5 * SIZE
	(p16)	LD	f53 = [A5], 5 * SIZE
	nop	__LINE__
	}
	;;
	/* Vector 2 (A2 / A6) -> tile offsets 8-15. */
	{ .mmb
	(p18)	ST	[B1] = f58, SIZE
	(p18)	ST	[B2] = f61, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f56 = [A2], SIZE
	(p16)	LD	f59 = [A6], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f64, SIZE
	(p18)	ST	[B2] = f67, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f62 = [A2], SIZE
	(p16)	LD	f65 = [A6], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f70, SIZE
	(p18)	ST	[B2] = f73, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f68 = [A2], SIZE
	(p16)	LD	f71 = [A6], SIZE
	nop	__LINE__
	}
	;;
	/* TEMP = 4 * LDA here; 16 * SIZE is subtracted two groups later. */
	{ .mmi
	(p18)	ST	[B1]  = f76, 5 * SIZE
	(p18)	ST	[B2]  = f79, 5 * SIZE
	shladd	TEMP = LDA, 2, r0
	}
	{ .mmb
	(p16)	LD	f74 = [A2], 5 * SIZE
	(p16)	LD	f77 = [A6], 5 * SIZE
	nop	__LINE__
	}
	;;
	/* Vector 3 (A3 / A7) -> tile offsets 16-23; its loads trail the
	   stores by one group because of the second prefetch pair. */
	{ .mmb
	(p18)	ST	[B1] = f82, SIZE
	(p18)	ST	[B2] = f85, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	lfetch.nt1	[PREA], LDA
	(p16)	lfetch.excl.nt1	[PREB], LDB
	nop	__LINE__
	}
	;;
	{ .mmi
	(p18)	ST	[B1] = f88, SIZE
	(p18)	ST	[B2] = f91, SIZE
	adds	TEMP = -16 * SIZE, TEMP
	}
	{ .mmb
	(p16)	LD	f80 = [A3], SIZE
	(p16)	LD	f83 = [A7], SIZE
	nop	__LINE__
	}
	;;
	/* When bit 0 of COUNT is set, pull PREA back by 4 * LDA - 16 * SIZE. */
	{ .mmi
	(p18)	ST	[B1] = f94, SIZE
	(p18)	ST	[B2] = f97, SIZE
	(p7)	sub	PREA = PREA, TEMP
	}
	{ .mmb
	(p16)	LD	f86 = [A3], SIZE
	(p16)	LD	f89 = [A7], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f100, 5 * SIZE
	(p18)	ST	[B2] = f103, 5 * SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f92 = [A3], SIZE
	(p16)	LD	f95 = [A7], SIZE
	nop	__LINE__
	}
	;;
	/* Vector 4 (A4 / A8) -> tile offsets 24-31. */
	{ .mmb
	(p18)	ST	[B1] = f106, SIZE
	(p18)	ST	[B2] = f109, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f98  = [A3], 5 * SIZE
	(p16)	LD	f101 = [A7], 5 * SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f112, SIZE
	(p18)	ST	[B2] = f115, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f104 = [A4], SIZE
	(p16)	LD	f107 = [A8], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f118, SIZE
	(p18)	ST	[B2] = f121, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f110 = [A4], SIZE
	(p16)	LD	f113 = [A8], SIZE
	nop	__LINE__
	}
	;;
	/* The -27 * SIZE post-increment returns B1 / B2 to the tile start. */
	{ .mmi
	(p18)	ST	[B1] = f124, -27 * SIZE
	(p18)	ST	[B2] = f127, -27 * SIZE
	(p16)	adds	COUNT =  1, COUNT
	}
	{ .mmb
	(p16)	LD	f116 = [A4], SIZE
	(p16)	LD	f119 = [A8], SIZE
	nop	__LINE__
	}
	;;
	/* Step both destination cursors to the next tile (+ LDB) and loop. */
	{ .mmb
	(p18) add	B1 = B1, LDB
	(p18) add	B2 = B2, LDB
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f122 = [A4], 5 * SIZE
	(p16)	LD	f125 = [A8], 5 * SIZE
	br.ctop.sptk.few .L12
	}
	;;
	.align 32

/* Tail of the four-vector pass.  p10: four more elements per vector go
   to BO2 (16 in total).  p11: two more elements per vector go to BO3
   (8 in total).  All loads are issued before the stores. */
.L15:
	{ .mmb
	(p10)	LD	f32 = [A1], SIZE
	(p10)	LD	f40 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f33 = [A1], SIZE
	(p10)	LD	f41 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f34 = [A1], SIZE
	(p10)	LD	f42 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f35 = [A1], SIZE
	(p10)	LD	f43 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f50 = [A3], SIZE
	(p10)	LD	f60 = [A4], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f51 = [A3], SIZE
	(p10)	LD	f61 = [A4], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f52 = [A3], SIZE
	(p10)	LD	f62 = [A4], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f53 = [A3], SIZE
	(p10)	LD	f63 = [A4], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p11)	LD	f36 = [A1], SIZE
	(p11)	LD	f44 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p11)	LD	f37 = [A1]
	(p11)	LD	f45 = [A2]
	nop	__LINE__
	}
	;;
	{ .mmb
	(p11)	LD	f54 = [A3], SIZE
	(p11)	LD	f64 = [A4], SIZE
	nop	__LINE__
	}
	;;
	/* B2 is reused as the second cursor of the BO2 tile. */
	{ .mmi
	(p11)	LD	f55 = [A3]
	(p11)	LD	f65 = [A4]
	adds	B2 = 4 * SIZE, BO2
	}
	;;
	/* BO2 tile: offsets 0-3 from A1, 4-7 from A2, 8-11 from A3,
	   12-15 from A4.  BO2 ends up advanced by 16 * SIZE. */
	{ .mmb
	(p10)	ST	[BO2] = f32, SIZE
	(p10)	ST	[B2]  = f40, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f33, SIZE
	(p10)	ST	[B2]  = f41, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f34, SIZE
	(p10)	ST	[B2]  = f42, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f35, 5 * SIZE
	(p10)	ST	[B2]  = f43, 5 * SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f50, SIZE
	(p10)	ST	[B2]  = f60, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f51, SIZE
	(p10)	ST	[B2]  = f61, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f52, SIZE
	(p10)	ST	[B2]  = f62, SIZE
	nop	__LINE__
	}
	;;
	/* The last B2 store has no post-increment; B2 is re-pointed at the
	   second half of the BO3 tile in the same group. */
	{ .mmi
	(p10)	ST	[BO2] = f53, 5 * SIZE
	(p10)	ST	[B2]  = f63
	adds	B2 = 4 * SIZE, BO3
	}
	;;
	/* BO3 tile: offsets 0-1 from A1, 2-3 from A2, 4-5 from A3,
	   6-7 from A4.  BO3 ends up advanced by 8 * SIZE. */
	{ .mmb
	(p11)	ST	[BO3] = f36, SIZE
	(p11)	ST	[B2] = f54, SIZE
	nop	__LINE__
	}
	;;
	/* Reset the prefetch parity counter for the next pass. */
	{ .mmi
	(p11)	ST	[BO3] = f37, SIZE
	(p11)	ST	[B2] = f55, SIZE
	mov	COUNT = r0
	}
	;;
	/* p6 = (J != 0): another pass of four vectors remains. */
	{ .mmi
	(p11)	ST	[BO3] = f44, SIZE
	(p11)	ST	[B2] = f64, SIZE
	cmp.eq	p0,p6 = 0,J
	}
	;;
	{ .mmb
	(p11)	ST	[BO3] = f45, 5 * SIZE
	(p11)	ST	[B2] = f65, 5 * SIZE
	(p6)	br.cond.dptk.few .L11
	}
	;;
	.align 32

/* ---- Two vectors (bit 1 of M) ---------------------------------------- */
.L20:
	{ .mmi
	mov	A1 = A
	add	A2 = A, LDA
	mov	pr.rot = 0
	}
	/* p6 = (bit 1 of M is clear): nothing to do in this part. */
	{ .mmi
	mov    B1 = B
	adds	PREA = PREFETCHSIZE * SIZE,A
	tbit.z	p6, p0 = M, 1
	}
	;;
	{ .mmi
	cmp.eq	p16,p0 = r0, r0
	adds   B2 = 4 * SIZE, B
	mov	ar.ec = 3
	}
	{ .mib
	adds	PREB = WPREFETCHSIZE * SIZE, B
	shr    I  = N, 2
	(p6)	br.cond.dpnt .L30
	}
	;;
	/* p6 = (I == 0) is taken from I before it is decremented for ar.lc. */
	{ .mmi
	cmp.eq	p6, p0 = 0, I
	adds	I =-1, I
	nop	__LINE__
	}
	/* Advance A by two vectors for the one-vector part that follows. */
	{ .mmi
	shladd	A = LDA, 1, A
	adds	A5 = 4 * SIZE, A1
	adds	A6 = 4 * SIZE, A2
	}
	;;
	{ .mmb
	nop	__LINE__
	nop	__LINE__
	nop	__LINE__
	}
	/* B moves on by one 16-element tile. */
	{ .mib
	adds   B  = 16 * SIZE, B
	mov	ar.lc = I
	(p6) br.cond.dpnt.few .L25
	}
	;;

/* Pipelined loop, same staging as .L12 but for two vectors: one
   iteration fills a 16-element tile (B1 writes offsets 0-3 and 8-11,
   B2 writes 4-7 and 12-15). */
.L22:
	/* TEMP = 2 * LDA here; 16 * SIZE is subtracted further down. */
	{ .mmi
	(p16)	lfetch.nt1	[PREA], LDA
	(p16)	lfetch.excl.nt1	[PREB], LDB
	shladd	TEMP = LDA, 1, r0
	}
	;;
	/* Vector 1 (A1 / A5) -> tile offsets 0-7. */
	{ .mmb
	(p18)	ST	[B1] = f34, SIZE
	(p18)	ST	[B2] = f37, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f32 = [A1], SIZE
	(p16)	LD	f35 = [A5], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f40, SIZE
	(p18)	ST	[B2] = f43, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f38 = [A1], SIZE
	(p16)	LD	f41 = [A5], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f46,  SIZE
	(p18)	ST	[B2] = f49,  SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f44 = [A1], SIZE
	(p16)	LD	f47 = [A5], SIZE
	nop	__LINE__
	}
	;;
	/* p7 = bit 0 of COUNT; used further down to adjust PREA. */
	{ .mmi
	(p18)	ST	[B1] = f52,  5 * SIZE
	(p18)	ST	[B2] = f55,  5 * SIZE
	tbit.z	p0,p7 = COUNT,0
	}
	{ .mmb
	(p16)	LD	f50 = [A1], 5 * SIZE
	(p16)	LD	f53 = [A5], 5 * SIZE
	nop	__LINE__
	}
	;;
	/* Vector 2 (A2 / A6) -> tile offsets 8-15. */
	{ .mmb
	(p18)	ST	[B1] = f58, SIZE
	(p18)	ST	[B2] = f61, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f56 = [A2], SIZE
	(p16)	LD	f59 = [A6], SIZE
	nop	__LINE__
	}
	;;
	{ .mmi
	(p18)	ST	[B1] = f64, SIZE
	(p18)	ST	[B2] = f67, SIZE
	adds	TEMP = -16 * SIZE, TEMP
	}
	{ .mmb
	(p16)	LD	f62 = [A2], SIZE
	(p16)	LD	f65 = [A6], SIZE
	nop	__LINE__
	}
	;;
	/* When bit 0 of COUNT is set, pull PREA back by 2 * LDA - 16 * SIZE. */
	{ .mmi
	(p18)	ST	[B1] = f70,  SIZE
	(p18)	ST	[B2] = f73,  SIZE
	(p7)	sub	PREA = PREA, TEMP
	}
	{ .mmb
	(p16)	LD	f68 = [A2], SIZE
	(p16)	LD	f71 = [A6], SIZE
	nop	__LINE__
	}
	;;
	/* The -11 * SIZE post-increment returns B1 / B2 to the tile start. */
	{ .mmi
	(p18)	ST	[B1] = f76, -11 * SIZE
	(p18)	ST	[B2] = f79, -11 * SIZE
	(p16)	adds	COUNT =  1, COUNT
	}
	{ .mmb
	(p16)	LD	f74 = [A2], 5 * SIZE
	(p16)	LD	f77 = [A6], 5 * SIZE
	nop	__LINE__
	}
	;;
	/* Step both destination cursors to the next tile (+ LDB) and loop. */
	{ .mmb
	(p18) add	B1 = B1, LDB
	(p18) add	B2 = B2, LDB
	br.ctop.sptk.few .L22
	}
	;;
	.align 32

/* Tail of the two-vector part.  p10: four more elements per vector go to
   BO2 (8 in total).  p11: two more per vector go to BO3 (4 in total).
   Execution then falls through to .L30. */
.L25:
	{ .mmb
	(p10)	LD	f32 = [A1], SIZE
	(p10)	LD	f40 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f33 = [A1], SIZE
	(p10)	LD	f41 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f34 = [A1], SIZE
	(p10)	LD	f42 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	LD	f35 = [A1], SIZE
	(p10)	LD	f43 = [A2], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p11)	LD	f36 = [A1], SIZE
	(p11)	LD	f44 = [A2], SIZE
	nop	__LINE__
	}
	;;
	/* B2 is reused as the second cursor of the BO2 tile. */
	{ .mmi
	(p11)	LD	f37 = [A1]
	(p11)	LD	f45 = [A2]
	adds	B2 = 4 * SIZE, BO2
	}
	;;
	/* BO2 tile: offsets 0-3 from A1, 4-7 from A2; BO2 advances by 8 * SIZE. */
	{ .mmb
	(p10)	ST	[BO2] = f32, SIZE
	(p10)	ST	[B2]  = f40, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f33, SIZE
	(p10)	ST	[B2]  = f41, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f34, SIZE
	(p10)	ST	[B2]  = f42, SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p10)	ST	[BO2] = f35, 5 * SIZE
	(p10)	ST	[B2]  = f43, 5 * SIZE
	nop	__LINE__
	}
	;;
	/* BO3 receives f36, f37 (from A1) then f44, f45 (from A2), one store
	   per group through the same pointer. */
	{ .mmi
	(p11)	ST	[BO3] = f36, SIZE
	;;
	(p11)	ST	[BO3] = f37, SIZE
	mov	COUNT = r0
	}
	;;
	{ .mmi
	(p11)	ST	[BO3] = f44, SIZE
	;;
	(p11)	ST	[BO3] = f45, SIZE
	nop	__LINE__
	}
	;;
	.align 32

/* ---- One vector (bit 0 of M) ----------------------------------------- */
.L30:
	{ .mmi
	mov	A1 = A
	adds	A5 = 4 * SIZE, A
	mov	pr.rot = 0
	}
	/* p6 = (bit 0 of M is clear): nothing left to copy. */
	{ .mmi
	mov    B1 = B
	adds   B2 = 4 * SIZE, B
	tbit.z	p6, p0 = M, 0
	}
	;;
	{ .mmb
	nop	__LINE__
	nop	__LINE__
	nop	__LINE__
	}
	{ .mib
	cmp.eq	p16,p0 = r0, r0
	shr    I  = N, 2
	(p6)	br.cond.dpnt .L999
	}
	;;
	/* p6 = (I == 0) is taken from I before it is decremented for ar.lc. */
	{ .mmi
	cmp.eq	p6, p0 = 0, I
	adds	I =-1, I
	mov	ar.ec = 3
	}
	;;
	{ .mib
	nop	__LINE__
	mov	ar.lc = I
	(p6) br.cond.dpnt.few .L35
	}
	;;
	.align 32

/* Pipelined loop for a single vector: one iteration moves 8 elements
   (B1 writes offsets 0-3, B2 writes 4-7).  No prefetch is issued here. */
.L32:
	{ .mmb
	(p18)	ST	[B1] = f34, SIZE
	(p18)	ST	[B2] = f37, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f32 = [A1], SIZE
	(p16)	LD	f35 = [A5], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f40, SIZE
	(p18)	ST	[B2] = f43, SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f38 = [A1], SIZE
	(p16)	LD	f41 = [A5], SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	(p18)	ST	[B1] = f46,  SIZE
	(p18)	ST	[B2] = f49,  SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f44 = [A1], SIZE
	(p16)	LD	f47 = [A5], SIZE
	nop	__LINE__
	}
	;;
	/* The -3 * SIZE post-increment returns B1 / B2 to their start. */
	{ .mmi
	(p18)	ST	[B1] = f52, -3 * SIZE
	(p18)	ST	[B2] = f55, -3 * SIZE
	nop	__LINE__
	}
	{ .mmb
	(p16)	LD	f50 = [A1], 5 * SIZE
	(p16)	LD	f53 = [A5], 5 * SIZE
	nop	__LINE__
	}
	;;
	{ .mmb
	nop	__LINE__
	nop	__LINE__
	nop	__LINE__
	}
	/* Step both destination cursors to the next tile (+ LDB) and loop. */
	{ .mmb
	(p18) add	B1 = B1, LDB
	(p18) add	B2 = B2, LDB
	br.ctop.sptk.few .L32
	}
	;;
	.align 32

/* Tail of the one-vector part.  p10: four more elements go to BO2.
   p11: two more elements go to BO3. */
.L35:
	{ .mmi
	(p10)	LD	f32 = [A1], SIZE
	;;
	(p10)	LD	f33 = [A1], SIZE
	nop	__LINE__
	}
	;;
	{ .mmi
	(p10)	LD	f34 = [A1], SIZE
	;;
	(p10)	LD	f35 = [A1], SIZE
	nop	__LINE__
	}
	;;
	{ .mmi
	(p11)	LD	f36 = [A1], SIZE
	;;
	(p11)	LD	f37 = [A1]
	nop	__LINE__
	}
	;;
	{ .mmi
	(p10)	ST	[BO2] = f32, SIZE
	;;
	(p10)	ST	[BO2] = f33, SIZE
	nop	__LINE__
	}
	;;
	{ .mmi
	(p10)	ST	[BO2] = f34, SIZE
	;;
	(p10)	ST	[BO2] = f35, SIZE
	nop	__LINE__
	}
	;;
	{ .mmi
	(p11)	ST	[BO3] = f36, SIZE
	;;
	(p11)	ST	[BO3] = f37, SIZE
	nop	__LINE__
	}
	;;
	.align 32

/* ---- Exit: restore the predicates and ar.lc saved on entry ----------- */
.L999:
	mov pr    = PR, -1
	mov	 ar.lc = ARLC
	br.ret.sptk.many b0
	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -