⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 izamax.s

📁 Optimized GotoBLAS libraries
💻 S
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * IA-64 kernel: index of the extreme element of a strided vector whose
 * elements are pairs of SIZE-wide floating point values (presumably the
 * real and imaginary parts of a complex number; confirm against
 * common.h and the build rules).
 *
 * For every element the routine forms fabs(first) + fabs(second),
 * moves the sum to a general register with getf.d and compares it as
 * an integer against the running extreme.  The sums are non-negative,
 * so the integer order of the bit patterns matches the numeric order.
 * The compare direction is chosen at build time:
 *   USEMAX / USEAMAX : greater-than (keeps the largest sum)
 *   USEMIN / USEAMIN : less-than    (keeps the smallest sum)
 * Because the compare is strict, the first occurrence wins on ties.
 *
 * Inputs : r32 = N, r33 = DX (vector base), r34 = INCX (element stride).
 *          With F_INTERFACE, N and INCX arrive as pointers and are
 *          loaded first.
 * Output : r8 = 1-based index of the selected element, or 0 when
 *          N <= 0 or INCX <= 0.  f8 also receives the bit pattern of
 *          the winning sum (0 on the early-exit paths).
 * Saved  : ar.lc is kept in ARLC and the predicate file in PR; both are
 *          restored on every exit path.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distance, in elements of SIZE bytes, ahead of DX. */
#ifdef XDOUBLE
#define PREFETCH_SIZE ( 8 * 16 +  4)
#elif defined(DOUBLE)
#define PREFETCH_SIZE (16 * 16 +  8)
#else
#define PREFETCH_SIZE (32 * 16 + 16)
#endif

/* Compare mnemonics.  Only CMPUNC is used below. */
#if defined(USEMAX) || defined(USEAMAX)
#define CMPUNC cmp.gt.unc
#define CMP    cmp.gt
#endif

#if defined(USEMIN) || defined(USEAMIN)
#define CMPUNC cmp.lt.unc
#define CMP    cmp.lt
#endif

/* Return value: 1-based index. */
#define RET	r8

/* Incoming arguments. */
#define N	r32
#define DX	r33
#define INCX	r34

/* Prefetch pointer. */
#define PRE1	r2

/* I      : 0-based index of the next element to be compared        */
/* J      : loop counter scratch                                     */
/* K      : N - 1, the elements that follow the first one            */
/* TMP    : copy of I taken before I is advanced inside the loop     */
/* INCXM1 : INCX (in bytes) minus SIZE, step from 2nd half to next   */
/* INCX8  : 8 * INCX (in bytes), prefetch step per loop iteration    */
/* MAX1   : 0-based index of the current extreme (-1 = none yet)     */
/* DMAX1  : bit pattern of the current extreme sum                   */
/* DATAn  : bit patterns of the candidate sums                       */
#define I	r14
#define J	r15
#define K	r16
#define TMP	r17
#define INCXM1	r18
#define INCX8	r19
#define MAX1	r20
#define DMAX1	r21
#define DATA1	r22
#define DATA2	r23
#define DATA3	r24
#define DATA4	r25
#define DATA5	r26
#define DATA6	r27
#define DATA7	r28
#define DATA8	r29

/* Saved copies of the predicate file and of ar.lc. */
#define PR	r30
#define ARLC	r31

	PROLOGUE
	.prologue
	PROFCODE

	/* Defaults for the early-exit paths: MAX1 = -1 gives RET = 0. */
	{ .mmi
	mov	MAX1 = -1
	mov	DMAX1 = 0
	.save	ar.lc, ARLC
	mov	ARLC = ar.lc
	}
	.body

#ifdef F_INTERFACE
	/* N and INCX are passed by reference here. */
	{ .mmi
	LDINT	N = [N]
	LDINT	INCX = [INCX]
	nop.i 0
	}
	;;
#ifndef USE64BITINT
	{ .mii
	nop.m 0
	sxt4	N = N
	sxt4	INCX = INCX
	}
	;;
#endif
#endif

	/* K = N - 1, INCX scaled to bytes, predicates saved in PR. */
	/* PR is written before either early branch below.          */
	{ .mii
	adds	K = -1, N
	shl	INCX = INCX, ZBASE_SHIFT
	mov	PR = pr
	}
	{ .mmb
	cmp.ge	p8, p0 = 0, N
	(p8) br.cond.dptk .L999
	}
	;;
	/* pr.rot is cleared in the same bundle as the INCX <= 0 branch, */
	/* so that exit reaches .L999 with p16-p63 already zeroed.       */
	{ .mib
	cmp.ge	p6, p0 = 0, INCX
	mov	pr.rot= 0
	(p6) br.cond.dptk .L999
	}
	;;
	/* Load element 0; it seeds DMAX1 with index 0. */
	{ .mmi
	LDFD	f6 = [DX], SIZE
	adds	INCXM1 = - SIZE, INCX
	mov	ar.ec= 5
	}
	;;
	{ .mmi
	LDFD	f7 = [DX], INCXM1
	mov	MAX1 = 0
	mov	I = 1
	}
	;;
	/* p16 = 1 starts the first pipeline stage; J = K / 8. */
	{ .mfi
	cmp.eq	p16, p0 = r0, r0
	fabs	f6 = f6
	shr	J =  K, 3
	}
	{ .mmf
	nop.m 0
	nop.m 0
	fabs	f7 = f7
	}
	;;
	/* p8 = 0: no pending update on loop entry.  J becomes the */
	/* ar.lc value (iterations - 1).                           */
	{ .mmi
	cmp.ne	p8,  p0 = r0, r0
	adds	J = -1, J
	shladd	INCX8 = INCX, 3, r0
	}
	{ .mmf
	nop.m 0
	nop.m 0
	FADD	f6 = f6, f7
	}
	;;
	{ .mmi
	getf.d	DMAX1 = f6
	adds	PRE1 = PREFETCH_SIZE * SIZE, DX
	mov	ar.lc = J
	}
	/* Fewer than 8 remaining elements: skip the unrolled loop.  */
	/* p13 = bit 2 of K (four leftover elements), used at .L15.  */
	{ .mib
	cmp.eq	p7 ,p0  =   -1, J
	tbit.z	p0, p13 = K, 2
	(p7) br.cond.dpnt  .L15
	}
	.align 32
	;;

/*
 * Main loop: 8 elements (16 loads) per iteration, modulo scheduled
 * with br.ctop over stage predicates p16..p20 (ar.ec = 5).
 *   p16 : prefetch and loads into the rotating registers f32..f107
 *   p19 : fabs of the loaded halves, sums for elements 1-6,
 *         getf.d of DATA1-DATA4 and the DATA1 compare
 *   p20 : sums for elements 7-8, getf.d of DATA5-DATA8, compares of
 *         DATA2-DATA8 and the advance of I by 8
 * p8 is not rotating; each compare sets it and the following
 * (p8) mov / (p8) adds pair commits the new extreme and its index.
 */
.L10:
	{ .mmf
	(p16) lfetch.nt1 [PRE1], INCX8
	(p16) LDFD	f32  = [DX], SIZE
	(p19) fabs	f35  = f35
	}
	{ .mmf
	(p8 ) mov  DMAX1 = DATA1
	nop.m 0
	(p19) fabs	f40  = f40
	}
	;;
	{ .mmf
	(p20) getf.d	DATA5 = f12
	(p16) LDFD	f37  = [DX], INCXM1
	(p20) FADD	f14  = f96,  f101
	}
	{ .mmi
	(p8 ) adds  MAX1 = 0, I
	(p20) CMPUNC	p8,  p0 = DATA2,  DMAX1
	nop.i 0
	}
	;;
	{ .mmf
	(p16) LDFD	f42  = [DX], SIZE
	(p8 ) mov  DMAX1 = DATA2
	(p19) fabs	f45  = f45
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p19) fabs	f50  = f50
	}
	;;
	{ .mmf
	(p20) getf.d	DATA6 = f13
	(p16) LDFD	f47  = [DX], INCXM1
	(p20) FADD	f15  = f106, f111
	}
	{ .mmi
	(p8 ) adds  MAX1 = 1, I
	(p20) CMPUNC	p8,  p0 = DATA3,  DMAX1
	nop.i 0
	}
	;;
	{ .mmf
	(p16) LDFD	f52  = [DX], SIZE
	(p8 ) mov  DMAX1 = DATA3
	(p19) fabs	f55  = f55
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p19) fabs	f60  = f60
	}
	;;
	{ .mmf
	(p20) getf.d	DATA7 = f14
	(p16) LDFD	f57  = [DX], INCXM1
	(p19) FADD	f8   = f35,  f40
	}
	{ .mmi
	(p8 ) adds  MAX1 = 2, I
	(p20) CMPUNC	p8,  p0 = DATA4,  DMAX1
	nop.i 0
	}
	;;
	{ .mmf
	(p16) LDFD	f62 = [DX], SIZE
	(p8 ) mov  DMAX1 = DATA4
	(p19) fabs	f65  = f65
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p19) fabs	f70  = f70
	}
	;;
	{ .mmf
	(p20) getf.d	DATA8 = f15
	(p16) LDFD	f67  = [DX], INCXM1
	(p19) FADD	f9   = f45,  f50
	}
	{ .mmi
	(p8 ) adds  MAX1 = 3, I
	(p20) CMPUNC	p8,  p0 = DATA5,  DMAX1
	nop.i 0
	}
	;;
	{ .mmf
	(p16) LDFD	f72 = [DX], SIZE
	(p8 ) mov  DMAX1 = DATA5
	(p19) fabs	f75  = f75
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p19) fabs	f80  = f80
	}
	;;
	{ .mmf
	(p19) getf.d	DATA1 = f8
	(p16) LDFD	f77  = [DX], INCXM1
	(p19) FADD	f10  = f55,  f60
	}
	{ .mmi
	(p8 ) adds  MAX1 = 4, I
	(p20) CMPUNC	p8,  p0 = DATA6,  DMAX1
	nop.i 0
	}
	;;
	{ .mmf
	(p16) LDFD	f82 = [DX], SIZE
	(p8 ) mov  DMAX1 = DATA6
	(p19) fabs	f85  = f85
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p19) fabs	f90  = f90
	}
	;;
	{ .mmf
	(p19) getf.d	DATA2 = f9
	(p16) LDFD	f87  = [DX], INCXM1
	(p19) FADD	f11  = f65,  f70
	}
	{ .mmi
	(p8 ) adds  MAX1 = 5, I
	(p20) CMPUNC	p8,  p0 = DATA7,  DMAX1
	nop.i 0
	}
	;;
	{ .mmf
	(p16) LDFD	f92  = [DX], SIZE
	(p8 ) mov  DMAX1 = DATA7
	(p19) fabs	f95  = f95
	}
	/* Keep the pre-advance index for the DATA8 update below. */
	{ .mmf
	mov	TMP = I
	nop.m 0
	(p19) fabs	f100 = f100
	}
	;;
	{ .mmf
	(p19) getf.d	DATA3 = f10
	(p16) LDFD	f97  = [DX], INCXM1
	(p19) FADD	f12  = f75,  f80
	}
	{ .mmi
	(p8 ) adds  MAX1 = 6, I
	(p20) CMPUNC	p8, p0 = DATA8,  DMAX1
	nop.i 0
	}
	;;
	{ .mmf
	(p16) LDFD	f102 = [DX], SIZE
	(p8 ) mov  DMAX1 = DATA8
	(p19) fabs	f105 = f105
	}
	{ .mmf
	(p20) adds I = 8, I
	nop.m 0
	(p19) fabs	f110 = f110
	}
	;;
	{ .mmi
	(p19) getf.d	DATA4 = f11
	(p16) LDFD	f107 = [DX], INCXM1
	(p8 ) adds  MAX1 = 7, TMP
	}
	{ .mfb
	(p19) CMPUNC	p8,  p0 = DATA1,  DMAX1
	(p19) FADD	f13  = f85,  f90
	br.ctop.sptk.few .L10
	}
	;;
	.align 32

/*
 * Remainder: K & 7 elements.  Bits 2, 1 and 0 of K select groups of
 * 4, 2 and 1 elements through p13, p14 and p15.  The predicate file
 * is restored here first; the mask -65474 covers p1-p5 and p16-p63
 * and leaves p6-p15 alone, so p13 set before the loop survives.
 */
.L15:
	{ .mmi
	(p13) LDFD	f32 = [DX], SIZE
	and	J =  7, K
	mov	pr = PR, -65474
	}
	;;
	{ .mmb
	(p13) LDFD	f33 = [DX], INCXM1
	cmp.eq	p8 ,p0  =   r0, J
	(p8) br.cond.dpnt  .L999
	}
	;;
	{ .mmi
	(p13) LDFD	f34 = [DX], SIZE
	;;
	(p13) LDFD	f35 = [DX], INCXM1
	nop.i 0
	}
	;;
	{ .mmi
	(p13) LDFD	f36 = [DX], SIZE
	;;
	(p13) LDFD	f37 = [DX], INCXM1
	nop.i 0
	}
	;;
	{ .mfi
	(p13) LDFD	f38 = [DX], SIZE
	(p13) fabs	f32 = f32
	tbit.z	p0, p14 = K, 1
	}
	;;
	{ .mmf
	(p13) LDFD	f39 = [DX], INCXM1
	nop.m 0
	(p13) fabs	f33 = f33
	}
	;;
	{ .mmf
	(p14) LDFD	f40 = [DX], SIZE
	nop.m 0
	(p13) fabs	f34 = f34
	}
	;;
	{ .mfi
	(p14) LDFD	f41 = [DX], INCXM1
	(p13) fabs	f35 = f35
	tbit.z	p0, p15 = K, 0
	}
	;;
	{ .mmf
	(p14) LDFD	f42 = [DX], SIZE
	nop.m 0
	(p13) fabs	f36 = f36
	}
	;;
	{ .mmf
	(p14) LDFD	f43 = [DX], INCXM1
	nop.m 0
	(p13) fabs	f37 = f37
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p13) FADD	f32 = f32, f33
	}
	;;
	{ .mmf
	(p15) LDFD	f44 = [DX], SIZE
	nop.m 0
	(p13) fabs	f38 = f38
	}
	;;
	{ .mmf
	(p15) LDFD	f45 = [DX], INCXM1
	nop.m 0
	(p13) fabs	f39 = f39
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p13) FADD	f34 = f34, f35
	}
	;;
	{ .mmf
	nop.m 0
	nop.m 0
	(p14) fabs	f40 = f40
	}
	;;
	{ .mmf
	(p13) getf.d	DATA1 = f32
	nop.m 0
	(p14) fabs	f41 = f41
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p13) FADD	f36 = f36, f37
	}
	;;
	{ .mmf
	nop.m 0
	nop.m 0
	(p14) fabs	f42 = f42
	}
	;;
	{ .mmf
	(p13) getf.d	DATA2 = f34
	nop.m 0
	(p14) fabs	f43 = f43
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p13) FADD	f38 = f38, f39
	}
	;;
	{ .mmf
	nop.m 0
	nop.m 0
	(p15) fabs	f44 = f44
	}
	;;
	{ .mmf
	(p13) getf.d	DATA3 = f36
	nop.m 0
	(p15) fabs	f45 = f45
	}
	{ .mmf
	nop.m 0
	nop.m 0
	(p14) FADD	f40 = f40, f41
	}
	;;
	{ .mmf
	(p13) getf.d	DATA4 = f38
	nop.m 0
	(p14) FADD	f42 = f42, f43
	}
	;;
	{ .mmf
	(p14) getf.d	DATA5 = f40
	nop.m 0
	(p15) FADD	f44 = f44, f45
	}
	;;
	/* Compare chain.  The .unc form clears p8 whenever the group */
	/* predicate is off, so skipped groups never update MAX1.     */
	{ .mmi
	(p14) getf.d	DATA6 = f42
	nop.m 0
	(p13) CMPUNC	p8,  p0 = DATA1, DMAX1
	}
	;;
	{ .mmi
	(p15) getf.d	DATA7 = f44
	(p8 ) adds	MAX1  = 0, I
	(p8 ) mov	DMAX1 = DATA1
	}
	;;
	{ .mmi
	(p13) CMPUNC	p8,  p0 = DATA2, DMAX1
	;;
	(p8 ) adds	MAX1  = 1, I
	(p8 ) mov	DMAX1 = DATA2
	}
	;;
	{ .mmi
	(p13) CMPUNC	p8,  p0 = DATA3, DMAX1
	;;
	(p8 ) adds	MAX1  = 2, I
	(p8 ) mov	DMAX1 = DATA3
	}
	;;
	{ .mmi
	(p13) CMPUNC	p8,  p0 = DATA4, DMAX1
	;;
	(p8 ) adds	MAX1  = 3, I
	(p8 ) mov	DMAX1 = DATA4
	}
	{ .mmi
	(p13) adds	I = 4, I
	nop.m 0
	nop.i 0
	}
	;;
	{ .mmi
	(p14) CMPUNC	p8,  p0 = DATA5, DMAX1
	;;
	(p8 ) adds	MAX1  = 0, I
	(p8 ) mov	DMAX1 = DATA5
	}
	;;
	{ .mmi
	(p14) CMPUNC	p8,  p0 = DATA6, DMAX1
	;;
	(p8 ) adds	MAX1  = 1, I
	(p8 ) mov	DMAX1 = DATA6
	}
	{ .mmi
	(p14) adds	I = 2, I
	nop.m 0
	nop.i 0
	}
	;;
	{ .mmi
	(p15) CMPUNC	p8,  p0 = DATA7, DMAX1
	;;
	(p8) adds	MAX1  = 0, I
	(p8) mov	DMAX1 = DATA7
	}
	;;
	.align 32

/*
 * Common exit.  RET = MAX1 + 1 converts the 0-based index to 1-based
 * (and -1 to 0 on the early exits).  The predicate file is restored
 * here as well as at .L15: the INCX <= 0 exit arrives with p16-p63
 * already cleared by mov pr.rot and never passes through .L15, and
 * those predicates are preserved registers in the IA-64 software
 * conventions.  PR is valid on every path that reaches this label.
 */
.L999:
	{ .mmi
	setf.d	f8 = DMAX1
	adds RET = 1, MAX1
	mov	ar.lc = ARLC
	}
	{ .mib
	nop.m 0
	mov	pr = PR, -65474
	br.ret.sptk.many b0
	}
	EPILOGUE

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -