⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ztrsm_kernel_rt.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 3 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#define ASSEMBLER#include "common.h"#define M	%i0#define N	%i1#define K	%i2#define A	%i5#define B	%i3#define C	%i4#define LDC	%o0#define AO	%o1#define BO	%o2#define I	%o3#define J	%o4#define L	%o5#define C1	%l0#define C2	%l1#define OFFSET	%l2#define	KK	%l3#define TEMP1	%l4#define TEMP2	%l5#define AORIG	%l6#ifdef DOUBLE#define c01	%f0#define c02	%f2#define c03	%f4#define c04	%f6#define c05	%f8#define c06	%f10#define c07	%f12#define c08	%f14#define c09	%f16#define c10	%f18#define c11	%f20#define c12	%f22#define c13	%f24#define c14	%f26#define c15	%f28#define c16	%f30#define t1	%f32#define	t2 	%f34#define t3	%f36#define	t4 	%f38#define a1	%f40#define a2	%f42#define a3	%f44#define a4	%f46#define a5	%f62#define b1	%f48#define b2	%f50#define b3	%f52#define b4	%f54#define b5	%f56#define FZERO	%f58#else#define c01	%f0#define c02	%f1#define c03	%f2#define c04	%f3#define c05	%f4#define c06	%f5#define c07	%f6#define c08	%f7#define c09	%f8#define c10	%f9#define c11	%f10#define c12	%f11#define c13	%f12#define c14	%f13#define c15	%f14#define c16	%f15#define t1	%f16#define	t2 	%f17#define t3	%f18#define	t4 	%f19#define a1	%f20#define a2	%f21#define a3	%f22#define a4	%f23#define a5	%f31#define b1	%f24#define b2	%f25#define b3	%f26#define b4	%f27#define b5	%f28#define FZERO	%f29#endif#define	t5	c13#define	t6	c14#define	t7	c15#define	t8	c16#ifndef CONJ#define FADD1	FADD#define FADD2	FADD#define FADD3	FADD#define FADD4	FSUB#else#if defined(LN) || defined(LT)#define FADD1	FADD#define FADD2	FSUB#define FADD3	FADD#define FADD4	FADD#endif#if defined(RN) || defined(RT)#define FADD1	FADD#define FADD2	FADD#define FADD3	FSUB#define FADD4	FADD#endif#endif#define APREFETCHSIZE 40#define BPREFETCHSIZE 40#define APREFETCH_CATEGORY 0#define BPREFETCH_CATEGORY 0	PROLOGUE	SAVESP	#ifndef __64BIT__#ifdef DOUBLE	st	%g0, [%fp + STACK_START +  8]	st	%g0, [%fp + STACK_START + 12]	nop	st	%i3, [%fp + STACK_START + 16]	nop	st	%i4, [%fp + STACK_START + 20]	nop	st	%i5, [%fp + STACK_START + 24]	ld	[%fp + STACK_START + 32], A	ld	[%fp + STACK_START + 36], B	ld	[%fp + STACK_START + 40], C	ld	[%fp + STACK_START + 44], LDC	ld	[%fp + STACK_START + 48], OFFSET	ldd	[%fp + STACK_START +  8], FZERO#else	st	%g0, [%fp + STACK_START +  8]	nop	st	%i3, [%fp + STACK_START + 16]	nop	st	%i4, [%fp + STACK_START + 20]	ld	[%fp + STACK_START + 28], B	ld	[%fp + STACK_START + 32], C	ld	[%fp + STACK_START + 36], LDC	ld	[%fp + STACK_START + 40], OFFSET	ld	[%fp + STACK_START +  8], FZERO#endif#else#ifdef DOUBLE	stx	%g0, [%fp + STACK_START + 32]#else	st	%g0, [%fp + STACK_START + 32]#endif	ldx	[%fp+  STACK_START + 56], B	nop	ldx	[%fp+  STACK_START + 64], C	nop	ldx	[%fp+  STACK_START + 72], LDC	ldx	[%fp+  STACK_START + 80], OFFSET	LDF	[%fp + STACK_START + 32], FZERO#endif	sll	LDC, ZBASE_SHIFT, LDC#ifdef LN	smul	M, K, TEMP1	sll	TEMP1, ZBASE_SHIFT, TEMP1	add	A, TEMP1, A	sll	M, ZBASE_SHIFT, TEMP1	add	C, TEMP1, C#endif#ifdef RN	neg	OFFSET, KK#endif#ifdef RT	smul	N, K, TEMP1	sll	TEMP1, ZBASE_SHIFT, TEMP1	add	B, TEMP1, B	smul	N, LDC, TEMP1	add	C, TEMP1, C	sub	N, OFFSET, KK#endif	and	N, 1, J	cmp	J, 0	ble,pn	%icc, .LL100	nop#ifdef RT	sll	K, 0 + ZBASE_SHIFT, TEMP1	sub	B, TEMP1, B	sub	C, LDC, C#endif	mov	C, C1#ifdef LN	add	M, OFFSET, KK#endif#ifdef LT	mov	OFFSET, KK#endif#if defined(LN) || defined(RT)	mov	A, AORIG#else	mov	A, AO#endif#ifndef RT	add	C, LDC, C#endif	sra	M, 1, I	cmp	I, 0	ble,pn	%icc, .LL150	FMOV	FZERO, c03.LL121:#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  1 + ZBASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 1 + ZBASE_SHIFT, TEMP1	sll	KK, 0 + ZBASE_SHIFT, TEMP2	add	AORIG, TEMP1, AO	add	B,     TEMP2, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	FMOV	FZERO, c03	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, t1	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, c07	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, t2	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, c04	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, t3	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, c08	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, t4	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, c01	prefetch [C1 + 3 * SIZE], 3	FMOV	FZERO, c05	FMOV	FZERO, c02	ble,pn	%icc, .LL125	FMOV	FZERO, c06.LL122:	FADD1	c03, t1, c03	add	L, -1, L	FMUL	a1, b1, t1	prefetch [AO + APREFETCHSIZE * SIZE], 0	FADD3	c07, t2, c07	add	BO,  8 * SIZE, BO	FMUL	a1, b2, t2	LDF	[AO + 4 * SIZE], a1	FADD2	c04, t3, c04	add	AO, 16 * SIZE, AO	FMUL	a2, b1, t3	cmp	L,  0	FADD4	c08, t4, c08	nop	FMUL	a2, b2, t4	LDF	[AO - 11 * SIZE], a2	FADD1	c01, t1, c01	nop	FMUL	a3, b1, t1	nop	FADD3	c05, t2, c05	nop	FMUL	a3, b2, t2	LDF	[AO - 10 * SIZE], a3	FADD2	c02, t3, c02	nop	FMUL	a4, b1, t3	LDF	[BO -  4 * SIZE], b1	FADD4	c06, t4, c06	nop	FMUL	a4, b2, t4	LDF	[BO -  3 * SIZE], b2	FADD1	c03, t1, c03	nop	FMUL	a1, b3, t1	LDF	[AO -  9 * SIZE], a4	FADD3	c07, t2, c07	nop	FMUL	a1, b4, t2	LDF	[AO -  8 * SIZE], a1	FADD2	c04, t3, c04	nop	FMUL	a2, b3, t3	nop	FADD4	c08, t4, c08	nop	FMUL	a2, b4, t4	LDF	[AO -  7 * SIZE], a2	FADD1	c01, t1, c01	nop	FMUL	a3, b3, t1	nop	FADD3	c05, t2, c05	nop	FMUL	a3, b4, t2	LDF	[AO -  6 * SIZE], a3	FADD2	c02, t3, c02	nop	FMUL	a4, b3, t3	LDF	[BO -  2 * SIZE], b3	FADD4	c06, t4, c06	nop	FMUL	a4, b4, t4	LDF	[BO -  1 * SIZE], b4	FADD1	c03, t1, c03	nop	FMUL	a1, b1, t1	LDF	[AO -  5 * SIZE], a4	FADD3	c07, t2, c07	nop	FMUL	a1, b2, t2	LDF	[AO -  4 * SIZE], a1	FADD2	c04, t3, c04	nop	FMUL	a2, b1, t3	nop	FADD4	c08, t4, c08	nop	FMUL	a2, b2, t4	LDF	[AO -  3 * SIZE], a2	FADD1	c01, t1, c01	nop	FMUL	a3, b1, t1	nop	FADD3	c05, t2, c05	nop	FMUL	a3, b2, t2	LDF	[AO -  2 * SIZE], a3	FADD2	c02, t3, c02	nop	FMUL	a4, b1, t3	LDF	[BO +  0 * SIZE], b1	FADD4	c06, t4, c06	nop	FMUL	a4, b2, t4	LDF	[BO +  1 * SIZE], b2	FADD1	c03, t1, c03	nop	FMUL	a1, b3, t1	LDF	[AO -  1 * SIZE], a4	FADD3	c07, t2, c07	nop	FMUL	a1, b4, t2	LDF	[AO +  0 * SIZE], a1	FADD2	c04, t3, c04	nop	FMUL	a2, b3, t3	nop	FADD4	c08, t4, c08	nop	FMUL	a2, b4, t4	LDF	[AO +  1 * SIZE], a2	FADD1	c01, t1, c01	nop	FMUL	a3, b3, t1	nop	FADD3	c05, t2, c05	nop	FMUL	a3, b4, t2	LDF	[AO +  2 * SIZE], a3	FADD2	c02, t3, c02	nop	FMUL	a4, b3, t3	LDF	[BO +  2 * SIZE], b3	FADD4	c06, t4, c06	FMUL	a4, b4, t4	LDF	[AO +  3 * SIZE], a4	bg,pt	%icc, .LL122	LDF	[BO +  3 * SIZE], b4.LL125:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,a,pn %icc, .LL129	nop.LL126:	FADD1	c03, t1, c03	add	AO, 4 * SIZE, AO	FMUL	a1, b1, t1	add	BO, 2 * SIZE, BO	FADD3	c07, t2, c07	add	L, -1, L	FMUL	a1, b2, t2	LDF	[AO + 0 * SIZE], a1	FADD2	c04, t3, c04	cmp	L, 0	FMUL	a2, b1, t3	FADD4	c08, t4, c08	FMUL	a2, b2, t4	LDF	[AO + 1 * SIZE], a2	FADD1	c01, t1, c01	FMUL	a3, b1, t1	FADD3	c05, t2, c05	FMUL	a3, b2, t2	LDF	[AO + 2 * SIZE], a3	FADD2	c02, t3, c02	FMUL	a4, b1, t3	LDF	[BO + 0 * SIZE], b1	FADD4	c06, t4, c06	FMUL	a4, b2, t4	LDF	[BO + 1 * SIZE], b2	bg,pt	%icc, .LL126	LDF	[AO + 3 * SIZE], a4.LL129:	FADD1	c03, t1, c03	FADD3	c07, t2, c07	FADD2	c04, t3, c04	FADD4	c08, t4, c08	FADD	c01, c06, c01	FADD	c02, c05, c02	FADD	c03, c08, c03	FADD	c04, c07, c04#if defined(LN) || defined(RT)#ifdef LN	sub	KK, 2, TEMP1#else	sub	KK, 1, TEMP1#endif	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP2	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1	add	AORIG, TEMP2, AO	add	B,     TEMP1, BO#endif#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	LDF	[BO +  2 * SIZE], a3	LDF	[BO +  3 * SIZE], a4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04#else	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04#endif#ifdef LN	LDF	[AO +  6 * SIZE], a1	LDF	[AO +  7 * SIZE], a2	LDF	[AO +  4 * SIZE], a3	LDF	[AO +  5 * SIZE], a4	LDF	[AO +  0 * SIZE], b1	LDF	[AO +  1 * SIZE], b2	FMUL	a1, c03, t1	FMUL	a2, c04, t2	FMUL	a1, c04, t3	FMUL	a2, c03, t4	FADD4	t1, t2, c03	FADD2	t3, t4, c04	FMUL	a3, c03, t1	FMUL	a3, c04, t2	FMUL	a4, c04, t5	FMUL	a4, c03, t6	FSUB	c01, t1, c01	FSUB	c02, t2, c02	FADD2	c01, t5, c01	FADD4	c02, t6, c02	FMUL	b1, c01, t1	FMUL	b2, c02, t2	FMUL	b1, c02, t3	FMUL	b2, c01, t4	FADD4	t1, t2, c01	FADD2	t3, t4, c02#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	LDF	[AO +  6 * SIZE], b1	LDF	[AO +  7 * SIZE], b2	FMUL	a1, c01, t1	FMUL	a2, c02, t2	FMUL	a1, c02, t3	FMUL	a2, c01, t4	FADD4	t1, t2, c01	FADD2	t3, t4, c02	FMUL	a3, c01, t1	FMUL	a3, c02, t2	FMUL	a4, c02, t5	FMUL	a4, c01, t6	FSUB	c03, t1, c03	FSUB	c04, t2, c04	FADD2	c03, t5, c03	FADD4	c04, t6, c04	FMUL	b1, c03, t1	FMUL	b2, c04, t2	FMUL	b1, c04, t3	FMUL	b2, c03, t4	FADD4	t1, t2, c03	FADD2	t3, t4, c04#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	FMUL	a1, c01, t1	FMUL	a2, c02, t2	FMUL	a1, c02, t3	FMUL	a2, c01, t4	FMUL	a1, c03, t5	FMUL	a2, c04, t6	FMUL	a1, c04, t7	FMUL	a2, c03, t8	FADD4	t1, t2, c01	FADD3	t3, t4, c02	FADD4	t5, t6, c03	FADD3	t7, t8, c04#endif#ifdef RT	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	FMUL	a1, c01, t1	FMUL	a2, c02, t2	FMUL	a1, c02, t3	FMUL	a2, c01, t4	FMUL	a1, c03, t5	FMUL	a2, c04, t6	FMUL	a1, c04, t7	FMUL	a2, c03, t8	FADD4	t1, t2, c01	FADD3	t3, t4, c02	FADD4	t5, t6, c03	FADD3	t7, t8, c04#endif#ifdef LN	add	C1, -4 * SIZE, C1#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]	STF	c02, [BO +  1 * SIZE]	STF	c03, [BO +  2 * SIZE]	STF	c04, [BO +  3 * SIZE]#else	STF	c01, [AO +  0 * SIZE]	STF	c02, [AO +  1 * SIZE]	STF	c03, [AO +  2 * SIZE]	STF	c04, [AO +  3 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C1 + 2 * SIZE]	STF	c04, [C1 + 3 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4#ifndef LN	add	C1, 4 * SIZE, C1#endif#ifdef RT	sll	K, 1 + ZBASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	sll	TEMP1, 1 + ZBASE_SHIFT, TEMP2	sll	TEMP1, 0 + ZBASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 2, KK#endif#ifdef LN	sub	KK, 2, KK#endif	add	I, -1, I	cmp	I, 0	bg,pt	%icc, .LL121	FMOV	FZERO, c03.LL150:	and	M, 1, I	cmp	I, 0	ble,pn	%icc, .LL199	nop#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  0 + ZBASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 0 + ZBASE_SHIFT, TEMP1	add	AORIG, TEMP1, AO	add	B,     TEMP1, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, t4	ble,pn	%icc, .LL155	nop.LL152:	FADD1	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -