⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 trsm_kernel_rt.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 5 页
字号:
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         
*//*********************************************************************/#define ASSEMBLER#include "common.h"#define M	%i0#define N	%i1#define K	%i2#if defined(DOUBLE) && !defined(__64BIT__)#define A	%i5#define B	%i4#else#define A	%i4#define B	%i5#endif#define C	%o4#define LDC	%o5#define AO	%l0#define BO	%l1#define I	%l2#define J	%l3#define L	%l4#define C1	%o0#define C2	%o1#define C3	%o2#define C4	%o3#define OFFSET	%l5#define	KK	%l6#define TEMP1	%l7#define TEMP2	%i3#define AORIG	%g1#ifdef DOUBLE#define c01	%f0#define c02	%f2#define c03	%f4#define c04	%f6#define c05	%f8#define c06	%f10#define c07	%f12#define c08	%f14#define c09	%f16#define c10	%f18#define c11	%f20#define c12	%f22#define c13	%f24#define c14	%f26#define c15	%f28#define c16	%f30#define t1	%f32#define	t2 	%f34#define t3	%f36#define	t4 	%f38#define a1	%f40#define a2	%f42#define a3	%f44#define a4	%f46#define a5	%f58#define b1	%f48#define b2	%f50#define b3	%f52#define b4	%f54#define b5	%f56#define FZERO	%f60#define ALPHA	%f62#else#define c01	%f0#define c02	%f1#define c03	%f2#define c04	%f3#define c05	%f4#define c06	%f5#define c07	%f6#define c08	%f7#define c09	%f8#define c10	%f9#define c11	%f10#define c12	%f11#define c13	%f12#define c14	%f13#define c15	%f14#define c16	%f15#define t1	%f16#define	t2 	%f17#define t3	%f18#define	t4 	%f19#define a1	%f20#define a2	%f21#define a3	%f22#define a4	%f23#define a5	%f31#define b1	%f24#define b2	%f25#define b3	%f26#define b4	%f27#define b5	%f28#define FZERO	%f29#define ALPHA	%f30#endif#define APREFETCHSIZE 40#define BPREFETCHSIZE 40#define APREFETCH_CATEGORY 0#define BPREFETCH_CATEGORY 0	PROLOGUE	SAVESP	nop#ifndef __64BIT__#ifdef DOUBLE	st	%g0, [%fp + STACK_START +  8]	st	%g0, [%fp + STACK_START + 12]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA */	st	%i4, [%fp + STACK_START + 20]	ld	[%fp + STACK_START + 28], B	ld	[%fp + STACK_START + 32], C	ld	[%fp + STACK_START + 36], LDC	ld	[%fp + STACK_START + 40], OFFSET#else	st	%g0, [%fp + STACK_START +  8]	st	%i3, [%fp + 
STACK_START + 16]   /* ALPHA */	ld	[%fp + STACK_START + 28], C	ld	[%fp + STACK_START + 32], LDC	ld	[%fp + STACK_START + 36], OFFSET#endif	LDF	[%fp + STACK_START +  8], FZERO	LDF	[%fp + STACK_START + 16], ALPHA#else#ifdef DOUBLE	stx	%g0, [%fp + STACK_START + 32]	FMOV	%f6, ALPHA	nop	ldd	[%fp + STACK_START + 32], FZERO#else	st	%g0, [%fp + STACK_START + 32]	FMOV	%f7, ALPHA	nop	ld	[%fp + STACK_START + 32], FZERO#endif	ldx	[%fp+  STACK_START + 56], C	ldx	[%fp+  STACK_START + 64], LDC	ldx	[%fp+  STACK_START + 72], OFFSET#endif	sll	LDC, BASE_SHIFT, LDC#ifdef LN	smul	M, K, TEMP1	sll	TEMP1, BASE_SHIFT, TEMP1	add	A, TEMP1, A	sll	M, BASE_SHIFT, TEMP1	add	C, TEMP1, C#endif#ifdef RN	neg	OFFSET, KK#endif#ifdef RT	smul	N, K, TEMP1	sll	TEMP1, BASE_SHIFT, TEMP1	add	B, TEMP1, B	smul	N, LDC, TEMP1	add	C, TEMP1, C	sub	N, OFFSET, KK#endif	and	N, 1, J	cmp	J, 0	ble,pn	%icc, .LL100	nop#ifdef RT	sll	K, 0 + BASE_SHIFT, TEMP1	sub	B, TEMP1, B	sub	C, LDC, C#endif	mov	C, C1#ifdef LN	add	M, OFFSET, KK#endif#ifdef LT	mov	OFFSET, KK#endif#if defined(LN) || defined(RT)	mov	A, AORIG#else	mov	A, AO#endif#ifndef RT	add	C, LDC, C#endif	sra	M, 2, I	cmp	I, 0	ble,pn	%icc, .LL250	nop.LL221:#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  2 + BASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 2 + BASE_SHIFT, TEMP1	sll	KK, 0 + BASE_SHIFT, TEMP2	add	AORIG, TEMP1, AO	add	B,     TEMP2, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, t4	ble,pn	%icc, .LL225	prefetch [C1 + 4 * SIZE], 2.LL222:	FADD	c01, t1, c01	add	BO,  4 * SIZE, BO	FMUL	a1, b1, t1	LDF	[AO +  4 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b1, t2	LDF	[AO +  5 * SIZE], a2	FADD	c03, t3, c03	add	L, -1, L	
FMUL	a3, b1, t3	LDF	[AO +  6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b1, t4	LDF	[AO +  7 * SIZE], a4	LDF	[BO +  0 * SIZE], b1	FADD	c01, t1, c01	cmp	L,  0	FMUL	a1, b2, t1	LDF	[AO +  8 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b2, t2	LDF	[AO +  9 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b2, t3	LDF	[AO + 10 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b2, t4	LDF	[AO + 11 * SIZE], a4	LDF	[BO +  1 * SIZE], b2	FADD	c01, t1, c01	FMUL	a1, b3, t1	LDF	[AO + 12 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b3, t2	LDF	[AO + 13 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b3, t3	LDF	[AO + 14 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b3, t4	LDF	[AO + 15 * SIZE], a4	LDF	[BO +  2 * SIZE], b3	FADD	c01, t1, c01	FMUL	a1, b4, t1	LDF	[AO + 16 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b4, t2	LDF	[AO + 17 * SIZE], a2	FADD	c03, t3, c03	FMUL	a3, b4, t3	LDF	[AO + 18 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[AO + 19 * SIZE], a4	add	AO, 16 * SIZE, AO	bg,pt	%icc, .LL222	LDF	[BO +  3 * SIZE], b4.LL225:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,a,pn %icc, .LL229	nop.LL226:	FADD	c01, t1, c01	add	BO, 1 * SIZE, BO	FMUL	a1, b1, t1	LDF	[AO + 4 * SIZE], a1	FADD	c02, t2, c02	add	L, -1, L	FMUL	a2, b1, t2	LDF	[AO + 5 * SIZE], a2	FADD	c03, t3, c03	cmp	L, 0	FMUL	a3, b1, t3	LDF	[AO + 6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b1, t4	LDF	[AO + 7 * SIZE], a4	add	AO, 4 * SIZE, AO	bg,pt	%icc, .LL226	LDF	[BO + 0 * SIZE], b1.LL229:	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04#if defined(LN) || defined(RT)#ifdef LN	sub	KK, 4, TEMP1#else	sub	KK, 1, TEMP1#endif	sll	TEMP1, 2 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AORIG, TEMP2, AO	add	B,     TEMP1, BO#endif#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	LDF	[BO +  2 * SIZE], a3	LDF	[BO +  3 * SIZE], a4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04#else	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], 
a4	FSUB	a1, c01, c01	FSUB	a2, c02, c02	FSUB	a3, c03, c03	FSUB	a4, c04, c04#endif#ifdef LN	LDF	[AO + 15 * SIZE], a1	LDF	[AO + 14 * SIZE], a2	LDF	[AO + 13 * SIZE], a3	LDF	[AO + 12 * SIZE], a4	FMUL	a1, c04, c04	FMUL	a2, c04, t1	FSUB	c03, t1, c03	FMUL	a3, c04, t1	FSUB	c02, t1, c02	FMUL	a4, c04, t1	FSUB	c01, t1, c01	LDF	[AO + 10 * SIZE], a1	LDF	[AO +  9 * SIZE], a2	LDF	[AO +  8 * SIZE], a3	FMUL	a1, c03, c03	FMUL	a2, c03, t1	FSUB	c02, t1, c02	FMUL	a3, c03, t1	FSUB	c01, t1, c01	LDF	[AO +  5 * SIZE], a1	LDF	[AO +  4 * SIZE], a2	FMUL	a1, c02, c02	FMUL	a2, c02, t1	FSUB	c01, t1, c01	LDF	[AO +  0 * SIZE], a1	FMUL	a1, c01, c01#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  2 * SIZE], a3	LDF	[AO +  3 * SIZE], a4	FMUL	a1, c01, c01	FMUL	a2, c01, t1	FSUB	c02, t1, c02	FMUL	a3, c01, t1	FSUB	c03, t1, c03	FMUL	a4, c01, t1	FSUB	c04, t1, c04	LDF	[AO +  5 * SIZE], a1	LDF	[AO +  6 * SIZE], a2	LDF	[AO +  7 * SIZE], a3	FMUL	a1, c02, c02	FMUL	a2, c02, t1	FSUB	c03, t1, c03	FMUL	a3, c02, t1	FSUB	c04, t1, c04	LDF	[AO + 10 * SIZE], a1	LDF	[AO + 11 * SIZE], a2	FMUL	a1, c03, c03	FMUL	a2, c03, t1	FSUB	c04, t1, c04	LDF	[AO + 15 * SIZE], a1	FMUL	a1, c04, c04#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02	FMUL	a1, c03, c03	FMUL	a1, c04, c04#endif#ifdef RT	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02	FMUL	a1, c03, c03	FMUL	a1, c04, c04#endif#ifdef LN	add	C1, -4 * SIZE, C1#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]	STF	c02, [BO +  1 * SIZE]	STF	c03, [BO +  2 * SIZE]	STF	c04, [BO +  3 * SIZE]#else	STF	c01, [AO +  0 * SIZE]	STF	c02, [AO +  1 * SIZE]	STF	c03, [AO +  2 * SIZE]	STF	c04, [AO +  3 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	STF	c03, [C1 + 2 * SIZE]	STF	c04, [C1 + 3 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4#ifndef LN	add	C1, 4 * SIZE, C1#endif#ifdef RT	sll	K, 2 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	
sll	TEMP1, 2 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 4, KK#endif#ifdef LN	sub	KK, 4, KK#endif	add	I, -1, I	cmp	I, 0	bg,pt	%icc, .LL221	nop.LL250:	and	M, 2, I	cmp	I, 0	ble,pn	%icc, .LL270	nop#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  1 + BASE_SHIFT, TEMP1	sub	AORIG, TEMP1, AORIG#endif	sll	KK, 1 + BASE_SHIFT, TEMP1	sll	KK, 0 + BASE_SHIFT, TEMP2	add	AORIG, TEMP1, AO	add	B,     TEMP2, BO	sub	K, KK, TEMP1	sra	TEMP1, 2, L	cmp	L,  0#endif	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c01	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, t1	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c02	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, t2	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c03	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, t3	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c04	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, t4	ble,pn	%icc, .LL255	nop.LL252:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 4 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b1, t2	LDF	[AO +  5 * SIZE], a2	LDF	[BO +  4 * SIZE], b1	FADD	c03, t3, c03	cmp	L, 0	FMUL	a3, b2, t3	LDF	[AO +  6 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b2, t4	LDF	[AO +  7 * SIZE], a4	LDF	[BO +  5 * SIZE], b2	FADD	c01, t1, c01	FMUL	a1, b3, t1	LDF	[AO +  8 * SIZE], a1	FADD	c02, t2, c02	FMUL	a2, b3, t2	LDF	[AO +  9 * SIZE], a2	LDF	[BO +  6 * SIZE], b3	FADD	c03, t3, c03	FMUL	a3, b4, t3	LDF	[AO + 10 * SIZE], a3	FADD	c04, t4, c04	FMUL	a4, b4, t4	LDF	[AO + 11 * SIZE], a4	add	AO,  8 * SIZE, AO	LDF	[BO +  7 * SIZE], b4	bg,pt	%icc, .LL252	add	BO,  4 * SIZE, BO.LL255:#if defined(LT) || defined(RN)	and	KK,  3, L#else	and	TEMP1, 3, L#endif	cmp	L,  0	ble,a,pn %icc, .LL259	nop.LL256:	FADD	c01, t1, c01	add	L, -1, L	FMUL	a1, b1, t1	LDF	[AO + 2 * SIZE], a1	FADD	c02, t2, c02	cmp	L, 0	FMUL	a2, b1, t2	LDF	[AO + 3 * SIZE], a2	LDF	[BO + 1 * SIZE], b1	add	AO, 2 * SIZE, AO	bg,pt	%icc, .LL256	add	BO, 1 * SIZE, BO.LL259:	FADD	c01, t1, c01	FADD	c02, t2, c02	FADD	c03, t3, c03	FADD	c04, t4, c04	FADD	c01, c03, c01	
FADD	c02, c04, c02#if defined(LN) || defined(RT)#ifdef LN	sub	KK, 2, TEMP1#else	sub	KK, 1, TEMP1#endif	sll	TEMP1, 1 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AORIG, TEMP2, AO	add	B,     TEMP1, BO#endif#if defined(LN) || defined(LT)	LDF	[BO +  0 * SIZE], a1	LDF	[BO +  1 * SIZE], a2	FSUB	a1, c01, c01	FSUB	a2, c02, c02#else	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	FSUB	a1, c01, c01	FSUB	a2, c02, c02#endif#ifdef LN	LDF	[AO +  3 * SIZE], a1	LDF	[AO +  2 * SIZE], a2	LDF	[AO +  0 * SIZE], a3	FMUL	a1, c02, c02	FMUL	a2, c02, t1	FSUB	c01, t1, c01	FMUL	a3, c01, c01#endif#ifdef LT	LDF	[AO +  0 * SIZE], a1	LDF	[AO +  1 * SIZE], a2	LDF	[AO +  3 * SIZE], a3	FMUL	a1, c01, c01	FMUL	a2, c01, t1	FSUB	c02, t1, c02	FMUL	a3, c02, c02#endif#ifdef RN	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02#endif#ifdef RT	LDF	[BO +  0 * SIZE], a1	FMUL	a1, c01, c01	FMUL	a1, c02, c02#endif#ifdef LN	add	C1, -2 * SIZE, C1#endif#if defined(LN) || defined(LT)	STF	c01, [BO +  0 * SIZE]	STF	c02, [BO +  1 * SIZE]#else	STF	c01, [AO +  0 * SIZE]	STF	c02, [AO +  1 * SIZE]#endif	STF	c01, [C1 + 0 * SIZE]	STF	c02, [C1 + 1 * SIZE]	FMOV	FZERO, t1	FMOV	FZERO, t2	FMOV	FZERO, t3	FMOV	FZERO, t4#ifndef LN	add	C1, 2 * SIZE, C1#endif#ifdef RT	sll	K, 1 + BASE_SHIFT, TEMP1	add	AORIG, TEMP1, AORIG#endif#if defined(LT) || defined(RN)	sub	K, KK, TEMP1	sll	TEMP1, 1 + BASE_SHIFT, TEMP2	sll	TEMP1, 0 + BASE_SHIFT, TEMP1	add	AO, TEMP2, AO	add	BO, TEMP1, BO#endif#ifdef LT	add	KK, 2, KK#endif#ifdef LN	sub	KK, 2, KK#endif.LL270:	and	M, 1, I	cmp	I, 0	ble,pn	%icc, .LL299	nop#if defined(LT) || defined(RN)	sra	KK, 2, L	mov	B, BO	cmp	L,  0#else#ifdef LN	sll	K,  0 + BASE_SHIFT, TEMP1

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -