⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gemm_kernel.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 4 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#define ASSEMBLER#include "common.h"#define M	%i0#define N	%i1#define K	%i2#if defined(DOUBLE) && !defined(__64BIT__)#define A	%i5#define B	%i4#else#define A	%i4#define B	%i5#endif#define C	%o4#define LDC	%o5#define AO	%l0#define BO	%l1#define I	%l2#define J	%l3#define L	%l4#define C1	%o0#define C2	%o1#define C3	%o2#define C4	%o3#define OFFSET	%l5#define	KK	%l6#define TEMP1	%l7#define TEMP2	%i3#ifdef DOUBLE#define c01	%f0#define c02	%f2#define c03	%f4#define c04	%f6#define c05	%f8#define c06	%f10#define c07	%f12#define c08	%f14#define c09	%f16#define c10	%f18#define c11	%f20#define c12	%f22#define c13	%f24#define c14	%f26#define c15	%f28#define c16	%f30#define t1	%f32#define	t2 	%f34#define t3	%f36#define	t4 	%f38#define a1	%f40#define a2	%f42#define a3	%f44#define a4	%f46#define a5	%f58#define b1	%f48#define b2	%f50#define b3	%f52#define b4	%f54#define b5	%f56#define FZERO	%f60#define ALPHA	%f62#else#define c01	%f0#define c02	%f1#define c03	%f2#define c04	%f3#define c05	%f4#define c06	%f5#define c07	%f6#define c08	%f7#define c09	%f8#define c10	%f9#define c11	%f10#define c12	%f11#define c13	%f12#define c14	%f13#define c15	%f14#define c16	%f15#define t1	%f16#define	t2 	%f17#define t3	%f18#define	t4 	%f19#define a1	%f20#define a2	%f21#define a3	%f22#define a4	%f23#define a5	%f31#define b1	%f24#define b2	%f25#define b3	%f26#define b4	%f27#define b5	%f28#define FZERO	%f29#define ALPHA	%f30#endif	PROLOGUE	SAVESP	nop#ifndef __64BIT__#ifdef DOUBLE	st	%g0, [%fp + STACK_START +  8]	st	%g0, [%fp + STACK_START + 12]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA */	st	%i4, [%fp + STACK_START + 20]	ld	[%fp + STACK_START + 28], B	ld	[%fp + STACK_START + 32], C	ld	[%fp + STACK_START + 36], LDC#ifndef TRMMKERNEL	nop#else	ld	[%fp + STACK_START + 40], OFFSET#endif#else	st	%g0, [%fp + STACK_START +  8]	st	%i3, [%fp + STACK_START + 16]   /* ALPHA */	ld	[%fp + STACK_START + 28], C	ld	[%fp + STACK_START + 32], LDC#ifdef TRMMKERNEL	ld	[%fp + STACK_START + 36], OFFSET#endif#endif	LDF	[%fp + STACK_START +  8], FZERO	LDF	[%fp + STACK_START + 16], ALPHA#else#ifdef DOUBLE	stx	%g0, [%fp + STACK_START + 32]	FMOV	%f6, ALPHA	nop	ldd	[%fp + STACK_START + 32], FZERO#else	st	%g0, [%fp + STACK_START + 32]	FMOV	%f7, ALPHA	nop	ld	[%fp + STACK_START + 32], FZERO#endif	ldx	[%fp+  STACK_START + 56], C	ldx	[%fp+  STACK_START + 64], LDC#ifdef TRMMKERNEL	ldx	[%fp+  STACK_START + 72], OFFSET#endif#endif#if defined(TRMMKERNEL) && !defined(LEFT)	neg	OFFSET, KK#endif	sra	N, 2, J	cmp	J, 0	ble,pn	%icc, .LL100	sll	LDC, BASE_SHIFT, LDC.LL11:	add	C, LDC, C2	FMOV	FZERO, t1	nop	mov	C, C1	add	C2, LDC, C3	FMOV	FZERO, t2	sra	K, 2, L	mov	A, AO	sra	M, 2, I	add	C3, LDC, C4	FMOV	FZERO, t3#if defined(TRMMKERNEL) &&  defined(LEFT)	mov	OFFSET, KK#endif	cmp	I, 0	add	C4, LDC, C	FMOV	FZERO, t4	ble,pn	%icc, .LL50	FMOV	FZERO, c01.LL21:#if !defined(TRMMKERNEL)	FMOV	FZERO, c02	mov	B, BO	FMOV	FZERO, c03	cmp	L,  0#else	FMOV	FZERO, c02	FMOV	FZERO, c03#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))	mov	B, BO#else	sll	KK, 2 + BASE_SHIFT, TEMP1	add	AO, TEMP1, AO	add	B,  TEMP1, BO#endif#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 4, L#else	add	KK, 4, L#endif	sra	L, 2, L	cmp	L,  0#endif	LDF	[AO + 0 * SIZE], a1	FMOV	FZERO, c04	LDF	[BO + 0 * SIZE], b1	FMOV	FZERO, c05	LDF	[AO + 1 * SIZE], a2	FMOV	FZERO, c06	LDF	[BO + 1 * SIZE], b2	FMOV	FZERO, c07	LDF	[AO + 2 * SIZE], a3	FMOV	FZERO, c08	LDF	[BO + 2 * SIZE], b3	FMOV	FZERO, c09	LDF	[AO + 3 * SIZE], a4	FMOV	FZERO, c10	LDF	[BO + 3 * SIZE], b4	FMOV	FZERO, c11	LDF	[BO +  4 * SIZE], b5	/* ***** */	LDF	[AO +  4 * SIZE], a5	/* ***** */	prefetch [C1 + 3 * SIZE], 3	FMOV	FZERO, c12	prefetch [C2 + 3 * SIZE], 3	FMOV	FZERO, c13	prefetch [C3 + 3 * SIZE], 3	FMOV	FZERO, c14	prefetch [C4 + 3 * SIZE], 3	FMOV	FZERO, c15	ble,pn	%icc, .LL25	FMOV	FZERO, c16#define APREFETCHSIZE 40#define BPREFETCHSIZE 40#define APREFETCH_CATEGORY 0#define BPREFETCH_CATEGORY 0.LL22:	FADD	c04, t1, c04	prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY	FMUL	a1, b1, t1	nop	FADD	c08, t2, c08	prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY	FMUL	a1, b2, t2	add	AO, 16 * SIZE, AO	FADD	c12, t3, c12	LDF	[AO - 13 * SIZE], a4	FMUL	a1, b3, t3	add	BO, 16 * SIZE, BO	FADD	c16, t4, c16	nop	FMUL	a1, b4, t4	LDF	[AO -  8 * SIZE], a1	FADD	c01, t1, c01	nop	FMUL	a2, b1, t1	nop	FADD	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD	c13, t4, c13	add	L, -1, L	FMUL	a2, b4, t4	LDF	[AO - 11 * SIZE], a2	FADD	c02, t1, c02	nop	FMUL	a3, b1, t1	nop	FADD	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO - 10 * SIZE], a3	FADD	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO -  8 * SIZE], b1	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO - 11 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO - 10 * SIZE], b3	FADD	c15, t4, c15	nop	FMUL	a4, b4, t4	LDF	[BO -  9 * SIZE], b4	FADD	c04, t1, c04	nop	FMUL	a5, b5, t1	LDF	[AO -  9 * SIZE], a4	FADD	c08, t2, c08	nop	FMUL	a5, b2, t2	nop	FADD	c12, t3, c12	nop	FMUL	a5, b3, t3	nop	FADD	c16, t4, c16	nop	FMUL	a5, b4, t4	LDF	[AO - 4 * SIZE], a5	FADD	c01, t1, c01	nop	FMUL	a2, b5, t1	nop	FADD	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO -  7 * SIZE], a2	FADD	c02, t1, c02	nop	FMUL	a3, b5, t1	nop	FADD	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO -  6 * SIZE], a3	FADD	c03, t1, c03	nop	FMUL	a4, b5, t1	LDF	[BO - 4 * SIZE], b5	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO -  7 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO -  6 * SIZE], b3	FADD	c15, t4, c15	nop	FMUL	a4, b4, t4	LDF	[BO -  5 * SIZE], b4	FADD	c04, t1, c04	nop	FMUL	a1, b1, t1	LDF	[AO -  5 * SIZE], a4	FADD	c08, t2, c08	nop	FMUL	a1, b2, t2	nop	FADD	c12, t3, c12	nop	FMUL	a1, b3, t3	nop	FADD	c16, t4, c16	nop	FMUL	a1, b4, t4	LDF	[AO -  0 * SIZE], a1	FADD	c01, t1, c01	nop	FMUL	a2, b1, t1	nop#ifdef DOUBLE	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY#else	nop#endif	FADD	c05, t2, c05	nop	FMUL	a2, b2, t2	FADD	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD	c13, t4, c13	nop	FMUL	a2, b4, t4	nop	FADD	c02, t1, c02	nop	FMUL	a3, b1, t1	LDF	[AO - 3 * SIZE], a2	FADD	c06, t2, c06#ifdef DOUBLE	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY#else	nop#endif	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO - 2 * SIZE], a3	FADD	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO -  0 * SIZE], b1	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO - 3 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO - 2 * SIZE], b3	FADD	c15, t4, c15	nop	FMUL	a4, b4, t4	LDF	[BO - 1 * SIZE], b4	FADD	c04, t1, c04	nop	FMUL	a5, b5, t1	LDF	[AO - 1 * SIZE], a4	FADD	c08, t2, c08	FMUL	a5, b2, t2	FADD	c12, t3, c12	FMUL	a5, b3, t3	FADD	c16, t4, c16	nop	FMUL	a5, b4, t4	LDF	[AO +  4 * SIZE], a5	FADD	c01, t1, c01	nop	FMUL	a2, b5, t1	nop	FADD	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO +  1 * SIZE], a2	FADD	c02, t1, c02	nop	FMUL	a3, b5, t1	nop	FADD	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO +  2 * SIZE], a3	FADD	c03, t1, c03	cmp	L, 0	FMUL	a4, b5, t1	LDF	[BO +  4 * SIZE], b5	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO +  1 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO +  2 * SIZE], b3	FADD	c15, t4, c15	FMUL	a4, b4, t4	bg,pt	%icc, .LL22	LDF	[BO +  3 * SIZE], b4.LL25:#ifndef TRMMKERNEL	and	K, 3, L#else#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))	sub	K, KK, L#elif defined(LEFT)	add	KK, 4, L#else	add	KK, 4, L#endif	and	L, 3,  L#endif	cmp	L,  0	ble,a,pn %icc, .LL29	nop.LL26:	FADD	c04, t1, c04	LDF	[AO +  3 * SIZE], a4	FMUL	a1, b1, t1	add	AO, 4 * SIZE, AO	FADD	c08, t2, c08	add	BO, 4 * SIZE, BO	FMUL	a1, b2, t2	add	L, -1, L	FADD	c12, t3, c12	nop	FMUL	a1, b3, t3	cmp	L, 0	FADD	c16, t4, c16	nop	FMUL	a1, b4, t4	LDF	[AO + 0 * SIZE], a1	FADD	c01, t1, c01	nop	FMUL	a2, b1, t1	nop	FADD	c05, t2, c05	nop	FMUL	a2, b2, t2	nop	FADD	c09, t3, c09	nop	FMUL	a2, b3, t3	nop	FADD	c13, t4, c13	nop	FMUL	a2, b4, t4	LDF	[AO + 1 * SIZE], a2	FADD	c02, t1, c02	nop	FMUL	a3, b1, t1	nop	FADD	c06, t2, c06	nop	FMUL	a3, b2, t2	nop	FADD	c10, t3, c10	nop	FMUL	a3, b3, t3	nop	FADD	c14, t4, c14	nop	FMUL	a3, b4, t4	LDF	[AO + 2 * SIZE], a3	FADD	c03, t1, c03	nop	FMUL	a4, b1, t1	LDF	[BO + 0 * SIZE], b1	FADD	c07, t2, c07	nop	FMUL	a4, b2, t2	LDF	[BO + 1 * SIZE], b2	FADD	c11, t3, c11	nop	FMUL	a4, b3, t3	LDF	[BO + 2 * SIZE], b3	FADD	c15, t4, c15	FMUL	a4, b4, t4	bg,pt	%icc, .LL26	LDF	[BO + 3 * SIZE], b4.LL29:#ifndef TRMMKERNEL	FADD	c04, t1, c04	add	I, -1, I	FMUL	c01, ALPHA, c01	LDF	[C1 + 0 * SIZE], a1	FADD	c08, t2, c08	cmp	I, 0	FMUL	c02, ALPHA, c02	LDF	[C1 + 1 * SIZE], a2	FADD	c12, t3, c12	nop	FMUL	c03, ALPHA, c03	LDF	[C1 + 2 * SIZE], a3	FADD	c16, t4, c16	nop	FMUL	c04, ALPHA, c04	LDF	[C1 + 3 * SIZE], a4	FMUL	c05, ALPHA, c05	LDF	[C2 + 0 * SIZE], b1	FMUL	c06, ALPHA, c06	LDF	[C2 + 1 * SIZE], b2	FMUL	c07, ALPHA, c07	LDF	[C2 + 2 * SIZE], b3	FMUL	c08, ALPHA, c08	LDF	[C2 + 3 * SIZE], b4	FMUL	c09, ALPHA, c09	LDF	[C3 + 0 * SIZE], t1	FMUL	c10, ALPHA, c10	LDF	[C3 + 1 * SIZE], t2	FMUL	c11, ALPHA, c11	LDF	[C3 + 2 * SIZE], t3	FMUL	c12, ALPHA, c12	LDF	[C3 + 3 * SIZE], t4	FMUL	c13, ALPHA, c13	add	C1, 4 * SIZE, C1	FADD	c01, a1, c01	LDF	[C4 + 0 * SIZE], a1	FMUL	c14, ALPHA, c14	add	C2, 4 * SIZE, C2	FADD	c02, a2, c02	LDF	[C4 + 1 * SIZE], a2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -