⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 zgemm_kernel_2x2.s

📁 Optimized GotoBLAS libraries
💻 S
📖 第 1 页 / 共 2 页
字号:
/*********************************************************************//*                                                                   *//*             Optimized BLAS libraries                              *//*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     *//*                                                                   *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   *//* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of     *//* profits, interruption of business, or related expenses which may  *//* arise from use of Software or Documentation, including but not    *//* limited to those resulting from defects in Software and/or        *//* Documentation, or loss or inaccuracy of data of any kind.         *//*********************************************************************/#define ASSEMBLER#include "common.h"#include "version.h"#if !defined(EV4) && !defined(EV5) && !defined(EV6)#error "Architecture is not specified."#endif#ifdef EV6#define PREFETCHSIZE 48#define UNOP unop#endif#ifdef EV5#define PREFETCHSIZE 48#define UNOP#endif#ifdef EV4#define UNOP#endif	.set	noat	.set	noreorder	.arch ev6.text	.align	5	.globl	CNAME	.ent	CNAME#define STACKSIZE 80#define M	$16#define N	$17#define K	$18#define A	$21#define	B	$22#define C	$20#define	LDC	$23#define C1	$19#define C2	$24#define AO	$at#define	BO	$5#define I	$6#define J	$7#define L	$8#define a1	$f16#define a2	$f17#define a3	$f18#define a4	$f19#define b1	$f20#define b2	$f21#define b3	$f22#define b4	$f23#define t1	$f24#define t2	$f25#define t3	$f26#define t4	$f27#define a5	$f28#define a6	$f30#define b5	$f29#define alpha_i	$f29#define alpha_r	$f30#define c01	$f0#define c02	$f1#define c03	$f2#define c04	$f3#define c05	$f4#define c06	$f5#define c07	$f6#define c08	$f7#define c09	$f8#define c10	$f9#define c11	$f10#define c12	$f11#define c13	$f12#define c14	$f13#define c15	$f14#define c16	$f15#define ALPHA_R	64($sp)#define ALPHA_I	72($sp)CNAME:	.frame	$sp, STACKSIZE, $26, 0#ifdef PROFILE	ldgp	$gp, 0($27)	lda	$at, _mcount	jsr	$at, ($at), _mcount#endif#ifndef PROFILE	.prologue 0#else	.prologue 1#endif	lda	$sp, -STACKSIZE($sp)	ldq	B,        0 + STACKSIZE($sp)	ldq	C,        8 + STACKSIZE($sp)	ldq	LDC,     16 + STACKSIZE($sp)	sll	LDC, ZBASE_SHIFT, LDC	stt	$f2,   0($sp)	stt	$f3,   8($sp)	stt	$f4,  16($sp)	stt	$f5,  24($sp)	stt	$f6,  32($sp)	stt	$f7,  40($sp)	stt	$f8,  48($sp)	stt	$f9,  56($sp)	stt	$f19, ALPHA_R	stt	$f20, ALPHA_I	cmple	M, 0, $0	cmple	N, 0, $1	cmple	K, 0, $2	or	$0, $1, $0	or	$0, $2, $0	bne	$0, $L999$L00:	sra	N, 1, J	ble	J, $L30	.align 4	$L01:	mov	C,  C1	addq	C,  LDC, C2	mov	A, AO	lda	J,        -1(J)	addq	C2, LDC, C	unop	.align 4$L10:	sra	M,  1, I	fclr	t1	fclr	t2	fclr	t3	fclr	t4	fclr	c01	fclr	c05 	fclr	c09	fclr	c13	ble	I, $L20	.align 4$L11:	LD	a1,  0 * SIZE(AO)	fclr	c02	LD	a2,  1 * SIZE(AO)	fclr	c06	LD	a3,  2 * SIZE(AO)	fclr	c10	LD	a4,  3 * SIZE(AO)	fclr	c14	LD	b1,  0 * SIZE(B)	fclr	c03	LD	b2,  1 * SIZE(B)	fclr	c07	LD	b3,  2 * SIZE(B)	fclr	c11	LD	b4,  3 * SIZE(B)	fclr	c15 	lds	$f31,  4 * SIZE(C1)	fclr	c04	lda	L,        -2(K)	unop	lds	$f31,  4 * SIZE(C2)	unop	lda	BO,  4 * SIZE(B)	fclr	c08	unop	lda	AO,  4 * SIZE(AO)	fclr	c12	fclr	c16	unop	ble	L, $L15	.align	5$L12:/*  1 */	ADD	c11,  t1, c11#ifndef EV4	ldq	$31,  PREFETCHSIZE * SIZE(AO)#else	unop#endif	MUL	b1, a1, t1#ifndef EV4	ldl	$31,  PREFETCHSIZE * SIZE(BO)#else	unop#endif	ADD	c12,  t2, c12	unop	MUL	b1, a2, t2	unop	ADD	c16,  t3, c16	unop	MUL	b2, a2, t3	LD	a5,   0 * SIZE(AO)	ADD	c15, t4, c15	unop	MUL	b2, a1, t4	LD	b5,   0 * SIZE(BO)/*  2 */	ADD	c01, t1, c01	UNOP	MUL	b1, a3, t1	UNOP	ADD	c02, t2, c02	UNOP	MUL	b1, a4, t2	UNOP	ADD	c06,  t3, c06	unop	MUL	b2, a4, t3	unop	ADD	c05, t4, c05	unop	MUL	b4, a1, t4	unop/*  3 */	ADD	c03, t1, c03	unop	MUL	b3, a1, t1	unop	ADD	c04, t2, c04	unop	MUL	b3, a2, t2	unop	ADD	c08,  t3, c08	unop	MUL	b4, a2, t3	LD	a2,  1 * SIZE(AO)	ADD	c13, t4, c13	unop	MUL	b2, a3, t4	LD	b2,  1 * SIZE(BO)/*  4 */	ADD	c09,  t1, c09	unop	MUL	b3, a3, t1	LD	a6,  2 * SIZE(AO)	ADD	c10,  t2, c10	unop	MUL	b3, a4, t2	LD	b3,  2 * SIZE(BO)	ADD	c14, t3, c14	unop	MUL	b4, a4, t3	LD	a4,  3 * SIZE(AO)	ADD	c07,  t4, c07	unop	MUL	b4, a3, t4	LD	b4,  3 * SIZE(BO)/*  5 */	ADD	c11,  t1, c11	unop	MUL	b5,  a5,  t1	LD	a1,  4 * SIZE(AO)	ADD	c12,  t2, c12	lda	L,        -2(L)	MUL	b5,  a2, t2	LD	b1,  4 * SIZE(BO)	ADD	c16,  t3, c16	unop	MUL	b2, a2, t3	unop	ADD	c15, t4, c15	unop	MUL	b2, a5,  t4	unop/*  6 */	ADD	c01, t1, c01	unop	MUL	b5,  a6, t1	unop	ADD	c02, t2, c02	unop	MUL	b5,  a4, t2	unop	ADD	c06,  t3, c06	unop	MUL	b2, a4, t3	unop	ADD	c05, t4, c05	unop	MUL	b4, a5,  t4	unop/*  7 */	ADD	c03, t1, c03	lda	AO,    8 * SIZE(AO)	MUL	b3, a5,  t1	unop	ADD	c04, t2, c04	lda	BO,    8 * SIZE(BO)	MUL	b3, a2, t2	unop	ADD	c08,  t3, c08	unop	MUL	b4, a2, t3	LD	a2, -3 * SIZE(AO)	ADD	c13, t4, c13	unop	MUL	b2, a6, t4	LD	b2, -3 * SIZE(BO)/*  8 */	ADD	c09,  t1, c09	unop	MUL	b3, a6, t1	LD	a3, -2 * SIZE(AO)	ADD	c10,  t2, c10	unop	MUL	b3, a4, t2	LD	b3, -2 * SIZE(BO)	ADD	c14, t3, c14	unop	MUL	b4, a4, t3	LD	a4, -1 * SIZE(AO)	ADD	c07,  t4, c07	MUL	b4, a6, t4	LD	b4, -1 * SIZE(BO)	bgt	L,  $L12	.align 4$L15:	ADD	c11,  t1, c11	ldt	alpha_r, ALPHA_R	MUL	b1, a1, t1	blbs	K, $L18	.align 4	ADD	c12,  t2, c12	MUL	b1, a2, t2	ADD	c16,  t3, c16	MUL	b2, a2, t3	ADD	c15, t4, c15	MUL	b2, a1, t4	ADD	c01, t1, c01	MUL	b1, a3, t1	ADD	c02, t2, c02	unop	MUL	b1, a4, t2	LD	b1,  0 * SIZE(BO)	ADD	c06,  t3, c06	MUL	b2, a4, t3	ADD	c05, t4, c05	MUL	b4, a1, t4	ADD	c03, t1, c03	unop	MUL	b3, a1, t1	LD	a1,  0 * SIZE(AO)	ADD	c04, t2, c04	unop	MUL	b3, a2, t2	unop	ADD	c08,  t3, c08	unop	MUL	b4, a2, t3	LD	a2,  1 * SIZE(AO)	ADD	c13, t4, c13	unop	MUL	b2, a3, t4	LD	b2,  1 * SIZE(BO)	ADD	c09,  t1, c09	unop	MUL	b3, a3, t1	lda	AO,  4 * SIZE(AO)	ADD	c10,  t2, c10	unop	MUL	b3, a4, t2 	LD	b3,  2 * SIZE(BO)	ADD	c14, t3, c14	unop	MUL	b4, a4, t3	LD	a4, -1 * SIZE(AO)	ADD	c07,  t4, c07	unop	MUL	b4, a3, t4	LD	a3, -2 * SIZE(AO)	ADD	c11,  t1, c11	LD	b4,  3 * SIZE(BO)	MUL	b1, a1, t1	lda	BO,  4 * SIZE(BO)	.align 4$L18:	ADD	c12,  t2, c12	unop	MUL	b1, a2, t2	ldt	alpha_i, ALPHA_I	ADD	c16,  t3, c16	unop	MUL	b2, a2, t3	LD	a5, 0 * SIZE(C1)	ADD	c15, t4, c15	MUL	b2, a1, t4	ADD	c01, t1, c01	MUL	b1, a3, t1	ADD	c02, t2, c02	unop	MUL	b1, a4, t2	LD	b1, 1 * SIZE(C1)	ADD	c06,  t3, c06	MUL	b2, a4, t3	ADD	c05, t4, c05	MUL	b4, a1, t4	ADD	c03, t1, c03	unop	MUL	b3, a1, t1	LD	a1, 2 * SIZE(C1)	ADD	c04, t2, c04	unop	MUL	b3, a2, t2	unop	ADD	c08,  t3, c08	unop	MUL	b4, a2, t3	LD	a2, 3 * SIZE(C1)	ADD	c13, t4, c13	unop	MUL	b2, a3, t4	LD	b2, 0 * SIZE(C2)	ADD	c09,  t1, c09	lda	I,        -1(I)	MUL	b3, a3, t1	unop	ADD	c10,  t2, c10	unop	MUL	b3, a4, t2	LD	b3, 1 * SIZE(C2)	ADD	c14, t3, c14	unop	MUL	b4, a4, t3  	LD	a4, 2 * SIZE(C2)	ADD	c07,  t4, c07	unop	MUL	b4, a3, t4	LD	a3, 3 * SIZE(C2)	ADD	c11,  t1, c11	ADD	c12,  t2, c12	ADD	c16,  t3, c16	ADD	c15,  t4, c15#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \      defined(CC) || defined(CR) || defined(RC) || defined(RR)	SUB	c01, c06, c01	ADD	c02, c05, c02	SUB	c03, c08, c03	ADD	c04, c07, c04	SUB	c09, c14, c09	MUL	  alpha_r, c01, t1	ADD	c10, c13, c10	MUL	  alpha_r, c02, t2	SUB	c11, c16, c11	MUL	  alpha_r, c03, t3	ADD	c12, c15, c12	MUL	  alpha_r, c04, t4#else	ADD	c01, c06, c01	SUB	c02, c05, c02	ADD	c03, c08, c03	SUB	c04, c07, c04	ADD	c09, c14, c09	MUL	  alpha_r, c01, t1	SUB	c10, c13, c10	MUL	  alpha_r, c02, t2	ADD	c11, c16, c11	MUL	  alpha_r, c03, t3	SUB	c12, c15, c12	MUL	  alpha_r, c04, t4#endif#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \      defined(NC) || defined(TC) || defined(NR) || defined(TR)	ADD	  a5,  t1,  a5	MUL	  alpha_i, c02, t1	ADD	  b1,  t2,  b1	MUL	  alpha_i, c01, t2	ADD	  a1,  t3,  a1	MUL	  alpha_i, c04, t3	ADD	  a2,  t4,  a2	MUL	  alpha_i, c03, t4	SUB	  a5,  t1,  a5	MUL	  alpha_r, c09, t1	ADD	  b1,  t2,  b1	MUL	  alpha_r, c10, t2	SUB	  a1,  t3,  a1	MUL	  alpha_r, c11, t3	ADD	  a2,  t4,  a2	MUL	  alpha_r, c12, t4	ADD	  b2,  t1,  b2	MUL	  alpha_i, c10, t1	ADD	  b3,  t2,  b3	MUL	  alpha_i, c09, t2	ADD	  a4,  t3,  a4	MUL	  alpha_i, c12, t3	ADD	  a3,  t4,  a3	MUL	  alpha_i, c11, t4	SUB	  b2,  t1,  b2	ST	a5,  0 * SIZE(C1)	fclr	t1	unop	ADD	  b3,  t2,  b3	ST	b1,  1 * SIZE(C1)	fclr	t2	unop	SUB	  a4,  t3,  a4	ST	a1,  2 * SIZE(C1)	fclr	t3	unop	ADD	  a3,  t4,  a3	ST	a2,  3 * SIZE(C1)	fclr	t4	lda	C1,   4 * SIZE(C1)#else	ADD	  a5,  t1,  a5	MUL	  alpha_i, c02, t1	SUB	  b1,  t2,  b1	MUL	  alpha_i, c01, t2	ADD	  a1,  t3,  a1	MUL	  alpha_i, c04, t3	SUB	  a2,  t4,  a2	MUL	  alpha_i, c03, t4	ADD	  a5,  t1,  a5	MUL	  alpha_r, c09, t1	ADD	  b1,  t2,  b1	MUL	  alpha_r, c10, t2	ADD	  a1,  t3,  a1	MUL	  alpha_r, c11, t3	ADD	  a2,  t4,  a2	MUL	  alpha_r, c12, t4	ADD	  b2,  t1,  b2	MUL	  alpha_i, c10, t1	SUB	  b3,  t2,  b3	MUL	  alpha_i, c09, t2	ADD	  a4,  t3,  a4	MUL	  alpha_i, c12, t3	SUB	  a3,  t4,  a3	MUL	  alpha_i, c11, t4	ADD	  b2,  t1,  b2	ST	a5,  0 * SIZE(C1)	fclr	t1	unop	ADD	  b3,  t2,  b3	ST	b1,  1 * SIZE(C1)	fclr	t2	unop	ADD	  a4,  t3,  a4	ST	a1,  2 * SIZE(C1)	fclr	t3	unop	ADD	  a3,  t4,  a3	ST	a2,  3 * SIZE(C1)	fclr	t4	lda	C1,   4 * SIZE(C1)#endif	ST	b2,  0 * SIZE(C2)	fclr	c01 	ST	b3,  1 * SIZE(C2)	fclr	c05	ST	a4,  2 * SIZE(C2)	unop 	fclr	c09	unop	ST	a3,  3 * SIZE(C2)	fclr	c13	lda	C2,   4 * SIZE(C2)	bgt	I, $L11	.align 4

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -