📄 gemm_kernel_4x4.s
字号:
/*********************************************************************//* *//* Optimized BLAS libraries *//* By Kazushige Goto <kgoto@tacc.utexas.edu> *//* *//* Copyright (c) The University of Texas, 2005. All rights reserved. *//* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING *//* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF *//* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, *//* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY *//* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF *//* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO *//* THE USE OF THE SOFTWARE OR DOCUMENTATION. *//* Under no circumstances shall University be liable for incidental, *//* special, indirect, direct or consequential damages or loss of *//* profits, interruption of business, or related expenses which may *//* arise from use of Software or Documentation, including but not *//* limited to those resulting from defects in Software and/or *//* Documentation, or loss or inaccuracy of data of any kind. *//*********************************************************************/#define ASSEMBLER#include "common.h"#include "version.h"#if !defined(EV4) && !defined(EV5) && !defined(EV6)#error "Architecture is not specified."#endif#ifdef EV6#define PREFETCHSIZE 48#define UNOP unop#endif#ifdef EV5#define PREFETCHSIZE 56#define UNOP#endif#ifdef EV4#define UNOP#endif .set noat .set noreorder .arch ev6.text .align 5 .globl CNAME .ent CNAME#define STACKSIZE 80#define M $16#define N $17#define K $18#define A $20#define B $21#define C $22#define LDC $23#define C1 $19#define C2 $24#define C3 $25#define C4 $27#define AO $at#define BO $5#define I $6#define J $7#define L $8#define a1 $f16#define a2 $f17#define a3 $f18#define a4 $f19#define b1 $f20#define b2 $f21#define b3 $f22#define b4 $f23#define t1 $f24#define t2 $f25#define t3 $f26#define t4 $f27#define a5 $f28#define a6 $f30#define b5 $f29#define alpha $f30#define c01 $f0#define c02 $f1#define c03 $f2#define c04 $f3#define c05 $f4#define c06 $f5#define c07 $f6#define c08 $f7#define c09 $f8#define c10 $f9#define c11 $f10#define c12 $f11#define c13 $f12#define c14 $f13#define c15 $f14#define c16 $f15#define ALPHA 64($sp)CNAME: .frame $sp, STACKSIZE, $26, 0#ifdef PROFILE ldgp $gp, 0($27) lda $at, _mcount jsr $at, ($at), _mcount#endif#ifndef PROFILE .prologue 0#else .prologue 1#endif lda $sp, -STACKSIZE($sp) ldq C, 0 + STACKSIZE($sp) ldq LDC, 8 + STACKSIZE($sp) SXADDQ LDC, 0, LDC stt $f2, 0($sp) stt $f3, 8($sp) stt $f4, 16($sp) stt $f5, 24($sp) stt $f6, 32($sp) stt $f7, 40($sp) stt $f8, 48($sp) stt $f9, 56($sp) stt $f19, ALPHA cmple M, 0, $0 cmple N, 0, $1 cmple K, 0, $2 or $0, $1, $0 or $0, $2, $0 bne $0, $L999$L00: sra N, 2, J ble J, $L40 .align 4 $L01: mov C, C1 addq C, LDC, C2 mov A, AO lda J, -1(J) addq C2, LDC, C3 s4addq LDC, C, C unop addq C3, LDC, C4 .align 4$L10: sra M, 2, I fclr t1 fclr t2 fclr t3 fclr t4 fclr c01 fclr c05 fclr c09 fclr c13 ble I, $L20 .align 4$L11: LD a1, 0 * SIZE(AO) fclr c02 LD a2, 1 * SIZE(AO) fclr c06 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(B) fclr c03 LD b2, 1 * SIZE(B) fclr c07 LD b3, 2 * SIZE(B) fclr c11 LD b4, 3 * SIZE(B) fclr c15 lds $f31, 4 * SIZE(C1) fclr c04 lda L, -2(K) unop lds $f31, 4 * SIZE(C2) unop lda BO, 4 * SIZE(B) fclr c08 lds $f31, 4 * SIZE(C3) unop lda AO, 4 * SIZE(AO) fclr c12 lds $f31, 4 * SIZE(C4) fclr c16 unop ble L, $L15 .align 5$L12:/* 1 */ ADD c11, t1, c11#ifndef EV4 ldq $31, PREFETCHSIZE * SIZE(AO)#else unop#endif MUL b1, a1, t1#ifndef EV4 ldl $31, PREFETCHSIZE * SIZE(BO)#else unop#endif ADD c12, t2, c12 unop MUL b1, a2, t2 unop ADD c16, t3, c16 unop MUL b2, a2, t3 LD a5, 0 * SIZE(AO) ADD c15, t4, c15 unop MUL b2, a1, t4 LD b5, 0 * SIZE(BO)/* 2 */ ADD c01, t1, c01 UNOP MUL b1, a3, t1 UNOP ADD c02, t2, c02 UNOP MUL b1, a4, t2 UNOP ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop/* 3 */ ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO)/* 4 */ ADD c09, t1, c09 unop MUL b3, a3, t1 LD a6, 2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, 3 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD b4, 3 * SIZE(BO)/* 5 */ ADD c11, t1, c11 unop MUL b5, a5, t1 LD a1, 4 * SIZE(AO) ADD c12, t2, c12 lda L, -2(L) MUL b5, a2, t2 LD b1, 4 * SIZE(BO) ADD c16, t3, c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a5, t4 unop/* 6 */ ADD c01, t1, c01 unop MUL b5, a6, t1 unop ADD c02, t2, c02 unop MUL b5, a4, t2 unop ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a5, t4 unop/* 7 */ ADD c03, t1, c03 lda AO, 8 * SIZE(AO) MUL b3, a5, t1 unop ADD c04, t2, c04 lda BO, 8 * SIZE(BO) MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, -3 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a6, t4 LD b2, -3 * SIZE(BO)/* 8 */ ADD c09, t1, c09 unop MUL b3, a6, t1 LD a3, -2 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, -2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 MUL b4, a6, t4 LD b4, -1 * SIZE(BO) bgt L, $L12 .align 4$L15: ADD c11, t1, c11 ldt alpha, ALPHA MUL b1, a1, t1 blbs K, $L18 .align 4 ADD c12, t2, c12 MUL b1, a2, t2 ADD c16, t3, c16 MUL b2, a2, t3 ADD c15, t4, c15 MUL b2, a1, t4 ADD c01, t1, c01 MUL b1, a3, t1 ADD c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(BO) ADD c06, t3, c06 MUL b2, a4, t3 ADD c05, t4, c05 MUL b4, a1, t4 ADD c03, t1, c03 unop MUL b3, a1, t1 LD a1, 0 * SIZE(AO) ADD c04, t2, c04 unop MUL b3, a2, t2 unop ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 1 * SIZE(AO) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 1 * SIZE(BO) ADD c09, t1, c09 unop MUL b3, a3, t1 lda AO, 4 * SIZE(AO) ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 2 * SIZE(BO) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, -1 * SIZE(AO) ADD c07, t4, c07 unop MUL b4, a3, t4 LD a3, -2 * SIZE(AO) ADD c11, t1, c11 LD b4, 3 * SIZE(BO) MUL b1, a1, t1 lda BO, 4 * SIZE(BO) .align 4$L18: ADD c12, t2, c12 unop MUL b1, a2, t2 LD a5, 0 * SIZE(C1) ADD c16, t3, c16 unop MUL b2, a2, t3 unop ADD c15, t4, c15 unop MUL b2, a1, t4 LD b5, 1 * SIZE(C1) ADD c01, t1, c01 unop MUL b1, a3, t1 unop ADD c02, t2, c02 unop MUL b1, a4, t2 LD b1, 0 * SIZE(C2) ADD c06, t3, c06 unop MUL b2, a4, t3 unop ADD c05, t4, c05 unop MUL b4, a1, t4 unop ADD c03, t1, c03 unop MUL b3, a1, t1 unop ADD c04, t2, c04 unop MUL b3, a2, t2 LD a1, 0 * SIZE(C3) ADD c08, t3, c08 unop MUL b4, a2, t3 LD a2, 2 * SIZE(C1) ADD c13, t4, c13 unop MUL b2, a3, t4 LD b2, 3 * SIZE(C1) ADD c09, t1, c09 lda I, -1(I) MUL b3, a3, t1 unop ADD c10, t2, c10 unop MUL b3, a4, t2 LD b3, 0 * SIZE(C4) ADD c14, t3, c14 unop MUL b4, a4, t3 LD a4, 1 * SIZE(C2) ADD c07, t4, c07 unop MUL b4, a3, t4 LD a3, 2 * SIZE(C2) ADD c11, t1, c11 unop MUL alpha, c01, c01 LD b4, 3 * SIZE(C2) ADD c12, t2, c12 unop MUL alpha, c02, c02 LD t1, 1 * SIZE(C3) ADD c16, t3, c16 unop MUL alpha, c03, c03 LD t2, 2 * SIZE(C3) ADD c15, t4, c15 unop MUL alpha, c04, c04 LD t3, 3 * SIZE(C3) MUL alpha, c05, c05 unop ADD c01, a5, c01 LD t4, 1 * SIZE(C4) MUL alpha, c06, c06 unop ADD c02, b5, c02 LD a5, 2 * SIZE(C4) MUL alpha, c07, c07 unop ADD c03, a2, c03 LD b5, 3 * SIZE(C4) MUL alpha, c08, c08 unop ADD c04, b2, c04 unop MUL alpha, c09, c09 ST c01, 0 * SIZE(C1) ADD c05, b1, c05 unop MUL alpha, c10, c10 ST c02, 1 * SIZE(C1) ADD c06, a4, c06 unop MUL alpha, c11, c11 ST c03, 2 * SIZE(C1) ADD c07, a3, c07 unop MUL alpha, c12, c12 ST c04, 3 * SIZE(C1) ADD c08, b4, c08 lda C1, 4 * SIZE(C1) MUL alpha, c13, c13 ST c05, 0 * SIZE(C2) ADD c09, a1, c09 unop MUL alpha, c14, c14 ST c06, 1 * SIZE(C2) ADD c10, t1, c10 unop MUL alpha, c15, c15 ST c07, 2 * SIZE(C2) ADD c11, t2, c11 unop MUL alpha, c16, c16 ST c08, 3 * SIZE(C2) ADD c12, t3, c12 lda C2, 4 * SIZE(C2) ADD c13, b3, c13 ST c09, 0 * SIZE(C3) fclr t1 lda C4, 4 * SIZE(C4) ADD c14, t4, c14 ST c10, 1 * SIZE(C3) fclr t2 unop ADD c15, a5, c15 ST c11, 2 * SIZE(C3) fclr t3 unop ADD c16, b5, c16 ST c12, 3 * SIZE(C3) fclr t4 lda C3, 4 * SIZE(C3) ST c13, -4 * SIZE(C4) fclr c01 ST c14, -3 * SIZE(C4) fclr c05 ST c15, -2 * SIZE(C4) fclr c09 unop unop ST c16, -1 * SIZE(C4) fclr c13 unop bgt I, $L11 .align 4$L20: and M, 2, I ble I, $L30 .align 4$L21: LD a1, 0 * SIZE(AO) fclr c02 LD a2, 1 * SIZE(AO) fclr c06 LD a3, 2 * SIZE(AO) fclr c10 LD a4, 3 * SIZE(AO) fclr c14 LD b1, 0 * SIZE(B) lda L, -2(K) LD b2, 1 * SIZE(B) lda AO, 2 * SIZE(AO) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) lda BO, 4 * SIZE(B) ble L, $L25 .align 4$L22: ADD c09, t1, c09 unop MUL a1, b1, t1 unop ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 lda BO, 8 * SIZE(BO) ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, -7 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 unop ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, -6 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, 2 * SIZE(AO) ADD c06, t4, c06 MUL a2, b4, t4 LD b5, -5 * SIZE(BO) ADD c09, t1, c09 unop MUL a3, b1, t1 LD a2, 3 * SIZE(AO) ADD c10, t2, c10 unop MUL a4, b1, t2 LD b1, -4 * SIZE(BO) ADD c13, t3, c13 unop MUL a3, b2, t3 lda AO, 4 * SIZE(AO) ADD c14, t4, c14 MUL a4, b2, t4 LD b2, -3 * SIZE(BO) ADD c01, t1, c01 lda L, -2(L) MUL a3, b3, t1 LD b4, -1 * SIZE(BO) ADD c02, t2, c02 unop MUL a4, b3, t2 LD b3, -2 * SIZE(BO) ADD c05, t3, c05 unop MUL a3, b5, t3 LD a3, 0 * SIZE(AO) ADD c06, t4, c06 MUL a4, b5, t4 LD a4, 1 * SIZE(AO) bgt L, $L22 .align 4$L25: ADD c09, t1, c09 ldt alpha, ALPHA MUL a1, b1, t1 blbs K, $L28 ADD c10, t2, c10 unop MUL a2, b1, t2 LD b1, 0 * SIZE(BO) ADD c13, t3, c13 unop MUL a1, b2, t3 unop ADD c14, t4, c14 unop MUL a2, b2, t4 LD b2, 1 * SIZE(BO) ADD c01, t1, c01 unop MUL a1, b3, t1 lda AO, 2 * SIZE(AO) ADD c02, t2, c02 unop MUL a2, b3, t2 LD b3, 2 * SIZE(BO) ADD c05, t3, c05 unop MUL a1, b4, t3 LD a1, -2 * SIZE(AO) ADD c06, t4, c06 unop MUL a2, b4, t4 LD a2, -1 * SIZE(AO) ADD c09, t1, c09 LD b4, 3 * SIZE(BO) MUL a1, b1, t1 lda BO, 4 * SIZE(BO) .align 4$L28: ADD c10, t2, c10 unop MUL a2, b1, t2 LD a3, 0 * SIZE(C1) ADD c13, t3, c13 unop MUL a1, b2, t3 LD a4, 1 * SIZE(C1) ADD c14, t4, c14 unop MUL a2, b2, t4 LD a5, 0 * SIZE(C2) ADD c01, t1, c01 unop MUL a1, b3, t1 LD b5, 1 * SIZE(C2) ADD c02, t2, c02 unop MUL a2, b3, t2 LD b1, 0 * SIZE(C3) ADD c05, t3, c05 unop MUL a1, b4, t3 LD b2, 1 * SIZE(C3) ADD c06, t4, c06 unop MUL a2, b4, t4 LD b3, 0 * SIZE(C4) ADD c09, t1, c09 unop MUL alpha, c01, c01 LD b4, 1 * SIZE(C4) ADD c10, t2, c10 unop MUL alpha, c02, c02 unop ADD c13, t3, c13 MUL alpha, c05, c05 ADD c14, t4, c14 MUL alpha, c06, c06 MUL alpha, c09, c09 ADD c01, a3, c01 MUL alpha, c10, c10 ADD c02, a4, c02 MUL alpha, c13, c13 ADD c05, a5, c05 MUL alpha, c14, c14 ADD c06, b5, c06 ADD c09, b1, c09 ST c01, 0 * SIZE(C1) fclr t1 unop ADD c10, b2, c10 ST c02, 1 * SIZE(C1) fclr t2 unop ADD c13, b3, c13 ST c05, 0 * SIZE(C2) fclr t3 unop ADD c14, b4, c14 ST c06, 1 * SIZE(C2) fclr t4 unop ST c09, 0 * SIZE(C3) fclr c01 lda C1, 2 * SIZE(C1) unop ST c10, 1 * SIZE(C3) fclr c05 lda C2, 2 * SIZE(C2) unop ST c13, 0 * SIZE(C4) fclr c09 lda C3, 2 * SIZE(C3) unop ST c14, 1 * SIZE(C4) fclr c13 lda C4, 2 * SIZE(C4) unop .align 4$L30: and M, 1, I ble I, $L39 .align 4$L31: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(B) lda L, -2(K) LD b2, 1 * SIZE(B) lda AO, 1 * SIZE(AO) LD b3, 2 * SIZE(B) LD b4, 3 * SIZE(B) lda BO, 4 * SIZE(B) ble L, $L35 .align 4$L32: ADD c01, t1, c01 lda L, -2(L) MUL a1, b1, t1 LD b1, 0 * SIZE(BO) ADD c05, t2, c05 lda AO, 2 * SIZE(AO) MUL a1, b2, t2 LD b2, 1 * SIZE(BO) ADD c09, t3, c09 LD b5, 3 * SIZE(BO) MUL a1, b3, t3 LD b3, 2 * SIZE(BO) ADD c13, t4, c13 MUL a1, b4, t4 LD a1, -1 * SIZE(AO) ADD c01, t1, c01 MUL a2, b1, t1 LD b1, 4 * SIZE(BO) lda BO, 8 * SIZE(BO) ADD c05, t2, c05 MUL a2, b2, t2 LD b2, -3 * SIZE(BO) ADD c09, t3, c09 LD b4, -1 * SIZE(BO) MUL a2, b3, t3 LD b3, -2 * SIZE(BO) ADD c13, t4, c13 MUL a2, b5, t4 LD a2, 0 * SIZE(AO) bgt L, $L32 .align 4$L35: ADD c01, t1, c01 ldt alpha, ALPHA MUL a1, b1, t1 blbs K, $L38 .align 4
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -