gemv_n.s
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

	.set noat
	.set noreorder

#define ASSEMBLER
#include "common.h"

#ifdef EV6
#ifdef DOUBLE
#define DGEMV_P	120
#else
#define DGEMV_P	120
#endif
#endif

#ifdef EV5
#ifdef DOUBLE
#define DGEMV_P	 56
#else
#define DGEMV_P	 56
#endif
#endif

#ifdef EV4
#ifdef DOUBLE
#define DGEMV_P	 24
#else
#define DGEMV_P	 24
#endif
#endif

#define STACKSIZE	8

	/* $24 is free */

	.text
	.align 5
	.globl CNAME
	.ent CNAME
CNAME:
	.frame	$sp, STACKSIZE*8, $26, 0

#ifdef PROFILE
	ldgp	$gp, 0($27)
	lda	$28, _mcount
	jsr	$28, ($28), _mcount
#endif

	ldq	$19,  0($sp)	# x
	ldl	$5,   8($sp)	# incx
	ldq	$18, 16($sp)	# y
	ldl	$6,  24($sp)	# incy
	ldq	$27, 32($sp)	# buffer
	lda	$sp, -STACKSIZE*8($sp)

#ifndef PROFILE
	.prologue 0
#else
	.prologue 1
#endif

	mulq	$21, DGEMV_P, $25	# ldaP = lda * P
	stt	$f2,  0($sp)
	clr	$28			# js = 0
	stt	$f3,  8($sp)

	cmpeq	$5, 1, $0
	stt	$f4, 16($sp)
	stt	$f5, 24($sp)
	cmovne	$0, 0, $5		# if incx == 1 then incx = 0

	stt	$f6, 32($sp)
	cmple	$16, 0, $2
	stt	$f7, 40($sp)
	cmple	$17, 0, $3

	stt	$f8, 48($sp)
	or	$2, $3, $2
	stt	$f9, 56($sp)
	bne	$2, $End		# if (m <= 0 || n <= 0) goto $End
	.align 4

$L5:
	mov	$18, $7			# c_offset = y
	fclr	$f27
	subl	$17, $28, $22		# jx = n - js
	fclr	$f28

	cmple	$22, DGEMV_P, $1	# $1 = (jx <= P)
	fclr	$f12
	mov	$20, $23		# a_offset = a
	fclr	$f13

	cmoveq	$1, DGEMV_P, $22	# if (jx > P) jx = P
	fclr	$f14
	sra	$16, 3, $8		# i = (m >> 3)
	fclr	$f15

	sra	$22, 3, $1
	beq	$5, $L60
	mov	$27, $2
	ble	$1, $L55
	.align 4

$L50:
	LD	$f18, 0*SIZE($19)
	SXADDQ	$5, $19, $19
	LD	$f20, 0*SIZE($19)
	SXADDQ	$5, $19, $19
	LD	$f21, 0*SIZE($19)
	SXADDQ	$5, $19, $19
	LD	$f22, 0*SIZE($19)
	SXADDQ	$5, $19, $19

	LD	$f23, 0*SIZE($19)
	SXADDQ	$5, $19, $19
	LD	$f24, 0*SIZE($19)
	SXADDQ	$5, $19, $19
	LD	$f25, 0*SIZE($19)
	SXADDQ	$5, $19, $19
	LD	$f26, 0*SIZE($19)
	SXADDQ	$5, $19, $19

	ST	$f18, 0*SIZE($2)
	ST	$f20, 1*SIZE($2)
	ST	$f21, 2*SIZE($2)
	ST	$f22, 3*SIZE($2)
	ST	$f23, 4*SIZE($2)
	ST	$f24, 5*SIZE($2)
	ST	$f25, 6*SIZE($2)
	ST	$f26, 7*SIZE($2)

	addq	$2, 8*SIZE, $2
	subl	$1, 1, $1
	bgt	$1, $L50
	.align 4

$L55:
	and	$22, 7, $1
	ble	$1, $L60
	.align 4

$L56:
	LD	$f26, 0*SIZE($19)
	SXADDQ	$5, $19, $19
	subl	$1, 1, $1
	addq	$2, SIZE, $2
	ST	$f26, -1*SIZE($2)
	bgt	$1, $L56
	.align 4

$L60:
	unop
	SXADDQ	$25, $20, $20		# a += lda * P
	unop
	ble	$8, $L7			# if (i <= 0) goto $L7
	.align 4

$L8:
	mov	$23, $2			# a1_offset = a_offset
	fclr	$f23
	sra	$22, 1, $4		# j = (jx >> 1)
	fclr	$f24

	SXADDQ	$28, $19, $3		# b_offset = x + is
	cmovne	$5, $27, $3		# b_offset = &stack[0]
	fclr	$f17
	subl	$4, 1, $4		# j --

	fclr	$f0
	fclr	$f25
	addq	$23, 8*SIZE, $23	# a_offset += 8
	fclr	$f26

	blt	$4, $L11
	.align 4

	LD	$f22, 0*SIZE($2)
	LD	$f29, 1*SIZE($2)
	LD	$f21, 2*SIZE($2)
	LD	$f16, 3*SIZE($2)
	LD	$f30, 4*SIZE($2)
	LD	$f20, 5*SIZE($2)
	LD	$f18, 6*SIZE($2)
	LD	$f1,  7*SIZE($2)

	LD	$f10, 0*SIZE($3)
	SXADDQ	$21, $2, $2		# a1_offset += lda
	LD	$f11, 1*SIZE($3)
	addq	$3, 2*SIZE, $3		# b_offset += 2

	LD	$f9, 0*SIZE($2)
	LD	$f2, 1*SIZE($2)
	LD	$f3, 2*SIZE($2)
	LD	$f4, 3*SIZE($2)
	LD	$f5, 4*SIZE($2)
	LD	$f6, 5*SIZE($2)
	LD	$f7, 6*SIZE($2)
	LD	$f8, 7*SIZE($2)

	unop
	SXADDQ	$21, $2, $2		# a1_offset += lda
	unop
	ble	$4, $L12		# if (j <= 0) goto $L12
	.align 4

$L13:
	ADD	$f25, $f12, $f25
#ifdef EV6
	ldl	$31, 12*SIZE($2)
#else
#ifdef DOUBLE
	ldl	$31, 40*SIZE($2)
#else
	ldl	$31, 48*SIZE($2)
#endif
#endif
	MUL	$f22, $f10, $f12
	LD	$f22, 0*SIZE($2)

	ADD	$f26, $f13, $f26
#ifdef EV6
	ldl	$31, 16*SIZE($3)
#else
	unop
#endif
	MUL	$f29, $f10, $f13
	LD	$f29, 1*SIZE($2)

	ADD	$f27, $f14, $f27
	unop
	MUL	$f21, $f10, $f14
	LD	$f21, 2*SIZE($2)

	ADD	$f28, $f15, $f28
	unop
	MUL	$f16, $f10, $f15
	LD	$f16, 3*SIZE($2)

	ADD	$f23, $f12, $f23
	unop
	MUL	$f30, $f10, $f12
	LD	$f30, 4*SIZE($2)

	ADD	$f24, $f13, $f24
	unop
	MUL	$f20, $f10, $f13
	LD	$f20, 5*SIZE($2)

	ADD	$f17, $f14, $f17
	unop
	MUL	$f18, $f10, $f14
	LD	$f18, 6*SIZE($2)

	ADD	$f0, $f15, $f0
	MUL	$f1, $f10, $f15
	LD	$f1, 7*SIZE($2)
	SXADDQ	$21, $2, $2		# a1_offset += lda

	ADD	$f25, $f12, $f25
	LD	$f10, 0*SIZE($3)
	MUL	$f9, $f11, $f12
	LD	$f9, 0*SIZE($2)

	ADD	$f26, $f13, $f26
	subl	$4, 1, $4
	MUL	$f2, $f11, $f13
	LD	$f2, 1*SIZE($2)

	ADD	$f27, $f14, $f27
	unop
	MUL	$f3, $f11, $f14
	LD	$f3, 2*SIZE($2)

	ADD	$f28, $f15, $f28
	unop
	MUL	$f4, $f11, $f15
	LD	$f4, 3*SIZE($2)

	ADD	$f23, $f12, $f23
	unop
	MUL	$f5, $f11, $f12
	LD	$f5, 4*SIZE($2)

	ADD	$f24, $f13, $f24
	unop
	MUL	$f6, $f11, $f13
	LD	$f6, 5*SIZE($2)

	ADD	$f17, $f14, $f17
	addq	$3, 2*SIZE, $3
	MUL	$f7, $f11, $f14
	LD	$f7, 6*SIZE($2)

	ADD	$f0, $f15, $f0
	MUL	$f8, $f11, $f15
	LD	$f8, 7*SIZE($2)
	LD	$f11, -1*SIZE($3)

	unop
	SXADDQ	$21, $2, $2		# a1_offset += lda
	unop
	bgt	$4, $L13
	.align 4

$L12:
	ADD	$f25, $f12, $f25
	MUL	$f22, $f10, $f12
	ADD	$f26, $f13, $f26
	MUL	$f29, $f10, $f13

	ADD	$f27, $f14, $f27
	MUL	$f21, $f10, $f14
	ADD	$f28, $f15, $f28
	MUL	$f16, $f10, $f15

	ADD	$f23, $f12, $f23
	MUL	$f30, $f10, $f12
	ADD	$f24, $f13, $f24
	MUL	$f20, $f10, $f13

	ADD	$f17, $f14, $f17
	MUL	$f18, $f10, $f14
	ADD	$f0, $f15, $f0
	MUL	$f1, $f10, $f15

	ADD	$f25, $f12, $f25
	MUL	$f9, $f11, $f12
	ADD	$f26, $f13, $f26
	MUL	$f2, $f11, $f13

	ADD	$f27, $f14, $f27
	MUL	$f3, $f11, $f14
	ADD	$f28, $f15, $f28
	MUL	$f4, $f11, $f15

	ADD	$f23, $f12, $f23
	MUL	$f5, $f11, $f12
	ADD	$f24, $f13, $f24
	MUL	$f6, $f11, $f13

	ADD	$f17, $f14, $f17
	MUL	$f7, $f11, $f14
	ADD	$f0, $f15, $f0
	MUL	$f8, $f11, $f15
	.align 4

$L11:
	mov	$7, $1
	LD	$f10, 0*SIZE($3)	# may seg. fault?
	unop
	blbc	$22, $L18

	LD	$f22, 0*SIZE($2)
	LD	$f29, 1*SIZE($2)
	LD	$f21, 2*SIZE($2)
	LD	$f16, 3*SIZE($2)
	LD	$f30, 4*SIZE($2)
	LD	$f20, 5*SIZE($2)
	LD	$f18, 6*SIZE($2)
	LD	$f1,  7*SIZE($2)

	ADD	$f25, $f12, $f25
	MUL	$f22, $f10, $f12
	ADD	$f26, $f13, $f26
	MUL	$f29, $f10, $f13

	ADD	$f27, $f14, $f27
	MUL	$f21, $f10, $f14
	ADD	$f28, $f15, $f28
	MUL	$f16, $f10, $f15

	ADD	$f23, $f12, $f23
	MUL	$f30, $f10, $f12
	ADD	$f24, $f13, $f24
	MUL	$f20, $f10, $f13

	ADD	$f17, $f14, $f17
	MUL	$f18, $f10, $f14
	ADD	$f0, $f15, $f0
	MUL	$f1, $f10, $f15
	.align 4

$L18:
	ADD	$f25, $f12, $f25
	LD	$f22, 0*SIZE($7)
	MUL	$f23, $f19, $f23
	SXADDQ	$6, $7, $7

	ADD	$f26, $f13, $f26
	LD	$f29, 0*SIZE($7)
	MUL	$f24, $f19, $f24
	SXADDQ	$6, $7, $7

	ADD	$f27, $f14, $f27
	LD	$f21, 0*SIZE($7)
	MUL	$f17, $f19, $f17
	SXADDQ	$6, $7, $7

	ADD	$f28, $f15, $f28
	LD	$f16, 0*SIZE($7)
	MUL	$f0, $f19, $f0
	SXADDQ	$6, $7, $7

	MUL	$f25, $f19, $f25
	LD	$f30, 0*SIZE($7)
	ADD	$f22, $f23, $f22
	SXADDQ	$6, $7, $7

	MUL	$f26, $f19, $f26
	LD	$f20, 0*SIZE($7)
	ADD	$f29, $f24, $f29
	SXADDQ	$6, $7, $7

	MUL	$f27, $f19, $f27
	LD	$f18, 0*SIZE($7)
	ADD	$f21, $f17, $f21
	SXADDQ	$6, $7, $7

	MUL	$f28, $f19, $f28
	LD	$f1, 0*SIZE($7)
	ADD	$f16, $f0, $f16
	SXADDQ	$6, $7, $7

	ADD	$f30, $f25, $f30
	subl	$8, 1, $8
	ST	$f22, 0*SIZE($1)
	SXADDQ	$6, $1, $1

	ADD	$f20, $f26, $f20
	ST	$f29, 0*SIZE($1)
	fclr	$f12
	SXADDQ	$6, $1, $1

	ADD	$f18, $f27, $f18
	ST	$f21, 0*SIZE($1)
	fclr	$f27
	SXADDQ	$6, $1, $1

	ADD	$f1, $f28, $f1
	ST	$f16, 0*SIZE($1)
	fclr	$f28
	SXADDQ	$6, $1, $1

	ST	$f30, 0*SIZE($1)
	fclr	$f13
	SXADDQ	$6, $1, $1
	fclr	$f14

	ST	$f20, 0*SIZE($1)
	SXADDQ	$6, $1, $1
	ST	$f18, 0*SIZE($1)
	SXADDQ	$6, $1, $1

	ST	$f1, 0*SIZE($1)
	fclr	$f15
	unop
	bgt	$8, $L8
	.align 4

$L7:
	and	$16, 7, $8		# i = (m & 7)
	ble	$8, $L4			# if (i <= 0) goto $L4
	.align 4

$L22:
	mov	$23, $2			# a1_offset = a_offset
	fclr	$f23
	sra	$22, 1, $4		# j = (jx >> 1)
	fclr	$f24

	addq	$23, 1*SIZE, $23	# a_offset ++
	fclr	$f12
	SXADDQ	$28, $19, $3		# b_offset = x + is
	fclr	$f13

	cmovne	$5, $27, $3		# b_offset = &stack[0]
	ble	$4, $L25

	subl	$4, 1, $4		# j --
	unop
	LD	$f22, 0*SIZE($2)
	SXADDQ	$21, $2, $2		# a1_offset += lda

	LD	$f9, 0*SIZE($2)
	SXADDQ	$21, $2, $2		# a1_offset += lda
	LD	$f10, 0*SIZE($3)
	LD	$f11, 1*SIZE($3)

	addq	$3, 2*SIZE, $3
	ble	$4, $L26
	.align 4

$L27:
	ADD	$f23, $f12, $f23
	MUL	$f22, $f10, $f12
	LD	$f22, 0*SIZE($2)
	LD	$f10, 0*SIZE($3)

	ADD	$f24, $f13, $f24
	SXADDQ	$21, $2, $2		# a1_offset += lda
	MUL	$f9, $f11, $f13
	LD	$f11, 1*SIZE($3)

	subl	$4, 1, $4
	LD	$f9, 0*SIZE($2)
	SXADDQ	$21, $2, $2		# a1_offset += lda
	addq	$3, 2*SIZE, $3
	bgt	$4, $L27
	.align 4

$L26:
	ADD	$f24, $f13, $f24
	MUL	$f9, $f11, $f13
	ADD	$f23, $f12, $f23
	MUL	$f22, $f10, $f12
	.align 4

$L25:
	blbc	$22, $L32

	LD	$f22, 0*SIZE($2)
	LD	$f10, 0*SIZE($3)
	ADD	$f23, $f12, $f23
	MUL	$f22, $f10, $f12
	.align 4

$L32:
	LD	$f22, 0*SIZE($7)
	ADD	$f24, $f13, $f24
	ADD	$f23, $f12, $f23
	ADD	$f23, $f24, $f23

	subl	$8, 1, $8
	MUL	$f23, $f19, $f23
	ADD	$f22, $f23, $f22
	ST	$f22, 0*SIZE($7)
	SXADDQ	$6, $7, $7
	bgt	$8, $L22
	.align 4

$L4:
	addl	$28, DGEMV_P, $28
	nop
	cmplt	$28, $17, $1
	bne	$1, $L5
	.align 4

$End:
	ldt	$f2,  0($sp)
	ldt	$f3,  8($sp)
	ldt	$f4, 16($sp)
	ldt	$f5, 24($sp)
	ldt	$f6, 32($sp)
	ldt	$f7, 40($sp)
	ldt	$f8, 48($sp)
	ldt	$f9, 56($sp)
	lda	$sp, STACKSIZE*8($sp)
	ret	$31, ($26), 1
	.end	CNAME
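For reference: this kernel computes the non-transposed GEMV update y := alpha*A*x + y on a column-major matrix, walking A in column panels of DGEMV_P entries and copying x into `buffer` whenever incx != 1, so the panel loops can run at unit stride (the cmpeq/cmovne sequence near the top forces incx to 0 to mark the unit-stride case, and the $L50/$L56 loops do the copy). Below is a minimal C sketch of the equivalent computation; the function and argument names are hypothetical, chosen to match the register comments ($16 = m, $17 = n, $f19 = alpha, $20 = a, $21 = lda, with x/incx/y/incy/buffer passed on the stack).

#define DGEMV_P 120   /* panel width; the EV6 setting from the defines above */

/* Hypothetical reference routine, not part of the original file. */
static void gemv_n_ref(long m, long n, double alpha,
                       const double *a, long lda,   /* column-major, m x n */
                       const double *x, long incx,
                       double *y, long incy)
{
    if (m <= 0 || n <= 0)                    /* early exit, like the branch to $End */
        return;

    for (long js = 0; js < n; js += DGEMV_P) {           /* outer $L5 panel loop */
        long jw = n - js < DGEMV_P ? n - js : DGEMV_P;   /* jx = min(n - js, P) */

        for (long i = 0; i < m; i++) {
            double sum = 0.0;                /* partial dot product over one panel */
            for (long j = js; j < js + jw; j++)
                sum += a[i + j * lda] * x[j * incx];
            y[i * incy] += alpha * sum;      /* one scaled update of y per panel */
        }
    }
}

The assembly earns its speed by unrolling this structure rather than changing it: the row loop is unrolled by 8 (the m >> 3 / m & 7 split at $L8 and $L22) and the panel's column loop by 2 (jx >> 1, with blbc handling an odd column), keeping eight running sums live in floating-point registers; the C above keeps only the blocking.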