zger.s
来自「Optimized GotoBLAS libraries」· S 代码 · 共 409 行
S
409 行
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

/* Q is the row-block size: each outer pass handles at most Q complex
   elements of x (and Q rows of a) before moving to the next block.  */

#ifdef EV6
#ifdef DOUBLE
#define Q	160
#else
#define Q	320
#endif
#endif

#ifdef EV5
#ifdef DOUBLE
#define Q	160
#else
#define Q	320
#endif
#endif

#ifdef EV4
#ifdef DOUBLE
#define Q	16
#else
#define Q	16
#endif
#endif

	.set noat
	.set noreorder

/* Frame: 64 bytes reserved, the first 40 hold the saved $f2-$f6.  */
#define STACKSIZE 8*8

/*
   Complex rank-1 update kernel.

   For every column j (n columns) and every row i (m rows):

	a[i, j] += x[i] * temp,

   where temp = alpha * y[j], or alpha * conj(y[j]) when CONJ is
   defined.  Elements are stored as (real, imaginary) pairs.

   Inputs as read by the code:
	$16   m
	$17   n
	$f19  alpha_r
	$f20  alpha_i
	$21   x
	STACKSIZE+ 0($sp)  incx    (32-bit load)
	STACKSIZE+ 8($sp)  y
	STACKSIZE+16($sp)  incy    (32-bit load)
	STACKSIZE+24($sp)  a
	STACKSIZE+32($sp)  lda     (32-bit load)
	STACKSIZE+40($sp)  buffer  (receives a packed copy of the current
				    x block when incx is not 1)

   Returns 0 in $0.  Returns immediately, touching no data, when
   m <= 0, n <= 0, or |alpha_r| + |alpha_i| == 0.

   Integer register usage:
	$0  : return value (cleared at exit)
	$1  : scratch (comparison results)
	$2  : scratch (Q, next x block address)
	$3  : is        (first row of the current block)
	$4  : j         (columns remaining)
	$5  : a_offset  (also the buffer write pointer while copying)
	$6  : i         (inner loop counter)
	$7  : min_i     (rows in the current block)
	$8  : b_offset  (current y element)
	$9 - $15 : not used
	$16 : m
	$17 : n
	$18 : cleared, not read afterwards
	$19 : y
	$20 : 2 * incx, forced to 0 when incx == 1 (no copy needed)
	$21 : x
	$22 : 2 * incy
	$23 : a         (start of the current row block)
	$24 : 2 * lda
	$25 : copy counter, then read pointer into x or buffer
	$26 : not used (return address)
	$27 : buffer
	$28 : a_orig    (start of the current column, $at)
	$29 : GP
	$30 : SP
	$31 : Zero

   Floating point register usage
   (atemp* are loaded through $25, ctemp* are loaded through $5):
	$f0 : atemp4
	$f1 : alpha_r
	$f2 : alpha_i
	$f3 : temp5
	$f4 : temp6
	$f5 : temp7
	$f6 : temp8
	$f7 - $f9 : not used
	$f10: temp3
	$f11: temp1
	$f12: temp2
	$f13: temp4
	$f14: temp9
	$f15: temp10
	$f16: atemp8
	$f17: atemp6
	$f18: ctemp4
	$f19: ctemp3   (alpha_r on entry)
	$f20: atemp3   (alpha_i on entry)
	$f21: atemp7
	$f22: temp11
	$f23: temp12
	$f24: temp_r
	$f25: temp_i
	$f26: atemp1
	$f27: ctemp2
	$f28: ctemp1
	$f29: atemp2
	$f30: atemp5
	$f31: Zero
*/

	.text
	.align 5
	.globl NAME
	.ent NAME
NAME:
	.frame $sp, STACKSIZE, $26, 0

#ifdef PROFILE
	ldgp	$gp, 0($27)
	lda	$28, _mcount
	jsr	$28, ($28), _mcount
#endif

	lda	$sp, -STACKSIZE($sp)

	stt	$f2,  0($sp)		# $f2-$f6 are restored at $End
	stt	$f3,  8($sp)
	stt	$f4, 16($sp)
	stt	$f5, 24($sp)
	stt	$f6, 32($sp)
#ifndef PROFILE
	.prologue 0
#else
	.prologue 1
#endif

	ldl	$20, STACKSIZE+ 0($sp)	# incx
	fabs	$f19, $f10		# |alpha_r|
	ldq	$19, STACKSIZE+ 8($sp)	# y
	fabs	$f20, $f11		# |alpha_i|

	ldl	$22, STACKSIZE+16($sp)	# incy
	fmov	$f19, $f1		# alpha_r
	ldq	$23, STACKSIZE+24($sp)	# a
	fmov	$f20, $f2		# alpha_i

	ldl	$24, STACKSIZE+32($sp)	# lda
	clr	$18
	ldq	$27, STACKSIZE+40($sp)	# buffer

	addq	$20, $20, $20		# incx *= 2
	addq	$22, $22, $22		# incy *= 2
	addq	$24, $24, $24		# lda  *= 2

	/* Quick return when either dimension is empty.  Both loops below
	   are bottom-tested, so they must not be entered with m <= 0 or
	   n <= 0: the column loop would still process one column.  */
	ble	$16, $End		# m <= 0
	ble	$17, $End		# n <= 0

	ADD	$f10, $f11, $f10	# |alpha_r| + |alpha_i|
	clr	$3			# is = 0
	cmpeq	$20, 2, $1		# unit stride (2 * incx == 2) ?
	cmovne	$1, 0, $20		# yes: incx = 0 marks no copy needed
	fbeq	$f10, $End		# alpha == 0: nothing to do
	.align 4

/* Outer loop: one pass per block of at most Q rows.  */
$L23:
	subl	$16, $3, $7		# min_i = m - is
	lda	$2, Q
	cmple	$7, $2, $1
	mov	$19, $8			# b_offset = y

	cmoveq	$1, $2, $7		# clamp min_i to the block size
	mov	$23, $28		# a_orig = a
	mov	$17, $4			# j = n
	mov	$27, $5			# copy destination = buffer

	mov	$7, $25			# copy count = min_i
	beq	$20, $L39		# unit stride: use x directly
	.align 4

/* Pack min_i strided elements of x into buffer; $21 advances past them.  */
$Copy_Loop:
	LD	$f30, 0*SIZE($21)
	LD	$f17, 1*SIZE($21)
	ST	$f30, 0*SIZE($5)
	ST	$f17, 1*SIZE($5)

	SXADDQ	$20, $21, $21		# x += incx
	lda	$5, 2*SIZE($5)
	lda	$25, -1($25)
	bgt	$25, $Copy_Loop
	.align 4

/* Column loop: build temp from alpha and y[j], then update one column.  */
$L39:
	LD	$f26, 0*SIZE($8)	# real part of y[j]
	mov	$28, $5			# a_offset = a_orig
	LD	$f29, 1*SIZE($8)	# imaginary part of y[j]
	mov	$21, $25		# read x in place ...

	MUL	$f1, $f26, $f11		# alpha_r * y_r
	cmovne	$20, $27, $25		# ... or from buffer when it was copied
	MUL	$f2, $f29, $f12		# alpha_i * y_i
	lda	$4, -1($4)		# j --

	MUL	$f1, $f29, $f10		# alpha_r * y_i
	sra	$7, 2, $6		# i = (min_i >> 2)
	MUL	$f2, $f26, $f13		# alpha_i * y_r
	SXADDQ	$22, $8, $8		# b_offset += incy

#ifndef CONJ
	SUB	$f11, $f12, $f24	# temp_r = alpha_r * y_r - alpha_i * y_i
	unop
	ADD	$f13, $f10, $f25	# temp_i = alpha_i * y_r + alpha_r * y_i
#else
	ADD	$f11, $f12, $f24	# temp_r = alpha_r * y_r + alpha_i * y_i
	unop
	SUB	$f13, $f10, $f25	# temp_i = alpha_i * y_r - alpha_r * y_i
#endif
	ble	$6, $L28		# fewer than 4 elements: remainder only
	.align 4

/* Pipeline fill: load the first 4 complex elements of x and of the
   column, and start the products for the first two.  */
	LD	$f30, 0*SIZE($25)
	LD	$f17, 1*SIZE($25)
	LD	$f21, 2*SIZE($25)
	LD	$f16, 3*SIZE($25)

	LD	$f28, 0*SIZE($5)
	LD	$f27, 1*SIZE($5)
	LD	$f19, 2*SIZE($5)
	LD	$f18, 3*SIZE($5)

	MUL	$f30, $f24, $f11	# temp1 = atemp5 * temp_r
	LD	$f26, 4*SIZE($25)
	MUL	$f30, $f25, $f12	# temp2 = atemp5 * temp_i
	LD	$f29, 5*SIZE($25)

	MUL	$f21, $f24, $f10	# temp3 = atemp7 * temp_r
	LD	$f20, 6*SIZE($25)
	MUL	$f21, $f25, $f13	# temp4 = atemp7 * temp_i
	LD	$f0,  7*SIZE($25)

	ADD	$f28, $f11, $f3		# temp5 = ctemp1 + temp1
	lda	$6, -1($6)
	MUL	$f17, $f25, $f11	# temp1 = atemp6 * temp_i
	LD	$f28, 4*SIZE($5)

	ADD	$f27, $f12, $f4		# temp6 = ctemp2 + temp2
	unop
	MUL	$f17, $f24, $f12	# temp2 = atemp6 * temp_r
	LD	$f27, 5*SIZE($5)

	ADD	$f19, $f10, $f5		# temp7 = ctemp3 + temp3
	unop
	MUL	$f16, $f25, $f10	# temp3 = atemp8 * temp_i
	LD	$f19, 6*SIZE($5)

	ADD	$f18, $f13, $f6		# temp8 = ctemp4 + temp4
	MUL	$f16, $f24, $f13	# temp4 = atemp8 * temp_r
	LD	$f18, 7*SIZE($5)
	ble	$6, $L29		# only one group of 4: drain
	.align 4

/* Steady state: each iteration stores 4 updated complex elements and
   loads the operands of the following group.  */
$MainLoop:
	SUB	$f3,  $f11, $f14	# temp9  = temp5 - temp1
	lds	$f31, 24*SIZE($5)	# load to $f31, result discarded
	MUL	$f26, $f24, $f11	# temp1 = atemp1 * temp_r
	LD	$f30,  8*SIZE($25)	# atemp5

	ADD	$f4,  $f12, $f15	# temp10 = temp6 + temp2
	LD	$f17,  9*SIZE($25)	# atemp6
	MUL	$f26, $f25, $f12	# temp2 = atemp1 * temp_i
	LD	$f26, 12*SIZE($25)	# atemp1

	SUB	$f5,  $f10, $f22	# temp11 = temp7 - temp3
	LD	$f31, 32*SIZE($25)	# load to $f31, result discarded
	MUL	$f20, $f24, $f10	# temp3 = atemp3 * temp_r
	LD	$f21, 10*SIZE($25)	# atemp7

	ADD	$f6,  $f13, $f23	# temp12 = temp8 + temp4
	LD	$f16, 11*SIZE($25)	# atemp8
	MUL	$f20, $f25, $f13	# temp4 = atemp3 * temp_i
	LD	$f20, 14*SIZE($25)	# atemp3

	ADD	$f28, $f11, $f3		# temp5 = ctemp1 + temp1
	LD	$f28,  8*SIZE($5)	# ctemp1
	MUL	$f29, $f25, $f11	# temp1 = atemp2 * temp_i
	lda	$6, -1($6)

	ST	$f14,  0*SIZE($5)	# temp9
	ST	$f15,  1*SIZE($5)	# temp10
	ST	$f22,  2*SIZE($5)	# temp11
	ST	$f23,  3*SIZE($5)	# temp12

	ADD	$f27, $f12, $f4		# temp6 = ctemp2 + temp2
	LD	$f27,  9*SIZE($5)	# ctemp2
	MUL	$f29, $f24, $f12	# temp2 = atemp2 * temp_r
	LD	$f29, 13*SIZE($25)	# atemp2

	ADD	$f19, $f10, $f5		# temp7 = ctemp3 + temp3
	LD	$f19, 10*SIZE($5)	# ctemp3
	MUL	$f0,  $f25, $f10	# temp3 = atemp4 * temp_i
	unop

	ADD	$f18, $f13, $f6		# temp8 = ctemp4 + temp4
	LD	$f18, 11*SIZE($5)	# ctemp4
	MUL	$f0,  $f24, $f13	# temp4 = atemp4 * temp_r
	unop

	SUB	$f3,  $f11, $f14	# temp9  = temp5 - temp1
	unop
	MUL	$f30, $f24, $f11	# temp1 = atemp5 * temp_r
	LD	$f0,  15*SIZE($25)	# atemp4

	ADD	$f4,  $f12, $f15	# temp10 = temp6 + temp2
	unop
	MUL	$f30, $f25, $f12	# temp2 = atemp5 * temp_i
	lda	$25,  8*SIZE($25)

	SUB	$f5,  $f10, $f22	# temp11 = temp7 - temp3
	MUL	$f21, $f24, $f10	# temp3 = atemp7 * temp_r
	ADD	$f6,  $f13, $f23	# temp12 = temp8 + temp4
	MUL	$f21, $f25, $f13	# temp4 = atemp7 * temp_i

	ADD	$f28, $f11, $f3		# temp5 = ctemp1 + temp1
	LD	$f28, 12*SIZE($5)	# ctemp1
	MUL	$f17, $f25, $f11	# temp1 = atemp6 * temp_i
	ST	$f14,  4*SIZE($5)	# temp9

	ADD	$f27, $f12, $f4		# temp6 = ctemp2 + temp2
	LD	$f27, 13*SIZE($5)	# ctemp2
	MUL	$f17, $f24, $f12	# temp2 = atemp6 * temp_r
	ST	$f15,  5*SIZE($5)	# temp10

	ADD	$f19, $f10, $f5		# temp7 = ctemp3 + temp3
	MUL	$f16, $f25, $f10	# temp3 = atemp8 * temp_i
	LD	$f19, 14*SIZE($5)	# ctemp3
	ST	$f22,  6*SIZE($5)	# temp11

	ADD	$f18, $f13, $f6		# temp8 = ctemp4 + temp4
	MUL	$f16, $f24, $f13	# temp4 = atemp8 * temp_r
	LD	$f18, 15*SIZE($5)	# ctemp4
	ST	$f23,  7*SIZE($5)	# temp12

	lda	$5,   8*SIZE($5)
	bgt	$6, $MainLoop
	.align 4

/* Pipeline drain: finish the last group of 4 complex elements using
   operands that are already in registers.  */
$L29:
	SUB	$f3,  $f11, $f14	# temp9  = temp5 - temp1
	MUL	$f26, $f24, $f11	# temp1 = atemp1 * temp_r
	ADD	$f4,  $f12, $f15	# temp10 = temp6 + temp2
	MUL	$f26, $f25, $f12	# temp2 = atemp1 * temp_i

	SUB	$f5,  $f10, $f22	# temp11 = temp7 - temp3
	MUL	$f20, $f24, $f10	# temp3 = atemp3 * temp_r
	ADD	$f6,  $f13, $f23	# temp12 = temp8 + temp4
	MUL	$f20, $f25, $f13	# temp4 = atemp3 * temp_i

	ADD	$f28, $f11, $f3		# temp5 = ctemp1 + temp1
	MUL	$f29, $f25, $f11	# temp1 = atemp2 * temp_i
	ADD	$f27, $f12, $f4		# temp6 = ctemp2 + temp2
	MUL	$f29, $f24, $f12	# temp2 = atemp2 * temp_r

	ADD	$f19, $f10, $f5		# temp7 = ctemp3 + temp3
	ST	$f14,  0*SIZE($5)
	MUL	$f0,  $f25, $f10	# temp3 = atemp4 * temp_i
	unop

	ADD	$f18, $f13, $f6		# temp8 = ctemp4 + temp4
	ST	$f15,  1*SIZE($5)
	MUL	$f0,  $f24, $f13	# temp4 = atemp4 * temp_r
	unop

	SUB	$f3,  $f11, $f14	# temp9  = temp5 - temp1
	ST	$f22,  2*SIZE($5)
	ADD	$f4,  $f12, $f15	# temp10 = temp6 + temp2
	lda	$25,  8*SIZE($25)

	SUB	$f5,  $f10, $f22	# temp11 = temp7 - temp3
	ST	$f23,  3*SIZE($5)
	ADD	$f6,  $f13, $f23	# temp12 = temp8 + temp4
	lda	$5,   8*SIZE($5)

	ST	$f14, -4*SIZE($5)
	ST	$f15, -3*SIZE($5)
	ST	$f22, -2*SIZE($5)
	ST	$f23, -1*SIZE($5)
	.align 4

/* Remainder: the last (min_i & 3) elements, one at a time.  */
$L28:
	and	$7, 3, $6
	ble	$6, $L34
	.align 4

$L38:
	LD	$f26, 0*SIZE($25)	# real part of x[i]
	LD	$f29, 1*SIZE($25)	# imaginary part of x[i]
	LD	$f28, 0*SIZE($5)	# real part of a[i]
	LD	$f27, 1*SIZE($5)	# imaginary part of a[i]

	MUL	$f26, $f25, $f12	# x_r * temp_i
	MUL	$f26, $f24, $f11	# x_r * temp_r
	MUL	$f29, $f24, $f13	# x_i * temp_r
	MUL	$f29, $f25, $f10	# x_i * temp_i

	ADD	$f28, $f11, $f14
	ADD	$f27, $f12, $f15
	SUB	$f14, $f10, $f14	# a_r + x_r * temp_r - x_i * temp_i
	ADD	$f15, $f13, $f15	# a_i + x_r * temp_i + x_i * temp_r

	lda	$25, 2*SIZE($25)
	lda	$6, -1($6)
	ST	$f14, 0*SIZE($5)
	ST	$f15, 1*SIZE($5)

	lda	$5, 2*SIZE($5)
	bgt	$6, $L38
	.align 4

$L34:
	SXADDQ	$24, $28, $28		# a_orig += lda
	unop
	unop
	bgt	$4, $L39		# next column
	.align 4

/* Advance to the next row block.  */
	lda	$2,  2*Q*SIZE($21)	# next x block for the unit stride case
	lda	$23, 2*Q*SIZE($23)	# a moves down one row block
	lda	$3,  Q($3)		# is moves down one row block
	cmoveq	$20, $2, $21		# strided x was already advanced by the copy

	cmplt	$3, $16, $1		# is < m ?
	bne	$1, $L23
	.align 4

$End:
	clr	$0			# return 0
	ldt	$f2,  0($sp)
	ldt	$f3,  8($sp)
	ldt	$f4, 16($sp)
	ldt	$f5, 24($sp)
	ldt	$f6, 32($sp)

	lda	$sp, STACKSIZE($sp)
	ret
	.end NAME
	.ident VERSION
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?