/* ger.s -- from "Optimized GotoBLAS libraries"; assembly (.S) source, 530 lines. */
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2005. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

/*
 * Q is the block length along x: the outer loop at $L21 processes at
 * most Q consecutive elements of x (and the matching Q elements of each
 * column of a) per pass, then advances by Q.
 */
#ifdef EV6
#ifdef DOUBLE
#define Q 200
#else
#define Q 400
#endif
#endif

#ifdef EV5
#ifdef DOUBLE
#define Q 200
#else
#define Q 400
#endif
#endif

#ifdef EV4
#ifdef DOUBLE
#define Q 16
#else
#define Q 16
#endif
#endif

	.set noat
	.set noreorder

/*
 * NAME -- rank-1 update kernel.  For 0 <= i < m and 0 <= j < n:
 *
 *	a[i + j*lda] += (alpha * y[j*incy]) * x[i*incx]
 *
 * LD/ST/MUL/ADD/SXADDQ/SIZE/NAME/VERSION are macros supplied by the
 * included headers (presumably selecting the single or double precision
 * forms; SXADDQ presumably scales its first operand by the element
 * size before adding -- confirm in common.h).
 *
 * Arguments as read by the code below:
 *	$16		m
 *	$17		n
 *	$18		incoming value unused; overwritten with Q
 *	$f19		alpha
 *	$20		x
 *	$21		incx
 *	 0($sp)		y
 *	 8($sp)		incy	(32-bit load)
 *	16($sp)		a
 *	24($sp)		lda	(32-bit load)
 *	32($sp)		buffer: scratch that receives a packed copy of the
 *			current block of x when incx != 1
 *	(stack offsets are relative to $sp on entry)
 *
 * Returns 0 in $0.  Returns immediately when m <= 0, n <= 0 or
 * alpha == 0.
 *
 * Register usage inside the routine:
 *	$0 : return value (0)
 *	$1 : scratch for compare results
 *	$3 : a_offset2 (second column of the current column pair)
 *	$4 : is (start index of the next block of x)
 *	$5 : a_offset (column cursor for the current block)
 *	$6 : a_offset1 (first column of the pair); also the buffer
 *	     write pointer during the copy loop
 *	$7 : x_offset
 *	$8 : y_offset
 *	$9 : buffer (callee-saved, spilled to 0($sp))
 *	$16: m
 *	$17: n
 *	$18: Q
 *	$19: y
 *	$20: x
 *	$21: incx, forced to 0 when the caller passed incx == 1
 *	$22: incy
 *	$23: a
 *	$24: lda
 *	$25: i (inner loop counter)
 *	$26: return address (named in .frame)
 *	$27: min_i (length of the current block)
 *	$28: j (column-pair counter; this is $at, hence .set noat)
 *	$f1 : alpha
 *	$f9 : x temporary in the unrolled loop (callee-saved, spilled
 *	      to 8($sp))
 *	$f21: temp2 = alpha * y[j+1]
 *	$f25: temp1 = alpha * y[j]
 */

#define STACKSIZE 32

	.text
	.align 5
	.globl NAME
	.ent NAME

NAME:
	.frame $sp, STACKSIZE, $26, 0

#ifdef PROFILE
	ldgp	$gp, 0($27)
	lda	$28, _mcount
	jsr	$28, ($28), _mcount
#endif

	fmov	$f19, $f1		# alpha ($f19 is reused as a temporary later)
	ldq	$19, 0($sp)		# y
	lda	$sp, -STACKSIZE($sp)
	clr	$0

	ldl	$22, STACKSIZE +  8($sp)	# incy
	ldq	$23, STACKSIZE + 16($sp)	# a
	ldl	$24, STACKSIZE + 24($sp)	# lda
#ifndef PROFILE
	.prologue 0
#else
	.prologue 1
#endif

	/*
	 * Spill the callee-saved registers this routine overwrites.  Both
	 * stores must precede every branch to $End, which reloads them.
	 */
	stq	$9,  0($sp)
	stt	$f9, 8($sp)
	lda	$18, Q
	cmpeq	$21, 1, $1		# $1 = (incx == 1)

	/* Quick return: nothing to update when either dimension is empty. */
	ble	$16, $End
	ble	$17, $End

	ldq	$9, STACKSIZE + 32($sp)	# Buffer
	clr	$4			# is = 0
	cmovne	$1, 0, $21		# incx == 1 -> $21 = 0 (means: no copy needed)
	fbeq	$f19, $End		# alpha == 0 -> nothing to do
	.align 4

/* Outer loop: one pass per block of at most Q elements of x. */
$L21:
	subl	$16, $4, $27		# min_i = m - is
	SXADDQ	$4, $23, $5		# a_offset = a + is
	cmple	$27, $18, $1		# $1 = (min_i <= Q)
	mov	$9, $6			# copy destination = buffer

	cmoveq	$1, $18, $27		# if (min_i > Q) min_i = Q
	mov	$27, $25		# copy counter = min_i
	nop
	beq	$21, $Skip_Copying	# unit stride: read x in place
	.align 4

/* Pack min_i strided elements of x into the buffer; x advances with it. */
$Copy_Loop:
	LD	$f10, 0($20)
	SXADDQ	$21, $20, $20		# x += incx
	ST	$f10, 0($6)
	lda	$6, SIZE($6)

	lda	$25, -1($25)
	bgt	$25, $Copy_Loop
	.align 4

$Skip_Copying:
	sra	$17, 1, $28		# j = (n >> 1)
	mov	$19, $8			# y_offset = y
	addq	$4, $18, $4		# is += Q
	ble	$28, $L23
	.align 4

/* Column-pair loop: updates two columns of a per iteration. */
$L38:
	mov	$5, $6			# a_offset1 = a_offset
	LD	$f10, 0($8)
	SXADDQ	$22, $8, $8		# y_offset += incy
	LD	$f11, 0($8)

	SXADDQ	$22, $8, $8		# y_offset += incy
	lda	$28, -1($28)		# j --
	sra	$27, 3, $25		# i = (min_i >> 3)
	SXADDQ	$24, $5, $3		# a_offset2 = a_offset + lda

	MUL	$f1, $f10, $f25		# temp1 = alpha * *y_offset
	SXADDQ	$24, $3, $5		# a_offset = a_offset2 + lda
	mov	$20, $7			# x_offset = x
	cmovne	$21, $9, $7		# strided x: read the packed copy instead

	MUL	$f1, $f11, $f21		# temp2 = alpha * *y_offset
	ble	$25, $L27

	/* Preload the first four elements so the loop can be pipelined. */
	LD	$f22, 0*SIZE($7)
	LD	$f29, 1*SIZE($7)
	LD	$f28, 2*SIZE($7)
	LD	$f27, 3*SIZE($7)

	LD	$f26, 0*SIZE($6)
	LD	$f19, 1*SIZE($6)
	LD	$f30, 2*SIZE($6)
	LD	$f20, 3*SIZE($6)

	LD	$f17, 0*SIZE($3)
	MUL	$f25, $f22, $f10
	LD	$f18, 1*SIZE($3)
	MUL	$f25, $f29, $f13

	LD	$f16, 2*SIZE($3)
	MUL	$f25, $f28, $f11
	LD	$f0,  3*SIZE($3)
	MUL	$f25, $f27, $f12

	lda	$25, -1($25)
	ble	$25, $L28
	.align 4

/*
 * Unrolled by 8 over i, two columns at a time.  Each iteration also
 * loads elements 8..11 for the next one, so the last group of eight is
 * handled by $L28, which does not read ahead.
 */
$MainLoop:
	lds	$f31, 32*SIZE($3)	# load into $f31 discards the data (prefetch hint)
	unop
	lds	$f31, 32*SIZE($6)
	unop

	ADD	$f10, $f26, $f10
	LD	$f26, 4*SIZE($6)	# 8 Clocks
	MUL	$f21, $f22, $f24
	LD	$f9,  4*SIZE($7)

	ADD	$f13, $f19, $f13
	LD	$f19, 5*SIZE($6)
	MUL	$f21, $f29, $f23
	LD	$f29, 5*SIZE($7)

	ADD	$f11, $f30, $f11
	LD	$f30, 6*SIZE($6)
	MUL	$f21, $f28, $f15
	LD	$f28, 6*SIZE($7)

	ADD	$f12, $f20, $f12
	LD	$f20, 7*SIZE($6)
	MUL	$f21, $f27, $f14
	LD	$f27, 7*SIZE($7)

	ST	$f10, 0*SIZE($6)
	ADD	$f24, $f17, $f24
	MUL	$f25, $f9, $f10
	LD	$f17, 4*SIZE($3)	# 8 Clocks

	ST	$f13, 1*SIZE($6)
	ADD	$f23, $f18, $f23
	LD	$f18, 5*SIZE($3)
	MUL	$f25, $f29, $f13

	ST	$f11, 2*SIZE($6)
	ADD	$f15, $f16, $f15
	MUL	$f25, $f28, $f11
	LD	$f16, 6*SIZE($3)

	ST	$f12, 3*SIZE($6)
	ADD	$f14, $f0, $f14
	LD	$f0,  7*SIZE($3)
	MUL	$f25, $f27, $f12

	ST	$f24, 0*SIZE($3)
	unop
	unop
	unop

	ADD	$f10, $f26, $f10
	LD	$f26, 8*SIZE($6)
	MUL	$f21, $f9, $f24
	LD	$f22, 8*SIZE($7)	# 4 Clocks

	ST	$f23, 1*SIZE($3)
	unop
	unop
	unop

	ADD	$f13, $f19, $f13
	LD	$f19, 9*SIZE($6)
	MUL	$f21, $f29, $f23
	LD	$f29, 9*SIZE($7)

	ST	$f15, 2*SIZE($3)
	unop
	unop
	unop

	ADD	$f11, $f30, $f11
	LD	$f30, 10*SIZE($6)
	MUL	$f21, $f28, $f15
	LD	$f28, 10*SIZE($7)

	ST	$f14, 3*SIZE($3)
	unop
	unop
	lda	$25, -1($25)

	ADD	$f12, $f20, $f12
	LD	$f20, 11*SIZE($6)
	MUL	$f21, $f27, $f14
	LD	$f27, 11*SIZE($7)

	ST	$f10, 4*SIZE($6)
	ADD	$f24, $f17, $f24
	LD	$f17, 8*SIZE($3)	# 9 clocks
	MUL	$f25, $f22, $f10

	ST	$f13, 5*SIZE($6)
	ADD	$f23, $f18, $f23
	LD	$f18, 9*SIZE($3)
	MUL	$f25, $f29, $f13

	ST	$f11, 6*SIZE($6)
	ADD	$f15, $f16, $f15
	LD	$f16, 10*SIZE($3)
	MUL	$f25, $f28, $f11

	ST	$f12, 7*SIZE($6)
	ADD	$f14, $f0, $f14
	LD	$f0,  11*SIZE($3)
	MUL	$f25, $f27, $f12

	ST	$f24, 4*SIZE($3)
	lda	$7, 8*SIZE($7)
	ST	$f23, 5*SIZE($3)
	lda	$6, 8*SIZE($6)

	ST	$f15, 6*SIZE($3)
	ST	$f14, 7*SIZE($3)
	lda	$3, 8*SIZE($3)
	bgt	$25, $MainLoop
	.align 4

/* Final group of eight for the column pair: drains the pipeline. */
$L28:
	ADD	$f10, $f26, $f10
	LD	$f26, 4*SIZE($6)
	MUL	$f21, $f22, $f24
	LD	$f9,  4*SIZE($7)

	ADD	$f13, $f19, $f13
	LD	$f19, 5*SIZE($6)
	MUL	$f21, $f29, $f23
	LD	$f29, 5*SIZE($7)

	ADD	$f11, $f30, $f11
	LD	$f30, 6*SIZE($6)
	MUL	$f21, $f28, $f15
	LD	$f28, 6*SIZE($7)

	ADD	$f12, $f20, $f12
	LD	$f20, 7*SIZE($6)
	MUL	$f21, $f27, $f14
	LD	$f27, 7*SIZE($7)

	ST	$f10, 0*SIZE($6)
	ADD	$f24, $f17, $f24
	MUL	$f25, $f9, $f10
	LD	$f17, 4*SIZE($3)

	ST	$f13, 1*SIZE($6)
	ADD	$f23, $f18, $f23
	MUL	$f25, $f29, $f13
	LD	$f18, 5*SIZE($3)

	ST	$f11, 2*SIZE($6)
	ADD	$f15, $f16, $f15
	MUL	$f25, $f28, $f11
	LD	$f16, 6*SIZE($3)

	ST	$f12, 3*SIZE($6)
	ADD	$f14, $f0, $f14
	MUL	$f25, $f27, $f12
	LD	$f0,  7*SIZE($3)

	ADD	$f10, $f26, $f10
	ST	$f24, 0*SIZE($3)
	MUL	$f21, $f9, $f24
	unop

	ADD	$f13, $f19, $f13
	ST	$f23, 1*SIZE($3)
	MUL	$f21, $f29, $f23
	unop

	ADD	$f11, $f30, $f11
	ST	$f15, 2*SIZE($3)
	MUL	$f21, $f28, $f15
	unop

	ADD	$f12, $f20, $f12
	ST	$f14, 3*SIZE($3)
	MUL	$f21, $f27, $f14
	unop

	ADD	$f24, $f17, $f24
	ST	$f10, 4*SIZE($6)
	ADD	$f23, $f18, $f23
	ST	$f13, 5*SIZE($6)

	ADD	$f15, $f16, $f15
	ST	$f11, 6*SIZE($6)
	ADD	$f14, $f0, $f14
	ST	$f12, 7*SIZE($6)

	ST	$f24, 4*SIZE($3)
	lda	$6, 8*SIZE($6)
	ST	$f23, 5*SIZE($3)
	lda	$7, 8*SIZE($7)

	ST	$f15, 6*SIZE($3)
	nop
	ST	$f14, 7*SIZE($3)
	lda	$3, 8*SIZE($3)
	.align 4

/* Remainder of the column pair: min_i mod 8 elements, one at a time. */
$L27:
	and	$27, 7, $25		# i = (min_i & 7)
	unop
	unop
	ble	$25, $L26
	.align 4

$L37:
	LD	$f22, 0($7)
	LD	$f26, 0($6)
	LD	$f30, 0($3)
	lda	$25, -1($25)

	MUL	$f25, $f22, $f10
	MUL	$f21, $f22, $f11
	ADD	$f10, $f26, $f10
	ADD	$f11, $f30, $f11

	ST	$f10, 0($6)
	lda	$6, SIZE($6)
	ST	$f11, 0($3)
	lda	$3, SIZE($3)

	lda	$7, SIZE($7)
	bgt	$25, $L37
	.align 4

$L26:
	bgt	$28, $L38		# more column pairs left
	.align 4

/* Odd n: one trailing column remains after the pairs. */
$L23:
	blbc	$17, $L39		# n even -> no single column
	.align 4

$L54:
	LD	$f10, 0($8)
	mov	$5, $6			# a_offset1 = a_offset
	mov	$20, $7			# x_offset = x
	cmovne	$21, $9, $7		# strided x: read the packed copy instead

	SXADDQ	$22, $8, $8		# y_offset += incy
	MUL	$f1, $f10, $f25		# temp1 = alpha * *y_offset
	sra	$27, 2, $25		# i = (min_i >> 2)
	ble	$25, $L43

	LD	$f22, 0*SIZE($7)
	LD	$f29, 1*SIZE($7)
	LD	$f28, 2*SIZE($7)
	LD	$f27, 3*SIZE($7)

	LD	$f26, 0*SIZE($6)
	LD	$f19, 1*SIZE($6)
	LD	$f30, 2*SIZE($6)
	LD	$f20, 3*SIZE($6)

	subl	$25, 1, $25
	ble	$25, $L44
	.align 4

/* Single column, unrolled by 4; loads elements 4..7 for the next pass. */
$L48:
	MUL	$f25, $f22, $f10
	LD	$f22, 4*SIZE($7)
	MUL	$f25, $f29, $f13
	LD	$f29, 5*SIZE($7)

	MUL	$f25, $f28, $f11
	LD	$f28, 6*SIZE($7)
	MUL	$f25, $f27, $f12
	LD	$f27, 7*SIZE($7)

	ADD	$f10, $f26, $f10
	LD	$f26, 4*SIZE($6)
	ADD	$f13, $f19, $f13
	LD	$f19, 5*SIZE($6)

	ADD	$f11, $f30, $f11
	LD	$f30, 6*SIZE($6)
	ADD	$f12, $f20, $f12
	LD	$f20, 7*SIZE($6)

	ST	$f10, 0*SIZE($6)
	lda	$25, -1($25)
	ST	$f13, 1*SIZE($6)
	lda	$7, 4*SIZE($7)

	ST	$f11, 2*SIZE($6)
	ST	$f12, 3*SIZE($6)
	lda	$6, 4*SIZE($6)
	bgt	$25, $L48
	.align 4

/* Final group of four for the single column (no read-ahead). */
$L44:
	MUL	$f25, $f22, $f10
	MUL	$f25, $f29, $f13
	MUL	$f25, $f28, $f11
	MUL	$f25, $f27, $f12

	ADD	$f10, $f26, $f10
	ADD	$f13, $f19, $f13
	ADD	$f11, $f30, $f11
	ADD	$f12, $f20, $f12

	ST	$f10, 0*SIZE($6)
	ST	$f13, 1*SIZE($6)
	ST	$f11, 2*SIZE($6)
	lda	$7, 4*SIZE($7)

	ST	$f12, 3*SIZE($6)
	lda	$6, 4*SIZE($6)
	.align 4

/* Remainder of the single column: min_i mod 4 elements. */
$L43:
	and	$27, 3, $25		# i = (min_i & 3)
	ble	$25, $L42
	.align 4

$L53:
	LD	$f22, 0($7)
	LD	$f26, 0($6)
	MUL	$f25, $f22, $f10
	ADD	$f10, $f26, $f10

	lda	$7, SIZE($7)
	lda	$25, -1($25)
	ST	$f10, 0($6)
	lda	$6, SIZE($6)

	bgt	$25, $L53
	.align 4

$L42:
	bgt	$28, $L54		# j is 0 here after the pair loop
	.align 4

/* Advance to the next block of x; the strided case already moved x. */
$L39:
	SXADDQ	$18, $20, $6		# $6 = x + Q
	cmoveq	$21, $6, $20		# unit stride only: x += Q
	cmplt	$4, $16, $1		# $1 = (is < m)
	bne	$1, $L21
	.align 4

$End:
	ldq	$9,  0($sp)		# restore callee-saved registers
	ldt	$f9, 8($sp)
	lda	$sp, STACKSIZE($sp)
	ret

	.end NAME
	.ident VERSION
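/*
 * Code-viewer keyboard shortcuts (web page residue, not part of the source):
 *   Copy code: Ctrl + C
 *   Search code: Ctrl + F
 *   Full-screen mode: F11
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 *   Show shortcuts: ?
 */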