📄 e_sinh.s
字号:
// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax{ .mmf nop.m 999(p0) ldfe sinh_FR_P1 = [r34],16 (p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;; }{ .mmi(p0) ldfe sinh_FR_P2 = [r34],16 ;; (p0) ldfe sinh_FR_P3 = [r34],16 nop.i 999 ;;}{ .mmi(p0) ldfe sinh_FR_P4 = [r34],16 ;; (p0) ldfe sinh_FR_P5 = [r34],16 nop.i 999 ;;}{ .mfi(p0) ldfe sinh_FR_P6 = [r34],16 (p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0 nop.i 999 ;;}// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even { .mfi nop.m 999(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3 nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1 nop.i 999}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4 nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0 nop.i 999}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2 nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0 nop.i 999 ;;}// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even){ .mfi nop.m 999(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0 nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp nop.i 999 ;;}// Calculate sinh_FR_SINH = Y_hi + Y_lo. Note that ax = Y_hi{ .mfi nop.m 999(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo nop.i 999 ;;}// Dummy multiply to generate inexact{ .mfi nop.m 999(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones nop.i 999}// Calculate f8 = sign * (Y_hi + Y_lo)// Go to return{ .mfb nop.m 999(p0) fma.d.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0 (p0) br.ret.sptk b0 ;; }L(SINH_BY_TBL): // Now that we are at TBL; so far all we know is that |x| >= 0.25.// The first two steps are the same for TBL and EXP, but if we are HUGE// we want to leave now. 
// Double-extended:// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true)// Double// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true)// Single// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true){ .mlx nop.m 999(p0) movl r32 = 0x0000000000010009 ;; }{ .mfi(p0) setf.exp f9 = r32 nop.f 999 nop.i 999 ;;}{ .mfi nop.m 999(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9 nop.i 999 ;;}{ .mib nop.m 999 nop.i 999(p6) br.cond.spnt L(SINH_HUGE) ;; }// r32 = 1// r34 = N-1 // r35 = N// r36 = j// r37 = N+1// TBL can never overflow// sinh(x) = sinh(B+R)// = sinh(B)cosh(R) + cosh(B)sinh(R)// // ax = |x| = M*log2/64 + R// B = M*log2/64// M = 64*N + j // We will calculate M and get N as (M-j)/64// The division is a shift.// exp(B) = exp(N*log2 + j*log2/64)// = 2^N * 2^(j*log2/64)// sinh(B) = 1/2(e^B -e^-B)// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64)) // sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64)) // cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64)) // 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)// R = ax - M*log2/64// R = ax - M*log2_by_64_hi - M*log2_by_64_lo// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)// = 1 + p_odd + p_even// where p_odd uses the A coefficients and p_even uses the B coefficients// So sinh(R) = ((1 + p_odd + p_even) - (1 - p_odd + p_even))/2 = p_odd// cosh(R) = 1 + p_even// sinh(B) = S_hi + S_lo// cosh(B) = C_hi// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R)// ******************************************************// STEP 1 (TBL and EXP)// ******************************************************// Get the following constants. // f9 = Inv_log2by64// f10 = log2by64_hi// f11 = log2by64_lo{ .mmi(p0) adds r32 = 0x1,r0 (p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp nop.i 999};;{ .mmi ld8 r34 = [r34] nop.m 999 nop.i 999};;// We want 2^(N-1) and 2^(-N-1). 
So bias N-1 and -N-1 and// put them in an exponent.// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1)// r39 = 0xffff + (N-1) = 0xffff +N -1// r40 = 0xffff - (N +1) = 0xffff -N -1{ .mlx nop.m 999(p0) movl r38 = 0x000000000000fffe ;; }{ .mmi(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;; (p0) ldfe sinh_FR_log2by64_hi = [r34],16 nop.i 999 ;;}{ .mbb(p0) ldfe sinh_FR_log2by64_lo = [r34],16 nop.b 999 nop.b 999 ;;}// Get the A coefficients// f9 = A_1// f10 = A_2// f11 = A_3{ .mmi nop.m 999(p0) addl r34 = @ltoff(double_sinh_ab_table), gp nop.i 999};;{ .mmi ld8 r34 = [r34] nop.m 999 nop.i 999};;// Calculate M and keep it as integer and floating point.// f38 = M = round-to-integer(x*Inv_log2by64)// sinh_FR_M = M = truncate(ax/(log2/64))// Put the significand of M in r35// and the floating point representation of M in sinh_FR_M{ .mfi nop.m 999(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0 nop.i 999}{ .mfi(p0) ldfe sinh_FR_A1 = [r34],16 nop.f 999 nop.i 999 ;;}{ .mfi nop.m 999(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M nop.i 999 ;;}{ .mfi nop.m 999(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp nop.i 999 ;;}{ .mfi(p0) getf.sig r35 = sinh_FR_M_temp nop.f 999 nop.i 999 ;;}// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. 
It // has a range of -32 thru 31.// r35 = M// r36 = j { .mii nop.m 999 nop.i 999 ;;(p0) and r36 = 0x3f, r35 ;; }// Calculate R// f13 = f44 - f12*f10 = ax - M*log2by64_hi// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo{ .mfi nop.m 999(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X nop.i 999}{ .mfi(p0) ldfe sinh_FR_A2 = [r34],16 nop.f 999 nop.i 999 ;;}{ .mfi nop.m 999(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp nop.i 999}// Get the B coefficients// f15 = B_1// f32 = B_2// f33 = B_3{ .mmi(p0) ldfe sinh_FR_A3 = [r34],16 ;; (p0) ldfe sinh_FR_B1 = [r34],16 nop.i 999 ;;}{ .mmi(p0) ldfe sinh_FR_B2 = [r34],16 ;; (p0) ldfe sinh_FR_B3 = [r34],16 nop.i 999 ;;}{ .mii nop.m 999(p0) shl r34 = r36, 0x2 ;; (p0) sxt1 r37 = r34 ;; }// ******************************************************// STEP 2 (TBL and EXP)// ******************************************************// Calculate Rsquared and Rcubed in preparation for p_even and p_odd// f12 = R*R*R// f13 = R*R// f14 = R <== from above{ .mfi nop.m 999(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0 (p0) shr r36 = r37, 0x2 ;; }// r34 = M-j = r35 - r36// r35 = N = (M-j)/64{ .mii(p0) sub r34 = r35, r36 nop.i 999 ;;(p0) shr r35 = r34, 0x6 ;; }{ .mii(p0) sub r40 = r38, r35 (p0) adds r37 = 0x1, r35 (p0) add r39 = r38, r35 ;; }// Get the address of the J table, add the offset, // addresses are sinh_AD_mJ and sinh_AD_J, get the T value// f32 = T(j)_hi// f33 = T(j)_lo// f34 = T(-j)_hi// f35 = T(-j)_lo{ .mmi(p0) sub r34 = r35, r32 (p0) addl r37 = @ltoff(double_sinh_j_table), gp nop.i 999};;{ .mmi ld8 r37 = [r37] nop.m 999 nop.i 999};;{ .mfi nop.m 999(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0 nop.i 999}// ******************************************************// STEP 3 Now decide if we need to branch to EXP// ******************************************************// Put 32 in f9; p6 true if x < 32// Go to EXP if |x| >= 32 { .mlx nop.m 999(p0) movl r32 = 
0x0000000000010004 ;; }// Calculate p_even// f34 = B_2 + Rsq *B_3// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3)){ .mfi nop.m 999(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2 nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1 nop.i 999}// Calculate p_odd// f34 = A_2 + Rsq *A_3// f35 = A_1 + Rsq * (A_2 + Rsq *A_3)// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3)){ .mfi nop.m 999(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2 nop.i 999 ;;}{ .mfi(p0) setf.exp sinh_FR_N_temp1 = r39 nop.f 999 nop.i 999 ;;}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0 nop.i 999}{ .mfi nop.m 999(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1 nop.i 999 ;;}{ .mfi(p0) setf.exp f9 = r32 nop.f 999 nop.i 999 ;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -