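// File: libm_sincos_large.s
// (The "font size" control label that followed the file name was source-viewer page chrome, not part of the file.)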
}
//
// NOTE(review): this excerpt begins inside a code path whose start is not
// visible here.  The bundles down to the first "br.ret.sptk b0" below are
// the tail of that path; they finish a polynomial evaluation predicated on
// p9/p10 and write the result to FR_Input_X under p11/p12.
//
// NOTE(review): throughout this file the inherited pseudo-code comments do
// not always sit next to the instruction they describe (for example the
// comment "poly_hi = r * poly_hi" precedes an fma that writes FR_poly).
// Treat the instructions as authoritative.
//
{ .mfi
        nop.m 999
(p9)    fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1
        nop.i 999
}
{ .mfi
        nop.m 999
//
//      if (i_1 == 0):
//      poly_lo = FR_rsq * S_5 + S_4
//      poly_hi = FR_rsq * S_2 + S_1
//
(p10)   fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1 == 0):
//      Z = Z * r for only one of the small r cases - not there
//      in original implementation notes.
//
(p9)    fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1
        nop.i 999
}
{ .mfi
        nop.m 999
(p10)   fma.d.s1 FR_C_1 = FR_C_1, FR_C_1, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p9)    fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
        nop.i 999
}
{ .mfi
        nop.m 999
//
//      poly_lo = FR_rsq * poly_lo + S_3
//      poly_hi = FR_rsq * poly_hi
//
(p10)   fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1 == 0): dummy fmpy's to flag inexact
//      r = 1
//
(p9)    fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0
        nop.i 999
}
{ .mfi
        nop.m 999
//
//      poly_hi = r * poly_hi
//
        fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p12)   fms.s1 FR_r = f0, f1, FR_r
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      poly_hi = Z * poly_lo + c
//      if i_0 == 1: r = -r
//
        fma.s1 FR_poly = FR_poly, f1, FR_poly_hi
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p12)   fms.s1 FR_Input_X = FR_r, f1, FR_poly
        nop.i 999
}
{ .mfb
        nop.m 999
//
//      poly = poly + poly_hi
//
(p11)   fma.s1 FR_Input_X = FR_r, f1, FR_poly
//
//      if (i_0 == 0) Result = r + poly
//      if (i_0 != 0) Result = r - poly
//
        br.ret.sptk b0 ;;
}

// ------------------------------------------------------------------
// SINCOS_NORMAL_R
//
// Reached (within this excerpt) from the final branch of
// __libm_callout_2.  Inputs read here: FR_r, FR_c, GR_N_Inc.
//
//   i_1 = bit 0 of GR_N_Inc:  p9  <=> i_1 == 0,  p10 <=> i_1 != 0
//   i_0 = bit 1 of GR_N_Inc:  p11 <=> i_0 == 0,  p12 <=> i_0 != 0
//
// The p9 path loads the PP_* / C_1 coefficients and the p10 path the
// QQ_* / S_1 coefficients from FSINCOS_CONSTANTS (base + 224 and
// base + 384 respectively), all through post-incremented ldfe loads.
// The result is written to FR_Input_X as
//   FR_prelim * U_hi + V   (p11)    or    FR_prelim * U_hi - V   (p12)
// and control returns through b0.
// ------------------------------------------------------------------
SINCOS_NORMAL_R:
{ .mii
        nop.m 999
        extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
//
//      Set table_ptr1 and table_ptr2 to base address of
//      constant table.
        cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
}
{ .mfi
        nop.m 999
        fma.s1 FR_rsq = FR_r, FR_r, f0
        extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
}
{ .mfi
        nop.m 999
        frcpa.s1 FR_r_hi, p6 = f1, FR_r
        cmp.eq.unc p11, p12 = 0x0, GR_i_0
};;
// ******************************************************************
// ******************************************************************
// ******************************************************************
//
//      r and c have been computed.
//      We know whether this is the sine or cosine routine.
//      Make sure ftz mode is set - should be automatic when using wre
//      Get [i_0,i_1] - two lsb of N_fix_gr alone.
//
{ .mmi
        nop.m 999
        addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
        nop.i 999
};;
{ .mmi
        ld8 GR_Table_Base = [GR_Table_Base]
        nop.m 999
        nop.i 999
};;
{ .mfi
(p10)   add GR_Table_Base = 384, GR_Table_Base
//(p12) fms.s1 FR_Input_X = f0, f1, f1
(p12)   fms.s1 FR_prelim = f0, f1, f1
(p9)    add GR_Table_Base = 224, GR_Table_Base ;;
}
{ .mmf
        nop.m 999
(p10)   ldfe FR_QQ_8 = [GR_Table_Base], 16
//
//      if (i_1==0) poly = poly * FR_rsq + PP_1_lo
//      else        poly = FR_rsq * poly
//
//(p11) fma.s1 FR_Input_X = f0, f1, f1 ;;
(p11)   fma.s1 FR_prelim = f0, f1, f1 ;;
}
{ .mmf
(p10)   ldfe FR_QQ_7 = [GR_Table_Base], 16
//
//      Adjust table pointers based on i_0
//      Compute rsq = r * r
//
(p9)    ldfe FR_PP_8 = [GR_Table_Base], 16
        fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 ;;
}
{ .mmf
(p9)    ldfe FR_PP_7 = [GR_Table_Base], 16
(p10)   ldfe FR_QQ_6 = [GR_Table_Base], 16
//
//      Load PP_8 and QQ_8; PP_7 and QQ_7
//
        frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;
}
//
//      if (i_1==0) poly = PP_7 + FR_rsq * PP_8.
//      else        poly = QQ_7 + FR_rsq * QQ_8.
//
{ .mmb
(p9)    ldfe FR_PP_6 = [GR_Table_Base], 16
(p10)   ldfe FR_QQ_5 = [GR_Table_Base], 16
        nop.b 999 ;;
}
{ .mmb
(p9)    ldfe FR_PP_5 = [GR_Table_Base], 16
(p10)   ldfe FR_S_1 = [GR_Table_Base], 16
        nop.b 999 ;;
}
{ .mmb
(p10)   ldfe FR_QQ_1 = [GR_Table_Base], 16
(p9)    ldfe FR_C_1 = [GR_Table_Base], 16
        nop.b 999 ;;
}
{ .mmi
(p10)   ldfe FR_QQ_4 = [GR_Table_Base], 16 ;;
(p9)    ldfe FR_PP_1 = [GR_Table_Base], 16
        nop.i 999 ;;
}
{ .mmf
(p10)   ldfe FR_QQ_3 = [GR_Table_Base], 16
//
//      if (i_1=0) corr = corr + c*c
//      else       corr = corr * c
//
(p9)    ldfe FR_PP_4 = [GR_Table_Base], 16
(p10)   fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 ;;
}
//
//      if (i_1=0) poly = rsq * poly + PP_5
//      else       poly = rsq * poly + QQ_5
//      Load PP_4 or QQ_4
//
{ .mmf
(p9)    ldfe FR_PP_3 = [GR_Table_Base], 16
(p10)   ldfe FR_QQ_2 = [GR_Table_Base], 16
//
//      r_hi = frcpa(frcpa(r)).
//      r_cube = r * FR_rsq.
//
(p9)    fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 ;;
}
//
//      Do dummy multiplies so inexact is always set.
//
{ .mfi
(p9)    ldfe FR_PP_2 = [GR_Table_Base], 16
//
//      r_lo = r - r_hi
//
(p9)    fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0
        nop.i 999 ;;
}
{ .mmf
        nop.m 999
(p9)    ldfe FR_PP_1_lo = [GR_Table_Base], 16
(p10)   fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1=0) U_lo = r_hi * r_hi
//      else       U_lo = r_hi + r
//
(p9)    fma.s1 FR_corr = FR_C_1, FR_rsq, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1=0) corr = C_1 * rsq
//      else       corr = S_1 * r_cubed + r
//
(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6
        nop.i 999
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_U_lo = FR_r_hi, f1, FR_r
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1=0) U_hi = r_hi + U_hi
//      else       U_hi = QQ_1 * U_hi + 1
//
(p9)    fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo
        nop.i 999
}
{ .mfi
        nop.m 999
//
//      U_hi = r_hi * r_hi
//
        fms.s1 FR_r_lo = FR_r, f1, FR_r_hi
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      Load PP_1, PP_6, PP_5, and C_1
//      Load QQ_1, QQ_6, QQ_5, and S_1
//
        fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5
        nop.i 999
}
{ .mfi
        nop.m 999
(p10)   fnma.s1 FR_corr = FR_corr, FR_c, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1=0) U_lo = r * r_hi + U_lo
//      else       U_lo = r_lo * U_lo
//
(p9)    fma.s1 FR_corr = FR_corr, FR_c, FR_c
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5
        nop.i 999
}
{ .mfi
        nop.m 999
//
//      if (i_1 =0) U_hi = r + U_hi
//      if (i_1 =0) U_lo = r_lo * U_lo
//
//
(p9)    fma.d.s1 FR_PP_5 = FR_PP_5, FR_PP_4, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p9)    fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo
        nop.i 999
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1=0) poly = poly * rsq + PP_6
//      else       poly = poly * rsq + QQ_6
//
(p9)    fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4
        nop.i 999
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.d.s1 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1!=0) U_hi = PP_1 * U_hi
//      if (i_1!=0) U_lo = r * r + U_lo
//      Load PP_3 or QQ_3
//
(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p9)    fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
        nop.i 999
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p9)    fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      Load PP_2, QQ_2
//
(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1==0) poly = FR_rsq * poly + PP_3
//      else        poly = FR_rsq * poly + QQ_3
//      Load PP_1_lo
//
(p9)    fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1 =0) poly = poly * rsq + pp_r4
//      else        poly = poly * rsq + qq_r4
//
(p9)    fma.s1 FR_U_hi = FR_r, f1, FR_U_hi
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1==0) U_lo = PP_1_hi * U_lo
//      else        U_lo = QQ_1 * U_lo
//
(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_0==0) Result = 1
//      else        Result = -1
//
        fma.s1 FR_V = FR_U_lo, f1, FR_corr
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1==0) poly = FR_rsq * poly + PP_2
//      else        poly = FR_rsq * poly + QQ_2
//
(p9)    fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
(p10)   fma.s1 FR_poly = FR_rsq, FR_poly, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      V = U_lo + corr
//
(p9)    fma.s1 FR_poly = FR_r_cubed, FR_poly, f0
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//
//      if (i_1==0) poly = r_cube * poly
//      else        poly = FR_rsq * poly
//
        fma.s1 FR_V = FR_poly, f1, FR_V
        nop.i 999 ;;
}
{ .mfi
        nop.m 999
//(p12) fms.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
(p12)   fms.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V
        nop.i 999
}
{ .mfb
        nop.m 999
//
//      V = V + poly
//
//(p11) fma.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
(p11)   fma.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V
//
//      if (i_0==0) Result = Result * U_hi + V
//      else        Result = Result * U_hi - V
//
        br.ret.sptk b0 ;;
}

//
//      If cosine, FR_Input_X = 1
//      If sine, FR_Input_X = +/-Zero (Input FR_Input_X)
//      Results are exact, no exceptions
//
// p6 is set when GR_Sin_or_Cos == 1 (FR_Input_X is overwritten with f1);
// p7 otherwise (FR_Input_X is merged with itself, i.e. left as it was).
//
SINCOS_ZERO:
{ .mmb
        cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
        nop.m 999
        nop.b 999 ;;
}
{ .mfi
        nop.m 999
(p7)    fmerge.s FR_Input_X = FR_Input_X, FR_Input_X
        nop.i 999
}
{ .mfb
        nop.m 999
(p6)    fmerge.s FR_Input_X = f1, f1
        br.ret.sptk b0 ;;
}

SINCOS_SPECIAL:
//
//      Path for Arg = +/- QNaN, SNaN, Inf
//      Invalid can be raised. SNaNs
//      become QNaNs
//
// The result is produced by multiplying the input by f0 and returning.
//
{ .mfb
        nop.m 999
        fmpy.s1 FR_Input_X = FR_Input_X, f0
        br.ret.sptk b0 ;;
}
GLOBAL_LIBM_END(__libm_cos_large)

// *******************************************************************
// *******************************************************************
// *******************************************************************
//
//     Special Code to handle very large argument case.
//     Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
//     The interface is custom:
//       On input:
//         (Arg or x) is in f8
//       On output:
//         r is in f8
//         c is in f9
//         N is in r8
//     Be sure to allocate at least 2 GP registers as output registers for
//     __libm_pi_by_2_reduce. This routine uses r49-50. These are used as
//     scratch registers within the __libm_pi_by_2_reduce routine (for speed).
//
//     We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We
//     use this to eliminate save/restore of key fp registers in this calling
//     function.
//
// *******************************************************************
// *******************************************************************
// *******************************************************************
//
// Flow of the routine below:
//   1. Save ar.pfs, gp and b0 in GR_SAVE_PFS / GR_SAVE_GP / GR_SAVE_B0.
//   2. Step GR_Table_Base1 back by 16 and load two single-precision
//      thresholds, FR_Two_to_M3 and FR_Neg_Two_to_M3.
//   3. Call __libm_pi_by_2_reduce, then form GR_N_Inc = GR_Sin_or_Cos + r8.
//   4. Restore b0, gp and ar.pfs.
//   5. p6 = (FR_r < FR_Two_to_M3) && (FR_r > FR_Neg_Two_to_M3);
//      branch to SINCOS_SMALL_R when p6 is set, else to SINCOS_NORMAL_R.
//
// NOTE(review): the branch comments below say the threshold is 1/4, while
// the register names (Two_to_M3) suggest 2**-3.  The constant table is not
// visible in this excerpt - confirm which one is correct.
//
LOCAL_LIBM_ENTRY(__libm_callout_2)
SINCOS_ARG_TOO_LARGE:
.prologue
// Readjust Table ptr
{ .mfi
        adds GR_Table_Base1 = -16, GR_Table_Base1
        nop.f 999
.save ar.pfs,GR_SAVE_PFS
        mov GR_SAVE_PFS=ar.pfs                  // Save ar.pfs
};;

{ .mmi
        ldfs FR_Two_to_M3 = [GR_Table_Base1],4
        mov GR_SAVE_GP=gp                       // Save gp
.save b0, GR_SAVE_B0
        mov GR_SAVE_B0=b0                       // Save b0
};;

.body
//
// Call argument reduction with x in f8
// Returns with N in r8, r in f8, c in f9
// Assumes f71-127 are preserved across the call
//
{ .mib
        ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0
        nop.i 0
        br.call.sptk b0=__libm_pi_by_2_reduce#
};;

{ .mfi
        add GR_N_Inc = GR_Sin_or_Cos,r8
        fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3
        mov b0 = GR_SAVE_B0                     // Restore return address
};;

{ .mfi
        mov gp = GR_SAVE_GP                     // Restore gp
(p6)    fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3
        mov ar.pfs = GR_SAVE_PFS                // Restore ar.pfs
};;

{ .mbb
        nop.m 999
(p6)    br.cond.spnt SINCOS_SMALL_R             // Branch if |r| < 1/4
        br.cond.sptk SINCOS_NORMAL_R ;;         // Branch if 1/4 <= |r| < pi/4
}

LOCAL_LIBM_END(__libm_callout_2)
.type __libm_pi_by_2_reduce#,@function
.global __libm_pi_by_2_reduce#
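// ------------------------------------------------------------------
// Source-viewer residue (not part of the assembly source):
// keyboard-shortcut help from the web page this file was copied from.
//   Copy code ............ Ctrl + C
//   Search code .......... Ctrl + F
//   Full-screen mode ..... F11
//   Toggle theme ......... Ctrl + Shift + D
//   Show shortcuts ....... ?
//   Increase font size ... Ctrl + =
//   Decrease font size ... Ctrl + -
// ------------------------------------------------------------------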