📄 libm_sincosl.s
字号:
}{ .mfb cmp.eq p15, p0 = 0x1, GR_Cis(p14) fms.s0 FR_ResultC = FR_FirstS, f1, FR_polyS(p15) br.ret.sptk b0};;{ .mmb // exit for sincosl stfe [sincos_pResSin] = FR_ResultS stfe [sincos_pResCos] = FR_ResultC br.ret.sptk b0};;SINCOSL_NORMAL_R://// Here if 2^-3 <= |r| < pi/4// THIS IS THE MAIN PATH//// Enter with r, c, and N_Inc having been computed//{ .mfi ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6 fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r nop.i 0}{ .mfi ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6 nop.f 0 nop.i 0};;{ .mmi ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 nop.i 0};;SINCOSL_NORMAL_R_0:// Entry for 2^-3 < |x| < pi/4.pred.rel "mutex",p9,p10{ .mmf ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1 ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1 frcpa.s1 FR_r_hi, p6 = f1, FR_r // r_hi = frcpa(r)};;{ .mfi nop.m 0 fma.s1 FR_polyS = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_polyC = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7 nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq nop.i 0};;SINCOSL_NORMAL_R_1:// Entry for pi/4 <= |x| < 2^24.pred.rel "mutex",p9,p10{ .mmf ldfe FR_PP_1 = [GR_ad_pp], 16 // Load PP_1_hi ldfe FR_QQ_1 = [GR_ad_qq], 16 // Load QQ_1 frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi // r_hi = frpca(frcpa(r))};;{ .mfi ldfe FR_PP_4 = [GR_ad_pp], 16 // Load PP_4 fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_6 // poly = rsq*poly+PP_6 and GR_N_SinCos = 0x1, GR_N_Inc}{ .mfi ldfe FR_QQ_4 = [GR_ad_qq], 16 // Load QQ_4 fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_6 // poly = rsq*poly+QQ_6 nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_corrS = FR_C_1, FR_rsq, f0 // corr = C_1 * rsq sub GR_N_SignS = GR_N_Inc, GR_N_SinCos}{ .mfi nop.m 0 fma.s1 FR_corrC = FR_S_1, FR_r_cubed, FR_r // corr = S_1 * r^3 + r add GR_N_SignC = GR_N_Inc, GR_N_SinCos};;{ .mfi ldfe FR_PP_3 = [GR_ad_pp], 16 // Load PP_3 fma.s1 FR_r_hi_sq = FR_r_hi, FR_r_hi, f0 // r_hi_sq = r_hi * r_hi tbit.z p7,p11 = GR_N_Inc, 0}{ .mfi 
ldfe FR_QQ_3 = [GR_ad_qq], 16 // Load QQ_3 fms.s1 FR_r_lo = FR_r, f1, FR_r_hi // r_lo = r - r_hi nop.i 0};;{ .mfi ldfe FR_PP_2 = [GR_ad_pp], 16 // Load PP_2 fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_5 // poly = rsq*poly+PP_5(p7) tbit.z.unc p9,p10 = GR_N_SignC, 1}{ .mfi ldfe FR_QQ_2 = [GR_ad_qq], 16 // Load QQ_2 fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_5 // poly = rsq*poly+QQ_5 nop.i 0};;{ .mfi ldfe FR_PP_1_lo = [GR_ad_pp], 16 // Load PP_1_lo fma.s1 FR_corrS = FR_corrS, FR_c, FR_c // corr = corr * c + c(p7) tbit.z.unc p7,p8 = GR_N_SignS, 1}{ .mfi nop.m 0 fnma.s1 FR_corrC = FR_corrC, FR_c, f0 // corr = -corr * c nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_U_loS = FR_r, FR_r_hi, FR_r_hi_sq // U_lo = r*r_hi+r_hi_sq(p11) tbit.z.unc p13,p14 = GR_N_SignC, 1}{ .mfi nop.m 0 fma.s1 FR_U_loC = FR_r_hi, f1, FR_r // U_lo = r_hi + r nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_U_hiS = FR_r_hi, FR_r_hi_sq, f0 // U_hi = r_hi*r_hi_sq(p11) tbit.z.unc p11,p12 = GR_N_SignS, 1}{ .mfi nop.m 0 fma.s1 FR_U_hiC = FR_QQ_1, FR_r_hi_sq, f1 // U_hi = QQ_1*r_hi_sq+1 nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_4 // poly = poly*rsq+PP_4 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_4 // poly = poly*rsq+QQ_4 nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_U_loS = FR_r, FR_r, FR_U_loS // U_lo = r * r + U_lo nop.i 0}{ .mfi nop.m 0 fma.s1 FR_U_loC = FR_r_lo, FR_U_loC, f0 // U_lo = r_lo * U_lo nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_U_hiS = FR_PP_1, FR_U_hiS, f0 // U_hi = PP_1 * U_hi nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_3 // poly = poly*rsq+PP_3 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_3 // poly = poly*rsq+QQ_3 nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_U_loS = FR_r_lo, FR_U_loS, f0 // U_lo = r_lo * U_lo nop.i 0}{ .mfi nop.m 0 fma.s1 FR_U_loC = FR_QQ_1,FR_U_loC, f0 // U_lo = QQ_1 * U_lo nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_U_hiS = FR_r, f1, FR_U_hiS // U_hi = r + U_hi nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_2 // poly = 
poly*rsq+PP_2 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_2 // poly = poly*rsq+QQ_2 nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_U_loS = FR_PP_1, FR_U_loS, f0 // U_lo = PP_1 * U_lo nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_1_lo // poly =poly*rsq+PP1lo nop.i 0}{ .mfi nop.m 0 fma.s1 FR_polyC = FR_rsq, FR_polyC, f0 // poly = poly*rsq nop.i 0};;.pred.rel "mutex",p8,p14{ .mfi nop.m 0(p8) fms.s0 FR_U_hiS = f1, f0, FR_U_hiS nop.i 0}{ .mfi nop.m 0(p14) fms.s0 FR_U_hiS = f1, f0, FR_U_hiS nop.i 0};;.pred.rel "mutex",p10,p12{ .mfi nop.m 0(p10) fms.s0 FR_U_hiC = f1, f0, FR_U_hiC nop.i 0}{ .mfi nop.m 0(p12) fms.s0 FR_U_hiC = f1, f0, FR_U_hiC nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_VS = FR_U_loS, f1, FR_corrS // V = U_lo + corr nop.i 0}{ .mfi nop.m 0 fma.s1 FR_VC = FR_U_loC, f1, FR_corrC // V = U_lo + corr nop.i 0};;{ .mfi nop.m 0 fma.s0 FR_inexact = FR_PP_5, FR_PP_4, f0 // Dummy op to set inexact nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_polyS = FR_r_cubed, FR_polyS, f0 // poly = poly*r^3 nop.i 0}{ .mfi nop.m 0 fma.s1 FR_polyC = FR_rsq, FR_polyC, f0 // poly = poly*rsq nop.i 0};;{ .mfi nop.m 0 fma.s1 FR_VS = FR_polyS, f1, FR_VS // V = poly + V nop.i 0}{ .mfi nop.m 0 fma.s1 FR_VC = FR_polyC, f1, FR_VC // V = poly + V nop.i 0};;.pred.rel "mutex",p7,p8{ .mfi nop.m 0(p7) fma.s0 FR_ResultS = FR_U_hiS, f1, FR_VS nop.i 0}{ .mfi nop.m 0(p8) fms.s0 FR_ResultS = FR_U_hiS, f1, FR_VS nop.i 0};;.pred.rel "mutex",p9,p10{ .mfi nop.m 0(p9) fma.s0 FR_ResultC = FR_U_hiC, f1, FR_VC nop.i 0}{ .mfi nop.m 0(p10) fms.s0 FR_ResultC = FR_U_hiC, f1, FR_VC nop.i 0};;.pred.rel "mutex",p11,p12{ .mfi nop.m 0(p11) fma.s0 FR_ResultS = FR_U_hiC, f1, FR_VC nop.i 0}{ .mfi nop.m 0(p12) fms.s0 FR_ResultS = FR_U_hiC, f1, FR_VC nop.i 0};;.pred.rel "mutex",p13,p14{ .mfi nop.m 0(p13) fma.s0 FR_ResultC = FR_U_hiS, f1, FR_VS nop.i 0}{ .mfb cmp.eq p15, p0 = 0x1, GR_Cis(p14) fms.s0 FR_ResultC = FR_U_hiS, f1, FR_VS(p15) br.ret.sptk b0};;{ .mmb // exit for sincosl stfe [sincos_pResSin] = FR_ResultS 
stfe [sincos_pResCos] = FR_ResultC br.ret.sptk b0};;SINCOSL_ZERO:{ .mfi nop.m 0 fmerge.s FR_ResultS = FR_Input_X, FR_Input_X // If sin, result = input nop.i 0}{ .mfb cmp.eq p15, p0 = 0x1, GR_Cis fma.s0 FR_ResultC = f1, f1, f0 // If cos, result=1.0(p15) br.ret.sptk b0};;{ .mmb // exit for sincosl stfe [sincos_pResSin] = FR_ResultS stfe [sincos_pResCos] = FR_ResultC br.ret.sptk b0};;SINCOSL_DENORMAL:{ .mmb getf.exp GR_signexp_x = FR_norm_x // Get sign and exponent of x nop.m 999 br.cond.sptk SINCOSL_COMMON2 // Return to common code};;SINCOSL_SPECIAL://// Path for Arg = +/- QNaN, SNaN, Inf// Invalid can be raised. SNaNs// become QNaNs//{ .mfi cmp.eq p15, p0 = 0x1, GR_Cis fmpy.s0 FR_ResultS = FR_Input_X, f0 nop.i 0}{ .mfb nop.m 0 fmpy.s0 FR_ResultC = FR_Input_X, f0(p15) br.ret.sptk b0};;{ .mmb // exit for sincosl stfe [sincos_pResSin] = FR_ResultS stfe [sincos_pResCos] = FR_ResultC br.ret.sptk b0};;GLOBAL_LIBM_END(__libm_sincosl)// *******************************************************************// *******************************************************************// *******************************************************************//// Special Code to handle very large argument case.// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63// The interface is custom:// On input:// (Arg or x) is in f8// On output:// r is in f8// c is in f9// N is in r8// Be sure to allocate at least 2 GP registers as output registers for// __libm_pi_by_2_reduce. This routine uses r62-63. These are used as// scratch registers within the __libm_pi_by_2_reduce routine (for speed).//// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. 
// We use this to eliminate save/restore of key fp registers in this calling
// function.
//
// *******************************************************************
// *******************************************************************
// *******************************************************************
//
// __libm_callout -- entered at SINCOSL_ARG_TOO_LARGE
//
// Wrapper around the call to __libm_pi_by_2_reduce:
//   1. save ar.pfs, gp and b0 in GR_SAVE_PFS, GR_SAVE_GP and GR_SAVE_B0
//   2. form the constants 2^-3 and -(2^-3) with setf.exp
//   3. call __libm_pi_by_2_reduce
//   4. copy the returned N (r8) into GR_N_Inc and restore b0, gp, ar.pfs
//   5. branch to SINCOSL_SMALL_R when -2^-3 < r < 2^-3,
//      otherwise to SINCOSL_NORMAL_R
//
// This block never returns to its caller itself; it always leaves through
// one of the two branches in step 5.
//
// NOTE(review): FR_Two_to_M3 and FR_Neg_Two_to_M3 are written before the
// call and read after it, so they must survive the callee.  Presumably they
// are allocated inside the preserved f10-15 / f71-127 ranges -- confirm
// against the register definitions at the top of the file.
//
LOCAL_LIBM_ENTRY(__libm_callout)
SINCOSL_ARG_TOO_LARGE:
.prologue
{ .mfi
      nop.f 0
.save ar.pfs,GR_SAVE_PFS
      mov GR_SAVE_PFS=ar.pfs                        // Save ar.pfs
};;

{ .mmi
      setf.exp FR_Two_to_M3 = GR_exp_2_to_m3        // Form 2^-3
      mov GR_SAVE_GP=gp                             // Save gp
.save b0, GR_SAVE_B0
      mov GR_SAVE_B0=b0                             // Save b0
};;

.body
//
// Call argument reduction with x in f8
// Returns with N in r8, r in f8, c in f9
// Assumes f71-127 are preserved across the call
//
{ .mib
      setf.exp FR_Neg_Two_to_M3 = GR_exp_m2_to_m3   // Form -(2^-3)
      nop.i 0
      br.call.sptk b0=__libm_pi_by_2_reduce#
};;

// First half of the range test: p6 = (r < 2^-3).
{ .mfi
      mov GR_N_Inc = r8                             // N returned by the reduction
      fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3
      mov b0 = GR_SAVE_B0                           // Restore return address
};;

// Second half, executed under p6: p6 = (r > -(2^-3)).  Because the compare
// is the .unc form, p6 is cleared when the qualifying p6 was already false,
// so afterwards p6 is set only if -2^-3 < r < 2^-3.
{ .mfi
      mov gp = GR_SAVE_GP                           // Restore gp
(p6)  fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3
      mov ar.pfs = GR_SAVE_PFS                      // Restore ar.pfs
};;

{ .mbb
      nop.m 0
(p6)  br.cond.spnt SINCOSL_SMALL_R    // Branch if |r|< 2^-3 for |x| >= 2^63
      br.cond.sptk SINCOSL_NORMAL_R   // Branch if |r|>=2^-3 for |x| >= 2^63
};;

LOCAL_LIBM_END(__libm_callout)
.type __libm_pi_by_2_reduce#,@function
.global __libm_pi_by_2_reduce#
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -