📄 libm_sincosl.s
字号:
//
// NOTE(review): this listing begins in the middle of a routine.  The bundles
// up to the SINCOSL_LARGER_S_TINY label are the tail of a preceding path whose
// start is not visible here; they are reproduced unchanged.
//
// Tail of the preceding path: under p8, test -2^-3 < r < 2^-3 (p12 = inside,
// p13 = outside) and dispatch on the result.
//
{ .mfi
        nop.m 0
(p8)    fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
        nop.i 0
}
;;
{ .mfi
        nop.m 0
(p12)   fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
        nop.i 0
}
;;
{ .mfi
        nop.m 0
(p8)    fma.s1 FR_c = FR_c, f1, FR_w            // c = c + w
        nop.i 0
}
{ .mfb
        nop.m 0
(p9)    fms.s1 FR_w = FR_N_0, FR_d_2, FR_w      // w = N_0 * d_2 - w
(p12)   br.cond.spnt SINCOSL_SMALL_R            // Branch if |r| < 2^-3
                                                // and 2^24 <= |x| < 2^63
}
;;
{ .mib
        nop.m 0
        nop.i 0
(p13)   br.cond.sptk SINCOSL_NORMAL_R           // Branch if |r| >= 2^-3
                                                // and 2^24 <= |x| < 2^63
}
;;

SINCOSL_LARGER_S_TINY:
// Here if |s| < 2^-14, and 2^24 <= |x| < 2^63
//
// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true.
// The remaining stuff is for Case 4.
// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
// Note: the (-) is still missing for V_lo.
// Small s: w = w + N_0 * d_2
// Note: the (-) is now incorporated in w.
//
// Result selection used below (derived from the integer ops in this block):
//   N_SinCos = N_Inc & 1
//   N_SignS  = N_Inc - N_SinCos,  N_SignC = N_Inc + N_SinCos
//   bit 0 of N_Inc clear (p8):  ResultS is built from the S polynomial and
//                               ResultC from the C polynomial
//   bit 0 of N_Inc set  (p12):  the two polynomials are swapped
//   bit 1 of N_SignS / N_SignC selects whether the result is negated.
//
{ .mfi
        and GR_N_SinCos = 0x1, GR_N_Inc
        fcmp.ge.unc.s1 p6, p7 = FR_U_hiabs, FR_V_hiabs  // p6: |U_hi| >= |V_hi|
        tbit.z p8,p12 = GR_N_Inc, 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_t = FR_U_lo, f1, FR_V_lo      // t = U_lo + V_lo
        nop.i 0
}
;;
{ .mfi
        sub GR_N_SignS = GR_N_Inc, GR_N_SinCos
(p6)    fms.s1 FR_a = FR_U_hi, f1, FR_A         // a = U_hi - A
        add GR_N_SignC = GR_N_Inc, GR_N_SinCos
}
{ .mfi
        nop.m 0
(p7)    fma.s1 FR_a = FR_V_hi, f1, FR_A         // a = V_hi + A
        nop.i 0
}
;;
{ .mmf
        ldfe FR_C_1 = [GR_ad_c], 16
        ldfe FR_S_1 = [GR_ad_s], 16
        fma.s1 FR_C_hi = FR_s, f1, FR_A         // C_hi = S + A
}
;;
{ .mmi
        ldfe FR_C_2 = [GR_ad_c], 64
        ldfe FR_S_2 = [GR_ad_s], 64
(p8)    tbit.z.unc p10,p11 = GR_N_SignC, 1      // p11: negate ResultC
}
;;
//
// r and c have been computed.
// Make sure ftz mode is set - should be automatic when using wre
// |r| < 2**(-3)
// Get [i_0,i_1] - two lsb of N_fix.
//
// For larger u than v: a = U_hi - A
// Else a = V_hi - A (do an add to account for missing (-) on V_hi)
//
{ .mfi
        nop.m 0
        fma.s1 FR_t = FR_t, f1, FR_w            // t = t + w
(p8)    tbit.z.unc p8,p9 = GR_N_SignS, 1        // p9: negate ResultS
}
{ .mfi
        nop.m 0
(p6)    fms.s1 FR_a = FR_a, f1, FR_V_hi         // a = a - V_hi
        nop.i 0
}
;;
//
// If u > v: a = (U_hi - A) + V_hi
// Else a = (V_hi - A) + U_hi
// In each case account for negative missing from V_hi.
//
{ .mfi
        nop.m 0
        fms.s1 FR_C_lo = FR_s, f1, FR_C_hi      // C_lo = S - C_hi
(p12)   tbit.z.unc p14,p15 = GR_N_SignC, 1      // p15: negate ResultC
}
{ .mfi
        nop.m 0
(p7)    fms.s1 FR_a = FR_U_hi, f1, FR_a         // a = U_hi - a
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_C_lo = FR_C_lo, f1, FR_A      // C_lo = (S - C_hi) + A
(p12)   tbit.z.unc p12,p13 = GR_N_SignS, 1      // p13: negate ResultS
}
{ .mfi
        nop.m 0
        fma.s1 FR_t = FR_t, f1, FR_a            // t = t + a
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_r = FR_C_hi, f1, FR_C_lo      // r = C_hi + C_lo
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_C_lo = FR_C_lo, f1, FR_t      // C_lo = C_lo + t
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_rsq = FR_r, FR_r, f0          // rsq = r * r
        nop.i 0
}
{ .mfi
        nop.m 0
        fms.s1 FR_c = FR_C_hi, f1, FR_r         // c = C_hi - r
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_FirstS = f0, f1, FR_r         // FirstS = r
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_FirstC = f0, f1, f1           // FirstC = 1.0
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_polyS = FR_rsq, FR_S_2, FR_S_1        // polyS = rsq*S_2 + S_1
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_polyC = FR_rsq, FR_C_2, FR_C_1        // polyC = rsq*C_2 + C_1
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_r_cubed = FR_rsq, FR_r, f0    // r_cubed = rsq * r
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_c = FR_c, f1, FR_C_lo         // c = c + C_lo
        nop.i 0
}
;;
// Negate the leading term where the final result is to be negated
// (f1*f0 - x = -x); the matching fms below then yields -(First + poly).
.pred.rel "mutex",p9,p15
{ .mfi
        nop.m 0
(p9)    fms.s0 FR_FirstS = f1, f0, FR_FirstS
        nop.i 0
}
{ .mfi
        nop.m 0
(p15)   fms.s0 FR_FirstS = f1, f0, FR_FirstS
        nop.i 0
}
;;
.pred.rel "mutex",p11,p13
{ .mfi
        nop.m 0
(p11)   fms.s0 FR_FirstC = f1, f0, FR_FirstC
        nop.i 0
}
{ .mfi
        nop.m 0
(p13)   fms.s0 FR_FirstC = f1, f0, FR_FirstC
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_polyS = FR_r_cubed, FR_polyS, FR_c    // polyS = r^3*polyS + c
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_polyC = FR_rsq, FR_polyC, f0          // polyC = rsq*polyC
        nop.i 0
}
;;
// Final combination: exactly one instruction of each mutex pair executes.
.pred.rel "mutex",p8,p9
{ .mfi
        nop.m 0
(p8)    fma.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
        nop.i 0
}
{ .mfi
        nop.m 0
(p9)    fms.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
        nop.i 0
}
;;
.pred.rel "mutex",p10,p11
{ .mfi
        nop.m 0
(p10)   fma.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
        nop.i 0
}
{ .mfi
        nop.m 0
(p11)   fms.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
        nop.i 0
}
;;
.pred.rel "mutex",p12,p13
{ .mfi
        nop.m 0
(p12)   fma.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
        nop.i 0
}
{ .mfi
        nop.m 0
(p13)   fms.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
        nop.i 0
}
;;
.pred.rel "mutex",p14,p15
{ .mfi
        nop.m 0
(p14)   fma.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
        nop.i 0
}
// Return immediately when GR_Cis == 1; otherwise fall through and store both
// results through the two result pointers before returning.
// NOTE(review): GR_Cis presumably distinguishes the cis-style entry from
// sincosl -- confirm against the entry code, which is not visible here.
{ .mfb
        cmp.eq p10, p0 = 0x1, GR_Cis
(p15)   fms.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
(p10)   br.ret.sptk b0
}
;;
{ .mmb // exit for sincosl
        stfe [sincos_pResSin] = FR_ResultS
        stfe [sincos_pResCos] = FR_ResultC
        br.ret.sptk b0
}
;;

SINCOSL_SMALL_R:
//
// Here if |r| < 2^-3
//
// Enter with r, c, and N_Inc computed
//
// The instructions below accumulate, before sign/quadrant selection:
//   S side: r + [ r^7*(S_3 + rsq*(S_4 + rsq*S_5)) + cS + r*rsq*(S_1 + rsq*S_2) ]
//   C side: 1 + [ r^6*(C_3 + rsq*(C_4 + rsq*C_5)) + cC + rsq*(C_1 + rsq*C_2) ]
// with cC = -c * r.
//
{ .mfi
        nop.m 0
        fma.s1 FR_rsq = FR_r, FR_r, f0          // rsq = r * r
        nop.i 0
}
;;
{ .mmi
        ldfe FR_S_5 = [GR_ad_se], -16           // Load S_5
        ldfe FR_C_5 = [GR_ad_ce], -16           // Load C_5
        nop.i 0
}
;;
{ .mmi
        ldfe FR_S_4 = [GR_ad_se], -16           // Load S_4
        ldfe FR_C_4 = [GR_ad_ce], -16           // Load C_4
        nop.i 0
}
;;
SINCOSL_SMALL_R_0:
// Entry point for 2^-3 < |x| < pi/4
SINCOSL_SMALL_R_1:
// Entry point for pi/4 < |x| < 2^24 and |r| < 2^-3
// NOTE(review): both entry points skip the rsq computation and the S_5/C_5,
// S_4/C_4 loads above, so callers must have done that work already -- confirm
// at the branch sites, which are not visible here.
{ .mfi
        ldfe FR_S_3 = [GR_ad_se], -16           // Load S_3
        fma.s1 FR_r6 = FR_rsq, FR_rsq, f0       // Z = rsq * rsq (r^4 for now)
        tbit.z p7,p11 = GR_N_Inc, 0             // p7: bit 0 clear, p11: bit 0 set
}
{ .mfi
        ldfe FR_C_3 = [GR_ad_ce], -16           // Load C_3
        nop.f 0
        and GR_N_SinCos = 0x1, GR_N_Inc
}
;;
{ .mfi
        ldfe FR_S_2 = [GR_ad_se], -16           // Load S_2
        fnma.s1 FR_cC = FR_c, FR_r, f0          // c = -c * r
        sub GR_N_SignS = GR_N_Inc, GR_N_SinCos
}
{ .mfi
        ldfe FR_C_2 = [GR_ad_ce], -16           // Load C_2
        nop.f 0
        add GR_N_SignC = GR_N_Inc, GR_N_SinCos
}
;;
{ .mmi
        ldfe FR_S_1 = [GR_ad_se], -16           // Load S_1
        ldfe FR_C_1 = [GR_ad_ce], -16           // Load C_1
(p7)    tbit.z.unc p9,p10 = GR_N_SignC, 1       // p10: negate ResultC
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_r7 = FR_r6, FR_r, f0          // Z = Z * r (r^5 for now)
(p7)    tbit.z.unc p7,p8 = GR_N_SignS, 1        // p8: negate ResultS
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_poly_loS = FR_rsq, FR_S_5, FR_S_4     // poly_lo=rsq*S_5+S_4
(p11)   tbit.z.unc p13,p14 = GR_N_SignC, 1      // p14: negate ResultC
}
{ .mfi
        nop.m 0
        fma.s1 FR_poly_loC = FR_rsq, FR_C_5, FR_C_4     // poly_lo=rsq*C_5+C_4
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_poly_hiS = FR_rsq, FR_S_2, FR_S_1     // poly_hi=rsq*S_2+S_1
(p11)   tbit.z.unc p11,p12 = GR_N_SignS, 1      // p12: negate ResultS
}
{ .mfi
        nop.m 0
        fma.s1 FR_poly_hiC = FR_rsq, FR_C_2, FR_C_1     // poly_hi=rsq*C_2+C_1
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s0 FR_FirstS = FR_r, f1, f0         // FirstS = r
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s0 FR_FirstC = f1, f1, f0           // FirstC = 1.0
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_r6 = FR_r6, FR_rsq, f0        // r6 = r^4 * rsq = r^6
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_r7 = FR_r7, FR_rsq, f0        // r7 = r^5 * rsq = r^7
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_poly_loS = FR_rsq, FR_poly_loS, FR_S_3 // p_lo=p_lo*rsq+S_3
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_poly_loC = FR_rsq, FR_poly_loC, FR_C_3 // p_lo=p_lo*rsq+C_3
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s0 FR_inexact = FR_S_4, FR_S_4, f0  // Dummy op to set inexact
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_poly_hiS = FR_poly_hiS, FR_rsq, f0    // p_hi=p_hi*rsq
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_poly_hiC = FR_poly_hiC, FR_rsq, f0    // p_hi=p_hi*rsq
        nop.i 0
}
;;
// Negate the leading term where the final result is to be negated
// (f1*f0 - x = -x); the matching fms below then yields -(First + poly).
.pred.rel "mutex",p8,p14
{ .mfi
        nop.m 0
(p8)    fms.s0 FR_FirstS = f1, f0, FR_FirstS
        nop.i 0
}
{ .mfi
        nop.m 0
(p14)   fms.s0 FR_FirstS = f1, f0, FR_FirstS
        nop.i 0
}
;;
.pred.rel "mutex",p10,p12
{ .mfi
        nop.m 0
(p10)   fms.s0 FR_FirstC = f1, f0, FR_FirstC
        nop.i 0
}
{ .mfi
        nop.m 0
(p12)   fms.s0 FR_FirstC = f1, f0, FR_FirstC
        nop.i 0
}
;;
// NOTE(review): FR_cS is read here but never written in the visible code;
// presumably it aliases the incoming FR_c -- confirm in the register
// definitions at the top of the file.
{ .mfi
        nop.m 0
        fma.s1 FR_polyS = FR_r7, FR_poly_loS, FR_cS     // poly=Z*poly_lo+c
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_polyC = FR_r6, FR_poly_loC, FR_cC     // poly=Z*poly_lo+c
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_poly_hiS = FR_r, FR_poly_hiS, f0      // p_hi=r*p_hi
        nop.i 0
}
;;
{ .mfi
        nop.m 0
        fma.s1 FR_polyS = FR_polyS, f1, FR_poly_hiS     // poly = poly + p_hi
        nop.i 0
}
{ .mfi
        nop.m 0
        fma.s1 FR_polyC = FR_polyC, f1, FR_poly_hiC     // poly = poly + p_hi
        nop.i 0
}
;;
// Final combination: exactly one instruction of each mutex pair executes.
.pred.rel "mutex",p7,p8
{ .mfi
        nop.m 0
(p7)    fma.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
        nop.i 0
}
{ .mfi
        nop.m 0
(p8)    fms.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
        nop.i 0
}
;;
.pred.rel "mutex",p9,p10
{ .mfi
        nop.m 0
(p9)    fma.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
        nop.i 0
}
{ .mfi
        nop.m 0
(p10)   fms.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
        nop.i 0
}
;;
.pred.rel "mutex",p11,p12
{ .mfi
        nop.m 0
(p11)   fma.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
        nop.i 0
}
{ .mfi
        nop.m 0
(p12)   fms.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
        nop.i 0
}
;;
.pred.rel "mutex",p13,p14
// NOTE(review): the listing is truncated inside the next bundle; its closing
// brace and the rest of the routine are not visible and are not reconstructed.
{ .mfi
        nop.m 0
(p13)   fma.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
        nop.i 0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -