📄 libm_sincos.s
字号:
//data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(31 pi/16)  -S1
data8 0xfb14be7fbae58157 , 0x00003ffe // cos(31 pi/16)  C1
//data8 0x0000000000000000 , 0x00000000 // sin(32 pi/16)  S0
data8 0x8000000000000000 , 0x00003fff // cos(32 pi/16)  C0
LOCAL_OBJECT_END(double_sin_cos_beta_k4)

.section .text

// ---------------------------------------------------------------------------
// sincos -- IEEE-754 entry point.
//
// Sets p13 (and clears p14), so every exit path below falls through its
// "(p14) br.ret" and reaches the stfd pair that stores the sine and cosine
// through cis_pResSin / cis_pResCos.  Performs the same constant setup as
// __libm_sincos and then branches to the shared body at _CIS_COMMON.
//
// NOTE(review): the cis_* and GR_SAVE_* register aliases and the
// double_cis_pi table are defined outside this excerpt.
// ---------------------------------------------------------------------------
GLOBAL_IEEE754_ENTRY(sincos)
// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
{ .mlx
      getf.exp      cis_r_signexp           = cis_Arg
      movl          cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
}
// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
{ .mlx
      addl          cis_AD_1                = @ltoff(double_cis_pi), gp
      movl          cis_GR_rshf_2to61       = 0x47b8000000000000
};;

{ .mfi
      ld8           cis_AD_1                = [cis_AD_1]
      fnorm.s1      cis_NORM_f8             = cis_Arg
      cmp.eq        p13, p14                = r0, r0 // p13 set for sincos
}
// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61
{ .mib
      mov           cis_GR_exp_2tom61       = 0xffff-61
      nop.i         0
      br.cond.sptk  _CIS_COMMON
};;
GLOBAL_IEEE754_END(sincos)

// ---------------------------------------------------------------------------
// __libm_sincos -- internal entry point.
//
// Sets p14 (and clears p13), so every exit path returns straight after the
// final fma.d pair, leaving the results in cis_Sin_res / cis_Cos_res without
// storing them to memory.  Falls through into the shared body at _CIS_COMMON.
// ---------------------------------------------------------------------------
GLOBAL_LIBM_ENTRY(__libm_sincos)
// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
{ .mlx
      getf.exp      cis_r_signexp           = cis_Arg
      movl          cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
}
// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
{ .mlx
      addl          cis_AD_1                = @ltoff(double_cis_pi), gp
      movl          cis_GR_rshf_2to61       = 0x47b8000000000000
};;

// p14 set for __libm_sincos and cis
{ .mfi
      ld8           cis_AD_1                = [cis_AD_1]
      fnorm.s1      cis_NORM_f8             = cis_Arg
      cmp.eq        p14, p13                = r0, r0
}
// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61
{ .mib
      mov           cis_GR_exp_2tom61       = 0xffff-61
      nop.i         0
      nop.b         0
};;

// ---------------------------------------------------------------------------
// Shared body for both entry points.
// ---------------------------------------------------------------------------
_CIS_COMMON:
//      Form two constants we need
//      16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand
//      1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand
//      fcmp used to set denormal, and invalid on snans
{ .mfi
      setf.sig      cis_SIG_INV_PI_BY_16_2TO61 = cis_GR_sig_inv_pi_by_16
      fclass.m      p6,p0                      = cis_Arg, 0xe7 // if x=0,inf,nan
      addl          cis_gr_tmp                 = -1, r0
}
// 1.1000 2^63 for right shift
{ .mlx
      setf.d        cis_RSHF_2TO61          = cis_GR_rshf_2to61
      movl          cis_GR_rshf             = 0x43e8000000000000
};;

//      Form another constant
//      2^-61 for scaling Nfloat
//      0x1001a is register_bias + 27.
//      So if f8 >= 2^27, go to large arguments routine
{ .mfi
      alloc         GR_SAVE_PFS             = ar.pfs, 3, 5, 0, 0
      fclass.m      p11,p0                  = cis_Arg, 0x0b // Test for x=unorm
      mov           cis_exp_limit           = 0x1001a
}
{ .mib
      setf.exp      cis_2TOM61              = cis_GR_exp_2tom61
      nop.i         0
(p6)  br.cond.spnt  _CIS_SPECIAL_ARGS
};;

//      Load the two pieces of pi/16
//      Form another constant
//      1.1000...000 * 2^63, the right shift constant
{ .mmb
      ldfe          cis_Pi_by_16_hi         = [cis_AD_1],16
      setf.d        cis_RSHF                = cis_GR_rshf
(p11) br.cond.spnt  _CIS_UNORM              // Branch if x=unorm
};;

_CIS_COMMON2:
// Return here if x=unorm
// Create constant inexact set
{ .mmi
      ldfe          cis_Pi_by_16_lo         = [cis_AD_1],16
      setf.sig      cis_tmp                 = cis_gr_tmp
      nop.i         0
};;

// Select exponent (17 lsb)
{ .mfi
      ldfe          cis_Pi_by_16_lowest     = [cis_AD_1],16
      nop.f         0
      dep.z         cis_r_exp               = cis_r_signexp, 0, 17
};;

// Start loading P, Q coefficients
// p10 is true if we must call routines to handle larger arguments
// p10 is true if f8 exp is >= 0x1001a (cmp.ge), i.e. |x| >= 2^27
{ .mmb
      ldfpd         cis_P4,cis_Q4           = [cis_AD_1],16
      cmp.ge        p10, p0                 = cis_r_exp, cis_exp_limit
(p10) br.cond.spnt  _CIS_LARGE_ARGS         // go to |x| >= 2^27 path
};;

// cis_W = x * cis_Inv_Pi_by_16
// Multiply x by scaled 16/pi and add large const to shift integer part of W to
// rightmost bits of significand
{ .mfi
      ldfpd         cis_P3,cis_Q3           = [cis_AD_1],16
      fma.s1        cis_W_2TO61_RSH = cis_NORM_f8,cis_SIG_INV_PI_BY_16_2TO61,cis_RSHF_2TO61
      nop.i         0
};;

// get N = (int)cis_int_Nfloat
// cis_NFLOAT = Round_Int_Nearest(cis_W)
{ .mmf
      getf.sig      cis_GR_n                = cis_W_2TO61_RSH
      ldfpd         cis_P2,cis_Q2           = [cis_AD_1],16
      fms.s1        cis_NFLOAT              = cis_W_2TO61_RSH,cis_2TOM61,cis_RSHF
};;

// cis_r = -cis_Nfloat * cis_Pi_by_16_hi + x
{ .mfi
      ldfpd         cis_P1,cis_Q1           = [cis_AD_1], 16
      fnma.s1       cis_r                   = cis_NFLOAT,cis_Pi_by_16_hi,cis_NORM_f8
      nop.i         0
};;

// Add 2^(k-1) = 8 to N to form the index used for the cosine table lookup.
// The stop inside this bundle is required: the second "and" reads
// cis_GR_n_cos, which the "add" has just written.
{ .mmi
      add           cis_GR_n_cos            = 0x8, cis_GR_n
;;
// Get M (least k+1 bits of N)
      and           cis_GR_m_sin            = 0x1f,cis_GR_n
      and           cis_GR_m_cos            = 0x1f,cis_GR_n_cos
};;

{ .mmi
      nop.m         0
      nop.m         0
      shl           cis_GR_32m_sin          = cis_GR_m_sin,5
};;

// Add 32*M to address of sin_cos_beta table
// cis_r = cis_r -cis_Nfloat * cis_Pi_by_16_lo
{ .mfi
      add           cis_AD_2_sin            = cis_GR_32m_sin, cis_AD_1
      fnma.s1       cis_r                   = cis_NFLOAT, cis_Pi_by_16_lo, cis_r
      shl           cis_GR_32m_cos          = cis_GR_m_cos,5
};;

// Add 32*M to address of sin_cos_beta table
// p10 is re-used from here on: it now flags an unnormal input (class 0x0b)
// and predicates the underflow-forcing fmpy further down.
{ .mmf
      ldfe          cis_Sm_sin              = [cis_AD_2_sin],16
      add           cis_AD_2_cos            = cis_GR_32m_cos, cis_AD_1
      fclass.m.unc  p10,p0                  = cis_Arg,0x0b // den. input - uflow
};;

// The F slot is filled with an explicit nop so that all three slots of the
// .mfi template are spelled out, as in every other bundle of this routine.
{ .mfi
      ldfe          cis_Sm_cos              = [cis_AD_2_cos], 16
      nop.f         0
      nop.i         0
};;

{ .mfi
      ldfe          cis_Cm_sin              = [cis_AD_2_sin]
      fma.s1        cis_rsq                 = cis_r, cis_r, f0 // get r^2
      nop.i         0
}
// fmpy forces inexact flag
{ .mfi
      nop.m         0
      fmpy.s0       cis_tmp                 = cis_tmp,cis_tmp
      nop.i         0
};;

// cis_r_exact = cis_r - cis_Nfloat * cis_Pi_by_16_lowest
{ .mfi
      nop.m         0
      fnma.s1       cis_r_exact             = cis_NFLOAT, cis_Pi_by_16_lowest, cis_r
      nop.i         0
};;

// Evaluate the P and Q polynomials in r^2 by Horner's scheme
{ .mfi
      ldfe          cis_Cm_cos              = [cis_AD_2_cos]
      fma.s1        cis_P_temp1             = cis_rsq, cis_P4, cis_P3
      nop.i         0
}
{ .mfi
      nop.m         0
      fma.s1        cis_Q_temp1             = cis_rsq, cis_Q4, cis_Q3
      nop.i         0
};;

{ .mfi
      nop.m         0
      fmpy.s1       cis_srsq_sin            = cis_Sm_sin, cis_rsq
      nop.i         0
}
{ .mfi
      nop.m         0
      fmpy.s1       cis_srsq_cos            = cis_Sm_cos,cis_rsq
      nop.i         0
};;

{ .mfi
      nop.m         0
      fma.s1        cis_Q_temp2             = cis_rsq, cis_Q_temp1, cis_Q2
      nop.i         0
}
{ .mfi
      nop.m         0
      fma.s1        cis_P_temp2             = cis_rsq, cis_P_temp1, cis_P2
      nop.i         0
};;

{ .mfi
      nop.m         0
      fmpy.s1       cis_rcub                = cis_r_exact, cis_rsq // get r^3
      nop.i         0
};;

{ .mfi
      nop.m         0
      fma.s1        cis_Q                   = cis_rsq, cis_Q_temp2, cis_Q1
      nop.i         0
}
{ .mfi
      nop.m         0
      fma.s1        cis_P                   = cis_rsq, cis_P_temp2, cis_P1
      nop.i         0
};;

{ .mfi
      nop.m         0
      fma.s1        cis_Q_sin               = cis_srsq_sin,cis_Q, cis_Sm_sin
      nop.i         0
}
{ .mfi
      nop.m         0
      fma.s1        cis_Q_cos               = cis_srsq_cos,cis_Q, cis_Sm_cos
      nop.i         0
};;

{ .mfi
      nop.m         0
      fma.s1        cis_P                   = cis_rcub,cis_P, cis_r_exact // final P
      nop.i         0
};;

// If den. arg, force underflow to be set
{ .mfi
      nop.m         0
(p10) fmpy.d.s0     cis_tmp                 = cis_Arg,cis_Arg
      nop.i         0
};;

{ .mfi
      nop.m         0
      fma.d.s0      cis_Sin_res             = cis_Cm_sin,cis_P,cis_Q_sin // Final sin
      nop.i         0
}
{ .mfb
      nop.m         0
      fma.d.s0      cis_Cos_res             = cis_Cm_cos,cis_P,cis_Q_cos // Final cos
(p14) br.ret.sptk   b0  // common exit for __libm_sincos and cis main path
};;

{ .mmb
      stfd          [cis_pResSin]           = cis_Sin_res
      stfd          [cis_pResCos]           = cis_Cos_res
      br.ret.sptk   b0  // common exit for sincos main path
};;

// ---------------------------------------------------------------------------
// x is 0, Inf or NaN (fclass mask 0xe7 above).
// ---------------------------------------------------------------------------
_CIS_SPECIAL_ARGS:
// sin(+/-0) = +/-0
// sin(Inf)  = NaN
// sin(NaN)  = NaN
{ .mfi
      nop.m         999
      fma.d.s0      cis_Sin_res             = cis_Arg, f0, f0 // sin(+/-0,NaN,Inf)
      nop.i         999
};;

// cos(+/-0) = 1.0
// cos(Inf)  = NaN
// cos(NaN)  = NaN
{ .mfb
      nop.m         999
      fma.d.s0      cis_Cos_res             = cis_Arg, f0, f1 // cos(+/-0,NaN,Inf)
(p14) br.ret.sptk   b0  // special exit for __libm_sincos and cis
};;

{ .mmb
      stfd          [cis_pResSin]           = cis_Sin_res
      stfd          [cis_pResCos]           = cis_Cos_res
      br.ret.sptk   b0  // special exit for sincos
};;

// ---------------------------------------------------------------------------
// x is unnormal (fclass mask 0x0b above): re-read the sign/exponent from the
// normalized copy of x, then rejoin the main path at _CIS_COMMON2.
// ---------------------------------------------------------------------------
_CIS_UNORM:
// Here if x=unorm
{ .mfb
      getf.exp      cis_r_signexp           = cis_NORM_f8 // Get signexp of x
      fcmp.eq.s0    p11,p0                  = cis_Arg, f0 // Dummy op to set denorm
      br.cond.sptk  _CIS_COMMON2            // Return to main path
};;

GLOBAL_LIBM_END(__libm_sincos)

// ---------------------------------------------------------------------------
// |x| >= 2^27 path.
//
// Saves ar.pfs, gp, b0 and the predicates, calls __libm_sincos_large, then
// restores them.  p13/p14 still select the exit after the call, which is why
// pr is saved and restored around it.  Both results are passed through
// fma.d.s0 (x * 1 + 0) before being returned (p14) or stored (otherwise).
// ---------------------------------------------------------------------------
.proc _CIS_LARGE_ARGS
_CIS_LARGE_ARGS:
.prologue
{ .mfi
      nop.m         0
      nop.f         0
.save ar.pfs, GR_SAVE_PFS
      mov           GR_SAVE_PFS             = ar.pfs
};;

{ .mfi
      mov           GR_SAVE_GP              = gp
      nop.f         0
.save b0, GR_SAVE_B0
      mov           GR_SAVE_B0              = b0
};;

.body
// Call of huge arguments sincos
{ .mib
      nop.m         0
      mov           GR_SAVE_PR              = pr
      br.call.sptk  b0                      = __libm_sincos_large
};;

{ .mfi
      mov           gp                      = GR_SAVE_GP
      nop.f         0
      mov           pr                      = GR_SAVE_PR, 0x1fffe
};;

{ .mfi
      nop.m         0
      nop.f         0
      mov           b0                      = GR_SAVE_B0
};;

{ .mfi
      nop.m         0
      fma.d.s0      cis_Cos_res             = cis_Cos_res, f1, f0
      mov           ar.pfs                  = GR_SAVE_PFS
}
{ .mfb
      nop.m         0
      fma.d.s0      cis_Sin_res             = cis_Sin_res, f1, f0
(p14) br.ret.sptk   b0  // exit for |x| >= 2^27 path (__libm_sincos and cis)
};;

{ .mmb
      stfd          [cis_pResSin]           = cis_Sin_res
      stfd          [cis_pResCos]           = cis_Cos_res
      br.ret.sptk   b0  // exit for sincos |x| >= 2^27 path
};;
.endp _CIS_LARGE_ARGS

.type __libm_sincos_large#,@function
.global __libm_sincos_large#
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -