⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 s_cos.s

📁 Glibc 2.3.2源代码(解压后有100多M)
💻 S
📖 第 1 页 / 共 5 页
字号:
}{ .mfb      and       sind_r_exp = sind_r_17_ones, sind_r_signexp(p7)  fmerge.s      f8 = f1,f1(p7)  br.ret.spnt    b0 ;;}// p10 is true if we must call routines to handle larger arguments// p10 is true if f8 exp is > 0x10009{ .mfi      ldfpd      sind_P2,sind_Q2 = [sind_AD_1],16      nop.f 999      cmp.ge  p10,p0 = sind_r_exp,sind_exp_limit};;// sind_W          = x * sind_Inv_Pi_by_16// Multiply x by scaled 16/pi and add large const to shift integer part of W to//   rightmost bits of significand{ .mfi      ldfpd      sind_P1,sind_Q1 = [sind_AD_1]      fma.s1 sind_W_2TO61_RSH = sind_NORM_f8,sind_SIG_INV_PI_BY_16_2TO61,sind_RSHF_2TO61      nop.i 999}{ .mbb(p10) cmp.ne.unc p11,p12=sind_r_sincos,r0  // p11 call __libm_cos_double_dbx                                           // p12 call __libm_sin_double_dbx(p11) br.cond.spnt L(COSD_DBX)(p12) br.cond.spnt L(SIND_DBX)};;// sind_NFLOAT = Round_Int_Nearest(sind_W)// This is done by scaling back by 2^-61 and subtracting the shift constant{ .mfi      nop.m 999      fms.s1 sind_NFLOAT = sind_W_2TO61_RSH,sind_2TOM61,sind_RSHF      nop.i 999 ;;}// get N = (int)sind_int_Nfloat{ .mfi      getf.sig  sind_GR_n = sind_W_2TO61_RSH      nop.f 999      nop.i 999 ;;}// Add 2^(k-1) (which is in sind_r_sincos) to N// sind_r          = -sind_Nfloat * sind_Pi_by_16_hi + x// sind_r          = sind_r -sind_Nfloat * sind_Pi_by_16_lo{ .mfi      add       sind_GR_n = sind_GR_n, sind_r_sincos      fnma.s1  sind_r      = sind_NFLOAT, sind_Pi_by_16_hi, sind_NORM_f8      nop.i 999 ;;}// Get M (least k+1 bits of N){ .mmi      and       sind_GR_m = 0x1f,sind_GR_n ;;      nop.m 999      shl       sind_GR_32m = sind_GR_m,5 ;;}// Add 32*M to address of sin_cos_beta table{ .mmi      add       sind_AD_2 = sind_GR_32m, sind_AD_beta_table      nop.m 999      nop.i 999 ;;}{ .mfi      ldfe      sind_Sm = [sind_AD_2],16(p8)  fclass.m.unc p10,p0=f8,0x0b  // If sin, note denormal input to set uflow      nop.i 999 ;;}{ .mfi      ldfe      sind_Cm = [sind_AD_2]      fnma.s1  sind_r      = sind_NFLOAT, sind_Pi_by_16_lo,  sind_r      nop.i 999 ;;}// get rsq{ .mfi      nop.m 999      fma.s1   sind_rsq  = sind_r, sind_r,   f0      nop.i 999}{ .mfi      nop.m 999      fmpy.s0  fp_tmp = fp_tmp,fp_tmp // fmpy forces inexact flag      nop.i 999 ;;}// form P and Q series{ .mfi      nop.m 999      fma.s1      sind_P_temp1 = sind_rsq, sind_P4, sind_P3      nop.i 999}{ .mfi      nop.m 999      fma.s1      sind_Q_temp1 = sind_rsq, sind_Q4, sind_Q3      nop.i 999 ;;}// get rcube and sm*rsq{ .mfi      nop.m 999      fmpy.s1     sind_srsq    = sind_Sm,sind_rsq      nop.i 999}{ .mfi      nop.m 999      fmpy.s1     sind_rcub    = sind_r, sind_rsq      nop.i 999 ;;}{ .mfi      nop.m 999      fma.s1      sind_Q_temp2 = sind_rsq, sind_Q_temp1, sind_Q2      nop.i 999}{ .mfi      nop.m 999      fma.s1      sind_P_temp2 = sind_rsq, sind_P_temp1, sind_P2      nop.i 999 ;;}{ .mfi      nop.m 999      fma.s1      sind_Q       = sind_rsq, sind_Q_temp2, sind_Q1      nop.i 999}{ .mfi      nop.m 999      fma.s1      sind_P       = sind_rsq, sind_P_temp2, sind_P1      nop.i 999 ;;}// Get final P and Q{ .mfi      nop.m 999      fma.s1   sind_Q = sind_srsq,sind_Q, sind_Sm      nop.i 999}{ .mfi      nop.m 999      fma.s1   sind_P = sind_rcub,sind_P, sind_r      nop.i 999 ;;}// If sin(denormal), force inexact to be set{ .mfi      nop.m 999(p10) fmpy.d.s0 fp_tmp = f8,f8      nop.i 999 ;;}// Final calculation{ .mfb      nop.m 999      fma.d    f8     = sind_Cm, sind_P, sind_Q      br.ret.sptk    b0 ;;}.endp cos#ASM_SIZE_DIRECTIVE(cos#).proc __libm_callout_1s__libm_callout_1s:L(SIND_DBX):.prologue{ .mfi        nop.m 0        nop.f 0.save   ar.pfs,GR_SAVE_PFS        mov  GR_SAVE_PFS=ar.pfs};;{ .mfi        mov GR_SAVE_GP=gp        nop.f 0.save   b0, GR_SAVE_B0        mov GR_SAVE_B0=b0}.body{ .mib      nop.m 999      nop.i 999      br.call.sptk.many   b0=__libm_sin_double_dbx# ;;};;{ .mfi       mov gp        = GR_SAVE_GP       nop.f  999       mov b0        = GR_SAVE_B0};;{ .mib      nop.m 999      mov ar.pfs    = GR_SAVE_PFS      br.ret.sptk     b0 ;;}.endp  __libm_callout_1sASM_SIZE_DIRECTIVE(__libm_callout_1s).proc __libm_callout_1c__libm_callout_1c:L(COSD_DBX):.prologue{ .mfi        nop.m 0        nop.f 0.save   ar.pfs,GR_SAVE_PFS        mov  GR_SAVE_PFS=ar.pfs};;{ .mfi        mov GR_SAVE_GP=gp        nop.f 0.save   b0, GR_SAVE_B0        mov GR_SAVE_B0=b0}.body{ .mib      nop.m 999      nop.i 999      br.call.sptk.many   b0=__libm_cos_double_dbx# ;;};;{ .mfi       mov gp        = GR_SAVE_GP       nop.f  999       mov b0        = GR_SAVE_B0};;{ .mib      nop.m 999      mov ar.pfs    = GR_SAVE_PFS      br.ret.sptk     b0 ;;}.endp  __libm_callout_1cASM_SIZE_DIRECTIVE(__libm_callout_1c)// ====================================================================// ====================================================================// These functions calculate the sin and cos for inputs// greater than 2^10// __libm_sin_double_dbx# and __libm_cos_double_dbx#// *********************************************************************// *********************************************************************//// Function:   Combined sin(x) and cos(x), where////             sin(x) = sine(x), for double precision x values//             cos(x) = cosine(x), for double precision x values//// *********************************************************************//// Accuracy:       Within .7 ulps for 80-bit floating point values//                 Very accurate for double precision values//// *********************************************************************//// Resources Used:////    Floating-Point Registers: f8 (Input and Return Value)//                              f32-f99////    General Purpose Registers://      r32-r43//      r44-r45 (Used to pass arguments to pi_by_2 reduce routine)////    Predicate Registers:      p6-p13//// *********************************************************************////  IEEE Special Conditions:////    Denormal  fault raised on denormal inputs//    Overflow exceptions do not occur//    Underflow exceptions raised when appropriate for sin//    (No specialized error handling for this routine)//    Inexact raised when appropriate by algorithm////    sin(SNaN) = QNaN//    sin(QNaN) = QNaN//    sin(inf) = QNaN//    sin(+/-0) = +/-0//    cos(inf) = QNaN//    cos(SNaN) = QNaN//    cos(QNaN) = QNaN//    cos(0) = 1//// *********************************************************************////  Mathematical Description//  ========================////  The computation of FSIN and FCOS is best handled in one piece of//  code. The main reason is that given any argument Arg, computation//  of trigonometric functions first calculate N and an approximation//  to alpha where////  Arg = N pi/2 + alpha, |alpha| <= pi/4.////  Since////  cos( Arg ) = sin( (N+1) pi/2 + alpha ),////  therefore, the code for computing sine will produce cosine as long//  as 1 is added to N immediately after the argument reduction//  process.////  Let M = N if sine//      N+1 if cosine.////  Now, given////  Arg = M pi/2  + alpha, |alpha| <= pi/4,////  let I = M mod 4, or I be the two lsb of M when M is represented//  as 2's complement. I = [i_0 i_1]. Then////  sin( Arg ) = (-1)^i_0  sin( alpha )	if i_1 = 0,//             = (-1)^i_0  cos( alpha )     if i_1 = 1.////  For example://       if M = -1, I = 11//         sin ((-pi/2 + alpha) = (-1) cos (alpha)//       if M = 0, I = 00//         sin (alpha) = sin (alpha)//       if M = 1, I = 01//         sin (pi/2 + alpha) = cos (alpha)//       if M = 2, I = 10//         sin (pi + alpha) = (-1) sin (alpha)//       if M = 3, I = 11//         sin ((3/2)pi + alpha) = (-1) cos (alpha)////  The value of alpha is obtained by argument reduction and//  represented by two working precision numbers r and c where////  alpha =  r  +  c     accurately.////  The reduction method is described in a previous write up.//  The argument reduction scheme identifies 4 cases. For Cases 2//  and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be//  computed very easily by 2 or 3 terms of the Taylor series//  expansion as follows:////  Case 2://  -------////  sin(r + c) = r + c - r^3/6	accurately//  cos(r + c) = 1 - 2^(-67)	accurately////  Case 4://  -------////  sin(r + c) = r + c - r^3/6 + r^5/120	accurately//  cos(r + c) = 1 - r^2/2 + r^4/24		accurately////  The only cases left are Cases 1 and 3 of the argument reduction//  procedure. These two cases will be merged since after the//  argument is reduced in either cases, we have the reduced argument//  represented as r + c and that the magnitude |r + c| is not small//  enough to allow the usage of a very short approximation.////  The required calculation is either////  sin(r + c)  =  sin(r)  +  correction,  or//  cos(r + c)  =  cos(r)  +  correction.////  Specifically,////	sin(r + c) = sin(r) + c sin'(r) + O(c^2)//		   = sin(r) + c cos (r) + O(c^2)//		   = sin(r) + c(1 - r^2/2)  accurately.//  Similarly,////	cos(r + c) = cos(r) - c sin(r) + O(c^2)//		   = cos(r) - c(r - r^3/6)  accurately.////  We therefore concentrate on accurately calculating sin(r) and//  cos(r) for a working-precision number r, |r| <= pi/4 to within//  0.1% or so.////  The greatest challenge of this task is that the second terms of//  the Taylor series////	r - r^3/3! + r^r/5! - ...////  and////	1 - r^2/2! + r^4/4! - ...////  are not very small when |r| is close to pi/4 and the rounding//  errors will be a concern if simple polynomial accumulation is//  used. When |r| < 2^-3, however, the second terms will be small//  enough (6 bits or so of right shift) that a normal Horner//  recurrence suffices. Hence there are two cases that we consider//  in the accurate computation of sin(r) and cos(r), |r| <= pi/4.////  Case small_r: |r| < 2^(-3)//  --------------------------////  Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1],//  we have////	sin(Arg) = (-1)^i_0 * sin(r + c)	if i_1 = 0//		 = (-1)^i_0 * cos(r + c) 	if i_1 = 1////  can be accurately approximated by////  sin(Arg) = (-1)^i_0 * [sin(r) + c]	if i_1 = 0//           = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1////  because |r| is small and thus the second terms in the correction//  are unneccessary.////  Finally, sin(r) and cos(r) are approximated by polynomials of//  moderate lengths.////  sin(r) =  r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11//  cos(r) =  1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10////  We can make use of predicates to selectively calculate//  sin(r) or cos(r) based on i_1.////  Case normal_r: 2^(-3) <= |r| <= pi/4//  ------------------------------------////  This case is more likely than the previous one if one considers//  r to be uniformly distributed in [-pi/4 pi/4]. Again,////  sin(Arg) = (-1)^i_0 * sin(r + c)	if i_1 = 0//           = (-1)^i_0 * cos(r + c) 	if i_1 = 1.////  Because |r| is now larger, we need one extra term in the//  correction. sin(Arg) can be accurately approximated by////  sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)]      if i_1 = 0//           = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)]    i_1 = 1.////  Finally, sin(r) and cos(r) are approximated by polynomials of//  moderate lengths.////	sin(r) =  r + PP_1_hi r^3 + PP_1_lo r^3 +//	              PP_2 r^5 + ... + PP_8 r^17////	cos(r) =  1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16////  where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.//  The crux in accurate computation is to calculate////  r + PP_1_hi r^3   or  1 + QQ_1 r^2////  accurately as two pieces: U_hi and U_lo. The way to achieve this//  is to obtain r_hi as a 10 sig. bit number that approximates r to//  roughly 8 bits or so of accuracy. (One convenient way is////  r_hi := frcpa( frcpa( r ) ).)////  This way,////	r + PP_1_hi r^3 =  r + PP_1_hi r_hi^3 +//	                        PP_1_hi (r^3 - r_hi^3)//		        =  [r + PP_1_hi r_hi^3]  +//			   [PP_1_hi (r - r_hi)//			      (r^2 + r_hi r + r_hi^2) ]//		        =  U_hi  +  U_lo////  Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,//  PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed//  exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign//  and that there is no more than 8 bit shift off between r and//  PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus//  calculated without any error. Finally, the fact that////	|U_lo| <= 2^(-8) |U_hi|////  says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly//  8 extra bits of accuracy.////  Similarly,////	1 + QQ_1 r^2  =  [1 + QQ_1 r_hi^2]  +//	                    [QQ_1 (r - r_hi)(r + r_hi)]//		      =  U_hi  +  U_lo.////  Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).////  If i_1 = 0, then////    U_hi := r + PP_1_hi * r_hi^3//    U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)//    poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17//    correction := c * ( 1 + C_1 r^2 )////  Else ...i_1 = 1////    U_hi := 1 + QQ_1 * r_hi * r_hi//    U_lo := QQ_1 * (r - r_hi) * (r + r_hi)//    poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16//    correction := -c * r * (1 + S_1 * r^2)////  End////  Finally,////	V := poly + ( U_lo + correction )////                 /    U_hi  +  V         if i_0 = 0//	result := |//                 \  (-U_hi) -  V         if i_0 = 1////  It is important that in the last step, negation of U_hi is//  performed prior to the subtraction which is to be performed in//  the user-set rounding mode.//////  Algorithmic Description//  =======================////  The argument reduction algorithm is tightly integrated into FSIN//  and FCOS which share the same code. The following is complete and//  self-contained. The argument reduction description given//  previously is repeated below.//////  Step 0. Initialization.////   If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,//   set N_inc := 1.////  Step 1. Check for exceptional and special cases.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -