📄 s_cos.s
字号:
}{ .mfb and sind_r_exp = sind_r_17_ones, sind_r_signexp(p7) fmerge.s f8 = f1,f1(p7) br.ret.spnt b0 ;;}// p10 is true if we must call routines to handle larger arguments// p10 is true if f8 exp is > 0x10009{ .mfi ldfpd sind_P2,sind_Q2 = [sind_AD_1],16 nop.f 999 cmp.ge p10,p0 = sind_r_exp,sind_exp_limit};;// sind_W = x * sind_Inv_Pi_by_16// Multiply x by scaled 16/pi and add large const to shift integer part of W to// rightmost bits of significand{ .mfi ldfpd sind_P1,sind_Q1 = [sind_AD_1] fma.s1 sind_W_2TO61_RSH = sind_NORM_f8,sind_SIG_INV_PI_BY_16_2TO61,sind_RSHF_2TO61 nop.i 999}{ .mbb(p10) cmp.ne.unc p11,p12=sind_r_sincos,r0 // p11 call __libm_cos_double_dbx // p12 call __libm_sin_double_dbx(p11) br.cond.spnt L(COSD_DBX)(p12) br.cond.spnt L(SIND_DBX)};;// sind_NFLOAT = Round_Int_Nearest(sind_W)// This is done by scaling back by 2^-61 and subtracting the shift constant{ .mfi nop.m 999 fms.s1 sind_NFLOAT = sind_W_2TO61_RSH,sind_2TOM61,sind_RSHF nop.i 999 ;;}// get N = (int)sind_int_Nfloat{ .mfi getf.sig sind_GR_n = sind_W_2TO61_RSH nop.f 999 nop.i 999 ;;}// Add 2^(k-1) (which is in sind_r_sincos) to N// sind_r = -sind_Nfloat * sind_Pi_by_16_hi + x// sind_r = sind_r -sind_Nfloat * sind_Pi_by_16_lo{ .mfi add sind_GR_n = sind_GR_n, sind_r_sincos fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_hi, sind_NORM_f8 nop.i 999 ;;}// Get M (least k+1 bits of N){ .mmi and sind_GR_m = 0x1f,sind_GR_n ;; nop.m 999 shl sind_GR_32m = sind_GR_m,5 ;;}// Add 32*M to address of sin_cos_beta table{ .mmi add sind_AD_2 = sind_GR_32m, sind_AD_beta_table nop.m 999 nop.i 999 ;;}{ .mfi ldfe sind_Sm = [sind_AD_2],16(p8) fclass.m.unc p10,p0=f8,0x0b // If sin, note denormal input to set uflow nop.i 999 ;;}{ .mfi ldfe sind_Cm = [sind_AD_2] fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_lo, sind_r nop.i 999 ;;}// get rsq{ .mfi nop.m 999 fma.s1 sind_rsq = sind_r, sind_r, f0 nop.i 999}{ .mfi nop.m 999 fmpy.s0 fp_tmp = fp_tmp,fp_tmp // fmpy forces inexact flag nop.i 999 ;;}// form P and Q series{ .mfi nop.m 999 fma.s1 sind_P_temp1 = sind_rsq, sind_P4, sind_P3 nop.i 999}{ .mfi nop.m 999 fma.s1 sind_Q_temp1 = sind_rsq, sind_Q4, sind_Q3 nop.i 999 ;;}// get rcube and sm*rsq{ .mfi nop.m 999 fmpy.s1 sind_srsq = sind_Sm,sind_rsq nop.i 999}{ .mfi nop.m 999 fmpy.s1 sind_rcub = sind_r, sind_rsq nop.i 999 ;;}{ .mfi nop.m 999 fma.s1 sind_Q_temp2 = sind_rsq, sind_Q_temp1, sind_Q2 nop.i 999}{ .mfi nop.m 999 fma.s1 sind_P_temp2 = sind_rsq, sind_P_temp1, sind_P2 nop.i 999 ;;}{ .mfi nop.m 999 fma.s1 sind_Q = sind_rsq, sind_Q_temp2, sind_Q1 nop.i 999}{ .mfi nop.m 999 fma.s1 sind_P = sind_rsq, sind_P_temp2, sind_P1 nop.i 999 ;;}// Get final P and Q{ .mfi nop.m 999 fma.s1 sind_Q = sind_srsq,sind_Q, sind_Sm nop.i 999}{ .mfi nop.m 999 fma.s1 sind_P = sind_rcub,sind_P, sind_r nop.i 999 ;;}// If sin(denormal), force inexact to be set{ .mfi nop.m 999(p10) fmpy.d.s0 fp_tmp = f8,f8 nop.i 999 ;;}// Final calculation{ .mfb nop.m 999 fma.d f8 = sind_Cm, sind_P, sind_Q br.ret.sptk b0 ;;}.endp cos#ASM_SIZE_DIRECTIVE(cos#).proc __libm_callout_1s__libm_callout_1s:L(SIND_DBX):.prologue{ .mfi nop.m 0 nop.f 0.save ar.pfs,GR_SAVE_PFS mov GR_SAVE_PFS=ar.pfs};;{ .mfi mov GR_SAVE_GP=gp nop.f 0.save b0, GR_SAVE_B0 mov GR_SAVE_B0=b0}.body{ .mib nop.m 999 nop.i 999 br.call.sptk.many b0=__libm_sin_double_dbx# ;;};;{ .mfi mov gp = GR_SAVE_GP nop.f 999 mov b0 = GR_SAVE_B0};;{ .mib nop.m 999 mov ar.pfs = GR_SAVE_PFS br.ret.sptk b0 ;;}.endp __libm_callout_1sASM_SIZE_DIRECTIVE(__libm_callout_1s).proc __libm_callout_1c__libm_callout_1c:L(COSD_DBX):.prologue{ .mfi nop.m 0 nop.f 0.save ar.pfs,GR_SAVE_PFS mov GR_SAVE_PFS=ar.pfs};;{ .mfi mov GR_SAVE_GP=gp nop.f 0.save b0, GR_SAVE_B0 mov GR_SAVE_B0=b0}.body{ .mib nop.m 999 nop.i 999 br.call.sptk.many b0=__libm_cos_double_dbx# ;;};;{ .mfi mov gp = GR_SAVE_GP nop.f 999 mov b0 = GR_SAVE_B0};;{ .mib nop.m 999 mov ar.pfs = GR_SAVE_PFS br.ret.sptk b0 ;;}.endp __libm_callout_1cASM_SIZE_DIRECTIVE(__libm_callout_1c)// ====================================================================// ====================================================================// These functions calculate the sin and cos for inputs// greater than 2^10// __libm_sin_double_dbx# and __libm_cos_double_dbx#// *********************************************************************// *********************************************************************//// Function: Combined sin(x) and cos(x), where//// sin(x) = sine(x), for double precision x values// cos(x) = cosine(x), for double precision x values//// *********************************************************************//// Accuracy: Within .7 ulps for 80-bit floating point values// Very accurate for double precision values//// *********************************************************************//// Resources Used://// Floating-Point Registers: f8 (Input and Return Value)// f32-f99//// General Purpose Registers:// r32-r43// r44-r45 (Used to pass arguments to pi_by_2 reduce routine)//// Predicate Registers: p6-p13//// *********************************************************************//// IEEE Special Conditions://// Denormal fault raised on denormal inputs// Overflow exceptions do not occur// Underflow exceptions raised when appropriate for sin// (No specialized error handling for this routine)// Inexact raised when appropriate by algorithm//// sin(SNaN) = QNaN// sin(QNaN) = QNaN// sin(inf) = QNaN// sin(+/-0) = +/-0// cos(inf) = QNaN// cos(SNaN) = QNaN// cos(QNaN) = QNaN// cos(0) = 1//// *********************************************************************//// Mathematical Description// ========================//// The computation of FSIN and FCOS is best handled in one piece of// code. The main reason is that given any argument Arg, computation// of trigonometric functions first calculate N and an approximation// to alpha where//// Arg = N pi/2 + alpha, |alpha| <= pi/4.//// Since//// cos( Arg ) = sin( (N+1) pi/2 + alpha ),//// therefore, the code for computing sine will produce cosine as long// as 1 is added to N immediately after the argument reduction// process.//// Let M = N if sine// N+1 if cosine.//// Now, given//// Arg = M pi/2 + alpha, |alpha| <= pi/4,//// let I = M mod 4, or I be the two lsb of M when M is represented// as 2's complement. I = [i_0 i_1]. Then//// sin( Arg ) = (-1)^i_0 sin( alpha ) if i_1 = 0,// = (-1)^i_0 cos( alpha ) if i_1 = 1.//// For example:// if M = -1, I = 11// sin ((-pi/2 + alpha) = (-1) cos (alpha)// if M = 0, I = 00// sin (alpha) = sin (alpha)// if M = 1, I = 01// sin (pi/2 + alpha) = cos (alpha)// if M = 2, I = 10// sin (pi + alpha) = (-1) sin (alpha)// if M = 3, I = 11// sin ((3/2)pi + alpha) = (-1) cos (alpha)//// The value of alpha is obtained by argument reduction and// represented by two working precision numbers r and c where//// alpha = r + c accurately.//// The reduction method is described in a previous write up.// The argument reduction scheme identifies 4 cases. For Cases 2// and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be// computed very easily by 2 or 3 terms of the Taylor series// expansion as follows://// Case 2:// -------//// sin(r + c) = r + c - r^3/6 accurately// cos(r + c) = 1 - 2^(-67) accurately//// Case 4:// -------//// sin(r + c) = r + c - r^3/6 + r^5/120 accurately// cos(r + c) = 1 - r^2/2 + r^4/24 accurately//// The only cases left are Cases 1 and 3 of the argument reduction// procedure. These two cases will be merged since after the// argument is reduced in either cases, we have the reduced argument// represented as r + c and that the magnitude |r + c| is not small// enough to allow the usage of a very short approximation.//// The required calculation is either//// sin(r + c) = sin(r) + correction, or// cos(r + c) = cos(r) + correction.//// Specifically,//// sin(r + c) = sin(r) + c sin'(r) + O(c^2)// = sin(r) + c cos (r) + O(c^2)// = sin(r) + c(1 - r^2/2) accurately.// Similarly,//// cos(r + c) = cos(r) - c sin(r) + O(c^2)// = cos(r) - c(r - r^3/6) accurately.//// We therefore concentrate on accurately calculating sin(r) and// cos(r) for a working-precision number r, |r| <= pi/4 to within// 0.1% or so.//// The greatest challenge of this task is that the second terms of// the Taylor series//// r - r^3/3! + r^r/5! - ...//// and//// 1 - r^2/2! + r^4/4! - ...//// are not very small when |r| is close to pi/4 and the rounding// errors will be a concern if simple polynomial accumulation is// used. When |r| < 2^-3, however, the second terms will be small// enough (6 bits or so of right shift) that a normal Horner// recurrence suffices. Hence there are two cases that we consider// in the accurate computation of sin(r) and cos(r), |r| <= pi/4.//// Case small_r: |r| < 2^(-3)// --------------------------//// Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1],// we have//// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0// = (-1)^i_0 * cos(r + c) if i_1 = 1//// can be accurately approximated by//// sin(Arg) = (-1)^i_0 * [sin(r) + c] if i_1 = 0// = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1//// because |r| is small and thus the second terms in the correction// are unneccessary.//// Finally, sin(r) and cos(r) are approximated by polynomials of// moderate lengths.//// sin(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11// cos(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10//// We can make use of predicates to selectively calculate// sin(r) or cos(r) based on i_1.//// Case normal_r: 2^(-3) <= |r| <= pi/4// ------------------------------------//// This case is more likely than the previous one if one considers// r to be uniformly distributed in [-pi/4 pi/4]. Again,//// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0// = (-1)^i_0 * cos(r + c) if i_1 = 1.//// Because |r| is now larger, we need one extra term in the// correction. sin(Arg) can be accurately approximated by//// sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)] if i_1 = 0// = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)] i_1 = 1.//// Finally, sin(r) and cos(r) are approximated by polynomials of// moderate lengths.//// sin(r) = r + PP_1_hi r^3 + PP_1_lo r^3 +// PP_2 r^5 + ... + PP_8 r^17//// cos(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16//// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.// The crux in accurate computation is to calculate//// r + PP_1_hi r^3 or 1 + QQ_1 r^2//// accurately as two pieces: U_hi and U_lo. The way to achieve this// is to obtain r_hi as a 10 sig. bit number that approximates r to// roughly 8 bits or so of accuracy. (One convenient way is//// r_hi := frcpa( frcpa( r ) ).)//// This way,//// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 +// PP_1_hi (r^3 - r_hi^3)// = [r + PP_1_hi r_hi^3] +// [PP_1_hi (r - r_hi)// (r^2 + r_hi r + r_hi^2) ]// = U_hi + U_lo//// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign// and that there is no more than 8 bit shift off between r and// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus// calculated without any error. Finally, the fact that//// |U_lo| <= 2^(-8) |U_hi|//// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly// 8 extra bits of accuracy.//// Similarly,//// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] +// [QQ_1 (r - r_hi)(r + r_hi)]// = U_hi + U_lo.//// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).//// If i_1 = 0, then//// U_hi := r + PP_1_hi * r_hi^3// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17// correction := c * ( 1 + C_1 r^2 )//// Else ...i_1 = 1//// U_hi := 1 + QQ_1 * r_hi * r_hi// U_lo := QQ_1 * (r - r_hi) * (r + r_hi)// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16// correction := -c * r * (1 + S_1 * r^2)//// End//// Finally,//// V := poly + ( U_lo + correction )//// / U_hi + V if i_0 = 0// result := |// \ (-U_hi) - V if i_0 = 1//// It is important that in the last step, negation of U_hi is// performed prior to the subtraction which is to be performed in// the user-set rounding mode.////// Algorithmic Description// =======================//// The argument reduction algorithm is tightly integrated into FSIN// and FCOS which share the same code. The following is complete and// self-contained. The argument reduction description given// previously is repeated below.////// Step 0. Initialization.//// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,// set N_inc := 1.//// Step 1. Check for exceptional and special cases.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -