📄 s_cosl.s
字号:
(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 nop.i 999}{ .mfi nop.m 999(p10) fma.s0 FR_C_1 = FR_C_1, FR_C_1, f0 nop.i 999 ;;}{ .mfi nop.m 999(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 nop.i 999}{ .mfi nop.m 999//// poly_lo = FR_rsq * poly_lo + S_3// poly_hi = FR_rsq * poly_hi//(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1 == 0): dummy fmpy's to flag inexact// r = 1//(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 nop.i 999}{ .mfi nop.m 999//// poly_hi = r * poly_hi //(p0) fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c nop.i 999 ;;}{ .mfi nop.m 999(p12) fms.s1 FR_r = f0, f1, FR_r nop.i 999 ;;}{ .mfi nop.m 999//// poly_hi = Z * poly_lo + c // if i_0 == 1: r = -r //(p0) fma.s1 FR_poly = FR_poly, f1, FR_poly_hi nop.i 999 ;;}{ .mfi nop.m 999(p12) fms.s0 FR_Input_X = FR_r, f1, FR_poly nop.i 999}{ .mfb nop.m 999//// poly = poly + poly_hi //(p11) fma.s0 FR_Input_X = FR_r, f1, FR_poly//// if (i_0 == 0) Result = r + poly// if (i_0 != 0) Result = r - poly//(p0) br.ret.sptk b0 ;;}L(SINCOSL_NORMAL_R): { .mii nop.m 999(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;//// Set table_ptr1 and table_ptr2 to base address of// constant table.(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;}{ .mfi nop.m 999(p0) fma.s1 FR_rsq = FR_r, FR_r, f0(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;}{ .mfi nop.m 999(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0};;// ******************************************************************// ******************************************************************// ******************************************************************//// r and c have been computed.// We known whether this is the sine or cosine routine.// Make sure ftz mode is set - should be automatic when using wre// Get [i_0,i_1] - two lsb of N_fix_gr alone.//{ .mmi nop.m 999(p0) addl 
GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp nop.i 999};;{ .mmi ld8 GR_Table_Base = [GR_Table_Base] nop.m 999 nop.i 999};;{ .mfi(p10) add GR_Table_Base = 384, GR_Table_Base(p12) fms.s1 FR_Input_X = f0, f1, f1(p9) add GR_Table_Base = 224, GR_Table_Base ;;}{ .mfi(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16//// if (i_1==0) poly = poly * FR_rsq + PP_1_lo// else poly = FR_rsq * poly//(p11) fma.s1 FR_Input_X = f0, f1, f1 nop.i 999 ;;}{ .mmb(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16//// Adjust table pointers based on i_0 // Compute rsq = r * r//(p9) ldfe FR_PP_8 = [GR_Table_Base], 16 nop.b 999 ;;}{ .mfi nop.m 999(p0) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 nop.i 999 ;;}{ .mmf(p9) ldfe FR_PP_7 = [GR_Table_Base], 16(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16//// Load PP_8 and QQ_8; PP_7 and QQ_7//(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;}//// if (i_1==0) poly = PP_7 + FR_rsq * PP_8.// else poly = QQ_7 + FR_rsq * QQ_8.//{ .mmb(p9) ldfe FR_PP_6 = [GR_Table_Base], 16(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16 nop.b 999 ;;}{ .mmb(p9) ldfe FR_PP_5 = [GR_Table_Base], 16(p10) ldfe FR_S_1 = [GR_Table_Base], 16 nop.b 999 ;;}{ .mmb(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16(p9) ldfe FR_C_1 = [GR_Table_Base], 16 nop.b 999 ;;}{ .mmb(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16(p9) ldfe FR_PP_1 = [GR_Table_Base], 16 nop.b 999 ;;}{ .mmb(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16//// if (i_1=0) corr = corr + c*c// else corr = corr * c //(p9) ldfe FR_PP_4 = [GR_Table_Base], 16 nop.b 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 nop.i 999 ;;}//// if (i_1=0) poly = rsq * poly + PP_5 // else poly = rsq * poly + QQ_5 // Load PP_4 or QQ_4//{ .mmi(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 ;;(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16 nop.i 999}{ .mfi nop.m 999//// r_hi = frcpa(frcpa(r)).// r_cube = r * FR_rsq.//(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 nop.i 999 ;;}//// Do dummy multiplies so inexact is always set. 
//{ .mfi(p9) ldfe FR_PP_2 = [GR_Table_Base], 16//// r_lo = r - r_hi //(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0 nop.i 999 ;;}{ .mbb(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16 nop.b 999 nop.b 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1=0) U_lo = r_hi * r_hi// else U_lo = r_hi + r//(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1=0) corr = C_1 * rsq// else corr = S_1 * r_cubed + r//(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r nop.i 999}{ .mfi nop.m 999//// if (i_1=0) U_hi = r_hi + U_hi // else U_hi = QQ_1 * U_hi + 1//(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo nop.i 999 ;;}{ .mfi nop.m 999//// U_hi = r_hi * r_hi //(p0) fms.s1 FR_r_lo = FR_r, f1, FR_r_hi nop.i 999}{ .mfi nop.m 999//// Load PP_1, PP_6, PP_5, and C_1// Load QQ_1, QQ_6, QQ_5, and S_1//(p0) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0 nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 nop.i 999}{ .mfi nop.m 999(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1=0) U_lo = r * r_hi + U_lo // else U_lo = r_lo * U_lo//(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c nop.i 999 ;;}{ .mfi nop.m 999(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 nop.i 999}{ .mfi nop.m 999//// if (i_1 =0) U_hi = r + U_hi// if (i_1 =0) U_lo = r_lo * U_lo // //(p9) fma.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0 nop.i 999 ;;}{ .mfi nop.m 999(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1=0) poly = poly * rsq + PP_6// else poly = poly * rsq + QQ_6 //(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1 nop.i 999}{ 
.mfi nop.m 999(p10) fma.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1!=0) U_hi = PP_1 * U_hi // if (i_1!=0) U_lo = r * r + U_lo // Load PP_3 or QQ_3//(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 nop.i 999 ;;}{ .mfi nop.m 999(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 nop.i 999 ;;}{ .mfi nop.m 999(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 nop.i 999 ;;}{ .mfi nop.m 999//// Load PP_2, QQ_2//(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1==0) poly = FR_rsq * poly + PP_3// else poly = FR_rsq * poly + QQ_3// Load PP_1_lo//(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1 =0) poly = poly * rsq + pp_r4// else poly = poly * rsq + qq_r4//(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1==0) U_lo = PP_1_hi * U_lo// else U_lo = QQ_1 * U_lo//(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_0==0) Result = 1// else Result = -1//(p0) fma.s1 FR_V = FR_U_lo, f1, FR_corr nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1==0) poly = FR_rsq * poly + PP_2// else poly = FR_rsq * poly + QQ_2// (p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo nop.i 999 ;;}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 nop.i 999 ;;}{ .mfi nop.m 999//// V = U_lo + corr//(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 nop.i 999 ;;}{ .mfi nop.m 999//// if (i_1==0) poly = r_cube * poly// else poly = FR_rsq * poly//(p0) fma.s1 FR_V = FR_poly, f1, FR_V nop.i 999 ;;}{ .mfi nop.m 999(p12) fms.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V nop.i 999}{ .mfb nop.m 999//// V = V + poly //(p11) fma.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V//// if (i_0==0) Result 
= Result * U_hi + V// else Result = Result * U_hi - V//(p0) br.ret.sptk b0 };;//// If cosine, FR_Input_X = 1// If sine, FR_Input_X = +/-Zero (Input FR_Input_X)// Results are exact, no exceptions//L(SINCOSL_ZERO):{ .mbb(p0) cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos nop.b 999 nop.b 999 ;;}{ .mfi nop.m 999(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X nop.i 999}{ .mfb nop.m 999(p6) fmerge.s FR_Input_X = f1, f1(p0) br.ret.sptk b0 ;;}L(SINCOSL_SPECIAL):{ .mfb nop.m 999//// Path for Arg = +/- QNaN, SNaN, Inf// Invalid can be raised. SNaNs// become QNaNs//(p0) fmpy.s0 FR_Input_X = FR_Input_X, f0(p0) br.ret.sptk b0 ;;}.endp cosl#ASM_SIZE_DIRECTIVE(cosl#)

//
// __libm_callout
//
// Large-argument path, reached through L(SINCOSL_ARG_TOO_LARGE)
// (per the file's own note: |arguments| >= 2**63).
//
// Sequence:
//   1. Open a 64-byte frame and save ar.pfs, gp and b0.
//   2. Zero the r and c slots, store x, and call __libm_pi_by_2_reduce.
//   3. Reload x, r and c; set GR_N_Inc = GR_Sin_or_Cos + r8.
//   4. Reload the two single-precision bounds that sit at
//      GR_Table_Base1 - 16 and GR_Table_Base1 - 12.
//   5. Tear the frame down and branch:
//        FR_Neg_Two_to_M3 < r < FR_Two_to_M3  -> L(SINCOSL_SMALL_R)
//        otherwise                            -> L(SINCOSL_NORMAL_R)
//
// Frame layout (offsets from the new sp; every slot is written and
// read with stfe/ldfe):
//        sp+48   c   (cleared to f0 before the call)
//        sp+32   r   (cleared to f0 before the call)   <- r45
//        sp+16   x                                     <- r44
//
// NOTE(review): r44 and r45 are dereferenced again after the call, so
// this code assumes the callee leaves them intact -- confirm against
// __libm_pi_by_2_reduce.
// NOTE(review): r8 is presumably the callee's integer result -- confirm.
//
        .proc   __libm_callout
__libm_callout:
L(SINCOSL_ARG_TOO_LARGE):
        .prologue
{ .mfi
        add     r45 = -32, sp                   // r slot: entry sp - 32
        nop.f   0
        .save   ar.pfs, GR_SAVE_PFS
        mov     GR_SAVE_PFS = ar.pfs            // keep ar.pfs across the call
}
{ .mfi
        .fframe 64
        add     sp = -64, sp                    // open the 64-byte frame
        nop.f   0
        mov     GR_SAVE_GP = gp                 // keep gp across the call
};;

{ .mmi
        stfe    [r45] = f0, 16                  // r slot <- 0, step to c slot
        add     r44 = 16, sp                    // x slot: new sp + 16
        .save   b0, GR_SAVE_B0
        mov     GR_SAVE_B0 = b0                 // keep the return address
};;

        .body
{ .mib
        stfe    [r45] = f0, -16                 // c slot <- 0, step back to r slot
        nop.i   0
        nop.b   0
}
{ .mib
        stfe    [r44] = FR_Input_X              // x slot <- argument
        nop.i   0
        br.call.sptk b0 = __libm_pi_by_2_reduce#
};;

{ .mii
        ldfe    FR_Input_X = [r44], 16          // reload x
        adds    GR_Table_Base1 = -16, GR_Table_Base1  // step table pointer back 16 bytes
        add     GR_N_Inc = GR_Sin_or_Cos, r8 ;; // fold r8 into the sin/cos selector
}
{ .mmb
        ldfe    FR_r = [r45], 16                // reload r, step to c slot
        ldfs    FR_Two_to_M3 = [GR_Table_Base1], 4    // upper bound, step to next single
        nop.b   999 ;;
}
{ .mmb
        ldfs    FR_Neg_Two_to_M3 = [GR_Table_Base1], 0  // lower bound
        ldfe    FR_c = [r45]                    // reload c
        nop.b   999 ;;
}

{ .mfi
        .restore sp
        add     sp = 64, sp                     // close the frame
        fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3    // p6 = (r < upper bound)
        mov     b0 = GR_SAVE_B0                 // put the return address back
};;

{ .mib
        mov     gp = GR_SAVE_GP                 // put gp back
        mov     ar.pfs = GR_SAVE_PFS            // put ar.pfs back
        nop.b   0
};;

{ .mfi
        nop.m   999
(p6)    fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3  // p6 &= (r > lower bound)
        nop.i   999 ;;
}
{ .mib
        nop.m   999
        nop.i   999
(p6)    br.cond.spnt L(SINCOSL_SMALL_R) ;;      // r strictly between the bounds
}
{ .mib
        nop.m   999
        nop.i   999
        br.cond.sptk L(SINCOSL_NORMAL_R) ;;     // every other r
}
        .endp   __libm_callout
ASM_SIZE_DIRECTIVE(__libm_callout)

        .type   __libm_pi_by_2_reduce#,@function
        .global __libm_pi_by_2_reduce#
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -