📄 e_acosl.s
字号:
ld8 r3 = [r3] mov R_TMP = 0x3fbf;;}{.mmi add r2 = 64, r3 ldfe F_C3 = [r3], 16 // p7 = 1 if |s|<2^{-64} (exponent of s<bias-64) cmp.lt p7, p0 = R_EXP0, R_TMP;;}{.mmf ldfe F_C5 = [r3], 16 ldfpd F_C11, F_C13 = [r2], 16 nop.f 0;;}{.mmf ldfpd F_C7, F_C9 = [r3], 16 ldfpd F_C15, F_C17 = [r2] nop.f 0;;}{.mfb // load pi/2 ldfpd F_PI2_LO, F_PI2_HI = [r3] // s^2 fma.s1 F_R2 = f8, f8, f0 // |s|<2^{-64} (p7) br.cond.spnt RETURN_PI2;;}{.mfi nop.m 0 // s^3 fma.s1 F_R3 = f8, F_R2, f0 nop.i 0}{.mfi nop.m 0 // s^4 fma.s1 F_R4 = F_R2, F_R2, f0 nop.i 0;;}{.mfi nop.m 0 // c3+c5*s^2 fma.s1 F_P35 = F_C5, F_R2, F_C3 nop.i 0}{.mfi nop.m 0 // c11+c13*s^2 fma.s1 F_P1113 = F_C13, F_R2, F_C11 nop.i 0;;}{.mfi nop.m 0 // c7+c9*s^2 fma.s1 F_P79 = F_C9, F_R2, F_C7 nop.i 0}{.mfi nop.m 0 // c15+c17*s^2 fma.s1 F_P1517 = F_C17, F_R2, F_C15 nop.i 0;;}{.mfi nop.m 0 // (pi/2)_high-s_high fnma.s1 F_T = f8, f1, F_PI2_HI nop.i 0}{.mfi nop.m 0 // s^8 fma.s1 F_R8 = F_R4, F_R4, f0 nop.i 0;;}{.mfi nop.m 0 // c3+c5*s^2+c7*s^4+c9*s^6 fma.s1 F_P39 = F_P79, F_R4, F_P35 nop.i 0}{.mfi nop.m 0 // c11+c13*s^2+c15*s^4+c17*s^6 fma.s1 F_P1117 = F_P1517, F_R4, F_P1113 nop.i 0;;}{.mfi nop.m 0 // -s_high fms.s1 F_S = F_T, f1, F_PI2_HI nop.i 0;;}{.mfi nop.m 0 // c3+..+c17*s^14 fma.s1 F_P317 = F_R8, F_P1117, F_P39 nop.i 0;;}{.mfi nop.m 0 // s_low fma.s1 F_DS = f8, f1, F_S nop.i 0;;}{.mfi nop.m 0 // (pi/2)_low-s^3*(c3+..+c17*s^14) fnma.s0 F_P317 = F_P317, F_R3, F_PI2_LO nop.i 0;;}{.mfi nop.m 0 // (pi/2)_low-s_low-s^3*(c3+..+c17*s^14) fms.s1 F_P317 = F_P317, f1, F_DS nop.i 0;;}{.mfb nop.m 0 // result: pi/2-s-c3*s^3-..-c17*s^17 fma.s0 f8 = F_T, f1, F_P317 br.ret.sptk b0;;}RETURN_PI2:{.mfi nop.m 0 // (pi/2)_low-s fms.s0 F_PI2_LO = F_PI2_LO, f1, f8 nop.i 0;;}{.mfb nop.m 0 // (pi/2)-s fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO br.ret.sptk b0;;}VERY_LARGE_INPUT:{.mmf // pointer to pi_low, pi_high add r2 = 80, r3 // load C5 ldfe F_C5 = [r3], 16 // x = ((1-(s^2)_s)*y^2-1)/2-(s^2-(s^2)_s)*y^2/2 fma.s1 F_X = F_X, F_05, f0;;}.pred.rel "mutex", 
p6, p11{.mmf // load pi (low, high), if s<0 (p6) ldfpd F_PI2_LO, F_PI2_HI = [r2] // C7, C9 ldfpd F_C7, F_C9 = [r3], 16 // if s>0, set F_PI2_LO=0 (p11) fma.s1 F_PI2_HI = f0, f0, f0;;}{.mfi nop.m 0 (p11) fma.s1 F_PI2_LO = f0, f0, f0 nop.i 0;;}{.mfi // adjust address for C_11 add r3 = 16, r3 // c9*x+c8 fma.s1 F_S89 = F_X, F_CS9, F_CS8 nop.i 0}{.mfi nop.m 0 // x^2 fma.s1 F_X2 = F_X, F_X, f0 nop.i 0;;}{.mfi nop.m 0 // y*(1-s^2)*x fma.s1 F_Y1S2X = F_Y1S2, F_X, f0 nop.i 0}{.mfi // C11, C13 ldfpd F_C11, F_C13 = [r3], 16 // c7*x+c6 fma.s1 F_S67 = F_X, F_CS7, F_CS6 nop.i 0;;}{.mfi // C15, C17 ldfpd F_C15, F_C17 = [r3], 16 // c3*x+c2 fma.s1 F_S23 = F_X, F_CS3, F_CS2 nop.i 0;;}{.mfi nop.m 0 // c5*x+c4 fma.s1 F_S45 = F_X, F_CS5, F_CS4 nop.i 0;;}{.mfi nop.m 0 // y*(1-s^2)*x^2 fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0 nop.i 0}{.mfi nop.m 0 // x^4 fma.s1 F_X4 = F_X2, F_X2, f0 nop.i 0;;}{.mfi nop.m 0 // c9*x^3+..+c6 fma.s1 F_S69 = F_X2, F_S89, F_S67 nop.i 0;;}{.mfi nop.m 0 // c5*x^3+..+c2 fma.s1 F_S25 = F_X2, F_S45, F_S23 nop.i 0;;}{.mfi nop.m 0 // (pi)_high-y*(1-s^2)_s fnma.s1 F_HI = F_Y, F_1S2_S, F_PI2_HI nop.i 0;;}{.mfi nop.m 0 // c9*x^7+..+c2 fma.s1 F_S29 = F_X4, F_S69, F_S25 nop.i 0;;}{.mfi nop.m 0 // -(y*(1-s^2)_s)_high fms.s1 F_1S2_HI = F_HI, f1, F_PI2_HI nop.i 0;;}{.mfi nop.m 0 // (PS29*x^2+x)*y*(1-s^2) fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X nop.i 0;;}{.mfi nop.m 0 // y*(1-s^2)_s-(y*(1-s^2))_high fma.s1 F_DS2 = F_Y, F_1S2_S, F_1S2_HI nop.i 0;;}{.mfi nop.m 0 // R ~ sqrt(1-s^2) // (used for polynomial evaluation) fnma.s1 F_R = F_S19, f1, F_Y1S2 nop.i 0;;}{.mfi nop.m 0 // y*(1-s^2)-(y*(1-s^2))_high fma.s1 F_DS2 = F_Y, F_DS, F_DS2 nop.i 0}{.mfi nop.m 0 // (pi)_low+(PS29*x^2)*y*(1-s^2) fma.s1 F_S29 = F_Y1S2X2, F_S29, F_PI2_LO nop.i 0;;}{.mfi nop.m 0 // R^2 fma.s1 F_R2 = F_R, F_R, f0 nop.i 0;;}{.mfi nop.m 0 // if s<0 // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high) fms.s1 F_S29 = F_S29, f1, F_DS2 nop.i 0;;}{.mfi nop.m 0 // c7+c9*R^2 fma.s1 F_P79 = F_C9, F_R2, F_C7 nop.i 
0}{.mfi nop.m 0 // c3+c5*R^2 fma.s1 F_P35 = F_C5, F_R2, F_C3 nop.i 0;;}{.mfi nop.m 0 // R^4 fma.s1 F_R4 = F_R2, F_R2, f0 nop.i 0}{.mfi nop.m 0 // R^3 fma.s1 F_R3 = F_R2, F_R, f0 nop.i 0;;}{.mfi nop.m 0 // c11+c13*R^2 fma.s1 F_P1113 = F_C13, F_R2, F_C11 nop.i 0}{.mfi nop.m 0 // c15+c17*R^2 fma.s1 F_P1517 = F_C17, F_R2, F_C15 nop.i 0;;}{.mfi nop.m 0 // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high)+y*(1-s^2)*x fma.s1 F_S29 = F_Y1S2, F_X, F_S29 nop.i 0;;}{.mfi nop.m 0 // c11+c13*R^2+c15*R^4+c17*R^6 fma.s1 F_P1117 = F_P1517, F_R4, F_P1113 nop.i 0}{.mfi nop.m 0 // c3+c5*R^2+c7*R^4+c9*R^6 fma.s1 F_P39 = F_P79, F_R4, F_P35 nop.i 0;;}{.mfi nop.m 0 // R^8 fma.s1 F_R8 = F_R4, F_R4, f0 nop.i 0;;}{.mfi nop.m 0 // c3+c5*R^2+c7*R^4+c9*R^6+..+c17*R^14 fma.s1 F_P317 = F_P1117, F_R8, F_P39 nop.i 0;;}{.mfi nop.m 0 // (pi)_low-(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)- // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17 fnma.s1 F_S29 = F_P317, F_R3, F_S29 nop.i 0;;}.pred.rel "mutex", p6, p11{.mfi nop.m 0 // Result (if s<0): // (pi)_low-(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)- // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17 // +(pi)_high-(y*(1-s^2))_high (p6) fma.s0 f8 = F_S29, f1, F_HI nop.i 0}{.mfb nop.m 0 // Result (if s>0): // (PS29*x^2)*y*(1-s^2)- // -y*(1-s^2)*x + P3, 17 // +(y*(1-s^2)) (p11) fms.s0 f8 = F_Y, F_1S2_S, F_S29 br.ret.sptk b0;;}acosl_SPECIAL_CASES:{.mfi alloc r32 = ar.pfs, 1, 4, 4, 0 // check if the input is a NaN, or unsupported format // (i.e. not infinity or normal/denormal) fclass.nm p7, p8 = f8, 0x3f // pointer to pi/2 add r3 = 96, r3;;}{.mfi // load pi/2 ldfpd F_PI2_HI, F_PI2_LO = [r3] // get |s| fmerge.s F_S = f0, f8 nop.i 0}{.mfb nop.m 0 // if NaN, quietize it, and return (p7) fma.s0 f8 = f8, f1, f0 (p7) br.ret.spnt b0;;}{.mfi nop.m 0 // |s| = 1 ? 
// Continuation of the bundle opened at the end of the previous line.
// F_S holds |s| (built with fmerge.s earlier in this routine).
// p9 = (|s| == 1), p10 = complement.
       fcmp.eq.s0 p9, p10 = F_S, f1
       nop.i 0
}

{.mfi
       nop.m 0
       // FR_X = copy of the input argument (f8 * 1 + 0); stored on the
       // stack as "Parameter 1" by __libm_error_region below
       fma.s1 FR_X = f8, f1, f0
       // load error tag
       mov GR_Parameter_TAG = 57;;
}

{.mfi
       nop.m 0
       // if |s| = 1, start from a result of 0 (correct for s = 1)
 (p9)  fma.s0 f8 = f0, f0, f0
       // r0 != r0 is always false, so this clears p6 when |s| != 1;
       // the p6-predicated write below is then skipped on that path
 (p10) cmp.ne p6, p0 = r0, r0;;
}

{.mfb
       nop.m 0
       // if s = -1, result is pi (sum of the two constants loaded earlier
       // in this routine).
       // NOTE(review): p6 is set outside this excerpt; the original
       // comments describe it as the "s < 0" predicate - confirm.
 (p6)  fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO
       // return if |s| = 1
 (p9)  br.ret.sptk b0;;
}

{.mfi
       nop.m 0
       // get Infinity (reciprocal approximation of 1/0)
       frcpa.s1 FR_RESULT, p0 = f1, f0
       nop.i 0;;
}

{.mfb
       nop.m 0
       // return QNaN indefinite (0*Infinity)
       fma.s0 FR_RESULT = f0, FR_RESULT, f0
       nop.b 0;;
}

// No branch is taken after the bundle above, so the remaining path
// (|s| != 1, not NaN) appears to continue straight into
// __libm_error_region.
// NOTE(review): the GLOBAL_LIBM_END / LOCAL_LIBM_ENTRY macro expansions
// are not visible here - confirm they emit no code in between.
GLOBAL_LIBM_END(acosl)


//----------------------------------------------------------------------
// __libm_error_region
//
// Error-reporting stub. It opens a 64-byte stack frame, stores three
// extended-precision values into it, calls __libm_error_support, then
// reloads f8 from the frame and returns to the original caller.
//
// Inputs (set up by the code above):
//   FR_X               - copy of the original argument
//   FR_RESULT          - default result (QNaN indefinite)
//   GR_Parameter_TAG   - error tag (57)
//
// Frame layout relative to the new sp (sp after "add sp=-64,sp"):
//   sp+16  Parameter 1 : FR_X
//   sp+32  Parameter 2 : f1 (placeholder value stored by this stub)
//   sp+48  Parameter 3 : FR_RESULT; read back into f8 after the call
//
// Saved across the call: ar.pfs -> GR_SAVE_PFS, gp -> GR_SAVE_GP,
// b0 -> GR_SAVE_B0.
//----------------------------------------------------------------------
LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue

// (1) Compute the Parameter 2 slot, save ar.pfs and gp, open the frame
{ .mfi
        add   GR_Parameter_Y=-32,sp             // Parameter 2 address (old sp - 32)
        nop.f 0
.save   ar.pfs,GR_SAVE_PFS
        mov   GR_SAVE_PFS=ar.pfs                // Save ar.pfs
}
{ .mfi
.fframe 64
        add   sp=-64,sp                         // Create new stack frame
        nop.f 0
        mov   GR_SAVE_GP=gp                     // Save gp
};;

// (2) Store Parameter 2, compute the Parameter 1 address, save b0
{ .mmi
        stfe  [GR_Parameter_Y] = f1,16          // Store Parameter 2; pointer advances by 16
        add   GR_Parameter_X = 16,sp            // Parameter 1 address
.save   b0, GR_SAVE_B0
        mov   GR_SAVE_B0=b0                     // Save b0
};;

.body
// (3) Store Parameters 1 and 3, then call the error handler
{ .mib
        stfe  [GR_Parameter_X] = FR_X           // Store Parameter 1 on stack
        add   GR_Parameter_RESULT = 0,GR_Parameter_Y   // Parameter 3 address
        nop.b 0
}
{ .mib
        stfe  [GR_Parameter_Y] = FR_RESULT      // Store Parameter 3 on stack
        add   GR_Parameter_Y = -16,GR_Parameter_Y      // Point back at Parameter 2
        br.call.sptk b0=__libm_error_support#   // Call error handling function
};;
{ .mmi
        nop.m 0
        nop.m 0
        add   GR_Parameter_RESULT = 48,sp       // Recompute the Parameter 3 address
};;

// (4) Fetch the result, tear down the frame, restore state and return
{ .mmi
        ldfe  f8 = [GR_Parameter_RESULT]        // Get return result off stack
.restore sp
        add   sp = 64,sp                        // Restore stack pointer
        mov   b0 = GR_SAVE_B0                   // Restore return address
};;
{ .mib
        mov   gp = GR_SAVE_GP                   // Restore gp
        mov   ar.pfs = GR_SAVE_PFS              // Restore ar.pfs
        br.ret.sptk b0                          // Return
};;

LOCAL_LIBM_END(__libm_error_region)

.type   __libm_error_support#,@function
.global __libm_error_support#
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -