📄 math.asm
字号:
atan Proc
movd MM7, [sgn] ; mask to extract sign bit
movd MM5, [mabs] ; mask to clear sign bit
pand MM7, MM0 ; sign(x)
movd MM6, [one] ; 1.0
pand MM0, MM5 ; z=abs(x)
pcmpgtd MM6, MM0 ; z < 1 ? 0xffffffff : 0
pfrcp MM2, MM0 ; 1/z approx
movq MM1, MM0 ; save z
pfrcpit1 MM0, MM2 ; 1/z step
movd MM3, [qq2] ; qq2
pfrcpit2 MM0, MM2 ; 1/z final
movd MM4, [pp2] ; pp2
pfmin MM0, MM1 ; z = z < 1 ? z : 1/z
movq MM1, MM0 ; save z
pfmul MM0, MM0 ; z^2
movd MM5, [pp1] ; pp1
pfadd MM3, MM0 ; z^2 + qq2
pfmul MM4, MM0 ; pp2 * z^2
movd MM2, [qq1] ; qq1
pfmul MM3, MM0 ; (z^2 + qq2) * z^2
pfadd MM4, MM5 ; p2 * z^2 + pp1
movd MM5, [pp0] ; pp0
pfadd MM3, MM2 ; (z^2 + qq2) * z^2 + qq1
pfmul MM4, MM0 ; (p2 * z^2 + pp1) * z^2
movd MM2, [qq0] ; qq0
pfmul MM3, MM0 ; ((z^2 + qq2) * z^2 + qq1) * z^2
pfadd MM4, MM5 ; (p2 * z^2 + pp1) * z^2 + pp0
pfadd MM3, MM2 ; qx=((z^2 + qq2) * z^2 + qq1) * z^2 + qq0
pfmul MM0, MM4 ; ((p2 * z^2 + pp1) * z^2 + pp0) * z^2
pfmul MM0, MM1 ; px*z^3=((p2 * z^2 + pp1) * z^2 + pp0) * z^3
pfrcp MM5, MM3 ; 1/qx approx
movd MM4, [pio2] ; pi/2
pfrcpit1 MM3, MM5 ; 1/qx step
pfrcpit2 MM3, MM5 ; 1/qx final
pfmul MM0, MM3 ; z^3*px/qx
pfadd MM0, MM1 ; res=z + z^3 * px/qx
pfsub MM4, MM0 ; pi/2-res
pandn MM6, MM4 ; z < 1 ? 0 : pi/2-res
pfmax MM0, MM6 ; atan(abs(x)) = z < 1 ? res : pi/2-res
por MM0, MM7 ; atan(x)=sign(x)*atan(abs(x))
ret
atan EndP
;******************************************************************************
; Routine: a_log
; Input: mm0.lo
; Result: mm0.lo
; Uses: mm0-mm7
; Comment:
; Compute log(abs(x)) using MMX and 3DNow! instructions. Scalar version.
;
; If the input has an exponent of 0xFF, the result of this routine
; is undefined. Inputs with an exponent of 0 are treated as true
; zeros and return a result of (- max_normal). Underflow or over-
; flow can not occur otherwise.
;
; The input x = 2^k * m, thus the natural logarithm is log(2^k) +
; log(m) = k*log(2) + log(m). Here, m is chosen such than m is <
; sqrt(2). Then, log(m) = 2*artanh(m+1)/(m-1). A polynomial minimax
; approximation is used to compute artanh(z). k*log(2) is computed
; with increased precision by splitting the constant log(2) into a
; 16-bit high-order part and a 24-bit low-order part. The product
; of the high-order part and k is exactly representable.
;
; Testing shows that this function has an error of less than 2.42
; single precision ulps.
;
;******************************************************************************
Align 16
;Public _a_log
log Proc
movd MM6, [mant] ; mask for mantissa
movq MM4, MM0 ; save x
movd MM2, [expo] ; mask for exponent
pand MM0, MM6 ; extract mantissa of x => m
movd MM3, [one] ; 1.0
pand MM4, MM2 ; extract biased exponent of x => e
por MM0, MM3 ; float(m)
movd MM3, [rt2] ; sqrt(2)
psrld MM4, 23 ; biased exponent e
movq MM2, MM0 ; save m
pxor MM5, MM5 ; create 0
movd MM6, [edec] ; 0x0080000
pcmpgtd MM0, MM3 ; m > sqrt(2) ? 0xFFFFFFFF : 0
pcmpeqd MM5, MM4 ; sel = (e == 0) ? 0xFFFFFFFFL : 0
psubd MM4, MM0 ; increment e if m > sqrt(2)
movd MM3, [bias] ; 127
pand MM0, MM6 ; m > sqrt(2) ? 0x00800000 : 0
movd MM6, [one] ; 1.0
psubd MM2, MM0 ; if m > sqrt(2), m = m/2
psubd MM4, MM3 ; true exponent = i
movq MM0, MM2 ; save m
pfadd MM2, MM6 ; m + 1
pfsub MM0, MM6 ; m - 1
pi2fd MM4, MM4 ; float(i)
movd MM7, [ln2lo]; lower 24 bits of ln(2)
pfrcp MM6, MM2 ; approx 1/mm+1) lo
movd MM3, [ln2hi]; upper 16 bits of ln(2)
pfrcpit1 MM2, MM6 ; refine 1/mm+1)
pfmul MM7, MM4 ; i*ln2lo
pfmul MM3, MM4 ; i*ln2hi
movd MM4, [c2] ; c2
pfrcpit2 MM2, MM6 ; 1/mm+1)
movd MM1, [c1] ; c1
pfmul MM2, MM0 ; z=mm+1)/mm-1)
movq MM0, MM2 ; save z
pfadd MM0, MM0 ; 2*z
pfmul MM2, MM2 ; z^2
movq MM6, MM2 ; save z^2
pfmul MM2, MM4 ; c2 * z^2
movd MM4, [c0] ; c0
pfadd MM2, MM1 ; c2 * z^2 + c1
pfmul MM2, MM6 ; (c2 * z^2 + c1) * z^2
movd MM1, [maxn] ; maxn (negative largest normal)
pfmul MM6, MM0 ; 2*z^3
pfadd MM2, MM4 ; px = (c2 * z^2 + c1) * z^2 + c0
pfmul MM2, MM6 ; px*2*z^3
pfadd MM2, MM0 ; px*2*z^3+2*z
movq MM0, MM5 ; sel
pand MM5, MM1 ; select largest negative normal if e = 0
pfadd MM2, MM7 ; px*2*z^3+2*z+i*ln2lo
pfadd MM2, MM3 ; ln(z)=px*2*z^3+2*z+i*ln2lo+i*ln2hi
pandn MM0, MM2 ; select regular result if e != 0
por MM0, MM5 ; mux in either normal or special result
ret
log EndP
;******************************************************************************
; Routine: a_log10
; Input: mm0
; Result: mm0
; Uses: mm0-mm7, eax, ecx, edx
; Comment:
; See a_log for the details of operation, this routine merely
; converts the result into a log base 10 by way of the mathematical
; identity log10(x) = ln(x)/ln(10).
;
; Note that due to register contention/latancy issues, calling
; _a_log() is not noticeably worse than inlining the function.
;
;******************************************************************************
Align 16
;Public _a_log10
log10 Proc
call log ; mm0 = ln(x)
movd MM1, [rle10]; mm1 = 1/ln(10)
pfmul MM0, MM1 ; mm0 = ln(x) / ln(10)
ret
log10 EndP
;******************************************************************************
; Routine: a_sincos (a_cos)
; Input: mm0.lo
; Result: mm0 (cos|sin)
; Uses: mm0-mm7, eax, ebx, ecx, edx, esi
; Comment:
; sincos computes both sin and cos simultaneously.
; Since the cos value is returned in mm0.lo, the "a_cos()" routine
; is really an alias for a_sincos(). The only difficulty is that
; a_cos does not unpack its result into both halves of mm0. Since
; most usages don't need the vectorization, the instruction has been
; left out.
;******************************************************************************
Align 16
;Public _a_sincos, _a_cos
;_a_cos:
cos Proc
SINCOSMAC ; mm0 = cos(x) | sin(x)
nop ; K7 doesn't like branching to a 'ret'
ret
cos EndP
;******************************************************************************
; Routine: a_sin
; Input: mm0.lo
; Result: mm0 (sin|sin)
; Uses: mm0-mm7, eax, ebx, ecx, edx, esi
;******************************************************************************
Align 16
;Public _a_sin
sin Proc
SINCOSMAC
punpckhdq MM0,MM0 ; select sin value
ret
sin EndP
;******************************************************************************
; Routine: a_tan
; Input: mm0.lo
; Result: mm0 = tan|???
; Uses: mm0-mm7, eax, ebx, ecx, edx, esi
; Comment:
; Yet another spawn of the SINCOSMAC macro, a_atan computes the arctangent
; of its input parameter using the definition tan(x) = sin(x) / cos(x).
;******************************************************************************
Align 16
;Public _a_tan
tan Proc
SINCOSMAC
movq MM1,MM0 ; mm1 = sincos(x)
pfrcp MM2,MM0 ; mm2 = 1/cos(x), stage 1
punpckhdq MM1,MM1 ; mm1 = sin(x)
pfrcpit1 MM0,MM2 ; mm0 = 1/cos(x), stage 2
pfrcpit2 MM0,MM2 ; mm0 = 1/cos(x), stage 3
pfmul MM0,MM1 ; tan(x) = sin(x) / cos(x)
ret
tan EndP
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -