📄 math.asm

📁 一个不错的用汇编语言编写的FFT算法程序
💻 ASM
📖 第 1 页 / 共 2 页
字号:
上一页 12
atan Proc
        movd        MM7, [sgn]  ; mask to extract sign bit
        movd        MM5, [mabs] ; mask to clear sign bit
        pand        MM7, MM0    ; sign(x)
        movd        MM6, [one]  ; 1.0
        pand        MM0, MM5    ; z=abs(x)
        pcmpgtd     MM6, MM0    ; z < 1 ? 0xffffffff : 0
        pfrcp       MM2, MM0    ; 1/z approx
        movq        MM1, MM0    ; save z
        pfrcpit1    MM0, MM2    ; 1/z step
        movd        MM3, [qq2]  ; qq2
        pfrcpit2    MM0, MM2    ; 1/z final
        movd        MM4, [pp2]  ; pp2
        pfmin       MM0, MM1    ; z = z < 1 ? z : 1/z
        movq        MM1, MM0    ; save z
        pfmul       MM0, MM0    ; z^2
        movd        MM5, [pp1]  ; pp1
        pfadd       MM3, MM0    ; z^2 + qq2
        pfmul       MM4, MM0    ; pp2 * z^2
        movd        MM2, [qq1]  ; qq1
        pfmul       MM3, MM0    ; (z^2 + qq2) * z^2
        pfadd       MM4, MM5    ; p2 * z^2 + pp1
        movd        MM5, [pp0]  ; pp0
        pfadd       MM3, MM2    ; (z^2 + qq2) * z^2 + qq1
        pfmul       MM4, MM0    ; (p2 * z^2 + pp1) * z^2
        movd        MM2, [qq0]  ; qq0
        pfmul       MM3, MM0    ; ((z^2 + qq2) * z^2 + qq1) * z^2
        pfadd       MM4, MM5    ; (p2 * z^2 + pp1) * z^2 + pp0
        pfadd       MM3, MM2    ; qx=((z^2 + qq2) * z^2 + qq1) * z^2 + qq0
        pfmul       MM0, MM4    ; ((p2 * z^2 + pp1) * z^2 + pp0) * z^2
        pfmul       MM0, MM1    ; px*z^3=((p2 * z^2 + pp1) * z^2 + pp0) * z^3
        pfrcp       MM5, MM3    ; 1/qx approx
        movd        MM4, [pio2] ; pi/2
        pfrcpit1    MM3, MM5    ; 1/qx step
        pfrcpit2    MM3, MM5    ; 1/qx final
        pfmul       MM0, MM3    ; z^3*px/qx
        pfadd       MM0, MM1    ; res=z + z^3 * px/qx 
        pfsub       MM4, MM0    ; pi/2-res
        pandn       MM6, MM4    ; z < 1 ? 0 : pi/2-res
        pfmax       MM0, MM6    ; atan(abs(x)) = z < 1 ? res : pi/2-res
        por         MM0, MM7    ; atan(x)=sign(x)*atan(abs(x))
        ret
atan EndP


;******************************************************************************
; Routine:  a_log
; Input:    mm0.lo
; Result:   mm0.lo
; Uses:     mm0-mm7
; Comment:
;   Compute log(abs(x)) using MMX and 3DNow! instructions. Scalar version.
;
;   If the input has an exponent of 0xFF, the result of this routine
;   is undefined. Inputs with an exponent of 0 are treated as true
;   zeros and return a result of (- max_normal). Underflow or over-
;   flow can not occur otherwise.
;
;   The input x = 2^k * m, thus the natural logarithm is log(2^k) +
;   log(m) = k*log(2) + log(m). Here, m is chosen such than m is <
;   sqrt(2). Then, log(m) = 2*artanh(m+1)/(m-1). A polynomial minimax
;   approximation is used to compute artanh(z). k*log(2) is computed
;   with increased precision by splitting the constant log(2) into a
;   16-bit high-order part and a 24-bit low-order part. The product
;   of the high-order part and k is exactly representable.
;
;   Testing shows that this function has an error of less than 2.42
;   single precision ulps.
;
;******************************************************************************
Align   16
;Public  _a_log
log  Proc
        movd        MM6, [mant] ; mask for mantissa                  
        movq        MM4, MM0    ; save x                            
        movd        MM2, [expo] ; mask for exponent      
        pand        MM0, MM6    ; extract mantissa of x => m 
        movd        MM3, [one]  ; 1.0                            
        pand        MM4, MM2    ; extract biased exponent of x => e 
        por         MM0, MM3    ; float(m)                            
        movd        MM3, [rt2]  ; sqrt(2)                             
        psrld       MM4, 23     ; biased exponent e               
        movq        MM2, MM0    ; save m                    
        pxor        MM5, MM5    ; create 0                     
        movd        MM6, [edec] ; 0x0080000                         
        pcmpgtd     MM0, MM3    ; m > sqrt(2) ? 0xFFFFFFFF : 0        
        pcmpeqd     MM5, MM4    ; sel = (e == 0) ? 0xFFFFFFFFL : 0
        psubd       MM4, MM0    ; increment e if m > sqrt(2)          
        movd        MM3, [bias] ; 127                             
        pand        MM0, MM6    ; m > sqrt(2) ? 0x00800000 : 0
        movd        MM6, [one]  ; 1.0                                
        psubd       MM2, MM0    ; if m > sqrt(2),  m = m/2      
        psubd       MM4, MM3    ; true exponent = i                  
        movq        MM0, MM2    ; save m                             
        pfadd       MM2, MM6    ; m + 1
        pfsub       MM0, MM6    ; m - 1                          
        pi2fd       MM4, MM4    ; float(i)
        movd        MM7, [ln2lo]; lower 24 bits of ln(2)
        pfrcp       MM6, MM2    ; approx 1/mm+1) lo
        movd        MM3, [ln2hi]; upper 16 bits of ln(2)
        pfrcpit1    MM2, MM6    ; refine 1/mm+1) 
        pfmul       MM7, MM4    ; i*ln2lo
        pfmul       MM3, MM4    ; i*ln2hi
        movd        MM4, [c2]   ; c2             
        pfrcpit2    MM2, MM6    ; 1/mm+1)        
        movd        MM1, [c1]   ; c1        
        pfmul       MM2, MM0    ; z=mm+1)/mm-1)                         
        movq        MM0, MM2    ; save z                            
        pfadd       MM0, MM0    ; 2*z
        pfmul       MM2, MM2    ; z^2                             
        movq        MM6, MM2    ; save z^2                             
        pfmul       MM2, MM4    ; c2 * z^2                          
        movd        MM4, [c0]   ; c0               
        pfadd       MM2, MM1    ; c2 * z^2 + c1        
        pfmul       MM2, MM6    ; (c2 * z^2 + c1) * z^2
        movd        MM1, [maxn] ; maxn (negative largest normal)             
        pfmul       MM6, MM0    ; 2*z^3
        pfadd       MM2, MM4    ; px = (c2 * z^2 + c1) * z^2 + c0
        pfmul       MM2, MM6    ; px*2*z^3
        pfadd       MM2, MM0    ; px*2*z^3+2*z
        movq        MM0, MM5    ; sel                               
        pand        MM5, MM1    ; select largest negative normal if e = 0
        pfadd       MM2, MM7    ; px*2*z^3+2*z+i*ln2lo
        pfadd       MM2, MM3    ; ln(z)=px*2*z^3+2*z+i*ln2lo+i*ln2hi
        pandn       MM0, MM2    ; select regular result if e != 0
        por         MM0, MM5    ; mux in either normal or special result
        ret
log  EndP


;******************************************************************************
; Routine:  a_log10
; Input:    mm0
; Result:   mm0
; Uses:     mm0-mm7, eax, ecx, edx
; Comment:
;   See a_log for the details of operation, this routine merely
;   converts the result into a log base 10 by way of the mathematical
;   identity log10(x) = ln(x)/ln(10).
;
;   Note that due to register contention/latancy issues, calling
;   _a_log() is not noticeably worse than inlining the function.
;
;******************************************************************************
Align   16
;Public  _a_log10
log10    Proc
        call        log      ; mm0 = ln(x)
        movd        MM1, [rle10]; mm1 = 1/ln(10)
        pfmul       MM0, MM1    ; mm0 = ln(x) / ln(10)
        ret
log10    EndP

;******************************************************************************
; Routine:  a_sincos (a_cos)
; Input:    mm0.lo
; Result:   mm0 (cos|sin)
; Uses:     mm0-mm7, eax, ebx, ecx, edx, esi
; Comment:
;   sincos computes both sin and cos simultaneously.
;   Since the cos value is returned in mm0.lo, the "a_cos()" routine
;   is really an alias for a_sincos().  The only difficulty is that
;   a_cos does not unpack its result into both halves of mm0.  Since
;   most usages don't need the vectorization, the instruction has been
;   left out.
;******************************************************************************
Align   16
;Public  _a_sincos, _a_cos
;_a_cos:
cos   Proc
        SINCOSMAC               ; mm0 = cos(x) | sin(x)
        nop                     ; K7 doesn't like branching to a 'ret'
        ret
cos   EndP



;******************************************************************************
; Routine:  a_sin
; Input:    mm0.lo
; Result:   mm0 (sin|sin)
; Uses:     mm0-mm7, eax, ebx, ecx, edx, esi
;******************************************************************************
Align   16
;Public  _a_sin
sin  Proc
        SINCOSMAC
        punpckhdq   MM0,MM0     ; select sin value
        ret
sin  EndP



;******************************************************************************
; Routine:  a_tan
; Input:    mm0.lo
; Result:   mm0 = tan|???
; Uses:     mm0-mm7, eax, ebx, ecx, edx, esi
; Comment:
;   Yet another spawn of the SINCOSMAC macro, a_atan computes the arctangent
;   of its input parameter using the definition  tan(x) = sin(x) / cos(x).
;******************************************************************************
Align   16
;Public  _a_tan
tan  Proc
        SINCOSMAC
        movq        MM1,MM0     ; mm1 = sincos(x)
        pfrcp       MM2,MM0     ; mm2 = 1/cos(x), stage 1
        punpckhdq   MM1,MM1     ; mm1 = sin(x)
        pfrcpit1    MM0,MM2     ; mm0 = 1/cos(x), stage 2
        pfrcpit2    MM0,MM2     ; mm0 = 1/cos(x), stage 3
        pfmul       MM0,MM1     ; tan(x) = sin(x) / cos(x)
        ret
tan  EndP
上一页 12
💿 文件大小 33 K
👤 上传用户 pangbo888
📂 所属分类汇编语言
🏷️ 相关标签

#FFT #汇编语言 #编写 #算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -