; File: math.asm
; (code-viewer residue: "Font size:" control label)
;******************************************************************************
; EQU - equates and compile time constants
;******************************************************************************
; Raw 32-bit patterns in IEEE-754 single-precision layout, usable as immediates.
ML2H = 0bf318000h ;; bit pattern of -0.693359375f (presumably the high part of -ln(2) -- confirm at the use site)
MASKSMH = 0807fffffh ;; mask for sign and significand
MASKSH = 080000000h ;; mask for sign bit
;******************************************************************************
; DATA - segment and definitions
;******************************************************************************
.DATA
Align 8
;******************************************************************************
; Vector (3DNow!) data
; Each DQ packs two 32-bit values. In the "a, b" comments below the first
; value is the low dword; in the "a | b" comments the first value is the
; high dword.
;******************************************************************************
PMOne DQ 0BF8000003F800000h ; real4 1.0, -1.0 (low, high)
HalfVal DQ 03F0000003F000000h ; real4 0.5, 0.5
HalfMin DQ 0BF0000003F000000h ; real4 0.5, -0.5f (low, high)
ones DQ 03F8000003F800000h ; real4 1.0, 1.0
twos DQ 04000000040000000h ; real4 2.0, 2.0
pinfs DQ 07f8000007f800000h ; dword 07f800000h in both halves (+infinity bit pattern)
smh_masks DQ 0807fffff807fffffh ; dword MASKSMH, MASKSMH
sign_mask DQ 07fffffff7fffffffh ; all bits except the sign bit, both halves (AND with it clears the sign)
sh_masks DQ 08000000080000000h ; dword MASKSH, MASKSH
two_126s DQ 00000007E0000007Eh ; dword 126, 126
negh_mask DQ 08000000000000000h ; dword 0, 080000000h (pxor same as pfmul by PMOne)
negh_mask2 DQ 08000000080000000h ; sign bit in both halves (same value as sh_masks)
ooln2s DQ 03FB8AA3B3FB8AA3Bh ; 1/ln2 | 1/ln2 (1.442695 in both halves)
; SINCOS specific values
pio4ht DQ 0b97daa22bf490000h ; -0.000241913 | -0.785156
mo56_42 DQ 0bcc30c31bc924925h ; -0.0238095 | -0.0178571
pio4s DQ 03f490fdb3f490fdbh ; 0.785398 | 0.785398
mo30_20 DQ 0bd4ccccdbd088889h ; -0.05 | -0.0333333
mo12_6 DQ 0be2aaaabbdaaaaabh ; -0.166667 | -0.0833333
mo2s DQ 0bf000000bf000000h ; -0.5 | -0.5
iones DQ 00000000100000001h ; 1 | 1
;******************************************************************************
; Scalar (single float) data
;******************************************************************************
; All entries are 32-bit values; most are IEEE-754 single-precision bit
; patterns. Decoded values in the comments are approximate.
sgn DD 080000000h ; mask for sign bit
mabs DD 07FFFFFFFh ; mask for absolute value (~sgn)
mant DD 0007FFFFFh ; mask for mantissa
expo DD 07F800000h ; mask for exponent
one DD 03F800000h ; 1.0f
half DD 03F000000h ; 0.5f
two DD 040000000h ; 2.0
oob DD 000000000h ; "out of bounds" value
nan DD 07fffffffh ; "Not a number" value
; Approximation coefficients; the code that uses them is not in the visible
; part of the file.
n0 DD 040A008EFh
n1 DD 03DAA7B3Dh
d0 DD 0412008EFh
qq0 DD 0419D92C8h
qq1 DD 041E6BD60h
qq2 DD 041355DC0h
pp0 DD 0C0D21907h
pp1 DD 0C0B59883h
pp2 DD 0BF52C7EAh
bnd DD 03F133333h ; 0.575f
asp0 DD 03F6A4AA5h
asp1 DD 0BF004C2Ch
asq0 DD 040AFB829h
asq1 DD 0C0AF5123h
pio2 DD 03FC90FDBh ; 1.570796f (pi/2)
npio2 DD 0BFC90FDBh ; -1.570796f (-pi/2)
ooln2 DD 03FB8AA3Bh ; 1.442695f (1/ln2)
upper DD 042B17218h ; 88.7228f (presumably the largest exp() argument -- confirm at use site)
lower DD 0C2AEAC50h ; -87.3366f (presumably the smallest exp() argument -- confirm at use site)
ln2hi DD 03F317200h ; 0.693146f (high part of ln2)
ln2lo DD 035BFBE8Eh ; ~1.43e-6f (low part of ln2; ln2hi + ln2lo ~= ln2)
rt2 DD 03FB504F3h ; 1.414214f (sqrt(2))
edec DD 000800000h ; 1 in the lowest bit of the exponent field
bias DD 00000007Fh ; integer 127 (single-precision exponent bias)
c2 DD 03E18EFE2h
c1 DD 03E4CAF6Fh
c0 DD 03EAAAABDh
tl2e DD 04038AA3Bh ; 2.885390f (2/ln2)
maxn DD 0FF7FFFFFh ; -FLT_MAX
q1 DD 043BC00B5h
p1 DD 041E77545h
q0 DD 045E451C5h
p0 DD 0451E424Bh
mine DD 0C2FC0000h ; -126.0f
maxe DD 043000000h ; 128.0f
max DD 07F7FFFFFh ; FLT_MAX
rle10 DD 03ede5bdbh ; 1/ln10
; SINCOS specific values
fouropi DD 03fa2f983h ; 1.27324f
xmax DD 046c90fdbh ; 25735.9
.CODE
;******************************************************************************
; SINCOSMAC - sin/cos simultaneous computation
; Input: mm0 - angle in radians (only the low dword is read)
; Output: mm0 - (sin|cos): sin in the high dword, cos in the low dword
; Uses: mm0-mm7, eax, ebx, ecx, edx, esi
; (ebx and esi are overwritten and NOT restored; the caller must
; preserve them if its calling convention requires it)
; Comment: This macro simultaneously computes sin and cos of the input
; parameter, and returns the result packed in mm0 as (sin|cos).
; If fabs(x) is not below xmax, no computation is done and mm0 is
; returned with 1.0 in the low dword and 0 in the high dword.
; All labels are declared LOCAL, so the macro can be expanded more
; than once in the same label scope without duplicate-label errors.
; Ultimately, this routine needs higher precision and a more
; efficient implementation (less inter-register bank traffic).
;******************************************************************************
SINCOSMAC Macro
LOCAL x2, x5, x7, ending
movd eax,MM0 ;; eax = raw bits of x
movq MM1,MM0 ;; (mm1 is reloaded at x2 before it is read)
movd MM3,[mabs]
mov ebx,eax
mov edx,eax
pand MM0,MM3 ;; m0 = fabs(x)
and ebx,080000000h ;; get sign bit
shr edx,01fh ;; edx = sign of x as 0/1
xor eax,ebx ;; eax = bits of fabs(x) (sign bit cleared)
cmp eax,[xmax] ;; both patterns are non-negative, so the integer compare orders them as floats
movd MM2,[fouropi]
jl short x2
movd MM0,[one] ;; out of range: return (0 | 1.0)
jmp ending
Align 16
x2:
movq MM1,MM0 ;; mm1 = fabs(x)
pfmul MM0,MM2 ;; mm0 = fabs(x) * 4 / PI
movq MM3,[pio4ht]
pf2id MM0,MM0 ;; mm0.lo = n = (int)(fabs(x) * 4 / PI)
movq MM7,[mo56_42]
movd ecx,MM0 ;; ecx = n
pi2fd MM0,MM0 ;; mm0.lo = (float)n
mov esi,ecx
movq MM6,[mo30_20]
punpckldq MM0,MM0 ;; mm0 = (n | n)
movq MM5,[ones]
pfmul MM0,MM3 ;; mm0 = n * pio4ht (two-part -PI/4)
movq MM3,[pio4s]
pfadd MM1,MM0 ;; mm1.lo = fabs(x) + n * (low half of pio4ht)
shr esi,2 ;; esi = n >> 2
punpckhdq MM0,MM0 ;; broadcast n * (high half of pio4ht)
xor edx,esi ;; fold n >> 2 into the sign accumulator
pfadd MM1,MM0 ;; mm1.lo = r, the reduced argument
test ecx,1
punpckldq MM1,MM1 ;; mm1 = (r | r)
jz short x5
pfsubr MM1,MM3 ;; n odd: r = PI/4 - r
x5: movq MM2,MM5
shl edx,01fh ;; move accumulated sign into bit 31
punpckldq MM2,MM1 ;; mm2 = (r | 1.0)
pfmul MM1,MM1 ;; mm1 = (r*r | r*r)
mov esi,ecx
movq MM4,[mo12_6]
shr esi,1 ;; esi = n >> 1
pfmul MM7,MM1 ;; nested polynomial in r*r, evaluated in both halves at once
xor ecx,esi ;; bit 0 of ecx = bit0(n) xor bit1(n): selects the swap at x7
pfmul MM6,MM1
shl esi,01fh ;; bit 1 of n moved into bit 31
pfadd MM7,MM5
xor ebx,esi ;; ebx = sign(x) xor (bit 1 of n), in bit 31
pfmul MM4,MM1
pfmul MM7,MM6
movq MM6,[mo2s]
pfadd MM7,MM5
pfmul MM6,MM1 ;; mm6 = (-r*r/2 | -r*r/2)
pfmul MM4,MM7
movd MM0,edx
pfadd MM4,MM5
punpckldq MM6,MM5 ;; mm6 = (1.0 | -r*r/2)
psrlq MM5,32 ;; mm5 = (0 | 1.0)
pfmul MM4,MM6
punpckldq MM0,MM0 ;; sign word from edx in both halves
movd MM1,ebx ;; sign word from ebx in the low half only
pfadd MM4,MM5
test ecx,1
pfmul MM4,MM2 ;; multiply the high half by r, the low half by 1.0
jz short x7
punpckldq MM5,MM4 ;; these two unpacks swap the halves of mm4
punpckhdq MM4,MM5
x7: pxor MM4,MM1 ;; apply the sign bits
pxor MM0,MM4
ending:
EndM
;******************************************************************************
;******************************************************************************
;_TEXT Segment Public USE32 PAGE 'CODE'
;******************************************************************************
; Routine: a_atan
; Input: mm0.lo
; Result: mm0.lo
; Uses: mm0-mm7
; Comment:
; Compute atan(x) using MMX and 3DNow! instructions. Scalar version.
;
; If the input has an exponent of 0xFF, the result of this routine
; is undefined. Inputs with an exponent of 0 are treated as true
; zeroes and return a function value of 0. Result can not overflow.
;
; atan(x) = sign(x)*atan(abs(x)). If x > 1, atan(x) = pi/2-atan(1/x)
; atan(x) for -1 <= x <= 1 is approximated by a rational minimax
; approximation.
;
; Testing shows that this function has an error of less than 2.27
; single precision ulps
;
; input mm0.low argument x
; output mm0.low result atan(x)
; destroys mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
;******************************************************************************
Align 16
;Public _a_atan
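; --- code-viewer residue (not part of the source): keyboard shortcuts ---
; Copy code
; Ctrl + C
; Search code
; Ctrl + F
; Full-screen mode
; F11
; Toggle theme
; Ctrl + Shift + D
; Show shortcuts
; ?
; Increase font size
; Ctrl + =
; Decrease font size
; Ctrl + -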