📄 sseroutines.asm
字号:
; Rabbit - 78.8
; Mandel - 90.5
format MS COFF
include '%fasminc%\win32a.inc'
section '.data' data readable writeable align 16
align 16
radius dd 4.0, 4.0, 4.0, 4.0
radiusd dq 4.0, 4.0
px1 dd 4 dup ?
py1 dd 4 dup ?
dx1 dd 4 dup ?
dy1 dd 4 dup ?
left1 dd 4 dup ?
maskbw dd 0x808080, 0x808080, 0x808080, 0x808080
mask dd 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
mask1 dd 0, 0, 0, 0xFFFFFFFF
mask2 dd 0, 0, 0xFFFFFFFF, 0xFFFFFFFF
useMovAdd dd 1
ITER = 64 ; The number of iterations
Julia = 1
Mandel = 0
section '.text' code readable executable
macro defineinstr instr {
macro instr [args] \{
\common
if ssewidth = 4
instr#S args
else
instr#D args
end if
\}
}
defineinstr MOVAP
defineinstr XORP
defineinstr ADDP
defineinstr MULP
defineinstr SUBP
defineinstr ANDP
defineinstr MOVMSKP
defineinstr ANDNP
defineinstr CMPLEP
macro copyscr reg, from {
if ssewidth = 4
MOVSS reg, dword[from]
SHUFPS reg, reg, 0
else
MOVSD reg, qword[from]
SHUFPD reg, reg, 0
end if
}
macro mycvt a, x {
if ssewidth = 4
CVTTSS2SI eax, x
else
CVTTSD2SI eax, x
end if
mov eax, [esi + eax]
mov a, eax
}
macro ConvertAndWrite color {
if color
if ssewidth = 4
mycvt [edi], xmm6
SHUFPS xmm6, xmm6, 0E5h
mycvt [edi+4], xmm6
SHUFPS xmm6, xmm6, 0E6h
mycvt [edi+8], xmm6
SHUFPS xmm6, xmm6, 0E7h
mycvt [edi+12], xmm6
else if ssewidth = 2
mycvt [edi], xmm6
SHUFPD xmm6, xmm6, 1
mycvt [edi+4], xmm6
end if
else if ~color
ANDNP xmm2, dqword[maskbw]
if ssewidth = 4
MOVAPS [edi], xmm2
else
SHUFPS xmm2, xmm2, 08h
MOVLPS [edi], xmm2
end if
end if
}
macro JuliaMandelPaintMovAdd color, type {
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
; xmm0 = zx xmm1 = zy xmm2 = tmp xmm3 = tmp xmm4 = zx2 xmm5 = zy2 xmm6 = result xmm7 = 4.0
; eax = tmp ebx = x ecx = i counter edx = y edi = bits pointer esi = color table
YLOOP:
MOVAP xmm4, dqword[left1]
BSTART:
mov ebx, [w]
XLOOP:
MOVAP xmm0,xmm4
XORP xmm6,xmm6
MOVAP xmm1,xmm5
mov ecx, ITER
ILOOP:
; xmm0 = zx xmm1 = zy
MOVAP xmm2, xmm0 ; 0 - 6
MULP xmm0, xmm0 ; 0 - 6
MOVAP xmm3, xmm1 ; 1 - 7
ADDP xmm1, xmm1 ; 1 - 5
MULP xmm1, xmm2 ; 6 - 12
MOVAP xmm2, xmm0 ; 7 - 13
MULP xmm3, xmm3 ; 8 - 14
; xmm0 = zx^2 xmm1 = 2 * zy xmm2 = zx xmm3 = zy^2
if type = Julia
ADDP xmm1, dqword[py1]
else if type = Mandel
ADDP xmm1, xmm5 ; 12 - 16
end if
SUBP xmm0, xmm3 ; 14 - 18
ADDP xmm2, xmm3 ; 16 - 20
; xmm0 = zx^2 - zy^2 xmm1 = 2*zx*zy xmm2 = zx^2 + zy^2 xmm3 = zy^2
CMPLEP xmm2, xmm7 ; 20 - 24
if type = Julia
ADDP xmm0, dqword[px1]
else if type = Mandel
ADDP xmm0, xmm4 ; 18 - 22
end if
MOVMSKP eax, xmm2 ; 24 - 30
test eax, eax
jz EXIT
if color
ANDP xmm2, xmm7 ; 25 - 27 ; xmm6 += (xmm2 < radius) ? 4.0 : 0.0;
ADDP xmm6, xmm2 ; 28 - 32
end if
sub ecx, 1
jnz ILOOP
EXIT:
ConvertAndWrite color
add edi, ssewidth * 4
ADDP xmm4, dqword[dx1]
sub ebx, ssewidth
ja XLOOP
ADDP xmm5, dqword[dy1]
add edi, [piw]
sub edx, 1
jnz YLOOP
pop edi esi ebx
ret
}
macro JuliaMandelPaintVod color, type {
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
; xmm0 = zx xmm1 = zy xmm2 = tmp xmm3 = tmp xmm4 = zx2 xmm5 = zy2 xmm6 = result xmm7 = 4.0
; eax = tmp ebx = x ecx = i counter edx = y edi = bits pointer esi = color table
YLOOP:
MOVAP xmm4, dqword[left1]
BSTART:
mov ebx, [w]
XLOOP:
MOVAP xmm0,xmm4
XORP xmm6,xmm6
MOVAP xmm1,xmm5
mov ecx, ITER
ILOOP:
; xmm0 = zx xmm1 = zy
MOVAP xmm2, xmm1 ; 0 - 6 mov
MULP xmm2, xmm2 ; 6 - 12 fp:mul
MULP xmm1, xmm0 ; 0 - 6 fp:mul
MOVAP xmm3, xmm2 ; 13 - 19 mov
MULP xmm0, xmm0 ; 2 - 8 fp:mul
ADDP xmm2, xmm0 ; 12 - 16 fp:add
CMPLEP xmm2, xmm7 ; 16 - 20 fp:add
ADDP xmm1, xmm1 ; 6 - 10 fp:add
SUBP xmm0, xmm3 ; 19 - 23 fp:add
if type = Julia
ADDP xmm1, dqword[py1] ; 10 - 14 fp:add
ADDP xmm0, dqword[px1] ; 23 - 27 fp:add
else if type = Mandel
ADDP xmm1, xmm5
ADDP xmm0, xmm4
end if
MOVMSKP eax, xmm2 ; 20 - 26 fp
test eax, eax ; 26 - 27 alu0/1
jz EXIT ; 26 - 27 alu0/1
if color
ANDP xmm2, xmm7 ; 21 - 23 mmx:alu
ADDP xmm6, xmm2 ; 24 - 28 fp:add
end if
sub ecx, 1
jnz ILOOP
EXIT:
ConvertAndWrite color
add edi, ssewidth * 4
ADDP xmm4, dqword[dx1]
sub ebx, ssewidth
ja XLOOP
ADDP xmm5, dqword[dy1]
add edi, [piw]
sub edx, 1
jnz YLOOP
pop edi esi ebx
ret
}
macro SelectTypeAndColor method {
local JULIABW, MANDEL, MANDELBW
test eax, eax
jz MANDEL ; Mandel = 0, Julia = 1
; Julia
copyscr xmm2, px
copyscr xmm3, py
MOVAP dqword[px1], xmm2
MOVAP dqword[py1], xmm3
test ebx, ebx
jnz JULIABW
method 1, Julia
JULIABW:
method 0, Julia
MANDEL:
test ebx, ebx
jnz MANDELBW
method 1, Mandel
MANDELBW:
method 0, Mandel
}
macro SelectMethod {
local VOD
mov ecx, [useMovAdd]
mov eax, [type]
mov ebx, [bw]
test ecx, ecx
jz VOD
SelectTypeAndColor JuliaMandelPaintMovAdd
VOD:
SelectTypeAndColor JuliaMandelPaintVod
}
proc PaintJuliaMandelSSE c w,h,dx2,dy2,bits,a,LEFT,TOP,px,py,piw,type,bw
ssewidth = 4 ; calculating four pixels at the same time
push ebx esi edi
mov edx, [h]
mov esi, [a]
mov edi, [bits]
copyscr xmm2, dx2
copyscr xmm3, dy2
MOVAPS dqword[dy1], xmm3
copyscr xmm4, LEFT
copyscr xmm5, TOP
MOVAPS xmm0, xmm2 ; xmm2 = 0 | dx2 | dx2 * 2 | dx2 * 3
ANDPS xmm0, dqword[mask1]
MOVAPS xmm1, xmm2
ANDPS xmm1, dqword[mask2]
ADDPS xmm0, xmm1
ADDPS xmm0, xmm2
ANDPS xmm0, dqword[mask]
ADDPS xmm4, xmm0
MOVAPS dqword[left1], xmm4
ADDPS xmm2, xmm2
ADDPS xmm2, xmm2
MOVAPS dqword[dx1], xmm2
MOVAPS xmm7, dqword[radius]
SelectMethod
endp
proc PaintJuliaMandelSSE2 c w,h,dx2:qword,dy2:qword,bits,a,LEFT:qword,TOP:qword,px:qword,py:qword,piw,type,bw
ssewidth = 2 ; two pixels at the same time
push ebx esi edi
mov edx, [h]
mov esi, [a]
mov edi, [bits]
copyscr xmm3, dy2
XORPD xmm2, xmm2
MOVAPD dqword[dy1], xmm3
MOVHPD xmm2, [dx2]
copyscr xmm4, LEFT
copyscr xmm5, TOP
ADDPD xmm4, xmm2
MOVAPD dqword[left1], xmm4
ADDPD xmm2, xmm2
SHUFPD xmm2, xmm2, 3
MOVAPD dqword[dx1], xmm2
MOVAPD xmm7, dqword[radiusd]
SelectMethod
endp
IsSSE:
pushfd
pop eax
mov edx, eax
xor eax, 00200000h
push eax
popfd
pushfd
pop eax
xor eax, edx
jz nosse1 ; 386 or 486, can't use CPUID
xor eax, eax
inc eax
cpuid
xor eax, eax
test edx, 02000000h
jz nosse1 ; SSE not supported
push SEHhandler ; try to execute an SSE instruction
push dword[FS:0]
mov [FS:0], esp
XORPS xmm0, xmm0
inc eax
nosse:
pop dword[FS:0]
pop edx
nosse1:
ret
IsSSE2:
xor eax, eax
inc eax
cpuid
xor eax, eax
test edx, 04000000h
jz nosse2 ; SSE2 not supported
inc eax
nosse2:
ret
IsMovAdd:
xor eax, eax
cpuid ; chech if Intel Family 6 => then use Vodnaya
xor eax, eax
inc eax
cmp ebx, 'Genu'
jne noVod
cmp edx, 'ineI'
jne noVod
cmp ecx, 'ntel'
jne noVod
cpuid
and eax, 0xF00
sub eax, 0x600
mov [useMovAdd], eax
noVod:
ret
virtual at eax
EXCEPTION_RECORD:
.ExceptionCode dd ?
.ExceptionFlag dd ?
.NestedExceptionRecord dd ?
.ExceptionAddress dd ?
.NumberParameters dd ?
.AdditionalData dd ?
end virtual
virtual at eax
CONTEXT:
.ContextFlags dd ?
;DEBUG REGISTERS
.Dr dd 6 dup ?
;FLOATING POINT
.ControlWord dd ?
.StatusWord dd ?
.TagWord dd ?
.ErrorOffset dd ?
.ErrorSelector dd ?
.DataOffset dd ?
.DataSelector dd ?
.FPURegs db 80 dup ?
.Cr0NpxState dd ?
;SEGMENT REGISTERS
.SegGs dd ?
.SegFs dd ?
.SegEs dd ?
.SegDs dd ?
;GENERAL-PURPOSE REGISTERS
.GPRedi dd ?
.GPResi dd ?
.GPRebx dd ?
.GPRedx dd ?
.GPRecx dd ?
.GPReax dd ?
.GPRebp dd ?
.GPReip dd ?
.SegCs dd ?
.GRPflags dd ?
.GRPesp dd ?
.SegSS dd ?
end virtual
SEHhandler:
mov eax, [esp+04] ; get EXCEPTION_RECORD structure address
cmp [EXCEPTION_RECORD.ExceptionCode], STATUS_ILLEGAL_INSTRUCTION
jnz nexthandler
mov eax, [EXCEPTION_RECORD.ExceptionFlag]
test eax, eax
jnz nexthandler
mov eax, [esp+12] ; get CONTEXT structure address
mov [CONTEXT.GPReip], nosse
xor eax, eax
ret
nexthandler: ; allow system handler to show error message
mov eax, 1
ret
public PaintJuliaMandelSSE as '_PaintJuliaMandelSSE'
public PaintJuliaMandelSSE2 as '_PaintJuliaMandelSSE2'
public IsSSE as '_IsSSE'
public IsSSE2 as '_IsSSE2'
public IsMovAdd as '_IsMovAdd'
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -