📄 p2pyuv.asm
字号:
movd esi, mm4 ; esi = dU
;; unpack and pack
movd edx, mm0 ; edx = 00v100u1
;; Store the U data
mov [esi+ecx*2], dl ; store 1 byte of U data.
movd esi, mm5 ; esi = dV
;; Store the V data
shr edx, 16 ; edx = 000000v1
mov [esi+ecx*2], dl ; store 1 byte of V data.
movd esi, mm3
nextline1:
;; Add pitches for next line
mov ecx, var(parms.src_pitch)
shl ecx, 1 ; ecx = src_pitch*2
add edi, ecx ; s1 = s1 + src_pitch*2
add esi, ecx ; s2 = s2 + src_pitch*2
mov ecx, var(parms.dyPitch)
shl ecx, 1 ; ecx = dyPitch*2
add ebx, ecx ; d1 = d1+ dyPitch*2
add ebp, ecx ; d2 = d2+ dyPitch*2
mov ecx, var(parms.dvPitch) ; ecx = dvPitch
movd mm3, ecx
paddd mm4, mm3 ; dv = dv+ dvPitch
mov ecx, var(parms.duPitch) ; ecx =
movd mm3, ecx
xor ecx, ecx
paddd mm5, mm3 ; du = du+ duPitch
;; inc DY counter and loop
inc eax
cmp eax, var(parms.tmp2)
jne NEAR DYLOOP1
end1:
;; Free up stack temp var.
add esp, numtemps*4
;; Pop off the stack....
pop edx
pop ecx
pop ebp
pop esi
pop edi
pop ebx
emms
;; success
xor eax, eax
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Our YUY2 to Planar YUV MMX
;;
;; Splits packed YUY2 data into separate Y, U and V planes.
;; Two source lines (s1, s2) are handled per pass of the outer loop:
;;   - the luma of each line is copied to its own destination line
;;     (d1, d2);
;;   - the chroma of the two lines is averaged, giving one U byte and
;;     one V byte per macro pixel (2 luma samples) in du / dv.
;;
;; All parameters and temporaries are accessed through var(parms.*);
;; that macro, the parms layout, numtemps and MaskLuma are defined
;; outside this section.
;;
;;   Read:  s1, s2, d1, d2, du, dv,
;;          src_pitch, dyPitch, duPitch, dvPitch, dest_dx, dest_dy
;;   Temps: tmp1 = dest_dx/4   (column iterations, 2 macro pixels each)
;;          tmp2 = dest_dy/2   (line-pair iterations)
;;          tmp3 = 1 if one macro pixel is left over per line, else 0
;;
;; Register roles inside the loops:
;;   eax = line-pair counter      ecx = column counter
;;   edi = s1    esi = s2         ebx = d1    ebp = d2
;;   mm4 = du    mm5 = dv         mm6 = luma mask    mm7 = zero
;;   edx, mm0-mm3 = scratch
;;
;; dest_dx is expected to be even (see the note before the left-over
;; macro pixel code).
;;
;; Returns eax = 0. The general purpose registers pushed on entry are
;; restored and the MMX state is cleared with emms before returning.
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
YUY2ToPlanarYUV_MMX:
        ;; Save some stuff...
        push    ebx
        push    edi
        push    esi
        push    ebp
        push    ecx
        push    edx

        ;; Make room for temps on stack
        sub     esp, numtemps*4

        ;; Load our chroma pointers.
        movd    mm4, var(parms.du)          ; mm4 = du
        movd    mm5, var(parms.dv)          ; mm5 = dv

        ;; Set up the loops
        mov     eax, var(parms.dest_dy)
        mov     ecx, var(parms.dest_dx)
        shr     eax, 1                      ; eax = dest_dy/2
        shr     ecx, 2                      ; ecx = dest_dx/4, CF = bit 1 of dest_dx
        jnc     even                        ; Was dest_dx divisible by 4?
odd:
        ;; We have 1 macro pixel left over on every line.
        mov     DWORD var(parms.tmp3), 1
        jmp     cont
even:
        mov     DWORD var(parms.tmp3), 0    ; No left-over macro pixel.
cont:
        mov     var(parms.tmp1), ecx        ; save dx loop count
        mov     var(parms.tmp2), eax        ; save dy loop count

        ;; The line loop below tests its counter only after the first
        ;; pass, so a zero count must be rejected here or the loop
        ;; would run until eax wraps around.
        test    eax, eax
        jz      NEAR end

        xor     ecx, ecx                    ; column counter = 0
        mov     edi, var(parms.s1)          ; s1
        mov     esi, var(parms.s2)          ; s2
        mov     ebx, var(parms.d1)          ; d1
        mov     ebp, var(parms.d2)          ; d2
        pxor    mm7, mm7                    ; mm7 = 00000000 00000000
        movq    mm6, [MaskLuma]             ; mm6 = mask that keeps the luma bytes
        xor     eax, eax                    ; line-pair counter = 0

DYLOOP:
        ;; ecx is 0 here. When a line holds fewer than 2 macro pixels
        ;; there is no full column iteration to run; go straight to
        ;; the left-over handling (same wrap-around hazard as above).
        cmp     DWORD var(parms.tmp1), 0
        je      NEAR YUY2ToPlanarYUV_MMX_rowtail

DXLOOP:
        ;; Process 2 macro pixels at a time, 2 lines at a time.
        movq    mm0, [edi+ecx*8]            ; mm0 = v2y4u2y3 v1y2u1y1 of s1
        movq    mm1, [esi+ecx*8]            ; mm1 = v2y4u2y3 v1y2u1y1 of s2

        ;; Store luma values in planar YUV space
        movq    mm2, mm0
        movq    mm3, mm1
        pand    mm2, mm6                    ; mm2 = 00Y400Y3 00Y200Y1 of s1
        pand    mm3, mm6                    ; mm3 = 00Y400Y3 00Y200Y1 of s2
        packuswb mm2, mm7                   ; mm2 = 00000000 Y4Y3Y2Y1 of s1
        packuswb mm3, mm7                   ; mm3 = 00000000 Y4Y3Y2Y1 of s2
        movd    [ebx+ecx*4], mm2            ; d1 = luma of s1
        psrlw   mm0, 8                      ; mm0 = 00v200u2 00v100u1 of s1
        movd    [ebp+ecx*4], mm3            ; d2 = luma of s2

        ;; Compute averaged chroma values
        psrlw   mm1, 8                      ; mm1 = 00v200u2 00v100u1 of s2
        paddw   mm0, mm1                    ; mm0 = per-word sums s1+s2
        psrlw   mm0, 1                      ; mm0 = 00v200u2 00v100u1 (s1+s2)/2

        ;;
        ;; Store chromas in du and dv.
        ;;
        movd    mm3, esi                    ; park s2 in mm3, esi is needed as pointer

        ;; Regroup the bytes so that U and V each form a contiguous pair
        packuswb mm0, mm0                   ; mm0 = v2u2v1u1 v2u2v1u1
        punpcklbw mm0, mm0                  ; mm0 = v2v2u2u2 v1v1u1u1
        movd    esi, mm4                    ; esi = du
        movq    mm1, mm0
        psrlq   mm1, 32                     ; mm1 = 00000000 v2v2u2u2
        punpcklbw mm0, mm1                  ; mm0 = v2v1v2v1 u2u1u2u1

        ;; Store the U data
        movd    edx, mm0                    ; edx = u2u1u2u1
        psrlq   mm0, 32                     ; mm0 = 00000000 v2v1v2v1
        mov     WORD [esi+ecx*2], dx        ; store 2 bytes of U data.

        ;; Store the V data
        movd    esi, mm5                    ; esi = dv
        movd    edx, mm0                    ; edx = v2v1v2v1
        mov     WORD [esi+ecx*2], dx        ; store 2 bytes of V data.
        movd    esi, mm3                    ; esi = s2 again

        ;; inc DX counter and loop
        inc     ecx
        cmp     ecx, var(parms.tmp1)
        jne     DXLOOP

YUY2ToPlanarYUV_MMX_rowtail:
        ;; Now we have to check for any pixels left over if dest_dx
        ;; is not divisible by 4. Since dest_dx must be at least even
        ;; we can only have 0 or two (1 macro pixel).
        ;;
        ;; The flag is compared in memory on purpose: mov leaves the
        ;; flags untouched, so a load followed by jz would test stale
        ;; flags, and ecx must keep its value (== tmp1) because it is
        ;; the column index of the left-over macro pixel.
        cmp     DWORD var(parms.tmp3), 0
        je      nextline

        ;;
        ;; Do odd macro pixel here. ===========
        ;;
        movd    mm0, [edi+ecx*8]            ; mm0 = 00000000 v1y2u1y1 of s1
        movd    mm1, [esi+ecx*8]            ; mm1 = 00000000 v1y2u1y1 of s2

        ;; Store luma values in planar YUV space
        movq    mm2, mm0
        movq    mm3, mm1
        pand    mm2, mm6                    ; mm2 = 00000000 00Y200Y1 of s1
        pand    mm3, mm6                    ; mm3 = 00000000 00Y200Y1 of s2
        packuswb mm2, mm7                   ; mm2 = 00000000 0000Y2Y1 of s1
        packuswb mm3, mm7                   ; mm3 = 00000000 0000Y2Y1 of s2
        movd    edx, mm2                    ; grab lower 32 bits of mm2
        mov     WORD [ebx+ecx*4], dx        ; store just the 2 luma bytes of s1
        movd    edx, mm3                    ; grab lower 32 bits of mm3
        mov     WORD [ebp+ecx*4], dx        ; store just the 2 luma bytes of s2

        ;; Compute averaged chroma values
        psrlw   mm0, 8                      ; mm0 = 00000000 00v100u1 of s1
        psrlw   mm1, 8                      ; mm1 = 00000000 00v100u1 of s2
        paddw   mm0, mm1                    ; mm0 = per-word sums s1+s2
        psrlw   mm0, 1                      ; mm0 = 00000000 00v100u1 (s1+s2)/2

        ;;
        ;; Store chromas in du and dv.
        ;;
        movd    mm3, esi                    ; park s2 in mm3
        movd    esi, mm4                    ; esi = du
        movd    edx, mm0                    ; edx = 00v100u1

        ;; Store the U data
        mov     [esi+ecx*2], dl             ; store 1 byte of U data.
        movd    esi, mm5                    ; esi = dv

        ;; Store the V data
        shr     edx, 16                     ; edx = 000000v1
        mov     [esi+ecx*2], dl             ; store 1 byte of V data.
        movd    esi, mm3                    ; esi = s2 again

nextline:
        ;; Add pitches for next line pair
        mov     ecx, var(parms.src_pitch)
        shl     ecx, 1                      ; ecx = src_pitch*2
        add     edi, ecx                    ; s1 = s1 + src_pitch*2
        add     esi, ecx                    ; s2 = s2 + src_pitch*2

        mov     ecx, var(parms.dyPitch)
        shl     ecx, 1                      ; ecx = dyPitch*2
        add     ebx, ecx                    ; d1 = d1 + dyPitch*2
        add     ebp, ecx                    ; d2 = d2 + dyPitch*2

        ;; One chroma line per line pair. mm4 holds du and mm5 holds
        ;; dv, so each pointer advances by its own plane's pitch.
        mov     ecx, var(parms.duPitch)     ; ecx = duPitch
        movd    mm3, ecx
        paddd   mm4, mm3                    ; du = du + duPitch
        mov     ecx, var(parms.dvPitch)     ; ecx = dvPitch
        movd    mm3, ecx
        xor     ecx, ecx                    ; column counter = 0 for the next pass
        paddd   mm5, mm3                    ; dv = dv + dvPitch

        ;; inc DY counter and loop
        inc     eax
        cmp     eax, var(parms.tmp2)
        jne     NEAR DYLOOP

end:
        ;; Free up stack temp var.
        add     esp, numtemps*4

        ;; Pop off the stack....
        pop     edx
        pop     ecx
        pop     ebp
        pop     esi
        pop     edi
        pop     ebx

        emms                                ; leave the FPU usable for the caller

        ;; success
        xor     eax, eax
        ret
;;; leave a trace: NUL-terminated copyright/revision string emitted at this point of the assembled output
version: db '$(gfw) Copyright 2001 RealNetworks Inc. Revision:1.0 $',0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -