📄 frame_sse2.asm
字号:
; void __stdcall yuv422_to_yuy2_sse2(
;     [esp + 4] = FRAME *top,
;     [esp + 8] = FRAME *bottom,
;     [esp +12] = unsigned char *out,
;     [esp +16] = CONVERSION_PARAMETER *prm
; )
;
; Packs planar Y/U/V rows into interleaved YUY2 (Y0 U Y1 V ...), 16 pixels
; (32 output bytes) per inner-loop iteration.  Output rows are taken
; alternately from `top` and `bottom`; the `bottom` pointers start one
; in_step further down, and every source pointer advances by in_step*2
; after it has been used.
;
; Fields read (offsets as used by the code below):
;     FRAME:                 +16 Y pointer, +20 U pointer, +24 V pointer
;     CONVERSION_PARAMETER:  +0 width, +4 height, +8 in_step,
;                            +12 out_step, +16 c_offset
;
; Notes:
;   * Rows with width < 16 are handled by the tail path only.
;   * The tail path copies (width%16)*2 bytes, i.e. 2 bytes per pixel.
;   * height <= 0 returns without touching `out`.
;   * The tail path still loads a full 16-pixel group from the source
;     planes before truncating the output, so the source rows must be
;     readable up to the next multiple of 16 pixels.
;   * MMX registers are used as scratch; EMMS is executed before return.
;   * All general-purpose registers are preserved.
_yuv422_to_yuy2_sse2@16 PROC
;
;-------------------------------------------------------------------
; Local variables
;
;   [esp+ 4] work (32 bytes, 16-byte aligned)
;   [esp+36] out (start of the current output row)
;   [esp+40] width/16
;   [esp+44] width%16*2 (bytes written by the tail path)
;   [esp+48] in_step*2
;   [esp+52] out_step
;
;   total 88 + alignment padding
;-------------------------------------------------------------------
; Save registers
        push    ebp
        push    edi
        push    esi
        push    edx
        push    ecx
        push    ebx
        push    eax
;-------------------------------------------------------------------
; Remember the stack address in the base pointer
        mov     ebp, esp
;-------------------------------------------------------------------
; Reserve the local variable area; esp+4 ends up 16-byte aligned so
; that the work buffer can be written with movdqa
        sub     esp, 88
        and     esp, 0fffffff0h
        sub     esp, 4
;-------------------------------------------------------------------
; Build the loop parameters
        mov     eax, [ebp+28+4]         ; top
        mov     ebx, [ebp+28+8]         ; bottom
        mov     edi, [ebp+28+12]        ; out
        mov     edx, [ebp+28+16]        ; prm
        movd    mm0, [eax+16]           ; top Y
        movq    mm1, [eax+20]           ; top U (low) / V (high)
        movd    mm3, [ebx+16]           ; bottom Y
        movq    mm4, [ebx+20]           ; bottom U (low) / V (high)
        mov     eax, [edx]              ; width
        movd    mm6, [edx+8]            ; in_step
        mov     ebx, [edx+12]           ; out_step
        movd    mm7, [edx+16]           ; c_offset
        mov     edx, [edx+4]            ; height
        mov     ecx, eax
        mov     [esp+52], ebx           ; out_step
        and     eax, 0fh                ; width%16
        shr     ecx, 4                  ; width/16
        punpckldq mm6, mm6
        punpckldq mm7, mm7
        paddd   mm3, mm6                ; bottom y + in_step
        paddd   mm4, mm6                ; bottom uv + in_step
        paddd   mm1, mm7                ; top uv + c_offset
        shl     eax, 1                  ; width%16*2 (YUY2 = 2 bytes/pixel)
        paddd   mm4, mm7                ; bottom uv + in_step + c_offset
        pslld   mm6, 1                  ; in_step * 2
        movq    mm2, mm1
        movq    mm5, mm4
        psrlq   mm2, 32                 ; top V
        psrlq   mm5, 32                 ; bottom V
        mov     [esp+36], edi
        mov     [esp+40], ecx
        mov     [esp+44], eax
        movd    [esp+48], mm6
        movd    esi, mm0                ; esi = Y row
        movd    eax, mm1                ; eax = U row
        movd    ebx, mm2                ; ebx = V row
        test    edx, edx
        jle     yuv422_to_yuy2_sse2_exit ; nothing to do for height <= 0
;-------------------------------------------------------------------
; Vertical loop
; Invariant: mm0/mm1/mm2 hold the Y/U/V row start of the line being
; written, mm3/mm4/mm5 those of the line that follows it.
yuv422_to_yuy2_next_line:
        test    ecx, ecx
        jz      yuv422_to_yuy2_sse2_tail ; fewer than 16 pixels in the row
;-------------------------------------------------------------------
; Horizontal loop
yuv422_to_yuy2_next_16_pixel:
;-------------------------------------------------------------------
; Conversion core: 16 Y + 8 U + 8 V -> 32 bytes of YUY2
        movq    xmm0, [esi]
        movq    xmm1, [esi+8]
        movd    xmm4, [eax]
        movd    xmm5, [eax+4]
        movd    xmm6, [ebx]
        movd    xmm7, [ebx+4]
        lea     esi, [esi+16]
        lea     eax, [eax+8]
        lea     ebx, [ebx+8]
        punpcklbw xmm4, xmm6            ; U0 V0 U1 V1 ...
        punpcklbw xmm5, xmm7
        punpcklbw xmm0, xmm4            ; Y0 U0 Y1 V0 ...
        punpcklbw xmm1, xmm5
        movdqu  [edi], xmm0
        movdqu  [edi+16], xmm1
        lea     edi, [edi+32]
;-------------------------------------------------------------------
; Horizontal loop end check
        dec     ecx
        jnz     yuv422_to_yuy2_next_16_pixel
;-------------------------------------------------------------------
; Remainder handling
yuv422_to_yuy2_sse2_tail:
        mov     ecx, [esp+44]           ; bytes left in this row
        test    ecx, ecx
        jz      yuv422_to_yuy2_line_end
; Remainder core: convert one more group into the work buffer and copy
; only the bytes that belong to the row
        movq    xmm0, [esi]
        movq    xmm1, [esi+8]
        movd    xmm4, [eax]
        movd    xmm5, [eax+4]
        movd    xmm6, [ebx]
        movd    xmm7, [ebx+4]
        punpcklbw xmm4, xmm6
        punpcklbw xmm5, xmm7
        punpcklbw xmm0, xmm4
        punpcklbw xmm1, xmm5
        movdqa  [esp+4], xmm0
        movdqa  [esp+20], xmm1
        lea     esi, [esp+4]
        rep movsb
;-------------------------------------------------------------------
; Vertical loop end check
yuv422_to_yuy2_line_end:
        mov     ecx, [esp+48]           ; in_step*2
        movd    esi, mm0
        movd    eax, mm1
        movd    ebx, mm2
        add     esi, ecx                ; advance the line just written
        add     eax, ecx
        add     ebx, ecx
        mov     ecx, [esp+52]           ; out_step
        mov     edi, [esp+36]
        movq    mm0, mm3                ; the following line becomes current
        movq    mm1, mm4
        movq    mm2, mm5
        movd    mm3, esi                ; advanced pointers become "following"
        movd    mm4, eax
        movd    mm5, ebx
        movd    esi, mm0
        movd    eax, mm1
        movd    ebx, mm2
        add     edi, ecx                ; next output row
        mov     ecx, [esp+40]           ; width/16
        mov     [esp+36], edi
        dec     edx
        jnz     yuv422_to_yuy2_next_line
;-------------------------------------------------------------------
; Cleanup
yuv422_to_yuy2_sse2_exit:
        emms                            ; MMX was used: restore x87 tag state
        mov     esp, ebp
        pop     eax
        pop     ebx
        pop     ecx
        pop     edx
        pop     esi
        pop     edi
        pop     ebp
        ret     16
;
_yuv422_to_yuy2_sse2@16 ENDP
;-------------------------------------------------------------------
; 廔椆
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; yuy2_convert_sse2 - YUY2 僨乕僞偺曄姺峴楍曄峏
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;-------------------------------------------------------------------
PUBLIC C _yuy2_convert_sse2@16
; void __stdcall yuy2_convert_sse2(
;     [esp + 4] = unsigned char *yuy2
;     [esp + 8] = int step
;     [esp +12] = int height
;     [esp +16] = YUY2_CONVERSION_PARAMETER *prm
; )
;
; Rewrites a YUY2 buffer in place, abs(step) bytes per row for `height`
; rows, moving from row to row by the signed `step`.
;
; prm is read as six 32-bit coefficients (three pairs at +0, +8, +16).
; Each is shifted right by 2 and saturated to 16 bits.  With u = U-128
; and v = V-128 the code below computes, per chroma sample:
;     Y' = Y   + ((u*prm[0] + v*prm[1] + 4096) >> 14)   (both pixels)
;     U' = 128 + ((u*prm[2] + v*prm[3] + 4096) >> 14)
;     V' = 128 + ((u*prm[4] + v*prm[5] + 4096) >> 14)
; with unsigned saturation to 8 bits on store.
;
; Notes:
;   * The row tail (abs(step)%16 bytes) is processed through an aligned
;     work buffer; both the chroma and the luma copy are taken from that
;     buffer.
;   * height <= 0 returns without touching the buffer.
;   * The caller's `yuy2` argument slot is reused as the row pointer.
;   * All general-purpose registers are preserved.
_yuy2_convert_sse2@16 PROC
;
;-------------------------------------------------------------------
; Local variables
;
;   [esp+16] coefficient pair for V' (u->v, v->v), 8 packed words
;   [esp+32] work[16]
;   [esp+48] abs(step)/16
;   [esp+52] p (destination of the row tail)
;-------------------------------------------------------------------
; Save registers
;
        push    ebp
        push    eax
        push    ebx
        push    ecx
        push    esi
        push    edi
;-------------------------------------------------------------------
; Remember the stack address in the base pointer
        mov     ebp, esp
;-------------------------------------------------------------------
; Reserve the local variable area (16-byte aligned)
        sub     esp, 56
        and     esp, 0fffffff0h
;-------------------------------------------------------------------
; Variable setup
        mov     esi, [ebp+24+4]         ; yuy2
        mov     ecx, [ebp+24+8]         ; step
        mov     ebx, [ebp+24+12]        ; height
        mov     eax, [ebp+24+16]        ; prm
        movq    xmm5, [eax]             ; Y' coefficient pair
        movq    xmm4, [eax+8]           ; U' coefficient pair
        movq    xmm3, [eax+16]          ; V' coefficient pair
        mov     eax, ecx                ; abs(step) phase-1
        pcmpeqw xmm2, xmm2              ; all ones
        pxor    xmm6, xmm6
        pxor    xmm7, xmm7
        sar     ecx, 31                 ; abs(step) phase-2
        pshufd  xmm3, xmm3, 01000100b   ; duplicate the pair into both halves
        pshufd  xmm4, xmm4, 01000100b
        pshufd  xmm5, xmm5, 01000100b
        psrad   xmm3, 2
        psrad   xmm4, 2
        psrad   xmm5, 2
        xor     eax, ecx                ; abs(step) phase-3
        psubw   xmm6, xmm2              ; 1 in every word
        psubd   xmm7, xmm2              ; 1 in every dword
        sub     eax, ecx                ; abs(step) phase-4
        packssdw xmm3, xmm3
        packssdw xmm4, xmm4
        packssdw xmm5, xmm5
        mov     ecx, eax                ; copy abs(step)
        psllw   xmm6, 7                 ; 128x8
        pslld   xmm7, 12                ; 4096x4
        movdqa  [esp+16], xmm3          ; V' coefficients
        shr     ecx, 4                  ; abs(step)/16
        and     eax, 15                 ; abs(step)%16
        mov     [esp+48], ecx
        test    ebx, ebx
        jle     yuy2_convert_sse2_exit  ; nothing to do for height <= 0
;-------------------------------------------------------------------
; Vertical loop
; Invariant: eax = abs(step)%16, ecx = abs(step)/16, esi = row start
yuy2_convert_sse2_v_head:
        test    ecx, ecx
        jz      yuy2_convert_sse2_h_tail
yuy2_convert_sse2_h_head:
        movdqu  xmm0, [esi]
        movdqu  xmm3, [esi]
        psrlw   xmm0, 8                 ; chroma bytes as words (U, V, U, V ...)
        psllw   xmm3, 8
        psubw   xmm0, xmm6              ; chroma - 128
        psrlw   xmm3, 8                 ; luma bytes as words
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        pmaddwd xmm0, xmm5              ; Y delta
        pmaddwd xmm1, xmm4              ; U'
        pmaddwd xmm2, [esp+16]          ; V'
        paddd   xmm0, xmm7
        paddd   xmm1, xmm7
        paddd   xmm2, xmm7
        psrad   xmm0, 14
        psrad   xmm1, 14
        psrad   xmm2, 14
        packssdw xmm0, xmm0             ; dD_dC_dB_dA_dD_dC_dB_dA
        packssdw xmm1, xmm1             ; UD_UC_UB_UA_UD_UC_UB_UA
        packssdw xmm2, xmm2             ; VD_VC_VB_VA_VD_VC_VB_VA
        punpcklwd xmm0, xmm0            ; dD_dD_dC_dC_dB_dB_dA_dA
        punpcklwd xmm1, xmm2            ; VD_UD_VC_UC_VB_UB_VA_UA
        paddw   xmm0, xmm3              ; Y + delta, 8 words
        paddw   xmm1, xmm6              ; UV+128
        packuswb xmm0, xmm0             ; 8 Y' bytes (duplicated)
        packuswb xmm1, xmm1             ; 8 U'/V' bytes (duplicated)
        punpcklbw xmm0, xmm1            ; Y U Y V ...
        movdqu  [esi], xmm0
        add     esi, 16
        dec     ecx
        jnz     yuy2_convert_sse2_h_head
yuy2_convert_sse2_h_tail:
        test    eax, eax
        jz      yuy2_convert_sse2_h_last
        mov     [esp+52], esi           ; remember where the tail lives
        lea     edi, [esp+32]
        mov     ecx, eax
        rep movsb                       ; tail -> work (esi is advanced)
        movdqa  xmm0, [esp+32]
        movdqa  xmm3, xmm0              ; luma comes from the same work data
        psrlw   xmm0, 8
        psllw   xmm3, 8
        psubw   xmm0, xmm6
        psrlw   xmm3, 8
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        pmaddwd xmm0, xmm5
        pmaddwd xmm1, xmm4
        pmaddwd xmm2, [esp+16]
        paddd   xmm0, xmm7
        paddd   xmm1, xmm7
        paddd   xmm2, xmm7
        psrad   xmm0, 14
        psrad   xmm1, 14
        psrad   xmm2, 14
        packssdw xmm0, xmm0             ; dD_dC_dB_dA_dD_dC_dB_dA
        packssdw xmm1, xmm1             ; UD_UC_UB_UA_UD_UC_UB_UA
        packssdw xmm2, xmm2             ; VD_VC_VB_VA_VD_VC_VB_VA
        punpcklwd xmm0, xmm0            ; dD_dD_dC_dC_dB_dB_dA_dA
        punpcklwd xmm1, xmm2            ; VD_UD_VC_UC_VB_UB_VA_UA
        paddw   xmm0, xmm3              ; Y + delta, 8 words
        paddw   xmm1, xmm6              ; UV+128
        packuswb xmm1, xmm1             ; 8 U'/V' bytes (duplicated)
        packuswb xmm0, xmm0             ; 8 Y' bytes (duplicated)
        punpcklbw xmm0, xmm1            ; Y U Y V ...
        movdqa  [esp+32], xmm0
        lea     esi, [esp+32]
        mov     edi, [esp+52]
        mov     ecx, eax
        rep movsb                       ; work -> tail, only abs(step)%16 bytes
yuy2_convert_sse2_h_last:
        mov     ecx, [esp+48]           ; abs(step)/16 for the next row
        mov     esi, [ebp+24+4]
        add     esi, [ebp+24+8]         ; row pointer += step
        mov     [ebp+24+4], esi
        dec     ebx
        jnz     yuy2_convert_sse2_v_head
;-------------------------------------------------------------------
; Cleanup
yuy2_convert_sse2_exit:
        mov     esp, ebp
        pop     edi
        pop     esi
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        ret     16
_yuy2_convert_sse2@16 ENDP
;-------------------------------------------------------------------
; 廔椆
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; _TEXT64 僙僌儊儞僩偺廔椆
END
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -