;; yuvammx.asm
;; Font size:
%define yuva_lines dword [esp+4*(9+npush)]
%define yuva_pitch dword [esp+4*(10+npush)]
%define yuva_startx dword [esp+4*(11+npush)]
%define yuva_starty dword [esp+4*(12+npush)]
%define dst dword [esp+4*(13+npush)]
%define dst_pels dword [esp+4*(14+npush)]
%define dst_lines dword [esp+4*(15+npush)]
%define dst_pitch dword [esp+4*(16+npush)]
%define dst_startx dword [esp+4*(17+npush)]
%define dst_starty dword [esp+4*(18+npush)]
%define width dword [esp+4*(19+npush)]
%define height dword [esp+4*(20+npush)]
;; ---- register save and stack temporaries -----------------------------
;; Five registers are pushed and ntmps dwords are reserved below them.
;; The code that restores them is not part of this excerpt.
push ebp
push esi
push edi
push ecx
push ebx
;; tmp on stack
;; ntmps = number of dword temporaries reserved on the stack.
;; npush = number of dwords now lying between esp and the return address
;;         (5 pushed registers + ntmps).  The stack-argument macros add
;;         npush to their slot number so that they still address the
;;         caller's arguments after esp has been lowered.
%assign ntmps 15
%assign npush (5+ntmps)
sub esp, ntmps*4 ;; reserve the temporaries (left uninitialised here)
;; --- yuva image ---
%define yuvay dword [esp + 0*4] ;; address in the yuva Y plane
%define yuvau dword [esp + 1*4] ;; address in the yuva U plane (yuva + pitch*lines)
%define yuvaoffsetv dword [esp + 2*4] ;; (lines/2)*(pitch/2): added to a U address to reach V
%define yuvaoffseta dword [esp + 3*4] ;; pitch*lines + 2*(lines/2)*(pitch/2): added to a Y address to reach alpha
%define negyuvapitch dword [esp + 4*4] ;; -yuva_pitch, used to step between line 1 and line 2
%define yuvayinc dword [esp + 5*4] ;; 2*yuva_pitch - (width & ~3)
%define yuvauvinc dword [esp + 6*4] ;; yuva_pitch/2 - (width & ~3)/2
;; --- src image ---
%define sy dword [esp + 7*4] ;; address in the src Y plane
%define su dword [esp + 8*4] ;; address in the src U plane (src + pitch*lines)
%define soffsetv dword [esp + 9*4] ;; (lines/2)*(pitch/2): added to a U address to reach V
%define syinc dword [esp + 10*4] ;; 2*src_pitch - (width & ~3)
%define suvinc dword [esp + 11*4] ;; src_pitch/2 - (width & ~3)/2
;; --- dst image and loop state ---
%define dy dword [esp + 12*4] ;; current dst write address
%define width0 dword [esp + 13*4] ;; pel-loop counter (remaining pels - 3)
%define dyinc dword [esp + 14*4] ;; 2*dst_pitch - 2*(width & ~3)
;;
;; ---- plane base addresses, plane offsets and line increments ---------
;; edi = width & ~3 and ebp = (width & ~3)/2 stay live through this whole
;; section.  Each *inc value is "distance to skip after the four-pel loop
;; has advanced a pointer by the truncated width"; the code that applies
;; them is outside this excerpt.
mov edi, width
and edi, -4 ;; width truncated to multiple of 4 (width & ~ 3)
mov ebp, edi ;; truncated for address increment computation only
shr ebp, 1 ;; (width & ~ 3)/2
;;----- dst
mov eax, dst
mov dy, eax ;; dy = dst base
mov ecx, dst_pitch
add ecx, ecx ;; 2*pitch
sub ecx, edi
sub ecx, edi ;; minus 2*(width & ~3): edi is subtracted twice
mov dyinc, ecx ;; inc = 2*pitch - 2*(width & ~ 3)
;;----- src
mov eax, src_lines
mov ecx, src_pitch
mov edx, eax ;; keep lines for the chroma size below
imul eax, ecx ;; pitch*lines
mov esi, src
mov sy, esi ;; sy = src base
add esi, eax
mov su, esi ;; su = src + pitch*lines
shr edx, 1 ;; lines/2
shr ecx, 1 ;; pitch/2
imul edx, ecx ;; (pitch/2)*(lines/2)
mov soffsetv, edx ;; offset from a U address to the matching V address
sub ecx, ebp ;; pitch/2 - (width & ~ 3)/2
mov suvinc, ecx
mov ecx, src_pitch
add ecx, ecx
sub ecx, edi
mov syinc, ecx ;; inc = 2*pitch - (width & ~ 3)
;;----- yuva
mov eax, yuva_lines
mov ecx, yuva_pitch
mov edx, ecx
neg edx
mov negyuvapitch, edx ;; -pitch, kept so the pel loops can step line 1 <-> line 2
mov edx, eax ;; lines
imul eax, ecx ;; pitch*lines
mov esi, yuva
mov yuvay, esi ;; yuvay = yuva base
add esi, eax
mov yuvau, esi ;; yuvau = yuva + pitch*lines
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; lines/2
imul edx, ecx ;; (lines/2)*(pitch/2)
mov yuvaoffsetv, edx ;; offset from a U address to the matching V address
lea edx, [eax+edx*2] ;; pitch*lines + 2*(lines/2)*(pitch/2)
mov yuvaoffseta, edx ;; offset from a Y address to the matching alpha address
sub ecx, ebp ;; pitch/2 - (width & ~ 3)/2
mov yuvauvinc, ecx
mov ecx, yuva_pitch
add ecx, ecx
sub ecx, edi
mov yuvayinc, ecx ;; inc = 2*pitch - (width & ~ 3)
;;-----
;;------------------
;; Move the stored plane addresses from the image origin to the start
;; position (startx, starty) of each image.  Only the Y and U addresses
;; are stored; V and alpha are reached through the offsets computed
;; earlier, so they need no separate adjustment.
;;
;; src: pointer adjustment to (x,y)
mov ecx, src_pitch
mov eax, src_starty
mov edx, eax ;; keep y for the chroma computation
mov ebx, src_startx
imul eax, ecx ;; y*pitch
mov esi, sy ;;
add esi, eax ;; sy + y*pitch
add esi, ebx ;; sy + y*pitch + x
mov sy, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
mov esi, su ;;
add esi, edx ;; su + (y/2)*(pitch/2)
add esi, ebx ;; su + (y/2)*(pitch/2) + x/2
mov su, esi
;;
;; yuva: pointer adjustment to (x,y)
mov ecx, yuva_pitch
mov eax, yuva_starty
mov edx, eax ;; keep y for the chroma computation
mov ebx, yuva_startx
imul eax, ecx ;; y*pitch
add eax, ebx ;; y*pitch + x
mov esi, yuvay ;;
add esi, eax ;; yuvay + y*pitch + x
mov yuvay, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
mov esi, yuvau ;;
add esi, edx ;; yuvau + (y/2)*(pitch/2)
add esi, ebx ;; yuvau + (y/2)*(pitch/2) + x/2
mov yuvau, esi
;; dst: pointer adjustment to (x,y); x is doubled for the byte offset
mov ecx, dst_pitch
mov eax, dst_starty
mov ebx, dst_startx
imul eax, ecx ;; y*pitch
add ebx, ebx ;; 2*x
mov esi, dy ;;
add esi, eax ;; dy + y*pitch
add esi, ebx ;; dy + y*pitch + 2*x
mov dy, esi
;;-----
;; Data layout handled by one pass of the four-pel loop (two picture
;; lines, four pels wide).  P is the yuva sample, Q the I420 src sample.
;;
;; yuva P = top
;; [Y01Y02][Y03Y04]
;; [Y11Y12][Y13Y14]
;; [U1] [U3]
;; [V1] [V3]
;; [A01A02][A03A04]
;; [A11A12][A13A14]
;; I420 src Q = bot
;; [Y01Y02][Y03Y04]
;; [Y11Y12][Y13Y14]
;; [U1] [U3]
;; [V1] [V3]
;; YUY2 dst
;; [YUYV][YUYV] ;; byte order [Y0][U][Y1][V]
;; [YUYV][YUYV]
;;-------
;; Constants that stay in MMX registers for the whole routine.
pxor mm7, mm7 ;; mm7 = 0, the zero operand of every byte->word unpack
movq mm6, [con1] ;; multiplier for pmaddwd when adjacent alpha words are summed
                 ;; NOTE(review): con1 is defined outside this excerpt,
                 ;; presumably four words of 1 - confirm
;; set up line loop
mov eax, height
dec eax
mov height, eax ;; stored back as height - 1 (mov leaves the flags of dec intact)
jle near exit ;; nothing to do when height <= 1
;; Register roles from here on:
;;   esi = yuva Y address, ebx = yuva U address,
;;   edi = src Y address,  edx = src U address
mov esi, yuvay
mov ebx, yuvau
mov edi, sy
mov edx, su
;; Top of the line loop.  The pel loops below read and write two picture
;; lines per pass.  width0 is the pel-loop counter: it starts at
;; width - 3, so it is positive exactly when at least four pels remain.
y100: ;; line loop
mov eax, width
sub eax, 3
mov width0, eax ;; mov does not change the flags set by sub
jle near two_pels ;; fewer than four pels: skip the four-pel loop (eax = width - 3)
;;----------------------------------------------------------------------
;; a100: four-pel inner loop.
;;
;; Each pass blends a block of 4 pels x 2 lines of the yuva image (P)
;; with the I420 src image (Q) and writes 2 x 8 bytes of YUY2 to dst.
;; Per byte the result is
;;     P + ((alpha * (Q - P)) >> 8)        (taken modulo 256)
;; Luma uses the per-pel alpha.  Chroma uses the average of each
;; horizontal alpha pair, taken separately for line 1 and line 2.
;;
;; In:    esi = yuva Y address (line 1)     edi = src Y address (line 1)
;;        ebx = yuva U address              edx = src U address
;;        mm7 = 0                           mm6 = [con1]
;;        width0 = remaining pels - 3, > 0 on entry
;; Out:   esi, edi advanced by 4; ebx, edx advanced by 2; dy advanced by 8
;;        per pass.  Leaves with eax = width0 <= 0 and falls through.
;; Uses:  eax, ecx, ebp, mm0-mm5, flags
;;----------------------------------------------------------------------
a100:
;; do four pels per iteration
;;
mov ecx, yuvaoffseta ;; ecx = offset from a yuva Y address to its alpha
mov eax, src_pitch ;; eax = offset from src line 1 to src line 2
;; ------- first line --------
;; yuva 4 y's
movd mm0, [esi] ;; p4 p3 p2 p1
punpcklbw mm0, mm7 ;; word p4 p3 p2 p1
;; src I420 4 y's
;; A 4-byte load is sufficient: punpcklbw only consumes the low four
;; bytes.  (An 8-byte movq here would read four bytes beyond the pels
;; being processed for no benefit; line 2 already uses movd.)
movd mm2, [edi] ;; q4 q3 q2 q1
punpcklbw mm2, mm7 ;; word q4 q3 q2 q1
psubw mm2, mm0 ;; q-p
;; yuva 4 a's
movd mm4, [esi+ecx] ;; a4 a3 a2 a1
punpcklbw mm4, mm7 ;; word a4 a3 a2 a1
pmullw mm2, mm4 ;; word alpha*(q-p) for y4 y3 y2 y1
psrlw mm2, 8 ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1
paddb mm2, mm0 ;; blended for 0 y4 0 y3 0 y2 0 y1
pmaddwd mm4, mm6 ;; line 1 dword a4+a3 a2+a1
;;
;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
;; mm4 = line 1 dword sums a4+a3 a2+a1 (halved further down)
;;
;; ------ second line ---------
mov ebp, negyuvapitch
sub esi, ebp ;; point to line 2 (subtracting -pitch adds pitch)
;; yuva 4 y's
movd mm1, [esi] ;; p4 p3 p2 p1
punpcklbw mm1, mm7 ;; word p4 p3 p2 p1
;; src I420 4 y's
movd mm3, [edi+eax] ;; q4 q3 q2 q1
punpcklbw mm3, mm7 ;; word q4 q3 q2 q1
psubw mm3, mm1 ;; q-p
;; yuva 4 a's
movd mm5, [esi+ecx] ;; a4 a3 a2 a1
punpcklbw mm5, mm7 ;; word a4 a3 a2 a1
lea esi, [esi+ebp+4] ;; back to line 1 and inc
add edi, 4 ;; inc sy
pmullw mm3, mm5 ;; word alpha*(q-p) for y4 y3 y2 y1
psrlw mm3, 8 ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1
paddb mm3, mm1 ;; blended for 0 y4 0 y3 0 y2 0 y1
pmaddwd mm5, mm6 ;; line 2 dword a4+a3 a2+a1
;;
packssdw mm4, mm5 ;; for u v sum a22 a21 a12 a11
psrlw mm4, 1 ;; for u v ave a22 a21 a12 a11
;;
;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1
;; mm4 = ave for line 2 line 1 = a22 a21 a12 a11
;;
;; ----- U V --------
;; The two chroma samples are duplicated so that the same values are
;; blended once with the line 1 alphas and once with the line 2 alphas.
mov ecx, yuvaoffsetv ;; ecx = offset from yuva U to yuva V
mov eax, soffsetv ;; eax = offset from src U to src V
;; --U--
movd mm0, [ebx] ;; x x pu2 pu1
punpcklwd mm0, mm0 ;; byte pu2 pu1 pu2 pu1
punpcklbw mm0, mm7 ;; word pu2 pu1 pu2 pu1
movd mm1, [edx] ;; x x qu2 qu1
punpcklwd mm1, mm1 ;; byte qu2 qu1 qu2 qu1
punpcklbw mm1, mm7 ;; word qu2 qu1 qu2 qu1
psubw mm1, mm0 ;; qu - pu
pmullw mm1, mm4 ;; alpha*(qu-pu)
psrlw mm1, 8
paddb mm1, mm0 ;; line 2 line 1 blended u2 u1 u2 u1
;; --V--
movd mm0, [ebx+ecx] ;; x x pv2 pv1
punpcklwd mm0, mm0 ;; byte pv2 pv1 pv2 pv1
punpcklbw mm0, mm7 ;; word pv2 pv1 pv2 pv1
;; Load exactly two bytes of src V instead of "movd mm5, [edx+eax]":
movzx eax, word [edx+eax] ;; prevent possible access vio
movd mm5, eax ;; 0 0 qv2 qv1
punpcklwd mm5, mm5 ;; byte qv2 qv1 qv2 qv1
punpcklbw mm5, mm7 ;; word qv2 qv1 qv2 qv1
psubw mm5, mm0 ;; qv - pv
add ebx, 2 ;; inc yuvau address
add edx, 2 ;; inc su address
pmullw mm5, mm4 ;; alpha*(qv-pv)
psrlw mm5, 8
paddb mm5, mm0 ;; line 2 line 1 blended v2 v1 v2 v1
;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1
;; mm1 = line 2 line 1 blended 0 u2 0 u1 0 u2 0 u1
;; mm5 = line 2 line 1 blended 0 v2 0 v1 0 v2 0 v1
;;
;; ----- interleave to YUY2 and store both lines -----
mov ecx, dy
mov eax, dst_pitch
packuswb mm2, mm3 ;; blended y24 y23 y22 y21 y14 y13 y12 y11
psllq mm5, 8 ;; v2 0 v1 0 v2 0 v1 0
por mm1, mm5 ;; v22 u22 v21 u21 v12 u12 v11 u11
movq mm3, mm2 ;; copy: the low half feeds line 1, the high half line 2
punpcklbw mm3, mm1 ;; v12 y14 u12 y13 v11 y12 u11 y11
movq [ecx], mm3 ;; line 1 result
punpckhbw mm2, mm1 ;; v22 y24 u22 y23 v21 y22 u21 y21
movq [ecx+eax], mm2 ;; line 2 result
add ecx, 8 ;; inc dy address
mov dy, ecx
mov eax, width0 ;; pel loop
sub eax, 4
mov width0, eax ;; mov keeps the flags of sub for the jump below
jg near a100 ;; at least four more pels remain
;;------------------------------
two_pels:
;; do two pels if any
;; remaining pels = eax+3
;; compute 2 pels if remaining pels = 2 or 3, 1 not computed
add eax, 2
jle near line_done
;;
;;
mov ecx, yuvaoffseta
mov eax, src_pitch
;; ------- first line --------
;; yuva 2 y's
movd mm0, [esi] ;; p2 p1
punpcklbw mm0, mm7 ;; word p2 p1
;; src I420 2 y's
movq mm2, [edi] ;; q2 q1
punpcklbw mm2, mm7 ;; word q2 q1
psubw mm2, mm0 ;; q-p
;; yuva 2 a's
movd mm4, [esi+ecx] ;; a2 a1
punpcklbw mm4, mm7 ;; word a2 a1
pmullw mm2, mm4 ;; word alpha*(q-p) for y2 y1
psrlw mm2, 8 ;; byte alpha*(q-p) for 0 y2 0 y1
paddb mm2, mm0 ;; blended for 0 y2 0 y1
pmaddwd mm4, mm6 ;; line 1 dword a2+a1
;;
;; mm2 = line 1 blended for 0 y2 0 y1
;; mm4 = line 1 ave(a2,a1)
;;
;; ------ second line ---------
mov ebp, negyuvapitch
sub esi, ebp ;; point to line 2
;; yuva 2 y's
movd mm1, [esi] ;; p2 p1
punpcklbw mm1, mm7 ;; word p2 p1
;; src I420 2 y's
movd mm3, [edi+eax] ;; q2 q1
punpcklbw mm3, mm7 ;; word q2 q1
psubw mm3, mm1 ;; q-p
;; yuva 2 a's
;;movd mm5, [esi+ecx] ;; a2 a1
movzx eax, word [esi+ecx] ;; prevent possible access vio
movd mm5, eax ;; a2 a1
punpcklbw mm5, mm7 ;; word a2 a1
lea esi, [esi+ebp] ;; back to line 1, no inc
;;add edi, 4 ;; inc sy
pmullw mm3, mm5 ;; word alpha*(q-p) for y2 y1
psrlw mm3, 8 ;; byte alpha*(q-p) for 0 y2 0 y1
paddb mm3, mm1 ;; blended for 0 y2 0 y1
pmaddwd mm5, mm6 ;; line 2 dword a2+a1
;;
packssdw mm4, mm5 ;; for uv sum xx a21 xx a11
psrlw mm4, 1 ;; for uv ave xx a21 xx a11
;;
;; mm2 = line 1 blended for 0 y2 0 y1
;; mm3 = line 2 blended for 0 y2 0 y1
;; mm4 = ave = a21 a11
;;
;; ----- U V --------
mov ecx, yuvaoffsetv
mov eax, soffsetv
;; --U--
;;movd mm0, [ebx] ;; x x x pu1
;;punpcklwd mm0, mm0 ;; byte x pu1 x pu1
;;punpcklbw mm0, mm7 ;; word x pu1 x pu1
movzx ebp, byte [ebx]
movd mm0, ebp ;; 0 0 0 pu1
punpckldq mm0, mm0 ;; word 0 pu1 0 pu1
;;movd mm1, [edx] ;; x x qu2 qu1
;;punpcklwd mm1, mm1 ;; byte qu2 qu1 qu2 qu1
;;punpcklbw mm1, mm7 ;; word qu2 qu1 qu2 qu1
movzx ebp, byte [edx]
movd mm1, ebp ;; 0 0 0 qu1
punpckldq mm1, mm1 ;; word 0 qu1 0 qu1
psubw mm1, mm0 ;; qu - pu
pmullw mm1, mm4 ;; alpha*(qu-pu)
psrlw mm1, 8
paddb mm1, mm0 ;; line 2 line 1 blended u2 u1 u2 u1
;; --V--
;;movd mm0, [ebx+ecx] ;; x x pv2 pv1
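;; Keyboard shortcuts (code-viewer page text, not part of the program)
;; Copy code: Ctrl + C
;; Search code: Ctrl + F
;; Full-screen mode: F11
;; Toggle theme: Ctrl + Shift + D
;; Show shortcuts: ?
;; Increase font size: Ctrl + =
;; Decrease font size: Ctrl + -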