📄 yuvammx.asm
字号:
;;
mov ecx, dy
mov eax, dst_pitch
packuswb mm2, mm3 ;; blended y24 y23 y22 y21 y14 y13 y12 y11
psllq mm5, 8 ;; v2 0 v1 0 v2 0 v1 0
por mm1, mm5 ;; v22 u22 v21 u21 v12 u12 v11 u11
movq mm3, mm1
punpcklbw mm1, mm2 ;; y14 v12 y13 u12 y12 v11 y11 u11
movq [ecx], mm1 ;; line 1 result
punpckhbw mm3, mm2 ;; y24 v22 y23 u22 y22 v21 y21 u21
movq [ecx+eax], mm3 ;; line 2 result
add ecx, 8 ;; inc dy address
mov dy, ecx
mov eax, width0 ;; pel loop
sub eax, 4
mov width0, eax
jg near a100
;;------------------------------
two_pels:
;; do two pels if any
;; remaining pels = eax+3
;; compute 2 pels if remaining pels = 2 or 3, 1 not computed
add eax, 2
jle near line_done
;;
;;
mov ecx, yuvaoffseta
mov eax, src_pitch
;; ------- first line --------
;; yuva 2 y's
movd mm0, [esi] ;; p2 p1
punpcklbw mm0, mm7 ;; word p2 p1
;; src I420 2 y's
movq mm2, [edi] ;; q2 q1
punpcklbw mm2, mm7 ;; word q2 q1
psubw mm2, mm0 ;; q-p
;; yuva 2 a's
movd mm4, [esi+ecx] ;; a2 a1
punpcklbw mm4, mm7 ;; word a2 a1
pmullw mm2, mm4 ;; word alpha*(q-p) for y2 y1
psrlw mm2, 8 ;; byte alpha*(q-p) for 0 y2 0 y1
paddb mm2, mm0 ;; blended for 0 y2 0 y1
pmaddwd mm4, mm6 ;; line 1 dword a2+a1
;;
;; mm2 = line 1 blended for 0 y2 0 y1
;; mm4 = line 1 ave(a2,a1)
;;
;; ------ second line ---------
mov ebp, negyuvapitch
sub esi, ebp ;; point to line 2
;; yuva 2 y's
movd mm1, [esi] ;; p2 p1
punpcklbw mm1, mm7 ;; word p2 p1
;; src I420 2 y's
movd mm3, [edi+eax] ;; q2 q1
punpcklbw mm3, mm7 ;; word q2 q1
psubw mm3, mm1 ;; q-p
;; yuva 2 a's
;;movd mm5, [esi+ecx] ;; a2 a1
movzx eax, word [esi+ecx] ;; prevent possible access vio
movd mm5, eax ;; a2 a1
punpcklbw mm5, mm7 ;; word a2 a1
lea esi, [esi+ebp] ;; back to line 1, no inc
;;add edi, 4 ;; inc sy
pmullw mm3, mm5 ;; word alpha*(q-p) for y2 y1
psrlw mm3, 8 ;; byte alpha*(q-p) for 0 y2 0 y1
paddb mm3, mm1 ;; blended for 0 y2 0 y1
pmaddwd mm5, mm6 ;; line 2 dword a2+a1
;;
packssdw mm4, mm5 ;; for uv sum xx a21 xx a11
psrlw mm4, 1 ;; for uv ave xx a21 xx a11
;;
;; mm2 = line 1 blended for 0 y2 0 y1
;; mm3 = line 2 blended for 0 y2 0 y1
;; mm4 = ave = a21 a11
;;
;; ----- U V --------
mov ecx, yuvaoffsetv
mov eax, soffsetv
;; --U--
;;movd mm0, [ebx] ;; x x x pu1
;;punpcklwd mm0, mm0 ;; byte x pu1 x pu1
;;punpcklbw mm0, mm7 ;; word x pu1 x pu1
movzx ebp, byte [ebx]
movd mm0, ebp ;; 0 0 0 pu1
punpckldq mm0, mm0 ;; word 0 pu1 0 pu1
;;movd mm1, [edx] ;; x x qu2 qu1
;;punpcklwd mm1, mm1 ;; byte qu2 qu1 qu2 qu1
;;punpcklbw mm1, mm7 ;; word qu2 qu1 qu2 qu1
movzx ebp, byte [edx]
movd mm1, ebp ;; 0 0 0 qu1
punpckldq mm1, mm1 ;; word 0 qu1 0 qu1
psubw mm1, mm0 ;; qu - pu
pmullw mm1, mm4 ;; alpha*(qu-pu)
psrlw mm1, 8
paddb mm1, mm0 ;; line 2 line 1 blended u2 u1 u2 u1
;; --V--
;;movd mm0, [ebx+ecx] ;; x x pv2 pv1
movzx ebp, byte [ebx+ecx] ;; prevent possible access vio
movd mm0, ebp ;; 0 0 0 pv1
punpckldq mm0, mm0 ;; word 0 pv1 0 pv1
;;movd mm5, [edx+eax] ;; qv1
movzx eax, byte [edx+eax] ;; prevent possible access vio
movd mm5, eax ;; 0 0 0 qv1
punpckldq mm5, mm5 ;; word 0 qv1 0 qv1
psubw mm5, mm0 ;; qv - pv
;;add ebx, 2 ;; inc yuvau address
;;add edx, 2 ;; inc su address
pmullw mm5, mm4 ;; alpha*(qv-pv)
psrlw mm5, 8
paddb mm5, mm0 ;; line 2 line 1 blended v2 v1 v2 v1
;; mm2 = line 1 blended for 0 y2 0 y1
;; mm3 = line 2 blended for 0 y2 0 y1
;; mm1 = line 2 line 1 blended 0 u2 0 u1
;; mm5 = line 2 line 1 blended 0 v2 0 v1
;;
mov ecx, dy
mov eax, dst_pitch
packuswb mm2, mm3 ;; x x y22 y21 x x y12 y11
psllq mm5, 8 ;; v2 0 v1 0 v2 0 v1 0
por mm1, mm5 ;; v22 u22 v21 u21 v12 u12 v11 u11
movq mm3, mm1
punpcklbw mm1, mm2 ;; v11 y12 u11 y11
movd [ecx], mm1 ;; line 1 result
punpckhbw mm3, mm2 ;; v21 y22 u21 y21
movd [ecx+eax], mm3 ;; line 2 result
;;-----------
;; line loop
line_done:
mov eax, yuvayinc ;; move down two lines
add esi, eax
mov eax, yuvauvinc
add ebx, eax
mov eax, syinc
add edi, eax
mov eax, suvinc
add edx, eax
mov eax, dyinc
mov ecx, dy
add ecx, eax
mov dy, ecx
mov eax, height
sub eax, 2
mov height, eax
jg near y100
;;-----------
exit:
xor eax, eax ;; return success
add esp, ntmps*4
pop ebx
pop ecx
pop edi
pop esi
pop ebp
emms
ret
;_I420andYUVAtoUYVY_MMX endp
;====================================
;====================================
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;int I420andI420toI420 (
;; unsigned char *src1_ptr, int src1_pels, int src1_lines, int src1_pitch,
;; unsigned char *src2_ptr, int src2_pels, int src2_lines, int src2_pitch,
;; unsigned char *dest_ptr, int dest_pels, int dest_lines, int dest_pitch,
;; int alpha)
;;
;;
;; src2 = top
;; inverted alpha
;; uv size computed as: uvpitch*uvlines = (pitch/2)*(lines/2)
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
_I420andI420toI420_MMX_sub:
;;
;; int I420andI420toI420_MMX_sub(
;; unsigned char *src1_ptr, int src1_pels, int src1_lines, int src1_pitch,
;; int src1_startx, int src1_starty,
;; unsigned char *src2_ptr, int src2_pels, int src2_lines, int src2_pitch,
;; int src2_startx, int src2_starty,
;; unsigned char *dest_ptr, int dest_pels, int dest_lines, int dest_pitch,
;; int dest_startx, int dest_starty,
;; int width, int height, int alpha );
;;
make_labels _I420andI420toI420_MMX_sub
;;
;; arguments
%define src dword [esp+4*(1+npush)]
%define src_pels dword [esp+4*(2+npush)]
%define src_lines dword [esp+4*(3+npush)]
%define src_pitch dword [esp+4*(4+npush)]
%define src_startx dword [esp+4*(5+npush)]
%define src_starty dword [esp+4*(6+npush)]
%define top dword [esp+4*(7+npush)]
%define top_pels dword [esp+4*(8+npush)]
%define top_lines dword [esp+4*(9+npush)]
%define top_pitch dword [esp+4*(10+npush)]
%define top_startx dword [esp+4*(11+npush)]
%define top_starty dword [esp+4*(12+npush)]
%define dst dword [esp+4*(13+npush)]
%define dst_pels dword [esp+4*(14+npush)]
%define dst_lines dword [esp+4*(15+npush)]
%define dst_pitch dword [esp+4*(16+npush)]
%define dst_startx dword [esp+4*(17+npush)]
%define dst_starty dword [esp+4*(18+npush)]
%define width dword [esp+4*(19+npush)]
%define height dword [esp+4*(20+npush)]
%define alpha dword [esp+4*(21+npush)]
push ebp
push esi
push edi
push ecx
push ebx
;; tmp on stack
%assign ntmps 16
%assign npush (5+ntmps)
sub esp, ntmps*4
;; cycle through pointers to y, then u, then v, indexed by ecx
;; structure below must agree
%define topptr dword [esp + 0*4 + ecx*4] ;; topy[ecx*4]
%define sptr dword [esp + 4*4 + ecx*4] ;; sy[ecx*4]
%define dptr dword [esp + 8*4 + ecx*4] ;; dy[ecx*4]
;;
%define topy dword [esp + 0*4]
%define topu dword [esp + 1*4]
%define topv dword [esp + 2*4]
%define toppitch dword [esp + 3*4]
%define sy dword [esp + 4*4]
%define su dword [esp + 5*4]
%define sv dword [esp + 6*4]
%define spitch dword [esp + 7*4]
%define dy dword [esp + 8*4]
%define du dword [esp + 9*4]
%define dv dword [esp + 10*4]
%define dpitch dword [esp + 11*4]
%define lines dword [esp + 12*4]
%define pels dword [esp + 13*4]
%define dtmp0 dword [esp + 14*4]
%define dtmp1 dword [esp + 15*4]
%define btmp0(x) byte [esp + 14*4 + x]
%define btmp1(x) byte [esp + 15*4 + x]
;;-----
mov eax, dst_lines
mov ecx, dst_pitch
mov dpitch, ecx
mov edx, eax
imul eax, ecx ;; pitch*lines
mov esi, dst
mov dy, esi
add esi, eax
mov du, esi
shr edx, 1 ;; lines/2 = uv lines
shr ecx, 1 ;; pitch/2 = uvpitch
imul edx, ecx ;;
add esi, edx
mov dv, esi
;;-----
mov eax, src_lines
mov ecx, src_pitch
mov spitch, ecx
mov edx, eax
imul eax, ecx ;; pitch*lines
mov esi, src
mov sy, esi
add esi, eax
mov su, esi
shr edx, 1 ;; lines/2 = uv lines
shr ecx, 1 ;; pitch/2 = uvpitch
imul edx, ecx ;;
add esi, edx
mov sv, esi
;;-----
mov eax, top_lines
mov ecx, top_pitch
mov toppitch, ecx
mov edx, eax
imul eax, ecx ;; pitch*lines
mov esi, top
mov topy, esi
add esi, eax
mov topu, esi
shr edx, 1 ;; lines/2 = uv lines
shr ecx, 1 ;; pitch/2 = uvpitch
imul edx, ecx ;;
add esi, edx
mov topv, esi
;;-----
mov eax, height
mov lines, eax
mov eax, width
mov pels, eax
;;------------------
;;------------------
;; pointer adjustment to (x,y)
mov ecx, top_pitch
mov eax, top_starty
mov edx, eax
mov ebx, top_startx
imul eax, ecx ;; y*pitch
mov esi, topy ;;
add esi, eax ;; sy + y*pitch
add esi, ebx ;; sy + y*pitch + x
mov topy, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
add edx, ebx ;; (y/2)*(pitch/2) + x/2
mov esi, topu ;;
add esi, edx ;; su + (y/2)*(pitch/2)
mov topu, esi
mov esi, topv ;;
add esi, edx ;; su + (y/2)*(pitch/2)
mov topv, esi
;;
;; pointer adjustment to (x,y)
mov ecx, src_pitch
mov eax, src_starty
mov edx, eax
mov ebx, src_startx
imul eax, ecx ;; y*pitch
mov esi, sy ;;
add esi, eax ;; sy + y*pitch
add esi, ebx ;; sy + y*pitch + x
mov sy, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
add edx, ebx ;; (y/2)*(pitch/2) + x/2
mov esi, su ;;
add esi, edx ;; su + (y/2)*(pitch/2)
mov su, esi
mov esi, sv ;;
add esi, edx ;; su + (y/2)*(pitch/2)
mov sv, esi
;;;
;; pointer adjustment to (x,y)
mov ecx, dst_pitch
mov eax, dst_starty
mov edx, eax
mov ebx, dst_startx
imul eax, ecx ;; y*pitch
mov esi, dy ;;
add esi, eax ;; sy + y*pitch
add esi, ebx ;; sy + y*pitch + x
mov dy, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
add edx, ebx ;; (y/2)*(pitch/2) + x/2
mov esi, du ;;
add esi, edx ;; su + (y/2)*(pitch/2)
mov du, esi
mov esi, dv ;;
add esi, edx ;; su + (y/2)*(pitch/2)
mov dv, esi
;;-----
;; I420 top P = top
;; [Y01Y02][Y03Y04]
;; [Y
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -