📄 yuvammx.asm
字号:
movzx ebp, byte [ebx+ecx] ;; prevent possible access vio
movd mm0, ebp ;; 0 0 0 pv1
punpckldq mm0, mm0 ;; word 0 pv1 0 pv1
;;movd mm5, [edx+eax] ;; qv1
movzx eax, byte [edx+eax] ;; prevent possible access vio
movd mm5, eax ;; 0 0 0 qv1
punpckldq mm5, mm5 ;; word 0 qv1 0 qv1
psubw mm5, mm0 ;; qv - pv
;;add ebx, 2 ;; inc yuvau address
;;add edx, 2 ;; inc su address
pmullw mm5, mm4 ;; alpha*(qv-pv)
psrlw mm5, 8
paddb mm5, mm0 ;; line 2 line 1 blended v2 v1 v2 v1
;; mm2 = line 1 blended for 0 y2 0 y1
;; mm3 = line 2 blended for 0 y2 0 y1
;; mm1 = line 2 line 1 blended 0 u2 0 u1
;; mm5 = line 2 line 1 blended 0 v2 0 v1
;;
mov ecx, dy
mov eax, dst_pitch
packuswb mm2, mm3 ;; x x y22 y21 x x y12 y11
psllq mm5, 8 ;; v2 0 v1 0 v2 0 v1 0
por mm1, mm5 ;; v22 u22 v21 u21 v12 u12 v11 u11
movq mm3, mm2
punpcklbw mm2, mm1 ;; v11 y12 u11 y11
movd [ecx], mm2 ;; line 1 result
punpckhbw mm3, mm1 ;; v21 y22 u21 y21
movd [ecx+eax], mm3 ;; line 2 result
;;-----------
;; line loop
line_done:
;;
;;
mov eax, yuvayinc ;; move down two lines
add esi, eax
mov eax, yuvauvinc
add ebx, eax
mov eax, syinc
add edi, eax
mov eax, suvinc
add edx, eax
mov eax, dyinc
mov ecx, dy
add ecx, eax
mov dy, ecx
mov eax, height
sub eax, 2
mov height, eax
jg near y100
;;-----------
exit:
xor eax, eax ;; return success
add esp, ntmps*4
pop ebx
pop ecx
pop edi
pop esi
pop ebp
emms
ret
;_I420andYUVAtoYUY2_MMX endp
;====================================
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; I420andYUVAtoUYVY
;;
;; This function alpha-blends two I420 buffers into a third
;; UYVY buffer using the alpha info tacked to the
;; end of the second I420 buffer
;;
;; yuva = top
;; inverted alpha
;; uv size computed as: uvpitch*uvlines = (pitch/2)*(lines/2)
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
_I420andYUVAtoUYVY_MMX:
;;
;; int I420andYUVAtoUYVY_MMX(
;; unsigned char* src, int src_pels, int src_lines, int src_pitch,
;; int src_startx, int src_starty,
;; unsigned char* yuva, int yuva_pels, int yuva_lines, int yuva_pitch,
;; int yuva_startx, int yuva_starty,
;; unsigned char* dst, int dst_pels, int dst_lines, int dst_pitch,
;; int dst_startx, int dst_starty,
;; int width, int height);
;;
;;
make_labels _I420andYUVAtoUYVY_MMX
;; arguments
%define src dword [esp+4*(1+npush)]
%define src_pels dword [esp+4*(2+npush)]
%define src_lines dword [esp+4*(3+npush)]
%define src_pitch dword [esp+4*(4+npush)]
%define src_startx dword [esp+4*(5+npush)]
%define src_starty dword [esp+4*(6+npush)]
%define yuva dword [esp+4*(7+npush)]
%define yuva_pels dword [esp+4*(8+npush)]
%define yuva_lines dword [esp+4*(9+npush)]
%define yuva_pitch dword [esp+4*(10+npush)]
%define yuva_startx dword [esp+4*(11+npush)]
%define yuva_starty dword [esp+4*(12+npush)]
%define dst dword [esp+4*(13+npush)]
%define dst_pels dword [esp+4*(14+npush)]
%define dst_lines dword [esp+4*(15+npush)]
%define dst_pitch dword [esp+4*(16+npush)]
%define dst_startx dword [esp+4*(17+npush)]
%define dst_starty dword [esp+4*(18+npush)]
%define width dword [esp+4*(19+npush)]
%define height dword [esp+4*(20+npush)]
push ebp
push esi
push edi
push ecx
push ebx
;; tmp on stack
%assign ntmps 15
%assign npush (5+ntmps)
sub esp, ntmps*4
%define yuvay dword [esp + 0*4]
%define yuvau dword [esp + 1*4]
%define yuvaoffsetv dword [esp + 2*4]
%define yuvaoffseta dword [esp + 3*4]
%define negyuvapitch dword [esp + 4*4]
%define yuvayinc dword [esp + 5*4]
%define yuvauvinc dword [esp + 6*4]
%define sy dword [esp + 7*4]
%define su dword [esp + 8*4]
%define soffsetv dword [esp + 9*4]
%define syinc dword [esp + 10*4]
%define suvinc dword [esp + 11*4]
%define dy dword [esp + 12*4]
%define width0 dword [esp + 13*4]
%define dyinc dword [esp + 14*4]
mov edi, width
and edi, -4 ;; pels truncated to multiple of 4 (width & ~ 3)
mov ebp, edi
shr ebp, 1 ;; (width & ~ 3)/2
;;-----
mov eax, dst
mov dy, eax
mov ecx, dst_pitch
add ecx, ecx
sub ecx, edi
sub ecx, edi
mov dyinc, ecx ;; inc = 2*pitch - 2*(width & ~ 3)
;;-----
mov eax, src_lines
mov ecx, src_pitch
mov edx, eax
imul eax, ecx ;; pitch*lines
mov esi, src
mov sy, esi
add esi, eax
mov su, esi
shr edx, 1 ;; lines/2
shr ecx, 1 ;; pitch/2
imul edx, ecx ;; (pitch/2)*(lines/2)
mov soffsetv, edx
sub ecx, ebp ;; pitch/2 - width/2
mov suvinc, ecx
mov ecx, src_pitch
add ecx, ecx
sub ecx, edi
mov syinc, ecx ;; inc = 2*pitch - (width & ~ 3)
;;-----
mov eax, yuva_lines
mov ecx, yuva_pitch
mov edx, ecx
neg edx
mov negyuvapitch, edx
mov edx, eax ;; lines
imul eax, ecx ;; pitch*lines
mov esi, yuva
mov yuvay, esi
add esi, eax
mov yuvau, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; lines/2
imul edx, ecx ;; (lines/2)*(pitch/2)
mov yuvaoffsetv, edx
lea edx, [eax+edx*2]
mov yuvaoffseta, edx
sub ecx, ebp
mov yuvauvinc, ecx
mov ecx, yuva_pitch
add ecx, ecx
sub ecx, edi
mov yuvayinc, ecx ;; inc = 2*pitch - (width & ~ 3)
;;-----
;;------------------
;; pointer adjustment to (x,y)
mov ecx, src_pitch
mov eax, src_starty
mov edx, eax
mov ebx, src_startx
imul eax, ecx ;; y*pitch
mov esi, sy ;;
add esi, eax ;; sy + y*pitch
add esi, ebx ;; sy + y*pitch + x
mov sy, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
mov esi, su ;;
add esi, edx ;; su + (y/2)*(pitch/2)
add esi, ebx ;; su + (y/2)*(pitch/2) + x/2
mov su, esi
;;
;; pointer adjustment to (x,y)
mov ecx, yuva_pitch
mov eax, yuva_starty
mov edx, eax
mov ebx, yuva_startx
imul eax, ecx ;; y*pitch
add eax, ebx ;; y*pitch + x
mov esi, yuvay ;;
add esi, eax ;; yuvay + y*pitch + x
mov yuvay, esi
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
mov esi, yuvau ;;
add esi, edx ;; yuvau + (y/2)*(pitch/2)
add esi, ebx ;; yuvau + (y/2)*(pitch/2) + x/2
mov yuvau, esi
;; pointer adjustment to (x,y)
mov ecx, dst_pitch
mov eax, dst_starty
mov ebx, dst_startx
imul eax, ecx ;; y*pitch
add ebx, ebx ;; 2*x
mov esi, dy ;;
add esi, eax ;; dy + y*pitch
add esi, ebx ;; dy + y*pitch + 2*x
mov dy, esi
;;-----
;; yuva P = top
;; [Y01Y02][Y03Y04]
;; [Y11Y12][Y13Y14]
;; [U1] [U3]
;; [V1] [V3]
;; [A01A02][A03A04]
;; [A11A12][A13A14]
;; I420 src Q = bot
;; [Y01Y02][Y03Y04]
;; [Y11Y12][Y13Y14]
;; [U1] [U3]
;; [V1] [V3]
;; UYVY dst
;; [UYVY][UYVY] ;; byte order [U][Y0][V][Y1]
;; [UYVY][UYVY]
;;-------
pxor mm7, mm7
movq mm6, [con1]
;; set up line loop
mov eax, height
dec eax
mov height, eax
jle near exit
mov esi, yuvay
mov ebx, yuvau
mov edi, sy
mov edx, su
y100: ;; line loop
mov eax, width
sub eax, 3
mov width0, eax
jle near two_pels
a100:
;; do four pels per iteration
;;
mov ecx, yuvaoffseta
mov eax, src_pitch
;; ------- first line --------
;; yuva 4 y's
movd mm0, [esi] ;; p4 p3 p2 p1
punpcklbw mm0, mm7 ;; word p4 p3 p2 p1
;; src I420 4 y's
movq mm2, [edi] ;; q4 q3 q2 q1
punpcklbw mm2, mm7 ;; word q4 q3 q2 q1
psubw mm2, mm0 ;; q-p
;; yuva 4 a's
movd mm4, [esi+ecx] ;; a4 a3 a2 a1
punpcklbw mm4, mm7 ;; word a4 a3 a2 a1
pmullw mm2, mm4 ;; word alpha*(q-p) for y4 y3 y2 y1
psrlw mm2, 8 ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1
paddb mm2, mm0 ;; blended for 0 y4 0 y3 0 y2 0 y1
pmaddwd mm4, mm6 ;; line 1 dword a4+a3 a2+a1
;;
;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
;; mm4 = line 1 dword a4+a3 a2+a1 (pair sums; halved to averages later)
;;
;; ------ second line ---------
mov ebp, negyuvapitch
sub esi, ebp ;; point to line 2
;;add esi, yuva_pitch
;;;;add edi, src_pitch
;; yuva 4 y's
movd mm1, [esi] ;; p4 p3 p2 p1
punpcklbw mm1, mm7 ;; word p4 p3 p2 p1
;; src I420 4 y's
movd mm3, [edi+eax] ;; q4 q3 q2 q1
punpcklbw mm3, mm7 ;; word q4 q3 q2 q1
psubw mm3, mm1 ;; q-p
;; yuva 4 a's
movd mm5, [esi+ecx] ;; a4 a3 a2 a1
punpcklbw mm5, mm7 ;; word a4 a3 a2 a1
lea esi, [esi+ebp+4] ;; back to line 1 and inc
add edi, 4 ;; inc sy
pmullw mm3, mm5 ;; word alpha*(q-p) for y4 y3 y2 y1
psrlw mm3, 8 ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1
paddb mm3, mm1 ;; blended for 0 y4 0 y3 0 y2 0 y1
pmaddwd mm5, mm6 ;; line 2 dword a4+a3 a2+a1
;;
packssdw mm4, mm5 ;; for u v sum a22 a21 a12 a11
psrlw mm4, 1 ;; for u v ave a22 a21 a12 a11
;;
;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1
;; mm4 = ave for line 2 line 1 = a22 a21 a12 a11
;;
;; ----- U V --------
;;mov ebx, yuvau
;;mov edx, su
mov ecx, yuvaoffsetv
mov eax, soffsetv
;; --U--
movd mm0, [ebx] ;; x x pu2 pu1
punpcklwd mm0, mm0 ;; byte pu2 pu1 pu2 pu1
punpcklbw mm0, mm7 ;; word pu2 pu1 pu2 pu1
movd mm1, [edx] ;; x x qu2 qu1
punpcklwd mm1, mm1 ;; byte qu2 qu1 qu2 qu1
punpcklbw mm1, mm7 ;; word qu2 qu1 qu2 qu1
psubw mm1, mm0 ;; qu - pu
pmullw mm1, mm4 ;; alpha*(qu-pu)
psrlw mm1, 8
paddb mm1, mm0 ;; line 2 line 1 blended u2 u1 u2 u1
;; --V--
movd mm0, [ebx+ecx] ;; x x pv2 pv1
punpcklwd mm0, mm0 ;; byte pv2 pv1 pv2 pv1
punpcklbw mm0, mm7 ;; word pv2 pv1 pv2 pv1
;;movd mm5, [edx+eax] ;; x x qv2 qv1
movzx eax, word [edx+eax] ;; prevent possible access vio
movd mm5, eax ;; 0 0 qv2 qv1
punpcklwd mm5, mm5 ;; byte qv2 qv1 qv2 qv1
punpcklbw mm5, mm7 ;; word qv2 qv1 qv2 qv1
psubw mm5, mm0 ;; qv - pv
add ebx, 2 ;; inc yuvau address
add edx, 2 ;; inc su address
pmullw mm5, mm4 ;; alpha*(qv-pv)
psrlw mm5, 8
paddb mm5, mm0 ;; line 2 line 1 blended v2 v1 v2 v1
;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1
;; mm1 = line 2 line 1 blended 0 u2 0 u1 0 u2 0 u1
;; mm5 = line 2 line 1 blended 0 v2 0 v1 0 v2 0 v1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -