;; yuvammx.asm
;; (web-viewer residue: font-size control label)
add eax, ebx ;; y*pitch + x
mov esi, yuvay ;;
add esi, eax ;; yuvay + y*pitch + x
mov yuvay, esi
mov edi, yuvaa ;;
add edi, eax ;; yuvaa + y*pitch + x
mov yuvaa, edi
mov yuvaauv, edi ;; dup for uv use
shr ecx, 1 ;; pitch/2
shr edx, 1 ;; y/2
imul edx, ecx ;; (y/2)*(pitch/2)
shr ebx, 1 ;; x/2
mov esi, yuvau ;;
add esi, edx ;; yuvau + (y/2)*(pitch/2)
add esi, ebx ;; yuvau + (y/2)*(pitch/2) + x/2
mov yuvau, esi
;;-----
;; yuva P = top
;; [Y01Y02][Y03Y04]
;; [Y11Y12][Y13Y14]
;; [U1] [U3]
;; [V1] [V3]
;; [A01A02][A03A04]
;; [A11A12][A13A14]
;; I420 src Q = bot
;; [Y01Y02][Y03Y04]
;; [Y11Y12][Y13Y14]
;; [U1] [U3]
;; [V1] [V3]
;; I420 dst
;; [Y01Y02][Y03Y04]
;; [Y11Y12][Y13Y14]
;; [U1] [U3]
;; [V1] [V3]
;;-------
mov esi, yuvay
mov ecx, yuvaa
mov edi, sy
mov ebx, dy
mov ebp, height ;; y lines loop counter
or ebp, ebp
jle near exit
y100:
mov eax, width
sub eax, 7
jle y_by_fours
movq mm7, [mask3]
movq mm6, [mask3b]
a100:
;; yuva 8 y's
movq mm0, [esi] ;; p8 p7 p6 p5 p4 p3 p2 p1
movq mm1, mm0 ;; p8 p7 p6 p5 p4 p3 p2 p1
psrlw mm0, 8 ;; word p8 p6 p4 p2
;; src I420 8 y's
movq mm2, [edi] ;; q8 q7 q6 q5 q4 q3 q2 q1
movq mm3, mm2 ;; q8 q7 q6 q5 q4 q3 q2 q1
psrlw mm2, 8 ;; word q8 q6 q4 q2
psubw mm2, mm0 ;; q-p
;; yuva 8 a's
movq mm4, [ecx] ;; a8 a7 a6 a5 a4 a3 a2 a1
movq mm5, mm4 ;; a8 a7 a6 a5 a4 a3 a2 a1
psrlw mm4, 8 ;; word a8 a6 a4 a2
pmullw mm2, mm4 ;; word alpha*(q-p) for y8 y6 y4 y2
pand mm2, mm6 ;; word hi-byte alpha*(q-p) for y8 y6 y4 y2
;;
;;
movq mm0, mm1 ;; p8 p7 p6 p5 p4 p3 p2 p1
pand mm1, mm7 ;; word p7 p5 p3 p1
pand mm3, mm7 ;; word q7 q5 q3 q1
psubw mm3, mm1 ;; q-p
pand mm5, mm7 ;; word a7 a5 a3 a1
pmullw mm3, mm5
psrlw mm3, 8 ;; alpha*(q-p) >> 8 for y7 y5 y3 y1
por mm2, mm3 ;; alpha*(q-p) >> 8 for y8 y7 y6 y5 y4 y3 y2 y1
paddb mm2, mm0 ;; blended y8 y7 y6 y5 y4 y3 y2 y1
movq [ebx], mm2
add esi, 8
add edi, 8
add ecx, 8
add ebx, 8
sub eax, 8
jg a100
;;----------------------
y_by_fours:
pxor mm7, mm7 ;; set to zero for unpack
add eax, 4
jle y_one_two_three
a200:
;; yuva 4 y's
movd mm0, [esi] ;; byte p4 p3 p2 p1
punpcklbw mm0, mm7 ;; p4 p3 p2 p1
;; src I420 4 y's
movd mm2, [edi] ;; byte q4 q3 q2 q1
punpcklbw mm2, mm7 ;; q4 q3 q2 q1
psubw mm2, mm0 ;; q-p
;; yuva 4 a's
movd mm4, [ecx] ;; byte a4 a3 a2 a1
punpcklbw mm4, mm7 ;; a4 a3 a2 a1
pmullw mm2, mm4
psrlw mm2, 8 ;; alpha*(q-p) >> 8 for y4 y3 y2 y1
paddb mm2, mm0 ;; blended word y4 y3 y2 y1
packuswb mm2, mm7 ;; blended byte y4 y3 y2 y1
;;
movd [ebx], mm2
add esi, 4
add edi, 4
add ecx, 4
add ebx, 4
sub eax, 4 ;; not needed if doing by_eights
jg a200 ;; not needed if doing by_eights
;;
;;
;;----------------------
;; do one, two, or three odd bytes
y_one_two_three:
add eax, 3 ;; number bytes to do
jle line_done
mov edx, 4
sub edx, eax ;; 4-eax = numbers bytes offset
;;
movd mm6, edx ;; number bytes to shift
psllq mm6, 3 ;; number bits to shift
;;
;; pointer adjustment, negative offset
neg edx
;;
;;
;; yuva 1-3 y's
movd mm0, [esi] ;; byte xx p3 p2 p1
punpcklbw mm0, mm7 ;; xx p3 p2 p1
;; src I420 1-3 y's
movd mm2, [edi] ;; byte xx q3 q2 q1
punpcklbw mm2, mm7 ;; xx p3 p2 p1
psubw mm2, mm0 ;; q-p
;; yuva 1-3 a's
movd mm4, [ecx+edx] ;; byte a3 a2 a1 xx
psrlq mm4, mm6 ;; byte xx a3 a2 a1
punpcklbw mm4, mm7 ;; xx a3 a2 a1
pmullw mm2, mm4
psrlw mm2, 8 ;; alpha*(q-p) >> 8 for xx y3 y2 y1
paddb mm2, mm0 ;; blended word xx y3 y2 y1
packuswb mm2, mm7 ;; blended byte xx y3 y2 y1
;;
movd dtmp0, mm2
;; store result, byte by byte
;; eax = bytes to do
a300:
mov dl, btmp0(eax-1)
mov [ebx+eax-1], dl
dec eax
jg a300
;;------
line_done:
mov eax, yuva_pitch
mov esi, yuvay
add esi, eax
mov yuvay, esi
mov ecx, yuvaa
add ecx, eax
mov yuvaa, ecx
mov eax, src_pitch
mov edi, sy
add edi, eax
mov sy, edi
mov eax, dst_pitch
mov ebx, dy
add ebx, eax
mov dy, ebx
dec ebp
jg near y100 ;; line loop
;;-----------------------------------------------
;;-----------------------------------------------
;; do u and v
;;
mov esi, yuvau
mov ecx, yuvaauv
add ecx, yuva_pitch ;; use a's from second line
mov yuvaauv, ecx ;; save for uv line loop
mov edi, su
mov ebx, du
mov ebp, height ;; uv lines loop counter
shr ebp, 1
jle near exit
uv100:
mov eax, width
shr eax, 1 ;; u v width = width/2
sub eax, 3
jle near uv_one_two_three
b200:
;; yuva 4 u's
movd mm0, [esi] ;; byte pu4 pu3 pu2 pu1
punpcklbw mm0, mm7 ;; pu4 pu3 pu2 pu1
;; src I420 4 u's
movd mm2, [edi] ;; byte qu4 qu3 qu2 qu1
punpcklbw mm2, mm7 ;; qu4 qu3 qu2 qu1
psubw mm2, mm0 ;; q-p
;; yuva 8 a's
;; We need to average the alpha values in horizontal pairs.
;; In the C version we do the average over the whole 2x2 block.
;;movq mm4, [ecx] ;; byte a8 a7 a6 a5 a4 a3 a2 a1
;;psrlw mm4, 8 ;; word a8 a6 a4 a2
movq mm4, [ecx]
movq mm7, mm4 ;; mm7=mm4= a8 a7 a6 a5 a4 a3 a2 a1
psllw mm7, 8 ;; mm7 = a7 00 a5 00 a3 00 a1 00
psrlw mm4, 8 ;; mm4 = 00 a8 00 a6 00 a4 00 a2
psrlw mm7, 8 ;; mm7 = 00 a7 00 a5 00 a3 00 a1
paddw mm4, mm7 ;; mm4 = (a8+a7) (a6+a5) (a4+a3) (a2+a1)
pxor mm7, mm7 ;; mm7 = 0...0
psrlw mm4, 1 ;; mm4 = (a8+a7)/2 (a6+a5)/2 (a4+a3)/2 (a2+a1)/2
pmullw mm2, mm4
psrlw mm2, 8 ;; alpha*(q-p) >> 8 for u4 u3 u2 u1
paddb mm2, mm0 ;; byte add -> blended word u4 u3 u2 u1
packuswb mm2, mm7 ;; blended byte u4 u3 u2 u1
mov edx, yuvaoffsetv
;; yuva 4 v's
movd mm0, [esi+edx] ;; byte pv4 pv3 pv2 pv1
punpcklbw mm0, mm7 ;; pv4 pv3 pv2 pv1
mov edx, soffsetv
;; src I420 4 v's
movd mm3, [edi+edx] ;; byte qv4 qv3 qv2 qv1
punpcklbw mm3, mm7 ;; qv4 qv3 qv2 qv1
psubw mm3, mm0 ;; q-p
pmullw mm3, mm4
psrlw mm3, 8 ;; alpha*(q-p) >> 8 for v4 v3 v2 v1
paddb mm3, mm0 ;; blended word v4 v3 v2 v1
packuswb mm3, mm7 ;; blended byte v4 v3 v2 v1
movd [ebx], mm2 ;; postpone store for src buf = dest buf
mov edx, doffsetv
movd [ebx+edx], mm3
add esi, 4
add edi, 4
add ecx, 8
add ebx, 4
sub eax, 4
jg near b200
;;----------------------
;; do one, two, or three odd bytes
;; reads are unaligned
uv_one_two_three:
add eax, 3
jle near uv_line_done
mov edx, 4
sub edx, eax ;; 4-eax = numbers bytes offset
;;
movd mm6, edx ;; number bytes to shift
psllq mm6, 3 ;; number bits to shift
;;
;; pointer adjustment, negative offset
neg edx
add esi, edx
add edi, edx
lea ecx, [ecx+edx*2]
;;
;; yuva 1-3 u's
movd mm0, [esi] ;; byte pu3 pu2 pu1 xxx
punpcklbw mm0, mm7 ;; pu3 pu2 pu1 xxx
;; src I420 1-3 u's
movd mm2, [edi] ;; byte qu3 qu2 qu1 xxx
punpcklbw mm2, mm7 ;; qu3 qu2 qu1 xxx
psubw mm2, mm0 ;; q-p
;; yuva 1-3 a's
movq mm4, [ecx] ;; byte a6 a5 a4 a3 a2 a1 xx xx
psrlw mm4, 8 ;; word a6 a4 a2 xx
pmullw mm2, mm4
psrlw mm2, 8 ;; alpha*(q-p) >> 8 for u3 u2 u1 xx
paddb mm2, mm0 ;; blended word u3 u2 u1 xx
packuswb mm2, mm7 ;; blended byte u3 u2 u1 xx
movd dtmp0, mm2
mov edx, yuvaoffsetv
;; yuva 1-3 v's
movd mm0, [esi+edx] ;; byte pv3 pv2 pv1 xxx
punpcklbw mm0, mm7 ;; pv3 pv2 pv1 xxx
mov edx, soffsetv
;; src I420 1-3 v's
movd mm2, [edi+edx] ;; byte qv3 qv2 qv1 xx
punpcklbw mm2, mm7 ;; qv3 qv2 qv1 xx
psubw mm2, mm0 ;; q-p
pmullw mm2, mm4
psrlw mm2, 8 ;; alpha*(q-p) >> 8 for v3 v2 v1 xx
paddb mm2, mm0 ;; blended word v3 v2 v1 xx
packuswb mm2, mm7 ;; blended byte v3 v2 v1 xx
movd dtmp1, mm2
mov edx, doffsetv
;; store result, byte by byte
;; eax = bytes to do
neg eax
b300:
mov cl, btmp0(4+eax) ;; u ;; ecx trashed
mov ch, btmp1(4+eax) ;; v
mov [ebx], cl
mov [ebx+edx], ch
inc ebx
inc eax
jl b300
;;------
uv_line_done:
mov eax, yuvauvpitch
mov esi, yuvau
add esi, eax
mov yuvau, esi
mov eax, yuva_pitch
mov ecx, yuvaauv
lea ecx, [ecx + eax*2] ;; down two lines
mov yuvaauv, ecx ;; save for uv line loop
mov eax, suvpitch
mov edi, su
add edi, eax
mov su, edi
mov eax, duvpitch
mov ebx, du
add ebx, eax
mov du, ebx
dec ebp
jg near uv100 ;; uv line loop
;;------
exit:
xor eax, eax ;; return success
exit2:
add esp, ntmps*4
pop ebx
pop ecx
pop edi
pop esi
pop ebp
emms
ret
fail_exit:
mov eax, -1 ;; signal fail
jmp exit2
;_I420andYUVAtoI420_MMX endp
;====================================
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; I420andYUVAtoYUY2
;;
;; This function alpha-blends two I420 buffers into a third
;; YUY2 buffer using the alpha info tacked to the
;; end of the second I420 buffer
;;
;; yuva = top
;; inverted alpha
;; uv size computed as: uvpitch*uvlines = (pitch/2)*(lines/2)
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
_I420andYUVAtoYUY2_MMX:
;;
;; int I420andYUVAtoYUY2_MMX(
;; unsigned char* src, int src_pels, int src_lines, int src_pitch,
;; int src_startx, int src_starty,
;; unsigned char* yuva, int yuva_pels, int yuva_lines, int yuva_pitch,
;; int yuva_startx, int yuva_starty,
;; unsigned char* dst, int dst_pels, int dst_lines, int dst_pitch,
;; int dst_startx, int dst_starty,
;; int width, int height);
;;
make_labels _I420andYUVAtoYUY2_MMX
;;
;; arguments
%define src dword [esp+4*(1+npush)]
%define src_pels dword [esp+4*(2+npush)]
%define src_lines dword [esp+4*(3+npush)]
%define src_pitch dword [esp+4*(4+npush)]
%define src_startx dword [esp+4*(5+npush)]
%define src_starty dword [esp+4*(6+npush)]
%define yuva dword [esp+4*(7+npush)]
%define yuva_pels dword [esp+4*(8+npush)]
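;; (web-viewer residue, not part of the assembly source)
;; Keyboard shortcuts:
;;   Copy code           Ctrl + C
;;   Search code         Ctrl + F
;;   Full-screen mode    F11
;;   Toggle theme        Ctrl + Shift + D
;;   Show shortcuts      ?
;;   Increase font size  Ctrl + =
;;   Decrease font size  Ctrl + -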