📄 yuvammx.asm
字号:
mov esi, yuvay ;; add esi, eax ;; yuvay + y*pitch + x mov yuvay, esi mov edi, yuvaa ;; add edi, eax ;; yuvaa + y*pitch + x mov yuvaa, edi mov yuvaauv, edi ;; dup for uv use shr ecx, 1 ;; pitch/2 shr edx, 1 ;; y/2 imul edx, ecx ;; (y/2)*(pitch/2) shr ebx, 1 ;; x/2 mov esi, yuvau ;; add esi, edx ;; yuvau + (y/2)*(pitch/2) add esi, ebx ;; yuvau + (y/2)*(pitch/2) + x/2 mov yuvau, esi ;;-----;; yuva P = top;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3];; [A01A02][A03A04];; [A11A12][A13A04];; I420 src Q = bot;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3];; I420 dst;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3];;------- mov esi, yuvay mov ecx, yuvaa mov edi, sy mov ebx, dy mov ebp, height ;; y lines loop counter or ebp, ebp jle near exity100: mov eax, width sub eax, 7 jle y_by_fours movq mm7, [mask3] movq mm6, [mask3b]a100: ;; yuva 8 y's movq mm0, [esi] ;; p8 p7 p6 p5 p4 p3 p2 p1 movq mm1, mm0 ;; p8 p7 p6 p5 p4 p3 p2 p1 psrlw mm0, 8 ;; word p8 p6 p4 p2 ;; src I420 8 y's movq mm2, [edi] ;; q8 q7 q6 q5 q4 q3 q2 q1 movq mm3, mm2 ;; q8 q7 q6 q5 q4 q3 q2 q1 psrlw mm2, 8 ;; word q8 q6 q4 q2 psubw mm2, mm0 ;; q-p ;; yuva 8 a's movq mm4, [ecx] ;; a8 a7 a6 a5 a4 a3 a2 a1 movq mm5, mm4 ;; a8 a7 a6 a5 a4 a3 a2 a1 psrlw mm4, 8 ;; word a8 a6 a4 a2 pmullw mm2, mm4 ;; word alpha*(q-p) for y8 y6 y4 y2 pand mm2, mm6 ;; word hi-byte alpha*(q-p) for y8 y6 y4 y2 ;; ;; movq mm0, mm1 ;; p8 p7 p6 p5 p4 p3 p2 p1 pand mm1, mm7 ;; word p7 p5 p3 p1 pand mm3, mm7 ;; word q7 q5 q3 q1 psubw mm3, mm1 ;; q-p pand mm5, mm7 ;; word a7 a5 a3 a1 pmullw mm3, mm5 psrlw mm3, 8 ;; alpha*(q-p) >> 8 for y7 y5 y3 y1 por mm2, mm3 ;; alpha*(q-p) >> 8 for y8 y7 y6 y5 y4 y3 y2 y1 paddb mm2, mm0 ;; blended y8 y7 y6 y5 y4 y3 y2 y1 movq [ebx], mm2 add esi, 8 add edi, 8 add ecx, 8 add ebx, 8 sub eax, 8 jg a100;;----------------------y_by_fours: pxor mm7, mm7 ;; set to zero for unpack add eax, 4 jle y_one_two_threea200: ;; yuva 4 y's movd mm0, [esi] ;; byte p4 p3 p2 p1 punpcklbw mm0, mm7 ;; p4 p3 p2 p1 ;; src I420 4 y's movd mm2, [edi] ;; byte q4 q3 q2 q1 punpcklbw mm2, mm7 ;; q4 q3 q2 q1 psubw mm2, mm0 ;; q-p ;; yuva 4 a's movd mm4, [ecx] ;; byte a4 a3 a2 a1 punpcklbw mm4, mm7 ;; a4 a3 a2 a1 pmullw mm2, mm4 psrlw mm2, 8 ;; alpha*(q-p) >> 8 for y4 y3 y2 y1 paddb mm2, mm0 ;; blended word y4 y3 y2 y1 packuswb mm2, mm7 ;; blended byte y4 y3 y2 y1 ;; movd [ebx], mm2 add esi, 4 add edi, 4 add ecx, 4 add ebx, 4 sub eax, 4 ;; not needed if doing by_eights jg a200 ;; not needed if doing by_eights;;;;;;----------------------;; do one, two, or three odd bytesy_one_two_three: add eax, 3 ;; number bytes to do jle line_done mov edx, 4 sub edx, eax ;; 4-eax = numbers bytes offset ;; movd mm6, edx ;; number bytes to shift psllq mm6, 3 ;; number bits to shift ;; ;; pointer adjustment, negative offset neg edx ;;;; ;; yuva 1-3 y's movd mm0, [esi] ;; byte xx p3 p2 p1 punpcklbw mm0, mm7 ;; xx p3 p2 p1 ;; src I420 1-3 y's movd mm2, [edi] ;; byte xx q3 q2 q1 punpcklbw mm2, mm7 ;; xx p3 p2 p1 psubw mm2, mm0 ;; q-p ;; yuva 1-3 a's movd mm4, [ecx+edx] ;; byte a3 a2 a1 xx psrlq mm4, mm6 ;; byte xx a3 a2 a1 punpcklbw mm4, mm7 ;; xx a3 a2 a1 pmullw mm2, mm4 psrlw mm2, 8 ;; alpha*(q-p) >> 8 for xx y3 y2 y1 paddb mm2, mm0 ;; blended word xx y3 y2 y1 packuswb mm2, mm7 ;; blended byte xx y3 y2 y1 ;; movd dtmp0, mm2 ;; store result, byte by byte ;; eax = bytes to doa300: mov dl, btmp0(eax-1) mov [ebx+eax-1], dl dec eax jg a300;;------line_done: mov eax, yuva_pitch mov esi, yuvay add esi, eax mov yuvay, esi mov ecx, yuvaa add ecx, eax mov yuvaa, ecx mov eax, src_pitch mov edi, sy add edi, eax mov sy, edi mov eax, dst_pitch mov ebx, dy add ebx, eax mov dy, ebx dec ebp jg near y100 ;; line loop;;-----------------------------------------------;;-----------------------------------------------;; do u and v;; mov esi, yuvau mov ecx, yuvaauv add ecx, yuva_pitch ;; use a's from second line mov yuvaauv, ecx ;; save for uv line loop mov edi, su mov ebx, du mov ebp, height ;; uv lines loop counter shr ebp, 1 jle near exituv100: mov eax, width shr eax, 1 ;; u v width = width/2 sub eax, 3 jle near uv_one_two_threeb200: ;; yuva 4 u's movd mm0, [esi] ;; byte pu4 pu3 pu2 pu1 punpcklbw mm0, mm7 ;; pu4 pu3 pu2 pu1 ;; src I420 4 u's movd mm2, [edi] ;; byte qu4 qu3 qu2 qu1 punpcklbw mm2, mm7 ;; qu4 qu3 qu2 qu1 psubw mm2, mm0 ;; q-p ;; yuva 8 a's ;; We need to average the alpha values by 2's. In the C version ;; In the C version we do the average by the whole 2x2 block. ;;movq mm4, [ecx] ;; byte a8 a7 a6 a5 a4 a3 a2 a1 ;;psrlw mm4, 8 ;; word a8 a6 a4 a2 movq mm4, [ecx] movq mm7, mm4 ;; mm7=mm4= a8 a7 a6 a5 a4 a3 a2 a1 psllw mm7, 8 ;; mm7 = a7 00 a5 00 a3 00 a1 00 psrlw mm4, 8 ;; mm4 = 00 a8 00 a6 00 a4 00 a2 psrlw mm7, 8 ;; mm7 = 00 a7 00 a5 00 a3 00 a1 paddw mm4, mm7 ;; mm4 = (a8+a7) (a6+a5) (a4+a3) (a2+a1) pxor mm7, mm7 ;; mm7 = 0...0 psrlw mm4, 1 ;; mm4 = (a8+a7)/2 (a6+a5)/2 (a4+a3)/2 (a2+a1)/2 pmullw mm2, mm4 psrlw mm2, 8 ;; alpha*(q-p) >> 8 for u4 u3 u2 u1 paddb mm2, mm0 ;; byte add -> blended word u4 u3 u2 u1 packuswb mm2, mm7 ;; blended byte u4 u3 u2 u1 mov edx, yuvaoffsetv ;; yuva 4 v's movd mm0, [esi+edx] ;; byte pv4 pv3 pv2 pv1 punpcklbw mm0, mm7 ;; pv4 pv3 pv2 pv1 mov edx, soffsetv ;; src I420 4 v's movd mm3, [edi+edx] ;; byte qv4 qv3 qv2 qv1 punpcklbw mm3, mm7 ;; qv4 qv3 qv2 qv1 psubw mm3, mm0 ;; q-p pmullw mm3, mm4 psrlw mm3, 8 ;; alpha*(q-p) >> 8 for v4 v3 v2 v1 paddb mm3, mm0 ;; blended word v4 v3 v2 v1 packuswb mm3, mm7 ;; blended byte v4 v3 v2 v1 movd [ebx], mm2 ;; postpone store for src buf = dest buf mov edx, doffsetv movd [ebx+edx], mm3 add esi, 4 add edi, 4 add ecx, 8 add ebx, 4 sub eax, 4 jg near b200;;----------------------;; do one, two, or three odd bytes;; reads are unaligneduv_one_two_three: add eax, 3 jle near uv_line_done mov edx, 4 sub edx, eax ;; 4-eax = numbers bytes offset ;; movd mm6, edx ;; number bytes to shift psllq mm6, 3 ;; number bits to shift ;; ;; pointer adjustment, negative offset neg edx add esi, edx add edi, edx lea ecx, [ecx+edx*2];; ;; yuva 1-3 u's movd mm0, [esi] ;; byte pu3 pu2 pu1 xxx punpcklbw mm0, mm7 ;; pu3 pu2 pu1 xxx ;; src I420 1-3 u's movd mm2, [edi] ;; byte qu3 qu2 qu1 xxx punpcklbw mm2, mm7 ;; qu3 qu2 qu1 xxx psubw mm2, mm0 ;; q-p ;; yuva 1-3 a's movq mm4, [ecx] ;; byte a6 a5 a4 a3 a2 a1 xx xx psrlw mm4, 8 ;; word a6 a4 a2 xx pmullw mm2, mm4 psrlw mm2, 8 ;; alpha*(q-p) >> 8 for u3 u2 u1 xx paddb mm2, mm0 ;; blended word u3 u2 u1 xx packuswb mm2, mm7 ;; blended byte u3 u2 u1 xx movd dtmp0, mm2 mov edx, yuvaoffsetv ;; yuva 2 v's movd mm0, [esi+edx] ;; byte pv3 pv2 pv1 xxx punpcklbw mm0, mm7 ;; pv3 pv2 pv1 xxx mov edx, soffsetv ;; src I420 1-3 u's movd mm2, [edi+edx] ;; byte qv3 qv2 qv1 xx punpcklbw mm2, mm7 ;; qv3 qv2 qv1 xx psubw mm2, mm0 ;; q-p pmullw mm2, mm4 psrlw mm2, 8 ;; alpha*(q-p) >> 8 for v3 v2 v1 xx paddb mm2, mm0 ;; blended word v3 v2 v1 xx packuswb mm2, mm7 ;; blended byte v3 v2 v1 xx movd dtmp1, mm2 mov edx, doffsetv ;; store result, byte by byte ;; eax = bytes to do neg eaxb300: mov cl, btmp0(4+eax) ;; u ;; ecx trashed mov ch, btmp1(4+eax) ;; v mov [ebx], cl mov [ebx+edx], ch inc ebx inc eax jl b300;;------uv_line_done: mov eax, yuvauvpitch mov esi, yuvau add esi, eax mov yuvau, esi mov eax, yuva_pitch mov ecx, yuvaauv lea ecx, [ecx + eax*2] ;; down two lines mov yuvaauv, ecx ;; save for uv line loop mov eax, suvpitch mov edi, su add edi, eax mov su, edi mov eax, duvpitch mov ebx, du add ebx, eax mov du, ebx dec ebp jg near uv100 ;; uv line loop;;------exit: xor eax, eax ;; return successexit2: add esp, ntmps*4 pop ebx pop ecx pop edi pop esi pop ebp emms retfail_exit: mov eax, -1 ;; signal fail jmp exit2;_I420andYUVAtoI420_MMX endp;====================================;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; I420andYUVAtoYUY2;;;; This function alpha-blends two I420 buffers into a third;; YUY2 buffer using the alpha info tacked to the ;; end of the second I420 buffer;;;; yuva = top;; inverted alpha;; uv size computed as: uvpitch*uvlines = (pitch/2)*(lines/2);;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;_I420andYUVAtoYUY2_MMX:;;;; int I420andYUVAtoYUY2_MMX(;; unsigned char* src, int src_pels, int src_lines, int src_pitch,;; int src_startx, int src_starty,;; unsigned char* yuva, int yuva_pels, int yuva_lines, int yuva_pitch,;; int yuva_startx, int yuva_starty,;; unsigned char* dst, int dst_pels, int dst_lines, int dst_pitch,;; int dst_startx, int dst_starty,;; int width, int height);;;make_labels _I420andYUVAtoYUY2_MMX;;;; arguments%define src dword [esp+4*(1+npush)]%define src_pels dword [esp+4*(2+npush)]%define src_lines dword [esp+4*(3+npush)]%define src_pitch dword [esp+4*(4+npush)]%define src_startx dword [esp+4*(5+npush)]%define src_starty dword [esp+4*(6+npush)]%define yuva dword [esp+4*(7+npush)]%define yuva_pels dword [esp+4*(8+npush)]%define yuva_lines dword [esp+4*(9+npush)]%define yuva_pitch dword [esp+4*(10+npush)]%define yuva_startx dword [esp+4*(11+npush)]%define yuva_starty dword [esp+4*(12+npush)]%define dst dword [esp+4*(13+npush)]%define dst_pels dword [esp+4*(14+npush)]%define dst_lines dword [esp+4*(15+npush)]%define dst_pitch dword [esp+4*(16+npush)]%define dst_startx dword [esp+4*(17+npush)]%define dst_starty dword [esp+4*(18+npush)]%define width dword [esp+4*(19+npush)]%define height dword [esp+4*(20+npush)] push ebp push esi push edi
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -