📄 yuvammx.asm
字号:
punpcklbw mm4, mm7 ;; word a2 a1 pmullw mm2, mm4 ;; word alpha*(q-p) for y2 y1 psrlw mm2, 8 ;; byte alpha*(q-p) for 0 y2 0 y1 paddb mm2, mm0 ;; blended for 0 y2 0 y1 pmaddwd mm4, mm6 ;; line 1 dword a2+a1 ;; ;; mm2 = line 1 blended for 0 y2 0 y1 ;; mm4 = line 1 ave(a2,a1) ;; ;; ------ second line --------- mov ebp, negyuvapitch sub esi, ebp ;; point to line 2 ;; yuva 2 y's movd mm1, [esi] ;; p2 p1 punpcklbw mm1, mm7 ;; word p2 p1 ;; src I420 2 y's movd mm3, [edi+eax] ;; q2 q1 punpcklbw mm3, mm7 ;; word q2 q1 psubw mm3, mm1 ;; q-p ;; yuva 2 a's ;;movd mm5, [esi+ecx] ;; a2 a1 movzx eax, word [esi+ecx] ;; prevent possible access vio movd mm5, eax ;; a2 a1 punpcklbw mm5, mm7 ;; word a2 a1 lea esi, [esi+ebp] ;; back to line 1, no inc ;;add edi, 4 ;; inc sy pmullw mm3, mm5 ;; word alpha*(q-p) for y2 y1 psrlw mm3, 8 ;; byte alpha*(q-p) for 0 y2 0 y1 paddb mm3, mm1 ;; blended for 0 y2 0 y1 pmaddwd mm5, mm6 ;; line 2 dword a2+a1 ;; packssdw mm4, mm5 ;; for uv sum xx a21 xx a11 psrlw mm4, 1 ;; for uv ave xx a21 xx a11 ;; ;; mm2 = line 1 blended for 0 y2 0 y1 ;; mm3 = line 2 blended for 0 y2 0 y1 ;; mm4 = ave = a21 a11 ;; ;; ----- U V -------- mov ecx, yuvaoffsetv mov eax, soffsetv ;; --U-- ;;movd mm0, [ebx] ;; x x x pu1 ;;punpcklwd mm0, mm0 ;; byte x pu1 x pu1 ;;punpcklbw mm0, mm7 ;; word x pu1 x pu1 movzx ebp, byte [ebx] movd mm0, ebp ;; 0 0 0 pu1 punpckldq mm0, mm0 ;; word 0 pu1 0 pu1 ;;movd mm1, [edx] ;; x x qu2 qu1 ;;punpcklwd mm1, mm1 ;; byte qu2 qu1 qu2 qu1 ;;punpcklbw mm1, mm7 ;; word qu2 qu1 qu2 qu1 movzx ebp, byte [edx] movd mm1, ebp ;; 0 0 0 qu1 punpckldq mm1, mm1 ;; word 0 qu1 0 qu1 psubw mm1, mm0 ;; qu - pu pmullw mm1, mm4 ;; alpha*(qu-pu) psrlw mm1, 8 paddb mm1, mm0 ;; line 2 line 1 blended u2 u1 u2 u1 ;; --V-- ;;movd mm0, [ebx+ecx] ;; x x pv2 pv1 movzx ebp, byte [ebx+ecx] ;; prevent possible access vio movd mm0, ebp ;; 0 0 0 pv1 punpckldq mm0, mm0 ;; word 0 pv1 0 pv1 ;;movd mm5, [edx+eax] ;; qv1 movzx eax, byte [edx+eax] ;; prevent possible access vio movd mm5, eax ;; 0 0 0 qv1 punpckldq mm5, mm5 ;; word 0 qv1 0 qv1 psubw mm5, mm0 ;; qv - pv ;;add ebx, 2 ;; inc yuvau address ;;add edx, 2 ;; inc su address pmullw mm5, mm4 ;; alpha*(qv-pv) psrlw mm5, 8 paddb mm5, mm0 ;; line 2 line 1 blended v2 v1 v2 v1 ;; mm2 = line 1 blended for 0 y2 0 y1 ;; mm3 = line 2 blended for 0 y2 0 y1 ;; mm1 = line 2 line 1 blended 0 u2 0 u1 ;; mm5 = line 2 line 1 blended 0 v2 0 v1 ;; mov ecx, dy mov eax, dst_pitch packuswb mm2, mm3 ;; x x y22 y21 x x y12 y11 psllq mm5, 8 ;; v2 0 v1 0 v2 0 v1 0 por mm1, mm5 ;; v22 u22 v21 u21 v12 u12 v11 u11 movq mm3, mm1 punpcklbw mm1, mm2 ;; v11 y12 u11 y11 movd [ecx], mm1 ;; line 1 result punpckhbw mm3, mm2 ;; v21 y22 u21 y21 movd [ecx+eax], mm3 ;; line 2 result;;-----------;; line loopline_done: mov eax, yuvayinc ;; move down two lines add esi, eax mov eax, yuvauvinc add ebx, eax mov eax, syinc add edi, eax mov eax, suvinc add edx, eax mov eax, dyinc mov ecx, dy add ecx, eax mov dy, ecx mov eax, height sub eax, 2 mov height, eax jg near y100;;-----------exit: xor eax, eax ;; return success add esp, ntmps*4 pop ebx pop ecx pop edi pop esi pop ebp emms ret;_I420andYUVAtoUYVY_MMX endp;====================================;====================================;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;int I420andI420toI420 (;; unsigned char *src1_ptr, int src1_pels, int src1_lines, int src1_pitch,;; unsigned char *src2_ptr, int src2_pels, int src2_lines, int src2_pitch,;; unsigned char *dest_ptr, int dest_pels, int dest_lines, int dest_pitch,;; int alpha);;;;;; src2 = top;; inverted alpha;; uv size computed as: uvpitch*uvlines = (pitch/2)*(lines/2);;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;_I420andI420toI420_MMX_sub:;;;; int I420andI420toI420_MMX_sub(;; unsigned char *src1_ptr, int src1_pels, int src1_lines, int src1_pitch,;; int src1_startx, int src1_starty,;; unsigned char *src2_ptr, int src2_pels, int src2_lines, int src2_pitch,;; int src2_startx, int src2_starty,;; unsigned char *dest_ptr, int dest_pels, int dest_lines, int dest_pitch,;; int dest_startx, int dest_starty,;; int width, int height, int alpha );;;make_labels _I420andI420toI420_MMX_sub;;;; arguments%define src dword [esp+4*(1+npush)]%define src_pels dword [esp+4*(2+npush)]%define src_lines dword [esp+4*(3+npush)]%define src_pitch dword [esp+4*(4+npush)]%define src_startx dword [esp+4*(5+npush)]%define src_starty dword [esp+4*(6+npush)]%define top dword [esp+4*(7+npush)]%define top_pels dword [esp+4*(8+npush)]%define top_lines dword [esp+4*(9+npush)]%define top_pitch dword [esp+4*(10+npush)]%define top_startx dword [esp+4*(11+npush)]%define top_starty dword [esp+4*(12+npush)]%define dst dword [esp+4*(13+npush)]%define dst_pels dword [esp+4*(14+npush)]%define dst_lines dword [esp+4*(15+npush)]%define dst_pitch dword [esp+4*(16+npush)]%define dst_startx dword [esp+4*(17+npush)]%define dst_starty dword [esp+4*(18+npush)]%define width dword [esp+4*(19+npush)]%define height dword [esp+4*(20+npush)]%define alpha dword [esp+4*(21+npush)] push ebp push esi push edi push ecx push ebx;; tmp on stack%assign ntmps 16%assign npush (5+ntmps) sub esp, ntmps*4 ;; cycle through pointers to y then u the v indexed by ecx;; structure below must agree%define topptr dword [esp + 0*4 + ecx*4] ;; topy[ecx*4]%define sptr dword [esp + 4*4 + ecx*4] ;; sy[ecx*4]%define dptr dword [esp + 8*4 + ecx*4] ;; dy[ecx*4];; %define topy dword [esp + 0*4]%define topu dword [esp + 1*4]%define topv dword [esp + 2*4]%define toppitch dword [esp + 3*4]%define sy dword [esp + 4*4]%define su dword [esp + 5*4]%define sv dword [esp + 6*4]%define spitch dword [esp + 7*4]%define dy dword [esp + 8*4]%define du dword [esp + 9*4]%define dv dword [esp + 10*4]%define dpitch dword [esp + 11*4]%define lines dword [esp + 12*4]%define pels dword [esp + 13*4]%define dtmp0 dword [esp + 14*4]%define dtmp1 dword [esp + 15*4]%define btmp0(x) byte [esp + 14*4 + x]%define btmp1(x) byte [esp + 15*4 + x];;----- mov eax, dst_lines mov ecx, dst_pitch mov dpitch, ecx mov edx, eax imul eax, ecx ;; pitch*lines mov esi, dst mov dy, esi add esi, eax mov du, esi shr edx, 1 ;; lines/2 = uv lines shr ecx, 1 ;; pitch/2 = uvpitch imul edx, ecx ;; add esi, edx mov dv, esi;;----- mov eax, src_lines mov ecx, src_pitch mov spitch, ecx mov edx, eax imul eax, ecx ;; pitch*lines mov esi, src mov sy, esi add esi, eax mov su, esi shr edx, 1 ;; lines/2 = uv lines shr ecx, 1 ;; pitch/2 = uvpitch imul edx, ecx ;; add esi, edx mov sv, esi;;----- mov eax, top_lines mov ecx, top_pitch mov toppitch, ecx mov edx, eax imul eax, ecx ;; pitch*lines mov esi, top mov topy, esi add esi, eax mov topu, esi shr edx, 1 ;; lines/2 = uv lines shr ecx, 1 ;; pitch/2 = uvpitch imul edx, ecx ;; add esi, edx mov topv, esi;;----- mov eax, height mov lines, eax mov eax, width mov pels, eax;;------------------;;------------------;; pointer adjustment to (x,y) mov ecx, top_pitch mov eax, top_starty mov edx, eax mov ebx, top_startx imul eax, ecx ;; y*pitch mov esi, topy ;; add esi, eax ;; sy + y*pitch add esi, ebx ;; sy + y*pitch + x mov topy, esi shr ecx, 1 ;; pitch/2 shr edx, 1 ;; y/2 imul edx, ecx ;; (y/2)*(pitch/2) shr ebx, 1 ;; x/2 add edx, ebx ;; (y/2)*(pitch/2) + x/2 mov esi, topu ;; add esi, edx ;; su + (y/2)*(pitch/2) mov topu, esi mov esi, topv ;; add esi, edx ;; su + (y/2)*(pitch/2) mov topv, esi;;;; pointer adjustment to (x,y) mov ecx, src_pitch mov eax, src_starty mov edx, eax mov ebx, src_startx imul eax, ecx ;; y*pitch mov esi, sy ;; add esi, eax ;; sy + y*pitch add esi, ebx ;; sy + y*pitch + x mov sy, esi shr ecx, 1 ;; pitch/2 shr edx, 1 ;; y/2 imul edx, ecx ;; (y/2)*(pitch/2) shr ebx, 1 ;; x/2 add edx, ebx ;; (y/2)*(pitch/2) + x/2 mov esi, su ;; add esi, edx ;; su + (y/2)*(pitch/2) mov su, esi mov esi, sv ;; add esi, edx ;; su + (y/2)*(pitch/2) mov sv, esi;;;;; pointer adjustment to (x,y) mov ecx, dst_pitch mov eax, dst_starty mov edx, eax mov ebx, dst_startx imul eax, ecx ;; y*pitch mov esi, dy ;; add esi, eax ;; sy + y*pitch add esi, ebx ;; sy + y*pitch + x mov dy, esi shr ecx, 1 ;; pitch/2 shr edx, 1 ;; y/2 imul edx, ecx ;; (y/2)*(pitch/2) shr ebx, 1 ;; x/2 add edx, ebx ;; (y/2)*(pitch/2) + x/2 mov esi, du ;; add esi, edx ;; su + (y/2)*(pitch/2) mov du, esi mov esi, dv ;; add esi, edx ;; su + (y/2)*(pitch/2) mov dv, esi;;-----;; I420 top P = top;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3];; I420 src Q = bot;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3];; I420 dst;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3] xor ecx, ecx ;; y then u then v counter;; load alpha movzx eax, byte alpha movd mm4, eax punpcklwd mm4, mm4 ;; 0 0 a a punpckldq mm4, mm4 ; a a a a ;; movq mm7, [mask3] movq mm6, [mask3b] pxor mm5, mm5;;-------yuv_loop: mov esi, topptr mov edi, sptr mov ebx, dptr mov ebp, lines ;; y lines loop counter or ebp, ebp jle near exity100: mov eax, pels sub eax, 7 jle y_by_foursa100: ;; mm4 = alpha ;; top 8 y's movq mm0, [esi] ;; p8 p7 p6 p5 p4 p3 p2 p1 movq mm1, mm0 ;; p8 p7 p6 p5 p4 p3 p2 p1 psrlw mm0, 8 ;; word p8 p6 p4 p2 ;; src I420 8 y's movq mm2, [edi] ;; q8 q7 q6 q5 q4 q3 q2 q1 movq mm3, mm2 ;; q8 q7 q6 q5 q4 q3 q2 q1 psrlw mm2, 8 ;; word q8 q6 q4 q2 psubw mm2, mm0 ;; q-p pmullw mm2, mm4 ;; word alpha*(q-p) for y8 y6 y4 y2 pand mm2, mm6 ;; word hi-byte alpha*(q-p) for y8 y6 y4 y2 ;; ;; movq mm0, mm1 ;; p8 p7 p6 p5 p4 p3 p2 p1 pand mm1, mm7 ;; word
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -