📄 yuvammx.asm
字号:
;; ---------------------------------------------------------------------------
;; NOTE(review): this excerpt opens in the middle of the preceding routine
;; (closed below by the ";_I420andYUVAtoYUY2_MMX endp" comment).  Only its
;; last two store instructions, its per-line-pair pointer update and its
;; epilogue are visible here; the names it uses (yuvayinc, syinc, dy, ...)
;; are presumably that routine's own %defines -- confirm against the full file.
;; ---------------------------------------------------------------------------
        punpckhbw mm3, mm1              ;; v21 y22 u21 y21
        movd    [ecx+eax], mm3          ;; line 2 result

;;-----------
;; line loop
line_done:
;;
;;
        ;; Advance every running pointer by its per-line-pair increment.
        mov     eax, yuvayinc           ;; move down two lines
        add     esi, eax
        mov     eax, yuvauvinc
        add     ebx, eax
        mov     eax, syinc
        add     edi, eax
        mov     eax, suvinc
        add     edx, eax
        mov     eax, dyinc
        mov     ecx, dy
        add     ecx, eax
        mov     dy, ecx

        ;; Two lines were consumed; loop while the remaining count is > 0.
        mov     eax, height
        sub     eax, 2
        mov     height, eax
        jg near y100

;;-----------
exit:
        xor     eax, eax                ;; return success
        add     esp, ntmps*4            ;; release the stack temporaries
        pop     ebx                     ;; restore registers in reverse push order
        pop     ecx
        pop     edi
        pop     esi
        pop     ebp
        emms                            ;; leave MMX state before returning
        ret

;_I420andYUVAtoYUY2_MMX endp
;====================================
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;;  I420andYUVAtoUYVY
;;
;;  This function alpha-blends two I420 buffers into a third
;;  UYVY buffer using the alpha info tacked to the
;;  end of the second I420 buffer
;;
;;  yuva = top
;;  inverted alpha
;;  uv size computed as: uvpitch*uvlines = (pitch/2)*(lines/2)
;;
;;  Per byte the visible code computes  p + ((alpha*(q - p)) >> 8)
;;  with p taken from yuva and q taken from src.
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;_I420andYUVAtoUYVY_MMX:
;; NOTE(review): in the collapsed source the label text above directly follows
;; a run of semicolons, so it is kept exactly as it appears there (comment
;; text).  Confirm against a pristine copy whether it was a live label; the
;; make_labels invocation below is given the same name.
;;
;; int I420andYUVAtoUYVY_MMX(
;;      unsigned char* src,  int src_pels,  int src_lines,  int src_pitch,
;;                           int src_startx, int src_starty,
;;      unsigned char* yuva, int yuva_pels, int yuva_lines, int yuva_pitch,
;;                           int yuva_startx, int yuva_starty,
;;      unsigned char* dst,  int dst_pels,  int dst_lines,  int dst_pitch,
;;                           int dst_startx, int dst_starty,
;;      int width,  int height);
;;
;;
make_labels _I420andYUVAtoUYVY_MMX      ;; macro defined outside this excerpt

;; arguments
;; The k-th argument is read from [esp + 4*(k+npush)].  npush is %assign'ed
;; further down (5 pushed registers + ntmps temporaries); a %define body is
;; expanded where the name is used, so the later assignment is the one seen.
;; The extra "+1" dword is presumably the return address -- confirm.
%define src             dword [esp+4*(1+npush)]
%define src_pels        dword [esp+4*(2+npush)]
%define src_lines       dword [esp+4*(3+npush)]
%define src_pitch       dword [esp+4*(4+npush)]
%define src_startx      dword [esp+4*(5+npush)]
%define src_starty      dword [esp+4*(6+npush)]
%define yuva            dword [esp+4*(7+npush)]
%define yuva_pels       dword [esp+4*(8+npush)]
%define yuva_lines      dword [esp+4*(9+npush)]
%define yuva_pitch      dword [esp+4*(10+npush)]
%define yuva_startx     dword [esp+4*(11+npush)]
%define yuva_starty     dword [esp+4*(12+npush)]
%define dst             dword [esp+4*(13+npush)]
%define dst_pels        dword [esp+4*(14+npush)]
%define dst_lines       dword [esp+4*(15+npush)]
%define dst_pitch       dword [esp+4*(16+npush)]
%define dst_startx      dword [esp+4*(17+npush)]
%define dst_starty      dword [esp+4*(18+npush)]
%define width           dword [esp+4*(19+npush)]
%define height          dword [esp+4*(20+npush)]

        ;; Save the five registers used below as pointers / scratch.
        push    ebp
        push    esi
        push    edi
        push    ecx
        push    ebx

;; tmp on stack
%assign ntmps 15
%assign npush (5+ntmps)
        sub     esp, ntmps*4            ;; reserve ntmps dwords of temporaries

%define yuvay           dword [esp + 0*4]       ;; running yuva Y pointer
%define yuvau           dword [esp + 1*4]       ;; running yuva U pointer
%define yuvaoffsetv     dword [esp + 2*4]       ;; yuva U -> V byte offset
%define yuvaoffseta     dword [esp + 3*4]       ;; yuva Y -> A byte offset
%define negyuvapitch    dword [esp + 4*4]       ;; -yuva_pitch
%define yuvayinc        dword [esp + 5*4]
%define yuvauvinc       dword [esp + 6*4]
%define sy              dword [esp + 7*4]       ;; running src Y pointer
%define su              dword [esp + 8*4]       ;; running src U pointer
%define soffsetv        dword [esp + 9*4]       ;; src U -> V byte offset
%define syinc           dword [esp + 10*4]
%define suvinc          dword [esp + 11*4]
%define dy              dword [esp + 12*4]      ;; running dst pointer
%define width0          dword [esp + 13*4]      ;; pel-loop counter
%define dyinc           dword [esp + 14*4]

        ;; edi and ebp hold the truncated width (and half of it) for the
        ;; whole setup section; both registers are reused inside the loops.
        mov     edi, width
        and     edi, -4                 ;; pels truncated to multiple of 4 (width & ~ 3)
        mov     ebp, edi
        shr     ebp, 1                  ;; (width & ~ 3)/2

;;----- dst: base pointer and per-line-pair increment
        mov     eax, dst
        mov     dy, eax
        mov     ecx, dst_pitch
        add     ecx, ecx
        sub     ecx, edi
        sub     ecx, edi
        mov     dyinc, ecx              ;; inc = 2*pitch - 2*(width & ~ 3)

;;----- src: plane pointers, U->V offset, increments
        mov     eax, src_lines
        mov     ecx, src_pitch
        mov     edx, eax
        imul    eax, ecx                ;; pitch*lines
        mov     esi, src
        mov     sy, esi
        add     esi, eax                ;; U plane starts pitch*lines bytes after Y
        mov     su, esi
        shr     edx, 1                  ;; lines/2
        shr     ecx, 1                  ;; pitch/2
        imul    edx, ecx                ;; (pitch/2)*(lines/2)
        mov     soffsetv, edx
        sub     ecx, ebp                ;; pitch/2 - width/2
        mov     suvinc, ecx
        mov     ecx, src_pitch
        add     ecx, ecx
        sub     ecx, edi
        mov     syinc, ecx              ;; inc = 2*pitch - (width & ~ 3)

;;----- yuva: plane pointers, U->V and Y->A offsets, increments
        mov     eax, yuva_lines
        mov     ecx, yuva_pitch
        mov     edx, ecx
        neg     edx
        mov     negyuvapitch, edx       ;; -pitch, used to step down/up one line
        mov     edx, eax                ;; lines
        imul    eax, ecx                ;; pitch*lines
        mov     esi, yuva
        mov     yuvay, esi
        add     esi, eax
        mov     yuvau, esi
        shr     ecx, 1                  ;; pitch/2
        shr     edx, 1                  ;; lines/2
        imul    edx, ecx                ;; (lines/2)*(pitch/2)
        mov     yuvaoffsetv, edx
        lea     edx, [eax+edx*2]        ;; pitch*lines + 2*(lines/2)*(pitch/2)
        mov     yuvaoffseta, edx
        sub     ecx, ebp                ;; pitch/2 - width/2
        mov     yuvauvinc, ecx
        mov     ecx, yuva_pitch
        add     ecx, ecx
        sub     ecx, edi
        mov     yuvayinc, ecx           ;; inc = 2*pitch - (width & ~ 3)

;;-----
;;------------------
;; pointer adjustment to (x,y)  -- src
        mov     ecx, src_pitch
        mov     eax, src_starty
        mov     edx, eax
        mov     ebx, src_startx
        imul    eax, ecx                ;; y*pitch
        mov     esi, sy                 ;;
        add     esi, eax                ;; sy + y*pitch
        add     esi, ebx                ;; sy + y*pitch + x
        mov     sy, esi
        shr     ecx, 1                  ;; pitch/2
        shr     edx, 1                  ;; y/2
        imul    edx, ecx                ;; (y/2)*(pitch/2)
        shr     ebx, 1                  ;; x/2
        mov     esi, su                 ;;
        add     esi, edx                ;; su + (y/2)*(pitch/2)
        add     esi, ebx                ;; su + (y/2)*(pitch/2) + x/2
        mov     su, esi

;;
;; pointer adjustment to (x,y)  -- yuva
        mov     ecx, yuva_pitch
        mov     eax, yuva_starty
        mov     edx, eax
        mov     ebx, yuva_startx
        imul    eax, ecx                ;; y*pitch
        add     eax, ebx                ;; y*pitch + x
        mov     esi, yuvay              ;;
        add     esi, eax                ;; yuvay + y*pitch + x
        mov     yuvay, esi
        shr     ecx, 1                  ;; pitch/2
        shr     edx, 1                  ;; y/2
        imul    edx, ecx                ;; (y/2)*(pitch/2)
        shr     ebx, 1                  ;; x/2
        mov     esi, yuvau              ;;
        add     esi, edx                ;; yuvau + (y/2)*(pitch/2)
        add     esi, ebx                ;; yuvau + (y/2)*(pitch/2) + x/2
        mov     yuvau, esi

;; pointer adjustment to (x,y)  -- dst (two bytes per pel)
        mov     ecx, dst_pitch
        mov     eax, dst_starty
        mov     ebx, dst_startx
        imul    eax, ecx                ;; y*pitch
        add     ebx, ebx                ;; 2*x
        mov     esi, dy                 ;;
        add     esi, eax                ;; dy + y*pitch
        add     esi, ebx                ;; dy + y*pitch + 2*x
        mov     dy, esi

;;-----
;;  yuva  P = top
;;      [Y01Y02][Y03Y04]
;;      [Y11Y12][Y13Y14]
;;      [U1]    [U3]
;;      [V1]    [V3]
;;      [A01A02][A03A04]
;;      [A11A12][A13A14]
;;  I420 src  Q = bot
;;      [Y01Y02][Y03Y04]
;;      [Y11Y12][Y13Y14]
;;      [U1]    [U3]
;;      [V1]    [V3]
;;  UYVY dst
;;      [UYVY][UYVY]            ;; byte order [U][Y0][V][Y1]
;;      [UYVY][UYVY]
;;-------
        pxor    mm7, mm7                ;; mm7 = 0, used to widen bytes to words
        movq    mm6, [con1]             ;; con1 is defined outside this excerpt;
                                        ;; presumably four words of 1 so that
                                        ;; pmaddwd adds adjacent words -- confirm

        ;; set up line loop
        mov     eax, height
        dec     eax
        mov     height, eax
        jle near exit                   ;; nothing to do when height <= 1

        ;; Register roles while in the loops:
        ;;   esi = yuva Y, ebx = yuva U, edi = src Y, edx = src U
        ;;   eax / ecx / ebp are reloaded as scratch inside the loop body.
        mov     esi, yuvay
        mov     ebx, yuvau
        mov     edi, sy
        mov     edx, su

y100:                                   ;; line loop
        mov     eax, width
        sub     eax, 3
        mov     width0, eax
        jle near two_pels               ;; fewer than 4 pels: skip the 4-pel loop

a100:                                   ;; do four pels per iteration
        ;;
        mov     ecx, yuvaoffseta        ;; ecx = Y -> A offset for this section
        mov     eax, src_pitch          ;; eax = src pitch (to reach line 2)

        ;; ------- first line --------
        ;; yuva 4 y's
        movd    mm0, [esi]              ;; p4 p3 p2 p1
        punpcklbw mm0, mm7              ;; word p4 p3 p2 p1
        ;; src I420 4 y's
        ;; NOTE(review): movq loads 8 bytes here although only the low 4 are
        ;; consumed by the punpcklbw below; the line-2 load uses movd.
        movq    mm2, [edi]              ;; q4 q3 q2 q1
        punpcklbw mm2, mm7              ;; word q4 q3 q2 q1
        psubw   mm2, mm0                ;; q-p
        ;; yuva 4 a's
        movd    mm4, [esi+ecx]          ;; a4 a3 a2 a1
        punpcklbw mm4, mm7              ;; word a4 a3 a2 a1

        pmullw  mm2, mm4                ;; word alpha*(q-p) for y4 y3 y2 y1
        psrlw   mm2, 8                  ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1
        paddb   mm2, mm0                ;; blended for 0 y4 0 y3 0 y2 0 y1

        pmaddwd mm4, mm6                ;; line 1 dword a4+a3 a2+a1
        ;;
        ;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
        ;; mm4 = line 1 dword sums a4+a3, a2+a1 (halved after the pack below)
        ;;
        ;; ------ second line ---------
        mov     ebp, negyuvapitch
        sub     esi, ebp                ;; point to line 2
        ;;add    esi, yuva_pitch
        ;;
        ;;add    edi, src_pitch
        ;; yuva 4 y's
        movd    mm1, [esi]              ;; p4 p3 p2 p1
        punpcklbw mm1, mm7              ;; word p4 p3 p2 p1
        ;; src I420 4 y's
        movd    mm3, [edi+eax]          ;; q4 q3 q2 q1
        punpcklbw mm3, mm7              ;; word q4 q3 q2 q1
        psubw   mm3, mm1                ;; q-p
        ;; yuva 4 a's
        movd    mm5, [esi+ecx]          ;; a4 a3 a2 a1
        punpcklbw mm5, mm7              ;; word a4 a3 a2 a1

        lea     esi, [esi+ebp+4]        ;; back to line 1 and inc
        add     edi, 4                  ;; inc sy

        pmullw  mm3, mm5                ;; word alpha*(q-p) for y4 y3 y2 y1
        psrlw   mm3, 8                  ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1
        paddb   mm3, mm1                ;; blended for 0 y4 0 y3 0 y2 0 y1

        pmaddwd mm5, mm6                ;; line 2 dword a4+a3 a2+a1
        ;;
        packssdw mm4, mm5               ;; for u v sum a22 a21 a12 a11
        psrlw   mm4, 1                  ;; for u v ave a22 a21 a12 a11
        ;;
        ;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
        ;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1
        ;; mm4 = ave for line 2 line 1 = a22 a21 a12 a11
        ;;
        ;; ----- U V --------
        ;;mov    ebx, yuvau
        ;;mov    edx, su
        mov     ecx, yuvaoffsetv        ;; ecx = yuva U -> V offset
        mov     eax, soffsetv           ;; eax = src  U -> V offset
        ;; --U--
        movd    mm0, [ebx]              ;; x x pu2 pu1
        punpcklwd mm0, mm0              ;; byte pu2 pu1 pu2 pu1
        punpcklbw mm0, mm7              ;; word pu2 pu1 pu2 pu1

        movd    mm1, [edx]              ;; x x qu2 qu1
        punpcklwd mm1, mm1              ;; byte qu2 qu1 qu2 qu1
        punpcklbw mm1, mm7              ;; word qu2 qu1 qu2 qu1

        psubw   mm1, mm0                ;; qu - pu
        pmullw  mm1, mm4                ;; alpha*(qu-pu)
        psrlw   mm1, 8
        paddb   mm1, mm0                ;; line 2 line 1 blended u2 u1 u2 u1
        ;; --V--
        movd    mm0, [ebx+ecx]          ;; x x pv2 pv1
        punpcklwd mm0, mm0              ;; byte pv2 pv1 pv2 pv1
        punpcklbw mm0, mm7              ;; word pv2 pv1 pv2 pv1

        ;;movd   mm5, [edx+eax]         ;; x x qv2 qv1
        ;; Read exactly two bytes of src V instead of four.
        movzx   eax, word [edx+eax]     ;; prevent possible access vio
        movd    mm5, eax                ;; 0 0 qv2 qv1
        punpcklwd mm5, mm5              ;; byte qv2 qv1 qv2 qv1
        punpcklbw mm5, mm7              ;; word qv2 qv1 qv2 qv1

        psubw   mm5, mm0                ;; qv - pv

        add     ebx, 2                  ;; inc yuvau address
        add     edx, 2                  ;; inc su address

        pmullw  mm5, mm4                ;; alpha*(qv-pv)
        psrlw   mm5, 8
        paddb   mm5, mm0                ;; line 2 line 1 blended v2 v1 v2 v1

        ;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1
        ;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1
        ;; mm1 = line 2 line 1 blended 0 u2 0 u1 0 u2 0 u1
        ;; mm5 = line 2 line 1 blended 0 v2 0 v1 0 v2 0 v1
        ;;
        mov     ecx, dy                 ;; ecx = dst line 1
        mov     eax, dst_pitch          ;; ecx+eax = dst line 2

        packuswb mm2, mm3               ;; blended y24 y23 y22 y21 y14 y13 y12 y11
        psllq   mm5, 8                  ;; v2 0 v1 0 v2 0 v1 0
        por     mm1, mm5                ;; v22 u22 v21 u21 v12 u12 v11 u11
        movq    mm3, mm1                ;; keep a copy for the line-2 interleave

        punpcklbw mm1, mm2              ;; y14 v12 y13 u12 y12 v11 y11 u11
        movq    [ecx], mm1              ;; line 1 result

        punpckhbw mm3, mm2              ;; y24 v22 y23 u22 y22 v21 y21 u21
        movq    [ecx+eax], mm3          ;; line 2 result

        add     ecx, 8                  ;; inc dy address
        mov     dy, ecx

        mov     eax, width0             ;; pel loop
        sub     eax, 4
        mov     width0, eax
        jg near a100

;;------------------------------
two_pels:                               ;; do two pels if any
        ;; remaining pels = eax+3
        ;; compute 2 pels if remaining pels = 2 or 3, 1 not computed
        add     eax, 2
        jle near line_done
        ;;
        ;;
        mov     ecx, yuvaoffseta
        mov     eax, src_pitch

        ;; ------- first line --------
        ;; yuva 2 y's
        movd    mm0, [esi]              ;; p2 p1
        punpcklbw mm0, mm7              ;; word p2 p1
        ;; src I420 2 y's
        movq    mm2, [edi]              ;; q2 q1
        punpcklbw mm2, mm7              ;; word q2 q1
        psubw   mm2, mm0                ;; q-p
        ;; yuva 2 a's
        movd    mm4, [esi+ecx]          ;; a2 a1
        ;; NOTE(review): the excerpt ends here, part-way through the two-pel
        ;; path; the remainder of this routine is not visible.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -