📄 yuvammx.asm
字号:
push ecx push ebx;; tmp on stack%assign ntmps 15%assign npush (5+ntmps) sub esp, ntmps*4 %define yuvay dword [esp + 0*4]%define yuvau dword [esp + 1*4]%define yuvaoffsetv dword [esp + 2*4]%define yuvaoffseta dword [esp + 3*4]%define negyuvapitch dword [esp + 4*4]%define yuvayinc dword [esp + 5*4]%define yuvauvinc dword [esp + 6*4]%define sy dword [esp + 7*4]%define su dword [esp + 8*4]%define soffsetv dword [esp + 9*4]%define syinc dword [esp + 10*4]%define suvinc dword [esp + 11*4]%define dy dword [esp + 12*4]%define width0 dword [esp + 13*4]%define dyinc dword [esp + 14*4];; mov edi, width and edi, -4 ;; width truncated to multiple of 4 (width & ~ 3) mov ebp, edi ;; truncated for address increment computation only shr ebp, 1 ;; (width & ~ 3)/2;;----- mov eax, dst mov dy, eax mov ecx, dst_pitch add ecx, ecx sub ecx, edi sub ecx, edi mov dyinc, ecx ;; inc = 2*pitch - 2*(width & ~ 3);;----- mov eax, src_lines mov ecx, src_pitch mov edx, eax imul eax, ecx ;; pitch*lines mov esi, src mov sy, esi add esi, eax mov su, esi shr edx, 1 ;; lines/2 shr ecx, 1 ;; pitch/2 imul edx, ecx ;; (pitch/2)*(lines*2) mov soffsetv, edx sub ecx, ebp ;; pitch/2 - width/2 mov suvinc, ecx mov ecx, src_pitch add ecx, ecx sub ecx, edi mov syinc, ecx ;; inc = 2*pitch - (width & ~ 3);;----- mov eax, yuva_lines mov ecx, yuva_pitch mov edx, ecx neg edx mov negyuvapitch, edx mov edx, eax ;; lines imul eax, ecx ;; pitch*lines mov esi, yuva mov yuvay, esi add esi, eax mov yuvau, esi shr ecx, 1 ;; pitch/2 shr edx, 1 ;; lines/2 imul edx, ecx ;; (lines/2)*(pitch/2) mov yuvaoffsetv, edx lea edx, [eax+edx*2] mov yuvaoffseta, edx sub ecx, ebp mov yuvauvinc, ecx mov ecx, yuva_pitch add ecx, ecx sub ecx, edi mov yuvayinc, ecx ;; inc = 2*pitch - (width & ~ 3);;-----;;------------------;; pointer adjustment to (x,y) mov ecx, src_pitch mov eax, src_starty mov edx, eax mov ebx, src_startx imul eax, ecx ;; y*pitch mov esi, sy ;; add esi, eax ;; sy + y*pitch add esi, ebx ;; sy + y*pitch + x mov sy, esi shr ecx, 1 ;; pitch/2 shr edx, 1 ;; y/2 imul edx, ecx ;; (y/2)*(pitch/2) shr ebx, 1 ;; x/2 mov esi, su ;; add esi, edx ;; su + (y/2)*(pitch/2) add esi, ebx ;; su + (y/2)*(pitch/2) + x/2 mov su, esi;;;; pointer adjustment to (x,y) mov ecx, yuva_pitch mov eax, yuva_starty mov edx, eax mov ebx, yuva_startx imul eax, ecx ;; y*pitch add eax, ebx ;; y*pitch + x mov esi, yuvay ;; add esi, eax ;; yuvay + y*pitch + x mov yuvay, esi shr ecx, 1 ;; pitch/2 shr edx, 1 ;; y/2 imul edx, ecx ;; (y/2)*(pitch/2) shr ebx, 1 ;; x/2 mov esi, yuvau ;; add esi, edx ;; yuvau + (y/2)*(pitch/2) add esi, ebx ;; yuvau + (y/2)*(pitch/2) + x/2 mov yuvau, esi;; pointer adjustment to (x,y) mov ecx, dst_pitch mov eax, dst_starty mov ebx, dst_startx imul eax, ecx ;; y*pitch add ebx, ebx ;; 2*x mov esi, dy ;; add esi, eax ;; dy + y*pitch add esi, ebx ;; dy + y*pitch + 2*x mov dy, esi;;-----;; yuva P = top;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3];; [A01A02][A03A04];; [A11Y12][A13A04];; I420 src Q = bot;; [Y01Y02][Y03Y04];; [Y11Y12][Y13Y04];; [U1] [U3];; [V1] [V3];; YUY2 dst ;; [YUYV][YUYV] ;; byte order [Y0][U][Y1][V];; [YUYV][YUYV];;------- pxor mm7, mm7 movq mm6, [con1] ;; set up line loop mov eax, height dec eax mov height, eax jle near exit mov esi, yuvay mov ebx, yuvau mov edi, sy mov edx, suy100: ;; line loop mov eax, width sub eax, 3 mov width0, eax jle near two_pelsa100: ;; do four pels per iteration ;; mov ecx, yuvaoffseta mov eax, src_pitch ;; ------- first line -------- ;; yuva 4 y's movd mm0, [esi] ;; p4 p3 p2 p1 punpcklbw mm0, mm7 ;; word p4 p3 p2 p1 ;; src I420 4 y's movq mm2, [edi] ;; q4 q3 q2 q1 punpcklbw mm2, mm7 ;; word q4 q3 q2 q1 psubw mm2, mm0 ;; q-p ;; yuva 4 a's movd mm4, [esi+ecx] ;; a4 a3 a2 a1 punpcklbw mm4, mm7 ;; word a4 a3 a2 a1 pmullw mm2, mm4 ;; word alpha*(q-p) for y4 y3 y2 y1 psrlw mm2, 8 ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1 paddb mm2, mm0 ;; blended for 0 y4 0 y3 0 y2 0 y1 pmaddwd mm4, mm6 ;; line 1 dword a4+a3 a2+a1 ;; ;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1 ;; mm4 = line 1 dword ave(a4,a3) ave(a2,a1) ;; ;; ------ second line --------- mov ebp, negyuvapitch sub esi, ebp ;; point to line 2 ;;add esi, yuva_pitch ;;;;add edi, src_pitch ;; yuva 4 y's movd mm1, [esi] ;; p4 p3 p2 p1 punpcklbw mm1, mm7 ;; word p4 p3 p2 p1 ;; src I420 4 y's movd mm3, [edi+eax] ;; q4 q3 q2 q1 punpcklbw mm3, mm7 ;; word q4 q3 q2 q1 psubw mm3, mm1 ;; q-p ;; yuva 4 a's movd mm5, [esi+ecx] ;; a4 a3 a2 a1 punpcklbw mm5, mm7 ;; word a4 a3 a2 a1 lea esi, [esi+ebp+4] ;; back to line 1 and inc add edi, 4 ;; inc sy pmullw mm3, mm5 ;; word alpha*(q-p) for y4 y3 y2 y1 psrlw mm3, 8 ;; byte alpha*(q-p) for 0 y4 0 y3 0 y2 0 y1 paddb mm3, mm1 ;; blended for 0 y4 0 y3 0 y2 0 y1 pmaddwd mm5, mm6 ;; line 2 dword a4+a3 a2+a1 ;; packssdw mm4, mm5 ;; for u v sum a22 a21 a12 a11 psrlw mm4, 1 ;; for u v ave a22 a21 a12 a11 ;; ;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1 ;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1 0 ;; mm4 = ave for line 2 line 1 = a22 a21 a12 a11 ;; ;; ----- U V -------- ;;mov ebx, yuvau ;;mov edx, su mov ecx, yuvaoffsetv mov eax, soffsetv ;; --U-- movd mm0, [ebx] ;; x x pu2 pu1 punpcklwd mm0, mm0 ;; byte pu2 pu1 pu2 pu1 punpcklbw mm0, mm7 ;; word pu2 pu1 pu2 pu1 movd mm1, [edx] ;; x x qu2 qu1 punpcklwd mm1, mm1 ;; byte qu2 qu1 qu2 qu1 punpcklbw mm1, mm7 ;; word qu2 qu1 qu2 qu1 psubw mm1, mm0 ;; qu - pu pmullw mm1, mm4 ;; alpha*(qu-pu) psrlw mm1, 8 paddb mm1, mm0 ;; line 2 line 1 blended u2 u1 u2 u1 ;; --V-- movd mm0, [ebx+ecx] ;; x x pv2 pv1 punpcklwd mm0, mm0 ;; byte pv2 pv1 pv2 pv1 punpcklbw mm0, mm7 ;; word pv2 pv1 pv2 pv1 ;;movd mm5, [edx+eax] ;; x x qv2 qv1 movzx eax, word [edx+eax] ;; prevent possible access vio movd mm5, eax ;; 0 0 qv2 qv1 punpcklwd mm5, mm5 ;; byte qv2 qv1 qv2 qv1 punpcklbw mm5, mm7 ;; word qv2 qv1 qv2 qv1 psubw mm5, mm0 ;; qv - pv add ebx, 2 ;; inc yuvau address add edx, 2 ;; inc su address pmullw mm5, mm4 ;; alpha*(qv-pv) psrlw mm5, 8 paddb mm5, mm0 ;; line 2 line 1 blended v2 v1 v2 v1 ;; mm2 = line 1 blended for 0 y4 0 y3 0 y2 0 y1 ;; mm3 = line 2 blended for 0 y4 0 y3 0 y2 0 y1 ;; mm1 = line 2 line 1 blended 0 u2 0 u1 0 u2 0 u1 ;; mm5 = line 2 line 1 blended 0 v2 0 v1 0 v2 0 v1 ;; mov ecx, dy mov eax, dst_pitch packuswb mm2, mm3 ;; blended y24 y23 y22 y21 y14 y13 y12 y11 psllq mm5, 8 ;; v2 0 v1 0 v2 0 v1 0 por mm1, mm5 ;; v22 u22 v21 u21 v12 u12 v11 u11 movq mm3, mm2 punpcklbw mm3, mm1 ;; v12 y14 u12 y13 v11 y12 u11 y11 movq [ecx], mm3 ;; line 1 result punpckhbw mm2, mm1 ;; v22 y24 u22 y23 v21 y22 u21 y21 movq [ecx+eax], mm2 ;; line 2 result add ecx, 8 ;; inc dy address mov dy, ecx mov eax, width0 ;; pel loop sub eax, 4 mov width0, eax jg near a100;;------------------------------two_pels: ;; do two pels if any ;; remaining pels = eax+3 ;; compute 2 pels if remaining pels = 2 or 3, 1 not computed add eax, 2 jle near line_done ;; ;; mov ecx, yuvaoffseta mov eax, src_pitch ;; ------- first line -------- ;; yuva 2 y's movd mm0, [esi] ;; p2 p1 punpcklbw mm0, mm7 ;; word p2 p1 ;; src I420 2 y's movq mm2, [edi] ;; q2 q1 punpcklbw mm2, mm7 ;; word q2 q1 psubw mm2, mm0 ;; q-p ;; yuva 2 a's movd mm4, [esi+ecx] ;; a2 a1 punpcklbw mm4, mm7 ;; word a2 a1 pmullw mm2, mm4 ;; word alpha*(q-p) for y2 y1 psrlw mm2, 8 ;; byte alpha*(q-p) for 0 y2 0 y1 paddb mm2, mm0 ;; blended for 0 y2 0 y1 pmaddwd mm4, mm6 ;; line 1 dword a2+a1 ;; ;; mm2 = line 1 blended for 0 y2 0 y1 ;; mm4 = line 1 ave(a2,a1) ;; ;; ------ second line --------- mov ebp, negyuvapitch sub esi, ebp ;; point to line 2 ;; yuva 2 y's movd mm1, [esi] ;; p2 p1 punpcklbw mm1, mm7 ;; word p2 p1 ;; src I420 2 y's movd mm3, [edi+eax] ;; q2 q1 punpcklbw mm3, mm7 ;; word q2 q1 psubw mm3, mm1 ;; q-p ;; yuva 2 a's ;;movd mm5, [esi+ecx] ;; a2 a1 movzx eax, word [esi+ecx] ;; prevent possible access vio movd mm5, eax ;; a2 a1 punpcklbw mm5, mm7 ;; word a2 a1 lea esi, [esi+ebp] ;; back to line 1, no inc ;;add edi, 4 ;; inc sy pmullw mm3, mm5 ;; word alpha*(q-p) for y2 y1 psrlw mm3, 8 ;; byte alpha*(q-p) for 0 y2 0 y1 paddb mm3, mm1 ;; blended for 0 y2 0 y1 pmaddwd mm5, mm6 ;; line 2 dword a2+a1 ;; packssdw mm4, mm5 ;; for uv sum xx a21 xx a11 psrlw mm4, 1 ;; for uv ave xx a21 xx a11 ;; ;; mm2 = line 1 blended for 0 y2 0 y1 ;; mm3 = line 2 blended for 0 y2 0 y1 ;; mm4 = ave = a21 a11 ;; ;; ----- U V -------- mov ecx, yuvaoffsetv mov eax, soffsetv ;; --U-- ;;movd mm0, [ebx] ;; x x x pu1 ;;punpcklwd mm0, mm0 ;; byte x pu1 x pu1 ;;punpcklbw mm0, mm7 ;; word x pu1 x pu1 movzx ebp, byte [ebx] movd mm0, ebp ;; 0 0 0 pu1 punpckldq mm0, mm0 ;; word 0 pu1 0 pu1 ;;movd mm1, [edx] ;; x x qu2 qu1 ;;punpcklwd mm1, mm1 ;; byte qu2 qu1 qu2 qu1 ;;punpcklbw mm1, mm7 ;; word qu2 qu1 qu2 qu1 movzx ebp, byte [edx] movd mm1, ebp ;; 0 0 0 qu1 punpckldq mm1, mm1 ;; word 0 qu1 0 qu1 psubw mm1, mm0 ;; qu - pu pmullw mm1, mm4 ;; alpha*(qu-pu) psrlw mm1, 8 paddb mm1, mm0 ;; line 2 line 1 blended u2 u1 u2 u1 ;; --V-- ;;movd mm0, [ebx+ecx] ;; x x pv2 pv1 movzx ebp, byte [ebx+ecx] ;; prevent possible access vio movd mm0, ebp ;; 0 0 0 pv1 punpckldq mm0, mm0 ;; word 0 pv1 0 pv1 ;;movd mm5, [edx+eax] ;; qv1 movzx eax, byte [edx+eax] ;; prevent possible access vio movd mm5, eax ;; 0 0 0 qv1 punpckldq mm5, mm5 ;; word 0 qv1 0 qv1 psubw mm5, mm0 ;; qv - pv ;;add ebx, 2 ;; inc yuvau address ;;add edx, 2 ;; inc su address pmullw mm5, mm4 ;; alpha*(qv-pv) psrlw mm5, 8 paddb mm5, mm0 ;; line 2 line 1 blended v2 v1 v2 v1 ;; mm2 = line 1 blended for 0 y2 0 y1 ;; mm3 = line 2 blended for 0 y2 0 y1 ;; mm1 = line 2 line 1 blended 0 u2 0 u1 ;; mm5 = line 2 line 1 blended 0 v2 0 v1 ;; mov ecx, dy mov eax, dst_pitch packuswb mm2, mm3 ;; x x y22 y21 x x y12 y11 psllq mm5, 8 ;; v2 0 v1 0 v2 0 v1 0 por mm1, mm5 ;; v22 u22 v21 u21 v12 u12 v11 u11 movq mm3, mm2 punpcklbw mm2, mm1 ;; v11 y12 u11 y11 movd [ecx], mm2 ;; line 1 result
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -