📄 reduced_mmx.asm
字号:
; NOTE(review): everything down to the first `ret` below is the tail of a
; routine whose label and first instructions lie above this excerpt. It is
; reproduced instruction-for-instruction; only layout and comments differ.
; COL03_SSE / COL47_SSE / MUL_PACK / MIX_ROWS / STORE_1 / STORE_2 and the
; Up13 / Up31 constants are defined outside this excerpt, so their exact
; effect (including how they use ecx/edx) is not described here.
        mov         eax, [esp+12]           ; BpS
        movq        mm6, [Up13]             ; mm6/mm7 are passed to every MUL_PACK below
        movq        mm7, [Up31]

        ; ---- first pass: COL03_SSE, third argument 0..7 ----
        COL03_SSE   mm0, mm1, 0
        MUL_PACK    mm0, mm1, mm6, mm7
        movq        mm4, mm0                ; work on copies so mm0/mm1 stay intact for MIX_ROWS
        movq        mm5, mm1
        STORE_1     mm4, mm5
        add         ecx, eax                ; ecx += BpS

        COL03_SSE   mm2, mm3, 1
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        COL03_SSE   mm0, mm1, 2
        MUL_PACK    mm0, mm1, mm6, mm7
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_2     mm2, mm3

        COL03_SSE   mm2, mm3, 3
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        COL03_SSE   mm0, mm1, 4
        MUL_PACK    mm0, mm1, mm6, mm7
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_2     mm2, mm3

        COL03_SSE   mm2, mm3, 5
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        COL03_SSE   mm0, mm1, 6
        MUL_PACK    mm0, mm1, mm6, mm7
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_2     mm2, mm3

        COL03_SSE   mm2, mm3, 7
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        STORE_1     mm2, mm3

        ; ---- second pass: COL47_SSE, restarting 8 bytes past the first stack argument ----
        mov         ecx, [esp+4]
        add         ecx, 8

        COL47_SSE   mm0, mm1, 0
        MUL_PACK    mm0, mm1, mm6, mm7
        movq        mm4, mm0
        movq        mm5, mm1
        STORE_1     mm4, mm5
        add         ecx, eax                ; ecx += BpS

        COL47_SSE   mm2, mm3, 1
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        COL47_SSE   mm0, mm1, 2
        MUL_PACK    mm0, mm1, mm6, mm7
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_2     mm2, mm3

        COL47_SSE   mm2, mm3, 3
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        COL47_SSE   mm0, mm1, 4
        MUL_PACK    mm0, mm1, mm6, mm7
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_2     mm2, mm3

        COL47_SSE   mm2, mm3, 5
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        COL47_SSE   mm0, mm1, 6
        MUL_PACK    mm0, mm1, mm6, mm7
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_2     mm2, mm3

        COL47_SSE   mm2, mm3, 7
        MUL_PACK    mm2, mm3, mm6, mm7
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_2     mm0, mm1

        STORE_1     mm2, mm3

        ret
.endfunc

;===========================================================================
;
; void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
;                                       const int16_t *Src, const int BpS);
;
; Arguments are read from the stack ([esp+4], [esp+8], [esp+12]).
; Only eax, ecx, edx and mm0-mm5 are named directly in this routine.
;   ecx = Dst cursor (advanced by BpS once per pass here; any further
;         stepping would happen inside the STORE_ADD_* macros - not visible)
;   edx = Src
;   eax = BpS
; Same instruction sequence as the routine tail above, except that the
; stores use STORE_ADD_1 / STORE_ADD_2 and MUL_PACK takes [Up13] / [Up31]
; as memory operands instead of mm6 / mm7.
; NOTE(review): no `emms` is issued before `ret`; presumably the caller is
; responsible for it - confirm.
;
;===========================================================================

align 16
xvid_Add_Upsampled_8x8_16To8_xmm:           ; 549c
        mov         eax, [esp+12]           ; BpS
        mov         edx, [esp+8]            ; Src
        mov         ecx, [esp+4]            ; Dst

        ; ---- first pass: COL03_SSE, third argument 0..7 ----
        COL03_SSE   mm0, mm1, 0
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        movq        mm4, mm0                ; copies, so mm0/mm1 survive for MIX_ROWS
        movq        mm5, mm1
        STORE_ADD_1 mm4, mm5
        add         ecx, eax                ; Dst cursor += BpS

        COL03_SSE   mm2, mm3, 1
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        COL03_SSE   mm0, mm1, 2
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_ADD_2 mm2, mm3

        COL03_SSE   mm2, mm3, 3
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        COL03_SSE   mm0, mm1, 4
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_ADD_2 mm2, mm3

        COL03_SSE   mm2, mm3, 5
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        COL03_SSE   mm0, mm1, 6
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_ADD_2 mm2, mm3

        COL03_SSE   mm2, mm3, 7
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        STORE_ADD_1 mm2, mm3

        ; ---- second pass: COL47_SSE, restarting at Dst + 8 ----
        mov         ecx, [esp+4]
        add         ecx, 8

        COL47_SSE   mm0, mm1, 0
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        movq        mm4, mm0
        movq        mm5, mm1
        STORE_ADD_1 mm4, mm5
        add         ecx, eax                ; Dst cursor += BpS

        COL47_SSE   mm2, mm3, 1
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        COL47_SSE   mm0, mm1, 2
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_ADD_2 mm2, mm3

        COL47_SSE   mm2, mm3, 3
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        COL47_SSE   mm0, mm1, 4
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_ADD_2 mm2, mm3

        COL47_SSE   mm2, mm3, 5
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        COL47_SSE   mm0, mm1, 6
        MUL_PACK    mm0, mm1, [Up13], [Up31]
        MIX_ROWS    mm2, mm3, mm0, mm1
        STORE_ADD_2 mm2, mm3

        COL47_SSE   mm2, mm3, 7
        MUL_PACK    mm2, mm3, [Up13], [Up31]
        MIX_ROWS    mm0, mm1, mm2, mm3
        STORE_ADD_2 mm0, mm1

        STORE_ADD_1 mm2, mm3

        ret
.endfunc

;===========================================================================
;
; void xvid_HFilter_31_mmx(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
; void xvid_VFilter_31_x86(uint8_t *Src1, uint8_t *Src2,
;                          const int BpS, int Nb_Blks);
; void xvid_HFilter_31_x86(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
;
;===========================================================================

;//////////////////////////////////////////////////////////////////////
;// horizontal/vertical filtering: [x,y] -> [ (3x+y+2)>>2, (x+3y+2)>>2 ]
;//
;// We use the trick: tmp = (x+y+2) -> [x = (tmp+2x)>>2, y = (tmp+2y)>>2]
;//////////////////////////////////////////////////////////////////////

;-----------------------------------------------------------------------
; xvid_HFilter_31_mmx - in-place 3:1 blend of two byte runs, 4 bytes per
; iteration, 2*Nb_Blks iterations (8*Nb_Blks bytes of each source).
;   esi = Src1 + 8*Nb_Blks, edi = Src2 + 8*Nb_Blks (end pointers)
;   eax = negative iteration index counting up to 0 (scaled by 4 on use)
;   mm5 = [Cst2] (defined outside this excerpt; presumably 2 in each
;         16-bit lane, matching the "+2" of the formula - confirm)
;   mm7 = 0, used to unpack bytes to words and to repack
; esi/edi are saved and restored; eax and mm0-mm2/mm5/mm7 are clobbered.
; NOTE(review): the loop is bottom-tested, so one 4-byte group is still
; processed when Nb_Blks is 0.
;-----------------------------------------------------------------------
align 16
xvid_HFilter_31_mmx:
        push        esi
        push        edi

        movq        mm5, [Cst2]             ; rounding term
        pxor        mm7, mm7                ; mm7 = 0

        mov         eax, [esp+12 +8]        ; Nb_Blks
        mov         esi, [esp+4  +8]        ; Src1
        mov         edi, [esp+8  +8]        ; Src2

        add         eax, eax                ; eax = 2*Nb_Blks iterations
        lea         esi, [esi+eax*4]        ; point both sources at their end...
        lea         edi, [edi+eax*4]
        neg         eax                     ; ...and index backwards from it

.next_quad:                                 ; 12c
        movd        mm0, [esi+eax*4]        ; x: 4 bytes of Src1
        movd        mm1, [edi+eax*4]        ; y: 4 bytes of Src2
        movq        mm2, mm5                ; tmp = 2
        punpcklbw   mm0, mm7                ; widen x to words
        punpcklbw   mm1, mm7                ; widen y to words
        paddsw      mm2, mm0                ; tmp += x
        paddsw      mm0, mm0                ; 2x
        paddsw      mm2, mm1                ; tmp = x + y + 2
        paddsw      mm1, mm1                ; 2y
        paddsw      mm0, mm2                ; 3x + y + 2
        paddsw      mm1, mm2                ; x + 3y + 2
        psraw       mm0, 2
        psraw       mm1, 2
        packuswb    mm0, mm7                ; back to bytes
        packuswb    mm1, mm7
        movd        [esi+eax*4], mm0
        movd        [edi+eax*4], mm1
        add         eax, 1
        jl          .next_quad              ; until the index reaches 0

        pop         edi
        pop         esi
        ret
.endfunc

  ; mmx is of no use here. Better use plain ASM. Moreover,
  ; this is for the fun of ASM coding, coz' every modern compiler can
  ; end up with a code that looks very much like this one...

;-----------------------------------------------------------------------
; xvid_VFilter_31_x86 - same 3:1 blend, one byte of each source per
; iteration, both pointers stepping by BpS; 8*Nb_Blks iterations.
;   esi = Src1 cursor, edi = Src2 cursor, ebp = BpS
;   eax = iterations remaining, ebx = tmp, ecx/edx = x/y
; esi/edi/ebx/ebp are saved and restored.
; NOTE(review): the loop is bottom-tested, so one byte pair is still
; processed when Nb_Blks is 0.
;-----------------------------------------------------------------------
align 16
xvid_VFilter_31_x86:
        push        esi
        push        edi
        push        ebx
        push        ebp

        mov         eax, [esp+16 +16]       ; Nb_Blks
        mov         ebp, [esp+12 +16]       ; BpS
        mov         edi, [esp+8  +16]       ; Src2
        mov         esi, [esp+4  +16]       ; Src1

        shl         eax, 3                  ; eax = 8*Nb_Blks

.next_row:                                  ; 7c
        movzx       ecx, byte [esi]         ; x
        movzx       edx, byte [edi]         ; y
        lea         ebx, [ecx+edx+2]        ; tmp = x + y + 2
        lea         ecx, [ebx+2*ecx]        ; 3x + y + 2
        lea         edx, [ebx+2*edx]        ; x + 3y + 2
        shr         ecx, 2
        shr         edx, 2
        mov         [esi], cl
        mov         [edi], dl
        add         esi, ebp                ; step both cursors by BpS
        add         edi, ebp
        dec         eax
        jg          .next_row

        pop         ebp
        pop         ebx
        pop         edi
        pop         esi
        ret
.endfunc

  ; this one's just a little faster than gcc's code.
Very little.align 16xvid_HFilter_31_x86: push esi push edi push ebx mov esi, [esp+4 +12] ; Src1 mov edi, [esp+8 +12] ; Src2 mov eax, [esp+12 +12] ; Nb_Blks lea eax,[eax*8] lea esi, [esi+eax] lea edi, [esi+eax] neg eax.Loop: ; 6c movzx ecx, byte [esi+eax] movzx edx, byte [edi+eax] lea ebx, [ecx+edx+2] lea ecx,[ebx+2*ecx] lea edx,[ebx+2*edx] shr ecx,2 shr edx,2 mov [esi+eax], cl mov [edi+eax], dl inc eax jl .Loop pop ebx pop edi pop esi ret.endfunc;//////////////////////////////////////////////////////////////////////;// 16b downsampling 16x16 -> 8x8;//////////////////////////////////////////////////////////////////////%macro HFILTER_1331 2 ;%1:src %2:dst reg. -trashes mm0/mm1/mm2 movq mm2, [Mask_ff] movq %2, [%1-1] ;-10123456 movq mm0, [%1] ; 01234567 movq mm1, [%1+1] ; 12345678 pand %2, mm2 ;-1|1|3|5 pand mm0, mm2 ; 0|2|4|6 pand mm1, mm2 ; 1|3|5|7 pand mm2, [%1+2] ; 2|4|6|8 paddusw mm0, mm1 paddusw %2, mm2 pmullw mm0, mm7 paddusw %2, mm0%endmacro%macro VFILTER_1331 4 ; %1-4: regs %1-%2: trashed paddsw %1, [Cst32] paddsw %2, %3 pmullw %2, mm7 paddsw %1,%4 paddsw %1, %2 psraw %1, 6%endmacro;===========================================================================;; void xvid_Filter_18x18_To_8x8_mmx(int16_t *Dst,; const uint8_t *Src, const int BpS);;;===========================================================================%macro COPY_TWO_LINES_1331 1 ; %1: dst HFILTER_1331 edx , mm5 HFILTER_1331 edx+eax, mm6 lea edx, [edx+2*eax] VFILTER_1331 mm3,mm4,mm5, mm6 movq [%1], mm3 HFILTER_1331 edx , mm3 HFILTER_1331 edx+eax, mm4 lea edx, [edx+2*eax] VFILTER_1331 mm5,mm6,mm3,mm4 movq [%1+16], mm5%endmacroalign 16xvid_Filter_18x18_To_8x8_mmx: ; 283c (~4.4c per output pixel) mov ecx, [esp+4] ; Dst mov edx, [esp+8] ; Src mov eax, [esp+12] ; BpS movq mm7, [Cst3] sub edx, eax ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line. 
; process columns 0-3 HFILTER_1331 edx , mm3 ; pre-load mm3/mm4 HFILTER_1331 edx+eax, mm4 lea edx, [edx+2*eax] COPY_TWO_LINES_1331 ecx + 0*16 COPY_TWO_LINES_1331 ecx + 2*16 COPY_TWO_LINES_1331 ecx + 4*16 COPY_TWO_LINES_1331 ecx + 6*16 ; process columns 4-7 mov edx, [esp+8] sub edx, eax add edx, 8 HFILTER_1331 edx , mm3 ; pre-load mm3/mm4 HFILTER_1331 edx+eax, mm4 lea edx, [edx+2*eax] COPY_TWO_LINES_1331 ecx + 0*16 +8 COPY_TWO_LINES_1331 ecx + 2*16 +8 COPY_TWO_LINES_1331 ecx + 4*16 +8 COPY_TWO_LINES_1331 ecx + 6*16 +8 ret.endfunc;===========================================================================;; void xvid_Filter_Diff_18x18_To_8x8_mmx(int16_t *Dst,; const uint8_t *Src, const int BpS);;;===========================================================================%macro DIFF_TWO_LINES_1331 1 ; %1: dst HFILTER_1331 edx , mm5 HFILTER_1331 edx+eax, mm6 lea edx, [edx+2*eax] movq mm2, [%1] VFILTER_1331 mm3,mm4,mm5, mm6 psubsw mm2, mm3 movq [%1], mm2 HFILTER_1331 edx , mm3 HFILTER_1331 edx+eax, mm4 lea edx, [edx+2*eax] movq mm2, [%1+16] VFILTER_1331 mm5,mm6,mm3,mm4 psubsw mm2, mm5 movq [%1+16], mm2%endmacroalign 16xvid_Filter_Diff_18x18_To_8x8_mmx: ; 302c mov ecx, [esp+4] ; Dst mov edx, [esp+8] ; Src mov eax, [esp+12] ; BpS movq mm7, [Cst3] sub edx, eax ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line. ; process columns 0-3 HFILTER_1331 edx , mm3 ; pre-load mm3/mm4 HFILTER_1331 edx+eax, mm4 lea edx, [edx+2*eax] DIFF_TWO_LINES_1331 ecx + 0*16 DIFF_TWO_LINES_1331 ecx + 2*16 DIFF_TWO_LINES_1331 ecx + 4*16 DIFF_TWO_LINES_1331 ecx + 6*16 ; process columns 4-7 mov edx, [esp+8] sub edx, eax add edx, 8 HFILTER_1331 edx , mm3 ; pre-load mm3/mm4 HFILTER_1331 edx+eax, mm4 lea edx, [edx+2*eax] DIFF_TWO_LINES_1331 ecx + 0*16 +8 DIFF_TWO_LINES_1331 ecx + 2*16 +8 DIFF_TWO_LINES_1331 ecx + 4*16 +8 DIFF_TWO_LINES_1331 ecx + 6*16 +8 ret.endfunc;////////////////////////////////////////////////////////////////////// ; pfeewwww... Never Do That On Stage Again. :)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -