; reduced_mmx.asm — MMX/SSE reduced-resolution filter routines (upsampling / downsampling)
; NOTE(review): this is the tail of a function whose entry point lies above
; this chunk. From the register usage (ecx = Dst pointer advanced by eax = BpS,
; edx-relative COLxx_SSE column loads, plain STORE_1/STORE_2 stores) it appears
; to be the store-only sibling of xvid_Add_Upsampled_8x8_16To8_xmm below —
; TODO confirm against the full file.
movq mm6, [Up13] ; upsampling weights kept in registers for the whole pass
movq mm7, [Up31] ; (apparently safe here: mm6/mm7 survive every macro below)
; --- columns 0-3: eight COL03_SSE/MUL_PACK stages; MIX_ROWS blends each
; pair of adjacent source rows into two output rows (mm0..mm3 ping-pong) ---
COL03_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, mm6, mm7
movq mm4, mm0 ; copy first row out; mm0/mm1 also feed the next MIX_ROWS
movq mm5, mm1
STORE_1 mm4, mm5
add ecx, eax ; Dst += BpS
COL03_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
STORE_1 mm2, mm3 ; flush the last delayed row
; --- columns 4-7: same pattern, Dst rewound and shifted right by 8 bytes ---
mov ecx, [esp+4] ; reload Dst
add ecx, 8 ; right half of the destination rows
COL47_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, mm6, mm7
movq mm4, mm0
movq mm5, mm1
STORE_1 mm4, mm5
add ecx, eax ; Dst += BpS
COL47_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
STORE_1 mm2, mm3
ret
;===========================================================================
;
; void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
; const int16_t *Src, const int BpS);
;
;===========================================================================
align 16
;-----------------------------------------------------------------------
; void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
;                                       const int16_t *Src, const int BpS)
; Upsamples the 8x8 int16 block at Src (weights Up13/Up31) and ADDS the
; result into the 8-bit destination via the STORE_ADD_* macros, in two
; column halves (0-3, then 4-7), eight COLxx_SSE stages each.
; NOTE(review): unlike the store-only sibling above, [Up13]/[Up31] are
; passed to MUL_PACK as memory operands here — presumably because mm6/mm7
; are needed as scratch by the STORE_ADD_* macros; confirm against the
; macro definitions before hoisting them into registers.
; cdecl: ecx = Dst, edx = Src, eax = BpS.
;-----------------------------------------------------------------------
xvid_Add_Upsampled_8x8_16To8_xmm: ; 549c
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
; --- columns 0-3 ---
COL03_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, [Up13], [Up31]
movq mm4, mm0 ; copy first row out; mm0/mm1 also feed the next MIX_ROWS
movq mm5, mm1
STORE_ADD_1 mm4, mm5
add ecx, eax ; Dst += BpS
COL03_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3 ; blend adjacent source rows -> two output rows
STORE_ADD_2 mm0, mm1
COL03_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1 ; register roles ping-pong every stage
STORE_ADD_2 mm2, mm3
COL03_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
STORE_ADD_1 mm2, mm3 ; flush the last delayed row
; --- columns 4-7: same pattern, Dst rewound and shifted right by 8 ---
mov ecx, [esp+4]
add ecx, 8
COL47_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, [Up13], [Up31]
movq mm4, mm0
movq mm5, mm1
STORE_ADD_1 mm4, mm5
add ecx, eax
COL47_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
STORE_ADD_1 mm2, mm3
ret
;===========================================================================
;
; void xvid_HFilter_31_mmx(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
; void xvid_VFilter_31_x86(uint8_t *Src1, uint8_t *Src2, const int BpS, int Nb_Blks);
; void xvid_HFilter_31_x86(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
;
;===========================================================================
;//////////////////////////////////////////////////////////////////////
;// horizontal/vertical filtering: [x,y] -> [ (3x+y+2)>>2, (x+3y+2)>>2 ]
;//
;// We use the trick: tmp = (x+y+2) -> [x = (tmp+2x)>>2, y = (tmp+2y)>>2]
;//////////////////////////////////////////////////////////////////////
align 16
;-----------------------------------------------------------------------
; void xvid_HFilter_31_mmx(uint8_t *Src1, uint8_t *Src2, int Nb_Blks)
; Pair filter over Nb_Blks*8 bytes, 4 pixels of each source per loop:
;   Src1[i] = (3*Src1[i] + Src2[i] + 2) >> 2
;   Src2[i] = (Src1[i] + 3*Src2[i] + 2) >> 2   (both from original values)
; using the shared tmp = x + y + 2 trick from the comment above.
; cdecl; preserves esi/edi; clobbers eax, mm0-mm2, mm5, mm7.
;-----------------------------------------------------------------------
xvid_HFilter_31_mmx:
push esi
push edi
mov esi, [esp+4 +8] ; Src1
mov edi, [esp+8 +8] ; Src2
mov eax, [esp+12 +8] ; Nb_Blks
lea eax,[eax*2] ; iterations = Nb_Blks*2 (4 bytes each -> 8 bytes per block)
movq mm5, [Cst2] ; rounding constant: +2 in every word lane
pxor mm7, mm7 ; zero, for byte<->word unpack/pack
lea esi, [esi+eax*4] ; point both bases past their ends,
lea edi, [edi+eax*4] ; then walk up with a negative index
neg eax
.Loop: ;12c
movd mm0, [esi+eax*4] ; x = 4 pixels from Src1
movd mm1, [edi+eax*4] ; y = 4 pixels from Src2
movq mm2, mm5 ; mm2 = 2
punpcklbw mm0, mm7 ; x: bytes -> words
punpcklbw mm1, mm7 ; y: bytes -> words
paddsw mm2, mm0 ; mm2 = x + 2
paddsw mm0, mm0 ; mm0 = 2x
paddsw mm2, mm1 ; mm2 = tmp = x + y + 2
paddsw mm1, mm1 ; mm1 = 2y
paddsw mm0, mm2 ; mm0 = 3x + y + 2
paddsw mm1, mm2 ; mm1 = x + 3y + 2
psraw mm0, 2 ; >> 2 (values stay well below saturation: <= 3*255+2)
psraw mm1, 2
packuswb mm0, mm7 ; words -> bytes, unsigned saturate
packuswb mm1, mm7
movd [esi+eax*4], mm0
movd [edi+eax*4], mm1
add eax,1
jl .Loop
pop edi
pop esi
ret
; MMX is of no use here — plain scalar ASM does just as well. Moreover,
; this is for the fun of ASM coding, since every modern compiler can
; end up with code that looks very much like this one...
align 16
;-----------------------------------------------------------------------
; void xvid_VFilter_31_x86(uint8_t *Src1, uint8_t *Src2, const int BpS,
;                          int Nb_Blks)
; Vertical pair filter: walks Nb_Blks*8 rows (stride BpS) and replaces
; each pair (x = *Src1, y = *Src2) with
;   *Src1 = (3x + y + 2) >> 2
;   *Src2 = (x + 3y + 2) >> 2
; via the shared intermediate tmp = x + y + 2.
; cdecl; preserves ebx/ebp/esi/edi; clobbers eax/ecx/edx.
;-----------------------------------------------------------------------
xvid_VFilter_31_x86:
push esi
push edi
push ebx
push ebp
mov esi, [esp+4+16] ; Src1
mov edi, [esp+8+16] ; Src2
mov ebp, [esp+12+16] ; BpS (row stride)
mov eax, [esp+16+16] ; Nb_Blks
shl eax, 3 ; row counter = Nb_Blks*8
.Loop: ; ~7c per row
movzx ecx, byte [esi] ; x
movzx edx, byte [edi] ; y
lea ebx, [ecx+edx+2] ; tmp = x + y + 2
lea ecx, [ebx+ecx*2] ; tmp + 2x = 3x + y + 2
lea edx, [ebx+edx*2] ; tmp + 2y = x + 3y + 2
shr ecx, 2
shr edx, 2
mov [esi], cl
mov [edi], dl
add esi, ebp ; advance both pointers one row
add edi, ebp
sub eax, 1
jg .Loop
pop ebp
pop ebx
pop edi
pop esi
ret
; this one's just a little faster than gcc's code. Very little.
align 16
;-----------------------------------------------------------------------
; void xvid_HFilter_31_x86(uint8_t *Src1, uint8_t *Src2, int Nb_Blks)
; Scalar horizontal pair filter over Nb_Blks*8 bytes:
;   Src1[i] = (3*Src1[i] + Src2[i] + 2) >> 2
;   Src2[i] = (Src1[i] + 3*Src2[i] + 2) >> 2   (both from original values)
; via tmp = x + y + 2, same trick as the routines above.
; cdecl; preserves ebx/esi/edi; clobbers eax/ecx/edx.
;-----------------------------------------------------------------------
xvid_HFilter_31_x86:
push esi
push edi
push ebx
mov esi, [esp+4 +12] ; Src1
mov edi, [esp+8 +12] ; Src2
mov eax, [esp+12 +12] ; Nb_Blks
lea eax,[eax*8] ; byte count = Nb_Blks*8
lea esi, [esi+eax] ; point both bases past their own ends,
lea edi, [edi+eax] ; BUGFIX: was [esi+eax], which made edi alias Src1+2*len
                   ; instead of Src2+len (cf. the MMX version above, which
                   ; advances each pointer from its own base)
neg eax ; then index upward with a negative offset
.Loop: ; 6c
movzx ecx, byte [esi+eax] ; x
movzx edx, byte [edi+eax] ; y
lea ebx, [ecx+edx+2] ; tmp = x + y + 2
lea ecx,[ebx+2*ecx] ; 3x + y + 2
lea edx,[ebx+2*edx] ; x + 3y + 2
shr ecx,2
shr edx,2
mov [esi+eax], cl
mov [edi+eax], dl
inc eax
jl .Loop
pop ebx
pop edi
pop esi
ret
;//////////////////////////////////////////////////////////////////////
;// 16b downsampling 16x16 -> 8x8
;//////////////////////////////////////////////////////////////////////
%macro HFILTER_1331 2 ;%1:src %2:dst reg. -trashes mm0/mm1/mm2
; Horizontal [1,3,3,1] filter + 2:1 decimation of the 8 bytes at [%1]:
; for each output word lane i (i = 0..3):
;   %2.i = s[2i-1] + s[2i+2] + 3*(s[2i] + s[2i+1])
; Requires mm7 = Cst3 (word lanes of 3), loaded once by the caller.
; NOTE: reads one byte before and two bytes past [%1] — the caller must
; guarantee that neighborhood is accessible (edged source).
; All sums fit in unsigned 16 bits (max 8*255 = 2040), so the unsigned
; saturating paddusw never actually saturates.
movq mm2, [Mask_ff] ; lane mask 0x00ff: keeps even-position bytes as words
movq %2, [%1-1] ;-10123456
movq mm0, [%1] ; 01234567
movq mm1, [%1+1] ; 12345678
pand %2, mm2 ;-1|1|3|5
pand mm0, mm2 ; 0|2|4|6
pand mm1, mm2 ; 1|3|5|7
pand mm2, [%1+2] ; 2|4|6|8
paddusw mm0, mm1 ; s[2i] + s[2i+1]
paddusw %2, mm2 ; s[2i-1] + s[2i+2]
pmullw mm0, mm7 ; * 3
paddusw %2, mm0 ; outer pair + 3*inner pair
%endmacro
%macro VFILTER_1331 4 ; %1-4: regs %1-%2: trashed
; Vertical [1,3,3,1] combine of four h-filtered rows, with rounding and
; final normalization:  %1 = (%1 + 3*(%2 + %3) + %4 + 32) >> 6
; (total 2-D filter weight = 64, +32 rounds to nearest).
; Requires mm7 = Cst3. For 8-bit sources the intermediates stay within
; signed 16-bit range (max ~16352 < 32767), so paddsw never saturates.
paddsw %1, [Cst32] ; top row + rounding bias
paddsw %2, %3 ; middle pair
pmullw %2, mm7 ; * 3 (wrapping multiply; values fit)
paddsw %1,%4 ; + bottom row
paddsw %1, %2
psraw %1, 6 ; normalize by the 1331^2 total weight
%endmacro
;===========================================================================
;
; void xvid_Filter_18x18_To_8x8_mmx(int16_t *Dst,
; const uint8_t *Src, const int BpS);
;
;===========================================================================
%macro COPY_TWO_LINES_1331 1 ; %1: dst
; Produces two 4-word int16 output rows at [%1] and [%1+16] (Dst stride
; is 16 bytes) from four freshly h-filtered source rows. mm3..mm6 act as
; a rolling 4-row delay line: the caller pre-loads mm3/mm4 with the two
; rows above, and each half leaves its two newest rows behind for the
; next invocation. edx (source pointer) advances by 2*BpS per half.
HFILTER_1331 edx , mm5
HFILTER_1331 edx+eax, mm6
lea edx, [edx+2*eax]
VFILTER_1331 mm3,mm4,mm5, mm6 ; result in mm3; mm5/mm6 become the history
movq [%1], mm3
HFILTER_1331 edx , mm3
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
VFILTER_1331 mm5,mm6,mm3,mm4 ; result in mm5; mm3/mm4 carried to next call
movq [%1+16], mm5
%endmacro
align 16
;-----------------------------------------------------------------------
; void xvid_Filter_18x18_To_8x8_mmx(int16_t *Dst,
;                                   const uint8_t *Src, const int BpS)
; 2:1 downsample of a 16x16 pixel area to an 8x8 int16 block using the
; separable [1,3,3,1]/64 filter. Reads the surrounding neighborhood
; (starts one row above Src; HFILTER_1331 reads 1 byte left / 2 right),
; so Src must have accessible edges. Dst row stride = 16 bytes.
; cdecl: ecx = Dst, edx = Src cursor, eax = BpS.
;-----------------------------------------------------------------------
xvid_Filter_18x18_To_8x8_mmx: ; 283c (~4.4c per output pixel)
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
movq mm7, [Cst3] ; weight 3, required by H/VFILTER_1331
sub edx, eax ; start one row above Src (vertical filter support)
; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
; process columns 0-3
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
COPY_TWO_LINES_1331 ecx + 0*16
COPY_TWO_LINES_1331 ecx + 2*16
COPY_TWO_LINES_1331 ecx + 4*16
COPY_TWO_LINES_1331 ecx + 6*16
; process columns 4-7
mov edx, [esp+8] ; rewind Src cursor...
sub edx, eax
add edx, 8 ; ...and shift to the right half
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
COPY_TWO_LINES_1331 ecx + 0*16 +8
COPY_TWO_LINES_1331 ecx + 2*16 +8
COPY_TWO_LINES_1331 ecx + 4*16 +8
COPY_TWO_LINES_1331 ecx + 6*16 +8
ret
;===========================================================================
;
; void xvid_Filter_Diff_18x18_To_8x8_mmx(int16_t *Dst,
; const uint8_t *Src, const int BpS);
;
;===========================================================================
%macro DIFF_TWO_LINES_1331 1 ; %1: dst
; Same rolling 4-row delay-line scheme as COPY_TWO_LINES_1331, but
; instead of storing the filtered value it computes the residue
;   Dst = Dst - filtered   (saturating psubsw)
; for the two int16 rows at [%1] and [%1+16].
HFILTER_1331 edx , mm5
HFILTER_1331 edx+eax, mm6
lea edx, [edx+2*eax]
movq mm2, [%1] ; current Dst row
VFILTER_1331 mm3,mm4,mm5, mm6
psubsw mm2, mm3 ; Dst - filtered
movq [%1], mm2
HFILTER_1331 edx , mm3
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
movq mm2, [%1+16]
VFILTER_1331 mm5,mm6,mm3,mm4
psubsw mm2, mm5
movq [%1+16], mm2
%endmacro
align 16
;-----------------------------------------------------------------------
; void xvid_Filter_Diff_18x18_To_8x8_mmx(int16_t *Dst,
;                                        const uint8_t *Src, const int BpS)
; Same traversal as xvid_Filter_18x18_To_8x8_mmx above, but subtracts the
; [1,3,3,1]/64-downsampled 16x16 area from the existing 8x8 int16 block
; (Dst -= filtered, saturating). Dst row stride = 16 bytes; Src needs
; accessible edges (reads start one row above, 1 byte left / 2 right).
;-----------------------------------------------------------------------
xvid_Filter_Diff_18x18_To_8x8_mmx: ; 302c
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
movq mm7, [Cst3] ; weight 3, required by H/VFILTER_1331
sub edx, eax ; start one row above Src (vertical filter support)
; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
; process columns 0-3
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
DIFF_TWO_LINES_1331 ecx + 0*16
DIFF_TWO_LINES_1331 ecx + 2*16
DIFF_TWO_LINES_1331 ecx + 4*16
DIFF_TWO_LINES_1331 ecx + 6*16
; process columns 4-7
mov edx, [esp+8] ; rewind Src cursor...
sub edx, eax
add edx, 8 ; ...and shift to the right half
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
DIFF_TWO_LINES_1331 ecx + 0*16 +8
DIFF_TWO_LINES_1331 ecx + 2*16 +8
DIFF_TWO_LINES_1331 ecx + 4*16 +8
DIFF_TWO_LINES_1331 ecx + 6*16 +8
ret
;//////////////////////////////////////////////////////////////////////
; pfeewwww... Never Do That On Stage Again. :)
; (end of file)