; dct-a.asm
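; The butterflies below are the tail of x264_sub4x4_dct_mmxext (its prologue
; and first 1-D pass precede this excerpt): the second pass of the H.264 4x4
; forward transform. As a scalar sketch of what one MMX_SUMSUB_BA /
; MMX_SUMSUB2_AB pair computes per 16-bit lane (an assumed C reference for
; orientation only, not part of this file's build; assumes <stdint.h>):
;
;     static inline void dct4_1d( int16_t out[4], const int16_t in[4] )
;     {
;         int s03 = in[0] + in[3], d03 = in[0] - in[3];
;         int s12 = in[1] + in[2], d12 = in[1] - in[2];
;         out[0] = s03 + s12;       /* DC              */
;         out[1] = 2*d03 + d12;     /* mm0=2.d03+d12   */
;         out[2] = s03 - s12;
;         out[3] = d03 - 2*d12;     /* mm4=d03-2.d12   */
;     }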
    MMX_SUMSUB_BA   mm2, mm3            ; mm2=s03+s12      mm3=s03-s12
    MMX_SUMSUB2_AB  mm0, mm1, mm4       ; mm0=2.d03+d12    mm4=d03-2.d12

    ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
    MMX_TRANSPOSE   mm2, mm0, mm3, mm4, mm1

    MMX_SUMSUB_BADC mm3, mm2, mm1, mm4  ; mm3=s03  mm2=d03  mm1=s12  mm4=d12
    MMX_SUMSUB_BA   mm1, mm3            ; mm1=s03+s12      mm3=s03-s12
    MMX_SUMSUB2_AB  mm2, mm4, mm0       ; mm2=2.d03+d12    mm0=d03-2.d12

    ; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
    MMX_TRANSPOSE   mm1, mm2, mm3, mm0, mm4

    movq    [r10+ 0], mm1               ; dct
    movq    [r10+ 8], mm0
    movq    [r10+16], mm4
    movq    [r10+24], mm3

    pop     rbx
    ret
endfunc

cglobal x264_add4x4_idct_mmxext

ALIGN 16
;-----------------------------------------------------------------------------
;   void x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
x264_add4x4_idct_mmxext:
    ; Load dct coeffs
    movq    mm0, [parm3q+ 0]            ; dct
    movq    mm4, [parm3q+ 8]
    movq    mm3, [parm3q+16]
    movq    mm1, [parm3q+24]

    mov     rax, parm1q                 ; p_dst
    movsxd  rcx, parm2d                 ; i_dst
    lea     rdx, [rcx+rcx*2]

    ; out: mm0, mm1, mm2, mm3
    MMX_TRANSPOSE   mm0, mm4, mm3, mm1, mm2

    MMX_SUMSUB_BA   mm2, mm0            ; mm2=s02  mm0=d02
    MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4  ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
    MMX_SUMSUB_BADC mm1, mm2, mm4, mm0  ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13

    ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
    MMX_TRANSPOSE   mm1, mm4, mm0, mm2, mm3

    MMX_SUMSUB_BA   mm3, mm1            ; mm3=s02  mm1=d02
    MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4  ; mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
    MMX_SUMSUB_BADC mm2, mm3, mm4, mm1  ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13

    MMX_ZERO        mm7
    movq            mm6, [pw_32 GLOBAL]

    MMX_STORE_DIFF_4P  mm2, mm0, mm6, mm7, [rax]
    MMX_STORE_DIFF_4P  mm4, mm0, mm6, mm7, [rax+rcx]
    MMX_STORE_DIFF_4P  mm1, mm0, mm6, mm7, [rax+rcx*2]
    MMX_STORE_DIFF_4P  mm3, mm0, mm6, mm7, [rax+rdx]

    ret

; =============================================================================
; 8x8 Transform
; =============================================================================

; in:  ABCDEFGH
; out: FBCGEDHI
%macro DCT8_1D 10
    MMX_SUMSUB_BA   %8, %1              ; %8=s07, %1=d07
    MMX_SUMSUB_BA   %7, %2              ; %7=s16, %2=d16
    MMX_SUMSUB_BA   %6, %3              ; %6=s25, %3=d25
    MMX_SUMSUB_BA   %5, %4              ; %5=s34, %4=d34

    MMX_SUMSUB_BA   %5, %8              ; %5=a0, %8=a2
    MMX_SUMSUB_BA   %6, %7              ; %6=a1, %7=a3

    movdqa  %9, %1
    psraw   %9, 1
    paddw   %9, %1
    paddw   %9, %2
    paddw   %9, %3                      ; %9=a4

    movdqa  %10, %4
    psraw   %10, 1
    paddw   %10, %4
    paddw   %10, %2
    psubw   %10, %3                     ; %10=a7

    MMX_SUMSUB_BA   %4, %1
    psubw   %1, %3
    psubw   %4, %2
    psraw   %3, 1
    psraw   %2, 1
    psubw   %1, %3                      ; %1=a5
    psubw   %4, %2                      ; %4=a6

    MMX_SUMSUB_BA   %6, %5              ; %6=b0, %5=b4

    movdqa  %2, %10
    psraw   %2, 2
    paddw   %2, %9                      ; %2=b1
    psraw   %9, 2
    psubw   %9, %10                     ; %9=b7

    movdqa  %3, %7
    psraw   %3, 1
    paddw   %3, %8                      ; %3=b2
    psraw   %8, 1
    psubw   %8, %7                      ; %8=b6

    movdqa  %7, %4
    psraw   %7, 2
    paddw   %7, %1                      ; %7=b3
    psraw   %1, 2
    psubw   %4, %1                      ; %4=b5
%endmacro
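; For reference, DCT8_1D mirrors the scalar 8-point stage sketched below
; (an assumed C reference, not part of this file's build; the s*/d*/a*/b*
; names match the register comments above, and the function name dct8_1d
; is illustrative). The asm version leaves the eight results scrambled
; across registers (out: FBCGEDHI) instead of storing them in order:
;
;     static inline void dct8_1d( int16_t d[8] )
;     {
;         int s07 = d[0] + d[7], d07 = d[0] - d[7];
;         int s16 = d[1] + d[6], d16 = d[1] - d[6];
;         int s25 = d[2] + d[5], d25 = d[2] - d[5];
;         int s34 = d[3] + d[4], d34 = d[3] - d[4];
;         int a0 = s07 + s34,    a2 = s07 - s34;
;         int a1 = s16 + s25,    a3 = s16 - s25;
;         int a4 = d16 + d25 + d07 + (d07>>1);
;         int a5 = d07 - d34 - d25 - (d25>>1);
;         int a6 = d07 + d34 - d16 - (d16>>1);
;         int a7 = d16 - d25 + d34 + (d34>>1);
;         d[0] = a0 + a1;        d[4] = a0 - a1;
;         d[1] = a4 + (a7>>2);   d[7] = (a4>>2) - a7;
;         d[2] = a2 + (a3>>1);   d[6] = (a2>>1) - a3;
;         d[3] = a5 + (a6>>2);   d[5] = a6 - (a5>>2);
;     }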
cglobal x264_sub8x8_dct8_sse2

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;-----------------------------------------------------------------------------
x264_sub8x8_dct8_sse2:
;   mov     rdi, rdi                    ; dct
;   mov     rsi, rsi                    ; pix1
    movsxd  rdx, edx                    ; i_pix1
;   mov     rcx, rcx                    ; pix2
    movsxd  r8, r8d                     ; i_pix2

    MMX_ZERO  xmm9

    MMX_LOAD_DIFF_8P  xmm0, xmm8, xmm9, [rsi      ], [rcx]
    MMX_LOAD_DIFF_8P  xmm1, xmm8, xmm9, [rsi+rdx  ], [rcx+r8]
    MMX_LOAD_DIFF_8P  xmm2, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
    lea     r9,  [rdx+rdx*2]
    lea     r10, [r8+r8*2]
    add     rsi, r9
    add     rcx, r10
    MMX_LOAD_DIFF_8P  xmm3, xmm8, xmm9, [rsi      ], [rcx]
    MMX_LOAD_DIFF_8P  xmm4, xmm8, xmm9, [rsi+rdx  ], [rcx+r8]
    MMX_LOAD_DIFF_8P  xmm5, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
    MMX_LOAD_DIFF_8P  xmm6, xmm8, xmm9, [rsi+r9   ], [rcx+r10]
    MMX_LOAD_DIFF_8P  xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]

    SSE2_TRANSPOSE8x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    DCT8_1D            xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm6, xmm9
    SSE2_TRANSPOSE8x8  xmm4, xmm5, xmm7, xmm2, xmm8, xmm3, xmm1, xmm6, xmm0
    DCT8_1D            xmm4, xmm3, xmm6, xmm2, xmm0, xmm8, xmm7, xmm5, xmm1, xmm9

    movdqa  [rdi+0x00], xmm8
    movdqa  [rdi+0x10], xmm3
    movdqa  [rdi+0x20], xmm6
    movdqa  [rdi+0x30], xmm7
    movdqa  [rdi+0x40], xmm0
    movdqa  [rdi+0x50], xmm2
    movdqa  [rdi+0x60], xmm5
    movdqa  [rdi+0x70], xmm1

    ret

; in:  ABCDEFGH
; out: IBHDEACG
%macro IDCT8_1D 10
    MMX_SUMSUB_BA   %5, %1              ; %5=a0, %1=a2

    movdqa  %10, %3
    psraw   %3, 1
    psubw   %3, %7                      ; %3=a4
    psraw   %7, 1
    paddw   %7, %10                     ; %7=a6

    movdqa  %9, %2
    psraw   %9, 1
    paddw   %9, %2
    paddw   %9, %4
    paddw   %9, %6                      ; %9=a7

    movdqa  %10, %6
    psraw   %10, 1
    paddw   %10, %6
    paddw   %10, %8
    psubw   %10, %2                     ; %10=a5

    psubw   %2, %4
    psubw   %6, %4
    paddw   %2, %8
    psubw   %6, %8
    psraw   %4, 1
    psraw   %8, 1
    psubw   %2, %4                      ; %2=a3
    psubw   %6, %8                      ; %6=a1

    MMX_SUMSUB_BA   %7, %5              ; %7=b0, %5=b6
    MMX_SUMSUB_BA   %3, %1              ; %3=b2, %1=b4

    movdqa  %4, %9
    psraw   %4, 2
    paddw   %4, %6                      ; %4=b1
    psraw   %6, 2
    psubw   %9, %6                      ; %9=b7

    movdqa  %8, %10
    psraw   %8, 2
    paddw   %8, %2                      ; %8=b3
    psraw   %2, 2
    psubw   %2, %10                     ; %2=b5

    MMX_SUMSUB_BA   %9, %7              ; %9=c0, %7=c7
    MMX_SUMSUB_BA   %2, %3              ; %2=c1, %3=c6
    MMX_SUMSUB_BA   %8, %1              ; %8=c2, %1=c5
    MMX_SUMSUB_BA   %4, %5              ; %4=c3, %5=c4
%endmacro

cglobal x264_add8x8_idct8_sse2

ALIGN 16
;-----------------------------------------------------------------------------
;   void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int i_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
x264_add8x8_idct8_sse2:
    movsxd  rsi, esi                    ; i_dst
    movdqa  xmm0, [rdx+0x00]            ; dct
    movdqa  xmm1, [rdx+0x10]
    movdqa  xmm2, [rdx+0x20]
    movdqa  xmm3, [rdx+0x30]
    movdqa  xmm4, [rdx+0x40]
    movdqa  xmm5, [rdx+0x50]
    movdqa  xmm6, [rdx+0x60]
    movdqa  xmm7, [rdx+0x70]

    SSE2_TRANSPOSE8x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    IDCT8_1D           xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm9, xmm6
    SSE2_TRANSPOSE8x8  xmm9, xmm5, xmm1, xmm3, xmm8, xmm0, xmm7, xmm2, xmm4
    paddw              xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
    IDCT8_1D           xmm9, xmm0, xmm2, xmm3, xmm4, xmm8, xmm1, xmm5, xmm6, xmm7

    MMX_ZERO           xmm15
    MMX_STORE_DIFF_8P  xmm6, xmm14, xmm15, [rdi]
    MMX_STORE_DIFF_8P  xmm0, xmm14, xmm15, [rdi+rsi]
    MMX_STORE_DIFF_8P  xmm5, xmm14, xmm15, [rdi+rsi*2]
    lea     rax, [rsi+rsi*2]
    add     rdi, rax
    MMX_STORE_DIFF_8P  xmm3, xmm14, xmm15, [rdi]
    MMX_STORE_DIFF_8P  xmm4, xmm14, xmm15, [rdi+rsi]
    MMX_STORE_DIFF_8P  xmm9, xmm14, xmm15, [rdi+rsi*2]
    MMX_STORE_DIFF_8P  xmm2, xmm14, xmm15, [rdi+rax]
    MMX_STORE_DIFF_8P  xmm1, xmm14, xmm15, [rdi+rsi*4]
    ret
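; For reference, IDCT8_1D corresponds to the scalar 8-point inverse stage
; sketched below (an assumed C reference, not part of this file's build;
; the a*/b* names match the register comments above, and idct8_1d is an
; illustrative name). The pw_32 bias added between the two passes, together
; with the shift-and-pack in MMX_STORE_DIFF_8P, implements the final
; dst = clip( dst + ((res + 32) >> 6) ) of the H.264 inverse transform:
;
;     static inline void idct8_1d( int16_t d[8] )
;     {
;         int a0 = d[0] + d[4],      a2 = d[0] - d[4];
;         int a4 = (d[2]>>1) - d[6], a6 = (d[6]>>1) + d[2];
;         int a1 = d[5] - d[3] - d[7] - (d[7]>>1);
;         int a3 = d[1] + d[7] - d[3] - (d[3]>>1);
;         int a5 = d[7] - d[1] + d[5] + (d[5]>>1);
;         int a7 = d[3] + d[5] + d[1] + (d[1]>>1);
;         int b0 = a0 + a6,          b6 = a0 - a6;
;         int b2 = a2 + a4,          b4 = a2 - a4;
;         int b1 = a1 + (a7>>2),     b7 = a7 - (a1>>2);
;         int b3 = a3 + (a5>>2),     b5 = (a3>>2) - a5;
;         d[0] = b0 + b7;  d[7] = b0 - b7;
;         d[1] = b2 + b5;  d[6] = b2 - b5;
;         d[2] = b4 + b3;  d[5] = b4 - b3;
;         d[3] = b6 + b1;  d[4] = b6 - b1;
;     }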