📄 downmix.c
字号:
"subps %%xmm0, %%xmm1 \n\t" "addps %%xmm0, %%xmm2 \n\t" "movaps %%xmm1, (%0, %%esi) \n\t" "movaps %%xmm2, 1024(%0, %%esi) \n\t" "addl $16, %%esi \n\t" " jnz 1b \n\t" :: "r" (samples+256), "m" (bias) : "%esi" );}

/*
 * In the comments below, "plane k" means the 1024 bytes that start
 * 1024*k bytes after `samples`.  The loops index with byte offsets from
 * samples+256, so they assume a 4-byte sample_t (256 samples per plane).
 */

/*
 * Mixes five planes down to two, in place:
 *   plane0 = plane0 + plane3 + (plane1 + bias)
 *   plane1 = plane2 + plane4 + (plane1 + bias)
 * Uses aligned SSE loads/stores (movaps), four floats per iteration.
 * The "memory" clobber tells the compiler that the asm rewrites the
 * sample buffer, which is not listed as an operand.
 */
static void mix32to2_SSE (sample_t * samples, sample_t bias)
{
    asm volatile(
        "movlps %1, %%xmm7 \n\t"
        "shufps $0x00, %%xmm7, %%xmm7 \n\t" /* broadcast bias to all lanes */
        "movl $-1024, %%esi \n\t"           /* byte index runs -1024 .. 0 */
        ".balign 16\n\t"
        "1: \n\t"
        "movaps 1024(%0, %%esi), %%xmm0 \n\t"
        "addps %%xmm7, %%xmm0 \n\t" // common
        "movaps %%xmm0, %%xmm1 \n\t" // common
        "addps (%0, %%esi), %%xmm0 \n\t"
        "addps 2048(%0, %%esi), %%xmm1 \n\t"
        "addps 3072(%0, %%esi), %%xmm0 \n\t"
        "addps 4096(%0, %%esi), %%xmm1 \n\t"
        "movaps %%xmm0, (%0, %%esi) \n\t"
        "movaps %%xmm1, 1024(%0, %%esi) \n\t"
        "addl $16, %%esi \n\t"
        " jnz 1b \n\t"
    :: "r" (samples+256), "m" (bias)
    : "%esi", "memory"
    );
}

/*
 * Mixes five planes down to two, in place, with the summed planes 3 and 4
 * subtracted from one output and added to the other:
 *   surround = plane3 + plane4
 *   plane0   = plane0 - surround + (plane1 + bias)
 *   plane1   = plane2 + surround + (plane1 + bias)
 */
static void mix32toS_SSE (sample_t * samples, sample_t bias)
{
    asm volatile(
        "movlps %1, %%xmm7 \n\t"
        "shufps $0x00, %%xmm7, %%xmm7 \n\t" /* broadcast bias to all lanes */
        "movl $-1024, %%esi \n\t"
        ".balign 16\n\t"
        "1: \n\t"
        "movaps 1024(%0, %%esi), %%xmm0 \n\t"
        "movaps 3072(%0, %%esi), %%xmm2 \n\t"
        "addps %%xmm7, %%xmm0 \n\t" // common
        "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround
        "movaps (%0, %%esi), %%xmm1 \n\t"
        "movaps 2048(%0, %%esi), %%xmm3 \n\t"
        "subps %%xmm2, %%xmm1 \n\t"
        "addps %%xmm2, %%xmm3 \n\t"
        "addps %%xmm0, %%xmm1 \n\t"
        "addps %%xmm0, %%xmm3 \n\t"
        "movaps %%xmm1, (%0, %%esi) \n\t"
        "movaps %%xmm3, 1024(%0, %%esi) \n\t"
        "addl $16, %%esi \n\t"
        " jnz 1b \n\t"
    :: "r" (samples+256), "m" (bias)
    : "%esi", "memory"
    );
}

/*
 * Sums two adjacent planes of `src` plus bias into one plane of `dest`:
 *   dest[i] = src[i] + src[i + 256] + bias
 * Processes eight floats (32 bytes) per iteration with aligned accesses.
 */
static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)
{
    asm volatile(
        "movlps %2, %%xmm7 \n\t"
        "shufps $0x00, %%xmm7, %%xmm7 \n\t" /* broadcast bias to all lanes */
        "movl $-1024, %%esi \n\t"
        ".balign 16\n\t"
        "1: \n\t"
        "movaps (%0, %%esi), %%xmm0 \n\t"
        "movaps 16(%0, %%esi), %%xmm1 \n\t"
        "addps 1024(%0, %%esi), %%xmm0 \n\t"
        "addps 1040(%0, %%esi), %%xmm1 \n\t"
        "addps %%xmm7, %%xmm0 \n\t"
        "addps %%xmm7, %%xmm1 \n\t"
        "movaps %%xmm0, (%1, %%esi) \n\t"
        "movaps %%xmm1, 16(%1, %%esi) \n\t"
        "addl $32, %%esi \n\t"
        " jnz 1b \n\t"
    :: "r" (src+256), "r" (dest+256), "m" (bias)
    : "%esi", "memory"
    );
}

/*
 * Clears one plane (1024 bytes starting at `samples`) using MMX stores,
 * 32 bytes per iteration, and executes emms before returning.
 */
static void zero_MMX(sample_t * samples)
{
    asm volatile(
        "movl $-1024, %%esi \n\t"
        "pxor %%mm0, %%mm0 \n\t"
        ".balign 16\n\t"
        "1: \n\t"
        "movq %%mm0, (%0, %%esi) \n\t"
        "movq %%mm0, 8(%0, %%esi) \n\t"
        "movq %%mm0, 16(%0, %%esi) \n\t"
        "movq %%mm0, 24(%0, %%esi) \n\t"
        "addl $32, %%esi \n\t"
        " jnz 1b \n\t"
        "emms"
    :: "r" (samples+256)
    : "%esi", "memory"
    );
}

/*
 * Copies `size` bytes from src to dest in 64-byte chunks through the
 * eight MMX registers.
 *
 * src and dest are expected to be at least 8-byte aligned, and size is
 * expected to be a multiple of 64: any remainder (size % 64 bytes) is
 * NOT copied.
 *
 * The buffers are passed to the asm as pointers in registers and are
 * advanced by 64 bytes after every chunk; emms is executed once at the
 * end so that x87 code can run afterwards.
 *
 * Note: untested and unused.
 */
static void copy_MMX(void *dest,const void *src,unsigned size)
{
    const unsigned char *from = src;
    unsigned char *to = dest;
    unsigned i;

    size /= 64;
    for(i=0;i<size;i++)
    {
        __asm __volatile(
            "movq (%0), %%mm0\n\t"
            "movq 8(%0), %%mm1\n\t"
            "movq 16(%0), %%mm2\n\t"
            "movq 24(%0), %%mm3\n\t"
            "movq 32(%0), %%mm4\n\t"
            "movq 40(%0), %%mm5\n\t"
            "movq 48(%0), %%mm6\n\t"
            "movq 56(%0), %%mm7\n\t"
            "movq %%mm0, (%1)\n\t"
            "movq %%mm1, 8(%1)\n\t"
            "movq %%mm2, 16(%1)\n\t"
            "movq %%mm3, 24(%1)\n\t"
            "movq %%mm4, 32(%1)\n\t"
            "movq %%mm5, 40(%1)\n\t"
            "movq %%mm6, 48(%1)\n\t"
            "movq %%mm7, 56(%1)\n\t"
            :
            : "r"(from), "r"(to)
            : "memory");
        from += 64;
        to += 64;
    }
    /* Leave MMX state so later floating-point code sees a clean x87 stack. */
    __asm __volatile("emms" : : : "memory");
}

/*
 * Selects and runs the SSE mixing routine for the conversion identified
 * by CONVERT (acmod, output & A52_CHANNEL_MASK).  Combinations without a
 * case label leave `samples` untouched.
 *
 *   samples  sample buffer, modified in place; the cases address it in
 *            256-sample steps (samples + 256, + 512, ...)
 *   acmod    source channel configuration
 *   output   requested output configuration (masked before use)
 *   bias     forwarded to the mixing routines
 *   clev     not read by this function
 *   slev     only compared against 0: when it is 0 several cases take a
 *            cheaper path or do nothing (presumably because the surround
 *            channels then contribute nothing -- confirm with the caller)
 */
static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
                         sample_t clev, sample_t slev)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
        memcpy (samples, samples + 256, 256 * sizeof (sample_t));
        break;

    case CONVERT (A52_CHANNEL, A52_MONO):
    case CONVERT (A52_STEREO, A52_MONO):
    mix_2to1_SSE:
        mix2to1_SSE (samples, samples + 256, bias);
        break;

    case CONVERT (A52_2F1R, A52_MONO):
        if (slev == 0)
            goto mix_2to1_SSE;
        /* fall through */
    case CONVERT (A52_3F, A52_MONO):
    mix_3to1_SSE:
        mix3to1_SSE (samples, bias);
        break;

    case CONVERT (A52_3F1R, A52_MONO):
        if (slev == 0)
            goto mix_3to1_SSE;
        /* fall through */
    case CONVERT (A52_2F2R, A52_MONO):
        if (slev == 0)
            goto mix_2to1_SSE;
        mix4to1_SSE (samples, bias);
        break;

    case CONVERT (A52_3F2R, A52_MONO):
        if (slev == 0)
            goto mix_3to1_SSE;
        mix5to1_SSE (samples, bias);
        break;

    case CONVERT (A52_MONO, A52_DOLBY):
        memcpy (samples + 256, samples, 256 * sizeof (sample_t));
        break;

    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F, A52_DOLBY):
    mix_3to2_SSE:
        mix3to2_SSE (samples, bias);
        break;

    case CONVERT (A52_2F1R, A52_STEREO):
        if (slev == 0)
            break;
        mix21to2_SSE (samples, samples + 256, bias);
        break;

    case CONVERT (A52_2F1R, A52_DOLBY):
        mix21toS_SSE (samples, bias);
        break;

    case CONVERT (A52_3F1R, A52_STEREO):
        if (slev == 0)
            goto mix_3to2_SSE;
        mix31to2_SSE (samples, bias);
        break;

    case CONVERT (A52_3F1R, A52_DOLBY):
        mix31toS_SSE (samples, bias);
        break;

    case CONVERT (A52_2F2R, A52_STEREO):
        if (slev == 0)
            break;
        mix2to1_SSE (samples, samples + 512, bias);
        mix2to1_SSE (samples + 256, samples + 768, bias);
        break;

    case CONVERT (A52_2F2R, A52_DOLBY):
        mix22toS_SSE (samples, bias);
        break;

    case CONVERT (A52_3F2R, A52_STEREO):
        if (slev == 0)
            goto mix_3to2_SSE;
        mix32to2_SSE (samples, bias);
        break;

    case CONVERT (A52_3F2R, A52_DOLBY):
        mix32toS_SSE (samples, bias);
        break;

    case CONVERT (A52_3F1R, A52_3F):
        if (slev == 0)
            break;
        mix21to2_SSE (samples, samples + 512, bias);
        break;

    case CONVERT (A52_3F2R, A52_3F):
        if (slev == 0)
            break;
        mix2to1_SSE (samples, samples + 768, bias);
        mix2to1_SSE (samples + 512, samples + 1024, bias);
        break;

    case CONVERT (A52_3F1R, A52_2F1R):
        mix3to2_SSE (samples, bias);
        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
        break;

    case CONVERT (A52_2F2R, A52_2F1R):
        mix2to1_SSE (samples + 512, samples + 768, bias);
        break;

    case CONVERT (A52_3F2R, A52_2F1R):
        mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesnt seem to be used)
        move2to1_SSE (samples + 768, samples + 512, bias);
        break;

    case CONVERT (A52_3F2R, A52_3F1R):
        mix2to1_SSE (samples + 768, samples + 1024, bias);
        break;

    case CONVERT (A52_2F1R, A52_2F2R):
        memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
        break;

    case CONVERT (A52_3F1R, A52_2F2R):
        mix3to2_SSE (samples, bias);
        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
        break;

    case CONVERT (A52_3F2R, A52_2F2R):
        mix3to2_SSE (samples, bias);
        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
        memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
        break;

    case CONVERT (A52_3F1R, A52_3F2R):
        memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
        break;
    }
}

/*
 * Rearranges the sample buffer for the conversion identified by
 * CONVERT (acmod, output & A52_CHANNEL_MASK): planes are cleared with
 * zero_MMX and/or moved with memcpy.  Most cases deliberately fall
 * through so that one entry point clears several consecutive planes.
 * Combinations without a case label leave `samples` untouched.
 */
static void upmix_MMX (sample_t * samples, int acmod, int output)
{
    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {

    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
        memcpy (samples + 256, samples, 256 * sizeof (sample_t));
        break;

    case CONVERT (A52_3F2R, A52_MONO):
        zero_MMX (samples + 1024);
        /* fall through */
    case CONVERT (A52_3F1R, A52_MONO):
    case CONVERT (A52_2F2R, A52_MONO):
        zero_MMX (samples + 768);
        /* fall through */
    case CONVERT (A52_3F, A52_MONO):
    case CONVERT (A52_2F1R, A52_MONO):
        zero_MMX (samples + 512);
        /* fall through */
    case CONVERT (A52_CHANNEL, A52_MONO):
    case CONVERT (A52_STEREO, A52_MONO):
        zero_MMX (samples + 256);
        break;

    case CONVERT (A52_3F2R, A52_STEREO):
    case CONVERT (A52_3F2R, A52_DOLBY):
        zero_MMX (samples + 1024);
        /* fall through */
    case CONVERT (A52_3F1R, A52_STEREO):
    case CONVERT (A52_3F1R, A52_DOLBY):
        zero_MMX (samples + 768);
        /* fall through */
    case CONVERT (A52_3F, A52_STEREO):
    case CONVERT (A52_3F, A52_DOLBY):
    mix_3to2_MMX:
        memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t));
        zero_MMX (samples + 256);
        break;

    case CONVERT (A52_2F2R, A52_STEREO):
    case CONVERT (A52_2F2R, A52_DOLBY):
        zero_MMX (samples + 768);
        /* fall through */
    case CONVERT (A52_2F1R, A52_STEREO):
    case CONVERT (A52_2F1R, A52_DOLBY):
        zero_MMX (samples + 512);
        break;

    case CONVERT (A52_3F2R, A52_3F):
        zero_MMX (samples + 1024);
        /* fall through */
    case CONVERT (A52_3F1R, A52_3F):
    case CONVERT (A52_2F2R, A52_2F1R):
        zero_MMX (samples + 768);
        break;

    case CONVERT (A52_3F2R, A52_3F1R):
        zero_MMX (samples + 1024);
        break;

    case CONVERT (A52_3F2R, A52_2F1R):
        zero_MMX (samples + 1024);
        /* fall through */
    case CONVERT (A52_3F1R, A52_2F1R):
    mix_31to21_MMX:
        memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
        goto mix_3to2_MMX;

    case CONVERT (A52_3F2R, A52_2F2R):
        memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
        goto mix_31to21_MMX;
    }
}

/*
 * 3DNow! two-to-one mix, accumulating into `dest`:
 *   dest[i] = src[i] + dest[i] + bias     for one plane
 * Processes eight floats (32 bytes) per iteration.  No emms/femms is
 * issued here; presumably the caller resets the MMX state -- confirm.
 */
static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias)
{
    asm volatile(
        "movd %2, %%mm7 \n\t"
        "punpckldq %2, %%mm7 \n\t"  /* bias in both halves of mm7 */
        "movl $-1024, %%esi \n\t"
        ".balign 16\n\t"
        "1: \n\t"
        "movq (%0, %%esi), %%mm0 \n\t"
        "movq 8(%0, %%esi), %%mm1 \n\t"
        "movq 16(%0, %%esi), %%mm2 \n\t"
        "movq 24(%0, %%esi), %%mm3 \n\t"
        "pfadd (%1, %%esi), %%mm0 \n\t"
        "pfadd 8(%1, %%esi), %%mm1 \n\t"
        "pfadd 16(%1, %%esi), %%mm2 \n\t"
        "pfadd 24(%1, %%esi), %%mm3 \n\t"
        "pfadd %%mm7, %%mm0 \n\t"
        "pfadd %%mm7, %%mm1 \n\t"
        "pfadd %%mm7, %%mm2 \n\t"
        "pfadd %%mm7, %%mm3 \n\t"
        "movq %%mm0, (%1, %%esi) \n\t"
        "movq %%mm1, 8(%1, %%esi) \n\t"
        "movq %%mm2, 16(%1, %%esi) \n\t"
        "movq %%mm3, 24(%1, %%esi) \n\t"
        "addl $32, %%esi \n\t"
        " jnz 1b \n\t"
    :: "r" (src+256), "r" (dest+256), "m" (bias)
    : "%esi", "memory"
    );
}

/*
 * 3DNow! three-to-one mix, in place:
 *   plane0 = plane0 + plane1 + plane2 + bias
 * Processes four floats (16 bytes) per iteration.
 */
static void mix3to1_3dnow (sample_t * samples, sample_t bias)
{
    asm volatile(
        "movd %1, %%mm7 \n\t"
        "punpckldq %1, %%mm7 \n\t"  /* bias in both halves of mm7 */
        "movl $-1024, %%esi \n\t"
        ".balign 16\n\t"
        "1: \n\t"
        "movq (%0, %%esi), %%mm0 \n\t"
        "movq 8(%0, %%esi), %%mm1 \n\t"
        "movq 1024(%0, %%esi), %%mm2 \n\t"
        "movq 1032(%0, %%esi), %%mm3 \n\t"
        "pfadd 2048(%0, %%esi), %%mm0 \n\t"
        "pfadd 2056(%0, %%esi), %%mm1 \n\t"
        "pfadd %%mm7, %%mm0 \n\t"
        "pfadd %%mm7, %%mm1 \n\t"
        "pfadd %%mm2, %%mm0 \n\t"
        "pfadd %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%0, %%esi) \n\t"
        "movq %%mm1, 8(%0, %%esi) \n\t"
        "addl $16, %%esi \n\t"
        " jnz 1b \n\t"
    :: "r" (samples+256), "m" (bias)
    : "%esi", "memory"
    );
}

/*
 * 3DNow! four-to-one mix, in place:
 *   plane0 = plane0 + plane1 + plane2 + plane3 + bias
 * Processes four floats (16 bytes) per iteration.
 */
static void mix4to1_3dnow (sample_t * samples, sample_t bias)
{
    asm volatile(
        "movd %1, %%mm7 \n\t"
        "punpckldq %1, %%mm7 \n\t"  /* bias in both halves of mm7 */
        "movl $-1024, %%esi \n\t"
        ".balign 16\n\t"
        "1: \n\t"
        "movq (%0, %%esi), %%mm0 \n\t"
        "movq 8(%0, %%esi), %%mm1 \n\t"
        "movq 1024(%0, %%esi), %%mm2 \n\t"
        "movq 1032(%0, %%esi), %%mm3 \n\t"
        "pfadd 2048(%0, %%esi), %%mm0 \n\t"
        "pfadd 2056(%0, %%esi), %%mm1 \n\t"
        "pfadd 3072(%0, %%esi), %%mm2 \n\t"
        "pfadd 3080(%0, %%esi), %%mm3 \n\t"
        "pfadd %%mm7, %%mm0 \n\t"
        "pfadd %%mm7, %%mm1 \n\t"
        "pfadd %%mm2, %%mm0 \n\t"
        "pfadd %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%0, %%esi) \n\t"
        "movq %%mm1, 8(%0, %%esi) \n\t"
        "addl $16, %%esi \n\t"
        " jnz 1b \n\t"
    :: "r" (samples+256), "m" (bias)
    : "%esi", "memory"
    );
}

static void mix5to1_3dnow (sample_t * samples, sample_t bias)
{
    asm volatile(
        "movd %1, %%mm7 \n\t"
        "punpckldq %1, %%mm7 \n\t"
        "movl $-1024, %%esi \n\t"
        ".balign 16\n\t"
        "1: \n\t"
        "movq (%0, %%esi), %%mm0 \n\t"
        "movq 8(%0, %%esi), %%mm1 \n\t"
        "movq 1024(%0, %%esi), %%mm2 \n\t"
        "movq 1032(%0, %%esi), %%mm3 \n\t"
        "pfadd 2048(%0, %%esi), %%mm0 \n\t"
        "pfadd 2056(%0, %%esi), %%mm1 \n\t"
        "pfadd 3072(%0, %%esi), %%mm2 \n\t"
        "pfadd 3080(%0, %%esi), %%mm3 \n\t"
        "pfadd %%mm7, %%mm0 \n\t"
        "pfadd %%mm7, %%mm1 \n\t"
        "pfadd 4096(%0, %%esi), %%mm2 \n\t"
        "pfadd 4104(%0, %%esi), %%mm3 \n\t"
        "pfadd %%mm2, %%mm0 \n\t"
        "pfadd %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%0, %%esi) \n\t"
        "movq %%mm1, 8(%0, %%esi) \n\t"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -