📄 liba52_changes.diff
字号:
+ " jnz 1b \n\t"+ :: "r" (left+256), "r" (right+256), "m" (bias)+ : "%esi"+ );+}++static void mix21toS_SSE (sample_t * samples, sample_t bias)+{+ asm volatile(+ "movlps %1, %%xmm7 \n\t"+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround+ "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 1024(%0, %%esi), %%xmm2 \n\t"+ "addps %%xmm7, %%xmm1 \n\t"+ "addps %%xmm7, %%xmm2 \n\t"+ "subps %%xmm0, %%xmm1 \n\t"+ "addps %%xmm0, %%xmm2 \n\t"+ "movaps %%xmm1, (%0, %%esi) \n\t"+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"+ "addl $16, %%esi \n\t"+ " jnz 1b \n\t"+ :: "r" (samples+256), "m" (bias)+ : "%esi"+ );+}++static void mix31to2_SSE (sample_t * samples, sample_t bias)+{+ asm volatile(+ "movlps %1, %%xmm7 \n\t"+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "addps 3072(%0, %%esi), %%xmm0 \n\t" + "addps %%xmm7, %%xmm0 \n\t" // common+ "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 2048(%0, %%esi), %%xmm2 \n\t"+ "addps %%xmm0, %%xmm1 \n\t"+ "addps %%xmm0, %%xmm2 \n\t"+ "movaps %%xmm1, (%0, %%esi) \n\t"+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"+ "addl $16, %%esi \n\t"+ " jnz 1b \n\t"+ :: "r" (samples+256), "m" (bias)+ : "%esi"+ );+}++static void mix31toS_SSE (sample_t * samples, sample_t bias)+{+ asm volatile(+ "movlps %1, %%xmm7 \n\t"+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround+ "addps %%xmm7, %%xmm0 \n\t" // common+ "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 2048(%0, %%esi), %%xmm2 \n\t"+ "addps %%xmm0, %%xmm1 \n\t"+ "addps %%xmm0, %%xmm2 \n\t"+ "subps %%xmm3, %%xmm1 \n\t"+ "addps %%xmm3, %%xmm2 \n\t"+ "movaps %%xmm1, (%0, %%esi) \n\t"+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"+ "addl $16, %%esi \n\t"+ " jnz 1b \n\t"+ :: "r" (samples+256), "m" (bias)+ : "%esi"+ );+}++static void mix22toS_SSE (sample_t * samples, sample_t bias)+{+ asm volatile(+ "movlps %1, %%xmm7 \n\t"+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movaps 2048(%0, %%esi), %%xmm0 \n\t" + "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround+ "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 1024(%0, %%esi), %%xmm2 \n\t"+ "addps %%xmm7, %%xmm1 \n\t"+ "addps %%xmm7, %%xmm2 \n\t"+ "subps %%xmm0, %%xmm1 \n\t"+ "addps %%xmm0, %%xmm2 \n\t"+ "movaps %%xmm1, (%0, %%esi) \n\t"+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"+ "addl $16, %%esi \n\t"+ " jnz 1b \n\t"+ :: "r" (samples+256), "m" (bias)+ : "%esi"+ );+}++static void mix32to2_SSE (sample_t * samples, sample_t bias)+{+ asm volatile(+ "movlps %1, %%xmm7 \n\t"+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "addps %%xmm7, %%xmm0 \n\t" // common+ "movaps %%xmm0, %%xmm1 \n\t" // common+ "addps (%0, %%esi), %%xmm0 \n\t" + "addps 2048(%0, %%esi), %%xmm1 \n\t" + "addps 3072(%0, %%esi), %%xmm0 \n\t" + "addps 4096(%0, %%esi), %%xmm1 \n\t" + "movaps %%xmm0, (%0, %%esi) \n\t"+ "movaps %%xmm1, 1024(%0, %%esi) \n\t"+ "addl $16, %%esi \n\t"+ " jnz 1b \n\t"+ :: "r" (samples+256), "m" (bias)+ : "%esi"+ );+}++static void mix32toS_SSE (sample_t * samples, sample_t bias)+{+ asm volatile(+ "movlps %1, %%xmm7 \n\t"+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" + "movaps 3072(%0, %%esi), %%xmm2 \n\t" + "addps %%xmm7, %%xmm0 \n\t" // common+ "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround + "movaps (%0, %%esi), %%xmm1 \n\t" + "movaps 2048(%0, %%esi), %%xmm3 \n\t" + "subps %%xmm2, %%xmm1 \n\t" + "addps %%xmm2, %%xmm3 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm3 \n\t" + "movaps %%xmm1, (%0, %%esi) \n\t"+ "movaps %%xmm3, 1024(%0, %%esi) \n\t"+ "addl $16, %%esi \n\t"+ " jnz 1b \n\t"+ :: "r" (samples+256), "m" (bias)+ : "%esi"+ );+}++static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)+{+ asm volatile(+ "movlps %2, %%xmm7 \n\t"+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movaps (%0, %%esi), %%xmm0 \n\t" + "movaps 16(%0, %%esi), %%xmm1 \n\t" + "addps 1024(%0, %%esi), %%xmm0 \n\t"+ "addps 1040(%0, %%esi), %%xmm1 \n\t"+ "addps %%xmm7, %%xmm0 \n\t"+ "addps %%xmm7, %%xmm1 \n\t"+ "movaps %%xmm0, (%1, %%esi) \n\t"+ "movaps %%xmm1, 16(%1, %%esi) \n\t"+ "addl $32, %%esi \n\t"+ " jnz 1b \n\t"+ :: "r" (src+256), "r" (dest+256), "m" (bias)+ : "%esi"+ );+}++static void zero_MMX(sample_t * samples)+{+ asm volatile(+ "movl $-1024, %%esi \n\t"+ "pxor %%mm0, %%mm0 \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movq %%mm0, (%0, %%esi) \n\t"+ "movq %%mm0, 8(%0, %%esi) \n\t"+ "movq %%mm0, 16(%0, %%esi) \n\t"+ "movq %%mm0, 24(%0, %%esi) \n\t"+ "addl $32, %%esi \n\t"+ " jnz 1b \n\t"+ "emms"+ :: "r" (samples+256)+ : "%esi"+ );+}++/*+ I hope dest and src will be at least 8 byte aligned and size+ will devide on 8 without remain+ Note: untested and unused.+*/+static void copy_MMX(void *dest,const void *src,unsigned size)+{+ unsigned i;+ size /= 64;+ for(i=0;i<size;i++)+ {+ __asm __volatile(+ "movq %0, %%mm0\n\t"+ "movq 8%0, %%mm1\n\t"+ "movq 16%0, %%mm2\n\t"+ "movq 24%0, %%mm3\n\t"+ "movq 32%0, %%mm4\n\t"+ "movq 40%0, %%mm5\n\t"+ "movq 48%0, %%mm6\n\t"+ "movq 56%0, %%mm7\n\t"+ "movq %%mm0, %1\n\t"+ "movq %%mm1, 8%1\n\t"+ "movq %%mm2, 16%1\n\t"+ "movq %%mm3, 24%1\n\t"+ "movq %%mm4, 32%1\n\t"+ "movq %%mm5, 40%1\n\t"+ "movq %%mm6, 48%1\n\t"+ "movq %%mm7, 56%1\n\t"+ :+ :"m"(src),"m"(dest));+ }+}++static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,+ sample_t clev, sample_t slev)+{+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {++ case CONVERT (A52_CHANNEL, A52_CHANNEL2):+ memcpy (samples, samples + 256, 256 * sizeof (sample_t));+ break;++ case CONVERT (A52_CHANNEL, A52_MONO):+ case CONVERT (A52_STEREO, A52_MONO):+ mix_2to1_SSE:+ mix2to1_SSE (samples, samples + 256, bias);+ break;++ case CONVERT (A52_2F1R, A52_MONO):+ if (slev == 0)+ goto mix_2to1_SSE;+ case CONVERT (A52_3F, A52_MONO):+ mix_3to1_SSE:+ mix3to1_SSE (samples, bias);+ break;++ case CONVERT (A52_3F1R, A52_MONO):+ if (slev == 0)+ goto mix_3to1_SSE;+ case CONVERT (A52_2F2R, A52_MONO):+ if (slev == 0)+ goto mix_2to1_SSE;+ mix4to1_SSE (samples, bias);+ break;++ case CONVERT (A52_3F2R, A52_MONO):+ if (slev == 0)+ goto mix_3to1_SSE;+ mix5to1_SSE (samples, bias);+ break;++ case CONVERT (A52_MONO, A52_DOLBY):+ memcpy (samples + 256, samples, 256 * sizeof (sample_t));+ break;++ case CONVERT (A52_3F, A52_STEREO):+ case CONVERT (A52_3F, A52_DOLBY):+ mix_3to2_SSE:+ mix3to2_SSE (samples, bias);+ break;++ case CONVERT (A52_2F1R, A52_STEREO):+ if (slev == 0)+ break;+ mix21to2_SSE (samples, samples + 256, bias);+ break;++ case CONVERT (A52_2F1R, A52_DOLBY):+ mix21toS_SSE (samples, bias);+ break;++ case CONVERT (A52_3F1R, A52_STEREO):+ if (slev == 0)+ goto mix_3to2_SSE;+ mix31to2_SSE (samples, bias);+ break;++ case CONVERT (A52_3F1R, A52_DOLBY):+ mix31toS_SSE (samples, bias);+ break;++ case CONVERT (A52_2F2R, A52_STEREO):+ if (slev == 0)+ break;+ mix2to1_SSE (samples, samples + 512, bias);+ mix2to1_SSE (samples + 256, samples + 768, bias);+ break;++ case CONVERT (A52_2F2R, A52_DOLBY):+ mix22toS_SSE (samples, bias);+ break;++ case CONVERT (A52_3F2R, A52_STEREO):+ if (slev == 0)+ goto mix_3to2_SSE;+ mix32to2_SSE (samples, bias);+ break;++ case CONVERT (A52_3F2R, A52_DOLBY):+ mix32toS_SSE (samples, bias);+ break;++ case CONVERT (A52_3F1R, A52_3F):+ if (slev == 0)+ break;+ mix21to2_SSE (samples, samples + 512, bias);+ break;++ case CONVERT (A52_3F2R, A52_3F):+ if (slev == 0)+ break;+ mix2to1_SSE (samples, samples + 768, bias);+ mix2to1_SSE (samples + 512, samples + 1024, bias);+ break;++ case CONVERT (A52_3F1R, A52_2F1R):+ mix3to2_SSE (samples, bias);+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));+ break;++ case CONVERT (A52_2F2R, A52_2F1R):+ mix2to1_SSE (samples + 512, samples + 768, bias);+ break;++ case CONVERT (A52_3F2R, A52_2F1R):+ mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesnt seem to be used)+ move2to1_SSE (samples + 768, samples + 512, bias);+ break;++ case CONVERT (A52_3F2R, A52_3F1R):+ mix2to1_SSE (samples + 768, samples + 1024, bias);+ break;++ case CONVERT (A52_2F1R, A52_2F2R):+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));+ break;++ case CONVERT (A52_3F1R, A52_2F2R):+ mix3to2_SSE (samples, bias);+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));+ break;++ case CONVERT (A52_3F2R, A52_2F2R):+ mix3to2_SSE (samples, bias);+ memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));+ memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));+ break;++ case CONVERT (A52_3F1R, A52_3F2R):+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));+ break;+ }+}++static void upmix_MMX (sample_t * samples, int acmod, int output)+{+ switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {++ case CONVERT (A52_CHANNEL, A52_CHANNEL2):+ memcpy (samples + 256, samples, 256 * sizeof (sample_t));+ break;++ case CONVERT (A52_3F2R, A52_MONO):+ zero_MMX (samples + 1024);+ case CONVERT (A52_3F1R, A52_MONO):+ case CONVERT (A52_2F2R, A52_MONO):+ zero_MMX (samples + 768);+ case CONVERT (A52_3F, A52_MONO):+ case CONVERT (A52_2F1R, A52_MONO):+ zero_MMX (samples + 512);+ case CONVERT (A52_CHANNEL, A52_MONO):+ case CONVERT (A52_STEREO, A52_MONO):+ zero_MMX (samples + 256);+ break;++ case CONVERT (A52_3F2R, A52_STEREO):+ case CONVERT (A52_3F2R, A52_DOLBY):+ zero_MMX (samples + 1024);+ case CONVERT (A52_3F1R, A52_STEREO):+ case CONVERT (A52_3F1R, A52_DOLBY):+ zero_MMX (samples + 768);+ case CONVERT (A52_3F, A52_STEREO):+ case CONVERT (A52_3F, A52_DOLBY):+ mix_3to2_MMX:+ memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t));+ zero_MMX (samples + 256);+ break;++ case CONVERT (A52_2F2R, A52_STEREO):+ case CONVERT (A52_2F2R, A52_DOLBY):+ zero_MMX (samples + 768);+ case CONVERT (A52_2F1R, A52_STEREO):+ case CONVERT (A52_2F1R, A52_DOLBY):+ zero_MMX (samples + 512);+ break;++ case CONVERT (A52_3F2R, A52_3F):+ zero_MMX (samples + 1024);+ case CONVERT (A52_3F1R, A52_3F):+ case CONVERT (A52_2F2R, A52_2F1R):+ zero_MMX (samples + 768);+ break;++ case CONVERT (A52_3F2R, A52_3F1R):+ zero_MMX (samples + 1024);+ break;++ case CONVERT (A52_3F2R, A52_2F1R):+ zero_MMX (samples + 1024);+ case CONVERT (A52_3F1R, A52_2F1R):+ mix_31to21_MMX:+ memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));+ goto mix_3to2_MMX;++ case CONVERT (A52_3F2R, A52_2F2R):+ memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));+ goto mix_31to21_MMX;+ }+}++static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias)+{+ asm volatile(+ "movd %2, %%mm7 \n\t"+ "punpckldq %2, %%mm7 \n\t"+ "movl $-1024, %%esi \n\t"+ ".balign 16\n\t"+ "1: \n\t"+ "movq (%0, %%esi), %%mm0 \n\t" + "movq 8(%0, %%esi), %%mm1 \n\t"+ "movq 16(%0, %%esi), %%mm2 \n\t" + "movq 24(%0, %%esi), %%mm3 \n\t"+ "pfadd (%1, %%esi), %%mm0 \n\t"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -