📄 liba52_changes.diff
+    case CONVERT (A52_3F, A52_STEREO):
+    case CONVERT (A52_3F, A52_DOLBY):
+    mix_3to2_3dnow:
+        mix3to2_3dnow (samples, bias);
+        break;
+
+    case CONVERT (A52_2F1R, A52_STEREO):
+        if (slev == 0)
+            break;
+        mix21to2_3dnow (samples, samples + 256, bias);
+        break;
+
+    case CONVERT (A52_2F1R, A52_DOLBY):
+        mix21toS_3dnow (samples, bias);
+        break;
+
+    case CONVERT (A52_3F1R, A52_STEREO):
+        if (slev == 0)
+            goto mix_3to2_3dnow;
+        mix31to2_3dnow (samples, bias);
+        break;
+
+    case CONVERT (A52_3F1R, A52_DOLBY):
+        mix31toS_3dnow (samples, bias);
+        break;
+
+    case CONVERT (A52_2F2R, A52_STEREO):
+        if (slev == 0)
+            break;
+        mix2to1_3dnow (samples, samples + 512, bias);
+        mix2to1_3dnow (samples + 256, samples + 768, bias);
+        break;
+
+    case CONVERT (A52_2F2R, A52_DOLBY):
+        mix22toS_3dnow (samples, bias);
+        break;
+
+    case CONVERT (A52_3F2R, A52_STEREO):
+        if (slev == 0)
+            goto mix_3to2_3dnow;
+        mix32to2_3dnow (samples, bias);
+        break;
+
+    case CONVERT (A52_3F2R, A52_DOLBY):
+        mix32toS_3dnow (samples, bias);
+        break;
+
+    case CONVERT (A52_3F1R, A52_3F):
+        if (slev == 0)
+            break;
+        mix21to2_3dnow (samples, samples + 512, bias);
+        break;
+
+    case CONVERT (A52_3F2R, A52_3F):
+        if (slev == 0)
+            break;
+        mix2to1_3dnow (samples, samples + 768, bias);
+        mix2to1_3dnow (samples + 512, samples + 1024, bias);
+        break;
+
+    case CONVERT (A52_3F1R, A52_2F1R):
+        mix3to2_3dnow (samples, bias);
+        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+        break;
+
+    case CONVERT (A52_2F2R, A52_2F1R):
+        mix2to1_3dnow (samples + 512, samples + 768, bias);
+        break;
+
+    case CONVERT (A52_3F2R, A52_2F1R):
+        mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
+        move2to1_3dnow (samples + 768, samples + 512, bias);
+        break;
+
+    case CONVERT (A52_3F2R, A52_3F1R):
+        mix2to1_3dnow (samples + 768, samples + 1024, bias);
+        break;
+
+    case CONVERT (A52_2F1R, A52_2F2R):
+        memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
+        break;
+
+    case CONVERT (A52_3F1R, A52_2F2R):
+        mix3to2_3dnow (samples, bias);
+        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+        break;
+
+    case CONVERT (A52_3F2R, A52_2F2R):
+        mix3to2_3dnow (samples, bias);
+        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
+        memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
+        break;
+
+    case CONVERT (A52_3F1R, A52_3F2R):
+        memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+        break;
+    }
+    __asm __volatile("femms":::"memory");
+}
+
+#endif // ARCH_X86
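For readers following the dispatch above: CONVERT packs the bitstream's channel coding mode (acmod) and the requested output mode into a single switch key, and the closing femms instruction resets the shared MMX/x87 register state so ordinary floating-point code is safe to run after the 3DNow! routines. A minimal standalone sketch of the key-packing idea; the macro body and the A52_* values mirror liba52's downmix.c and a52.h but should be treated as illustrative, not as part of this patch:

#include <stdio.h>

/* Assumed shape of liba52's dispatch key: the 3-bit input coding mode
 * in the low bits, the target output mode shifted above it, so every
 * (input, output) pair maps to a distinct case label. */
#define CONVERT(acmod, output) (((output) << 3) + (acmod))

/* Illustrative mode values following the usual liba52 ordering. */
enum { A52_STEREO = 2, A52_3F2R = 7 };

int main(void)
{
    /* 3 front + 2 rear downmixed to stereo selects one unique case. */
    printf("CONVERT(A52_3F2R, A52_STEREO) = %d\n",
           CONVERT(A52_3F2R, A52_STEREO));   /* prints 23 */
    return 0;
}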
--- liba52/imdct.c	2005-03-22 19:59:35.000000000 +0100
+++ imdct.c	2004-04-26 22:00:57.000000000 +0200
@@ -17,17 +23,32 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
+ * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
+ *   michael did port them from libac3 (untested, perhaps totally broken)
+ * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
  */
 
 #include "config.h"
 
-#include <inttypes.h>
 #include <math.h>
 #include <stdio.h>
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795029
+#endif
+#include <inttypes.h>
 
 #include "a52.h"
 #include "a52_internal.h"
 #include "mm_accel.h"
+#include "mangle.h"
+
+#ifdef RUNTIME_CPUDETECT
+#undef HAVE_3DNOWEX
+#endif
+
+#define USE_AC3_C
 
 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
@@ -37,9 +58,22 @@
     sample_t imag;
 } complex_t;
 
+static void fft_128p(complex_t *a);
+
+static const int pm128[128] attribute_used __attribute__((aligned(16))) =
+{
+    0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
+    4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
+    2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
+    6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
+    1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
+    5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
+    3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
+    7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
+};
 
 /* 128 point bit-reverse LUT */
-static uint8_t bit_reverse_512[] = {
+static uint8_t attribute_used bit_reverse_512[] = {
 	0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
 	0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
 	0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
@@ -67,23 +101,42 @@
 	0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
 	0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
 
-static complex_t buf[128];
+#ifdef ARCH_X86
+// NOTE: SSE needs 16byte alignment or it will segfault
+//
+static complex_t __attribute__((aligned(16))) buf[128];
+static float __attribute__((aligned(16))) sseSinCos1c[256];
+static float __attribute__((aligned(16))) sseSinCos1d[256];
+static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
+//static float __attribute__((aligned(16))) sseW0[4];
+static float __attribute__((aligned(16))) sseW1[8];
+static float __attribute__((aligned(16))) sseW2[16];
+static float __attribute__((aligned(16))) sseW3[32];
+static float __attribute__((aligned(16))) sseW4[64];
+static float __attribute__((aligned(16))) sseW5[128];
+static float __attribute__((aligned(16))) sseW6[256];
+static float __attribute__((aligned(16))) *sseW[7]=
+    {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
+static float __attribute__((aligned(16))) sseWindow[512];
+#else
+static complex_t __attribute__((aligned(16))) buf[128];
+#endif
 
 /* Twiddle factor LUT */
-static complex_t w_1[1];
-static complex_t w_2[2];
-static complex_t w_4[4];
-static complex_t w_8[8];
-static complex_t w_16[16];
-static complex_t w_32[32];
-static complex_t w_64[64];
-static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
+static complex_t __attribute__((aligned(16))) w_1[1];
+static complex_t __attribute__((aligned(16))) w_2[2];
+static complex_t __attribute__((aligned(16))) w_4[4];
+static complex_t __attribute__((aligned(16))) w_8[8];
+static complex_t __attribute__((aligned(16))) w_16[16];
+static complex_t __attribute__((aligned(16))) w_32[32];
+static complex_t __attribute__((aligned(16))) w_64[64];
+static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
 
 /* Twiddle factors for IMDCT */
-static sample_t xcos1[128];
-static sample_t xsin1[128];
-static sample_t xcos2[64];
-static sample_t xsin2[64];
+static sample_t __attribute__((aligned(16))) xcos1[128];
+static sample_t __attribute__((aligned(16))) xsin1[128];
+static sample_t __attribute__((aligned(16))) xcos2[64];
+static sample_t __attribute__((aligned(16))) xsin2[64];
 
 /* Windowing function for Modified DCT - Thank you acroread */
 sample_t imdct_window[] = {
@@ -145,16 +198,19 @@
 void imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
 {
-    int i,k;
+    int i;
+#ifndef USE_AC3_C
+    int k;
     int p,q;
     int m;
     int two_m;
     int two_m_plus_one;
 
-    sample_t tmp_a_i;
-    sample_t tmp_a_r;
     sample_t tmp_b_i;
     sample_t tmp_b_r;
+#endif
+    sample_t tmp_a_i;
+    sample_t tmp_a_r;
 
     sample_t *data_ptr;
     sample_t *delay_ptr;
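The w_* arrays above only reserve aligned storage for the FFT twiddle factors; they are filled once at initialization. Below is a standalone sketch of the kind of initialization the merge loop relies on, where stage m consumes the 2^m roots of unity exp(-j*pi*k/2^m). The loop is modeled on liba52's imdct_init and is illustrative, not part of this patch:

#include <math.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

typedef struct { float real, imag; } complex_t;

static complex_t w_1[1], w_2[2], w_4[4], w_8[8], w_16[16], w_32[32], w_64[64];
static complex_t *w[7] = { w_1, w_2, w_4, w_8, w_16, w_32, w_64 };

/* Fill stage m with the 2^m twiddle factors exp(-j*pi*k/2^m), which is
 * what the complex multiply in the FFT merge below consumes. */
static void init_twiddles(void)
{
    int m, k;
    for (m = 0; m < 7; m++) {
        int n = 1 << m;
        for (k = 0; k < n; k++) {
            w[m][k].real = (float) cos(-M_PI * k / n);
            w[m][k].imag = (float) sin(-M_PI * k / n);
        }
    }
}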
@@ -162,22 +218,21 @@
     /* 512 IMDCT with source and dest data in 'data' */
 
-    /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
+    /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering */
     for( i=0; i < 128; i++) {
         /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
-        buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]);
-        buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i]));
-    }
-
-    /* Bit reversed shuffling */
-    for(i=0; i<128; i++) {
-        k = bit_reverse_512[i];
-        if (k < i)
-            swap_cmplx(&buf[i],&buf[k]);
+#ifdef USE_AC3_C
+        int j= pm128[i];
+#else
+        int j= bit_reverse_512[i];
+#endif
+        buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
+        buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
     }
 
     /* FFT Merge */
-    for (m=0; m < 7; m++) {
+/* unoptimized variant
+    for (m=1; m < 7; m++) {
         if(m)
             two_m = (1 << m);
         else
@@ -185,8 +240,8 @@
 
         two_m_plus_one = (1 << (m+1));
 
-        for(k = 0; k < two_m; k++) {
-            for(i = 0; i < 128; i += two_m_plus_one) {
+        for(i = 0; i < 128; i += two_m_plus_one) {
+            for(k = 0; k < two_m; k++) {
                 p = k + i;
                 q = p + two_m;
                 tmp_a_r = buf[p].real;
@@ -200,7 +255,102 @@
             }
         }
     }
+*/
+#ifdef USE_AC3_C
+    fft_128p (&buf[0]);
+#else
+
+    /* 1. iteration */
+    for(i = 0; i < 128; i += 2) {
+        tmp_a_r = buf[i].real;
+        tmp_a_i = buf[i].imag;
+        tmp_b_r = buf[i+1].real;
+        tmp_b_i = buf[i+1].imag;
+        buf[i].real = tmp_a_r + tmp_b_r;
+        buf[i].imag = tmp_a_i + tmp_b_i;
+        buf[i+1].real = tmp_a_r - tmp_b_r;
+        buf[i+1].imag = tmp_a_i - tmp_b_i;
+    }
+
+    /* 2. iteration */
+    // Note w[1]={{1,0}, {0,-1}}
+    for(i = 0; i < 128; i += 4) {
+        tmp_a_r = buf[i].real;
+        tmp_a_i = buf[i].imag;
+        tmp_b_r = buf[i+2].real;
+        tmp_b_i = buf[i+2].imag;
+        buf[i].real = tmp_a_r + tmp_b_r;
+        buf[i].imag = tmp_a_i + tmp_b_i;
+        buf[i+2].real = tmp_a_r - tmp_b_r;
+        buf[i+2].imag = tmp_a_i - tmp_b_i;
+        tmp_a_r = buf[i+1].real;
+        tmp_a_i = buf[i+1].imag;
+        tmp_b_r = buf[i+3].imag;
+        tmp_b_i = buf[i+3].real;
+        buf[i+1].real = tmp_a_r + tmp_b_r;
+        buf[i+1].imag = tmp_a_i - tmp_b_i;
+        buf[i+3].real = tmp_a_r - tmp_b_r;
+        buf[i+3].imag = tmp_a_i + tmp_b_i;
+    }
+
+    /* 3. iteration */
+    for(i = 0; i < 128; i += 8) {
+        tmp_a_r = buf[i].real;
+        tmp_a_i = buf[i].imag;
+        tmp_b_r = buf[i+4].real;
+        tmp_b_i = buf[i+4].imag;
+        buf[i].real = tmp_a_r + tmp_b_r;
+        buf[i].imag = tmp_a_i + tmp_b_i;
+        buf[i+4].real = tmp_a_r - tmp_b_r;
+        buf[i+4].imag = tmp_a_i - tmp_b_i;
+        tmp_a_r = buf[1+i].real;
+        tmp_a_i = buf[1+i].imag;
+        tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
+        tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
+        buf[1+i].real = tmp_a_r + tmp_b_r;
+        buf[1+i].imag = tmp_a_i + tmp_b_i;
+        buf[i+5].real = tmp_a_r - tmp_b_r;
+        buf[i+5].imag = tmp_a_i - tmp_b_i;
+        tmp_a_r = buf[i+2].real;
+        tmp_a_i = buf[i+2].imag;
+        tmp_b_r = buf[i+6].imag;
+        tmp_b_i = - buf[i+6].real;
+        buf[i+2].real = tmp_a_r + tmp_b_r;
+        buf[i+2].imag = tmp_a_i + tmp_b_i;
+        buf[i+6].real = tmp_a_r - tmp_b_r;
+        buf[i+6].imag = tmp_a_i - tmp_b_i;
+        tmp_a_r = buf[i+3].real;
+        tmp_a_i = buf[i+3].imag;
+        tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
+        tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
+        buf[i+3].real = tmp_a_r + tmp_b_r;
+        buf[i+3].imag = tmp_a_i + tmp_b_i;
+        buf[i+7].real = tmp_a_r - tmp_b_r;
+        buf[i+7].imag = tmp_a_i - tmp_b_i;
+    }
+
+    /* 4-7. iterations */
+    for (m=3; m < 7; m++) {
+        two_m = (1 << m);
+
+        two_m_plus_one = two_m<<1;
+
+        for(i = 0; i < 128; i += two_m_plus_one) {
+            for(k = 0; k < two_m; k++) {
+                int p = k + i;
+                int q = p + two_m;
+                tmp_a_r = buf[p].real;
+                tmp_a_i = buf[p].imag;
+                tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
+                tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
+                buf[p].real = tmp_a_r + tmp_b_r;
+                buf[p].imag = tmp_a_i + tmp_b_i;
+                buf[q].real = tmp_a_r - tmp_b_r;
+                buf[q].imag = tmp_a_i - tmp_b_i;
+            }
+        }
+    }
+#endif
 
     /* Post IFFT complex multiply plus IFFT complex conjugate*/
     for( i=0; i < 128; i++) {
         /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
@@ -219,12 +369,12 @@
         *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
         *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
     }
-
+    
     for(i=0; i< 64; i++) {
         *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
         *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
     }
-
+    
     /* The trailing edge of the window goes into the delay line */
     delay_ptr = delay;
 
@@ -232,13 +382,717 @@
         *delay_ptr++ = -buf[64+i].real * *--window_ptr;
         *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
     }
-
+    
    for(i=0; i<64; i++) {
         *delay_ptr++ = buf[i].imag * *--window_ptr;
         *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
     }
 }
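The hand-unrolled iterations 1-3 above exist because their twiddle factors are trivial (1, -j, and (1-j)*sqrt(2)/2 patterns), so the generic complex multiply of the merge loop degenerates into adds, swaps, and sign flips. A small self-contained check of that equivalence for the w = -j case used in the 2nd iteration; all names here are local to the example, not liba52 APIs:

#include <stdio.h>

typedef struct { float real, imag; } complex_t;

/* Generic radix-2 butterfly: b is pre-multiplied by the twiddle w. */
static void butterfly(complex_t *a, complex_t *b, complex_t w)
{
    float br = b->real * w.real - b->imag * w.imag;
    float bi = b->imag * w.real + b->real * w.imag;
    complex_t t = *a;
    a->real = t.real + br;  a->imag = t.imag + bi;
    b->real = t.real - br;  b->imag = t.imag - bi;
}

int main(void)
{
    /* With w = (0,-1), i.e. -j, the product (br,bi) collapses to
     * (b.imag, -b.real): exactly the swapped loads and sign flips the
     * "2. iteration" uses instead of a full complex multiply. */
    complex_t a = { 1.0f, 2.0f }, b = { 3.0f, 4.0f };
    complex_t w_minus_j = { 0.0f, -1.0f };
    butterfly(&a, &b, w_minus_j);
    printf("a = (%g, %g), b = (%g, %g)\n", a.real, a.imag, b.real, b.imag);
    /* a = (5, -1), b = (-3, 5) */
    return 0;
}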
 
+#ifdef HAVE_ALTIVEC
+
+#ifndef SYS_DARWIN
+#include <altivec.h>
+#endif
+
+// used to build registers permutation vectors (vcprm)
+// the 's' are for words in the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#ifdef SYS_DARWIN
+#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
+#else
+#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
+#endif
+
+// vcprmle is used to keep the same index as in the SSE version.
+// it's the same as vcprm, with the index inversed
+// ('le' is Little Endian)
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+
+#ifdef SYS_DARWIN
+#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
+#else
+#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
+#endif
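The vcprm and vcii macros above build, at compile time, the selector constants for vec_perm and the +/-1.0 sign masks used with vec_madd. A usage sketch assuming GCC's AltiVec vector-literal syntax (the non-Darwin branch); it is standalone, repeats the needed macros, and is not part of the patch:

#include <altivec.h>
#include <stdio.h>

/* Non-Darwin forms of the patch's macros, repeated so this compiles alone. */
#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
#define FLOAT_n -1.
#define FLOAT_p 1.
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}

int main(void)
{
    vector float a = { 0.0f, 1.0f, 2.0f, 3.0f };
    vector float b = { 4.0f, 5.0f, 6.0f, 7.0f };

    /* vec_perm picks words 0,1 of a and words 0,1 of b: {0,1,4,5}. */
    vector float lo = vec_perm(a, b, vcprm(0, 1, s0, s1));

    /* vec_madd with a vcii mask negates selected lanes: {0,-1,4,-5}. */
    vector float alt = vec_madd(lo, vcii(p, n, p, n),
                                (vector float) vec_splat_u32(0));

    float out[4] __attribute__((aligned(16)));
    vec_st(alt, 0, out);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}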