📄 liba52_changes.diff
字号:
+#endif++#ifdef SYS_DARWIN+#define FOUROF(a) (a)+#else+#define FOUROF(a) {a,a,a,a}+#endif+++void+imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)+{+ int i;+ int k;+ int p,q;+ int m;+ int two_m;+ int two_m_plus_one;++ sample_t tmp_b_i;+ sample_t tmp_b_r;+ sample_t tmp_a_i;+ sample_t tmp_a_r;++ sample_t *data_ptr;+ sample_t *delay_ptr;+ sample_t *window_ptr;+ + /* 512 IMDCT with source and dest data in 'data' */+ + /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/+ for( i=0; i < 128; i++) {+ /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ + int j= bit_reverse_512[i];+ buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);+ buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));+ }+ + /* 1. iteration */+ for(i = 0; i < 128; i += 2) {+#if 0+ tmp_a_r = buf[i].real;+ tmp_a_i = buf[i].imag;+ tmp_b_r = buf[i+1].real;+ tmp_b_i = buf[i+1].imag;+ buf[i].real = tmp_a_r + tmp_b_r;+ buf[i].imag = tmp_a_i + tmp_b_i;+ buf[i+1].real = tmp_a_r - tmp_b_r;+ buf[i+1].imag = tmp_a_i - tmp_b_i;+#else+ vector float temp, bufv; ++ bufv = vec_ld(i << 3, (float*)buf);+ temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));+ bufv = vec_madd(bufv, vcii(p,p,n,n), temp);+ vec_st(bufv, i << 3, (float*)buf);+#endif+ }+ + /* 2. iteration */+ // Note w[1]={{1,0}, {0,-1}}+ for(i = 0; i < 128; i += 4) {+#if 0+ tmp_a_r = buf[i].real;+ tmp_a_i = buf[i].imag;+ tmp_b_r = buf[i+2].real;+ tmp_b_i = buf[i+2].imag;+ buf[i].real = tmp_a_r + tmp_b_r;+ buf[i].imag = tmp_a_i + tmp_b_i;+ buf[i+2].real = tmp_a_r - tmp_b_r;+ buf[i+2].imag = tmp_a_i - tmp_b_i;+ tmp_a_r = buf[i+1].real;+ tmp_a_i = buf[i+1].imag;+ /* WARNING: im <-> re here ! */+ tmp_b_r = buf[i+3].imag;+ tmp_b_i = buf[i+3].real;+ buf[i+1].real = tmp_a_r + tmp_b_r;+ buf[i+1].imag = tmp_a_i - tmp_b_i;+ buf[i+3].real = tmp_a_r - tmp_b_r;+ buf[i+3].imag = tmp_a_i + tmp_b_i;+#else+ vector float buf01, buf23, temp1, temp2;+ + buf01 = vec_ld((i + 0) << 3, (float*)buf);+ buf23 = vec_ld((i + 2) << 3, (float*)buf);+ buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));++ temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);+ temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);++ vec_st(temp1, (i + 0) << 3, (float*)buf);+ vec_st(temp2, (i + 2) << 3, (float*)buf);+#endif+ }++ /* 3. iteration */+ for(i = 0; i < 128; i += 8) {+#if 0+ tmp_a_r = buf[i].real;+ tmp_a_i = buf[i].imag;+ tmp_b_r = buf[i+4].real;+ tmp_b_i = buf[i+4].imag;+ buf[i].real = tmp_a_r + tmp_b_r;+ buf[i].imag = tmp_a_i + tmp_b_i;+ buf[i+4].real = tmp_a_r - tmp_b_r;+ buf[i+4].imag = tmp_a_i - tmp_b_i;+ tmp_a_r = buf[1+i].real;+ tmp_a_i = buf[1+i].imag;+ tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;+ tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;+ buf[1+i].real = tmp_a_r + tmp_b_r;+ buf[1+i].imag = tmp_a_i + tmp_b_i;+ buf[i+5].real = tmp_a_r - tmp_b_r;+ buf[i+5].imag = tmp_a_i - tmp_b_i;+ tmp_a_r = buf[i+2].real;+ tmp_a_i = buf[i+2].imag;+ /* WARNING re <-> im & sign */+ tmp_b_r = buf[i+6].imag;+ tmp_b_i = - buf[i+6].real;+ buf[i+2].real = tmp_a_r + tmp_b_r;+ buf[i+2].imag = tmp_a_i + tmp_b_i;+ buf[i+6].real = tmp_a_r - tmp_b_r;+ buf[i+6].imag = tmp_a_i - tmp_b_i;+ tmp_a_r = buf[i+3].real;+ tmp_a_i = buf[i+3].imag;+ tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;+ tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;+ buf[i+3].real = tmp_a_r + tmp_b_r;+ buf[i+3].imag = tmp_a_i + tmp_b_i;+ buf[i+7].real = tmp_a_r - tmp_b_r;+ buf[i+7].imag = tmp_a_i - tmp_b_i;+#else+ vector float buf01, buf23, buf45, buf67;++ buf01 = vec_ld((i + 0) << 3, (float*)buf);+ buf23 = vec_ld((i + 2) << 3, (float*)buf);++ tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;+ tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;+ buf[i+5].real = tmp_b_r;+ buf[i+5].imag = tmp_b_i;+ tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;+ tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;+ buf[i+7].real = tmp_b_r;+ buf[i+7].imag = tmp_b_i;++ buf23 = vec_ld((i + 2) << 3, (float*)buf);+ buf45 = vec_ld((i + 4) << 3, (float*)buf);+ buf67 = vec_ld((i + 6) << 3, (float*)buf);+ buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));+ + vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);+ vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);+ vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);+ vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);+#endif+ }+ + /* 4-7. iterations */+ for (m=3; m < 7; m++) {+ two_m = (1 << m);++ two_m_plus_one = two_m<<1;++ for(i = 0; i < 128; i += two_m_plus_one) {+ for(k = 0; k < two_m; k+=2) {+#if 0+ int p = k + i;+ int q = p + two_m;+ tmp_a_r = buf[p].real;+ tmp_a_i = buf[p].imag;+ tmp_b_r =+ buf[q].real * w[m][k].real -+ buf[q].imag * w[m][k].imag;+ tmp_b_i =+ buf[q].imag * w[m][k].real ++ buf[q].real * w[m][k].imag;+ buf[p].real = tmp_a_r + tmp_b_r;+ buf[p].imag = tmp_a_i + tmp_b_i;+ buf[q].real = tmp_a_r - tmp_b_r;+ buf[q].imag = tmp_a_i - tmp_b_i;++ tmp_a_r = buf[(p + 1)].real;+ tmp_a_i = buf[(p + 1)].imag;+ tmp_b_r =+ buf[(q + 1)].real * w[m][(k + 1)].real -+ buf[(q + 1)].imag * w[m][(k + 1)].imag;+ tmp_b_i =+ buf[(q + 1)].imag * w[m][(k + 1)].real ++ buf[(q + 1)].real * w[m][(k + 1)].imag;+ buf[(p + 1)].real = tmp_a_r + tmp_b_r;+ buf[(p + 1)].imag = tmp_a_i + tmp_b_i;+ buf[(q + 1)].real = tmp_a_r - tmp_b_r;+ buf[(q + 1)].imag = tmp_a_i - tmp_b_i;+#else+ int p = k + i;+ int q = p + two_m;+ vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;+ const vector float vczero = (const vector float)FOUROF(0.);+ // first compute buf[q] and buf[q+1]+ vecq = vec_ld(q << 3, (float*)buf);+ vecw = vec_ld(0, (float*)&(w[m][k]));+ temp1 = vec_madd(vecq, vecw, vczero);+ temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));+ temp2 = vec_madd(temp2, vecw, vczero);+ temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));+ temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));+ vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);+ // then butterfly with buf[p] and buf[p+1]+ vecp = vec_ld(p << 3, (float*)buf);+ + temp1 = vec_add(vecp, vecq);+ temp2 = vec_sub(vecp, vecq);+ + vec_st(temp1, p << 3, (float*)buf);+ vec_st(temp2, q << 3, (float*)buf);+#endif+ }+ }+ }++ /* Post IFFT complex multiply plus IFFT complex conjugate*/+ for( i=0; i < 128; i+=4) {+ /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */+#if 0+ tmp_a_r = buf[(i + 0)].real;+ tmp_a_i = -1.0 * buf[(i + 0)].imag;+ buf[(i + 0)].real =+ (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);+ buf[(i + 0)].imag =+ (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);++ tmp_a_r = buf[(i + 1)].real;+ tmp_a_i = -1.0 * buf[(i + 1)].imag;+ buf[(i + 1)].real =+ (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);+ buf[(i + 1)].imag =+ (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);++ tmp_a_r = buf[(i + 2)].real;+ tmp_a_i = -1.0 * buf[(i + 2)].imag;+ buf[(i + 2)].real =+ (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);+ buf[(i + 2)].imag =+ (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);++ tmp_a_r = buf[(i + 3)].real;+ tmp_a_i = -1.0 * buf[(i + 3)].imag;+ buf[(i + 3)].real =+ (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);+ buf[(i + 3)].imag =+ (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);+#else+ vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;+ vector float temp0022, temp1133, tempCS01;+ const vector float vczero = (const vector float)FOUROF(0.);++ bufv_0 = vec_ld((i + 0) << 3, (float*)buf);+ bufv_2 = vec_ld((i + 2) << 3, (float*)buf);++ cosv = vec_ld(i << 2, xcos1);+ sinv = vec_ld(i << 2, xsin1);++ temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));+ temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));+ tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));+ temp1 = vec_madd(temp0022, tempCS01, vczero);+ tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));+ temp2 = vec_madd(temp1133, tempCS01, vczero);+ bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);+ + vec_st(bufv_0, (i + 0) << 3, (float*)buf);++ /* idem with bufv_2 and high-order cosv/sinv */++ temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));+ temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));+ tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));+ temp1 = vec_madd(temp0022, tempCS01, vczero);+ tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));+ temp2 = vec_madd(temp1133, tempCS01, vczero);+ bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);++ vec_st(bufv_2, (i + 2) << 3, (float*)buf);+ +#endif+ }+ + data_ptr = data;+ delay_ptr = delay;+ window_ptr = imdct_window;++ /* Window and convert to real valued signal */+ for(i=0; i< 64; i++) { + *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; + *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; + }+ + for(i=0; i< 64; i++) { + *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; + *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; + }+ + /* The trailing edge of the window goes into the delay line */+ delay_ptr = delay;++ for(i=0; i< 64; i++) { + *delay_ptr++ = -buf[64+i].real * *--window_ptr; + *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; + }+ + for(i=0; i<64; i++) {+ *delay_ptr++ = buf[i].imag * *--window_ptr; + *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; + }+}+#endif+++// Stuff below this line is borrowed from libac3+#include "srfftp.h"+#ifdef ARCH_X86+#ifndef HAVE_3DNOW+#define HAVE_3DNOW 1+#endif+#include "srfftp_3dnow.h"++const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; +const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; +const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };++#undef HAVE_3DNOWEX+#include "imdct_3dnow.h"+#define HAVE_3DNOWEX+#include "imdct_3dnow.h"++void+imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)+{+/* int i,k;+ int p,q;*/+ int m;+ int two_m;+ int two_m_plus_one;++/* sample_t tmp_a_i;+ sample_t tmp_a_r;+ sample_t tmp_b_i;+ sample_t tmp_b_r;*/++ sample_t *data_ptr;+ sample_t *delay_ptr;+ sample_t *window_ptr;+ + /* 512 IMDCT with source and dest data in 'data' */+ /* see the c version (dct_do_512()), its allmost identical, just in C */ ++ /* Pre IFFT complex multiply plus IFFT cmplx conjugate */+ /* Bit reversed shuffling */+ asm volatile(+ "xorl %%esi, %%esi \n\t"+ "leal "MANGLE(bit_reverse_512)", %%eax \n\t"+ "movl $1008, %%edi \n\t"+ "pushl %%ebp \n\t" //use ebp without telling gcc+ ".balign 16 \n\t"+ "1: \n\t"+ "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI+ "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI+ "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi+ "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi+ "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR+ "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"+ "mulps %%xmm0, %%xmm2 \n\t"+ "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI+ "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"+ "subps %%xmm0, %%xmm2 \n\t"+ "movzbl (%%eax), %%edx \n\t"+ "movzbl 1(%%eax), %%ebp \n\t"+ "movlps %%xmm2, (%1, %%edx,8) \n\t"+ "movhps %%xmm2, (%1, %%ebp,8) \n\t"+ "addl $16, %%esi \n\t"+ "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap+ "subl $16, %%edi \n\t"+ " jnc 1b \n\t"+ "popl %%ebp \n\t"//no we didnt touch ebp *g*+ :: "b" (data), "c" (buf)+ : "%esi", "%edi", "%eax", "%edx"+ );+++ /* FFT Merge */+/* unoptimized variant+ for (m=1; m < 7; m++) {+ if(m)+ two_m = (1 << m);+ else+ two_m = 1;++ two_m_plus_one = (1 << (m+1));++ for(i = 0; i < 128; i += two_m_plus_one) {+ for(k = 0; k < two_m; k++) {+ p = k + i;+ q = p + two_m;+ tmp_a_r = buf[p].real;+ tmp_a_i = buf[p].imag;+ tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;+ tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;+ buf[p].real = tmp_a_r + tmp_b_r;+ buf[p].imag = tmp_a_i + tmp_b_i;+ buf[q].real = tmp_a_r - tmp_b_r;+ buf[q].imag = tmp_a_i - tmp_b_i;+ }+ }+ }+*/+ + /* 1. iteration */+ // Note w[0][0]={1,0}+ asm volatile(+ "xorps %%xmm1, %%xmm1 \n\t"+ "xorps %%xmm2, %%xmm2 \n\t"+ "movl %0, %%esi \n\t"+ ".balign 16 \n\t"+ "1: \n\t"+ "movlps (%%esi), %%xmm0 \n\t" //buf[p]+ "movlps 8(%%esi), %%xmm1\n\t" //buf[q]+ "movhps (%%esi), %%xmm0 \n\t" //buf[p]+ "movhps 8(%%esi), %%xmm2\n\t" //buf[q]+ "addps %%xmm1, %%xmm0 \n\t"+ "subps %%xmm2, %%xmm0 \n\t"+ "movaps %%xmm0, (%%esi) \n\t"+ "addl $16, %%esi \n\t"+ "cmpl %1, %%esi \n\t"+ " jb 1b \n\t"+ :: "g" (buf), "r" (buf + 128)+ : "%esi"+ );+ + /* 2. iteration */+ // Note w[1]={{1,0}, {0,-1}}+ asm volatile(+ "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1+ "movl %0, %%esi \n\t"+ ".balign 16 \n\t"+ "1: \n\t"+ "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3+ "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3+ "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3+ "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1+ "movaps (%%esi), %%xmm1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -