📄 fastsb.c

📁 arm MP3 解压算法源代码
💻 C
字号:
/***********************************************
copyright by Haia Tech
www.haia2004.com
************************************************/



/* Fast Inverse DCT implemented using Lee's algorithm */
/*
  The DCT matrix for N values is defined as:

  D(i,j) = cos((2*j+1)*i*PI/(2*N))

  Lee's fast-DCT algorithm, as used here, needs an 8-value DCT
  and an 16-value DCT matrix.
*/
#include "dewindow.h" 
#include "common.h"
#include <math.h> 

void matrix_mul16(double in1[16][16], double in2[16][16],double out[16][16]);
void fast_idct(double *in, double *out);
void fast_idct_init();


static int Granule_sbsynth_Vptr[2]={64,64};
typedef double BB[2][1024];
static BB Granule_sbsynth_V;

static double A16[16][16], A8[8][8];       /* DCT matrix         */
static double G16[16][16], G8[8][8];       /* Output butterfly   */
static double H16[16][16], H8[8][8];       /* Scaling            */

static double B16[16][16], B8[8][8];       /* B = G * DCT * H    */


void matrix_mul16(double in1[16][16],
		  double in2[16][16],
		  double out[16][16]);

void matrix_mul8(double in1[8][8],
		 double in2[8][8],
		 double out[8][8]);

void Granule_subband_synthesis(int ch, SS s, short SAM[2][SSLIMIT][SBLIMIT])
{
    int i, j, t, k;
	long is;
    double band[32];
    double *v, sum, *d;
	short *S=&SAM[0][0][0];

    /* We have 18 time-vectors of 32 subband magnitudes each. For every
       vector of 32 magnitudes, the subband synthesis generates 32
       PCM samples, so the result of 18 of these is 18*32=576 samples.
     */

    /* go through each time window */
	
    for(t = 0; t < 18; t++)
	{
	/* extract the subband strengths */
  	    v = &s[0][t];
		for(i = 0; i < 32; i++)
		{
	      band[i] = *v;
	      v = &v[18];
		}
	/* advance the buffer position */
	   Granule_sbsynth_Vptr[ch] = (Granule_sbsynth_Vptr[ch] - 64) & 0x3ff;
	   v = &Granule_sbsynth_V[ch][Granule_sbsynth_Vptr[ch]];

	   fast_idct(band, v);

	/* 32*16=512 mac's */
    /*      15          */
    /* Sj =SIGM W(j+32i) */
    /*     i=0          */

	  v = &Granule_sbsynth_V[ch][0];
      for (j = 0; j < 32; j++) 
	  {
        d = &D_coex[j];
	    k = j + Granule_sbsynth_Vptr[ch];
	    sum = 0.0f;
	    for(i = 0; i < 8; i++) 
		{
	      sum += d[0] * v[k];
	      k = (k + 96) & 0x3ff;
	      sum += d[32] * v[k];
	      k = (k + 32) & 0x3ff; 
	      d = &d[64];
		}
	
	  /* convert to integer, and clip the output */
 	    is = (long)(sum*32768);
	
	    if(is >= 32768) 
	      is = 32767;
	    else if(is < -32768)
	      is = -32768;
	    *S = (short)is;
	    S++;
	  }
    }
}

/* 18 * (4096+1024) = 92160 MAC's per call, with 2 calls per frame and
   38 frames per second this is 7 million MAC's per second.

   18 * (384 * 2 + 1024) = 32256 MAC's per call using Lee's fast DCT! That
   is just 2.4 million MAC's per second!

	We need a buffer of 1024 floats per channel.

   */

void Granule_subband_synthesis2(SS s1,SS s2,short  SAM[2][SSLIMIT][SBLIMIT])
{
    int i, j, t, k;
    double band[64];
    double *v1, *v2, sum, sum2, *d;
	double *v;
	long is;
	short *S=&SAM[0][0][0];

    /* We have 18 time-vectors of 32 subband magnitudes each. For every
       vector of 32 magnitudes, the subband synthesis generates 32
       PCM samples, so the result of 18 of these is 18*32=576 samples.
     */

    /* go through each time window */

    for(t = 0; t < 18; t++) 
	{

	/* extract the subband strengths */

	   v1 = &s1[0][t];
	   v2 = &s2[0][t];
	   for(i = 0; i < 32; i++)
	   {
	     band[i] = *v1;
	     band[i+32] = *v2;
	     v1 = &v1[18];
	     v2 = &v2[18];
	   }

	/* advance the buffer position */

 	  Granule_sbsynth_Vptr[0] = (Granule_sbsynth_Vptr[0] - 64) & 0x3ff;
	  v = &(Granule_sbsynth_V[0][Granule_sbsynth_Vptr[0]]);
	
	/* calculate 64 values for each channel and insert them into the 1024 wide buffer */

	  fast_idct(band, v);
//	  fast_idct(&band[32], &v[1024]);
         
	/* 32*16*2=1024 mac's */
	
	/* windowing - calculate 32 samples. each sample is the sum of 16 terms */
	
	/*     15          */
	/* Sj = E W(j+32i) */
	/*    i=0          */
	
	  v = &Granule_sbsynth_V[0][0];
	  for (j = 0; j < 32; j++)
	  { 
	    d = &D_coex[j];
	    k = j + Granule_sbsynth_Vptr[0];
	    
	    sum = (double)0.0f;
	    sum2 = (double)0.0f;
	    for(i = 0; i < 8; i++) 
		{
		   sum += d[0] * v[k];
		   sum2 += d[0] * v[k+1024];
		   k = (k + 96) & 0x3ff;

		   sum += d[32] * v[k];
		   sum2 += d[32] * v[k+1024];
		   k = (k + 32) & 0x3ff; 
		   d = &d[64];
	    }
	    /* convert to integer, and clip the output */
//      if(j%2)
	  {
		is=(long) (sum*32768);
// OVERFLOW_CHECKING
	    if(is >= 32768) 
		  *S = 32767;	
		else if(is < -32768) 
		  *S = -32768;
		else 
  		  *S = (short)is;

		S++;
	    
		is=(long) (sum*32768);
	    if(is >= 32768) 
		  *(S) = 32767;
		else if(is < -32768) 
		  *(S) = -32768;
		else 
		  *(S) = (short)is;

	    S++;

	  }
	  }
    }
}


void fast_idct_init()
{
    int i,j;
    double t16[16][16], t8[8][8];


    /* create the 16 matrixes */

    for(i = 0; i < 16; i++) {
	  for(j = 0; j < 16; j++) 
	  {
	    A16[i][j] = cos((2*j+1)*i*PI/32);
	    if(i == j || j == (i + 1))
		  G16[i][j] = 1.0f;
	    else
		  G16[i][j] = 0.0f;
	    if(i == j)
		  H16[i][j] = 1.0f/(2*cos((2*i+1)*PI/64));
	    else
		  H16[i][j] = 0.0;
	  }
    }

    /* create the 8 matrixes */

    for(i = 0; i < 8; i++) {
	  for(j = 0; j < 8; j++) 
	  {
	    A8[i][j] = cos((2*j+1)*i*PI/16);
	    if(i == j || j == (i + 1))
		  G8[i][j] = 1.0f;
	    else
		  G8[i][j] = 0.0f;
	    if(i == j)
		  H8[i][j] = 1.0f/(2*cos((2*i+1)*PI/32));
	    else
		  H8[i][j] = 0.0f;
	  }
    }

    /* generate the B matrixes */

    matrix_mul16(A16, H16, t16);
    matrix_mul16(G16, t16, B16);

    matrix_mul8(A8, H8, t8);
    matrix_mul8(G8, t8, B8);
}

/* This is a two-level implementation of Lee's fast-DCT algorithm */
/* 
   The 32 input values are split in two 16-value vectors using an
   even butterfly and an odd butterfly. The odd values are taken
   through Lee's odd path using a 16x16 DCT matrix (A16) and appropriate
   scaling (G16*A16*H16). The even values are further split into
   two 8-value vectors using even and odd butterflies into ee and eo.
   The ee values are fed through an 8x8 DCT matrix (A8) while the eo
   values are fed through the odd path using G8*A8*H8.

   This two-level configuration uses 384 muls and 432 adds, compared
   to the direct 32x32 DCT which uses 1024 muls and 992 adds.
*/

void fast_idct(double *in, double *out)
{
    double even[16], odd[16], ee[8], eo[8];
    double s1, s2;
    double t[32];
    int i, j;
	static int init=1;

    if(init)
	{
		fast_idct_init();
		init=0;
	}
	/* input butterflies - level 1 */
    /* 32 adds */

    for(i = 0; i < 16; i++) {
	   even[i] = in[i] + in[31-i];
	   odd[i] = in[i] - in[31-i];
    }

    /* input butterflies - level 2 */
    /* 16 adds */

    for(i = 0; i < 8; i++) {
	   ee[i] = even[i] + even[15-i];
	   eo[i] = even[i] - even[15-i];
    }

    /* multiply the even_even vector (ee) with the ee matrix (A8) */
    /* multiply the even_odd vector (eo) with the eo matrix (B8) */
    /* 128 muls, 128 adds */

    for(i = 0; i < 8; i++) {
	  s1 = 0.0;
	  s2 = 0.0;
	  for(j = 0; j < 8; j += 2) 
	  {
	    s1 += A8[i][j] * ee[j] +
		A8[i][j+1] * ee[j+1];
	    s2 += B8[i][j] * eo[j] +
		B8[i][j+1] * eo[j+1];
	  }
	  t[i*4] = s1;
	  t[i*4+2] = s2;
    }


    /* multiply the odd vector (odd) with the odd matrix (B16) */
    /* 256 muls, 256 adds */

    for(i = 0; i < 16; i++) 
	{
	  s1 = 0.0;
	  for(j = 0; j < 16; j += 4) 
	  {
	    s1 += B16[i][j] * odd[j] +
		B16[i][j+1] * odd[j+1] +
		B16[i][j+2] * odd[j+2] +
		B16[i][j+3] * odd[j+3];
	  }
	  t[i*2+1] = s1;
    }

    /* the output vector t now is expanded to 64 values using the
       symmetric property of the cosinus function */

    for(i = 0; i < 16; i++) 
	{
	  out[i] = t[i+16];
	  out[i+17] = -t[31-i];
	  out[i+32] = -t[16-i];
	  out[i+48] = -t[i];
    }
    out[16] = 0.0;
}

void matrix_mul16(double in1[16][16],
		  double in2[16][16],
		  double out[16][16])
{
    int i,j,z;

    for(i = 0; i < 16; i++) {
	  for(j = 0; j < 16; j++)
	  {
	    out[i][j] = 0.0;
	    for(z = 0; z < 16; z++)
		  out[i][j] += in1[i][z] * in2[z][j];
	  }
    } 
}

void matrix_mul8(double in1[8][8],
		 double in2[8][8],
		 double out[8][8])
{
    int i,j,z;

    for(i = 0; i < 8; i++) {
	  for(j = 0; j < 8; j++)  
	  {
	    out[i][j] = 0.0;
	    for(z = 0; z < 8; z++)
	 	  out[i][j] += in1[i][z] * in2[z][j];
	  }
    }
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -