⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 estimation.c

📁 软件无线电的平台
💻 C
📖 第 1 页 / 共 2 页
字号:
  mmx_t *y_ptr,*tw_ptr;  // output data pointer and twiddle pointer  unsigned int   size = 1<<log2size;   // size of the FFT  unsigned short no_shift = log2_amp;  // we take out no_shift shifts (this means that no_shift stages of the  // FFT does not shift by 1 the result  // the field log2_amp is used to increase the FFT output dinamic  if(flag==0) {    /* flag=0 ** reverse bit ordering and real input **     * do reverse bit ordering and duplicate input:      * |x0 x1 x2 x3| ->|x0 0 x0 0| ...... |x3 0 x3 0|      * produces 4 samples at a time     */    pxor_r2r(mm6,mm6);            // clear mm6: in mm6 there is always 0    for(i=0;i<(size>>2);i++) {     // reverse 4 samples at a time => does the loop size>>2 times      movq_m2r(x[i],mm0);         // load x0, x1,x2 and x3 at the same time      movq_r2r(mm0,mm1);          // and make a copy of it      punpcklwd_r2r(mm6,mm0);     // now in mm0 there is | x0 0 x1 0 |      punpckhwd_r2r(mm6,mm1);     // now in mm1 there is | x2 0 x3 0 |      movq_r2r(mm0,mm2);          // copy mm0 in mm2      movq_r2r(mm1,mm3);          // copy mm1 in mm3      punpckldq_r2r(mm0,mm2);     // now in mm2 there is |x0 0 x0 0 |      punpckldq_r2r(mm1,mm3);     // now in mm3 there is |x2 0 x2 0 |      movq_r2r(mm0,mm4);          // copy mm0 in mm4      movq_r2r(mm1,mm5);          // copy mm1 in mm5      punpckhdq_r2r(mm0,mm4);     // now in mm4 there is |x1 0 x1 0 |      punpckhdq_r2r(mm1,mm5);     // now in mm5 there is |x3 0 x3 0 |      index = rev[0];             // get the output index      movq_r2m(mm2,y[index]);     // write |x0 0 x0 0 | to y[index]      index = rev[1];             // get the output index      movq_r2m(mm4,y[index]);     // write |x1 0 x1 0 | to y[index]      index = rev[2];             // get the output index      movq_r2m(mm3,y[index]);     // write |x2 0 x2 0 | to y[index]      index = rev[3];             // get the output index      movq_r2m(mm5,y[index]);     // write |x3 0 x3 0 | to y[index]      rev+=4;                     // increase reverse buffer pointer    }  }  if(flag==1) {    /* WARNING: Here we assume the input format to be     * [Re0,Im0,Re0,Im0, Re1,Im1,Re1,Im1,...     * we do reverse bit ordering and we reorder 4 samples per loop     */    for(i=0;i<(size>>2);i++) {      movq_m2r(x[0],mm0);         // load 1st sample      movq_m2r(x[1],mm1);         // load 2nd sample      movq_m2r(x[2],mm2);         // load 3rd sample      movq_m2r(x[3],mm3);         // load 4th sample      index = rev[0];             // get the output index      movq_r2m(mm0,y[index]);     // write |x0 0 x0 0 | to y[index]      index = rev[1];             // get the output index      movq_r2m(mm1,y[index]);     // write |x1 0 x1 0 | to y[index]      index = rev[2];             // get the output index      movq_r2m(mm2,y[index]);     // write |x2 0 x2 0 | to y[index]      index = rev[3];             // get the output index      movq_r2m(mm3,y[index]);     // write |x3 0 x3 0 | to y[index]      rev+=4;                     // increase reverse buffer pointer      x+=4;    }  }  if(flag==3) {    // does nothing: starts from FIRST STAGE OF FFT below  }  if(flag==4) {    /* WARNING: Here we assume only the first half of the vector x     * is significant: the second half is assumed to be 0     * After reordering the significant samples and the zeros are      * interleaved: (the odd samples are 0) so we can modify a bit      * the reordering loop in order to avoid the first stage of the IFFT     * In fact the first butterfly is (w0 = (1,0))<ul>     * <li>X0 = x0+w0*x1  = x0+x1 = x0</li>     * <li>X1 = x0-w0*x1  = x0-x1 = x0</li>     * </ul>     * since x1 = 0 (odd sample)     *     * Remark: int this way we do not do the shift by 1 in the first stage.     * so we decrease no_shift by 1 (if it is not 0);       */    if(no_shift!=0)      no_shift--;    for(i=0;i<(size>>3);i++) {      movq_m2r(x[0],mm0);         // load 1st sample      movq_m2r(x[1],mm1);         // load 2nd sample      movq_m2r(x[2],mm2);         // load 3rd sample      movq_m2r(x[3],mm3);         // load 4th sample      index = rev[0];             // get the output index      movq_r2m(mm0,y[index]);     // write |x0 0 x0 0 | to y[index]      movq_r2m(mm0,y[index+1]);   // write |x0 0 x0 0 | to y[index+1]      index = rev[1];             // get the output index      movq_r2m(mm1,y[index]);     // write |x1 0 x1 0 | to y[index]      movq_r2m(mm1,y[index+1]);   // write |x1 0 x1 0 | to y[index+1]      index = rev[2];             // get the output index      movq_r2m(mm2,y[index]);     // write |x2 0 x2 0 | to y[index]      movq_r2m(mm2,y[index+1]);   // write |x2 0 x2 0 | to y[index+1]      index = rev[3];             // get the output index      movq_r2m(mm3,y[index]);     // write |x3 0 x3 0 | to y[index]      movq_r2m(mm3,y[index+1]);   // write |x3 0 x3 0 | to y[index+1]      rev+=4;                     // increase reverse buffer pointer      x+=4;    }  }  if(flag==5) {    // does nothing: starts from SECOND STAGE OF THE FFT below  }  // *** FIST FFT STAGE *** does only if flag!=4 (first stage integrated with reordering)  if((flag!=4) && (flag!=5)) {    // Now the input data is stored in y in reverse bit order in the format |x_i 0 x_i 0| (Re Im Re Im)    // SECOND STEP: does the radix-2 FFT itself    // - First stage of the FFT    //   In the first stage we do not use twiddle factors since the twiddle is (1,0)    //   so we avoid multiplications and shifts    //   This loop is unrolled by a factor 2 ->(n_b>>1)    //   So here we implicitly assume size>=4    //   For each loop it computes two butterflies    //   butterfly input : 4 cpx elements : x0, x1, x2, x3    //   butterfly output: 4 cpx elements : X0, X1, X2, X3    //   twiddles associated with the butterflies: w0 and w1    //   X0 = x0+w0*x1  = x0+x1    //   X1 = x0-w0*x1  = x0-x1    //   X2 = x2+w1*x3  = x2+x3    //   X3 = x2-w1*x3  = x2-x3    //   For the first stage w0 = w1 = [1 0 0 1] (see twiddle format)    n_b = size/2;    y_ptr = &y[0];    for(k=0;k<(n_b>>1);k++) {      movq_m2r(y_ptr[0],mm1);     // 1- load x0      movq_m2r(y_ptr[1],mm0);     // 1- load x1      movq_r2r(mm1,mm2);          // 1- copy x0 in mm2      paddsw_r2r(mm0,mm1);        // 1- compute X0 = x0+x1      psubsw_r2r(mm0,mm2);        // 1- compute X1 = x0-x1      psraw_i2r(1, mm1);          // 1- shift X0 right by 1      movq_m2r(y_ptr[2],mm3);     // 2- load x2      psraw_i2r(1, mm2);          // 1- shift X1 right by 1      movq_m2r(y_ptr[3],mm4);     // 2- load x3      movq_r2m(mm1,y_ptr[0]);     // 1- store X0 in memory      movq_r2r(mm3,mm5);          // 2- copy x2 in mm5      movq_r2m(mm2,y_ptr[1]);     // 1- store X1 in memory      paddsw_r2r(mm4,mm3);        // 2- compute X2 = x2+x3      psubsw_r2r(mm4,mm5);        // 2- compute X3 = x2-x3      psraw_i2r(1, mm3);          // 2- shift X2 right by 1      psraw_i2r(1, mm5);          // 2- shift X3 right by 1      movq_r2m(mm3,y_ptr[2]);     // 2- store X2 in memory      movq_r2m(mm5,y_ptr[3]);     // 2- store X3 in memory      y_ptr +=4;                  // increase the pointer to the next 4 elements    }  }  // - Stages from 2 to log2size-1  tw_ptr =&twiddle[1];          // for the 2nd stage we start with this twiddle  bs  =2;                       // the blocksize is two  n_b =size>>2;                 // half n_b  // first we compute log2size-no_shift-1 stages doing shift by 1 (see below)  for(i=1;i<log2size-no_shift;i++) {    y_ptr = &y[0];    for(k=0;k<n_b;k++) {      // bs is greater than or equal to 2 so I can unroll a bit the inner loop      //   For each loop it computes two butterflies      //   butterfly input : 4 cpx elements : x0, x1, x2, x3      //   butterfly output: 4 cpx elements : X0, X1, X2, X3      //   twiddles associated with the butterflies: w0 and w1      //   X0 = x0+w0*x1      //   X1 = x0-w0*x1      //   X2 = x2+w1*x3      //   X3 = x2-w1*x3      for(j=0;j<bs;j+=2) {        movq_m2r(y_ptr[0],mm1);     // 1- load x0        movq_m2r(y_ptr[bs],mm0);    // 1- load x1        movq_r2r(mm1,mm2);          // 1- copy x0 in mm2        pmaddwd_m2r(tw_ptr[0],mm0); // 1- compute w*x1 (load the twiddle)        psrad_i2r(14, mm0);         // 1- shift right by SHIFT        movq_m2r(y_ptr[1],mm3);     // 2- load x0        packssdw_r2r(mm0,mm0);      // 1- pack in a 64 bit register [re im re im]        movq_m2r(y_ptr[bs+1],mm4);  // 2- load x1        paddsw_r2r(mm0,mm1);        // 1- add and store the result in mm1        movq_r2r(mm3,mm5);          // 2- copy x0 in mm5        psubsw_r2r(mm0,mm2);        // 1- sub and store the result in mm2        pmaddwd_m2r(tw_ptr[1],mm4); // 2- compute w*x1 (load the twiddle)        psraw_i2r(1, mm1);          // 1- shift right by 1        psraw_i2r(1, mm2);          // 1- shift right by 1        psrad_i2r(14, mm4);         // 2- shift right by SHIFT        movq_r2m(mm1,y_ptr[0]);     // 1- store the first result        packssdw_r2r(mm4,mm4);      // 2- pack in a 64 bit register [re im re im]        movq_r2m(mm2,y_ptr[bs]);    // 1- store the second result        paddsw_r2r(mm4,mm3);        // 2- add and store the result in mm3        psubsw_r2r(mm4,mm5);        // 2- sub and store the result in mm5        psraw_i2r(1, mm3);          // 2- shift right by 1        psraw_i2r(1, mm5);          // 2- shift right by 1        movq_r2m(mm3,y_ptr[1]);     // 2- store the first result        movq_r2m(mm5,y_ptr[bs+1]);  // 2- store the first result        y_ptr +=2;                  // increase the data pointer        tw_ptr +=2;                 // increase the twiddle pointer      }      tw_ptr -=bs;                  // twiddle pointer now points to the first twiddle of this stage      y_ptr +=bs;                   // jump to next block    }    tw_ptr+=bs;                     // jump to the first twiddle of the next stage    bs  <<=1;                       // double bs    n_b >>=1;                       // half n_b  }  // and then we do no_shift stages without shift by 1  for(i=log2size-no_shift;i<log2size;i++) {    y_ptr = &y[0];    for(k=0;k<n_b;k++) {      // bs is greater than or equal to 2 so I can unroll a bit the inner loop      // so that every loop computes two butterflies      // bs is greater than or equal to 2 so I can unroll a bit the inner loop      //   For each loop it computes two butterflies      //   butterfly input : 4 cpx elements : x0, x1, x2, x3      //   butterfly output: 4 cpx elements : X0, X1, X2, X3      //   twiddles associated with the butterflies: w0 and w1      //   X0 = x0+w0*x1      //   X1 = x0-w0*x1      //   X2 = x2+w1*x3      //   X3 = x2-w1*x3      for(j=0;j<bs;j+=2) {        movq_m2r(y_ptr[0],mm1);     // 1- load x0        movq_m2r(y_ptr[bs],mm0);    // 1- load x1        movq_r2r(mm1,mm2);          // 1- copy x0 in mm2        pmaddwd_m2r(tw_ptr[0],mm0); // 1- compute w*x1 (load the twiddle)        psrad_i2r(14, mm0);         // 1- shift right by SHIFT        movq_m2r(y_ptr[1],mm3);     // 2- load x0        packssdw_r2r(mm0,mm0);      // 1- pack in a 64 bit register [re im re im]        movq_m2r(y_ptr[bs+1],mm4);  // 2- load x1        paddsw_r2r(mm0,mm1);        // 1- add and store the result in mm1        movq_r2r(mm3,mm5);          // 2- copy x0 in mm5        psubsw_r2r(mm0,mm2);        // 1- sub and store the result in mm2        pmaddwd_m2r(tw_ptr[1],mm4); // 2- compute w*x1 (load the twiddle)        psrad_i2r(14, mm4);         // 2- shift right by SHIFT        movq_r2m(mm1,y_ptr[0]);     // 1- store the first result        packssdw_r2r(mm4,mm4);      // 2- pack in a 64 bit register [re im re im]        movq_r2m(mm2,y_ptr[bs]);    // 1- store the second result        paddsw_r2r(mm4,mm3);        // 2- add and store the result in mm3        psubsw_r2r(mm4,mm5);        // 2- sub and store the result in mm5        movq_r2m(mm3,y_ptr[1]);     // 2- store the first result        movq_r2m(mm5,y_ptr[bs+1]);  // 2- store the first result        y_ptr +=2;                  // increase the data pointer        tw_ptr +=2;                 // increase the twiddle pointer      }      tw_ptr -=bs;                  // twiddle pointer now points to the first twiddle of this stage      y_ptr +=bs;                   // jump to next block    }    tw_ptr+=bs;                     // jump to the first twiddle of the next stage    bs  <<=1;                       // double bs    n_b >>=1;                       // half n_b  }  emms();  return(0);}unsigned short ReverseBits ( unsigned index, unsigned NumBits ) {  unsigned i, rev;  for ( i=rev=0; i < NumBits; i++ ) {    rev = (rev << 1) | (index & 1);    index >>= 1;  }  return rev;}/** * @short Multiply elementwise two complex vectors of N elements * @param x1 input 1 in the format  |Re0  Im0 Re0 Im0|, ... , *  |Re(N-1)  Im(N-1) Re(N-1) Im(N-1)| We assume x1 with a  *  dynamic of 15 bit maximum * @param x2 input 2 in the format  |Re0 -Im0 Im0 Re0|, ... , *  |Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| We assume x2 with a  *  dynamic of 14 bit maximum * @param y output in the format  |Re0  Im0 Re0 Im0|, ... , *  |Re(N-1)  Im(N-1) Re(N-1) Im(N-1)| * @param N the size f the vectors (this function does N cpx mpy. WARNING: N>=4; * @param log2_amp increase the output amplitude by a factor  *  2^log2_amp (default is 0) WARNING: log2_amp>0 can cause overflow!! */int mult_cpx_vector(mmx_t *x1, mmx_t *x2, mmx_t *y,                    unsigned int N, unsigned short log2_amp) {  mmx_t shift;  unsigned int i;                 // loop counter  shift.q = 14-log2_amp;  // we compute 4 cpx multiply for each loop  for( i=0; i<(N>>2); i++ ) {    movq_m2r(x1[0],mm0);          // 1- load x1[0])    movq_m2r(x2[0],mm1);          // 1- load x2[0])    pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]    movq_m2r(x1[1],mm2);          // 2- load x1[1])    psrad_m2r(shift, mm0);        // 1- shift right by shift in order to  compensate for the input amplitude    movq_m2r(x2[1],mm3);          // 2- load x2[1])    packssdw_r2r(mm0,mm0);        // 1- pack in a 64 bit register [re im re im]    movq_m2r(x1[2],mm4);          // 3- load x1[2])    pmaddwd_r2r(mm3,mm2);         // 2- compute x1[1]*x2[1]    movq_m2r(x2[2],mm5);          // 3- load x2[2])    movq_r2m(mm0,y[0]);           // 1- store in memory.    psrad_m2r(shift, mm2);        // 2- shift right by shift in order to  compensate for the input amplitude    packssdw_r2r(mm2,mm2);        // 2- pack in a 64 bit register [re im re im]    movq_m2r(x1[3],mm6);          // 4- load x1[3])    movq_r2m(mm2,y[1]);           // 2- store in memory.    pmaddwd_r2r(mm5,mm4);         // 3- compute x1[2]*x2[2]    psrad_m2r(shift, mm4);        // 3- shift right by shift in order to  compensate for the input amplitude    movq_m2r(x2[3],mm7);          // 4- load x2[3])    packssdw_r2r(mm4,mm4);        // 3- pack in a 64 bit register [re im re im]    pmaddwd_r2r(mm7,mm6);         // 4- compute x1[3]*x2[3]    movq_r2m(mm4,y[2]);           // 3- store in memory.    psrad_m2r(shift, mm6);        // 4- shift right by shift in order to  compensate for the input amplitude    packssdw_r2r(mm6,mm6);        // 4- pack in a 64 bit register [re im re im]    movq_r2m(mm6,y[3]);           // 4- store in memory.    x1+=4;    x2+=4;    y +=4;  }  emms();  return(0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -