📄 estimation.c
字号:
mmx_t *y_ptr,*tw_ptr; // output data pointer and twiddle pointer unsigned int size = 1<<log2size; // size of the FFT unsigned short no_shift = log2_amp; // we take out no_shift shifts (this means that no_shift stages of the // FFT does not shift by 1 the result // the field log2_amp is used to increase the FFT output dinamic if(flag==0) { /* flag=0 ** reverse bit ordering and real input ** * do reverse bit ordering and duplicate input: * |x0 x1 x2 x3| ->|x0 0 x0 0| ...... |x3 0 x3 0| * produces 4 samples at a time */ pxor_r2r(mm6,mm6); // clear mm6: in mm6 there is always 0 for(i=0;i<(size>>2);i++) { // reverse 4 samples at a time => does the loop size>>2 times movq_m2r(x[i],mm0); // load x0, x1,x2 and x3 at the same time movq_r2r(mm0,mm1); // and make a copy of it punpcklwd_r2r(mm6,mm0); // now in mm0 there is | x0 0 x1 0 | punpckhwd_r2r(mm6,mm1); // now in mm1 there is | x2 0 x3 0 | movq_r2r(mm0,mm2); // copy mm0 in mm2 movq_r2r(mm1,mm3); // copy mm1 in mm3 punpckldq_r2r(mm0,mm2); // now in mm2 there is |x0 0 x0 0 | punpckldq_r2r(mm1,mm3); // now in mm3 there is |x2 0 x2 0 | movq_r2r(mm0,mm4); // copy mm0 in mm4 movq_r2r(mm1,mm5); // copy mm1 in mm5 punpckhdq_r2r(mm0,mm4); // now in mm4 there is |x1 0 x1 0 | punpckhdq_r2r(mm1,mm5); // now in mm5 there is |x3 0 x3 0 | index = rev[0]; // get the output index movq_r2m(mm2,y[index]); // write |x0 0 x0 0 | to y[index] index = rev[1]; // get the output index movq_r2m(mm4,y[index]); // write |x1 0 x1 0 | to y[index] index = rev[2]; // get the output index movq_r2m(mm3,y[index]); // write |x2 0 x2 0 | to y[index] index = rev[3]; // get the output index movq_r2m(mm5,y[index]); // write |x3 0 x3 0 | to y[index] rev+=4; // increase reverse buffer pointer } } if(flag==1) { /* WARNING: Here we assume the input format to be * [Re0,Im0,Re0,Im0, Re1,Im1,Re1,Im1,... * we do reverse bit ordering and we reorder 4 samples per loop */ for(i=0;i<(size>>2);i++) { movq_m2r(x[0],mm0); // load 1st sample movq_m2r(x[1],mm1); // load 2nd sample movq_m2r(x[2],mm2); // load 3rd sample movq_m2r(x[3],mm3); // load 4th sample index = rev[0]; // get the output index movq_r2m(mm0,y[index]); // write |x0 0 x0 0 | to y[index] index = rev[1]; // get the output index movq_r2m(mm1,y[index]); // write |x1 0 x1 0 | to y[index] index = rev[2]; // get the output index movq_r2m(mm2,y[index]); // write |x2 0 x2 0 | to y[index] index = rev[3]; // get the output index movq_r2m(mm3,y[index]); // write |x3 0 x3 0 | to y[index] rev+=4; // increase reverse buffer pointer x+=4; } } if(flag==3) { // does nothing: starts from FIRST STAGE OF FFT below } if(flag==4) { /* WARNING: Here we assume only the first half of the vector x * is significant: the second half is assumed to be 0 * After reordering the significant samples and the zeros are * interleaved: (the odd samples are 0) so we can modify a bit * the reordering loop in order to avoid the first stage of the IFFT * In fact the first butterfly is (w0 = (1,0))<ul> * <li>X0 = x0+w0*x1 = x0+x1 = x0</li> * <li>X1 = x0-w0*x1 = x0-x1 = x0</li> * </ul> * since x1 = 0 (odd sample) * * Remark: int this way we do not do the shift by 1 in the first stage. * so we decrease no_shift by 1 (if it is not 0); */ if(no_shift!=0) no_shift--; for(i=0;i<(size>>3);i++) { movq_m2r(x[0],mm0); // load 1st sample movq_m2r(x[1],mm1); // load 2nd sample movq_m2r(x[2],mm2); // load 3rd sample movq_m2r(x[3],mm3); // load 4th sample index = rev[0]; // get the output index movq_r2m(mm0,y[index]); // write |x0 0 x0 0 | to y[index] movq_r2m(mm0,y[index+1]); // write |x0 0 x0 0 | to y[index+1] index = rev[1]; // get the output index movq_r2m(mm1,y[index]); // write |x1 0 x1 0 | to y[index] movq_r2m(mm1,y[index+1]); // write |x1 0 x1 0 | to y[index+1] index = rev[2]; // get the output index movq_r2m(mm2,y[index]); // write |x2 0 x2 0 | to y[index] movq_r2m(mm2,y[index+1]); // write |x2 0 x2 0 | to y[index+1] index = rev[3]; // get the output index movq_r2m(mm3,y[index]); // write |x3 0 x3 0 | to y[index] movq_r2m(mm3,y[index+1]); // write |x3 0 x3 0 | to y[index+1] rev+=4; // increase reverse buffer pointer x+=4; } } if(flag==5) { // does nothing: starts from SECOND STAGE OF THE FFT below } // *** FIST FFT STAGE *** does only if flag!=4 (first stage integrated with reordering) if((flag!=4) && (flag!=5)) { // Now the input data is stored in y in reverse bit order in the format |x_i 0 x_i 0| (Re Im Re Im) // SECOND STEP: does the radix-2 FFT itself // - First stage of the FFT // In the first stage we do not use twiddle factors since the twiddle is (1,0) // so we avoid multiplications and shifts // This loop is unrolled by a factor 2 ->(n_b>>1) // So here we implicitly assume size>=4 // For each loop it computes two butterflies // butterfly input : 4 cpx elements : x0, x1, x2, x3 // butterfly output: 4 cpx elements : X0, X1, X2, X3 // twiddles associated with the butterflies: w0 and w1 // X0 = x0+w0*x1 = x0+x1 // X1 = x0-w0*x1 = x0-x1 // X2 = x2+w1*x3 = x2+x3 // X3 = x2-w1*x3 = x2-x3 // For the first stage w0 = w1 = [1 0 0 1] (see twiddle format) n_b = size/2; y_ptr = &y[0]; for(k=0;k<(n_b>>1);k++) { movq_m2r(y_ptr[0],mm1); // 1- load x0 movq_m2r(y_ptr[1],mm0); // 1- load x1 movq_r2r(mm1,mm2); // 1- copy x0 in mm2 paddsw_r2r(mm0,mm1); // 1- compute X0 = x0+x1 psubsw_r2r(mm0,mm2); // 1- compute X1 = x0-x1 psraw_i2r(1, mm1); // 1- shift X0 right by 1 movq_m2r(y_ptr[2],mm3); // 2- load x2 psraw_i2r(1, mm2); // 1- shift X1 right by 1 movq_m2r(y_ptr[3],mm4); // 2- load x3 movq_r2m(mm1,y_ptr[0]); // 1- store X0 in memory movq_r2r(mm3,mm5); // 2- copy x2 in mm5 movq_r2m(mm2,y_ptr[1]); // 1- store X1 in memory paddsw_r2r(mm4,mm3); // 2- compute X2 = x2+x3 psubsw_r2r(mm4,mm5); // 2- compute X3 = x2-x3 psraw_i2r(1, mm3); // 2- shift X2 right by 1 psraw_i2r(1, mm5); // 2- shift X3 right by 1 movq_r2m(mm3,y_ptr[2]); // 2- store X2 in memory movq_r2m(mm5,y_ptr[3]); // 2- store X3 in memory y_ptr +=4; // increase the pointer to the next 4 elements } } // - Stages from 2 to log2size-1 tw_ptr =&twiddle[1]; // for the 2nd stage we start with this twiddle bs =2; // the blocksize is two n_b =size>>2; // half n_b // first we compute log2size-no_shift-1 stages doing shift by 1 (see below) for(i=1;i<log2size-no_shift;i++) { y_ptr = &y[0]; for(k=0;k<n_b;k++) { // bs is greater than or equal to 2 so I can unroll a bit the inner loop // For each loop it computes two butterflies // butterfly input : 4 cpx elements : x0, x1, x2, x3 // butterfly output: 4 cpx elements : X0, X1, X2, X3 // twiddles associated with the butterflies: w0 and w1 // X0 = x0+w0*x1 // X1 = x0-w0*x1 // X2 = x2+w1*x3 // X3 = x2-w1*x3 for(j=0;j<bs;j+=2) { movq_m2r(y_ptr[0],mm1); // 1- load x0 movq_m2r(y_ptr[bs],mm0); // 1- load x1 movq_r2r(mm1,mm2); // 1- copy x0 in mm2 pmaddwd_m2r(tw_ptr[0],mm0); // 1- compute w*x1 (load the twiddle) psrad_i2r(14, mm0); // 1- shift right by SHIFT movq_m2r(y_ptr[1],mm3); // 2- load x0 packssdw_r2r(mm0,mm0); // 1- pack in a 64 bit register [re im re im] movq_m2r(y_ptr[bs+1],mm4); // 2- load x1 paddsw_r2r(mm0,mm1); // 1- add and store the result in mm1 movq_r2r(mm3,mm5); // 2- copy x0 in mm5 psubsw_r2r(mm0,mm2); // 1- sub and store the result in mm2 pmaddwd_m2r(tw_ptr[1],mm4); // 2- compute w*x1 (load the twiddle) psraw_i2r(1, mm1); // 1- shift right by 1 psraw_i2r(1, mm2); // 1- shift right by 1 psrad_i2r(14, mm4); // 2- shift right by SHIFT movq_r2m(mm1,y_ptr[0]); // 1- store the first result packssdw_r2r(mm4,mm4); // 2- pack in a 64 bit register [re im re im] movq_r2m(mm2,y_ptr[bs]); // 1- store the second result paddsw_r2r(mm4,mm3); // 2- add and store the result in mm3 psubsw_r2r(mm4,mm5); // 2- sub and store the result in mm5 psraw_i2r(1, mm3); // 2- shift right by 1 psraw_i2r(1, mm5); // 2- shift right by 1 movq_r2m(mm3,y_ptr[1]); // 2- store the first result movq_r2m(mm5,y_ptr[bs+1]); // 2- store the first result y_ptr +=2; // increase the data pointer tw_ptr +=2; // increase the twiddle pointer } tw_ptr -=bs; // twiddle pointer now points to the first twiddle of this stage y_ptr +=bs; // jump to next block } tw_ptr+=bs; // jump to the first twiddle of the next stage bs <<=1; // double bs n_b >>=1; // half n_b } // and then we do no_shift stages without shift by 1 for(i=log2size-no_shift;i<log2size;i++) { y_ptr = &y[0]; for(k=0;k<n_b;k++) { // bs is greater than or equal to 2 so I can unroll a bit the inner loop // so that every loop computes two butterflies // bs is greater than or equal to 2 so I can unroll a bit the inner loop // For each loop it computes two butterflies // butterfly input : 4 cpx elements : x0, x1, x2, x3 // butterfly output: 4 cpx elements : X0, X1, X2, X3 // twiddles associated with the butterflies: w0 and w1 // X0 = x0+w0*x1 // X1 = x0-w0*x1 // X2 = x2+w1*x3 // X3 = x2-w1*x3 for(j=0;j<bs;j+=2) { movq_m2r(y_ptr[0],mm1); // 1- load x0 movq_m2r(y_ptr[bs],mm0); // 1- load x1 movq_r2r(mm1,mm2); // 1- copy x0 in mm2 pmaddwd_m2r(tw_ptr[0],mm0); // 1- compute w*x1 (load the twiddle) psrad_i2r(14, mm0); // 1- shift right by SHIFT movq_m2r(y_ptr[1],mm3); // 2- load x0 packssdw_r2r(mm0,mm0); // 1- pack in a 64 bit register [re im re im] movq_m2r(y_ptr[bs+1],mm4); // 2- load x1 paddsw_r2r(mm0,mm1); // 1- add and store the result in mm1 movq_r2r(mm3,mm5); // 2- copy x0 in mm5 psubsw_r2r(mm0,mm2); // 1- sub and store the result in mm2 pmaddwd_m2r(tw_ptr[1],mm4); // 2- compute w*x1 (load the twiddle) psrad_i2r(14, mm4); // 2- shift right by SHIFT movq_r2m(mm1,y_ptr[0]); // 1- store the first result packssdw_r2r(mm4,mm4); // 2- pack in a 64 bit register [re im re im] movq_r2m(mm2,y_ptr[bs]); // 1- store the second result paddsw_r2r(mm4,mm3); // 2- add and store the result in mm3 psubsw_r2r(mm4,mm5); // 2- sub and store the result in mm5 movq_r2m(mm3,y_ptr[1]); // 2- store the first result movq_r2m(mm5,y_ptr[bs+1]); // 2- store the first result y_ptr +=2; // increase the data pointer tw_ptr +=2; // increase the twiddle pointer } tw_ptr -=bs; // twiddle pointer now points to the first twiddle of this stage y_ptr +=bs; // jump to next block } tw_ptr+=bs; // jump to the first twiddle of the next stage bs <<=1; // double bs n_b >>=1; // half n_b } emms(); return(0);}unsigned short ReverseBits ( unsigned index, unsigned NumBits ) { unsigned i, rev; for ( i=rev=0; i < NumBits; i++ ) { rev = (rev << 1) | (index & 1); index >>= 1; } return rev;}/** * @short Multiply elementwise two complex vectors of N elements * @param x1 input 1 in the format |Re0 Im0 Re0 Im0|, ... , * |Re(N-1) Im(N-1) Re(N-1) Im(N-1)| We assume x1 with a * dynamic of 15 bit maximum * @param x2 input 2 in the format |Re0 -Im0 Im0 Re0|, ... , * |Re(N-1) -Im(N-1) Im(N-1) Re(N-1)| We assume x2 with a * dynamic of 14 bit maximum * @param y output in the format |Re0 Im0 Re0 Im0|, ... , * |Re(N-1) Im(N-1) Re(N-1) Im(N-1)| * @param N the size f the vectors (this function does N cpx mpy. WARNING: N>=4; * @param log2_amp increase the output amplitude by a factor * 2^log2_amp (default is 0) WARNING: log2_amp>0 can cause overflow!! */int mult_cpx_vector(mmx_t *x1, mmx_t *x2, mmx_t *y, unsigned int N, unsigned short log2_amp) { mmx_t shift; unsigned int i; // loop counter shift.q = 14-log2_amp; // we compute 4 cpx multiply for each loop for( i=0; i<(N>>2); i++ ) { movq_m2r(x1[0],mm0); // 1- load x1[0]) movq_m2r(x2[0],mm1); // 1- load x2[0]) pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0] movq_m2r(x1[1],mm2); // 2- load x1[1]) psrad_m2r(shift, mm0); // 1- shift right by shift in order to compensate for the input amplitude movq_m2r(x2[1],mm3); // 2- load x2[1]) packssdw_r2r(mm0,mm0); // 1- pack in a 64 bit register [re im re im] movq_m2r(x1[2],mm4); // 3- load x1[2]) pmaddwd_r2r(mm3,mm2); // 2- compute x1[1]*x2[1] movq_m2r(x2[2],mm5); // 3- load x2[2]) movq_r2m(mm0,y[0]); // 1- store in memory. psrad_m2r(shift, mm2); // 2- shift right by shift in order to compensate for the input amplitude packssdw_r2r(mm2,mm2); // 2- pack in a 64 bit register [re im re im] movq_m2r(x1[3],mm6); // 4- load x1[3]) movq_r2m(mm2,y[1]); // 2- store in memory. pmaddwd_r2r(mm5,mm4); // 3- compute x1[2]*x2[2] psrad_m2r(shift, mm4); // 3- shift right by shift in order to compensate for the input amplitude movq_m2r(x2[3],mm7); // 4- load x2[3]) packssdw_r2r(mm4,mm4); // 3- pack in a 64 bit register [re im re im] pmaddwd_r2r(mm7,mm6); // 4- compute x1[3]*x2[3] movq_r2m(mm4,y[2]); // 3- store in memory. psrad_m2r(shift, mm6); // 4- shift right by shift in order to compensate for the input amplitude packssdw_r2r(mm6,mm6); // 4- pack in a 64 bit register [re im re im] movq_r2m(mm6,y[3]); // 4- store in memory. x1+=4; x2+=4; y +=4; } emms(); return(0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -