📄 estimation.c
字号:
/** * Channel Estimation Routines * * Alessandro Nordio, 01/03 * Linus Gasser 02/10/25 * * This version is optimized for MMX. * */#include "system.h"#include "mmx.h"#include "estimation.h"#include "debugging.h"#define DBG_LVL 2/** * @short fft of 2048 16-bit ints */int channel_estimation_2048(mmx_t *x, mmx_t *y, mmx_t *seq, mmx_t *tw, mmx_t *tw2, unsigned short *rev) { mmx_t temp[2048]; // temp buffer PR_DBG( 4, "ce of 2048\n" ); // does FFT of the input, (real input) and reverse bit ordering fft_radix2_fixpoint_mmx(x, &temp[0], tw, rev,11,5,0); // multiply temp and seq vectors : temp[i] = temp[i]*seq[i] mult_cpx_vector(&temp[0], &seq[0],&temp[0], 512,5); // multiply temp and seq vectors : temp[i] = temp[i]*seq[i] mult_cpx_vector(&temp[512], &seq[0],&temp[512],512,5); /* do IFFT of temp and put the result in y (this routine assumes * the second half of the vector x to be 0 (no need to zero it) */ fft_radix2_fixpoint_mmx(&temp[0],y, tw2,rev,11,4,4); return(0);}/** * @short fft of 512 16-bit ints */int channel_estimation_512(mmx_t *x,mmx_t *y, mmx_t *seq, mmx_t *tw, mmx_t *tw2, unsigned short *rev) { mmx_t temp[512]; // temp buffer PR_DBG( 4, "ce of 512\n" ); // does FFT of the input, (real input) and reverse bit ordering fft_radix2_fixpoint_mmx(x, &temp[0], tw, rev,9,5,0); // multiply temp and seq vectors : temp[i] = temp[i]*seq[i] mult_cpx_vector(&temp[0], &seq[0],&temp[0], 128,5); // multiply temp and seq vectors : temp[i] = temp[i]*seq[i] mult_cpx_vector(&temp[128], &seq[0],&temp[128],128,5); /* do IFFT of temp and put the result in y (this routine assumes * the second half of the vector x to be 0 (no need to zero it) */ fft_radix2_fixpoint_mmx(&temp[0],y, tw2,rev,9,3,4); return(0);}/** * @short fft of 768 16-bit ints */int channel_estimation_768(mmx_t *x,mmx_t *y, mmx_t *seq, mmx_t *tw, mmx_t *tw2, unsigned short *rev) { mmx_t temp[768]; // temp buffer PR_DBG( 4, "ce of 768\n" ); // does FFT of the input, (real input -> flag = 0) fft_768_fixpoint_mmx(&x[0], &temp[0], tw, &rev[0],4,0); // multiply temp and seq vectors : temp[i] = temp[i]*seq[i] mult_cpx_vector(&temp[0], &seq[0],&temp[0], 192, 2); // multiply temp and seq vectors : temp[i] = temp[i]*seq[i] mult_cpx_vector(&temp[192], &seq[0],&temp[192],192, 2); /** do IFFT of temp and put the result in y (this routine assumes * the second half of the vector x to be 0 (no need to zero it) */ fft_768_fixpoint_mmx(&temp[0], &y[0], tw2, &rev[0],2,2); return(0);}/** * @short This function computes a 768 point complex FFT in fixed point */int fft_768_fixpoint_mmx(mmx_t *x, mmx_t *y, mmx_t *twiddle, unsigned short *rev, unsigned short log2_amp, unsigned short flag) { reverse_order768(&x[0],&y[0],&rev[0],flag); // does reverse bit ordering if(flag==0) { // does 3 256-points FFTs fft_radix2_fixpoint_mmx(&y[0], &y[0], &twiddle[0],&rev[0],8,log2_amp,3); fft_radix2_fixpoint_mmx(&y[256], &y[256], &twiddle[0],&rev[0],8,log2_amp,3); fft_radix2_fixpoint_mmx(&y[512], &y[512], &twiddle[0],&rev[0],8,log2_amp,3); butterfly3(&y[0],&y[0],&twiddle[255],256,1); } else if(flag==2) { // does 3 256-points IFFTs fft_radix2_fixpoint_mmx(&y[0], &y[0], &twiddle[0],&rev[0],8,log2_amp,5); fft_radix2_fixpoint_mmx(&y[256], &y[256], &twiddle[0],&rev[0],8,log2_amp,5); fft_radix2_fixpoint_mmx(&y[512], &y[512], &twiddle[0],&rev[0],8,log2_amp,5); butterfly3(&y[0],&y[0],&twiddle[255],256,0); } return(0);}/* * flag = 0 input is real x0,x1,x2,x3,... * flag = 1 input is cpx and in mmx_t format Re0 Im0 re0 Im0, .... * flag = 2 as in 1 but the second half of the input vector is assumed to be 0 */int reverse_order768(mmx_t *x,mmx_t *y,unsigned short *rev,unsigned short flag) { int i; short index; if(flag==0) { pxor_r2r(mm6,mm6); // clear mm6: in mm6 there is always 0 for(i=0;i<192;i++) { // reverse 4 samples at a time => does the loop 192 times 192*4 = 768 movq_m2r(x[i],mm0); // load x0, x1,x2 and x3 at the same time movq_r2r(mm0,mm1); // and make a copy of it punpcklwd_r2r(mm6,mm0); // now in mm0 there is | x0 0 x1 0 | punpckhwd_r2r(mm6,mm1); // now in mm1 there is | x2 0 x3 0 | movq_r2r(mm0,mm2); // copy mm0 in mm2 movq_r2r(mm1,mm3); // copy mm1 in mm3 punpckldq_r2r(mm0,mm2); // now in mm2 there is |x0 0 x0 0 | punpckldq_r2r(mm1,mm3); // now in mm3 there is |x2 0 x2 0 | movq_r2r(mm0,mm4); // copy mm0 in mm4 movq_r2r(mm1,mm5); // copy mm1 in mm5 punpckhdq_r2r(mm0,mm4); // now in mm4 there is |x1 0 x1 0 | punpckhdq_r2r(mm1,mm5); // now in mm5 there is |x3 0 x3 0 | index = rev[0]; // get the output index movq_r2m(mm2,y[index]); // write |x0 0 x0 0 | to y[index] index = rev[1]; // get the output index movq_r2m(mm4,y[index]); // write |x1 0 x1 0 | to y[index] index = rev[2]; // get the output index movq_r2m(mm3,y[index]); // write |x2 0 x2 0 | to y[index] index = rev[3]; // get the output index movq_r2m(mm5,y[index]); // write |x3 0 x3 0 | to y[index] rev+=4; // increase reverse buffer pointer } } // end flag=0 if(flag==1) { // WARNING: Here we assume the input format to be [Re0,Im0,Re0,Im0, Re1,Im1,Re1,Im1,..... // we do reverse bit ordering and we reorder 4 samples per loop for(i=0;i<192;i++) { movq_m2r(x[0],mm0); // load 1st sample movq_m2r(x[1],mm1); // load 2nd sample movq_m2r(x[2],mm2); // load 3rd sample movq_m2r(x[3],mm3); // load 4th sample index = rev[0]; // get the output index movq_r2m(mm0,y[index]); // write |x0 0 x0 0 | to y[index] index = rev[1]; // get the output index movq_r2m(mm1,y[index]); // write |x1 0 x1 0 | to y[index] index = rev[2]; // get the output index movq_r2m(mm2,y[index]); // write |x2 0 x2 0 | to y[index] index = rev[3]; // get the output index movq_r2m(mm3,y[index]); // write |x3 0 x3 0 | to y[index] rev+=4; // increase reverse buffer pointer x+=4; } } // end flag 1 if(flag==2) { /* WARNING: Here we assume only the first half of the vector x * is significant: the second half is assumed to be 0. After reordering * the significant samples and the zeros are interleaved: * (the odd samples are 0) So we can modify a bit the reordering * loop in order to avoid the first stage of the IFFT. In fact the * first butterfly is (w0 = (1,0))<ul> * <li>X0 = x0+w0*x1 = x0+x1 = x0</li> * <li>X1 = x0-w0*x1 = x0-x1 = x0</li> * </ul> * since x1 = 0 (odd sample) */ for(i=0;i<96;i++) { movq_m2r(x[0],mm0); // load 1st sample movq_m2r(x[1],mm1); // load 2nd sample movq_m2r(x[2],mm2); // load 3rd sample movq_m2r(x[3],mm3); // load 4th sample index = rev[0]; // get the output index movq_r2m(mm0,y[index]); // write |x0 0 x0 0 | to y[index] movq_r2m(mm0,y[index+1]); // write |x0 0 x0 0 | to y[index+1] index = rev[1]; // get the output index movq_r2m(mm1,y[index]); // write |x1 0 x1 0 | to y[index] movq_r2m(mm1,y[index+1]); // write |x1 0 x1 0 | to y[index+1] index = rev[2]; // get the output index movq_r2m(mm2,y[index]); // write |x2 0 x2 0 | to y[index] movq_r2m(mm2,y[index+1]); // write |x2 0 x2 0 | to y[index+1] index = rev[3]; // get the output index movq_r2m(mm3,y[index]); // write |x3 0 x3 0 | to y[index] movq_r2m(mm3,y[index+1]); // write |x3 0 x3 0 | to y[index+1] rev+=4; // increase reverse buffer pointer x+=4; } } emms(); return(0);}/** * @short compute the reverse budder for a 768 point FFT */int reverse_buffer768(unsigned short *buf) { int i; unsigned short j; for(i=0;i<256;i++) { j = ReverseBits(i,8); buf[3*i] = j; buf[3*i+1] = 256+j; buf[3*i+2] = 512+j; } return(0);}/** * @short This function computes N butterflies of size 3 */int butterfly3( mmx_t *x, mmx_t *y, mmx_t *tw, unsigned int N, unsigned short flag ) { int i; mmx_t *tw1,*tw2; tw1 = &tw[0]; tw2 = &tw[3*N]; if(flag==0) { for(i=0;i<N;i++) { movq_m2r(x[i+N],mm1); // load x1 movq_m2r(x[i+2*N],mm2); // load x2 movq_r2r(mm1,mm3); // make a copy of x1 movq_m2r(x[i] ,mm0); // load x0 movq_r2r(mm2,mm4); // make a copy of x2 pmaddwd_m2r(tw1[i+N],mm3); // compute w1b*x1 movq_r2r(mm1,mm5); // make a copy of x1 pmaddwd_m2r(tw2[i+N],mm4); // compute w2b*x2 movq_r2r(mm2,mm6); // make a copy of x2 pmaddwd_m2r(tw1[i],mm1); // compute w1a*x1 paddd_r2r(mm3,mm4); // compute zb = w1b*x1+w2b*x2 pmaddwd_m2r(tw2[i],mm2); // compute w2a*x2 psrad_i2r(14, mm4); // shift zb by 14 (twiddle dinamic) paddd_r2r(mm1,mm2); // compute za = w1a*x1+w2a*x2 packssdw_r2r(mm4,mm4); // pack zb in the format [re im re im] psrad_i2r(14, mm2); // shift za by 14 (twiddle dinamic) paddw_r2r(mm0,mm4); // compute X1 = zb+x0 pmaddwd_m2r(tw1[i+2*N],mm5); // compute w1c*x1 psraw_i2r(1, mm4); // shift X1 right by 1 pmaddwd_m2r(tw2[i+2*N],mm6); // compute w2c*x2 movq_r2m(mm4,y[i+N]); // store X1 packssdw_r2r(mm2,mm2); // pack za in the format [re im re im] paddd_r2r(mm5,mm6); // compute zc = w1c*x1+w2c*x2 paddw_r2r(mm0,mm2); // compute X0 = za+x0 psrad_i2r(14, mm6); // shift zc by 14 (twiddle dinamic) psraw_i2r(1, mm2); // shift X0 right by 1 packssdw_r2r(mm6,mm6); // pack zc in the format [re im re im] movq_r2m(mm2,y[i]); // store X0 paddw_r2r(mm0,mm6); // compute X2 = zc+x0 psraw_i2r(1, mm6); // shift X2 right by 1 movq_r2m(mm6,y[i+2*N]); // store X2 } } if(flag==1) { /* this loop computes half of the butterflies but only * for the first and second thirds of the output vector */ for(i=0;i<(N>>1);i++) { movq_m2r(x[i+N],mm1); // load x1 movq_m2r(x[i+2*N],mm2); // load x2 movq_r2r(mm1,mm3); // make a copy of x1 movq_m2r(x[i] ,mm0); // load x0 movq_r2r(mm2,mm4); // make a copy of x2 pmaddwd_m2r(tw1[i],mm1); // compute w1a*x1 pmaddwd_m2r(tw2[i],mm2); // compute w2a*x2 pmaddwd_m2r(tw1[i+N],mm3); // compute w1b*x1 paddd_r2r(mm1,mm2); // compute za = w1a*x1+w2a*x2 pmaddwd_m2r(tw2[i+N],mm4); // compute w2b*x2 paddd_r2r(mm3,mm4); // compute zb = w1b*x1+w2b*x2 psrad_i2r(14, mm2); // shift za by 14 (twiddle dinamic) psrad_i2r(14, mm4); // shift zb by 14 (twiddle dinamic) packssdw_r2r(mm2,mm2); // pack za in the format [re im re im] packssdw_r2r(mm4,mm4); // pack zb in the format [re im re im] paddw_r2r(mm0,mm2); // compute X0 = za+x0 paddw_r2r(mm0,mm4); // compute X1 = zb+x0 psraw_i2r(1, mm2); // shift X0 right by 1 psraw_i2r(1, mm4); // shift X1 right by 1 movq_r2m(mm2,y[i]); // store X0 movq_r2m(mm4,y[i+N]); // store X1 } // this loop computes the rest of the butterflies for(i=(N>>1);i<N;i++) { movq_m2r(x[i+N],mm1); // load x1 movq_m2r(x[i+2*N],mm2); // load x2 movq_m2r(x[i],mm0); // load x0 pmaddwd_m2r(tw1[i],mm1); // compute w1a*x1 pmaddwd_m2r(tw2[i],mm2); // compute w2a*x2 paddd_r2r(mm1,mm2); // compute za = w1a*x1+w2a*x2 psrad_i2r(14, mm2); // shift za by 14 (twiddle dinamic) packssdw_r2r(mm2,mm2); // pack za in the format [re im re im] paddw_r2r(mm0,mm2); // compute X0 = za+x0 psraw_i2r(1, mm2); // shift X0 right by 1 movq_r2m(mm2,y[i]); // store X0 } } emms(); return(0);}/** * @short FFT base 2 in fixed point */int fft_radix2_fixpoint_mmx(mmx_t *x, mmx_t *y, mmx_t *twiddle, unsigned short *rev, unsigned short log2size, unsigned short log2_amp, short flag) { int i,j,k; // counters int bs; // block_size int n_b; // numer of blocks short int index; // reverse index
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -