📄 accelerator_optimized_fft.c

📁 在cycloneII里实现对FFT的硬件加速
💻 C
字号:
#include "accelerator_optimized_fft.h"
#include "pound_defines.h"
#include "system.h"
#include "alt_types.h"

/*
 * accelerator_optimized_fft
 *
 * Fixed-point FFT built from radix-2 butterflies over NUM_POINTS complex
 * samples, using four ping-pong working buffers (BufferRAM1..4) and two
 * twiddle tables (CosRAM, SinRAM).  NUM_POINTS, FFT_SIZE, PRESCALE, bitrev()
 * and the RAM symbols are not defined in this file (presumably they come from
 * pound_defines.h / system.h -- confirm).
 *
 * The function runs in three stages:
 *   1. Input:   read NUM_POINTS 32-bit words from InData, split each word
 *               into a 16-bit real part (low half) and a 16-bit imaginary
 *               part (high half), and store them at the index returned by
 *               bitrev().
 *   2. Compute: FFT_SIZE passes of butterflies; each pass reads from the
 *               "Read" buffers, writes to the "Write" buffers, and then swaps
 *               the two sets.
 *   3. Output:  repack the real/imaginary results into 32-bit words (imaginary
 *               in the high half, real in the low half) and store NUM_POINTS
 *               of them to OutData.
 *
 * Parameters:
 *   InData  - input samples; accessed through an alt_u32 pointer, so each
 *             element consumed is one packed real/imag pair.
 *   OutData - output samples; written through an alt_u32 pointer using the
 *             same packing as the input.
 *
 * NOTE(review): both parameters are declared alt_16* but are accessed as
 * alt_u32*; this presumably requires the caller's buffers to be 32-bit
 * aligned and NUM_POINTS words long -- confirm against the caller.
 * NOTE(review): the loop bounds look like they assume
 * NUM_POINTS == (1 << FFT_SIZE) -- confirm against the defines.
 * NOTE(review): all intermediate results are stored in alt_16 variables and
 * buffers, so values are narrowed on every store; no overflow handling is
 * visible here.
 */
void accelerator_optimized_fft(alt_16 * __restrict__ InData,  /* packed input: read as 32-bit words, real in low half, imag in high half */
                        alt_16 * __restrict__ OutData)        /* packed output: written as 32-bit words, same layout as the input */

{
  alt_u16   twiddle_index;      /* current index into the cosine/sine tables */
  alt_u16   twiddle_incr;       /* table step between successive twiddles in a stage */
  alt_u16   loop_element;       /* butterfly group size for the stage (1 << stage_index) */
  alt_u16   loop_element_div2;  /* distance between the two inputs of a butterfly */
  alt_u16   l;                  /* index of the second butterfly input/output */
  alt_u16   sub_stage_index;    /* selects the twiddle factor within a stage */
  alt_u16   stage_index;        /* current stage, 1..FFT_SIZE */
  alt_u16   butterfly_index;    /* index of the first butterfly input/output */

  /* These restricted pointers make up the bulk of the avalon masters
   * in this system */
  /* NOTE(review): each "Port2" pointer is assigned the same RAM as its
   * partner below even though both are __restrict__ qualified; presumably
   * this is deliberate so the hardware compile produces one master per
   * memory port -- confirm against the tool documentation. */
  /* Real data ping pong buffers */
  alt_16   * __restrict__ BufferedRealCalcDataRead;
  alt_16   * __restrict__ BufferedRealCalcDataReadPort2; 
  alt_16   * __restrict__ BufferedRealCalcDataWrite;
  alt_16   * __restrict__ BufferedRealCalcDataWritePort2;    
  /* Imaginary data ping pong buffers */
  alt_16   * __restrict__ BufferedImagCalcDataRead;
  alt_16   * __restrict__ BufferedImagCalcDataReadPort2;
  alt_16   * __restrict__ BufferedImagCalcDataWrite;
  alt_16   * __restrict__ BufferedImagCalcDataWritePort2;  
  /* Cosine and Sine Tables  */
  alt_16   * __restrict__ CosineTable;
  alt_16   * __restrict__ SineTable;
  /* Pointers to in and out buffers in main memory, viewed as 32-bit words
   * so that one access moves a real/imaginary pair */
  alt_u32  * __restrict__ tempInputPtr = (alt_u32 *)InData;
  alt_u32  * __restrict__ tempOutputPtr = (alt_u32 *)OutData;

  /* Temporary registers for the input stage */
  alt_u16   bit_rev_index;
  alt_u32   tempInput;

  /* Registers for the calculation stage */
  alt_16   CosReal;
  alt_16   SinReal;
  alt_16   tRealData;
  alt_16   tImagData;
  alt_16   temp1, temp2, temp3, temp4;

  /* Counters for the input and output stages */
  alt_u16  inputCounter, outputCounter;
  
  /* Assign the ping-pong buffers their default address locations:
   * real data reads from BufferRAM1 and writes to BufferRAM2,
   * imaginary data reads from BufferRAM3 and writes to BufferRAM4 */
  BufferedRealCalcDataRead = BufferRAM1;
  BufferedRealCalcDataReadPort2 = BufferRAM1;  
  BufferedRealCalcDataWrite = BufferRAM2;
  BufferedRealCalcDataWritePort2 = BufferRAM2;
  
  BufferedImagCalcDataRead = BufferRAM3;
  BufferedImagCalcDataReadPort2 = BufferRAM3;  
  BufferedImagCalcDataWrite = BufferRAM4;
  BufferedImagCalcDataWritePort2 = BufferRAM4;                                                               
  
  /* Point the Cosine and Sine Tables to the CosRAM and SinRAM on-chip memory
   * buffers.  These memories are local to the accelerator and are not shared
   * with the Nios II processor. */                    
  CosineTable = CosRAM;
  SineTable = SinRAM;

  /* Data input buffering (Stage 1) */
  
  /* Calculate the bit-reversal index and read
   * 32 bits of data from the input buffer in SDRAM (real and imaginary pair).
   * Split the data read into halves and write them into the real and imaginary
   * buffers concurrently.  The low 16 bits are the real part and the high
   * 16 bits are the imaginary part. */ 
  for (inputCounter = 0; inputCounter < NUM_POINTS; inputCounter++) {
    bit_rev_index = bitrev(inputCounter);  
     
    tempInput = tempInputPtr[inputCounter];    
	  BufferedRealCalcDataRead[bit_rev_index] = (alt_16)(tempInput & 0x0000FFFF); 
    BufferedImagCalcDataRead[bit_rev_index] = (alt_16)((tempInput & 0xFFFF0000)>>16); 
  }

  /* FFT Computation (Stage 2) */
  /* Step through the fft stages; the butterfly group size doubles on every
   * stage (2, 4, ..., 1 << FFT_SIZE) */
  for (stage_index = 1; stage_index <= FFT_SIZE; stage_index++) {
    loop_element = 1<<stage_index;
    loop_element_div2 = loop_element/2;

    /* Initialize twiddle factor lookup indices.  The table stride halves as
     * the group size doubles, so the last stage steps through the table one
     * entry at a time. */
    twiddle_index = 0;
    twiddle_incr = 1 << (FFT_SIZE-stage_index);

    /* Step through the butterflies */
    for(sub_stage_index = 0; sub_stage_index < loop_element_div2; sub_stage_index++) {

      /* Lookup twiddle factors */
      CosReal = CosineTable[twiddle_index];
      SinReal = SineTable[twiddle_index];

      /* Process butterflies with the same twiddle factors: one butterfly per
       * group, at offset sub_stage_index within each group of loop_element
       * points */
      for(butterfly_index = sub_stage_index; butterfly_index < NUM_POINTS; butterfly_index += loop_element) {
        l = butterfly_index + loop_element_div2;

        /* using temps (regs) to allow this to happen concurrently since
         * these are DP RAM accesses that do not overlap.  We are using read
         * pointers here so that the write pointers at the bottom can work in
         * parallel */
        temp1 = BufferedRealCalcDataRead[l];
        temp2 = BufferedImagCalcDataRead[l];
        temp3 = BufferedRealCalcDataReadPort2[butterfly_index];
        temp4 = BufferedImagCalcDataReadPort2[butterfly_index];
      
        /* Scale twiddle products to accommodate 16 bit storage */
        /* CosReal, SinReal, temp1, and temp2 are all registers so no
         * waiting occurs here (this happens concurrently) */
        /* This is the complex product (temp1 + j*temp2) * (CosReal - j*SinReal),
         * shifted right by PRESCALE and narrowed to alt_16 */
        tRealData = (( CosReal * temp1 ) + ( SinReal * temp2 ))>> PRESCALE;
        tImagData = (( CosReal * temp2 ) - ( SinReal * temp1 ))>> PRESCALE;

        /* tRealData, tImagData, temp3, temp4 are all registers so no
         * waiting occurs here (this happens concurrently).  We are using write
         * pointers here so that the read pointers at the top can work in
         * parallel */
        /* Butterfly outputs: the upper element gets (a - t), the lower element
         * gets (a + t), where a is the point at butterfly_index and t is the
         * twiddled point computed above */
        BufferedRealCalcDataWrite[l] = temp3 - tRealData;
        BufferedImagCalcDataWrite[l] = temp4 - tImagData;
        BufferedRealCalcDataWritePort2[butterfly_index] = temp3 + tRealData;
        BufferedImagCalcDataWritePort2[butterfly_index] = temp4 + tImagData;    
		            
      }
      twiddle_index += twiddle_incr;
    }
      /* Ping-Pong Buffering 
       * At the end of each iteration of the stage loop it is time to swap the
       * ping-pong buffers.  So for example we want the BufferedRealCalcDataRead
       * buffer to point to the location in memory where
       * BufferedRealCalcDataWrite was stored on the previous pass.  We can do
       * this with a straight assignment.  Then we need to have the
       * BufferedRealCalcDataWrite buffer point to the address in memory where
       * BufferedRealCalcDataRead was stored on the previous pass.  We have
       * already updated BufferedRealCalcDataRead to point to
       * BufferedRealCalcDataWrite's address.  So to update
       * BufferedRealCalcDataWrite we can take advantage of the fact that the
       * buffers are dual-ported and assign it to the location where
       * BufferedRealCalcDataReadPort2 was pointing on the previous pass.
       *
       * The statement order matters: ReadPort2 still holds the old read
       * address when it is copied into Write, and only afterwards are the
       * two Port2 pointers re-synchronised with their partners. */
       
      BufferedRealCalcDataRead = BufferedRealCalcDataWrite;
      BufferedRealCalcDataWrite = BufferedRealCalcDataReadPort2;
      BufferedRealCalcDataReadPort2 = BufferedRealCalcDataRead;
      BufferedRealCalcDataWritePort2 = BufferedRealCalcDataWrite;
      
      BufferedImagCalcDataRead = BufferedImagCalcDataWrite;
      BufferedImagCalcDataWrite = BufferedImagCalcDataReadPort2;
      BufferedImagCalcDataReadPort2 = BufferedImagCalcDataRead;
      BufferedImagCalcDataWritePort2 = BufferedImagCalcDataWrite;   
  }

  /* returning the interleaved results to sdram (Stage 3)
   * Since the data is 16 bit and interleaved we'll stick the real and
   * imaginary parts together and send them off to sdram.  Because of the
   * swap at the end of the last stage, the "Read" pointers now address the
   * buffers holding the final results.  Each half is masked to 16 bits
   * after the cast so that sign extension of a negative value cannot spill
   * into the other half of the word. */ 
  for(outputCounter = 0; outputCounter < NUM_POINTS; outputCounter++) {
    tempOutputPtr[outputCounter] = (((alt_u32)(BufferedImagCalcDataRead[outputCounter]) & 0x0000FFFF)<<16) | ((alt_u32)BufferedRealCalcDataRead[outputCounter] & 0x0000FFFF);
  }
}
💿 文件大小 38 K
👤 上传用户 lemon_zc1949
📂 所属分类其他嵌入式/单片机内容
🏷️ 相关标签

#cycloneII #FFT #硬件加速
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -