📄 iir_flp32_pipe.asm

📁 这是我在ADSP tiger sharc 201上面实现的OFDM(标准是wimax)同步算法哦!具有非常高的指令效率.
💻 ASM
字号:

/*  iir_flp32_pipe.asm
    Floating point IIR using software pipelining for TS201 AND TS101
    July 2,2003
    32-bit float

	Dec 10, 2003 revision - BL
    Performance optimized/improved from 2.25 to 2.0 cycles/biquad as Nbiq -> infinity

    Improved to 2.0 cycles/biquad by making all coeffs loads
    quad, extra instruction slot created for loop termination jump.
    This optimization also improves bus utilization to free up additional
    time slots for DMA transfers.


-this program can be compiled and run on TS101 and TS201.
-TS101 has only 2 memory blocks (section 1 and 2) where data buffers may be placed.
-TS201 has 5 memory blocks (section 1, 2, 3, 4, 5) where data buffers may be placed.
-In this program only 2 memory blocks have been used to maintain compatibility between
TS101 and TS201

-2 tcl files are provided, one for TS101 and one for TS201. Both of them build
the corresponding project and save the output buffer into a file output.dat
-.align_code 4 directives have been introduced throughout the main part of the program
to optimize the cycle count for TS201 (on this processor they may even be discarded
if the cycle count is not of interest). For TS101, the assembler option -align-branch-lines
placed in the project properties tab has the same effect (on TS101 this alignment is a must
for IF instructions).

-This program implements a real single-sample IIR filter, SECTIONS = Nbiq must be a multiple of 4.
Performance approaches 2.0 cycles/biquad as Nbiq -> infinity.

The overall cycle count of the program may be read at the
address cycle_count in the memory after the execution of the program.

  The example waveform has a sum of 5 sinewaves. Filter coeffs filter 4 out.
  (Four bi-quads per sine to filter). Filter's scalecoef actually should be 
  slightly larger than 1.0 to preserve unity gain. The cycle count is not affected.

  To make use of TigerSharc's parallel processing and to take care of
  computational stalls, software pipelining was used. The number of
  bi-quads is presumed to be a multiple of 4, they are divided into 4 sets
  S1, S2, S3 and S4. Input is denoted X(n), output Y(n) and intermediate
  results are:

    I1 = S1(X),
    I2 = S2(I1),
    I3 = S3(I2)

(Thus, also, Y = S4(I3)).

The parallel execution structure is:

                     ---------
    X(n)     ->     |    S1    | -> I1(n)      done in X block registers xR0, xR4, xR14, xR16, xR8, xR12
                     ---------
                     ---------                                 ||
    I1(n-1)  ->     |    S2    | -> I2(n-1)    done in Y block registers yR0, yR4, yR14, yR16, yR8, yR12
                     ---------
                     ---------
    I2(n-2)  ->     |    S3    | -> I3(n-2)    done in X block registers xR1, xR5, xR15, xR17, xR9, xR13
                     ---------
                     ---------                                 ||
    I3(n-3)  ->     |    S4    | -> Y(n-3)     done in Y block registers yR1, yR5, yR15, yR17, yR9, yR13
                      ---------

Thus, there is a three-sample delay from input to output.
To make this work, the filter coefficients must be interleaved in sets of 4.

Individual bi-quads are of the form

                    1 + b1*z^(-1) + b2*z^(-2)
    H(z) =  scale * -------------------------
                    1 - a1*z^(-1) - a2*z^(-2)

All the scale coefficients are combined into one scalecoef with a single
multiply at the end.

The recursion is implemented using the canonical form:

    w(n) = x(n) + a1*w(n-1) + a2*w(n-2)
    y(n) = w(n) + b1*w(n-1) + b2*w(n-2)

Note that the input x(n) may be the output of the previous bi-quad section.

Note:
    The performance approaches 2.0 cycles/biquad as Nbiq -> infinity

************************************************************************/

// Build-time parameters.
//   SECTIONS : number of biquads (Nbiq). The kernel runs SECTIONS/4 inner-loop
//              passes, so it must be a multiple of 4 (see the file header).
//   N        : number of input samples processed / output samples written.
#define        SECTIONS    16
#define        N          256

/************************************************************************/
.section data1;

// delayline is read through j0 and written through j1 exclusively with
// quad-word accesses (q[j0+=4], q[j1+=4]), so it is aligned to a quad-word
// (4-word) boundary. The previous ".align 2" only guaranteed long-word
// alignment and relied on the buffer happening to land on a quad boundary.
//
// NOTE(review): the initializer below lists exactly 32 values, which matches
// SECTIONS*2 only for SECTIONS = 16; keep the two in step if SECTIONS changes.
.align 4;                                                       // align to quad
.var delayline[SECTIONS*2] = 0.0, 0.0, 0.0, 0.0,                // filter delay line
                             0.0, 0.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 0.0,
                             0.0, 0.0, 0.0, 0.0;

.var input[N] = "indata.dat";                                    // input samples, read one word at a time via j4
.var output[N];                                                  // filtered samples, written one word at a time via j5
.var scalecoef = 1.0;                                            // overall gain applied once to each output sample

.var cycle_count;    // execution cycle count (per the file header, readable here after the run)

/************************************************************************/
.section data2;
// coeffs is only ever read with quad-word loads (q[k0+=4] in the kernel and
// q[j0+=0] / q[j0+=4] in the optional cache preload), so it is aligned to a
// quad-word (4-word) boundary. The previous ".align 2" only guaranteed
// long-word alignment.
.align 4;                                                        // align to quad
.var coeffs[SECTIONS*4] = "coeffs.dat";                          // 4 coefficients per biquad, interleaved in sets of 4 biquads

/************************************************************************/
#ifdef __ADSPTS201__
  #include <defts201.h>
#endif

#include "cache_macros.h"



// Everything below is placed in the "program" section; _main is exported.
.section program;
.global _main;

/************************************** Power up code *****************************************/
// Start-up code located immediately before _main; control falls through
// end_powerup into _main.
// NOTE(review): this code only executes if execution begins at "powerup"
// rather than at _main -- confirm against the linker/loader configuration.
powerup:

#ifdef __ADSPTS201__

/* In the case of TS201, the cache must be enabled at the beginning of the
program. The procedure is contained in the cache_enable macro (not defined in
this file; presumably supplied by cache_macros.h), which takes the refresh
rate as its input parameter:
      -if CCLK=500MHz, refresh_rate=750
      -if CCLK=400MHz, refresh_rate=600
      -if CCLK=300MHz, refresh_rate=450
      -if CCLK=250MHz, refresh_rate=375
*/

  cache_enable(750);

//-------------Optional example to preload cache-----------------------------
// Walks the coeffs buffer one quad word at a time: each pass reads a quad
// into xr3:0 without moving j0, then writes the same quad back and advances
// j0 by 4 words. LC0 = SECTIONS passes * 4 words = SECTIONS*4 words, which is
// the declared size of coeffs. The data itself is left unchanged.

    j0 = j31 + coeffs; LC0 = SECTIONS;;
    nop;nop;;
.align_code 4;
ini_cache:
    xr3:0 = q[j0+=0];;
.align_code 4;
    if NLC0E, jump ini_cache; q[j0+=4] = xr3:0;;

//----------------------------------------------------------------------------

#endif


end_powerup:

// Read the cycle counter (start of the measured interval).
// ini_cycle_count is a macro that is not defined in this file -- presumably it
// comes from cache_macros.h and pairs with comp_cycle_count at the end of
// _main; confirm there which registers it uses.

  ini_cycle_count;

/************************************** Start of code *****************************************/
// One-time setup for the per-sample loop. Register roles established here:
//   j4  - read pointer into input[]
//   j5  - write pointer into output[]
//   LC0 - outer loop counter, one pass per sample (N passes)
//   r9:8 (both compute blocks) - inter-stage values, cleared so the pipeline
//         starts from zero
//   yr10 - scalecoef, multiplied into every output sample
//   j0 / j1 - delay-line read / write pointers (both start at delayline)
//   k0  - coefficient read pointer
_main:
    j4 = j31+input; nop;;                                        			// j4 -> input
    j5 = j31+output; nop;;                                       			// j5 -> output
    LC0=N; r9:8=r9:8-r9:8; yr10=[j31+scalecoef];;                		// N samples, clear xr9:8 and yr9:8, yr10 = scalecoef

    j0 = j31+delayline;;                                         		// j0 -> delayline (read side)
    j1 = j31+delayline;;                                         		// j1 -> delayline (write side)
    k0 = k31+coeffs;;                                            		// k0 -> coeffs

/********************************* Benchmark kernel code ****************************************/
// Outer loop (main_loop): one pass per input sample, N passes (LC0).
// Inner loop (biq): SECTIONS/4 passes (LC1). Each pass is 8 instruction lines
// and advances four biquads at once -- one from each of the sets S1..S4
// described in the file header:
//   even registers (r0, r4, r14, r16, r3, r8, r12): S1 in block X, S2 in block Y
//   odd  registers (r1, r5, r15, r17, r7, r9, r13): S3 in block X, S4 in block Y
// Even and odd lines alternate, so each product is consumed two lines after
// it is issued.
//
// Per biquad (register names for the even stream; the odd stream mirrors it):
//   r0 = w(n-2), r4 = w(n-1), r14 = a2 then b2, r16 = a1 then b1,
//   r3 = current product, r8 = running sum, r12 = new w(n).
//   w(n) = in + a2*w(n-2) + a1*w(n-1);  out = w(n) + b2*w(n-2) + b1*w(n-1)
// The last "+ b1*w(n-1)" of one biquad is folded into the first add of the
// next pass (or into the two adds after the loop), which is how the output of
// one biquad becomes the input of the next.
//
// Delay line: j0 reads w(n-2) and w(n-1); j1 trails it, writing the old
// w(n-1) into the w(n-2) slot and the new w(n) into the w(n-1) slot.
//
// NOTE(review): counting the post-increments, each sample issues
// 1 + 2*(SECTIONS/4) quad loads through j0 and 1 + 4*(SECTIONS/4) through k0,
// i.e. the look-ahead loads in the final pass read one quad beyond delayline
// and beyond coeffs. Those values are overwritten at main_loop before use, but
// the memory following both buffers must be readable -- confirm.
.align_code 4;
main_loop:
        r3=r3-r3; r1:0=q[j0+=4]; r15:14=q[k0+=4];;               		// r3=0; r0/r1 = w(n-2) of first biquads (even/odd streams); r14/r15 = their a2
        r7=r7-r7; xr8=[j4+=1];;                    						// r7=0; xr8 = next input sample x(n) (input to S1)
        LC1 = SECTIONS/4;;                          					// inner-loop counter: SECTIONS/4 passes, 4 biquads per pass

.align_code 4;
biq:                                                             		// Inner loop computations are SIMD, using both X and Y comp blocks
            fr3=r0*r14; fr8=r8+r3; r5:4=q[j0+=4];    r17:16=q[k0+=4];;  // even: r3=a2*w(n-2); r8 = biquad input (adds pending product, 0 on first pass); load r4/r5=w(n-1), r16/r17=a1
            fr7=r1*r15; fr9=r9+r7; q[j1+=4]=r5:4;;     					// odd:  r7=a2*w(n-2); r9 = biquad input; store old w(n-1) as the next sample's w(n-2)
            fr3=r4*r16; fr8=r8+r3;                   r15:14=q[k0+=4];;  // even: r3=a1*w(n-1); r8 = in + a2*w(n-2); load r14/r15=b2
            fr7=r5*r17; fr9=r9+r7;;     								// odd:  r7=a1*w(n-1); r9 = in + a2*w(n-2)
            fr3=r0*r14; fr12=r8+r3;                  r17:16=q[k0+=4];;  // even: r3=b2*w(n-2); r12 = new w(n); load r16/r17=b1
            fr7=r1*r15; fr13=r9+r7; r1:0=q[j0+=4];;    					// odd:  r7=b2*w(n-2); r13 = new w(n); load r0/r1 = w(n-2) of the next biquads
            fr3=r4*r16; fr8=r12+r3; q[j1+=4]=r13:12; r15:14=q[k0+=4];;  // even: r3=b1*w(n-1); r8 = w(n)+b2*w(n-2); store new w(n) as the next sample's w(n-1); load next a2
.align_code 4;
			if NLC1E, jump biq;
            fr7=r5*r17; fr9=r13+r7;;   									// odd:  r7=b1*w(n-1); r9 = w(n)+b2*w(n-2); loop while LC1 has not expired

// Loop epilogue: finish the last biquad of each set, rewind the pointers and
// shift every stage output one stage down the S1 -> S2 -> S3 -> S4 chain.
.align_code 4;
        fr8=r8+r3; j1=j31+delayline;;                            		// add last b1 term: xr8=I1(n), yr8=I2(n-1); rewind j1 for next sample
        fr9=r9+r7; j0=j31+delayline;;                            		// add last b1 term: xr9=I3(n-2), yr9=Y(n-3); rewind j0 for next sample
        yfr12=r9*r10; yr9=xr9;;                                  		// yr12 = Y(n-3)*scalecoef; yr9=I3(n-2) becomes the next S4 input
        xr9=yr8; k0=k31+coeffs;;                                 		// xr9=I2(n-1) becomes the next S3 input; rewind k0 for next sample

.align_code 4;
        if NLC0E, jump main_loop; yr8=xr8; [j5+=1]=yr12;;   	 		// write output sample; yr8=I1(n) becomes the next S2 input; loop while LC0 has not expired

/******************************************* Done ***********************************************/
// Read the cycle counter again and compute the program's cycle count.
// comp_cycle_count is a macro that is not defined in this file -- presumably
// it comes from cache_macros.h and stores its result at cycle_count, as the
// file header describes; confirm there.

  comp_cycle_count;


// End of _main: the program parks itself in an endless jump-to-self so it
// never runs past the end of the code.
_main.end:
___lib_prog_term:
    jump ___lib_prog_term (NP);;                                      // done: spin here forever
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -