// File: bkiir_flp32.asm
// (web code-viewer residue: "Font size:" control label)
/*************************************************************************
-rev1.0 PM, 9/2003
-rev1.1 PM, 7/2004, passed in VDSP 3.5
-this program implements a floating point block IIR
-it can be compiled and run on TS101 and TS201.
-TS101 has only 2 memory blocks (section 1 and 2) where data buffers may be placed.
-TS201 has 5 memory blocks (section 1, 2, 3, 4, 5) where data buffers may be placed.
-In this program only 2 memory blocks have been used to maintain compatibility between
TS101 and TS201
-2 tcl files are provided, one for TS101 and one for TS201. Each of them builds
the corresponding project and saves the output buffer into a file output.dat
-.align_code 4 directives have been introduced throughout the main part of the program
to optimize the cycle count for TS201 (on this processor they may even be discarded
if the cycle count is not of interest). For TS101, the assembly option -align-branch-lines
placed in the project properties tab has the same effect (on TS101 they are a must for
IF instructions).
OVERVIEW:
==============================
32-bit Float
Fourth order IIR similar to a direct form II for TS201.
I/O FILES USED:
=====================
The data is stored in the following buffers:
inputs: N samples of input data
delayline 4 elements of the delay line
output: N samples of output data computed by this program
DESCRIPTION:
============
This is a 4th order IIR filter. The usual implementation is type II, having a
single delay line for both the feedback and feedforward coefficients. However,
the implementation presented here is a variation of the type II: it maintains
two copies of the same delay line, one copy in each compute block. This way,
the feedback MACs are computed in CBY, and the forward MACs in CBX.
The program is organized to simulate a real implementation of such filters:
-a block of N input data is filtered saving the outputs into the output buffer
and the delay line into the delayline buffer
The equations of the filter are:
w(n)=x(n)*scale+w(n-1)*a4+w(n-2)*a3+w(n-3)*a2+w(n-4)*a1
y(n)=w(n)+w(n-1)*b4+w(n-2)*b3+w(n-3)*b2+w(n-4)*b1
The coefficients a4,a3,a2,a1,b4,b3,b2,b1 are saved in the coeffs buffer
in the following order: a2,a4,b2,b4,a1,a3,b1,b3
The delay line saves w(n-1), w(n-2), w(n-3), w(n-4) in the following order:
w(n-3), w(n-1), w(n-4), w(n-2).
This order simplifies the process that updates the delay line. For example,
the delay line is loaded in the following registers:
r3=w(n-1) r2=w(n-3) r1=w(n-2) r0=w(n-4)
When w(n) must be saved into the delay line, r1:0 is shifted right 32 bits
throwing away w(n-4) and making place for w(n). The delay line looks now as:
r3=w(n-1) r2=w(n-3) r1=w(n) r0=w(n-2) and is saved in the following order:
w(n-2), w(n), w(n-3), w(n-1).
This procedure repeats each time the filter is used.
-The cycle count on TS201 is: 7*N+17=227 cycles (260 on TS101).
-The efficiency is 5*N/(7*N+17)=150/227=66%
- at the end of the program, the cycle_count variable contains the
cycle count of the main program
*************************************************************************/
// Build-time parameters.
// N: number of data points in the input block. It sizes the inputs,
// expected_output and output buffers; the main loop counter is set to N-1.
#define N 30
// SECTIONS: number of delay-line elements; the coeffs buffer holds 2*SECTIONS words.
#define SECTIONS 4
// scale: gain multiplied with every input sample x(n) before the feedback sum.
#define scale 0.05078125
/************************************************************************/
// Buffers placed in section data1.
// NOTE(review): the .align 4 directives presumably provide the quad-word
// alignment needed by the q[...] accesses in the program - confirm.
.section data1;
.align 4;
// Input samples x(0)..x(N-1), fetched one word at a time through j1.
.var inputs[N] =
5000.0, 4333.0, 5465.0, 13556.0, 7423.0, -5000.0, -4333.0, -5465.0, -13556.0, -7423.0,
5000.0, 4333.0, 5465.0, 13556.0, 7423.0, -5000.0, -4333.0, -5465.0, -13556.0, -7423.0,
5000.0, 4333.0, 5465.0, 13556.0, 7423.0, -5000.0, -4333.0, -5465.0, -13556.0, -7423.0;
// NOTE(review): cycle_count directly follows inputs; the pipelined input
// fetch in the filter loop reads one word past inputs[N-1], which appears to
// land here. The value read is never used in a stored result.
.var cycle_count; // execution cycle counts
.align 4;
// Reference results for the inputs above. Not referenced by the program code
// in this file; presumably meant for comparison with the output buffer.
.var expected_output[N] =
253.906250, 1394.910156, 3400.899369, 5451.189211, 7461.879024,
8509.488592, 5519.537707, -1306.039745, -6936.575354, -8920.200002,
-8397.397895, -4821.647412, 1507.253965, 6689.519912, 8738.028980,
8444.656939, 4922.543154, -1492.260614, -6730.424864, -8759.801131,
-8433.783468, -4908.582993, 1492.343760, 6724.040670, 8757.440081,
8435.857839, 4910.422599, -1492.627568, -6724.989198, -8757.654792;
.align 4;
// Filter state, stored in the order w(n-3), w(n-1), w(n-4), w(n-2).
// Starts at zero (filter at rest).
.var delayline[SECTIONS]=0.0, 0.0, 0.0, 0.0;
.align 4;
// Filtered samples y(0)..y(N-1), written one word at a time through j2.
.var output[N];
// Coefficients are placed in a different section (data2) than the data buffers.
.section data2;
.align 4;
// Storage order of the coefficients (read as two quad words through k0):
// a2, a4, b2, b4, a1, a3, b1, b3
.var coeffs[2*SECTIONS]= 0.1412, 0.6272, 4., 4., -0.0255, -0.6108, 1., 6.;
/************************************************************************/
#ifdef __ADSPTS201__
#include "defts201.h"
#endif
#include "cache_macros.h"
/************************************************************************/
/* Main Program Section */
.section program;
/************************************** Power up code *****************************************/
// Program entry. On TS201 builds (__ADSPTS201__ defined) the code below enables
// the cache and then touches the inputs and coeffs buffers (read a quad word,
// write the same quad word back) before the timed part of the program starts.
// On other builds nothing is emitted between powerup and end_powerup.
main:
powerup:
#ifdef __ADSPTS201__
/*in the case of TS201, at the beginning of the program the
cache must be enabled. The procedure is contained in the
cache_enable macro that uses the refresh rate as input parameter
-if CCLK=500MHz, refresh_rate=750
-if CCLK=400MHz, refresh_rate=600
-if CCLK=300MHz, refresh_rate=450
-if CCLK=250MHz, refresh_rate=375
*/
cache_enable(750);
// j0 walks the inputs buffer, 4 words per loop pass; LC0 = N/4 passes.
// NOTE(review): with N=30, N/4 presumably truncates to 7, so only 28 of the
// 30 input words are touched by this loop - confirm that is sufficient.
j0 = j31 + inputs; LC0 = N/4;;
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
nop;nop;;
.align_code 4;
ini_cache1:
// read one quad word without moving the pointer...
xr3:0 = q[j0+=0];;
.align_code 4;
// ...then write it back unchanged and advance j0 by 4 words
if NLC0E, jump ini_cache1; q[j0+=4] = xr3:0;;
// same read/write-back pass over the coeffs buffer: 2 quad words = 2*SECTIONS words
j0 = j31 + coeffs; LC0 = 2;;
//due to a TS201 rev0 anomaly, the initialization of LC0 must be at least
//4 instruction lines before the end of the loop (jump instruction included)
//and the loop must be at least 2 cycles long
nop;nop;;
.align_code 4;
ini_cache:
xr3:0 = q[j0+=0];;
.align_code 4;
if NLC0E, jump ini_cache; q[j0+=4] = xr3:0;;
#endif
end_powerup:
// Block IIR filter: filters the N samples of inputs into output and saves the
// final filter state into delayline so that a following block can continue.
//
// Register roles (both compute blocks keep a copy of the delay line):
//   r3 = w(n-1)  r2 = w(n-3)  r1 = w(n-2)  r0 = w(n-4)  r18 = scratch for the rotation
//   yr7:4 = feedback coefficients a4,a2,a3,a1   xr7:4 = forward coefficients b4,b2,b3,b1
//   yr8 = x(n)   yr30 = scale   yr9 = x(n)*scale   xr9 = 0
//   yr3 = w(n) after the final add    xr17 = y(n)
//   j1 -> inputs   j2 -> output   j3 -> delayline   k0 -> coeffs
// The code is software pipelined: the prologue starts sample 0, each loop pass
// finishes one sample while starting the next, and the epilogue finishes the
// last sample. The loop therefore runs N-1 times.
//
//the delay line is loaded into the cache
xr3:0 = q[j31 + delayline];;
q[j31 + delayline] = xr3:0;;
//j1 is used to fetch the inputs.
j1 = j31 + inputs;;
/************************************** Start of code *****************************************/
//read cycle counter
ini_cycle_count;
//j2 is used to save the outputs
// xr9 is cleared so that the same add (r9 + r10) can be issued in both compute
// blocks: CBY adds x(n)*scale, CBX adds zero.
j2 = j31 + output; xr9 = 0x0;;
j3 = j31 + delayline;;
//yr8=x(n)
yr8 = [j1+=1]; yr30 = scale;;
k0 = k31 + coeffs; LC0 = N-1;;
//r3:2=w(n-1) w(n-3) yr7:6=a4,a2 xr7:6=b4,b2
r3:2 = l[j3+=2]; r7:6 = q[k0+=4];;
//r1:0=w(n-2) w(n-4) yr5:4=a3,a1 xr5:4=b3,b1
r1:0 = l[j3+=2]; r5:4 = q[k0+=4];;
//yr9=x(n)*scale MY
//xr9 = 0x0
yfr9 = r8 * r30;;
//yr10=w(n-4)*a1 M0
//xr10=w(n-4)*b1
fr10 = r0 * r4;;
//yr11=w(n-3)*a2 M1
//xr11=w(n-3)*b2
fr11 = r2 * r6;;
//yr12=w(n-2)*a3 M2
//xr12=w(n-2)*b3
fr12 = r1 * r5; fr14 = r9 + r10;;
//yr13=w(n-1)*a4 M3
//xr13=w(n-1)*b4
fr13 = r3 * r7; ;
// prepare the delay line for the next input:
// r1:0 takes the old r3:2 (w(n-1), w(n-3)) and r18 keeps the old r1 (w(n-2)),
// which becomes r2 at the top of the loop. The next input sample is fetched.
fr15 = r11 + r12; r1:0 = r3:2; r18 = lshift r1 by 0; yr8 = [j1+=1];;
fr14 = r14 + r13;;
.align_code 4;
iir_loop:
// finish the rotation (r2 = old r1) and scale the next input
r2 = r18; yfr9 = r8 * r30;;//MY
//yr3=w(n)
//xr3=sum of the forward (b) products for the same sample
fr3 = r14 + r15; fr10 = r0 * r4;; //M0
#ifdef __ADSPTS201__
//On TS201, a register can be source and result in the same instruction.
//On TS101 this is not possible, so 2 cycles are lost to perform the same task.
//Because of this, one stall appears, so the count cycle increases on TS101
// xr16 keeps the forward sum, xr3 receives w(n) from CBY
xr3 = yr3; xr16 = xr3; fr11 = r2 * r6;;//M1
// M2
fr12 = r1 * r5; fr14 = r9 + r10;;
#else
xr16 = xr3; fr11 = r2 * r6;;//M1
// M2
xr3 = yr3; fr12 = r1 * r5; fr14 = r9 + r10;;
#endif
//xr17=y(n) M3
xfr17 = r3 + r16;fr13 = r3 * r7;;
// rotate the delay line and fetch the next input (same step as in the prologue)
fr15 = r11 + r12; r1:0 = r3:2; r18 = lshift r1 by 0;yr8 = [j1+=1];;
.align_code 4;
// store y(n) and close the loop
if NLC0E, jump iir_loop; [j2+=1] = xr17; fr14 = r14 + r13;;
// Epilogue: finish the last sample. yr3 = w(N-1), xr3 = its forward sum.
fr3 = r14 + r15;;
#ifdef __ADSPTS201__
//On TS201, a register can be source and result in the same instruction.
//On TS101 this is not possible, so 2 cycles are lost to perform the same task.
xr3 = yr3; xr16 = xr3;;
#else
xr16 = xr3;;
xr3 = yr3;;
#endif
// Save the delay line in the same order in which it is loaded at the start of
// a block: w(n-3), w(n-1), w(n-4), w(n-2), where n is the first sample of the
// NEXT block.
// At this point: r3 = w(N-1), r1 = w(N-2), r18 = w(N-3), r0 = w(N-4).
// Second long (delayline+2) = w(n-4), w(n-2): already sitting in r1:0 after the
// last rotation, so it is stored directly. (Storing r3:2 here would write
// w(N-1) a second time and lose w(N-3).)
xfr17 = r3 + r16; l[j31 + delayline+2] = yr1:0;;
// store the last output; r18 still holds w(N-3) from the last rotation, move
// it into r2 so that r3:2 = w(N-1), w(N-3)
[j2+=1] = xr17; r2 = lshift r18 by 0;;
// First long (delayline) = w(n-3), w(n-1)
l[j31 + delayline] = yr3:2;;
/******************************************* Done ***********************************************/
//read cycle counter and compute the program's cycle count
comp_cycle_count;
main.end:
___lib_prog_term:
nop;nop;nop;nop;;
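/* Residue of the web code viewer (not part of the program):
   Keyboard shortcuts
   Copy code:            Ctrl + C
   Search code:          Ctrl + F
   Full-screen mode:     F11
   Toggle theme:         Ctrl + Shift + D
   Show shortcuts:       ?
   Increase font size:   Ctrl + =
   Decrease font size:   Ctrl + -
*/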