// File: bkfir2_flp32.asm  (page-header line left over from the web code viewer, kept as a comment)
/* FIR.asm
32 bit floating point FIR filter.
rev1.0 PM, 9/2003
OVERVIEW:
=========
-this program implements a floating point block FIR
-it can be compiled and run on TS101 and TS201.
-TS101 has only 2 memory blocks (section 1 and 2) where data buffers may be placed.
-TS201 has 5 memory blocks (section 1, 2, 3, 4, 5) where data buffers may be placed.
-In this program only 2 memory blocks have been used to maintain compatibility between
TS101 and TS201
-2 tcl files are provided, one for TS101 and one for TS201. Both of them build
the correspondent project and save the output buffer into a file output.dat
-.align_code 4 directives have been placed throughout the main part of the program
to improve the cycle count on TS201 (on this processor they may even be removed
if the cycle count is not of interest). For TS101, the assembler option -align-branch-lines
set in the project properties tab has the same effect (on TS101 this alignment is
mandatory for IF instructions).
This program achieves high efficiency and small filter granularity
by exploiting a high level of data re-use.
Efficiency 95%
Size 80 words
M0: 75%, M1: 0%
Filter granularity: 4
Test for a 72-tap filter. In general, this code computes filters with
length a multiple of 4. Filters with different L's require zero padding.
The buffer of coefficients is accessed with a circular buffer pointer
to save one instruction (hence one cycle) for pointer update.
The outer loop is unrolled 4 times, with 4 outputs computed simultaneously
in CBX and CBY. The misalignment of the data with respect to the coefficients
is achieved by issuing two different multiplications per line.
With this high data re-use (each data point is used in 4 multiplications),
one can either (a) decrease memory bus utilization, or (b) improve filter
granularity. This program improves filter granularity, and accesses data
mostly in single word loads. (There is an alternate version of this program
that decreases memory utilization to 25% using quad data loads, but has a filter
granularity of 8.)
Restrictions: the length of data and coefficients must be a multiple of 4.
I/O FILES USED:
===============
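The data is stored in the following buffers (in this version they are declared
in this file, in sections data1 and data2; parts of this header still call them
buff1 and coeff_buff/coeff_buffer):
inputs (buff1) -- N samples of input data preceded by the zero padding used for
filter initialisation (declared as N+L-1 words, i.e. L-1 leading zeros), read from input.dat
coeffs (coeff_buff) -- L coefficients, read from coefficients.dat
output -- N samples of output data computed by this program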
DESCRIPTION:
============
This filter is based on computing outputs yi and yi+1 on CBY, and outputs
yi+2 and yi+3 on CBX.
The trick used here to achieve the off-by-one asymmetry between the compute blocks is to
issue different multiplications to each one. So for each filter coefficient, one
issues 4 multiplications, where the 4 data points are rotated from one filter
coefficient to the next. Hence, for each set of 4 multiplications, one only needs to
load a single data point.
All data loads are broadcast, thereby creating the illusion that both comp blocks
have read access to the "same" register file. This way, one can simplify the movement
of data between the blocks. For instance, by loading x0 into both xR0 and yR0, then
any comp block can read x0.
Computations are mapped according to:
CBX CBY
------ ------
r7: yi+3 yi+1
r6: yi+2 yi
where r7 and r6 are the accumulation registers that hold the respective output.
This diagram shows the organization for the first few input points:
CBX high c0 c1 c2 c3 c4 c5 ---> yi+3
CBX low c0 c1 c2 c3 c4 c5 ---> yi+2
CBY high c0 c1 c2 c3 c4 c5 ---> yi+1
CBY low c0 c1 c2 c3 c4 c5 ---> yi
input x0 x1 x2 x3 x4 x5
All the 4 computations associated with coefficient c0 are computed in 2 cycles, and
accumulated on xR7:6 and yR7:6. After this, all 4 computations of c1 are done in the next
2 cycles, and so on.
PERFORMANCE:
============
EFFICIENCY:
----------
An N=200 sample input with L=72 takes 7461 cycles on TS201 (7414 on TS101),
for an efficiency of 97%.
Only one stall is present in the LC1 loop. All the other instructions contain
2 multiplications, one in Y and the other in X blocks.
The expression is 4+N/4*[(L/4-1)*8+9]=NL/2+4+N/4=7254
MEMORY MAP:
===========
(buff1 and coeff_buffer are the buffers declared below as inputs and coeffs)

Data buffer with                        Coeff circular buffer
zero pad for filter init:               (negative increments):

|----------------|                      |---------|
| 0   0   0   0  | buff1 <-- j0         | c1  c0  | coeff_buffer
|      ...       |                      | ..  c2  |
| 0   0   0   0  | buff1+68             |   ...   |
|----------------|                      |         |
| x3  x2  x1  x0 | buff1+72             |   ...   |
| ..          x4 |                      | c71 c70 | coeff_buffer+70 <-- k1
|      ...       |                      |---------|
|x199 ...   x196 |
|----------------|
|                |
- at the end of the program, the cycle_count variable contains the
cycle count of the main program
*************************************************************************/
// Build-time filter parameters. The kernel below divides both N and L by 4
// (LC1 = N/4, LC0 = L/4-1), so both are expected to be multiples of 4.
#define N 200 // number of data points in input
#define L 72 // number of filter coefficients
#define N_MAX 300 // maximum buffer size (not referenced anywhere in this listing)
.section data1;
// Input data buffer, declared with N+L-1 words:
// L-1 leading zeros, and N data points (contents come from input.dat).
.align 4;
.var inputs[N+L-1] = "input.dat";
// NOTE(review): the cache preload loop walks (L+N)/4 quad words starting at
// inputs, i.e. one word more than inputs[] holds; temp appears to be declared
// right after inputs to absorb that extra word -- keep the two declarations
// adjacent and in this order.
.var temp;//this is a scratch location used in the cache loading process of the inputs buffer
.var cycle_count; // execution cycle counts
.section data2;
// Coefficients buffer with L entries (contents come from coefficients.dat).
// The kernel addresses it as a circular buffer (kb1 = coeffs, kl1 = L).
.align 4;
.var coeffs[L] = "coefficients.dat";
// Written by the kernel with quad-word stores (q[j2+=4]), four results at a time.
.var output[N]; // buffer used for storing calculated outputs
/************************************************************************/
#ifdef __ADSPTS201__
#include <defts201.h>
#endif
#include "cache_macros.h"
/* Main Program Section */
.section program;
/************************************** Power up code *****************************************/
// powerup: TS201-only initialisation. The whole body is compiled out unless
// __ADSPTS201__ is defined, in which case it enables the cache and then
// touches both data buffers once so that the timed run starts with them cached.
powerup:
#ifdef __ADSPTS201__
/*in the case of TS201, at the beginning of the program the
cache must be enabled. The procedure is contained in the
cache_enable macro that uses the refresh rate as input parameter
-if CCLK=500MHz, refresh_rate=750
-if CCLK=400MHz, refresh_rate=600
-if CCLK=300MHz, refresh_rate=450
-if CCLK=250MHz, refresh_rate=375
*/
// cache_enable is not defined in this file (presumably it comes from
// cache_macros.h); 750 is the value listed above for CCLK=500MHz.
cache_enable(750);
//-------------Optional examples to preload cache-----------------------------
// Pass 1: walk the inputs buffer one quad word at a time, (L+N)/4 times.
// Each iteration reads a quad at j0 (no pointer update) and writes the same
// four words back while advancing j0 by 4, so the buffer contents are unchanged.
j0 = j31 + inputs; LC0 = (L+N)/4;;
nop;nop;;
.align_code 4;
ini_cache1:
xr3:0 = q[j0+=0];;
.align_code 4;
if NLC0E, jump ini_cache1; q[j0+=4] = xr3:0;;
// Pass 2: same read/write-back walk over the coefficients buffer, L/4 quads.
j0 = j31 + coeffs; LC0 = L/4;;
nop;nop;;
.align_code 4;
ini_cache:
xr3:0 = q[j0+=0];;
.align_code 4;
if NLC0E, jump ini_cache; q[j0+=4] = xr3:0;;
//----------------------------------------------------------------------------
#endif
end_powerup:
// The whole filter is executed twice (j24 counts the passes down from 2) in
// order to eliminate the effects of starting with the BTB unloaded. The second
// pass gives the right value for the cycle count.
j24 = j31 + 2;;
.align_code 4;
count_loop:
// Start of one timed pass: read the cycle counter.
// ini_cycle_count is a macro that is not defined in this file
// (presumably provided by cache_macros.h -- confirm).
ini_cycle_count;
/************************************** Start of code *****************************************/
// main: block FIR kernel. Produces N outputs, four per pass of the outer
// (LC1) loop, using L coefficients read backwards through a circular buffer.
//
// Register roles (same register numbers in both compute blocks):
//   r3:0   - rotating window of four input samples (loads carry no x/y prefix)
//   r31:28 - coefficients, fetched two at a time
//   r5:4   - products waiting to be accumulated
//   r7:6   - running sums: xr6 = yi+2, xr7 = yi+3, yr6 = yi, yr7 = yi+1
//   r9:8   - finished sums, copied out of r7:6 so the next group can start
// Pointers / counters:
//   j0 - input data, j2 - output, k1 - coefficients (circular, step -2)
//   LC1 = N/4 groups of four outputs, LC0 = L/4-1 passes of the inner loop
main :
//j0 points to data
j0 = j31 + inputs; LC1 = N/4;;
//k1 points in a circular fashion to the coefficients:
//it starts on the last coefficient pair (coeffs+L-2) and steps backwards,
//with kb1/kl1 giving the base and length of the circular buffer
k1 = k31 + coeffs + (L-2); LC0 = L/4-1;;
kb1 = k31 + coeffs;;
kl1 = k31 + L;;
//j2 points to the output buffer
j2 = j31 + output;;
//in the following, r31:28 contain the coefficients, r3:0 the input data
// Multiplications are indented according to the output
// that they belong to:
//
// CBX: CBY:
// yi+2 yi
// yi+3 yi+1
//
// Prologue for the very first group: fetch the first coefficient pair and the
// first four samples, then start r7:6 with plain products (no add), which is
// what initialises the accumulators. The multiplies that share a line with a
// load of one of their operands are written to use the value the register
// held before that load (the data-rotation scheme described in the header).
r31:30=cb l[k1+=-2]; r3:0=q[j0+=4];;
r0=[j0+=1]; xfr6=r2*r31; yfr6=r0*r31;;
r29:28=cb l[k1+=-2]; xfr7=r3*r31; yfr7=r1*r31;;
r1=[j0+=1]; xfr4=r3*r30; yfr4=r1*r30;;
xfr5=r0*r30; yfr5=r2*r30;;
.align_code 4;
// Inner loop: one pass consumes four coefficients (r29, r28, r31, r30) in
// eight lines and loads four new samples, one per coefficient. Every line
// issues one multiply per compute block plus the accumulation of the product
// produced two lines earlier.
loop:
r2=[j0+=1]; xfr4=r0*r29; yfr4=r2*r29; fr6=r6+r4;;
r31:30=cb l[k1+=-2]; xfr5=r1*r29; yfr5=r3*r29; fr7=r7+r5;;
r3=[j0+=1]; xfr4=r1*r28; yfr4=r3*r28; fr6=r6+r4;;
xfr5=r2*r28; yfr5=r0*r28; fr7=r7+r5;;
r0=[j0+=1]; xfr4=r2*r31; yfr4=r0*r31; fr6=r6+r4;;
r29:28=cb l[k1+=-2];xfr5=r3*r31; yfr5=r1*r31; fr7=r7+r5;;
r1=[j0+=1]; xfr4=r3*r30; yfr4=r1*r30; fr6=r6+r4;;
.align_code 4;
if NLC0E, jump loop;
xfr5=r0*r30; yfr5=r2*r30; fr7=r7+r5;;
// Epilogue of the current group (last two coefficients, r29 and r28).
// The load below is the last sample of this group; its post-modify of -(L-2)
// rewinds j0 so that it ends up 4 words past where this group started,
// i.e. at the first sample of the next group of four outputs.
r2=[j0+=-(L-2)]; xfr4=r0*r29; yfr4=r2*r29; fr6=r6+r4;;
//here the data for the next 4 filters is fetched
// Compared with the inner loop, the two CBY products for yi+1 (r0*r28 and
// r3*r29) are issued in the opposite order here: r0*r28 has to be issued
// before the r1:0 load on the following line replaces r0.
r31:30=cb l[k1+=-2]; xfr5=r1*r29; yfr5=r0*r28; fr7=r7+r5;;
r1:0=l[j0+=2]; xfr4=r1*r28; yfr4=r3*r28; fr6=r6+r4;;
r3:2=l[j0+=2]; xfr5=r2*r28; yfr5=r3*r29; fr7=r7+r5;;
//here the data for the next 4 filters begins to be elaborated
//stall
// The final accumulations of the finished group go to r9:8 while r7:6 are
// restarted with the first products of the next group (same pattern as the
// prologue above). LC0 is reloaded for the next run of the inner loop.
r0=[j0+=1]; xfr6=r2*r31; yfr6=r0*r31; fr8=r6+r4;;
r29:28=cb l[k1+=-2]; xfr7=r3*r31; yfr7=r1*r31; fr9=r7+r5;;
r1=[j0+=1];LC0 = L/4-1; xfr4=r3*r30; yfr4=r1*r30;;
//store now the outputs
// One quad-word store per group writes the four finished results and advances
// j2 by 4; the branch re-enters at loop (not at main), so the prologue above
// runs only once.
// NOTE(review): r9:8 carries no x/y prefix here -- confirm that the assembler
// resolves it so that the four words land in the order yi, yi+1, yi+2, yi+3
// (yr8, yr9, xr8, xr9).
.align_code 4;
if NLC1E, jump loop; xfr5=r0*r30; yfr5=r2*r30; q[j2+=4] = r9:8;;
/******************************************* Done ***********************************************/
// End of one timed pass: read the cycle counter again and compute the
// program's cycle count. comp_cycle_count is a macro that is not defined in
// this file (presumably from cache_macros.h); per the file header the result
// ends up in the cycle_count variable.
comp_cycle_count;
// Repeat the whole measurement until the pass counter j24 (initialised to 2
// before count_loop) reaches zero.
j24 = j24 - 1;;
.align_code 4;
if NJEQ, jump count_loop;;
main.end:
// Program end. ___lib_prog_term is presumably the termination label expected
// by the tools/run scripts -- confirm before renaming or removing it.
___lib_prog_term:
nop;nop;nop;nop;;
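// ---- Residue from the web code viewer this listing was copied from (not part of the program) ----
// Keyboard shortcuts:
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -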