📄 fir_decima_gen.asm
字号:
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : fir_decima_gen.asm
Label name : __fir_decima_gen
Version : 1.3
Change History :
Version Date Author Comments
1.3 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.2 11/13/2002 Swarnalatha Tested with VDSP++ 3.0
on ADSP-21535 Rev. 0.2
1.1 03/27/2002 Nishanth Modified to match
silicon cycle count
1.0 06/02/2001 Nishanth Original
Description : This function implements an FIR-based decimation filter. The
function produces the filtered, decimated output for a given
input data. The characteristics of the filter are dependent on
the coefficient values, the number of taps (L) and the decimation
index (M) supplied by the calling program.
The coefficients stored in vector `h` are applied to the
elements of vector `x[]`. For filtering, a 40-bit accumulator is
used. The most significant 16 bits of the result are stored in
the output vector `y[]`, computed according to the decimation
index `M`.
Coefficients are stored in normal order not as polyphases.
The implementation of a zero phase decimator is demonstrated
in the program.
The implementation provided below stops using the delay line as
soon as an output no longer requires samples older than x(0);
from then on the samples are read directly from the input buffer.
This has been done to avoid overhead due to unnecessary
duplication of input data.
The equation for decimation by M can be expressed as:
y(n) = h(0) * x(n*M) + h(1) * x(n*M-1) + ...... + h(L-1)
* x(n*M+1-L)
This implementation is divided into two filtering stages, followed by a
delay line update.
a) In the first stage, it finds the output samples which
require delay line, i.e. for the first L/M output samples
y(0) = h(0) * x(0) + h(1) * x(-1) + ... + h(L-1) * x(-L+1)
y(1) = h(0) * x(M) + h(1) * x(M-1) + ...+ h(L-1) * x(M-L+1)
...
y(f) = h(0) * x(f*M) + h(1) * x(f*M-1) + .. h(L-1)* x(f*M-L+1)
,where f = L/M - 1.
This stage has been separated out due to the use of delay
line. There are two inner loops.
One finds sum of terms containing inputs present in delay line
and the other, ones in input buffer.
b) In the second stage, all the remaining output samples are
calculated, i.e. y(L/M) to y(Nout - 1) are computed in stage 2.
c) After filtering the input, the delay line is updated by the
last L-1 input samples.
Assumptions : 1. L > M is assumed. If L <= M, (L/M)-1 = 0, i.e.,
Stage 1 need not be done, but the loop using LC0 always executes
the loop body at least once.
2. The number of input samples is assumed to be an integral
multiple of the decimation factor.
This is required for correct updating of the delay line.
3. The number of coefficients (L) is assumed to be a multiple of
the decimation factor (M).
4. The number of coefficients (L) must be at least 3.
Prototype : void fir_decima_gen(const fract16 x[], fract16 y[], int Nout,
fract16 h[], int L, int M, int LBYM, fract16 d[]);
x[] - input array
y[] - output array
Nout - Number of output samples
h[] - Filter coefficient array
L - No. of coefficients
M - Decimation Factor
LBYM - Number of coefficients in each polyphase(L/M)
d[] - Delay line buffer
Registers used : A1, R0-R3, R7, I0-I3, B1-B3, M0, M2, L0-L3, P0-P2, P5, LC0,
LC1.
Performance :
Code size : 244 Bytes
Cycle count : 2417 Cycles (For Ni = 256, L=16, M = 2)
*******************************************************************************/
.section L1_code;
.global __fir_decima_gen;
.align 8;

/*
 * __fir_decima_gen - FIR decimation filter on 16-bit samples, accumulating
 * in A1 and keeping the high half of each result (R7.H).
 *
 * Arguments as read by the code:
 *   R0            x    - address of input buffer
 *   R1            y    - address of output buffer
 *   R2            No   - number of output samples
 *   [SP+20]       h    - address of filter coefficients  } offsets are
 *   [SP+24]       L    - number of coefficients          } relative to SP
 *   [SP+28]       M    - decimation factor               } after the 8-byte
 *   [SP+32]       L/M  - stage 1 output count            } push of R7 and P5
 *   [SP+36]       d    - address of delay line buffer    }
 *
 * Register roles:
 *   I0 (L0 = 0)   input pointer, linear addressing
 *   I1/B1/L1      delay line, circular, length 2*L + 4 bytes
 *   I2/B2/L2      coefficients, circular, length 2*L bytes, read backwards
 *   I3/B3/L3      output, circular, length 2*No bytes
 *   M0            stage 1: input rewind / delay line advance (4 + 2*k*M)
 *                 stage 2: input rewind (2*L - 2*M)
 *   M2            2*L, used when positioning I0 for the delay line update
 *   P0            L/M (loaded into LC0), then M
 *   P1            stage 1 delay line term count: L-1, decreasing by M
 *   P5            stage 1 input term count: 1, increasing by M;
 *                 later the stage 2 inner count and the update count
 *   P2            stage 2 output count: No - L/M
 *   R7.H          most recent result; it is stored at the start of the
 *                 next output's computation (software pipelined)
 *
 * Changes relative to the original routine:
 *   1. After stage 1, P5 holds 1 + (L/M)*M = L + 1. The original subtracted
 *      M (P0), which gives the intended L - 1 only when M = 2. It now
 *      subtracts the constant 2 so that stage 2 performs L multiply-
 *      accumulates per output and the delay line update copies L - 1
 *      samples for every M.
 *   2. L1, L2 and L3 are cleared before returning so that circular
 *      addressing is not left enabled for the caller.
 */
__fir_decima_gen:
    [--SP]=(R7:7,P5:5);     // Push R7 and P5
    P5 = [SP+20];           // Address of filter coefficients (h)
    P1 = [SP+24];           // Number of coefficients (L)
    R3 = [SP+28];           // Decimation factor (M)
    P0 = [SP+32];           // L/M (stage 1 counter)
    I0 = R0;                // Address of input buffer
    L0 = 0;                 // Circular buffering of input buffer is disabled
    I2 = P5;                // Address of coefficients
    B2 = P5;                // Coefficient array is a circular buffer
    I3 = R1;                // Address of output buffer
    B3 = R1;                // Output buffer is a circular buffer
    P2 = R2;                // Number of output samples
    R2 <<= 1;               // R2 = 2*No
    L3 = R2;                // Length of output buffer = 2*No bytes
    R2 = P1;                // R2 = L
    R2 = R2 + R2(S) || R0 = [SP+36];
                            // R2 = 2*L and R0 = address of delay line buffer
    L2 = R2;                // Length of coefficient array = 2*L bytes
    M2 = R2;                // Modifier M2 = 2*L
    I1 = R0;                // Address of delay line
    B1 = R0;                // Delay line buffer is a circular buffer
    R2 += 4;                // R2 = 2*L + 4
    L1 = R2;                // Length of delay line buffer is made 2*L + 4
                            // though there are only L-1 elements, so that in
                            // stage 1 the input modifier M0 can also be used
                            // to position the delay line pointer.
    P2 -= P0;               // Stage 2 counter (No - L/M)
    P1 += -1;               // Stage 1a counter (delay line terms) = L-1
    P5 = 1;                 // Stage 1b counter (input buffer terms) = 1
    R2 = 4;                 // R2 = 4 (running copy of modifier M0)
    M0 = R2;                // Modifier M0 is initialized to 4
    R0.L = W[I2--] || I3 -= 2;
                            // Step the coefficient pointer back (wraps to the
                            // last coefficient) and the output pointer back
                            // (wraps to the last output slot). The value
                            // loaded into R0.L is overwritten before use.
    LSETUP (FIR_DEC_GEN_STG1_ST,FIR_DEC_GEN_STG1_END) LC0 = P0;
                            // Stage 1 counter = L/M
    P0 = R3;                // P0 = M
    R3 = R3 + R3(S) || R1.L = W[I2--] || I1 -= M0;
                            // R3 = 2*M (bytes, as data is 16-bit).
                            // Fetch the last coefficient into R1.L.
                            // Pre-decrement the delay line pointer by M0 so
                            // the I1 += M0 at the top of the loop lands on
                            // the start of the delay line.

// ---------------------------------------------------------------- Stage 1
// One pass per output that still needs samples from the delay line.
FIR_DEC_GEN_STG1_ST:
    A1=0 || I1 += M0 || R0.L = W[I0++];
                            // Clear accumulator, position the delay line
                            // pointer, fetch first input sample into R0.L
    R0.H = W[I1++] || W[I3++] = R7.H;
                            // Fetch from delay line into R0.H and store the
                            // previous result. On the first pass R7.H has
                            // not been computed yet; because I3 was stepped
                            // back, that store goes to the last output slot,
                            // which the final store after stage 2 rewrites.
    LSETUP(FIR_DEC_GEN_STG1A,FIR_DEC_GEN_STG1A) LC1 = P1;
                            // Loop over terms whose samples are in the
                            // delay line
    P1 -= P0;               // Next output needs M fewer delay line terms
FIR_DEC_GEN_STG1A:
    A1+=R0.H*R1.L || R0.H = W[I1++] || R1.L = W[I2--];
                            // Accumulate delay line sample * coefficient,
                            // fetch the next of each
    LSETUP(FIR_DEC_GEN_STG1B,FIR_DEC_GEN_STG1B) LC1 = P5;
                            // Loop over terms whose samples are in the
                            // input buffer
FIR_DEC_GEN_STG1B:
    R7.H=(A1+=R0.L*R1.L) || R0.L = W[I0++] || R1.L = W[I2--];
                            // Accumulate input sample * coefficient and
                            // extract the result into R7.H; fetch the next
                            // input sample and coefficient
    R2 = R2 + R3(S) || I0 -= M0;
                            // Add 2*M to the copy of the modifier and rewind
                            // the input pointer with the current M0
    M0 = R2;                // Modifier for the next pass
FIR_DEC_GEN_STG1_END:
    P5 = P5 + P0;           // Next output needs M more input buffer terms
// ---------------------------------------------------------- End of stage 1

    P5 += -2;               // P5 is L+1 here (1 + (L/M)*M); make it L-1.
                            // The original used P5 -= P0, which equals L-1
                            // only for M = 2.
                            // NOTE(review): this is the same instruction
                            // form as the P5 += -2 a few lines below and is
                            // expected to occupy the same space as the
                            // instruction it replaces, so the stage 2 loop
                            // alignment should be unchanged - confirm in the
                            // assembler listing.
    R0.L = W[I0++];         // Not combined with the next instruction, to
                            // make sure that the 8-byte instructions in the
                            // stage 2 loop are aligned
    R2 = R2 - R3(S);        // R2 = 2*L - 2*M + 4
    R2 += -4;               // R2 = 2*L - 2*M
    M0 = R2;                // Modifier M0 = 2*L - 2*M
    P5 += -2;               // Inner loop count = L-3; the other three
                            // multiply-accumulates of each output are
                            // unrolled around the loop
    LSETUP (FIR_DEC_GEN_STG2_ST,FIR_DEC_GEN_STG2_END) LC0 = P2;
                            // Loop for No - L/M outputs

// ---------------------------------------------------------------- Stage 2
// Remaining outputs; all samples come from the input buffer.
FIR_DEC_GEN_STG2_ST:
    A1=0 || R0.L = W[I0++] || W[I3++] = R7.H;
                            // Clear accumulator, fetch input into R0.L and
                            // store the previous result held in R7.H
    A1+=R0.L*R1.L || R0.L = W[I0++] || R1.L = W[I2--];
    A1+=R0.L*R1.L || R0.L = W[I0++] || R1.L = W[I2--];
                            // First two terms, unrolled
    LSETUP (FIR_DEC_GEN_STG2A,FIR_DEC_GEN_STG2A) LC1 = P5;
FIR_DEC_GEN_STG2A:
    A1+=R0.L*R1.L || R0.L = W[I0++] || R1.L = W[I2--];
                            // Accumulate input sample * coefficient, fetch
                            // the next of each
FIR_DEC_GEN_STG2_END:
    R7.H=(A1+=R0.L*R1.L) || I0 -= M0 || R1.L = W[I2--];
                            // Last term is unrolled: extract the result into
                            // R7.H, rewind I0 by 2*L - 2*M, fetch the first
                            // coefficient for the next output
// ---------------------------------------------------------- End of stage 2

    P5 += 2;                // Delay line update count = L-1
    I0 -= 4 || R0.L=W[I1--];
                            // Adjust input pointer; step delay line pointer
                            // back (the loaded value is not used)
    I0 += M2 || R0.L = W[I1--];
                            // Advance input pointer by 2*L; step the delay
                            // line pointer back again
    W[I3++] = R7.H || R0.L = W[I0--];
                            // Store the final output sample and fetch the
                            // last input sample
    LSETUP( FIR_DEC_GEN_DELUPDATE,FIR_DEC_GEN_DELUPDATE) LC0 = P5;
FIR_DEC_GEN_DELUPDATE:
    R0.L = W[I0--] || W[I1--] = R0.L;
                            // Update delay line buffer with the last input
                            // samples, walking both buffers backwards

    L1 = 0;                 // Disable circular addressing again so that the
    L2 = 0;                 // caller does not inherit non-zero length
    L3 = 0;                 // registers (L0 was already cleared on entry)
    (R7:7,P5:5)=[SP++];     // Pop R7 and P5
    RTS;
    NOP;                    // To avoid one stall if LINK or UNLINK happens
                            // to be the next instruction after RTS in memory
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -