📄 example 5-10.sa
字号:
; Example 5 - 10. Real-Valued FIR Filter SA Listing for the TMS320C64x DSP
* ========================================================================= *
* *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* DSP_fir_gen: FIR Filter (general purpose) *
* *
* REVISION DATE *
* 22-Mar-2000 *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void DSP_fir_gen *
* ( *
* const short *restrict x, /* Input ('nr + nh - 1' samples) */ *
* const short *restrict h, /* Filter coefficients (nh taps) */ *
* short *restrict r, /* Output array ('nr' samples) */ *
* int nh, /* Length of filter (nh >= 5) */ *
* int nr /* Length of output (nr >= 1) */ *
* ); *
* *
* C CODE *
* *
* This is the C equivalent of the assembly code. Note that the *
* assembly code is hand optimized and restrictions may apply. *
* *
* void DSP_fir_gen *
* ( *
* const short *restrict x, /* Input ('nr + nh - 1' samples) */ *
* const short *restrict h, /* Filter coefficients (nh taps) */ *
* short *restrict r, /* Output array ('nr' samples) */ *
* int nh, /* Length of filter (nh >= 5) */ *
* int nr /* Length of output (nr >= 1) */ *
* ) *
* { *
* int i, j, sum; *
* *
* for (j = 0; j < nr; j++) *
* { *
* sum = 0; *
* for (i = 0; i < nh; i++) *
* sum += x[i + j] * h[i]; *
* *
* r[j] = sum >> 15; *
* } *
* } *
* *
* DESCRIPTION *
* Computes a real FIR filter (direct-form) using coefficients *
* stored in vector h. The real data input is stored in vector x. *
* The filter output result is stored in vector r. This FIR *
* assumes the number of filter coefficients is greater than or *
* equal to 5. It operates on 16-bit data with a 32-bit *
* accumulate. This routine has no memory hits regardless of where *
* x, h, and r arrays are located in memory. The filter is nr *
* output samples and nh coefficients. The assembly routine *
* performs 4 output samples at a time. *
* *
* NOTES *
* There are 2 versions of code in this file. The one with separate *
* loops for the filter taps and output samples performs well, if the *
* ratio of filter taps to output samples is comparable. If the *
* number of output samples to be computed is about four times the *
* number of filter taps, then the version with the collapsed *
* loop performs better. *
* *
* TECHNIQUES *
* 1. Load double word instruction is used to simultaneously load *
* four values in a single clock cycle. *
* *
* 2. The inner loop is unrolled four times and will always *
* compute a multiple of 4 of nh and nr. If nh % 4 != 0, the *
* code will fill in 0s to make nh a multiple of 4. If nr % 4 *
* != 0, the code will still perform a multiple of 4 outputs. *
* *
* 3. Both the inner and outer loops are software pipelined. *
* *
* 4. This code yields best performance when ratio of outer *
* loop to inner loop is less than or equal to 4. *
* *
* ASSUMPTIONS *
* 1. Little Endian is assumed for LDNDW. *
* 2. nh >= 5. *
* 3. nr multiple of 4. *
* 4. Output array r[] must be word-aligned *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
.text
.sect ".text:_fir_gen"
.global _DSP_fir_gen
_DSP_fir_gen: .cproc A_xptr, B_hptr, A_rptr, B_nh, A_nr
.no_mdep
.reg B_m, B_3, B_2 ;
.reg B_1, B_ofs, B_h32:B_h10 ;
.reg B_h32_n, B_h10_n, A_nr_l ;
.reg B_ij, B_nh_l, A_nh_l ;
.reg A_i, B_ic, A_il ;
.reg A_hptr, B_rptr, B_ptr ;
.reg A_sum1:A_sum0 ;
.reg B_sum3:B_sum2 ;
;--
.reg A_fg, A_x32:A_x10 ;
.reg B_x76:B_x54, A_x76:A_x54 ;
.reg A_x21, B_x43, B_x65 ;
.reg A_prod0_10, A_prod0_32, A_prod2_32 ;
.reg B_prod2_54, A_prod1_21, B_prod1_43 ;
.reg B_prod3_43, B_prod3_65, A_sum0_0 ;
.reg A_sum0_1, A_sum1_0, B_sum1_1 ;
.reg B_sum2_0, B_sum2_1, B_sum3_0 ;
.reg B_sum3_1, B_sum0_s, A_sum1_s ;
.reg B_sum2_s, B_sum3_s, B_r32:B_r10 ;
.reg B_fl, A_h32, A_h10 ;
.reg B_sum1_s
.if 1
;--------------------------------------------------------;
; Use the following code if the ratio of output ;
; samples to filter taps is >= 5, i.e. nr/nh >= 5 ;
;--------------------------------------------------------;
AND.2 3, B_nh, B_m ; m = nh & 3
[!B_m] MVK.2 4, B_m ; m = 4
;--------------------------------------------------------;
; Detect if mask is a multiple of 4 or not ;
; If mask is of the form 4n + 3, then h3 = 0 ;
; If mask is of the form 4n + 2, then h3h2 = 0 ;
; If mask is of the form 4n + 1, then h3,h2,h1 = 0 ;
;--------------------------------------------------------;
CMPEQ.2 3, B_m, B_3 ; m == 3
CMPEQ.2 2, B_m, B_2 ; m == 2
CMPEQ.1 1, B_m, B_1 ; m == 1
SUB.2 B_nh, B_m, B_ofs ; ofs = nh - m
;--------------------------------------------------------;
; Read the filter taps speculatively. If the filter ;
; tap is not valid it is replaced with zero. At most ;
; 3 values past the end of the array may be read. ;
;--------------------------------------------------------;
ADDAH B_hptr, B_ofs, B_ptr ; ptr = &h[ofs]
LDNDW *B_ptr, B_h32:B_h10 ; Load h32:h10
[B_3] CLR B_h32, 16, 31, B_h32 ; h32 = 00XX
[B_2] ZERO B_h32 ; h32 = 0
[B_1] ZERO B_h32 ; h32 = 0
[B_1] CLR B_h10, 16, 31, B_h10 ; h10 = 00XX
;--------------------------------------------------------;
; Pack either modified h3, h2, h1, h0 or unmodified ;
; h3,h2,h1,h0 to form a new double word h3210_n, to ;
; be used on the last stage. ;
;--------------------------------------------------------;
MV B_h32, B_h32_n ; h32_n = h32
MV B_h10, B_h10_n ; h10_n = h10
;--------------------------------------------------------;
; Round up the filter taps, number of output samples ;
; to the nearest multiple of four. ;
;--------------------------------------------------------;
ADD A_nr, 3, A_nr_l ; nr + 3
ADD B_nh, 3, B_nh_l ; nh + 3
SHRU A_nr_l, 2, A_nr_l ; nr_l >> 2
SHRU B_nh_l, 2, B_nh_l ; nh_l >> 2
MPY A_nr_l, B_nh_l, B_ij ; nr_l * nh_l
MV B_nh_l, A_nh_l ; Copy
MV A_nh_l, A_i ; i = nh_l
MV A_nh_l, B_ic ; ic = nh_l
MV A_nh_l, A_il ; il = nh_l
ZERO A_sum1:A_sum0 ; Zero accum.
ZERO B_sum3:B_sum2 ; for 4 samples
;--------------------------------------------------------;
; Perform copy of filter pointer to A side. ;
; Decrement by 2 for BDEC ;
;--------------------------------------------------------;
MV B_hptr, A_hptr ; Copy of ptr.
SUB B_ij, 2, B_ij ; ij -= 2
.mptr A_xptr, A_x+0, 8
.mptr A_hptr, A_x+0, 8
.mptr A_rptr, A_x+0, 8
LOOP: .trip 1 ; Loop
SUB.1 A_nh_l, A_i, A_fg ; fg = nh_l - i
[!A_fg]ZERO.1 A_i ; (!fg) i = 0
[!A_i] LDNDW.D1T1 *A_xptr++, A_x32:A_x10 ; Load x32:x10
LDNDW.D1T2 *A_hptr[A_i], B_h32:B_h10 ; Load h32:h10
LDNDW.D1T2 *A_xptr[A_i], B_x76:B_x54 ; Load x76:x54
LDNDW.D1T1 *A_xptr[A_i], A_x76:A_x54 ; Load x76:x54
ADD.1 A_i, 1, A_i ; i ++
SUB.2 A_il, 1, B_fl ; fl = il - 1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -