* Source file: "fir complex 16-bit data with complex 32-bit accumulate.txt"
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* Complex FIR
*
* Revision Date: 2/3/97
*
* USAGE This routine is C Callable and can be called as:
*
* void fircx(short *x, short *h, short *y, int N, int M)
*
* x = input array
* h = coefficient array
* y = output array
* N = number of coefficients (N EVEN >= 2)
* M = number of output samples (M >= 1)
*
* If routine is not to be used as a C callable function
* then all instructions relating to stack should be removed.
* Refer to comments of individual instructions. You will also
* need to initialize values for all of the values passed as these
* are assumed to be in registers as defined by the calling
* convention of the compiler, (refer to the C compiler reference
* guide).
*
* C Code This is the C equivalent of the Assembly Code without
* restrictions. Note that the assembly code is hand optimized and
* restrictions may apply
*
* void fircx(short *x, short *h, short *y, short N, short M)
* {
* short i,j;
* int imag, real;
*
* for (i = 0; i < 2*M; i += 2){
* imag = 0;
* real = 0;
* for (j = 0; j < 2*N; j += 2){
* real += h[j] * x[i-j] - h[j+1] * x[i+1-j];
* imag += h[j] * x[i+1-j] + h[j+1] * x[i-j];
* }
* y[i] = (real >> 15);
* y[i+1] = (imag >> 15);
* }
* }
*
* DESCRIPTION
* This complex FIR assumes the number of filter coefficients is a
* multiple of 2 and the number of output samples times the number
* of filter coefficients is at least 4 (N*M >= 4).
*
* It operates on 16-bit data with a 32-bit accumulate. This
* routine has no memory hits regardless of where x, h, and y
* arrays are located in memory. The filter is M output samples
* and N coefficients. Each array consists of an even and odd term
* with even terms representing the real part of the element and
* the odd terms the imaginary.
*
* It is assumed that x points to the Nth element of some complex
* array (2N shorts) upon entry to the function.
*
*
* TECHNIQUES
* The inner loop is unrolled two times thus the number of
* filter coefficients must be a multiple of two
*
* The outer loop is conditionally executed in parallel with the
* inner loop. This allows for a zero overhead outer loop.
*
*
* ASSUMPTIONS
* N MULTIPLE of 2 >= 2
* M >= 1
* N*M >= 4
*
*
* MEMORY NOTE
* This code has no memory hits regardless of where x and h are
* located in memory.
*
* CYCLES 2*M*N + 10
*
*===============================================================================
*-------------------------------------------------------------------------------
* _fircx -- complex FIR, 16-bit data, 32-bit accumulate (C callable)
*
* Arguments as consumed by this code:
*   A4 = x (input pointer, walked downwards)    B4 = h (coefficient pointer)
*   A6 = y (output pointer)                     B6 = N      A8 = M
*   M*N is formed with MPY, which multiplies the signed low 16 bits of its
*   operands, so only the low halves of A8 and B6 contribute to the count.
*
* Register roles inside the routine:
*   B0      M*N based loop counter, predicates the loop branch
*   A1      inner (per-output) tap counter; reloaded from A7 (= N) when zero
*   A2      store inhibit: SHR/STH of a finished output run only when A2 == 0
*   A10     offset subtracted from A1 to form A2 (0, or N-4 when N > 4)
*   B10     4*N bytes, used to rewind the h pointer (B4)
*   A0, B9  4*N + 4 bytes, used to step the x pointers (A4, B11)
*   B11     second x pointer (A4 - 4), used for the odd taps
*   B6      after setup: y + 2 bytes, output pointer for the imaginary part
*   A8/A9   real accumulators          B8/B1   imaginary accumulators
*   A11/B7  product registers          A3/A5/B2/B5   loaded data
*
* Stack frame: 16 bytes. The save-on-entry registers written by this routine
* (A10, A11, B10, B11) are preserved. Every push post-decrements B15 and every
* pop pre-increments it, so B15 always points at an unused word (TI C6000
* convention) and B15 is restored to its entry value on return.
* All other registers used are caller-saved scratch and are not preserved.
*-------------------------------------------------------------------------------
          .global _fircx
          .text
_fircx:
          STW     .D2     B10,*B15--        ; push register (for c-callable func)
          STW     .D2     A10,*B15--        ; push register (for c-callable func)
          STW     .D2     B11,*B15--        ; push register (B11 is written below)
          STW     .D2     A11,*B15--        ; push register; leave SP on a free word
*** BEGIN Benchmark Timing ***
B_START
          SHL     .S2     B6,2,B10          ; 4*N bytes, used to reset the pointer
||        LDW     .D2     *B4++[2],B5       ; h[j] & h[j+1] (real & imag)
||        LDW     .D1     *A4--[2],A5       ; x[i-j] & x[i+1-j]
||        SUB     .L2     B6,2,B1           ; N - 2

          ADD     .S1X    B10,4,A0          ; used to reset the pointer
||        CMPGT   .L2     B6,4,B2           ; N > 4?
||        MPY     .M2X    A8,B6,B0          ; M*N, loop counter
||        ADD     .S2     B10,4,B9          ; used to reset the pointer

   [!B1]  SUB     .S2     B4,B10,B4         ; reset the ptr
||        LDW     .D2     *-B4[1],A3        ; h[j+2] & h[j+3] (real & imag)
||        LDW     .D1     *+A4[1],B2        ; x[i-2-j] & x[i-1-j]
|| [!B1]  ADD     .L1     A4,A0,A4          ; reset the ptr
||        MV      .S1X    B6,A7             ; N
||        MPY     .M1     A10,0,A10         ; zero reset counter value
||        SUB     .L2X    A4,4,B11          ; &x[i-2-j]

   [!B1]  ADD     .S2     B11,B9,B11        ; reset the ptr
|| [B1]   SUB     .L1X    B1,2,A1           ; decrement inner loop counter
|| [!B1]  SUB     .S1     A7,2,A1           ;* reset inner loop counter
||        LDW     .D2     *B4++[2],B5       ;* h[j] & h[j+1] (real & imag)

          LDW     .D2     *B11--[2],B2      ;* x[i-2-j] & x[i-1-j]
||        LDW     .D1     *A4--[2],A5       ;* x[i-j] & x[i+1-j]
|| [B2]   SUB     .S1     A7,4,A10          ; setup reset counter offset (B2 still
                                            ;  holds the N > 4 flag; loads pending)
||        ADD     .L2X    A6,2,B6           ; B6 = y + 2 bytes: imag output pointer
                                            ;  kept in the B register file

          MPYLH   .M2X    A5,B5,B8          ; x[i-j] * h[j+1] (real * imag)
||        MPY     .M1X    A5,B5,A8          ; x[i-j] * h[j] (real * real)
|| [!A1]  ADD     .S1     A4,A0,A4          ;* reset the ptr

          MPYHL   .M2X    A5,B5,B7          ; x[i+1-j] * h[j] (imag * real)
||        MPYH    .M1X    A5,B5,A11         ; x[i+1-j] * h[j+1] (imag * imag)
|| [B0]   B       .S2     LOOP              ; branch to the loop
|| [!A1]  SUB     .L2     B4,B10,B4         ;* reset the ptr
||        LDW     .D2     *-B4[1],A3        ;* h[j+2] & h[j+3] (real & imag)

          MPYLH   .M2X    B2,A3,B7          ; x[i-2-j] * h[j+3] (real * imag)
||        MPY     .M1X    B2,A3,A11         ; x[i-2-j] * h[j+2] (real * real)
||        MVK     .S1     1,A2              ; prevent first stores from executing
|| [B0]   SUB     .S2     B0,4,B0           ; decrement loop counter
|| [!A1]  ADD     .L2     B11,B9,B11        ;* reset the ptr
|| [A1]   SUB     .D1     A1,2,A1           ;* decrement inner loop counter
|| [!A1]  SUB     .L1     A7,2,A1           ;** reset inner loop counter
||        LDW     .D2     *B4++[2],B5       ;** h[j] & h[j+1] (real & imag)

* Software-pipelined kernel: four execute packets per pass, two taps per pass.
* The "*" marks in the comments count how many passes ahead an instruction
* is working. Packet grouping and order must not be changed.
LOOP:
          ADD     .L2     B8,B7,B8          ; imag
||        SUB     .L1     A8,A11,A8         ; real
||        MPYHL   .M2X    B2,A3,B7          ; x[i-1-j] * h[j+2] (imag * real)
||        MPYH    .M1X    B2,A3,A11         ; x[i-1-j] * h[j+3] (imag * imag)
||        LDW     .D1     *A4--[2],A5       ;** x[i-j] & x[i+1-j]
||        LDW     .D2     *B11--[2],B2      ;** x[i-2-j] & x[i-1-j]
|| [!A2]  SHR     .S2     B1,15,B1          ; final y[i+1]
|| [!A2]  SHR     .S1     A9,15,A9          ; final y[i]

          ADD     .L2     B8,B7,B1          ; imag
||        ADD     .L1     A8,A11,A9         ; real
||        MPYLH   .M2X    A5,B5,B8          ;* x[i-j] * h[j+1] (real * imag)
||        MPY     .M1X    A5,B5,A8          ;* x[i-j] * h[j] (real * real)
|| [!A1]  ADD     .S2     B11,B9,B11        ;** reset the pointer
|| [!A1]  ADD     .S1     A4,A0,A4          ;** reset the ptr
|| [!A2]  STH     .D2     B1,*B6++[2]       ; store imOut[0]
|| [!A2]  STH     .D1     A9,*A6++[2]       ; store reOut[0]

          ADD     .L2     B1,B7,B1          ; imag
||        SUB     .L1     A9,A11,A9         ; real
||        SUB     .D1     A1,A10,A2         ; A2 == 0 enables the next SHR/STH
||        MPYHL   .M2X    A5,B5,B7          ;* x[i+1-j] * h[j] (imag * real)
||        MPYH    .M1X    A5,B5,A11         ;* x[i+1-j] * h[j+1] (imag * imag)
|| [B0]   B       .S1     LOOP              ;* branch to the loop
|| [!A1]  SUB     .S2     B4,B10,B4         ;** reset the ptr
||        LDW     .D2     *-B4[1],A3        ;** h[j+2] & h[j+3] (real & imag)

   [A2]   ADD     .L2     B1,B8,B8          ; imag (carry the sum unless an output
|| [A2]   ADD     .L1     A9,A8,A8          ; real  has just been completed)
||        MPYLH   .M2X    B2,A3,B7          ;* x[i-2-j] * h[j+3] (real * imag)
||        MPY     .M1X    B2,A3,A11         ;* x[i-2-j] * h[j+2] (real * real)
|| [B0]   SUB     .S2     B0,2,B0           ;** decrement loop counter
|| [A1]   SUB     .D1     A1,2,A1           ;** decrement inner loop counter
|| [!A1]  SUB     .S1     A7,2,A1           ;*** reset inner loop counter
||        LDW     .D2     *B4++[2],B5       ;*** h[j] & h[j+1] (real & imag)
          ; Loop ends here

          SHR     .S2     B1,15,B1          ; final y[i+1]
||        SHR     .S1     A9,15,A9          ; final y[i]
||        LDW     .D2     *++B15,A11        ; pop register (for c-callable func)

          STH     .D2     B1,*B6++          ; store imOut[0]
||        STH     .D1     A9,*A6++          ; store reOut[0]
B_END:
*** END Benchmark Timing ***
          LDW     .D2     *++B15,B11        ; pop register (for c-callable func)

          LDW     .D2     *++B15,A10        ; pop register (for c-callable func)
||        B       .S2     B3                ; return

          LDW     .D2     *++B15,B10        ; pop register; B15 is back at entry value
          NOP             4                 ; fill the remaining branch delay slots;
                                            ;  also covers the B10 load latency
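* ------------------------------------------------------------------------------
* Web code-viewer residue (not part of the program): keyboard shortcuts
*   Copy code            Ctrl + C
*   Search code          Ctrl + F
*   Full-screen mode     F11
*   Toggle theme         Ctrl + Shift + D
*   Show shortcuts       ?
*   Increase font size   Ctrl + =
*   Decrease font size   Ctrl + -
* ------------------------------------------------------------------------------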