/* cfft_rad4_ps_nbrev.asm  (web code-viewer residue: "Font size:") */
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : CFFT_Rad4_PS_NBRev.asm
Label name : __CFFT_Rad4_PS_NBRev
Version : 1.4
Change History :
Version Date Author Comments
1.4 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.3 11/13/2002 Swarnalatha Tested with VDSP++3.0
on ADSP-21535 Rev.0.2
1.2 02/18/2002 Nishanth Modified to match
silicon cycle count
1.1 11/12/2001 Nishanth Modified to include
initialization of M2
1.0 05/01/2001 Nishanth Original
Description : The assembly function implements a radix-4, out-of-place,
decimation-in-time (DIT) FFT algorithm for complex inputs.
The twiddle factor array to be passed to the function must be
initialized with real(cos) and imaginary(-sine) values
alternately.
The length of the twiddle factor array should be 3*N/4 - 2,
where N is the number of FFT points.
w = e^(-2*j*pi*[0 : 3*N/4-3]/N)
The input array to be passed to the function also must be
initialized with real and imaginary values alternately.
The real and imaginary output values will be stored
alternately in the output array.
Static scaling : Input data is scaled by 4 in the first stage
to avoid overflow. The output of each stage FFT except last
stage is scaled by 4 to avoid overflow.
Assumptions : 1. There should be at least 2 stages, since the first stage is
done separately. Bit reversal is done in the first stage, so
it is separated out. Scaling of the output is not required in
the last stage, so it is separated out as well; if N = 16,
the computation for the intermediate stages is skipped by a
conditional jump.
In brief, the FFT length should be at least 16 and should be an
integer power of 4.
2. The input array base address in[] should have 'x' zeros in
its LSBs for bit reversal to work properly,
where x = log2(4*N).
3. w[] and out[] should be aligned to a 4 byte boundary.
4. All the inputs are assumed to be scaled by 4.
5. out[] and w[] should be in different minibanks.
6. in[] and out[] should be in different minibanks.
Prototype : void _CFFT_Rad4_PS_NBRev(
complex_fract16 in[],
// (i) : Pointer to the input array.
complex_fract16 out[],
// (o) : Pointer to the output array.
int N,
// (i) : FFT length.
complex_fract16 w[])
// (i) : Twiddle factor array
Registers used : A0,A1, R0-R7, I0-I3, B1,B3, M0-M3, L0-L3, P0-P2, LC0,LC1, CC
Performance :
Code Size : 496 Bytes.
Cycle Count : 3 * N * M + 20 * M - 1.5 * N + 18
where N = FFT length and M = log(N) to the base 4.
130 cycles for FFT size of 16.
558 cycles for FFT size of 64.
2786 cycles for FFT size of 256.
13942 cycles for FFT size of 1024.
*******************************************************************************/
.section L1_code;
.align 8;
.global __CFFT_Rad4_PS_NBRev;
__CFFT_Rad4_PS_NBRev:
[--SP] = (R7:4); // Save registers R4-R7
P1 = R2; // P1 = N, length of FFT
L0 = 0; // Disable circular buffering
L2 = 0;
I0 = R0; // Address of input array
M0 = 16; // Modifier to step the output pointer back by 16 bytes,
// covering the 4 dummy writes of the first loop pass
I1 = R1; // Address of output buffer(read pointer)
B1 = R1; // Base address of circular buffer
R0 = R2 << 2 || R3 = [SP + 28];
// R0 = 4*N , R3 = Address of twiddle factor array
L1 = R0; // Circular buffering enabled
I3 = R1; // Address of output buffer(write pointer)
B3 = R1;
L3 = R0;
R0 = R2 << 1 || I1 -= M0;
// R0 = 2*N , Decrement output pointer by 16 bytes
// (M0) for skewing
M3 = R0; // M3 = 2*N
I2 = R3; // Address of twiddle factor array
P1 = P1 >> 2; // P1 = N/4;
M2 = 0; // To avoid overflowing of I2 during dummy
// increments at the start
P0 = 16; // Modifier for fetching input
P2 = 0; // Loop counter for number of butterflies in each
// group
// Start of first stage, with input fetching by bit reversal. Input is scaled by
// four. The output is also scaled by four by the ASR option.
LSETUP(STAGE1_ST,STAGE1_END) LC0 = P1;
// There are N/4 butterflies in first stage
STAGE1_ST:
R0 = R4 +|+ R7 , R1 = R4 -|- R7(ASR) || I0 += M3 (BREV) || R7 = [I0];
// y0 = A +|+ C, y2 = A -|- C, Do bit-reversal and
// fetch x0
R3 = R5 +|- R6 , R0 = R5 -|+ R6(ASR) || [I1++] = R0 || R6 = [I0];
// y3 = B +|- D, y1 = B -|+ D, Store y0 of this
// butterfly and fetch x2
A0=R7.L*R1.L || [I1++] = R0 || I0 += M3 (BREV);
// Dummy MAC to avoid EU->MUL/MAC stall
// Store y1 of previous butterfly , Do bit-reversal
R4 = R7 +|+ R6 , R5 = R7 -|- R6(ASR) || I0 += M3 (BREV) || R7 = [I0];
// A = x0 +|+ x2, B = x0 -|- x2, Do bit-reversal and
// fetch x1
[I1++] = R1 || R6 = [I0];
// Store y2 of previous butterfly and fetch x3
STAGE1_END:
R7 = R7 +|+ R6 , R6 = R7 -|- R6 (ASR,CO) || [I1++] = R3
|| I0 += M3 (BREV);
// C = x1 +|+ x3, D = x1 -|- x3(CO),
// Store y3 of previous butterfly , Do bit-reversal
R0 = R4 +|+ R7 , R1 = R4 -|- R7(ASR);
// y0 = A +|+ C, y2 = A -|- C
R3 = R5 +|- R6 , R0 = R5 -|+ R6(ASR) || [I1++] = R0;
// y3 = B +|- D, y1 = B -|+ D, Store y0 of this
// butterfly
[I1++] = R0; // Store y1 of last butterfly
[I1++] = R1; // Store y2 of last butterfly
[I1++] = R3; // Store y3 of last butterfly, Modify I2 as there
//are 2 dummy writes
// End of first stage with input fetching by bit
// reversal
// Start of intermediate stages. All the stages except first and last done here.
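/* Web code-viewer residue (not part of the assembly source):
   Keyboard shortcuts
   Copy code            Ctrl + C
   Search code          Ctrl + F
   Fullscreen mode      F11
   Toggle theme         Ctrl + Shift + D
   Show shortcuts       ?
   Increase font size   Ctrl + =
   Decrease font size   Ctrl + -
*/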