📄 ffts.s
字号:
;===========================================================================
;
; UNOPTIMISED algorithm - has the advantage of being much smaller
;
[ OPTIMISE=0
;---------------------------------------------------------------------------
; Now the main UNOPTIMISED FFT loops start...
; The way it works is that we start at one end, lookup in the trig table
; the value for Sine and Cosine for that element, and then we do a cunning
; inner loop where we do ALL of the elements which require those values
; of Sine and Cosine.
; Swapped out registers
StartElem RN 3 ; First calculation in this group (k)
Tab RN 4 ; Index into trig. table, (wk/(2pi/N))
; Inner loop registers
Yadr RN 0 ; Yadr = address of first data value
Zadr RN 1 ; Zadr = address of second data value
CalcsLoop RN 2 ; counter for inner (butterfly) loop
Yr RN 3 ; Yr = first data value, real part
Yi RN 4 ; Yi = first data value, imaginary part
Zr RN 5 ; Zr = second data value, real part
Zi RN 6 ; Zi = second data value, imaginary part
Tr RN 7 ; Tr = temp complex value, real part
Ti RN 8 ; Ti = temp complex value, imaginary part
GrpStage RN 9 ; Number of groups in the current stage
TabStep RN 10 ; angle step at each stage
; R11 is Tp
Cos RN 12 ; Current cos(wk) - sometimes -ve
Sin RN 13 ; Current sin(wk) - always +ve!
; R14 is PDataAr
MOV GrpStage, #1 ; first stage has one group (n=2)
MOV CalcsLoop, BR_N, LSR#1 ; and N/2 calculations per group
MOV TabStep, #(TABLE_N/2) ; first angle step goes pi around
STGLOOP ; start of STAGES loop
STR CalcsLoop,[PDataAr,#MCalcsLoop] ; swap out number of calcs per group
MOV StartElem, #0 ; first group starts at element 0
MOV Tab, #0 ; first group has wk=0
GRPLOOP ; start of GROUPS loop
ADD Tp,PDataAr,#MStartElem
STMIA Tp,{StartElem,Tab} ; swap out StartElem and Tab
; look up sin and cosine
MOV Tp,#0 ; flag b0 to negate cos
CMP Tab,#(TABLE_N/4)
RSBGT Tab,Tab,#(TABLE_N/2) ; transfer Tab to first quadrant
ORRGT Tp,Tp,#1 ; and set a flag if it has moved
CMP Tab,#(TABLE_N/8)
RSBGT Tab,Tab,#(TABLE_N/4) ; transfer Tab to first 1/8th.
ADRL Cos,TRIG_TABLE ; address of table
LDR Tab,[Cos,Tab,LSL#2] ; get the cos/sin entry
MOVLE Sin, Tab, LSR#16 ; extract sin
BICLE Cos, Tab, Sin, LSL#16 ; extract cos if 1st 1/8th
MOVGT Cos, Tab, LSR#16 ; extract cos
BICGT Sin, Tab, Cos, LSL#16 ; extract sin if 2nd 1/8th
TST Tp,#1
RSBNE Cos,Cos,#0 ; negate cos to swap quadrant
LDR Yadr,[PDataAr,#MOutAddr] ; address of first element
ADD Yadr,Yadr,StartElem,LSL #3 ; of first calculation
CALCLOOP ; start of calculations loop
ADD Zadr,Yadr,GrpStage,LSL #3 ; calculate address of element Z
LDMIA Yadr,{Yr,Yi} ; load element Y
LDMIA Zadr,{Zr,Zi} ; load element Z
MUL Tr,Zr,Cos ; t=Z*(cos-i*sin)
MLA Tr,Zi,Sin,Tr
MUL Tp,Zr,Sin
MUL Ti,Zi,Cos
SUB Ti,Ti,Tp
SUB Zr,Yr,Tr,ASR #14 ; Z = Y-T
SUB Zi,Yi,Ti,ASR #14
MOV Zr,Zr,ASR #1 ; scale results
MOV Zi,Zi,ASR #1
STMIA Zadr,{Zr,Zi} ; store new values for next stage
ADD Yr,Yr,Tr,ASR #14 ; Y = Y+T
ADD Yi,Yi,Ti,ASR #14 ; ASR #14 compensates for
; fixed-point cos & sin
MOV Yr,Yr,ASR #1
MOV Yi,Yi,ASR #1
STMIA Yadr,{Yr,Yi}
ADD Yadr,Yadr,GrpStage,LSL #4 ; prepare for next calculation
; of this group
SUBS CalcsLoop,CalcsLoop,#1
BNE CALCLOOP ; end of CALCULATIONS loop
LDMIB PDataAr,{CalcsLoop,StartElem,Tab}
ADD StartElem, StartElem,#1 ; next group starts one element later
ADD Tab, Tab, TabStep ; increase angle wk
CMP StartElem, GrpStage ; have we finished this stage?
BLT GRPLOOP ; do next group (loop of m's)
MOV GrpStage, GrpStage,LSL #1 ; next stage has twice as many
; groups per stage
MOV CalcsLoop, CalcsLoop,LSR #1 ; next stage has half as many
; calculations per group
MOV TabStep, TabStep, LSR #1 ; and finer angle resolution
LDR Tp,[PDataAr,#MN] ; find the N value
CMP GrpStage, Tp ; compare n/2 with N
BLT STGLOOP ; go around stages loop again
]
;===========================================================================
;
; OPTIMISED algorithm - much faster but larger code size
;
; The first three stages are done by hand before entering a stages loop
[ OPTIMISE=1
; The first two stages only consist of additions and subtractions,
; and the third has only additions, subtractions, and multiplies by the
; constant SQRT(2)/2.
;
; The optimisation attempts to do eight elements at a time of the first
; three stages, without any unneccessary loads or stores.
; To do this in one operation with no intermediate loads and stores
; requires a minimum of 19 registers (16 for the complex data, two for
; data pointers, and one for swapping data). We obviously don't have 19
; registers to play with, but we shall do it with the minimum number
; of intermediate stores (4) so that 4 data words are paged out once,
; and then paged back in again (as opposed to all of them being paged
; in and out twice if the 3 stages were done separately).
; Unfortunately, because of the tightness of the register situation and
; the complexity of the stages, keeping register names meaningful is a
; a bit tricky, so you'll have to bear with me for a while ...
;
; I will name the registers as follows:
; The first letter denotes the stage "A"=stage 1, "B"=stage 2, "C"=stage 3.
; The next digit denotes the butterfly group from "0" to "3".
; Each butterfly group has a pair of elements "Y" and "Z".
; Each pair of elements has a real "r" and imaginary "i" parts.
;
; So A3Zi is the imaginary part of the second element required in the
; third group of the first stage.
;
; I could have just named the registers 0 to 7 to represent the offset
; of each element, but unfortunately the register each is in keeps
; changing ...
;
; Define the register aliases for each of the optimised stages.
OS_indx RN 10 ; Points to block of values being processed
OS_cnt RN 11 ; count of blocks processed
A0Yr RN 0 ; Stage 1 register allocation,
A0Yi RN 1 ; firstly for groups 0 and 1.
A0Zr RN 2
A0Zi RN 3
A1Yr RN 4
A1Yi RN 5
A1Zr RN 6
A1Zi RN 7
Tp1 RN 8
B0Yr RN 0 ; Stage 2 register allocation,
B0Yi RN 1 ; firstly for groups 0 and 1.
B0Zr RN 4
B0Zi RN 5
B1Yr RN 2
B1Yi RN 3
B1Zr RN 6
B1Zi RN 7
A2Yr RN 4 ; Stage 1 register allocation,
A2Yi RN 5 ; then for groups 2 and 3.
A2Zr RN 6
A2Zi RN 7
Tp2 RN 3
A3Yr RN 8
A3Yi RN 9
A3Zr RN 12
A3Zi RN 13
B2Yr RN 4 ; Stage 2 register allocation,
B2Yi RN 5 ; then for groups 2 and 3.
B2Zr RN 8
B2Zi RN 9
B3Yr RN 6
B3Yi RN 7
B3Zr RN 12
B3Zi RN 13
C0Yr RN 0 ; Stage 3 register allocation,
C0Yi RN 1 ; firstly for groups 0 and 1.
C1Yr RN 2
C1Yi RN 3
C0Zr RN 4
C0Zi RN 5
C1Zr RN 6
C1Zi RN 7
C2Yr RN 0 ; Stage 3 register allocation,
C2Yi RN 1 ; then for groups 2 and 3.
C3Yr RN 2
C3Yi RN 3
C2Zr RN 4
C2Zi RN 5
C3Zr RN 6
C3Zi RN 7
CxT1 RN 8 ; Temporary registers.
CxT2 RN 9
; optimised third stage
CxTr RN 12 ; temporary value, real part
CxTi RN 13 ; temporary value, imaginary part
MOV OS_indx,BR_outst ; address of the output
; start at the beginning
MOV OS_cnt,BR_N,LSR#3 ; third stage has N/8 subsequences,
; with 4 groups of N/8 calculations,
; theta=0, -pi/4, -pi/2 and -3pi/4
STG3_LOOP
LDMIA OS_indx,{A0Yr, A0Yi, A0Zr, A0Zi, A1Yr, A1Yi, A1Zr, A1Zi }
; Group 0 of the first stage (results kept in registers).
ADD A0Yr, A0Yr, A0Zr ; Yr' = Yr + Zr
ADD A0Yi, A0Yi, A0Zi ; Yi' = Yi + Zi
SUB A0Zr, A0Yr, A0Zr, LSL #1 ; Zr' = Yr - Zr
SUB A0Zi, A0Yi, A0Zi, LSL #1 ; Zi' = Yi - Zi
; Group 1 of the first stage (results kept in registers).
ADD A1Yr, A1Yr, A1Zr ; Yr' = Yr + Zr
ADD A1Yi, A1Yi, A1Zi ; Yi' = Yi + Zi
SUB Tp1, A1Yi, A1Zi, LSL #1 ; Zi' = Yi - Zi
SUB A1Zr, A1Yr, A1Zr, LSL #1 ; Zr' = Yr - Zr
; Now to do two pairs of the second stage calculations, using the results in
; registers left from the above first stage calculation. The first half of
; the results needed for a third stage calculation are left in the registers,
; the rest are stored.
; Group 0 of the second stage
ADD B0Yr, B0Yr, B0Zr ; Yr' = Yr + Zr
ADD B0Yi, B0Yi, B0Zi ; Yi' = Yi + Zi
SUB B0Zr, B0Yr, B0Zr, LSL #1 ; Zr' = Yr - Zr
SUB B0Zi, B0Yi, B0Zi, LSL #1 ; Zi' = Yi - Zi
; Group 1 of the Second Stage
ADD B1Yr, B1Yr, Tp1 ; Yr' = Yr + Zi
SUB B1Yi, B1Yi, B1Zr ; Yi' = Yi - Zr
ADD B1Zi, B1Yi, B1Zr, LSL #1 ; Zi' = Yi + Zr
SUB B1Zr, B1Yr, Tp1 , LSL #1 ; Zr' = Yr - Zi
; Store some of the results because we need some spare registers...
ADD OS_indx,OS_indx,#12
STMIA OS_indx!,{B1Yi,B0Zr,B0Zi,B1Zr,B1Zi}
; Do another two pairs of the first stage calculations, keeping the results
; in registers, but not overwriting the four registers containing results
; from the above calculations.
LDMIA OS_indx,{B2Yr, B2Yi, B3Yr, B3Yi, B2Zr, B2Zi, B3Zr, B3Zi }
; Group 2 of the first stage.
ADD A2Yr, A2Yr, A2Zr ; Yr' = Yr + Zr
ADD A2Yi, A2Yi, A2Zi ; Yi' = Yi + Zi
SUB A2Zr, A2Yr, A2Zr, LSL #1 ; Zr' = Yr - Zr
SUB A2Zi, A2Yi, A2Zi, LSL #1 ; Zi' = Yi - Zi
; Group 3 of the first stage
ADD A3Yr, A3Yr, A3Zr ; Yr' = Yr + Zr
ADD A3Yi, A3Yi, A3Zi ; Yi' = Yi + Zi
SUB Tp2, A3Yi, A3Zi, LSL #1 ; Zi' = Yi - Zi
SUB A3Zr, A3Yr, A3Zr, LSL #1 ; Zr' = Yr - Zr
; Now to do another pair of second stage calculations, saving some registers,
; but keeping in registers the data required to complete one third stage.
; Group 2 of the second stage
ADD B2Yr, B2Yr, B2Zr ; Yr' = Yr + Zr
ADD B2Yi, B2Yi, B2Zi ; Yi' = Yi + Zi
SUB B2Zr, B2Yr, B2Zr, LSL #1 ; Zr' = Yr - Zr
SUB B2Zi, B2Yi, B2Zi, LSL #1 ; Zi' = Yi - Zi
; Group 3 of the second stage
ADD B3Yr, B3Yr, Tp2 ; Yr' = Yr + Zi
SUB B3Yi, B3Yi, B3Zr ; Yi' = Yi - Zr
ADD B3Zi, B3Yi, B3Zr, LSL #1 ; Zi' = Yi + Zr
SUB B3Zr, B3Yr, Tp2, LSL #1 ; Zr' = Yr - Zi
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -