📄 ffts.s

📁 ARM库里的FFT算法
💻 S
📖 第 1 页 / 共 4 页
字号:

;===========================================================================
;
; UNOPTIMISED algorithm - has the advantage of being much smaller
;

 [ OPTIMISE=0

;---------------------------------------------------------------------------
; Now the main UNOPTIMISED FFT loops start...
; The way it works is that we start at one end, lookup in the trig table
; the value for Sine and Cosine for that element, and then we do a cunning
; inner loop where we do ALL of the elements which require those values
; of Sine and Cosine.

; Swapped out registers

StartElem    RN 3         ; First calculation in this group (k)
Tab          RN 4         ; Index into trig. table,  (wk/(2pi/N))

; Inner loop registers

Yadr         RN 0         ; Yadr = address of first data value
Zadr         RN 1         ; Zadr = address of second data value
CalcsLoop    RN 2         ; counter for inner (butterfly) loop
Yr           RN 3         ; Yr = first data value, real part
Yi           RN 4         ; Yi = first data value, imaginary part
Zr           RN 5         ; Zr = second data value, real part
Zi           RN 6         ; Zi = second data value, imaginary part
Tr           RN 7         ; Tr = temp complex value, real part
Ti           RN 8         ; Ti = temp complex value, imaginary part
GrpStage     RN 9         ; Number of groups in the current stage
TabStep	     RN 10	  ; angle step at each stage
; R11 is Tp
Cos          RN 12        ; Current cos(wk) - sometimes -ve
Sin          RN 13        ; Current sin(wk) - always +ve!
; R14 is PDataAr

  MOV    GrpStage, #1                 ; first stage has one group (n=2)
  MOV    CalcsLoop, BR_N, LSR#1       ; and N/2 calculations per group
  MOV	 TabStep, #(TABLE_N/2)	      ; first angle step goes pi around

STGLOOP                               ; start of STAGES loop
  STR	 CalcsLoop,[PDataAr,#MCalcsLoop] ; swap out number of calcs per group
  MOV    StartElem, #0                ; first group starts at element 0
  MOV    Tab, #0                      ; first group has wk=0

GRPLOOP                               ; start of GROUPS loop
  ADD	 Tp,PDataAr,#MStartElem
  STMIA  Tp,{StartElem,Tab}           ; swap out StartElem and Tab
  ; look up sin and cosine
  MOV    Tp,#0			      ; flag b0 to negate cos
  CMP	 Tab,#(TABLE_N/4)
  RSBGT	 Tab,Tab,#(TABLE_N/2)         ; transfer Tab to first quadrant
  ORRGT	 Tp,Tp,#1                     ; and set a flag if it has moved
  CMP	 Tab,#(TABLE_N/8)
  RSBGT	 Tab,Tab,#(TABLE_N/4)         ; transfer Tab to first 1/8th.
  ADRL   Cos,TRIG_TABLE		      ; address of table
  LDR	 Tab,[Cos,Tab,LSL#2]          ; get the cos/sin entry
  MOVLE	 Sin, Tab, LSR#16	      ; extract sin
  BICLE	 Cos, Tab, Sin, LSL#16	      ; extract cos if 1st 1/8th
  MOVGT	 Cos, Tab, LSR#16	      ; extract cos
  BICGT	 Sin, Tab, Cos, LSL#16	      ; extract sin if 2nd 1/8th
  TST	 Tp,#1
  RSBNE	 Cos,Cos,#0                   ; negate cos to swap quadrant

  LDR	 Yadr,[PDataAr,#MOutAddr]     ; address of first element
  ADD    Yadr,Yadr,StartElem,LSL #3   ; of first calculation

CALCLOOP                              ; start of calculations loop
  ADD    Zadr,Yadr,GrpStage,LSL #3    ; calculate address of element Z
  LDMIA  Yadr,{Yr,Yi}                 ; load element Y
  LDMIA  Zadr,{Zr,Zi}                 ; load element Z

  MUL    Tr,Zr,Cos                    ; t=Z*(cos-i*sin)
  MLA    Tr,Zi,Sin,Tr
  MUL    Tp,Zr,Sin
  MUL    Ti,Zi,Cos
  SUB    Ti,Ti,Tp

  SUB    Zr,Yr,Tr,ASR #14             ; Z = Y-T
  SUB    Zi,Yi,Ti,ASR #14
  MOV    Zr,Zr,ASR #1                 ; scale results
  MOV    Zi,Zi,ASR #1
  STMIA  Zadr,{Zr,Zi}                 ; store new values for next stage
  ADD    Yr,Yr,Tr,ASR #14             ; Y = Y+T
  ADD    Yi,Yi,Ti,ASR #14             ; ASR #14 compensates for
                                      ; fixed-point cos & sin
  MOV    Yr,Yr,ASR #1
  MOV    Yi,Yi,ASR #1
  STMIA  Yadr,{Yr,Yi}
  ADD    Yadr,Yadr,GrpStage,LSL #4    ; prepare for next calculation
                                      ; of this group
  SUBS   CalcsLoop,CalcsLoop,#1
  BNE    CALCLOOP                     ; end of CALCULATIONS loop

  LDMIB  PDataAr,{CalcsLoop,StartElem,Tab}
  ADD    StartElem, StartElem,#1      ; next group starts one element later
  ADD    Tab, Tab, TabStep            ; increase angle wk
  CMP    StartElem, GrpStage	      ; have we finished this stage?
  BLT    GRPLOOP                      ; do next group (loop of m's)

  MOV    GrpStage, GrpStage,LSL #1    ; next stage has twice as many
                                      ; groups per stage
  MOV    CalcsLoop, CalcsLoop,LSR #1  ; next stage has half as many
                                      ; calculations per group
  MOV	 TabStep, TabStep, LSR #1     ; and finer angle resolution
  LDR	 Tp,[PDataAr,#MN]	      ; find the N value
  CMP	 GrpStage, Tp		      ; compare n/2 with N
  BLT    STGLOOP                      ; go around stages loop again

 ]


;===========================================================================
;
; OPTIMISED algorithm - much faster but larger code size
;
; The first three stages are done by hand before entering a stages loop

 [ OPTIMISE=1

 ; The first two stages only consist of additions and subtractions,
 ; and the third has only additions, subtractions, and multiplies by the
 ; constant SQRT(2)/2.
 ;
 ; The optimisation attempts to do eight elements at a time of the first
 ; three stages, without any unneccessary loads or stores.
 ; To do this in one operation with no intermediate loads and stores
 ; requires a minimum of 19 registers (16 for the complex data, two for
 ; data pointers, and one for swapping data). We obviously don't have 19
 ; registers to play with, but we shall do it with the minimum number
 ; of intermediate stores (4) so that 4 data words are paged out once,
 ; and then paged back in again (as opposed to all of them being paged
 ; in and out twice if the 3 stages were done separately).
 ; Unfortunately, because of the tightness of the register situation and
 ; the complexity of the stages, keeping register names meaningful is a
 ; a bit tricky, so you'll have to bear with me for a while ...
 ;
 ; I will name the registers as follows:
 ; The first letter denotes the stage  "A"=stage 1, "B"=stage 2, "C"=stage 3.
 ; The next digit denotes the butterfly group from  "0" to "3".
 ; Each butterfly group has a pair of elements "Y" and "Z".
 ; Each pair of elements has a real "r" and imaginary "i" parts.
 ;
 ; So A3Zi is the imaginary part of the second element required in the
 ; third group of the first stage.
 ;
 ; I could have just named the registers 0 to 7 to represent the offset
 ; of each element, but unfortunately the register each is in keeps
 ; changing ...
 ;
 ; Define the register aliases for each of the optimised stages.

OS_indx    RN 10        ; Points to block of values being processed
OS_cnt     RN 11        ; count of blocks processed

A0Yr       RN 0         ; Stage 1 register allocation,
A0Yi       RN 1         ;    firstly for groups 0 and 1.
A0Zr       RN 2
A0Zi       RN 3

A1Yr       RN 4
A1Yi       RN 5
A1Zr       RN 6
A1Zi       RN 7
Tp1        RN 8

B0Yr       RN 0         ; Stage 2 register allocation,
B0Yi       RN 1         ;    firstly for groups 0 and 1.
B0Zr       RN 4
B0Zi       RN 5

B1Yr       RN 2
B1Yi       RN 3
B1Zr       RN 6
B1Zi       RN 7

A2Yr       RN 4         ; Stage 1 register allocation,
A2Yi       RN 5         ;    then for groups 2 and 3.
A2Zr       RN 6
A2Zi       RN 7
Tp2        RN 3

A3Yr       RN 8
A3Yi       RN 9
A3Zr       RN 12
A3Zi       RN 13

B2Yr       RN 4         ; Stage 2 register allocation,
B2Yi       RN 5         ;    then for groups 2 and 3.
B2Zr       RN 8
B2Zi       RN 9

B3Yr       RN 6
B3Yi       RN 7
B3Zr       RN 12
B3Zi       RN 13

C0Yr       RN 0        ; Stage 3 register allocation,
C0Yi       RN 1        ;     firstly for groups 0 and 1.
C1Yr       RN 2
C1Yi       RN 3
C0Zr       RN 4
C0Zi       RN 5
C1Zr       RN 6
C1Zi       RN 7

C2Yr       RN 0        ; Stage 3 register allocation,
C2Yi       RN 1        ;     then for groups 2 and 3.
C3Yr       RN 2
C3Yi       RN 3
C2Zr       RN 4
C2Zi       RN 5
C3Zr       RN 6
C3Zi       RN 7

CxT1       RN 8        ; Temporary registers.
CxT2       RN 9

      ; optimised third stage

CxTr         RN 12       ; temporary value, real part
CxTi         RN 13       ; temporary value, imaginary part

      MOV OS_indx,BR_outst            ; address of the output
                                      ; start at the beginning
      MOV OS_cnt,BR_N,LSR#3           ; third stage has N/8 subsequences,
                                      ; with 4 groups of N/8 calculations,
                                      ; theta=0, -pi/4, -pi/2 and -3pi/4
STG3_LOOP

  LDMIA OS_indx,{A0Yr, A0Yi, A0Zr, A0Zi, A1Yr, A1Yi, A1Zr, A1Zi }

; Group 0 of the first stage (results kept in registers).
  ADD   A0Yr, A0Yr, A0Zr              ; Yr' = Yr + Zr
  ADD   A0Yi, A0Yi, A0Zi              ; Yi' = Yi + Zi
  SUB   A0Zr, A0Yr, A0Zr, LSL #1      ; Zr' = Yr - Zr
  SUB   A0Zi, A0Yi, A0Zi, LSL #1      ; Zi' = Yi - Zi

; Group 1 of the first stage (results kept in registers).
  ADD   A1Yr, A1Yr, A1Zr              ; Yr' = Yr + Zr
  ADD   A1Yi, A1Yi, A1Zi              ; Yi' = Yi + Zi
  SUB   Tp1,  A1Yi, A1Zi, LSL #1      ; Zi' = Yi - Zi
  SUB   A1Zr, A1Yr, A1Zr, LSL #1      ; Zr' = Yr - Zr

; Now to do two pairs of the second stage calculations, using the results in
; registers left from the above first stage calculation.  The first half of
; the results needed for a third stage calculation are left in the registers,
; the rest are stored.

; Group 0 of the second stage
  ADD   B0Yr, B0Yr, B0Zr              ; Yr' = Yr + Zr
  ADD   B0Yi, B0Yi, B0Zi              ; Yi' = Yi + Zi
  SUB   B0Zr, B0Yr, B0Zr, LSL #1      ; Zr' = Yr - Zr
  SUB   B0Zi, B0Yi, B0Zi, LSL #1      ; Zi' = Yi - Zi

; Group 1 of the Second Stage
  ADD   B1Yr, B1Yr, Tp1               ; Yr' = Yr + Zi
  SUB   B1Yi, B1Yi, B1Zr              ; Yi' = Yi - Zr
  ADD   B1Zi, B1Yi, B1Zr, LSL #1      ; Zi' = Yi + Zr
  SUB   B1Zr, B1Yr, Tp1 , LSL #1      ; Zr' = Yr - Zi

; Store some of the results because we need some spare registers...
  ADD   OS_indx,OS_indx,#12
  STMIA OS_indx!,{B1Yi,B0Zr,B0Zi,B1Zr,B1Zi}


; Do another two pairs of the first stage calculations, keeping the results
; in registers, but not overwriting the four registers containing results
; from the above calculations.
  LDMIA OS_indx,{B2Yr, B2Yi, B3Yr, B3Yi, B2Zr, B2Zi, B3Zr, B3Zi }

; Group 2 of the first stage.
  ADD   A2Yr, A2Yr, A2Zr              ; Yr' = Yr + Zr
  ADD   A2Yi, A2Yi, A2Zi              ; Yi' = Yi + Zi
  SUB   A2Zr, A2Yr, A2Zr, LSL #1      ; Zr' = Yr - Zr
  SUB   A2Zi, A2Yi, A2Zi, LSL #1      ; Zi' = Yi - Zi

; Group 3 of the first stage
  ADD   A3Yr, A3Yr, A3Zr              ; Yr' = Yr + Zr
  ADD   A3Yi, A3Yi, A3Zi              ; Yi' = Yi + Zi
  SUB   Tp2,  A3Yi, A3Zi, LSL #1      ; Zi' = Yi - Zi
  SUB   A3Zr, A3Yr, A3Zr, LSL #1      ; Zr' = Yr - Zr


; Now to do another pair of second stage calculations, saving some registers,
; but keeping in registers the data required to complete one third stage.

; Group 2 of the second stage
  ADD   B2Yr, B2Yr, B2Zr              ; Yr' = Yr + Zr
  ADD   B2Yi, B2Yi, B2Zi              ; Yi' = Yi + Zi
  SUB   B2Zr, B2Yr, B2Zr, LSL #1      ; Zr' = Yr - Zr
  SUB   B2Zi, B2Yi, B2Zi, LSL #1      ; Zi' = Yi - Zi

; Group 3 of the second stage
  ADD   B3Yr, B3Yr, Tp2               ; Yr' = Yr + Zi
  SUB   B3Yi, B3Yi, B3Zr              ; Yi' = Yi - Zr
  ADD   B3Zi, B3Yi, B3Zr, LSL #1      ; Zi' = Yi + Zr
  SUB   B3Zr, B3Yr, Tp2,  LSL #1      ; Zr' = Yr - Zi
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -