📄 ffts.s
字号:
ADD OS_indx, OS_indx,#16
STMIA OS_indx, {B2Zr, B2Zi, B3Zr, B3Zi}
; Now to do four groups of the third stage, using the data left in the
; registers.
LDR C1Yi,[OS_indx,#-36] ; Load one register which we had to use
; for temporary storage.
; Group 0 of the third stage.
ADD C0Yr, C0Yr, C0Zr ; Yr = Yr + Zr
SUB C0Zr, C0Yr, C0Zr, LSL #1 ; Zr = Yr - Zr
MOV C0Yr, C0Yr, ASR #3 ; overflow prevention scaling
MOV C0Zr, C0Zr, ASR #3 ; overflow prevention scaling
ADD C0Yi, C0Yi, C0Zi ; Yi = Yi + Zi
SUB C0Zi, C0Yi, C0Zi, LSL #1 ; Zi = Yi - Zi
MOV C0Yi, C0Yi, ASR #3 ; overflow prevention scaling
MOV C0Zi, C0Zi, ASR #3 ; overflow prevention scaling
; Group 1 of the third stage.
ADD CxT1, C1Zr, C1Zi ; Tr = (Zr + Zi)
ADD CxTr, CxT1, CxT1, LSL #1 ; *= SQRT(2)/2 = 11585/(1<<14)
RSB CxTr, CxTr, CxTr, LSL #4
ADD CxTr, CxT1, CxTr, LSL #8
ADD CxTr, CxTr, CxT1, LSL #6
SUB CxT1, C1Zi, C1Zr ; Ti = (Zi - Zr)
ADD CxTi, CxT1, CxT1, LSL #1 ; *= SQRT(2)/2 = 11585/(1<<14)
RSB CxTi, CxTi, CxTi, LSL #4
ADD CxTi, CxT1, CxTi, LSL #8
ADD CxTi, CxTi, CxT1, LSL #6
SUB C1Zr, C1Yr, CxTr, ASR #14 ; Zr = Yr - SQRT(2)/2*(Zr+Zi)
SUB C1Zi, C1Yi, CxTi, ASR #14 ; Zi = Yi - SQRT(2)/2*(Zi-Zr)
MOV C1Zr, C1Zr, ASR #3 ; overflow prevention scaling.
MOV C1Zi, C1Zi, ASR #3 ; overflow prevention scaling.
ADD C1Yr, C1Yr, CxTr, ASR #14 ; Yr = Yr + SQRT(2)/2*(Zr+Zi)
ADD C1Yi, C1Yi, CxTi, ASR #14 ; Yi = Yi + SQRT(2)/2*(Zi-Zr)
MOV C1Yr, C1Yr, ASR #3 ; overflow prevention scaling.
MOV C1Yi, C1Yi, ASR #3 ; overflow prevention scaling.
SUB OS_indx, OS_indx, #16 ; Store the results for groups 0 and 1.
STMIA OS_indx, {C0Zr, C0Zi, C1Zr, C1Zi}
SUB OS_indx, OS_indx, #32
STMIA OS_indx!, {C0Yr, C0Yi, C1Yr, C1Yi}
; Reload the data for groups 2 and 3.
LDMIA OS_indx, {C2Yr, C2Yi, C3Yr, C3Yi}
ADD OS_indx, OS_indx, #32
LDMIA OS_indx, {C2Zr, C2Zi, C3Zr, C3Zi}
; Group 2 of the third stage.
ADD CxT1, C2Yr, C2Zi ; Yr = Yr + Zi
SUB CxT2, C2Yr, C2Zi ; Zi = Yi + Zr
MOV C2Yr, CxT1, ASR #3 ; Yi = Yi - Zr
ADD CxT1, C2Yi, C2Zr ; Zr = Yr - Zi
MOV C2Zi, CxT1, ASR #3
SUB CxT1, C2Yi, C2Zr
MOV C2Yi, CxT1, ASR #3
MOV C2Zr, CxT2, ASR #3
SUB CxT1, C3Zi, C3Zr ; Tr = (Zi-Zr)
ADD CxTr, CxT1, CxT1, LSL #1 ; *= SQRT(2)/2 = 11585/(1<<14)
RSB CxTr, CxTr, CxTr, LSL #4
ADD CxTr, CxT1, CxTr, LSL #8
ADD CxTr, CxTr, CxT1, LSL #6
ADD CxT1, C3Zr, C3Zi ; Ti = (Zr+Zi)
ADD CxTi, CxT1, CxT1, LSL #1 ; *= SQRT(2)/2 = 11585/(1<<14)
RSB CxTi, CxTi, CxTi, LSL #4
ADD CxTi, CxT1, CxTi, LSL #8
ADD CxTi, CxTi, CxT1, LSL #6
; Group 3 of the third stage.
SUB C3Zr, C3Yr, CxTr, ASR #14
ADD C3Zi, C3Yi, CxTi, ASR #14 ; Zr = Yr - SQRT(2)/2 * (Zi-Zr)
MOV C3Zr, C3Zr, ASR #3 ; Zi = Yi + SQRT(2)/2 * (Zr+Zi)
MOV C3Zi, C3Zi, ASR #3
ADD C3Yr, C3Yr, CxTr, ASR #14 ; Yr = Yr + SQRT(2)/2 * (Zi-Zr)
SUB C3Yi, C3Yi, CxTi, ASR #14 ; Yi = Yi - SQRT(2)/2 * (Zr+Zi)
MOV C3Yr, C3Yr, ASR #3
MOV C3Yi, C3Yi, ASR #3
STMIA OS_indx,{C2Zr,C2Zi,C3Zr,C3Zi} ; Store the results for groups 2 and 3.
SUB OS_indx, OS_indx, #32
STMIA OS_indx!,{C2Yr,C2Yi,C3Yr,C3Yi}
ADD OS_indx, OS_indx, #32
SUBS OS_cnt, OS_cnt, #1 ; Loop until all elements done.
BNE STG3_LOOP
;---------------------------------------------------------------------------
; Now the main OPTIMISED FFT loops start...
; The way it works is that we start at one end, lookup in the trig table
; the value for Sine and Cosine for that element, and then we do a cunning
; inner loop where we do ALL of the elements which require those values
; of Sine and Cosine, and additionally all elements with the cosine just
; negated and those with Sin and Cos swapped over (so we don't have to
; reload them). This also has the advantage that Sin and Cos are always
; positive in the following loops, making the multiplies faster.
; Swapped out registers
StartElem RN 3 ; First calculation in this group (k)
Tab RN 4 ; Index into trig. table, (wk/(2pi/N))
; Inner loop registers
Yadr RN 0 ; Yadr = address of first data value
Zadr RN 1 ; Zadr = address of second data value
CalcsLoop RN 2 ; counter for inner (butterfly) loop
Yr RN 3 ; Yr = first data value, real part
Yi RN 4 ; Yi = first data value, imaginary part
Zr RN 5 ; Zr = second data value, real part
Zi RN 6 ; Zi = second data value, imaginary part
Tr RN 7 ; Tr = temp complex value, real part
Ti RN 8 ; Ti = temp complex value, imaginary part
GrpStage RN 9 ; Number of groups in the current stage
TabStep RN 10 ; angle step
; R11 is Tp
Cos RN 12 ; Current cos(wk) - always +ve
Sin RN 13 ; Current sin(wk) - always +ve
; R14 is PDataAr
MOV GrpStage, #8 ; fourth stage has eight groups (n=16)
LDR Tp,[PDataAr,#MN]
MOV CalcsLoop, Tp, LSR#4 ; and N/16 calculations per group
MOV TabStep,#(TABLE_N/16) ; pi/8 initial angle increment
STGLOOP ; start of STAGES loop
STR CalcsLoop,[PDataAr,#MCalcsLoop] ; swap out CalcsLoop
; first do wk=0, cos=1, sin=0
LDR Yadr,[PDataAr,#MOutAddr] ; address of first element
EG0CALCLP ; start of calculations loop
ADD Zadr,Yadr,GrpStage,LSL #3 ; calculate address of element Z
LDMIA Yadr,{Yr,Yi} ; load element Y
LDMIA Zadr,{Zr,Zi} ; load element Z
ADD Tr,Yr,Zr
ADD Ti,Yi,Zi
SUB Zr,Yr,Zr ; Z.r = Y.r - Z.r
SUB Zi,Yi,Zi ; Z.i = Y.i - Z.i
MOV Yr,Tr,ASR #1 ; Y.r = Y.r + Z.r
MOV Yi,Ti,ASR #1 ; Y.i = Y.i + Z.i
MOV Zr,Zr,ASR #1
MOV Zi,Zi,ASR #1
STMIA Zadr,{Zr,Zi} ; store new values for next stage
STMIA Yadr,{Yr,Yi}
ADD Yadr,Yadr,GrpStage,LSL #4 ; prepare for next calculation
SUBS CalcsLoop,CalcsLoop,#1
BNE EG0CALCLP ; end of calculations loop
; now do wk = pi/4, cos = sin = sqr(2)/2 = 1/sqr(2)
LDMIA PDataAr,{Yadr,CalcsLoop} ; refresh Calcsloop + Yadr=out buffer
ADD Yadr,Yadr,GrpStage,LSL #1 ; first calculation (n/8 elements on)
EG1CALCLP ; start of calculations loop
ADD Zadr,Yadr,GrpStage,LSL #3 ; calculate address of element Z
LDMIA Yadr,{Yr,Yi} ; load element Y
LDMIA Zadr,{Zr,Zi} ; load element Z
ADD Tp,Zr,Zi
ADD Tr,Tp,Tp,LSL #1 ; MUL Tr,Tp,#11585
RSB Tr,Tr,Tr,LSL #4
ADD Tr,Tp,Tr,LSL #8
ADD Tr,Tr,Tp,LSL #6
SUB Tp,Zi,Zr
ADD Ti,Tp,Tp,LSL #1 ; MUL Ti,Tp,#11585
RSB Ti,Ti,Ti,LSL #4
ADD Ti,Tp,Ti,LSL #8
ADD Ti,Ti,Tp,LSL #6
SUB Zr,Yr,Tr,ASR #14 ; Z = Y-T
SUB Zi,Yi,Ti,ASR #14
ADD Yr,Yr,Tr,ASR #14 ; Y = Y+T
ADD Yi,Yi,Ti,ASR #14
MOV Yr,Yr,ASR #1
MOV Yi,Yi,ASR #1
STMIA Yadr,{Yr,Yi}
MOV Zr,Zr,ASR #1
MOV Zi,Zi,ASR #1
STMIA Zadr,{Zr,Zi} ; store new values for next stage
ADD Yadr,Yadr,GrpStage,LSL #4 ; prepare for next calculation
; of this group
SUBS CalcsLoop,CalcsLoop,#1
BNE EG1CALCLP ; end of calculations loop
; now do wk = -pi/2 (n/4), cos=0, sin=1
LDMIA PDataAr,{Yadr,CalcsLoop} ; refresh Calcsloop + Yadr=out buffer
ADD Yadr,Yadr,GrpStage,LSL #2 ; first calculation
EG2CALCLP ; start of calculations loop
ADD Zadr,Yadr,GrpStage,LSL #3 ; calculate address of element Z
LDMIA Yadr,{Yr,Yi} ; load element Y
LDMIA Zadr,{Zr,Zi} ; load element Z
ADD Tr,Yr,Zi
SUB Tp,Yr,Zi
MOV Yr,Tr,ASR #1 ; Y.r = Y.r + Z.i
ADD Ti,Yi,Zr
MOV Zi,Ti,ASR #1 ; Z.i = Y.i + Z.r
SUB Ti,Yi,Zr
MOV Yi,Ti,ASR #1 ; Y.i = Y.i - Z.r
STMIA Yadr,{Yr,Yi}
MOV Zr,Tp,ASR #1 ; Z.r = Y.r - Z.i
STMIA Zadr,{Zr,Zi} ; store new values for next stage
ADD Yadr,Yadr,GrpStage,LSL #4 ; prepare for next calculation
; of this group
SUBS CalcsLoop,CalcsLoop,#1
BNE EG2CALCLP ; end of calculation loop
; now do theta = -3pi/4 (3a/8), cos=-sqr(2)/2, sin=sqr(2)/2
LDMIA PDataAr,{Yadr,CalcsLoop} ; refresh Calcsloop + Yadr=out buffer
ADD Yadr,Yadr,GrpStage,LSL #2
ADD Yadr,Yadr,GrpStage,LSL #1 ; first calculation
EG3CALCLP ; start of calculations loop
ADD Zadr,Yadr,GrpStage,LSL #3 ; calculate address of element Z
LDMIA Yadr,{Yr,Yi} ; load element Y
LDMIA Zadr,{Zr,Zi} ; load element Z
SUB Tp,Zi,Zr
ADD Tr,Tp,Tp,LSL #1 ; MUL Tr,Tp,#11585
RSB Tr,Tr,Tr,LSL #4
ADD Tr,Tp,Tr,LSL #8
ADD Tr,Tr,Tp,LSL #6
ADD Tp,Zi,Zr
ADD Ti,Tp,Tp,LSL #1 ; MUL Ti,Tp,#11585
RSB Ti,Ti,Ti,LSL #4
ADD Ti,Tp,Ti,LSL #8
ADD Ti,Ti,Tp,LSL #6
SUB Zr,Yr,Tr,ASR #14 ; Z = Y-T
ADD Zi,Yi,Ti,ASR #14
ADD Yr,Yr,Tr,ASR #14 ; Y = Y+T
SUB Yi,Yi,Ti,ASR #14
MOV Yr,Yr,ASR #1
MOV Yi,Yi,ASR #1
STMIA Yadr,{Yr,Yi}
MOV Zr,Zr,ASR #1
MOV Zi,Zi,ASR #1
STMIA Zadr,{Zr,Zi} ; store new values for next stage
ADD Yadr,Yadr,GrpStage,LSL #4 ; prepare for next calculation
; of this group
SUBS CalcsLoop,CalcsLoop,#1
BNE EG3CALCLP ; end of calculations loop
; Finished the easy groups - done StartElem=k=0.
; Now let k vary between 1 and n/8-1, wk strictly between 0 and pi/4.
; Using Cos and Sin for this value of wk we can also do three other points
; (where wk takes the values pi/2-wk, pi/2+wk and pi-wk) at the same time.
MOV StartElem, #1 ; done k=0 above
LDR CalcsLoop,[PDataAr,#MCalcsLoop] ; retrieve CalcsLoop
MOV Tab,TabStep ; first angle
; rest of groups loop - k=1,2,3,...,n/8-1
GRPLOOP ; start of GROUPS loop
ADD Tp,PDataAr,#MStartElem
STMIA Tp,{StartElem,Tab} ; swap out StartElem and Tab
; look up sin and cosine in the trig table
ADRL Tp, TRIG_TABLE ; Look up Cos and Sin values in table.
LDR Tp, [Tp, Tab, LSL#2] ; get the sin/cos word
MOV Sin, Tp, LSR#16 ; extract sin
BIC Cos, Tp, Sin, LSL#16 ; extract cos
; first do for this wk
LDR Yadr,[PDataAr,#MOutAddr] ; Yadr=out buffer
ADD Yadr,Yadr,StartElem,LSL #3 ; address of first calculation
CALCLOOP1 ; start of calculations loop
ADD Zadr,Yadr,GrpStage,LSL #3 ; calculate address of element Z
LDMIA Yadr,{Yr,Yi} ; load element Y
LDMIA Zadr,{Zr,Zi} ; load element Z
MUL Tr,Zr,Cos ; t=complex_mult(k,cis(theta))
MLA Tr,Zi,Sin,Tr
MUL Tp,Zr,Sin
MUL Ti,Zi,Cos
SUB Ti,Ti,Tp
SUB Zr,Yr,Tr,ASR #14 ; Z = Y-T
SUB Zi,Yi,Ti,ASR #14
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -