📄 cr2fftnasm_outplace.asm
字号:
* scaling purpose.
* All the data are stored to output buffer after the division by 2, for scaling
* purpose.
*/
/*
 * Stages 1 and 2, computed together.
 *
 * The three instructions ahead of the loop preload R2, R3 and R4 through I0;
 * I0 is stepped by M0 with the bit-reversed (BREV) option around every load.
 * The fourth input of the group, R5, is fetched by the first instruction of
 * the loop body (or of ESC_12 when the loop is skipped). The preloads are kept
 * outside the loop so that the loads for the next group can be issued in
 * parallel with the arithmetic of the current one.
 *
 * One pass of the loop body handles two groups of four words (R2..R5 and
 * R0, R1, R6, R7): it stores eight results through I2 and fetches the inputs
 * of the next pass. ESC_12 repeats the same arithmetic for the final eight
 * words without fetching a further group. Every vector add/subtract carries
 * the ASR option, so the data are halved at each of the two stages for scaling.
 *
 * I0, M0, I2 and P0 are initialised before this point (not visible here).
 * NOTE(review): CO is the "cross" option of the vector add/subtract (half-word
 * swap of the result); confirm its exact effect on the two destinations
 * against the Blackfin programming reference.
 */
I0 += M0 (BREV) || R2 = [I0]; //Preload R2; step I0 to the next bit-reversed location.
I0 += M0 (BREV) || R3 = [I0]; //Preload R3.
I0 += M0 (BREV) || R4 = [I0]; //Preload R4.
CC = P0 <= 0 (IU); //Unsigned compare, so CC is set only when P0 is zero.
If CC Jump ESC_12; //No full loop pass required: go straight to the epilogue.
lsetup(Stage12_strt, Stage12_end) LC0 = P0; //Hardware loop with P0 as the loop count.
Stage12_strt:
R2 = R2 +|+ R3, R3 = R2 -|- R3 (ASR) || I0 += M0 (BREV) || R5 = [I0]; //Group A: sum/difference of R2,R3; fetch R5.
R4 = R4 +|+ R5, R5 = R4 -|- R5 (ASR, CO) || I0 += M0 (BREV) || R0 = [I0]; //Group A: sum/difference of R4,R5 (CO); fetch R0 for group B.
R2 = R2 +|+ R4, R4 = R2 -|- R4 (ASR) || I0 += M0 (BREV) || R1 = [I0]; //Group A: second-level sum/difference of R2,R4; fetch R1.
R5 = R3 +|- R5, R3 = R3 -|+ R5 (ASR) || I0 += M0 (BREV) || R6 = [I0]; //Group A: mixed add/subtract of R3,R5; fetch R6.
R0 = R0 +|+ R1, R1 = R0 -|- R1 (ASR) || R7 = [I0] || [I2++] = R2; //Group B: sum/difference of R0,R1; fetch R7; store group A word 0.
R6 = R6 +|+ R7, R7 = R6 -|- R7 (ASR, CO) || I0 += M0 (BREV) || [I2++] = R3; //Group B: sum/difference of R6,R7 (CO); store group A word 1.
R0 = R0 +|+ R6, R6 = R0 -|- R6 (ASR) || R2 = [I0] || [I2++] = R4; //Group B: second level; fetch R2 for the next pass; store group A word 2.
R7 = R1 +|- R7, R1 = R1 -|+ R7 (ASR) || I0 += M0 (BREV) || [I2++] = R5; //Group B: mixed add/subtract; store group A word 3.
R3 = [I0] || [I2++] = R0; //Fetch R3 for the next pass; store group B word 0.
I0 += M0 (BREV) || [I2++] = R1; //Store group B word 1.
R4 = [I0] || [I2++] = R6; //Fetch R4 for the next pass; store group B word 2.
Stage12_end:I0 += M0 (BREV) || [I2++] = R7; //Store group B word 3; last instruction of the loop body.
ESC_12:
//Epilogue: same arithmetic as the loop body for the final eight words, with
//NOPs in the slots where the loop body would step I0 or fetch the next group.
R2 = R2 +|+ R3, R3 = R2 -|- R3 (ASR) || I0 += M0 (BREV) || R5 = [I0]; //Group A: sum/difference of R2,R3; fetch R5.
R4 = R4 +|+ R5, R5 = R4 -|- R5 (ASR, CO) || I0 += M0 (BREV) || R0 = [I0]; //Group A: sum/difference of R4,R5 (CO); fetch R0.
R2 = R2 +|+ R4, R4 = R2 -|- R4 (ASR) || I0 += M0 (BREV) || R1 = [I0]; //Group A: second level; fetch R1.
R5 = R3 +|- R5, R3 = R3 -|+ R5 (ASR) || I0 += M0 (BREV) || R6 = [I0]; //Group A: mixed add/subtract; fetch R6.
R0 = R0 +|+ R1, R1 = R0 -|- R1 (ASR) || [I2++] = R2 || R7 = [I0]; //Group B: sum/difference of R0,R1; store group A word 0; fetch R7 (last input).
R6 = R6 +|+ R7, R7 = R6 -|- R7 (ASR, CO) || [I2++] = R3 || NOP; //Group B: sum/difference of R6,R7 (CO); store group A word 1.
R0 = R0 +|+ R6, R6 = R0 -|- R6 (ASR) || [I2++] = R4 || NOP; //Group B: second level; store group A word 2.
R7 = R1 +|- R7, R1 = R1 -|+ R7 (ASR) || [I2++] = R5 || NOP; //Group B: mixed add/subtract; store group A word 3.
[I2++] = R0; //Store group B word 0.
[I2++] = R1; //Store group B word 1.
[I2++] = R6; //Store group B word 2.
[I2++] = R7; //Store group B word 3.
/***************************************************************************************/
/*
 * Register setup for the middle stages. The middle stages are skipped through
 * the Esc_mid label when the input size (P4) is equal to 8; P3 and R7 are
 * already set at that point and are consumed by the last stage.
 *
 * Otherwise Find_m counts in R0 how many right shifts bring the size down to 8,
 * which is the number of middle stages (m - 3).
 * NOTE(review): the Find_m loop leaves only when the shifted value is exactly
 * 8. A size below 8, and many sizes that are not a power of two, would never
 * match; presumably the size is validated before this point - confirm.
 */
R1 = P1; //R1 = wst.
R1 = R1 << 2; //R1 = wst * 4
P3 = 4; //P3 holds the number of lines in each butterfly at stage 3.
R7 = P5; //Copy P5 so that it can be used as a multiply operand.
R7.L = R1.L * R7.L (IS); //R7.L = wst * 4 * twiddle offset (integer multiply of the low halves; only R7.L is written here).
R2 = P4; //R2 = input size.
R3 = 8; //Size at which no middle stage is needed.
CC = R2 == R3; //If the input array size is equal to 8, go to the last stage, because
If CC Jump Esc_mid; //the middle stages do not occur.
R0 = 0; //Counter for number of stages.
Find_m: //The number of middle stages is computed here.
R2 >>= 1; //Halve the remaining size.
R0 += 1; //One more stage.
CC = R2 == R3; //Reached 8 yet?
If !CC Jump Find_m; //On exit R0 holds m - 3; R0 stays reserved as the middle-stage counter.
/************************* Implementation of Middle Part *******************************/
/*
 * Middle stages: general butterfly computation, repeated R0 (= m - 3) times.
 *
 * Loopfor_m is the per-stage loop (software loop on R0). Inside it, the outer
 * hardware loop (LC0, count P5) walks the butterflies of the stage and the
 * inner hardware loop (LC1, count P3 - 1) walks the lines of one butterfly;
 * the last line of each butterfly is finished by the three instructions that
 * follow Loop2_end. P5 is halved on entry to every stage and P3 is doubled at
 * the end of every stage.
 *
 * For each line, one word is read through I2 and its counterpart through I3
 * (M2 bytes further on). The counterpart is multiplied by the twiddle factor
 * read through I1 (stride M1), then sum and difference are stored through I0.
 * The vector add/subtract uses the ASR option, so results are halved for
 * scaling.
 *
 * Data are read from the buffer at B2 and written to the buffer at B0; the
 * two base registers are swapped after every stage.
 */
//B2 Input, B0 output.
Loopfor_m:
I0 = B0; //Write pointer.
I2 = B2; //Read pointer for the first line of each pair.
I3 = B2; //Read pointer for the counterpart line (advanced by M2 below).
P0 = P3 << 2; //P3 lines * 4 bytes per word.
M2 = P0; //M2 holds the offset of counterpart line.
P0 += -4;
M0 = P0; //M0 = M2 - 4, used to step I0 back to the next line after a counterpart store.
P5 = P5 >> 1; //Halve the outer loop count for this stage.
R7 = R7 >>> 1 || I3 += M2 || NOP; //Halve R7 (arithmetic shift); point I3 at the counterpart line.
M1 = R7; //M1 = stride used when stepping I1 through the twiddle table.
P3 += -1; //Inner loop runs P3 - 1 times; the last line is handled after the loop.
lsetup(Loop1_strt, Loop1_end) LC0 = P5; //Loop is set for number of the butterfly
Loop1_strt:
I1 = B3; //Address of twiddle factor.
R2 = [I2++]; //First line of the butterfly.
R3 = [I1++M1] || R4 = [I3++]; //First twiddle fetch (R3 is overwritten before any multiply uses it) and first counterpart word.
lsetup(Loop2_strt, Loop2_end) LC1 = P3; //Loop is set for the number of lines
Loop2_strt:
R5 = R2 +|+ R4, R6 = R2 -|- R4 (ASR) || R3 = [I1++M1] || R4 = [I3++]; //Sum/difference of the current pair; fetch next twiddle and next counterpart.
A1 = R3.L * R4.H, A0 = R3.L * R4.L || [I0++M2] = R5 || R2 = [I2++]; //Start the twiddle multiply; store the sum; fetch the next line.
Loop2_end:R4.H = (A1 += R3.H * R4.L), R4.L = ( A0 -= R3.H * R4.H) || I0 -= M0 || [I0] = R6; //Finish the twiddle multiply into R4; store the difference at the counterpart slot and step I0 to the next line.
R5 = R2 +|+ R4, R6 = R2 -|- R4 (ASR) || I2 += M2 || NOP; //Last line of the butterfly; move I2 past the counterpart half.
I3 += M2 || [I0++M2] = R5; //Move I3 to the counterpart half of the next butterfly; store the sum.
Loop1_end: [I0++] = R6; //Store the difference; I0 now points at the next butterfly.
P3 += 1; //Undo the decrement made for the inner loop count,
P3 = P3 << 1; //then double the number of lines for the next stage.
R0 += -1; //One middle stage done.
B1 = B0; //Swap B0 and B2 (B1 is the temporary), so the buffer just
B0 = B2; //written becomes the input of the next stage.
B2 = B1;
CC = R0 == 0;
If !CC Jump Loopfor_m; //Loop for m.
/***************************************************************************************/
/************************* Implementation of Last Part *********************************/
/*
 * Last stage. The label Esc_mid is also the entry point when the input size is
 * 8, in which case the middle stages are skipped entirely.
 *
 * In the last stage there is only one butterfly, so there is no outer loop:
 * the hardware loop (LC1, count P3 - 1) walks its lines and the final line is
 * finished by the three instructions after Last_end. The per-line work is the
 * same as in the middle stages: read through I2 and I3, twiddle multiply with
 * the factor read through I1 (stride M1), store sum and difference through I0.
 * NOTE(review): the original header said the loop is set for n/4 and that 4
 * data are processed per pass; the visible loop reads one word through each of
 * I2 and I3 per pass - confirm which description is intended.
 *
 * Data are read from the buffer at B2 and written to the buffer at B0. If B0
 * differs from the address held in M3, P4 words are then copied from B0 to
 * that address (presumably the caller's output buffer; M3 is set before the
 * visible code - confirm). Finally the saved registers are popped and the
 * routine returns.
 */
//B2 Input , B0 Output.
Esc_mid:I0 = B0; //Write pointer.
I2 = B2; //Read pointer for the first line of each pair.
I3 = B2; //Read pointer for the counterpart line (advanced by M2 below).
P0 = P3 << 2; //P3 lines * 4 bytes per word.
M2 = P0; //M2 holds the offset of counterpart line.
P0 += -4;
M0 = P0; //M0 = M2 - 4, used to step I0 back to the next line after a counterpart store.
R7 = R7 >>> 1 || I3 += M2 || NOP; //Halve R7 (arithmetic shift); point I3 at the counterpart line.
M1 = R7; //M1 = stride used when stepping I1 through the twiddle table.
P3 += -1; //Loop runs P3 - 1 times; the last line is handled after the loop.
I1 = B3; //Address of twiddle factor.
R2 = [I2++]; //First line.
R3 = [I1++M1] || R4 = [I3++]; //First twiddle fetch (R3 is overwritten before any multiply uses it) and first counterpart word.
lsetup(Last_strt, Last_end) LC1 = P3; //Loop is set for the number of lines
Last_strt:
R5 = R2 +|+ R4, R6 = R2 -|- R4 (ASR) || R3 = [I1++M1] || R4 = [I3++]; //Sum/difference of the current pair; fetch next twiddle and next counterpart.
A1 = R3.L * R4.H, A0 = R3.L * R4.L || [I0++M2] = R5 || R2 = [I2++]; //Start the twiddle multiply; store the sum; fetch the next line.
Last_end:R4.H = (A1 += R3.H * R4.L), R4.L = ( A0 -= R3.H * R4.H) || I0 -= M0 || [I0] = R6; //Finish the twiddle multiply into R4; store the difference at the counterpart slot and step I0 to the next line.
R5 = R2 +|+ R4, R6 = R2 -|- R4 (ASR) || NOP || NOP; //Last line.
[I0++M2] = R5; //Store the last sum.
[I0] = R6; //Store the last difference.
R0 = B0; //Address of the buffer that received the final stage.
R1 = M3; //Address held in M3.
CC = R0 == R1; //Same buffer?
If CC Jump Terminate; //Yes: nothing to copy.
I0 = B0; //Copy source.
I1 = M3; //Copy destination.
R0 = [I0++]; //Preload the first word.
lsetup(Copy_strt, Copy_strt) LC0 = P4; //Single-instruction hardware loop, P4 as the loop count.
Copy_strt: [I1++] = R0 || R0 = [I0++]; //Store the current word and fetch the next. NOTE(review): together with the preload this issues one more load than the P4 words stored - confirm the source buffer tolerates that.
/***************************************************************************************/
Terminate:
(R7:4, P5:3) = [SP++]; //Pop the registers before returning (the matching push is before the visible code).
RTS; //Return.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -