📄 r8x8dct.s
字号:
/**************************************
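Prototype : void _r8x8dct(fract16 *in, fract16 *coeff, fract16 *temp, fract16 *coeff1);
*in -> Pointer to Input vector; the final output is also written here.
*coeff -> Pointer to coefficients used by the first (row) pass.
*temp -> Pointer to temporary data (output of the first pass).
*coeff1 -> Pointer to coefficients used by the second (column) pass.
Registers Used : A0, A1, R0-R7, I0-I3, B0-B3, M0, M1, L0-L3, P0-P5, LC1.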
Performance :Cycle Count : 508 Cycles.
*******************************************************************************/
/*.section L1_code;
.global __r8x8dct;
.align 8;
__r8x8dct:*/
#include "config_bfin.h"
//#include "tr8x8dct.h"
//#include "mds_def.h"
/*
r8x8dct: two-pass transform of an 8x8 block of 16-bit values.

Pass 1 (ROW_START..ROW_END) reads the buffer passed as the first argument,
uses the coefficient table passed as the second argument and writes to the
buffer passed as the third argument. Pass 2 (COLUMN_START..COLUMN_END) reads
that temporary buffer, uses the coefficient table passed as the fourth
argument and writes back into the first buffer, which therefore holds the
final result.

Notes for maintainers:
 - Both loops are software pipelined: the tail of each iteration loads the
   data for the next one, so the last iteration reads one row past the end
   of its input buffer. Those values are never used.
 - Each iteration exchanges R2.L and R3.L through two 16-bit slots of the
   current input row (W[I0] / W[I1]), so the buffer being read is modified.
 - L3 is non-zero while the function runs (I3 is circular) and is reset to
   zero before returning.
*/
DEFUN(r8x8dct,mL1,
(fract16 *s, fract16 *coeff, fract16 *temp, fract16 *coeff1)):
/******************************* Function Prologue ***************************/
B0 = R0; //Pointer to Input matrix.
B3 = R1; //Pointer to Coefficients (first pass).
B2 = R2; //Pointer to Temporary matrix.
R0=[sp+12]; //Fourth argument (coeff1); read before the push below moves SP.
[--SP] = (R7:4, P5:3); //Pushing the Registers on stack.
B1=R0; //B1 holds the second-pass coefficient pointer until the swap.
L0 = 0; //I0, I1 and I2 are used as linear (non-circular) pointers.
L1 = 0;
L2 = 0;
L3 = 24; //Coefficient array is circular over 24 bytes: each loop
//iteration performs six 32-bit reads through I3.
/*
I0, I1, and I2 registers are used to read the input data. I3 register is used
to read the coefficients. P0 and P1 registers are used for writing the output
data.
*/
M0 = 12 (X); // All these initialization are used in the
// modification of address offsets (values are in bytes).
M1 = 16 (X); // One row of eight 16-bit elements.
P2 = 16; // One row; P2 >> 1 is also used as the loop count (8).
P3 = 32 (X); // Two rows: output store stride.
P4 = -110 (X); // Rewinds P0 to the top of the next output column.
P5 = -62 (X); // Rewinds P1 to the top of the next output column.
P0 = 2; // NOTE(review): overwritten by "P0 = B2" below before any use.
/*
In the first pass B0 points to the "in" buffer and B2 points to the "temp"
buffer: the input is read from "in" and the output is written to "temp".
Before the second pass the two pointers are exchanged, so the input is read
from "temp" and the output is written to "in". "in" holds the final output.
*/
I0 = B0; //I0 points to Input Element (0, 0)
I1 = B0;
I1 += M0 || R0 = [I0++]; //Element 1 and 0 is read in R0.
//I1 points to Input Element (0, 6)
I2 = I1;
I2 -= 4 || R3.H = W[I1++]; //Element 6 is read in R3.H
//I2 points to Input Element (0, 4)
I3 = B3; //I3 points to Coefficients
P0 = B2; //P0 points to temporary array Element (0, 0)
P1 = B2; //P1 points to temporary array
R7 = [P1++P2] ||R2 = [I2++];
//P1 points to temporary array Element (1, 0)
//R7 is a dummy read. Element 4 and 5 are read in R2
R3.L = W[I1--]; //Element 7 is read in R3.L
R1.H = W[I0++]; //Element 2 is read in R1.H
//******************************* Implementation of Row ********************
R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP; //R0(1+6,0+7);R3(1-6,0-7)
R1 = R1 +|+ R2, R2 = R1 -|- R2(CO)|| NOP || R7 = [I3++];//R1(2+5,3+4);R2(3-4,2-5)
R0 = R0 +|+R1, R1 = R0 -|- R1;
LSETUP (ROW_START, ROW_END) LC1 = P2 >> 1;
//The loop is set for 8 rows.
ROW_START:
// R6 (second coefficient word) is the term added in every RND12 step below.
// R3.L and R2.L are exchanged through W[I1] / W[I0] over the next four lines.
R5 = (A1 = R0.L * R7.L), R4 = (A0 = R0.H * R7.L)(IS)|| W[I1]=R3.L||R6= [I3++];
R0.L = R5+R4(RND12)|| R7 = [I3++] || W[I0]=R2.L;
R5.L = R5-R4(RND12)||W[P0++P3] = R0.L || R3.L=W[I0];
A1 = R1.H * R7.L, A0 = R1.L * R7.L(IS) ||I0+=4 || R2.L=W[I1];
R1 = (A1 += R1.L * R7.H), R0 = (A0 -= R1.H * R7.H)(IS) || I0 += 4
|| R7 = [I3++];
R1.L=R1+R6(RND12)|| I1+=M1; //I1 moves to element 6 of the next row.
R0.L=R0+R6(RND12)||W[P0++P3] = R1.L;
// R7=(C1,C7)
A1 =R7.H*R2.L, A0 =R7.L*R2.L(IS) || W[P0++P3] = R5.L;
A1+=R7.L*R2.H, A0-=R7.H*R2.H (IS)|| W[P0++P2] = R0.L || R7=[I3++];
A1+=R7.H*R3.H, A0-=R7.L*R3.H (IS)|| R0 = [I0++]; // R7=(C3,C5)
R5 =(A1+=R7.L*R3.L), R4 =(A0+=R7.H*R3.L)(IS) || R1.H = W[I0++];
R4.L=R4+R6(RND12)|| R1.L = W[I0++] ;
R5.L=R5+R6(RND12)|| W[P0++P4] = R4.L; //P0 rewinds to the next output column.
A1 =R7.L*R2.L, A0 =R7.H*R2.L(IS) || W[P1++P3] = R5.L;
A1+=R7.H*R2.H, A0-=R7.L*R2.H(IS) || R7=[I3++] ; // R7=(C1,C7)
A1-=R7.H*R3.H, A0-=R7.L*R3.H(IS) || R3.H = W[I1++];
R5 =(A1+=R7.L*R3.L), R4 =(A0-=R7.H*R3.L)(IS) || R3.L = W[I1--];
R4.L=R4+R6(RND12)||I2+=M0 ;
R5.L=R5+R6(RND12)|| W[P1++P3] = R4.L;
// Butterflies for the next row; the I3 read wraps back to the first
// coefficient word.
R0 = R0 +|+ R3, R3 = R0 -|- R3 || W[P1++P5] = R5.L || R2 = [I2++];
R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO)|| R7= [I3++];
ROW_END: R0 = R0 +|+ R1, R1 = R0 -|- R1;
B3 = B1; //Second pass uses the coeff1 table.
B1 = B0; //Swapping of Input and output address pointers
B0 = B2; //B0 points to input buffer (the first-pass output).
B2 = B1; //B2 points to output buffer (the original input buffer).
I0 = B0; //I0 points to Input Element (0, 0)
I1 = B0;
I1 += M0 || R0 = [I0++]; //Element 1 and 0 is read in R0.
//I1 points to Input Element (0, 6)
I2 = I1;
I2 -= 4 || R3.H = W[I1++]; //Element 6 is read in R3.H
//I2 points to Input Element (0, 4)
I3 = B3; //I3 points to Coefficients
P0 = B2; //P0 points to output buffer Element (0, 0)
P1 = B2; //P1 points to output buffer
R7 = [P1++P2] ||R2 = [I2++];
//P1 points to output buffer Element (1, 0)
//R7 is a dummy read. Element 4 and 5 are read in R2
R3.L = W[I1--]; //Element 7 is read in R3.L
R1.H = W[I0++]; //Element 2 is read in R1.H
//******************************* Implementation of Column ********************
R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP; //R0(1+6,0+7);R3(1-6,0-7)
R1 = R1 +|+ R2, R2 = R1 -|- R2(CO)|| NOP || R7 = [I3++];//R1(2+5,3+4);R2(3-4,2-5)
LSETUP (COLUMN_START, COLUMN_END) LC1 = P2 >> 1;
//The loop is set for 8 columns.
COLUMN_START:
// Same structure as the row loop, but results are formed as 32-bit sums with
// R6 added using saturation (S), and the high halves (Rx.H) are stored.
A1 = R0.L * R7.L, A0 = R0.H * R7.L(IS)|| W[I1]=R3.L||R6= [I3++];
A1 += R1.L * R7.L, A0 += R1.H * R7.L(IS)|| R7 = [I3++] || W[I0]=R2.L;
R5=A1+A0,R4=A1-A0(S);
R5=R5+R6(S)||I2+=M0;
R4=R4+R6(S)||W[P0++P3] = R5.H|| R3.L=W[I0];
A1 = R0.L * R7.H, A0 = R0.L * R7.L(IS)||I0+=4 || R2.L=W[I1];
A1 -= R1.L * R7.H, A0 -= R1.L * R7.L(IS);
A1 += R0.H * R7.L, A0 -= R0.H * R7.H(IS);
R1=(A1 -= R1.H * R7.L), R0=(A0 += R1.H * R7.H)(IS)|| I0 += 4
|| R7 = [I3++];
R1=R1+R6(S)||I1+=M1;
R0=R0+R6(S)||W[P0++P3] = R1.H;
// R7=(C1,C7)
A1 =R7.H*R2.L, A0 =R7.L*R2.L(IS) || W[P0++P3] = R4.H;
A1+=R7.L*R2.H, A0-=R7.H*R2.H (IS)|| W[P0++P2] = R0.H || R7=[I3++];
A1+=R7.H*R3.H, A0-=R7.L*R3.H (IS)|| R0 = [I0++]; // R7=(C3,C5)
R5 =(A1+=R7.L*R3.L), R4 =(A0+=R7.H*R3.L)(IS) || R1.H = W[I0++];
R4=R4+R6(S)||R1.L = W[I0++] ;
R5=R5+R6(S)||W[P0++P4] = R4.H;
A1 =R7.L*R2.L, A0 =R7.H*R2.L(IS) || W[P1++P3] = R5.H;
A1+=R7.H*R2.H, A0-=R7.L*R2.H (IS)|| R7=[I3++] ; // R7=(C1,C7)
A1-=R7.H*R3.H, A0-=R7.L*R3.H (IS)|| R3.H = W[I1++];
R5 =(A1+=R7.L*R3.L), R4 =(A0-=R7.H*R3.L)(IS) || R3.L = W[I1--];
R4=R4+R6(S);
R5=R5+R6(S)||W[P1++P3] = R4.H;
R0 = R0 +|+ R3, R3 = R0 -|- R3 || W[P1++P5] = R5.H || R2 = [I2++];
COLUMN_END: R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO)|| R7= [I3++];
/******************************* Function Epilogue ***************************/
L3 = 0; //Disable circular addressing on I3 again so that the caller
//sees all L registers at zero, as compiled code expects.
(R7:4,P5:3) = [SP++]; //Pop the registers before returning.
RTS; //Return.
DEFUN_END(r8x8dct)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -