📄 fdct.asm
字号:
(R7:4,P5:3)=[SP++]; //Pop the registers before returning.
RTS; //Return.
NOP; //to avoid one stall if LINK or UNLINK happens to be
//the next instruction after RTS in the memory.
/*
 * void xhDCT8x8Fwd_8u16s_C1R(unsigned char *pSrc, int srcStep, short *pDst,
 *                            Ipp16s *coeff, Ipp16s *temp);
 *
 * Forward 8x8 DCT of an 8-bit block, producing 16-bit results in pDst.
 *
 * Steps:
 *   1. The 8x8 block of bytes at pSrc (rows srcStep bytes apart) is expanded
 *      to 64 16-bit values stored contiguously at pDst.
 *   2. Two passes of an 8-point row DCT are run. Each pass stores its output
 *      transposed. Pass 1 reads pDst and writes temp; pass 2 reads temp and
 *      writes pDst, so pDst holds the final result.
 *
 * Arguments as used by the code:
 *   R0        pSrc    - source bytes.
 *   R1        srcStep - byte distance between source rows (used as M0).
 *   R2        pDst    - 64 x 16-bit output; also the pass-1 input buffer.
 *   [SP+12]   coeff   - coefficient table, read as four 32-bit words through
 *                       a 16-byte circular buffer (L3 = 16).
 *   [SP+16]   temp    - 64 x 16-bit scratch buffer.
 *   (The stack offsets above are relative to SP on entry; the code adds 32
 *   for the 8 registers it pushes. Presumably the first 12 bytes are the
 *   argument area reserved for R0-R2 by the calling convention - confirm
 *   against the ABI in use.)
 *
 * Registers: R7:4, P5:3 and RETS are saved and restored. R0-R3, P0-P2,
 * I0-I3, B0-B3, M0, M1, L0-L3, LC0, LC1, A0 and A1 are modified and not
 * restored; L0-L3 are all zero on return.
 *
 * Notes for callers:
 *   - The source rows are fetched with 32-bit loads and BYTEUNPACK, whose
 *     byte selection depends on the low two bits of I0. This assumes pSrc
 *     and srcStep are multiples of 4 - TODO confirm with callers.
 *   - Both loops prefetch the next row while processing the current one.
 *     On the final iteration they therefore read (but never use) one row
 *     past the end of pSrc, pDst and temp.
 *   - The DCT loop temporarily stores a value into element 4 of the row it
 *     has just read (W[I0] = R3.L), so the input buffer of each pass is
 *     modified.
 */
_xhDCT8x8Fwd_8u16s_C1R:
/******************************* Function Prologue ***************************/
    [--SP] = (R7:4, P5:3);      //Push the preserved registers (7 x 4 bytes).
    [--SP]=RETS;                //Push RETS (4 bytes); no CALL is visible in
                                //this routine, so this looks redundant.
    B0 = R2;                    //B0 = pDst: DCT input buffer for pass 1.
    R7 = [SP+32+12];            //4th argument: pointer to coefficients.
    B3=R7;
    R6 = [SP+32+16];            //5th argument: pointer to temporary matrix.
    B2=R6;
    L0 = 0;                     //Linear (non-circular) addressing for I0-I2.
    L1 = 0;
    L2 = 0;

/************************ 8-bit to 16-bit conversion **************************/
/*
I0 reads bytes 0..3 of each source row and I1 reads bytes 4..7; both advance by
srcStep (M0) per row. I2 writes the expanded 16-bit values to pDst, 16 bytes
per row. The first row is loaded and half-unpacked before the loop; each loop
iteration stores one complete row and loads the next one.
*/
    I0=R0;
    I2=R2;
    M0=R1;
    P0=8;                       //(Repeated below before LSETUP.)
    R0+=4;
    I1=R0;
    R0=[I0++M0]||R1=[I1++M0];   //Row 0: R0 = bytes 0..3, R1 = bytes 4..7.
    P0=8;                       //Loop count: 8 rows.
    LSETUP(transfer8to16_start,transfer8to16_end)LC0=P0;
    (R7,R6) = BYTEUNPACK R1:0;  //Expand bytes 0..3 of the first row.
transfer8to16_start:
    (R5,R4) = BYTEUNPACK R1:0(R)||[I2++]=R6;    //Expand bytes 4..7.
    R0=[I0++M0]||[I2++]=R7;                     //Load bytes 0..3 of next row.
    (R7,R6) = BYTEUNPACK R1:0||[I2++]=R4;       //Expand them.
transfer8to16_end:
    R1=[I1++M0]||[I2++]=R5;                     //Load bytes 4..7 of next row.
/*
An earlier byte-at-a-time version of this conversion (B[P1++] loads with
W[I2++] stores) used to be kept here as commented-out code; it has been
removed. See version control if it is needed.
*/
    R0=B0;                      //These three values are overwritten before
    R1=B3;                      //they are read again in the code below.
    R2=B2;
//-------- do --------
    L3 = 16;                    //L3 is set to 16 to make the coefficients
                                //array circular. Reset to 0 before return.
/*
I0, I1, and I2 registers are used to read the input data. I3 register is used
to read the coefficients. P0 and P1 registers are used for writing the output
data.
*/
    M0 = 12 (X);                //All these initializations are used in the
                                //modification of address offsets.
    M1 = 16 (X);                //One row of 16-bit elements.
    P2 = 16;
    P3 = 32 (X);                //Two rows.
    P4 = -110 (X);              //32+32+32+16-110 = +2: P0 moves to the next
                                //output column after its five stores.
    P5 = -62 (X);               //32+32-62 = +2: same for P1's three stores.
    P0 = 2;                     //Two passes.
/*
According to Chen's algorithm, first the 8-point DCT is calculated for all
8 rows. The output of this calculation is stored in another matrix in
transposed form. Then the 8-point DCT is applied again to all 8 rows of that
matrix, and the output is again stored transposed. This is the final output.
Therefore a loop of 2 iterations (DCT_START_8U16S, DCT_END_8U16S) is set up.
B0 points to the "in" buffer and B2 points to the "temp" buffer in the first
iteration: input is read from "in" and output is written to "temp". In the
second iteration B0 points to "temp" and B2 points to "in": input is read from
"temp" and output is written to "in". The "in" buffer (pDst) holds the final
output.
*/
    LSETUP (DCT_START_8U16S, DCT_END_8U16S) LC0 = P0;
DCT_START_8U16S:
    I0 = B0;                    //I0 points to Input Element (0, 0)
    I1 = B0;                    //Element 1 and 0 is read in R0.
    I1 += M0 || R0 = [I0++];
                                //I1 points to Input Element (0, 6)
    I2 = I1;                    //Element 6 is read in R3.H
    I2 -= 4 || R3.H = W[I1++];
                                //I2 points to Input Element (0, 4)
    I3 = B3;                    //I3 points to Coefficients
    P0 = B2;                    //P0 points to temporary array Element (0, 0)
    P1 = B2;                    //P1 points to temporary array
    R7 = [P1++P2] || R2 = [I2++];
                                //P1 points to temporary array Element (1, 0)
                                //R7 is a dummy read. Element 4 and 5 are read in R2
    R3.L = W[I1--];             //Element 7 is read in R3.L
    R1.H = W[I0++];             //Element 2 is read in R1.H
//******************************* Implementation of Part 1 ********************
/*
All the additions from Stage 1 and Stage 2 are implemented in Part 1.
For the first row it is done here, outside the row loop; for every following
row it is done at the bottom of the loop.
The following instruction does the following job.
Element 0 = (Element 0 + Element 7) / 2.
Element 1 = (Element 1 + Element 6) / 2.
Element 6 = (Element 1 - Element 6) / 2.
Element 7 = (Element 0 - Element 7) / 2.
It reads the data 3 in R1.L.
*/
    R0 = R0 +|+ R3, R3 = R0 -|- R3 (ASR) || R1.L = W[I0++] || NOP;
/*
This single instruction does the following job.
Element 2 = (Element 2 + Element 5) / 2.
Element 3 = (Element 3 + Element 4) / 2.
Element 4 = (Element 3 - Element 4) / 2.
Element 5 = (Element 2 - Element 5) / 2.
It reads the Coefficients C4 = cos(4*pi/16) in register R7.
*/
    R1 = R1 +|+ R2, R2 = R1 -|- R2 (ASR, CO) || NOP || R7 = [I3++];
/*
At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
R3 has (6,7). Where notation (x, y) means the element from column x is in
upper half of register and element from column y is in lower half of the
register.
*/
//******************************* Implementation of Part 2 *********************
/*
The following addition/subtraction instruction does -
Element 0 = Element 0 + Element 3.
Element 1 = Element 1 + Element 2.
Element 2 = Element 1 - Element 2.
Element 3 = Element 0 - Element 3.
*/
    R0 = R0 +|+ R1, R1 = R0 -|- R1;
    LSETUP (ROW_START_8U16S, ROW_END_8U16S) LC1 = P2 >> 1;
                                //The loop is set for 8 rows (16 >> 1).
ROW_START_8U16S:
/*
This is part 2 computation continued.....
The following two instructions do -
A1 = Element 6 * cos(pi/4)
A0 = Element 6 * cos(pi/4)
A1 = A1 - Element 5 * cos(pi/4)
A0 = A0 + Element 5 * cos(pi/4).
The instruction W[I0] = R3.L is used for packing it to R2.L: the value is
parked in the input buffer here and read back into R2.L further down.
I1 is advanced by one row (M1) and I2 by M0 in the spare slots.
*/
    A1 = R3.H * R7.L, A0 = R3.H * R7.L || I1 += M1 || W[I0] = R3.L;
    R4.H = ( A1 -= R2.L * R7.L), R4.L = (A0 += R2.L * R7.L) || I2 += M0;
/*
At the end of stage 2 R0 has (1,0), R1 has (2,3), R4 has (5, 6).
*/
//************************ Implementation of Part 3 ***************************
/*
The following two instructions do the job of stage 3 -
A1 = Element 0 * cos(pi/4)
A0 = Element 0 * cos(pi/4)
A1 = A1 - Element 1 * cos(pi/4)
A0 = A0 + Element 1 * cos(pi/4)
Element 6 of the next row is read in R3.H.
The value of coefficients C2 and C6 are read in register R7.
*/
    A1 = R0.L * R7.L, A0 = R0.L * R7.L || NOP || R3.H = W[I1++];
    R5.H = (A1 -= R0.H * R7.L), R5.L = (A0 += R0.H * R7.L)
        || R7 = [I3++];
/*
The following three instructions do -
A1 = Element 2 * cos(3pi/8)
A0 = Element 3 * cos(3pi/8)
A1 = A1 + Element 3 * cos(pi/8)
A0 = A0 - Element 2 * cos(pi/8)
R5.L is written to the output; R2.L reads back the value parked by W[I0].
R3.L reads Element 7 of the next row, and I0 is advanced by 8 bytes in total
to the start of the next row.
The value of coefficients C7 and C1 is read in register R7.
Element 4 = Element 4 + Element 5.
Element 5 = Element 4 - Element 5.
Element 6 = Element 7 - Element 6.
Element 7 = Element 7 + Element 6.
*/
    A1 = R1.H * R7.L, A0 = R1.L * R7.L || W[P0++P3] = R5.L
        || R2.L = W[I0];
    R2 = R2 +|+ R4, R4 = R2 -|- R4 || I0 += 4 || R3.L = W[I1--];
    R6.H = (A1 += R1.L * R7.H), R6.L = (A0 -= R1.H * R7.H) || I0 += 4
        || R7 = [I3++];
/*
At the end of part 3 R2 has (4, 7), R4 has (5,6), R5 has (1, 0) and
R6 has (2,3).
*/
//****************************** Implementation of Part 4 **********************
/*
The following two instructions do -
A1 = Element 4 * cos(7pi/16)
A0 = Element 7 * cos(7pi/16)
A1 = A1 + Element 7 * cos(pi/16)
A0 = A0 - Element 4 * cos(pi/16)
The values of the next coefficients are read, and the registers are written to
their locations. Elements 0 and 1 of the next row are read in R0.
*/
    A1 = R2.H * R7.L, A0 = R2.L * R7.L || W[P0++P3] = R6.H
        || R0 = [I0++];
    R2.H = ( A1 += R2.L * R7.H), R2.L = ( A0 -= R2.H * R7.H)
        || W[P0++P3] = R5.H || R7 = [I3++];
/*
The following two instructions do -
A1 = Element 5 * cos(3pi/16)
A0 = Element 6 * cos(3pi/16)
A1 = A1 + Element 6 * cos(5pi/16)
A0 = A0 - Element 5 * cos(5pi/16)
The output values are written. Elements 2 and 3 of the next row are read in
R1.H and R1.L.
*/
    A1 = R4.H * R7.H, A0 = R4.L *R7.H || W[P0++P2] = R6.L
        || R1.H = W[I0++];
    R4.H = (A1 += R4.L * R7.L), R4.L = ( A0 -= R4.H * R7.L)
        || W[P0++P4] = R2.L || R1.L = W[I0++];
//******************* Implementation of Part 1 *****************************
/*
This is the same part as part 1 specified earlier. The first time, the part 1
calculation is done outside the loop; afterwards it is done here. It serves
two purposes. First, it computes part 1 for the next row of data. Second, it
writes the remaining outputs of the current row through P1, in bit-reversed
order and transposed.
Taken together, P0 stores at byte offsets 0, 32, 64, 96 and 112 and P1 at
16, 48 and 80 from the current output column, i.e. one value into each of the
8 output rows.
*/
    R0 = R0 +|+ R3, R3 = R0 -|- R3 (ASR) || W[P1++P3] = R2.H
        || R2 = [I2++];
    R1 = R1 +|+ R2, R2 = R1 -|- R2 (ASR, CO) || W[P1++P3] = R4.L
        || R7 = [I3++];
ROW_END_8U16S: R0 = R0 +|+ R1, R1 = R0 -|- R1 || W[P1++P5] = R4.H || NOP;
    B1 = B0;                    //Swapping of Input and output address pointers
    B0 = B2;                    //B0 points to input buffer.
DCT_END_8U16S:
    B2 = B1;                    //B2 points to output buffer.
_xhDCT8x8Fwd_8u16s_C1R.END:
/******************************* Function Epilogue ***************************/
    L3 = 0;                     //Disable circular addressing on I3 again. The
                                //C run-time model expects L0-L3 to be zero at
                                //call boundaries; leaving L3 = 16 would make
                                //later I3-based accesses in the caller wrap.
    RETS=[SP++] ;
    (R7:4,P5:3)=[SP++];         //Pop the registers before returning.
    RTS;                        //Return.
    NOP;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -