// File: r8x8invdct.asm
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : r8x8invdct.asm
Label name : __r8x8invdct
Version : 1.3
Change History :
Version Date Author Comments
1.3 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.2 11/13/2002 Swarnalatha Tested with VDSP++3.0
on ADSP-21535 Rev.0.2
1.1 02/19/2002 Vijay Modified to match
silicon cycle count
1.0 03/13/2001 Vijay Original
Description : This is an implementation of Chen's algorithm for the IDCT.
It relies on the separability of the multi-dimensional IDCT.
The input matrix is 8x8 real data. First, a one-dimensional
8-point IDCT is calculated for each of the 8 rows, and the
output is stored, transposed, in a separate matrix. Then an
8-point IDCT is calculated again on each row of that matrix,
and the output is again stored in transposed form. This is
the final output.
Chen's algorithm has 4 stages (parts) of implementation.
This implementation works only for 8x8 input. The input data
should be real. The range of input should be -256 to 255.
The algorithm is in-place.
Note : The algorithm reads the input data from the "in"
matrix.
First, the 8-point IDCT is calculated for all 8 rows. This
output is stored in the "temp" buffer in transposed form at
bit-reversed locations. Then the 8-point IDCT is applied again
to all 8 rows of the "temp" buffer. The final output is
stored in the "in" buffer in transposed form at bit-reversed
locations. Transposing the matrix and computing the
bit-reversed locations are carried out while writing the
data, without any explicit code.
The output of the function is provided in the "in" buffer in normal order.
Prototype : void _r8x8invdct(fract16 *in, fract16 *coeff, fract16 *temp);
*in -> Pointer to Input vector.
*coeff -> Pointer to coefficients.
*temp -> Pointer to temporary data.
Registers Used : A0, A1, R0-R7, I0-I3, B0-B3, M1-M3, L0-L3, P0-P5, LC0,
LC1.
Performance :
Code Size : 340 Bytes.
Cycle Count : 319 Cycles.
******************************************************************************/
.section L1_code;
.global __r8x8invdct;
.align 8;
/*
* Entry point of the 8x8 inverse DCT.
* On entry (see the prototype in the file header):
* R0 = pointer to the "in" matrix
* R1 = pointer to the coefficient table
* R2 = pointer to the "temp" matrix
* This part saves the registers that are restored at TERMINATE and loads the
* base, length, modify and pointer registers used by the IDCT loops.
*/
__r8x8invdct:
/******************** Function Prologue ***************************************/
[--SP] = (R7:4, P5:3); // Save R7-R4 and P5-P3 on the stack.
B0 = R0; // B0 = pointer to the input matrix.
B3 = R1; // B3 = pointer to the coefficients.
B2 = R2; // B2 = pointer to the temporary matrix.
L0 = 0; // L0-L2 = 0: I0-I2 are used with linear addressing.
L1 = 0; // (same as above)
L2 = 0; // (same as above)
L3 = 16; // L3 = 16: I3 (loaded from B3) wraps over a 16-byte
// window, making the coefficient array circular.
M1 = 16 (X); // M1 = 16: modifier applied to I0, I1 and I2 to step to
// the next row (8 elements of 2 bytes each).
M2 = 7 (X); // M2 = 7: copied to P2 as the count of the row loop.
M3 = 8(X); // M3 = 8: byte offset from Element 0 to Element 4.
P2 = 16; // P2 = 16 and P3 = 32: post-modify steps used when
P3 = 32 (X); // writing the output through P0 and P1.
P4 = -110 (X); // P4 rewinds P0 after its five writes of a row:
// 32 + 32 + 32 + 16 - 110 = +2 bytes in total.
P5 = -62 (X); // P5 rewinds P1 after its three writes of a row:
// 32 + 32 - 62 = +2 bytes in total.
P0 = 2; // P0 = 2: count of the outer loop (LC0 = P0 below).
NOP; // NOTE(review): presumably separates the write of P0
// from its use by the following LSETUP - confirm.
/*
* According to Chen's algorithm, first the 8-point IDCT is calculated for all
* the 8 rows. The output of this calculation is stored in another matrix in
* transposed form. Then the 8-point IDCT is applied again on all the 8 rows.
* The output is again stored in transposed form. This is the final output.
* Therefore, a loop of 2 iterations (IDCT_START, IDCT_END) is set.
*
* B0 points to the "in" buffer and B2 points to the "temp" buffer in the first
* iteration. The input is read from the "in" buffer and the output is written
* to the "temp" buffer. In the second iteration of IDCT_START B0 points to
* "temp" and B2 points to the "in" buffer. The input is read from the "temp"
* buffer and the output is written to the "in" buffer. The "in" buffer holds
* the final output.
*
* Within one pass, Part 1 of a row is computed ahead of time (before the row
* loop for the first row, at the end of the loop body for the following rows),
* while Parts 2, 3 and 4 of the current row are computed at the top of the
* loop body. The last row is finished after the loop.
*/
LSETUP(IDCT_START, IDCT_END) LC0 = P0;
IDCT_START:
I0 = B0; // I0 points to input Element (0, 0)
I2 = B0; // I2 points to input Element (0, 0)
I2 += M3 || R3.L = W[I0];
// I2 is moved by M3 towards Element (0, 4).
// Element 0 is read into R3.L.
I1 = I2; // I1 is a copy of I2 (Element (0, 4)).
I1 += 4 || R3.H = W[I2++];
// I1 is moved on by 4 bytes, to Element (0, 6).
// Element 4 is read into R3.H through I2, which is
// post-incremented.
I3 = B3; // I3 points to the coefficients.
P0 = B2; // P0 points to array Element (0, 0) for writing
// the output.
P1 = B2;
R7.L = 0x5a82; // R7.L holds the coefficient C4 (used below as
// cos(pi/4)).
P1 = P1 + P2; // P1 points to array Element (1, 0) for writing
// the output.
/********************** Implementation of Part 1 ******************************/
/*
* The following operation is done in 2 instructions.
* A1 = Element 0 * cos(pi/4)
* A0 = Element 0 * cos(pi/4)
* A1 = A1 - Element 4 * cos(pi/4)
* A0 = A0 + Element 4 * cos(pi/4)
* At the same time Elements 2 and 6 are read into R1.H and R1.L
* respectively, and R7 is loaded with the next pair of coefficients.
*/
A1 = R3.L * R7.L, A0 = R3.L * R7.L || I0 += 4 || R1.L = W[I1++];
R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L)
|| R1.H = W[I0++] || R7 = [I3++];
/*
* The following two instructions do -
* A1 = Element 2 * cos(3pi/8)
* A0 = Element 6 * cos(3pi/8)
* A1 = A1 - Element 6 * cos(pi/8)
* A0 = A0 + Element 2 * cos(pi/8)
* R2 reads the input elements (5, 3).
* R7 reads the coefficient values C5 and C3.
*/
A1= R1.H * R7.L, A0 = R1.L * R7.L || I0 -= 4 || R2.L = W[I0];
R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) || R2.H = W[I2--]
|| R7 = [I3++];
/*
* The following three instructions do -
* A1 = Element 5 * cos(3pi/16)
* A0 = Element 3 * cos(3pi/16)
* A1 = A1 - Element 3 * cos(5pi/16)
* A0 = A0 + Element 5 * cos(5pi/16)
* Element 0 = (Element 0 + Element 6) / 2.
* Element 4 = (Element 4 + Element 2) / 2.
* Element 2 = (Element 4 - Element 2) / 2.
* Element 6 = (Element 0 - Element 6) / 2.
* Elements 1 and 7 are read into R0.H and R0.L at the same time.
* The write W[P0] = R5.L is done for packing purposes: the register locations
* for elements 4 and 6 are swapped. R5.L is parked at W[P0] here and is read
* back into R3.H later.
*/
A1 = R2.H * R7.L, A0 = R2.L * R7.L || R0.H = W[I0--] || NOP;
R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || R0.L = W[I1--] || NOP;
R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
|| W[P0] = R5.L || R7 = [I3++];
/*
* At the end of part 1 R0 has (1, 7), R5 has (2, 6), R2 has (5, 3) and
* R3 has (4, 0).
* The notation (x, y) means that the element from column x is in the upper
* half of the register and the element from column y is in the lower half of
* the register.
*/
// A hardware loop of 7 iterations (P2 = M2) is set up for the rows. The last
// row is completed separately after the loop. P2 is reloaded with 16
// afterwards because it is also used as a post-modify step for P0.
P2 = M2;
LSETUP (R0W_START, R0W_END) LC1 = P2;
P2 = 16;
R0W_START:
/****************** Implementation of Part 2 **********************************/
/*
* The following two instructions do the following job -
* A1 = Element 1 * cos(7pi/16)
* A0 = Element 7 * cos(7pi/16)
* A1 = A1 - Element 7 * cos(pi/16)
* A0 = A0 + Element 1 * cos(pi/16)
* The read into R1 is a dummy access: it only post-modifies I1 by M1 so that
* I1 moves on to the next row. I0 is moved on by M1 in the same way.
* R3.H is parked at W[P1]; it is read back into R5.L in part 3.
* R7 reads the coefficient values C2 and C6.
*/
A1 = R0.H * R7.L, A0 = R0.L * R7.L || R1 = [I1++M1]
|| W[P1] = R3.H;
R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || I0 += M1
|| R7 = [I3++];
/*
* The following single instruction operates on 4 data as -
* Element 1 = (Element 1 + Element 5) / 2.
* Element 5 = (Element 1 - Element 5) / 2.
* Element 3 = (Element 7 - Element 3) / 2.
* Element 7 = (Element 7 + Element 3) / 2.
* In parallel, R3.H is loaded from W[P0] (the value parked there from R5.L)
* and R1.L is loaded from the next row through I1.
*/
R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0]
|| R1.L = W[I1++];
/*
* At the end of part 2 R0 has (1, 7), and R2 has (5, 3).
* The registers R3.H and R5.L are being swapped; the locations W[P0] and
* W[P1] are used as intermediate storage for this exchange.
*/
/******************** Implementation of Part 3 ********************************/
/*
* The following instruction does the following job.
* Element 0 = Element 0 + Element 7.
* Element 7 = Element 7 - Element 0.
* Element 6 = Element 6 + Element 1.
* Element 1 = Element 6 - Element 1.
* The elements 0, 1, 6 and 7 are final.
* In parallel, I2 is moved on by M1 to the next row and R3.L is loaded
* through I0 for the next row.
*/
R4 = R3 +|+ R0, R0 = R3 -|- R0 || I2 += M1 || R3.L = W[I0];
/*
* The following two instructions do -
* A1 = Element 3 * cos(pi/4)
* A0 = Element 3 * cos(pi/4)
* A1 = A1 - Element 5 * cos(pi/4)
* A0 = A0 + Element 5 * cos(pi/4)
* R3.H and R1.H are loaded for the next row, and R5.L is loaded from W[P1]
* (the value parked there from R3.H in part 2).
*/
A1 = R2.L * R7.L, A0 = R2.L * R7.L || I0 += 4 || R3.H = W[I2++];
R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L)
|| R5.L = W[P1] || R1.H = W[I0++];
/*
* At the end of part 3 R0 has (1, 7), and R2 has (5, 3), R4 has (6, 0)
* and R5 has (2, 4). Registers R4 and R0 hold final output.
*/
/******************** Implementation of Part 4 ********************************/
/*
* It is the final stage computation.
* Element 4 = Element 4 + Element 3.
* Element 2 = Element 2 + Element 5.
* Element 5 = Element 5 - Element 2.
* Element 3 = Element 3 - Element 4.
* The first output (R4.L) is written through P0 and R2.L is loaded for the
* next row in the same instruction.
*/
R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L
|| R2.L = W[I0--];
// R5 = (2, 4) and R6 = (5, 3) are final.
/*
* At the end of part 4 R0 has (1, 7), and R4 has (6, 0), R5 has (2, 4)
* and R6 has (5, 3). All the registers hold final output.
*/
/**************************** Implementation of Part 1 ************************/
/*
* This is the same part as part 1 specified earlier. The first time, the
* part 1 calculation is done outside the loop; afterwards it is done here.
* It serves two purposes. First, it computes part 1 for the next row.
* Second, it writes the data 2, 1, 3, 7, 4, 6 and 5 of the current row to
* their bit reversed locations in the transposed way.
*
* The writes through P0 use the steps P3, P3, P3, P2 and P4; the writes
* through P1 use the steps P3, P3 and P5.
* Note that R2.H is loaded twice below (through I0, then through I2); only
* the second load is consumed by the multiplies that follow.
*/
A1 = R3.L * R7.L, A0 = R3.L * R7.L || W[P0++P3] = R5.H
|| R2.H = W[I0--];
R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L)
|| W[P0++P3] = R0.H || R7 = [I3++];
A1= R1.H * R7.L, A0 = R1.L * R7.L || W[P0++P2] = R6.L
|| R2.H = W[I2--];
R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H)
|| W[P0++P4] = R0.L || R7 = [I3++];
A1 = R2.H * R7.L, A0 = R2.L * R7.L || W[P1++P3] = R5.L
|| R0.H = W[I0--];
R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || W[P1++P3] = R4.H
|| R0.L = W[I1--];
W[P0] = R5.L; // Park R5.L at W[P0] for the swap with R3.H, as in
// the first part 1.
R0W_END: R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
|| W[P1++P5] = R6.H || R7 = [I3++];
/*
* The computation for 7 rows is over. The last row computation is done here.
* This is the same as parts 2, 3 and 4 done inside the loop, but without the
* loads and pointer updates for a following row.
*/
A1 = R0.H * R7.L, A0 = R0.L * R7.L || NOP || W[P1] = R3.H;
R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || R7 = [I3++];
R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0];
R4 = R3 +|+ R0, R0 = R3 -|- R0;
A1 = R2.L * R7.L, A0 = R2.L * R7.L;
R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1];
R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L || NOP;
W[P0++P3] = R5.H; // The last outputs are written here, in the same
W[P0++P3] = R0.H; // order and with the same steps as inside the loop.
W[P0++P2] = R6.L;
W[P0++P4] = R0.L;
W[P1++P3] = R5.L;
W[P1++P3] = R4.H;
W[P1++P5] = R6.H;
// The input and output pointers are swapped for the next pass, using B1 as
// the temporary.
B1 = B2; // B1 keeps the output pointer of this pass.
B2 = B0; // The input buffer of this pass becomes the next output.
IDCT_END:
B0 = B1; // The output buffer of this pass becomes the next input.
TERMINATE:
/******************** Function Epilogue ***************************************/
/*
* The prologue sets L3 = 16 so that I3 addresses the coefficient table as a
* circular buffer. The C run-time model expects the length registers L0-L3 to
* be zero when a function returns, so L3 is cleared here; otherwise a later
* use of I3 by the caller would still wrap around. L0-L2 are already zero.
* After that the registers saved in the prologue are restored.
*/
L3 = 0; // Back to linear addressing for I3.
(R7:4,P5:3)=[SP++]; // Pop R7-R4 and P5-P3 before returning.
RTS; // Return.
NOP; // To avoid one stall if LINK or UNLINK happens to be
// the next instruction after RTS in the memory.
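/*
* Keyboard shortcuts (text left over from the web code viewer; not part of
* the program):
* Copy code : Ctrl + C
* Search code : Ctrl + F
* Full screen mode : F11
* Toggle theme : Ctrl + Shift + D
* Show shortcuts : ?
* Increase font size : Ctrl + =
* Decrease font size : Ctrl + -
*/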