📄 idct.asm
字号:
// Static data used by _idct_asm / _r8x8invdct.
.section data1;
// _coeff is fetched two entries at a time with 32-bit loads (R7 = [I3++])
// through a 16-byte circular buffer (L3 = 16), so it has to start on a
// 4-byte boundary; .byte2 on its own only implies 2-byte alignment.
.align 4;
.byte2 _coeff[8] = {
0x30FB, // cos(3pi/8)  C6
0x7641, // cos(pi/8)   C2
0x6A6D, // cos(3pi/16)
0x471C, // cos(5pi/16)
0x18F8, // cos(7pi/16)
0x7D8A, // cos(pi/16)
0x5a82, // cos(pi/4)   C4
0x5a82  // cos(pi/4)   C4 (second copy so one 32-bit load yields C4 in both halves)
};
// Scratch 8x8 matrix of 16-bit values (64 entries) passed as the "temp"
// argument. In the second pass it is the input side and is touched by a
// 32-bit load (R1 = [I1++M1]), hence the same 4-byte alignment.
.align 4;
.byte2 _temp_dct[64];
.section program;
.global _idct_asm;
/*
 * _idct_asm
 *   Thin wrapper around _r8x8invdct.
 *   In     : R0 = pointer to the in buffer; it is passed through untouched.
 *   Action : loads R1 with the address of _coeff and R2 with the address of
 *            _temp_dct, then calls _r8x8invdct.
 *   Saved  : R1-R7 and P0-P5 are pushed on entry and popped before return;
 *            LINK/UNLINK preserve the return address across the nested call.
 */
_idct_asm:
    LINK 0;
    [--SP] = (R7:1, P5:0);

    // Third argument: temporary matrix.
    R2.H = _temp_dct;
    R2.L = _temp_dct;
    // Second argument: coefficient table.
    R1.H = _coeff;
    R1.L = _coeff;

    // The call runs with interrupts left as they are; the CLI R7 / STI R7
    // pair that once bracketed it remains disabled.
    CALL _r8x8invdct;

    (R7:1, P5:0) = [SP++];
    UNLINK;
    RTS;
_idct_asm.end:
/*
Prototype : void _r8x8invdct(fract16 *in, fract16 *coeff, fract16 *temp);
*in -> Pointer to Input vector.
*coeff -> Pointer to coefficients.
*temp -> Pointer to temporary data.
Registers Used : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M3, L0-L3, P0-P5, LC0,
LC1.
Performance :
Code Size : 340 Bytes.
Cycle Count : 319 Cycles.
*/
/*
 * _r8x8invdct - 8x8 inverse DCT on 16-bit data, done as two passes of eight
 * 8-point row transforms, each pass storing its result transposed.
 *
 * In  : R0 = pointer to the input matrix ("in")
 *       R1 = pointer to the coefficient table
 *       R2 = pointer to the temporary matrix ("temp")
 * Out : the second pass writes its result back into the "in" buffer.
 *
 * R1-R7, P0-P5, the hardware-loop registers, I0-I3, B0-B3, L0-L3, M0-M3 and
 * A0/A1 are pushed in the prologue and popped at TERMINATE. R0 is overwritten
 * by the computation and is not restored.
 *
 * NOTE(review): R7 = [I3++] and R1 = [I1++M1] are 32-bit loads, so the
 * coefficient table and both data buffers presumably have to be 4-byte
 * aligned - confirm against the callers.
 */
.global _r8x8invdct;
.align 8;
_r8x8invdct:
/******************** Function Prologue ***************************************/
link 0; // Save return address / frame pointer, then push the registers.
[ -- sp ] = (r7:1, p5:0);
// Hardware loop registers for both loops used below.
[--SP]=lc0;
[--SP]=lt0;
[--SP]=lb0;
[--SP]=lc1;
[--SP]=lt1;
[--SP]=lb1;
// Index, base, length and modify registers, then both accumulators.
// The epilogue pops everything in exactly the reverse order.
[-- sp ] = i0; [-- sp ] = i1; [-- sp ] = i2; [-- sp ] = i3;
[-- sp ] = b0; [-- sp ] = b1; [-- sp ] = b2; [-- sp ] = b3;
[-- sp ] = l0; [-- sp ] = l1; [-- sp ] = l2; [-- sp ] = l3;
[-- sp ] = m0; [-- sp ] = m1; [-- sp ] = m2; [-- sp ] = m3;
[--sp]=A0.x;[--sp]=A0.W;[--sp]=A1.X;[--sp]=A1.W;
B0 = R0; // Pointer to Input matrix.
B3 = R1; // Pointer to Coefficients
B2 = R2; // Pointer to Temporary matrix.
L0 = 0; // L0-L2 = 0: I0, I1 and I2 use linear addressing.
L1 = 0; // (see L0)
L2 = 0; // (see L0)
L3 = 16; // L3 = 16 bytes makes the coefficient array (read through I3)
// circular.
M1 = 16 (X); // Input row stride in bytes (added to I0, I1, I2 per row).
M2 = 7 (X); // Trip count of the inner row loop (copied to P2 for LC1).
M3 = 8(X); // Byte offset from element 0 to element 4 of a row.
P2 = 16; // Output step of 16 bytes (used once per row through P0).
P3 = 32 (X); // Output step of 32 bytes.
P4 = -110 (X); // Rewinds P0 after its five stores: 32+32+32+16-110 = +2.
P5 = -62 (X); // Rewinds P1 after its three stores: 32+32-62 = +2.
P0 = 2; // Number of passes for the outer loop.
NOP;
/*
* According to Chen's algorithm, first 8-point IDCT will be calculated for all
* the 8 rows. The output of this calculation is stored in another transpose
* matrix. Now again the 8-point IDCT is applied on all the 8 rows. The output
* is stored in matrix transpose form. This is the final output. Therefore,
* a loop of 2 iteration (IDCT_START, IDCT_END) is set.
*
* B0 points to the "in" buffer and B2 points to "temp" buffer in the first
* iteration. The input is read from "in" buffer and output is written to
* "temp" buffer. In the second iteration of IDCT_START B0 points to "temp" and
* B2 points to "in" buffer. The input is read from "temp" buffer and output
* is written to "in" buffer. "in" buffer holds the final output.
*/
LSETUP(IDCT_START, IDCT_END) LC0 = P0;
IDCT_START:
I0 = B0; // I0 points to Input Element (0, 0)
I2 = B0; // I2 points to Input Element (0, 0)
I2 += M3 || R3.L = W[I0];
// I2 now points to input Element (0, 4).
// Element 0 is read in R3.L
I1 = I2; // I1 = address of Element (0, 4); the += 4 below moves it
// to Element (0, 6).
I1 += 4 || R3.H = W[I2++];
// Element 4 is read in R3.H through I2, which is then
// post-incremented by one element.
I3 = B3; // I3 points to Coefficients
P0 = B2; // P0 points to array Element (0, 0) for writing
// output
P1 = B2;
R7.L = 0x5a82; // R7.L holds the coefficients C4.
P1 = P1 + P2; // P1 points to array element (1, 0) for writing
//output
/********************** Implementation of Part 1 ******************************/
/*
* The following operation is done in 2 instructions.
* A1 = Element 0 * cos(pi/4)
* A0 = Element 0 * cos(pi/4)
* A1 = A1 - Element 4 * cos(pi/4)
* A0 = A0 + Element 4 * cos(pi/4)
* At the same time the values of Element 2 and 6 are read into R1.H and R1.L
* respectively, and R7 is loaded with the first coefficient pair.
*/
A1 = R3.L * R7.L, A0 = R3.L * R7.L || I0 += 4 || R1.L = W[I1++];
R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L)
|| R1.H = W[I0++] || R7 = [I3++];
/*
* The following two instructions do -
* A1 = Element 2 * cos(3pi/8)
* A0 = Element 6 * cos(3pi/8)
* A1 = A1 - Element 6 * cos(pi/8)
* A0 = A0 + Element 2 * cos(pi/8)
* R2 reads the input elements ( 5, 3).
* R7 reads the coefficients value C5 and C3.
*/
A1= R1.H * R7.L, A0 = R1.L * R7.L || I0 -= 4 || R2.L = W[I0];
R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) || R2.H = W[I2--]
|| R7 = [I3++];
/*
* The following three instructions do -
* A1 = Element 5 * cos(3pi/16)
* A0 = Element 3 * cos(3pi/16)
* A1 = A1 - Element 3 * cos(5pi/16)
* A0 = A0 + Element 5 * cos(5pi/16)
* Element 0 = (Element 0 + Element 6) / 2.
* Element 4 = (Element 4 + Element 2) / 2.
* Element 2 = (Element 4 - Element 2) / 2.
* Element 6 = (Element 0 - Element 6) / 2.
* The writing W[P0] = R5.L is done for packing purpose. The register locations
* for element 4 and 6 are swapped.
* R0 reads the input elements (1, 7) along the way.
*/
A1 = R2.H * R7.L, A0 = R2.L * R7.L || R0.H = W[I0--] || NOP;
R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || R0.L = W[I1--] || NOP;
R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
|| W[P0] = R5.L || R7 = [I3++];
/*
* At the end of part 1 R0 has (1, 7), R5 has (2, 6), R2 has (5, 3) and
* R3 has (4, 0).
* Where notation (x, y) means the element from column x is in the upper half
* of the register and the element from column y is in the lower half of the
* register.
*/
// The inner loop runs 7 times (M2). The last row is computed separately
// after the loop.
P2 = M2;
LSETUP (R0W_START, R0W_END) LC1 = P2;
P2 = 16; // P2 was only borrowed for the trip count; restore the 16-byte step.
R0W_START:
/****************** Implementation of Part 2 **********************************/
/*
* The following two instruction does the following job -
* A1 = Element 1 * cos(7pi/16)
* A0 = Element 7 * cos(7pi/16)
* A1 = A1 - Element 7 * cos(pi/16)
* A0 = A0 + Element 1 * cos(pi/16)
* The read to R1 is dummy; it is there to advance I1 by one row (M1).
* I0 is advanced by one row as well.
* R7 then reads the fourth coefficient pair, i.e. (C4, C4) with the _coeff
* table layout, which part 3 and the next part 1 multiply by.
*/
A1 = R0.H * R7.L, A0 = R0.L * R7.L || R1 = [I1++M1]
|| W[P1] = R3.H;
R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || I0 += M1
|| R7 = [I3++];
/*
* The following single instructions operates on 4 data as -
* Element 1 = (Element 1 + Element 5) / 2.
* Element 5 = (Element 1 - Element 5) / 2.
* Element 3 = (Element 7 - Element 3) / 2.
* Element 7 = (Element 7 + Element 3) / 2.
*/
R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0]
|| R1.L = W[I1++];
/*
* At the end of part 2 R0 has (1, 7), and R2 has (5, 3).
* The registers R3.H and R5.L are being swapped (through the temporary
* stores to W[P0] and W[P1]).
*/
/******************** Implementation of Part 3 ********************************/
/*
* The following instruction does the following job.
* Element 0 = Element 0 + Element 7.
* Element 7 = Element 7 - Element 0.
* Element 6 = Element 6 + Element 1.
* Element 1 = Element 6 - Element 1.
* The elements 0, 1, 6 and 7 are final.
* In parallel I2 moves on by one row and element 0 of the next row is
* read into R3.L.
*/
R4 = R3 +|+ R0, R0 = R3 -|- R0 || I2 += M1 || R3.L = W[I0];
/*
* The following two instructions do -
* A1 = Element 3 * cos(pi/4)
* A0 = Element 3 * cos(pi/4)
* A1 = A1 - Element 5 * cos(pi/4)
* A0 = A0 + Element 5 * cos(pi/4)
*/
A1 = R2.L * R7.L, A0 = R2.L * R7.L || I0 += 4 || R3.H = W[I2++];
R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L)
|| R5.L = W[P1] || R1.H = W[I0++];
/*
* At the end of part 3 R0 has (1, 7), and R2 has (5, 3), R4 has (6, 0)
* and R5 has (2, 4). Registers R4 and R0 holds final output.
*/
/******************** Implementation of Part 4 ********************************/
/*
* It is the final stage computation.
* Element 4 = Element 4 + Element 3.
* Element 2 = Element 2 + Element 5.
* Element 5 = Element 5 - Element 2.
* Element 3 = Element 3 - Element 4.
*/
R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L
|| R2.L = W[I0--];
//R5 = 2, 4 and R6 = 5, 3 final
/*
* At the end of part 4 R0 has (1, 7), and R4 has (6, 0), R5 has (2, 4)
* and R6 has (5, 3). All the registers hold final output.
*/
/**************************** Implementation of Part 1
**********************************/
/*
* This is the same part as part 1 specified earlier. The first time the part 1
* calculation is done outside the loop; afterwards it is done here, on the
* next row. It serves two purposes: it computes part 1 for the next row, and
* it stores the finished elements of the current row through P0 and P1 in
* transposed form (the stores are interleaved with the arithmetic).
* The first R2.H load below is overwritten by the R2.H = W[I2--] load two
* instructions later; it appears to be there only to step I0 back.
*/
A1 = R3.L * R7.L, A0 = R3.L * R7.L || W[P0++P3] = R5.H
|| R2.H = W[I0--];
R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L)
|| W[P0++P3] = R0.H || R7 = [I3++];
A1= R1.H * R7.L, A0 = R1.L * R7.L || W[P0++P2] = R6.L
|| R2.H = W[I2--];
R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H)
|| W[P0++P4] = R0.L || R7 = [I3++];
A1 = R2.H * R7.L, A0 = R2.L * R7.L || W[P1++P3] = R5.L
|| R0.H = W[I0--];
R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || W[P1++P3] = R4.H
|| R0.L = W[I1--];
W[P0] = R5.L; // Temporary store for packing, as in the first part 1.
R0W_END: R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)
|| W[P1++P5] = R6.H || R7 = [I3++];
/*
* The computation for 7 rows are over. The last row computation is done here.
* This is the same part as part 2, 3 and 4 done inside the loop, without the
* loads for a following row.
*/
A1 = R0.H * R7.L, A0 = R0.L * R7.L || NOP || W[P1] = R3.H;
R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || R7 = [I3++];
R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0];
R4 = R3 +|+ R0, R0 = R3 -|- R0;
A1 = R2.L * R7.L, A0 = R2.L * R7.L;
R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1];
R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L || NOP;
W[P0++P3] = R5.H; // The last outputs are written here.
W[P0++P3] = R0.H;
W[P0++P2] = R6.L;
W[P0++P4] = R0.L;
W[P1++P3] = R5.L;
W[P1++P3] = R4.H;
W[P1++P5] = R6.H;
// Exchange the buffers for the next pass, using B1 as scratch.
B1 = B2; // B1 = buffer that was just written (this pass's output)
B2 = B0; // The buffer just read becomes the output of the next pass
IDCT_END:
B0 = B1; // The buffer just written becomes the input of the next pass
/******************** Function Epilogue ***************************************/
// Pop everything in the reverse order of the prologue pushes.
TERMINATE:
A1.W=[sp++]; A1.X=[sp++]; A0.W=[sp++];A0.X=[sp++];
m3=[sp++];m2=[sp++];m1=[sp++];m0=[sp++];
l3=[sp++];l2=[sp++];l1=[sp++];l0=[sp++];
b3=[sp++];b2=[sp++];b1=[sp++];b0=[sp++];
i3=[sp++];i2=[sp++];i1=[sp++];i0=[sp++];
lb1=[SP++];
lt1=[SP++];
lc1=[SP++];
lb0=[SP++];
lt0=[SP++];
lc0=[SP++];
(r7:1, p5:0) = [ sp ++ ];
unlink;
RTS;
NOP; //to avoid one stall if LINK or UNLINK happens to be
//the next instruction after RTS in the memory.
_r8x8invdct.end:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -