// imdct36.asm
// (web-viewer residue, not part of the source: "font size" control)
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : imdct36.asm
Label name : __imdct36
Version : 1.3
Change History :
Version Date Author Comments
1.3 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.2 11/13/2002 Swarnalatha Tested with VDSP++ 3.0
on ADSP-21535 Rev. 0.2
1.1 01/22/2002 Vijay Modified to match silicon
cycle count
1.0 07/17/2001 Vijay Original
Description : This asm routine is a 16-bit implementation of the 36-point
inverse modified discrete cosine transform (IMDCT) which is
used in the MPEG Layer III audio codec. The equation of the
IMDCT for a sequence X of length N/2 is given below:
y(n) = (2/N) * sum over k = 0 to N/2-1 of
{ X(k) * cos(pi*(2k + 1)*(2n + 1 + N/2)/(2N)) }
where k = 0 to N/2-1
and n = 0 to N-1
Note : The MDCT exhibits antisymmetry. So the first eighteen
MDCT values are enough to compute the 36 point IMDCT
The algorithm used to implement the IMDCT is given as a
pseudocode below :
First stage computations :
X = sqrt(2)*inp;
y1(0) = X(0); y2(1) = -X(17);
y1(1) = -(X(2) + X(1)); y2(9) = X(2) - X(1);
y1(2) = X(3) + X(4); y2(8) = X(3) - X(4);
y1(3) = -(X(6) + X(5)); y2(7) = X(6) - X(5);
y1(4) = X(7) + X(8); y2(6) = X(7) - X(8);
y1(5) = -(X(10) + X(9)); y2(5) = X(10) - X(9);
y1(6) = X(11) + X(12); y2(4) = X(11) - X(12);
y1(7) = -(X(14) + X(13)); y2(3) = X(14) - X(13);
y1(8) = X(15) + X(16); y2(2) = X(15) - X(16);
9 Point IDCT :
WC = cos(pi/18*(2*(0:8)'+1)*(0:8));
a(9:-1:1) = WC*y1';
9 Point IDST :
WS = sin(pi/18*(2*(0:8)'+1)*((0:8)+1));
b = WS*[y2(9:-1:1)]';
Twiddle factor multiplications :
W1 = cos(pi/72*(17:-2:1));
W2 = sin(pi/72*(17:-2:1));
for i=0:8
c(i) = a(i)*W1(i) + b(8-i)*W2(i);
d(8-i) = -a(i)*W2(i) + b(8-i)*W1(i);
end
Last stage :
for i=0:8
x(i) = (d(i) + c(8-i));
x(17-i) = -x(i);
x(35-i) = (d(i) - c(8-i));
x(i+18) = x(35-i);
end
x = x/36;
Prototype : void _imdct36(fract16 *input, fract16 *output, fract16
*twid_coef);
Assumptions : The input buffer is assumed to be aligned to a 4 byte boundary
The length of the input and output buffers are always 18 and
36, respectively. The twid_coef array that is passed to the
routine must have 9x9 cosine and sine coefficients interleaved
in the following manner :
i = 0;
for n = 0 to 8
for k = 0 to 8
twid_coef[i] = cos(PI*k*(2*n + 1)/18);
twid_coef[i+1] = sin(PI*(k + 1)*(2*n + 1)/18);
i = i + 2;
end
end
Note : The index of the outer loop and inner loop should not
be interchanged and the twid_coef array is of type fract16
(short)
Registers Used : A0, A1, R0-R3, R6, R7, I0-I3, B2, M0, L0-L3, P0-P2, P5.
Performance :
Code size : 428 bytes
Cycle count : 258 cycles
*******************************************************************************/
.section L1_code;
.align 8;
.global __imdct36;

/*
 * __imdct36 -- 36-point IMDCT on 16-bit fractional data.
 *
 * C prototype (from the file header):
 *     void _imdct36(fract16 *input, fract16 *output, fract16 *twid_coef);
 *
 * In:   R0 = input      (18 values, read as 32-bit words, so 4-byte aligned)
 *       R1 = output     (36 values, written 16 bits at a time)
 *       R2 = twid_coef  (interleaved cosine/sine table, read as 32-bit words)
 * Out:  nothing in registers; results are written through the output pointer.
 * Uses: A0, A1, R0-R3, R6, R7, I0-I3, B2, M0, L0-L3, P0-P2, P5, LC0, LC1.
 *       R6, R7 and P5 are saved and restored here.  L0-L3 are left at 0.
 *
 * Stack scratch layout (T = SP right after the first "SP += -40"):
 *     T    .. T+35 : scaled input, later the IDCT/IDST results and the
 *                    twiddle-multiplied values read by the output copy
 *     T-40 .. T-4  : first-stage output (circular IDCT/IDST input), later
 *                    the pushed W1/W2 coefficient table
 *
 * Review fix: the lower area used to sit below SP, because only 40 bytes
 * were reserved while I2 walked 40 bytes downwards from SP.  Anything that
 * pushes onto the same stack while this routine runs (for example an
 * interrupt or exception handler) would have overwritten live data there.
 * A second "SP += -40" is now issued after I1/I2 have captured T, so both
 * areas are at or above SP.  This costs two extra SP adjustments, so the
 * code-size and cycle figures quoted in the file header predate this change.
 *
 * NOTE(review): the pipelined loads run one 32-bit word past the data they
 * need: I0 is post-incremented 10 times (input holds 9 words) and I3 is
 * post-incremented 82 times (81 coefficient words are multiplied).  The
 * extra values are never used, but the memory behind both buffers must be
 * readable -- confirm for the target memory map.
 */
__imdct36:
    [--SP] = (R7:6, P5:5);      // Save the call-preserved registers used below
    SP += -40;                  // Reserve the upper scratch area (T .. T+39)
    L0 = 0;                     // Linear (non-circular) addressing for I0-I3
    L1 = 0;
    L2 = 0;
    L3 = 0;
    I0 = R0;                    // Input pointer
    P2 = R1;                    // Output pointer
    I3 = R2;                    // Twiddle factor array
    I1 = SP;                    // I1 -> T, walks the upper scratch area
    I2 = SP;                    // I2 -> T, walks downwards into the lower area
    SP += -40;                  // Reserve the lower scratch area (T-40 .. T-1)
                                // so that nothing live is kept below SP
    M0 = -32 (X);               // Rewind step of eight 32-bit words
    R3.L = 0x141D;              // sqrt(2)/9 as a 1.15 fraction
    P0 = 9 (Z);                 // Loop count
    R0 = [I0++] || I1 -= 4;     // Preload first input pair; I1 -> T-4 so the
                                // first (not yet meaningful) pipelined store
                                // lands in the unused slot just below T

    // Scale the input data by sqrt(2)/9.  Each pass scales both halves of
    // R0, fetches the next input word and stores the R1 produced by the
    // previous pass (software pipelining).
    LSETUP(ST_SCALE, ST_SCALE) LC0 = P0;
ST_SCALE:
    R1.H = R0.H * R3.L, R1.L = R0.L * R3.L || R0 = [I0++] || [I1++]= R1;

/*************************** FIRST STAGE **************************************/
    // Store the last scaled pair and rewind I1 by 32 bytes back to T;
    // R3 receives the negated copy of that pair.
    R3 = -R1 (V) || [I1 ++ M0] = R1;
    R0 = [I1++];                        // First scaled word
    R0.L = R0.L >>> 1 || R1 = [I1++];   // Halve the low half of R0; next word
    R3 = R3 >>> 1 (V) || R2 = [I1++] || I2 -= 4;
                                        // Halve both negated values; I2 -> T-4

    // Four passes (P0 >> 1).  Each pass forms pairwise sums and differences
    // with +|+ / -|-, scaled by the ASR option, and pushes the repacked
    // words downwards through I2.  The first [I2--] = R7 store is pipeline
    // priming and lands in the slot at T-4, outside the circular window
    // that is set up below.
    LSETUP(ST_FIRST_STAGE, END_FIRST_STAGE) LC0 = P0 >> 1;
ST_FIRST_STAGE:
    R6 = PACK(R2.L, R0.H) || [I2--] = R7;
    R6 = R1 +|+ R6, R7 = R1 -|- R6 (ASR) || R1 = [I1++];
    R0 = PACK(R7.L, R0.L);
    R7 = PACK(R7.H, R6.L) || [I2--] = R0;
END_FIRST_STAGE:
    R0 = PACK(R2.H, R6.H) || R2 = [I1++];

    R3 = PACK(R3.H, R6.H) || [I2--] = R7 || I1 += M0;
                                        // Flush R7; pull I1 back by 32 bytes
    [I2] = R3 || I1 -= 4;               // Last word; I2 is now at T-40

/****************************** 9 POINT IDCT AND IDST ************************/
    B2 = I2;                // Base address of the input buffer of IDCT & IDST
    L2 = 36;                // Make the IDCT/IDST input array circular
    R0 = [I1--] || I2 -= M0;// Dummy fetch to decrement I1; I2 moves up 32 bytes
    R1 = [I2--] || I1 -= 4; // First data of IDCT/IDST; I1 is back at T
    R3 = [I3++] ;           // First coefficient from the twid_coef array
    MNOP;                   // to remove IAU stall

    // Nine output pairs.  Each pass is a 9-term dual MAC over the circular
    // input buffer: the low halves accumulate in A0 with alternating sign,
    // the high halves accumulate in A1.
    LSETUP(ST_OUT_LOOP, END_OUT_LOOP) LC1 = P0;
ST_OUT_LOOP:
    A1 = R1.H * R3.H, A0 = R1.L * R3.L || R1 = [I2--] || R3 = [I3++];
                            // IDCT in A0 and IDST in A1
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 -= R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
    R2.H = (A1 += R1.H * R3.H), R2.L = (A0 += R1.L * R3.L) || R1 = [I2--]
    || R3 = [I3++];
END_OUT_LOOP:
    [I1++] = R2;            // Store y1(i) and y2(8-i)

    L2 = 0;                 // Restore length register

/************************** PUSH COEFFICIENTS IN STACK ************************/
    // W1(i) = .5*cos((17:-2:1)*pi/72) goes in the low half and
    // W2(i) = .5*sin((17:-2:1)*pi/72) in the high half of each word.
    // Eight words are written downwards through I2 into the lower scratch
    // area, which is free again now that the IDCT/IDST input has been
    // consumed.  The ninth pair stays in R0 for the first loop pass.
    R0.L = 0x3FF0;
    R0.H = 0x02CB;
    [I2--] = R0;
    R0.L = 0x3F74;
    R0.H = 0x085B;
    [I2--] = R0;
    R0.L = 0x3E7C;
    R0.H = 0x0DDA;
    [I2--] = R0;
    R0.L = 0x3D0A;
    R0.H = 0x133F;
    [I2--] = R0;
    R0.L = 0x3B21;
    R0.H = 0x187E;
    [I2--] = R0;
    R0.L = 0x38C5;
    R0.H = 0x1D8D;
    [I2--] = R0;
    R0.L = 0x35FA;
    R0.H = 0x2263;
    [I2--] = R0;
    R0.L = 0x32C6;
    R0.H = 0x26F6;
    [I2] = R0 || I1 -= 4;   // Last pushed pair; I1 -> last IDCT/IDST result,
                            // which is also still held in R2
    R0.L = 0x2F30;          // Coefficient pair for the first pass
    R0.H = 0x2B3D;

/********************** TWIDDLE FACTOR MULTIPLICATION *************************/
    // Nine passes, walking I1 downwards over the IDCT/IDST results.  Each
    // pass stores the previous pass's product pair (the first store is
    // pipeline priming), then reloads R2 from the next lower slot and R0
    // from the coefficient table.  The final product pair is left in R1.
    LSETUP(ST_TWID_MULT, END_TWID_MULT) LC0 = P0;
                            // c(i) = a(i)*W1(i) + b(8-i)*W2(i);
ST_TWID_MULT:
    A1 = R2.H * R0.L, A0 = R2.H * R0.H || [I1--] = R1;
                            // d(8-i) = -a(i)*W2(i) + b(8-i)*W1(i);
    R1.H = (A1 -= R2.L * R0.H), R1.L = (A0 += R2.L * R0.L) || R2 = [I1];
END_TWID_MULT:
    R0 = [I2++];

/****************************** OUTPUT COPY ***********************************/
    P5 = P2;
    P5 += 36;               // P5 -> x(18)
    P1 = P5;
    P1 += -2;               // P1 -> x(17)
    R2.L = R1.H + R1.L (S) || I1 += 4;
                            // First sum; I1 -> first stored product pair
    LSETUP(ST_OUT_COPY, END_OUT_COPY) LC0 = P0;
                            // Loop count is taken from P0 here, so P0 can
                            // be reused as a pointer by the next two lines
    P0 = P5;
    P0 += 34;               // P0 -> x(35)
ST_OUT_COPY:
    R3.L = R1.H - R1.L (S) || W[P2++] = R2;
                            // Store x(0 to 8)
    R2 = -R2 (V) || W[P5++] = R3 || R1 = [I1++];
                            // Store x(18 to 26)
    R2.L = R1.H + R1.L (S) || W[P1--] = R2;
                            // Store x(17 to 9)
END_OUT_COPY:
    W[P0--] = R3;           // Store x(35 to 27)

    SP += 40;               // Release the lower scratch area
    SP += 40;               // Release the upper scratch area
    (R7:6, P5:5) = [SP++];  // Retrieve call preserved registers
    RTS;
    NOP;                    // to avoid one stall if LINK or UNLINK happens to be
                            // the next instruction after RTS in the memory.
__imdct36.end:
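/*
 * Web-viewer residue (not part of the source): keyboard shortcut help
 *   Copy code          : Ctrl + C
 *   Search code        : Ctrl + F
 *   Full-screen mode   : F11
 *   Switch theme       : Ctrl + Shift + D
 *   Show shortcuts     : ?
 *   Increase font size : Ctrl + =
 *   Decrease font size : Ctrl + -
 */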