📄 mdct36.asm
字号:
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : mdct36.asm
Label name : __mdct36
Version : 1.3
Change History :
Version Date Author Comments
1.3 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.2 11/13/2002 Swarnalatha Tested with VDSP++ 3.0
on ADSP-21535 Rev. 0.2
1.1 01/22/2002 Vijay Modified to match silicon
cycle count
1.0 06/26/2001 Vijay Original
Description : This asm routine is a 16-bit implementation of the 36-point
modified discrete cosine transform (MDCT) which is used in the
MPEG Layer III audio codec.
The equation of the MDCT for a sequence x of length N is
given below:
y(k) = sum 0 to N-1 { x(n)cos(pi*(2k + 1)*(2n + 1 + N/2)/2N) }
n = 0 to N-1
k = 0 to N/2-1
Note : MDCT exhibits the antisymmetry property given by
y(N - k - 1) = -y(k), k = 0 to N/2 - 1
Algorithm :
The algorithm used to implement MDCT is given as a pseudocode
below:
First stage computations:
for i=0:8
d1(i) = x(i) - x(17-i); c1(8-i) = d1(i);
d2(i) = x(i+18) + x(35-i); c2(8-i) = d2(i);
end
c = c1 - c2;
d = d1 + d2;
Twiddle factor multiplications:
W1 = cos(pi/72*(17:-2:1));
W2 = sin(pi/72*(17:-2:1));
for i=0:8
a(8-i) = c(i)*W1(i) - d(8-i)*W2(i);
b(8-i) = c(i)*W2(i) + d(8-i)*W1(i);
end
9-Point DCT computation:
WC = cos(pi/18*(0:8)'*(2*(0:8)+1));
y1 = WC*a';
y1 = y1/sqrt(2);
9-Point DST computation:
WS = sin(pi/18*((0:8)'+1)*(2*(0:8)+1));
y2(9:-1:1) = WS*b';
y2 = y2/sqrt(2);
Last stage:
y(0) = y1(0);
y(1) = -(y1(1) + y2(8)); y(2) = -(y1(1) - y2(8));
y(3) = (y1(2) + y2(7)); y(4) = (y1(2) - y2(7));
y(5) = -(y1(3) + y2(6)); y(6) = -(y1(3) - y2(6));
y(7) = (y1(4) + y2(5)); y(8) = (y1(4) - y2(5));
y(9) = -(y1(5) + y2(4)); y(10) = -(y1(5) - y2(4));
y(11) = (y1(6) + y2(3)); y(12) = (y1(6) - y2(3));
y(13) = -(y1(7) + y2(2)); y(14) = -(y1(7) - y2(2));
y(15) = (y1(8) + y2(1)); y(16) = (y1(8) - y2(1));
y(17) = -y2(0);
Prototype : void _mdct36( fract16 *input, fract16 *output,
fract16 *twid_coef);
Assumptions : The input buffer is assumed to be aligned to a 4 byte boundary
The length of the input and output buffers are always 36 and
18, respectively
The input data must have sufficient guard bits to avoid overflow
The twid_coef array that is passed to the routine must have
9x9 cosine and sine coefficients interleaved in the following
manner:
i = 0;
for k = 0 to 8
for n = 0 to 8
twid_coef[i] = (1/sqrt(2))*cos(PI/18*k*(2*n + 1));
twid_coef[i+1] = (1/sqrt(2))*sin(PI/18*(k + 1)*(2*n
+ 1));
i = i + 2;
end
end
Registers used : A0, A1, R0-R3, R6, R7, I0-I3, M0, M1, B0, L0-L3, P0-P2, P5,
LC0.
Performance :
Code size : 490 bytes
Cycle count : 239 cycles
*******************************************************************************/
.section L1_code;
.global __mdct36;
.align 8;
/*
 * __mdct36 : 16-bit, 36-point MDCT (see the algorithm description in the
 *            file header).
 *
 * Inputs, as consumed by the code below:
 *   R0 = input pointer      (36 fract16 values, read two at a time as
 *                            32-bit words)
 *   R1 = output pointer     (18 fract16 values are written)
 *   R2 = pointer to the interleaved 9x9 cosine/sine table described in
 *        the file header
 *
 * R7, R6 and P5 are saved on entry and restored on exit.  L0-L3 are zero on
 * exit.  B0, M0, M1, I0-I3, P0-P2, R0-R3, A0, A1 and the loop-0 registers
 * are modified and not restored.
 *
 * Scratch frame
 * -------------
 * All temporary data is addressed relative to one pointer, BASE.  The
 * routine touches every word from BASE-40 up to and including BASE+40:
 *
 *   BASE-40 .. BASE-8  : 9 words.  First the W1/W2 twiddle table (low half
 *                        = cosine, high half = sine), then overwritten in
 *                        place by the packed twiddle results (low half = a,
 *                        high half = b).  Read as a circular buffer by the
 *                        DCT/DST stage (B0 = BASE-40, L0 = 36).
 *   BASE-4             : priming store of the first stage; later the first
 *                        DCT/DST output word.
 *   BASE   .. BASE+12  : packed sums (d) from the first stage.
 *   BASE+24.. BASE+36  : packed differences (c) from the first stage.
 *   BASE-4 .. BASE+28  : 9 packed DCT/DST output words (low half = DCT
 *                        result, high half = DST result), written after the
 *                        first-stage data has been consumed.
 *   BASE+32, BASE+40   : only touched by priming/draining accesses of the
 *                        software-pipelined loops.
 *
 * The frame is therefore 84 bytes with BASE = SP + 40.  Reserving only
 * 40 bytes with BASE = SP would leave the twiddle table and part of the
 * results below SP, where anything that pushes on the same stack while the
 * routine runs can overwrite them, and would make the first-stage priming
 * store at BASE+40 land on the most recently pushed saved register.
 * The larger frame costs a few extra instructions, so the code size and
 * cycle count quoted in the file header no longer apply exactly.
 *
 * NOTE(review): two accesses outside the caller's buffers remain -
 *   - the halfword just before the output buffer (output - 2) is read and
 *     then written back unchanged by the first pass of the last-stage loop;
 *   - the DCT/DST stage performs 82 word loads through P5, i.e. it reads
 *     one 32-bit word past an 81-word coefficient table (value unused).
 * Confirm that both are acceptable for every caller.
 */
__mdct36:
    [--SP] = (R7:6, P5:5);      // Save the call-preserved registers used below
    L0 = 0;                     // Linear addressing for I0-I3 to start with
    L1 = 0;
    L2 = 0;
    L3 = 0;
    SP += -44;                  // Reserve the 84-byte scratch frame in two
    SP += -40;                  // steps (add-immediate takes a 7-bit constant)
    P2 = SP;
    P2 += 40;                   // P2 = BASE (middle of the scratch frame)
    I2 = P2;                    // Temporary array in stack (sums, ascending)
    I3 = P2;                    // Temporary array in stack (differences)
    I0 = R0;                    // Input pointer
    I1 = R0;
    P1 = R1;                    // Output pointer
    P5 = R2;                    // 9 point cosine-sine coefficient array pointer
    M0 = 36;                    // 36 bytes = 18 input samples; also 9 words
    M1 = -32 (X);
    I3 += M0;                   // I3 = BASE + 36
/********************** PUSH COEFFICIENTS IN STACK ****************************/
    // Build the twiddle table at BASE-8 down to BASE-40.  Each word holds
    // W1 = cos(m*pi/72) in the low half and W2 = sin(m*pi/72) in the high
    // half, for m = 1, 3, ..., 17, so that reading upwards from BASE-40
    // yields m = 17, 15, ..., 1.
    P2 += -8;
    R0.L = 0x7FE0;
    R0.H = 0x0596;
    [P2--] = R0;
    R0.L = 0x7EE7;
    R0.H = 0x10B5;
    [P2--] = R0;
    R0.L = 0x7CF7;
    R0.H = 0x1BB5;
    [P2--] = R0;
    R0.L = 0x7A13;
    R0.H = 0x267E;
    [P2--] = R0;
    R0.L = 0x7641;
    R0.H = 0x30FC;
    [P2--] = R0;
    R0.L = 0x7189;
    R0.H = 0x3B1B;
    [P2--] = R0;
    R0.L = 0x6BF4;
    R0.H = 0x44C6;
    [P2--] = R0;
    R0.L = 0x658C;
    R0.H = 0x4DEC;
    [P2--] = R0;
    R0.L = 0x5E5F;
    R0.H = 0x567A;
    [P2] = R0;                  // P2 = BASE - 40, start of the table
/*********************** FIRST STAGE *************************************/
    // Software-pipelined: the loads below prime the loop, and the first pass
    // of the loop stores the (not yet computed) R6/R7 to BASE-4 and BASE+40.
    P0 = 4;                     // 4 passes, two samples of each kind per pass
    R0 = [I0 ++ M0] || I1 -= M1;    // R0 = x(0),x(1); I0 -> x(18); I1 -> x(16)
    R2 = [I0 ++ M1] || I2 -= 4;     // R2 = x(18),x(19); I0 -> x(2); I2 = BASE-4
    R1 = [I1 ++ M0] || I3 += 4;     // R1 = x(16),x(17); I1 -> x(34); I3 = BASE+40
    LSETUP(ST_FIRST_STAGE, END_FIRST_STAGE) LC0 = P0;
ST_FIRST_STAGE:
    // Swap the halves of the words read from the descending end so that they
    // line up with the ascending words; store the previous pass's results.
    R1 = PACK(R1.L, R1.H) || R3 = [I1 ++ M1] || [I2++] = R6;
    R3 = PACK(R3.L, R3.H) || [I3--] = R7 || I1 -= 4;
    // d1(i) = x(i) - x(17-i);  c1(8-i) = d1(i);
    R6 = R0 -|- R1 (S) || R0 = [I0 ++ M0] || I1 -= 4;
    // d2(i) = x(i+18) + x(35-i);  c2(8-i) = d2(i);
    R7 = R2 +|+ R3 (S) || R2 = [I0 ++ M1];
END_FIRST_STAGE:
    // d = d1 + d2 (R6);  c = c1 - c2 (R7); fetch the next descending pair
    R6 = R6 +|+ R7, R7 = R6 -|- R7 (S) || R1 = [I1 ++ M0];
    // Ninth sample of each kind: R0 = x(8),x(9) and R2 = x(26),x(27) are
    // left over from the last pass.
    I0 = P2;                    // I0 = BASE-40: twiddle results go here
    R1.L = R0.L - R0.H (S) || [I2--] = R6;      // d1(8); store last sum word
    R1.H = R2.L + R2.H (S) || [I3++] = R7;      // d2(8); store last difference word
    R0.H = R1.L + R1.H (S) || R3 = [P2++];      // d(8); fetch first W1/W2 pair
    R0.L = R1.L - R1.H (S);                     // c(0)
/************************ TWIDDLE FACTOR MULTIPLICATION **********************/
    // a(8-i) = c(i)*W1(i) - d(8-i)*W2(i);   (low half of R1, via A0)
    // b(8-i) = c(i)*W2(i) + d(8-i)*W1(i);   (high half of R1, via A1)
    // The first product pair is computed ahead of the loop; each pass then
    // handles the two sample pairs held in R6 (sums) and R7 (differences),
    // writes results over table entries that have already been read, and
    // reloads R6/R7 from the first-stage arrays.  The reloads of the last
    // pass (from BASE-4 and BASE+40) are not used.
    LSETUP(ST_TWID_MULT, END_TWID_MULT) LC0 = P0;
    A1 = R0.L*R3.H, A0 = R0.L*R3.L;
    R1.H = (A1 += R0.H*R3.L), R1.L = (A0 -= R0.H*R3.H);
ST_TWID_MULT:
    R0 = PACK(R6.H, R7.H) || [I0++] = R1;       // high halves: R0.H = d, R0.L = c
    R2 = PACK(R6.L, R7.L) || R3 = [P2++];       // low halves:  R2.H = d, R2.L = c
    A1 = R0.L*R3.H, A0 = R0.L*R3.L || R7 = [P2++];      // R7 now holds a W1/W2 pair
    R1.H = (A1 += R0.H*R3.L), R1.L = (A0 -= R0.H*R3.H) || R6 = [I2--];
    A1 = R2.L*R7.H, A0 = R2.L*R7.L || [I0++] = R1;
END_TWID_MULT:
    R1.H = (A1 += R2.H*R7.L), R1.L = (A0 -= R2.H*R7.H) || R7 = [I3++];
    P2 += -36;                  // P2 = BASE-40 again (start of the a/b array)
    I2 += 4;                    // I2 = BASE-4: DCT/DST results are stored from here
/*********************** 9 POINT DCT AND DST *******************************/
    // Clear both accumulators, fetch the first coefficient from the
    // twid_coef array and store the ninth twiddle result (kept in R1).
    A1 = A0 = 0 || R3 = [P5++] || [I0--] = R1;
    L0 = 36;                    // Make the DCT/DST input array circular
    B0 = P2;
    P0 = 9;
    LSETUP(ST_OUT_LOOP, END_OUT_LOOP) LC0 = P0;
    // DCT in A0 and DST in A1.  Each pass accumulates nine products; every
    // line also fetches the next a/b word (I0 wraps around the 9-word buffer,
    // so R1 is back at the starting element after nine loads) and the next
    // coefficient word.
ST_OUT_LOOP:
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
    R0.H = (A1 += R1.H*R3.H), R0.L = (A0 += R1.L*R3.L)|| R1 = [I0--]
    || R3 = [P5++];
END_OUT_LOOP:
    // Store y1(i) and y2(8-i) as one word and restart the accumulators
    A1 = A0 = 0 || [I2++] = R0;
/************************** LAST STAGE **********************************/
    // Combine the DCT/DST results into the 18 outputs listed under
    // "Last stage" in the file header.  The loop stores the outputs computed
    // one step earlier, so its very first store is a dummy: it targets the
    // halfword in front of the output buffer and writes back the value read
    // from there just below.
    I1 = P1;
    I1 -= 2;                    // Start one halfword early for the dummy store
    L0 = 0;                     // Make the buffer linear
    I2 -= M0;                   // Back to the first result word (BASE-4)
    R3.L = W[I1]||R0 = [I2++];  // Value for the dummy store; R0 = y1(0),y2(8)
    R3 = PACK(R0.L, R3.L) || R1 = [I2++];       // R3.H = y(0) = y1(0)
    LSETUP(ST_OUT_COPY, END_OUT_COPY) LC0 = P0 >> 1;    // 4 passes, 4 outputs each
ST_OUT_COPY:
    // Compute output and store previous output
    R2.L = R1.L + R0.H (S) || W[I1++] = R3.L;
    R2.H = R1.L - R0.H (S) || W[I1++] = R3.H;
    R2 = -R2 (V) || R0 = [I2++];                // negated pair, e.g. y(1), y(2)
    R3.L = R0.L + R1.H (S) || W[I1++] = R2.L;
END_OUT_COPY:
    R3.H = R0.L - R1.H (S) || R1 = [I2++] || W[I1++] = R2.H;
    R0 = -R0 (V) || W[I1++] = R3.L;             // R0.H = -y2(0); store y(15)
    W[I1++] = R3.H;                             // y(16)
    W[I1++] = R0.H;             // Store last output, y(17)
    SP += 44;                   // Release the 84-byte scratch frame
    SP += 40;
    (R7:6, P5:5) = [SP++];      // Retrieve call preserved register contents
    RTS;
    NOP;                        //to avoid one stall if LINK or UNLINK happens to be
                                //the next instruction after RTS in the memory.
__mdct36.end:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -