📄 fdct.asm

📁 BF533上移植Xvid编解码中的DCT部分（AD汇编）
💻 ASM
字号:

	.section data1;

/*
 * Cosine table used by _r8x8dct, stored as 16-bit fractional (1.15) values,
 * i.e. round-down of cos(x) * 32768:
 *   0x5a82 = cos(4*pi/16)   0x5a82 = cos(4*pi/16)
 *   0x30fb = cos(6*pi/16)   0x7641 = cos(2*pi/16)
 *   0x18f8 = cos(7*pi/16)   0x7d8a = cos(1*pi/16)
 *   0x471c = cos(5*pi/16)   0x6a6d = cos(3*pi/16)
 * _r8x8dct fetches the table two entries at a time with 32-bit loads
 * (R7 = [I3++]), so the table has to start on a 4-byte boundary; a 32-bit
 * load from a 2-byte-aligned address is a misaligned access on Blackfin.
 */
	.align 4;
	.byte2 _coeff[8] = {0x5a82,0x5a82,0x30fb,0x7641,0x18f8,0x7d8a,0x471c,0x6a6d};

/*
 * Scratch 8x8 matrix of 16-bit values handed to _r8x8dct as its temporary
 * buffer. It is also accessed with 32-bit loads (R7 = [P1++P2] on the first
 * pass, R0 = [I0++] / R2 = [I2++] on the second pass), hence the explicit
 * 4-byte alignment. Being a single static buffer, it is shared by every
 * caller of _fdct_asm.
 */
	.align 4;
	.byte2 _temp_dct[64];
	
	.section program;
	.global _fdct_asm;

/*
 * _fdct_asm -- entry point that runs _r8x8dct with this file's static
 *              coefficient table and scratch matrix.
 *
 * In:   R0 is not touched here, so whatever the caller placed in it is
 *       handed straight to _r8x8dct (which takes it as the pointer to the
 *       input matrix).
 * Regs: R1-R7 and P0-P5 are pushed on entry and popped before returning;
 *       LINK/UNLINK keep RETS and FP intact across the nested CALL.
 *       R0 is returned as _r8x8dct leaves it.
 */
_fdct_asm:
	LINK 0;
	[--SP] = (R7:1, P5:0);          // save R1-R7 and P0-P5

	R2.H = _temp_dct;               // R2 = address of the scratch matrix
	R2.L = _temp_dct;
	R1.H = _coeff;                  // R1 = address of the cosine table
	R1.L = _coeff;

	CALL _r8x8dct;

	(R7:1, P5:0) = [SP++];          // restore in the reverse order
	UNLINK;
	RTS;
_fdct_asm.end:





/*
 * _r8x8dct -- 8x8 forward DCT on 16-bit data, done as two row passes that
 *             each store their result transposed.
 *
 * Inputs (as consumed right after the prologue):
 *   R0  pointer to the 8x8 input matrix. The second pass writes the final
 *       result back into this buffer.
 *   R1  pointer to the coefficient table. 16 bytes are read circularly
 *       (L3 = 16) as four 32-bit words per row.
 *   R2  pointer to a temporary 8x8 matrix (output of pass 1, input of
 *       pass 2).
 *
 * Registers: R1-R7, P0-P5, LC0/LT0/LB0, LC1/LT1/LB1, I0-I3, B0-B3, L0-L3,
 *   M0-M3, A0 and A1 are pushed in the prologue and popped in the epilogue.
 *   LINK/UNLINK save and restore RETS and FP. R0 is not saved and is used as
 *   a working register.
 *
 * NOTE(review): all three buffers are accessed with 32-bit loads
 *   ([I0++], [I2++], [I3++], [P1++P2]), so they presumably all have to be
 *   4-byte aligned -- confirm at the call sites.
 * NOTE(review): the loop is software pipelined -- the loads for the next row
 *   are issued while the current row is being finished. On the 8th row this
 *   appears to read up to 16 bytes past the end of the buffer being read
 *   (values are discarded). Confirm that memory is readable.
 * NOTE(review): W[I0] = R3.L at ROW_START stores into the buffer that is
 *   being read (byte offset 8 of the current row, which has already been
 *   loaded), so the input matrix is modified during pass 1 as well.
 */
.global     _r8x8dct;
.align      8;
    
_r8x8dct:

/******************************* Function Prologue ***************************/
     link 0;  // Save RETS/FP, then push every register this routine uses.
    [ -- sp ] = (r7:1, p5:0);
    [--SP]=lc0;
	[--SP]=lt0;
	[--SP]=lb0;
	[--SP]=lc1;
	[--SP]=lt1;
	[--SP]=lb1;
	
	[-- sp ] = i0; [-- sp ] = i1; [-- sp ] = i2; [-- sp ] = i3;
	[-- sp ] = b0; [-- sp ] = b1; [-- sp ] = b2; [-- sp ] = b3;
	[-- sp ] = l0; [-- sp ] = l1; [-- sp ] = l2; [-- sp ] = l3;
	[-- sp ] = m0; [-- sp ] = m1; [-- sp ] = m2; [-- sp ] = m3;
	
	[--sp]=A0.x;[--sp]=A0.W;[--sp]=A1.X;[--sp]=A1.W;
	
	
    B0 = R0;                //Pointer to Input matrix.
    B3 = R1;                //Pointer to Coefficients.
    B2 = R2;                //Pointer to Temporary matrix.
    L0 = 0;                 //L0-L2 = 0: I0-I2 address linearly
    L1 = 0;                 //(no circular wrap).
    L2 = 0;
    L3 = 16;                //L3 = 16 makes the coefficient array (base B3)
                            //circular for I3.
/*
 I0, I1 and I2 are used to read the input data. I3 is used to read the
 coefficients. P0 and P1 are used for writing the output data.
*/  
    
    M0 = 12 (X);            // Byte offsets used to step the pointers:
                            // M0 = 12: from element 0 to element 6 of a row.
    M1 = 16 (X);            // M1 = 16: one row (8 elements of 2 bytes).
    P2 = 16;                // P2 = 16: one output row; also loop count source.
    P3 = 32 (X);            // P3 = 32: two output rows.
    P4 = -110 (X);          // P0 per row: 3*32 + 16 - 110 = +2 (next column).
    P5 = -62 (X);           // P1 per row: 2*32 - 62 = +2 (next column).
    P0 = 2;                 // Outer loop count (two passes).
/*
   According to Chen's algorithm, first an 8-point DCT is calculated for all
   8 rows. The output of this calculation is stored transposed in another
   matrix. Then the 8-point DCT is applied again to all 8 rows of that matrix
   and the output is again stored transposed. This is the final output.
   Therefore a loop of 2 iterations (DCT_START, DCT_END) is set up.

   In the first iteration B0 points to the "in" buffer and B2 points to the
   "temp" buffer: input is read from "in" and output is written to "temp".
   In the second iteration B0 points to "temp" and B2 points to "in": input
   is read from "temp" and output is written to "in", so "in" holds the
   final output.
*/
    
    LSETUP (DCT_START, DCT_END) LC0 = P0;
DCT_START:
        I0 = B0;            //I0 points to Input Element (0, 0)
        I1 = B0;
        I1 += M0  || R0 = [I0++];
                            //I1 points to Input Element (0, 6).
                            //Elements 1 and 0 are read into R0.
        I2 = I1;
        I2 -= 4   || R3.H = W[I1++];
                            //I2 points to Input Element (0, 4).
                            //Element 6 is read into R3.H.
    
        I3 = B3;            //I3 points to Coefficients
        P0 = B2;            //P0 points to temporary array Element (0, 0)
        P1 = B2;            //P1 points to temporary array
        R7 = [P1++P2] || R2 = [I2++];
        //R7 = [P1] (a load through P1), then P1 += P2.
                            //P1 now points to temporary array Element (1, 0).
                            //The R7 load is a dummy read. Elements 4 and 5
                            //are read into R2.
                            //R7 is overwritten with cosine values later on.
        R3.L = W[I1--];     //Element 7 is read in R3.L
        R1.H = W[I0++];     //Element 2 is read in R1.H
    
//******************************* Implementation of Part 1 ********************
/*
  All the additions from Stage 1 and Stage 2 are implemented in Part 1.
  For the first row this is done here, outside the row loop; for the
  following rows it is done at the bottom of the loop body.
  The following instruction does the following job.
  Element 0 = (Element 0 + Element 7) / 2.
  Element 1 = (Element 1 + Element 6) / 2.
  Element 6 = (Element 1 - Element 6) / 2.
  Element 7 = (Element 0 - Element 7) / 2.
  It also reads Element 3 into R1.L.
*/
    
        R0 = R0 +|+ R3, R3 = R0 -|- R3 (ASR) || R1.L = W[I0++] || NOP;
    	//ASR: arithmetic shift right, i.e. the results are divided by 2.
    	//Note: here R0 holds Element 1 (high half) and Element 0 (low half),
    	//and R3 holds Elements 6 and 7.
/*
 This single instruction does the following job.
 Element 2 = (Element 2 + Element 5) / 2.
 Element 3 = (Element 3 + Element 4) / 2.
 Element 4 = (Element 3 - Element 4) / 2.
 Element 5 = (Element 2 - Element 5) / 2.
 It reads the first coefficient word, C4 = cos(4*pi/16), into register R7.
*/
    
        R1 = R1 +|+ R2, R2 = R1 -|- R2 (ASR, CO) || NOP ||  R7 = [I3++];
        //CO: the high and low 16-bit results swap places in the destination
        //register, i.e. Elements 4 and 5 change position.
    
/*
   At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
   R3 has (6,7). The notation (x, y) means the element from column x is in
   the upper half of the register and the element from column y is in the
   lower half of the register.
*/
    
//******************************* Implementation of Part 2 *********************
/*
   The following addition/subtraction instruction does - 
   Element 0 = Element 0 + Element 3.
   Element 1 = Element 1 + Element 2.
   Element 2 = Element 1 - Element 2.
   Element 3 = Element 0 - Element 3.
*/
        R0 = R0 +|+ R1, R1 = R0 -|- R1;
    
        LSETUP (ROW_START, ROW_END) LC1 = P2 >> 1;//LC1 = 16 >> 1 = 8
                            //The loop is set for 8 rows. 
ROW_START:
/*
   This is part 2 computation continued.....
   The following two instructions do -
   A1 = Element 6 * cos(pi/4) 
   A0 = Element 6 * cos(pi/4)
   A1 = A1 - Element 5 * cos(pi/4)
   A0 = A0 + Element 5 * cos(pi/4).
   W[I0] = R3.L stores R3.L to memory through I0 so that it can be loaded
   back into R2.L by the R2.L = W[I0] load further down (packing).
   I1 += M1 and I2 += M0 move the read pointers on for the next row.
*/
            A1 = R3.H * R7.L, A0 = R3.H * R7.L  ||  I1 += M1 || W[I0] = R3.L;
            R4.H = ( A1 -= R2.L * R7.L), R4.L = (A0 += R2.L * R7.L) || I2 += M0;
/*
   At the end of stage 2 R0 has (1,0), R1 has (2,3), R4 has (5, 6).
*/
//************************ Implementation of Part 3 ***************************
/*
   The following two instructions do the job of stage 3 -
   A1 = Element 0 * cos(pi/4) 
   A0 = Element 0 * cos(pi/4)
   A1 = A1 - Element 1 * cos(pi/4)
   A0 = A0 + Element 1 * cos(pi/4)
   The coefficients C2 and C6 are read into register R7.
   R3.H is loaded through I1, which was advanced by M1 (one row) above.
*/      
            A1 = R0.L * R7.L, A0 = R0.L * R7.L ||  NOP  || R3.H = W[I1++];
            R5.H = (A1 -= R0.H * R7.L), R5.L = (A0 += R0.H * R7.L)
            || R7 = [I3++];
    
/*
   The following three instructions do -
   A1 = Element 2 * cos(3pi/8) 
   A0 = Element 3 * cos(3pi/8)
   A1 = A1 + Element 3 * cos(pi/8)
   A0 = A0 - Element 2 * cos(pi/8)
   R2.L is loaded back from the location written at ROW_START, and R3.L is
   loaded through I1 (an input element, not a coefficient).
   The coefficients C7 and C1 are read into register R7.
   Element 4 = Element 4 + Element 5.
   Element 5 = Element 4 - Element 5.
   Element 6 = Element 7 - Element 6.
   Element 7 = Element 7 + Element 6.
   R5.L is written out through P0. I0 is advanced by 4 twice (8 bytes).
*/
            A1 = R1.H * R7.L, A0 = R1.L * R7.L ||  W[P0++P3] = R5.L
            || R2.L = W[I0];
            R2 = R2 +|+ R4, R4 = R2 -|- R4  ||  I0 += 4 || R3.L = W[I1--];
            R6.H = (A1 += R1.L * R7.H), R6.L = (A0 -= R1.H * R7.H) ||  I0 += 4
            || R7 = [I3++];
    
/*
   At the end of part 3 R2 has (4, 7), R4 has (5,6), R5 has (1, 0) and
   R6 has (2,3).
*/
//****************************** Implementation of Part 4 **********************
/*
   The following two instructions do -
   A1 = Element 4 * cos(7pi/16) 
   A0 = Element 7 * cos(7pi/16)
   A1 = A1 + Element 7 * cos(pi/16)
   A0 = A0 - Element 4 * cos(pi/16)
   The next coefficients are read into R7, R6.H and R5.H are written out
   through P0, and R0 is loaded through I0 for the next row.
*/
    
            A1 = R2.H * R7.L, A0 = R2.L * R7.L ||  W[P0++P3] = R6.H
            || R0 = [I0++];
            R2.H = ( A1 += R2.L * R7.H), R2.L = ( A0 -= R2.H * R7.H)
            || W[P0++P3] = R5.H || R7 = [I3++]; 
/*
   The following two instructions do -
   A1 = Element 5 * cos(3pi/16) 
   A0 = Element 6 * cos(3pi/16)
   A1 = A1 + Element 6 * cos(5pi/16)
   A0 = A0 - Element 5 * cos(5pi/16)
   R6.L and R2.L are written out through P0 (the P4 step brings P0 to the
   next output column), and R1 is loaded through I0 for the next row.
*/
    
            A1 = R4.H * R7.H, A0 = R4.L *R7.H ||  W[P0++P2] = R6.L 
            || R1.H = W[I0++]; 
            R4.H = (A1 += R4.L * R7.L), R4.L = ( A0 -= R4.H * R7.L)
            ||  W[P0++P4] = R2.L || R1.L = W[I0++];      
    
//******************* Implementation of Part 1 *****************************
/*
  This is the same computation as Part 1 above. The first time, Part 1 is
  done outside the loop; afterwards it is done here. It serves two purposes:
  first, it computes Part 1 (and the first add/subtract of Part 2) for the
  next row; second, it writes R2.H, R4.L and R4.H out through P1 (the P5 step
  brings P1 to the next output column).
  R7 = [I3++] wraps around to the first coefficient word again because I3 is
  circular over 16 bytes.
*/
            R0 = R0 +|+ R3, R3 = R0 -|- R3 (ASR) || W[P1++P3] = R2.H
            || R2 = [I2++];
            R1 = R1 +|+ R2, R2 = R1 -|- R2 (ASR, CO) ||  W[P1++P3] = R4.L
            || R7 = [I3++]; 
ROW_END:    R0 = R0 +|+ R1, R1 = R0 -|- R1 ||  W[P1++P5] = R4.H || NOP;
        B1 = B0;            //Swapping of Input and output address pointers
        B0 = B2;            //B0 points to input buffer.
DCT_END:B2 = B1;            //B2 points to output buffer.
    
TERMINATE:
    // Epilogue: pop everything in the exact reverse order of the prologue.
    A1.W=[sp++]; A1.X=[sp++]; A0.W=[sp++];A0.X=[sp++];

	m3=[sp++];m2=[sp++];m1=[sp++];m0=[sp++];
	l3=[sp++];l2=[sp++];l1=[sp++];l0=[sp++];
	b3=[sp++];b2=[sp++];b1=[sp++];b0=[sp++];
	i3=[sp++];i2=[sp++];i1=[sp++];i0=[sp++];

    lb1=[SP++];
	lt1=[SP++];
	lc1=[SP++];
	
	lb0=[SP++];
	lt0=[SP++];
	lc0=[SP++];

	(r7:1, p5:0) = [ sp ++ ];
	unlink;
	RTS;               //Return.
    NOP;                    //to avoid one stall if LINK or UNLINK happens to be
                            //the next instruction after RTS in the memory.
_r8x8dct.end:
💿 文件大小 17 K
👤 上传用户 pipinooad
📂 所属分类 DSP编程
📄 代码行数 273 行
💻 语言类型 ASM
🏷️ 相关标签

#Xvid #533 #DCT #BF
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -