⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 halfcopy.asm

📁 adi bf533视频编码程序
💻 ASM
📖 第 1 页 / 共 2 页
字号:

.section L1_code;
.global _xhCopyBlockHalfpel_MPEG4_8u;
.global _xhCopyMBHalfpel_MPEG4_8u;
/**********************************************************************
ippiCopyBlockHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep, Ipp8u*
  pDst, int dstStep, const IppMotionVector* pMV, int roundControl);

 Purpose:
   Writes 8 rows of 8 bytes to pDst, reading from pSrc displaced by the
   motion vector *pMV.  The integer part of the vector (dx>>1, dy>>1)
   moves the source pointer; bit 0 of dx and of dy selects one of four
   code paths (see the dispatch below).

 Arguments as read by the code:
   r0        pSrc
   r1        srcStep
   r2        pDst
   [sp+12]   dstStep  (offset on entry, before the pushes below)
   [sp+16]   pMV      (offset on entry); two 16-bit fields dx, dy
   roundControl is never read: the roundControl argument is not used.

 Notes:
   - Mind byte alignment for the values of I0 and I1, especially the
     value of pstart (the displaced source pointer).
   - r7:4, p5:3, i0-i3, l0-l3 and rets are saved and restored.
     b0 and m0-m3 are overwritten and not restored.
   - NOTE(review): every loop preloads the first word(s) of the next
     source row at the bottom of the iteration, so the last iteration
     appears to read (not write) past the 8 rows - confirm the source
     buffer has that slack.

 performance:
                       ASM         C
              cycle:   264         5820
************************************************************************/  
_xhCopyBlockHalfpel_MPEG4_8u:

                // Prologue: 7 + 9 = 16 words (64 bytes) are pushed, so the
                // stack arguments are read at their entry offset + 64 below.
                [--sp]=(r7:4,p5:3);
                [--sp]=i0;
                [--sp]=l0;
                [--sp]=i1;
                [--sp]=l1;
                [--sp]=i2;
                [--sp]=l2; 
                [--sp]=i3;
                [--sp]=l3;
                [--sp]=rets;
                
                b0=r1;//srcstep, parked in b0 and copied back into r4 below
                // Clear all four length registers.
                // NOTE(review): presumably to select linear (non-circular)
                // addressing for i0-i3 - confirm against the Blackfin reference.
                l2=0;
                l1=0;
                l0=0;
                l3=0;
                r7=[sp+16+64];//pMV (fifth argument)
                
                i3=r7;
                r7.l=w[i3++];//pMV->dx
                r6.l=w[i3++];//pMV->dy
                r7=r7.l(x);//sign-extend dx to 32 bits
                r6=r6.l(x);//sign-extend dy to 32 bits
                r5=r7>>>1;//mv_off_x = dx >> 1 (arithmetic shift)
                r4=r6>>>1;//mv_off_y = dy >> 1 (arithmetic shift)
                r4=r4.l*r1.l(is);//mv_off_y*srcstep; only the low 16 bits of each operand are multiplied
                r4=r4+r5;//mv_off_y*srcstep+mv_off_x
                r0=r0+r4;//point to pstart
                
                i3=r2; //pDst
                i0=r0;
                i1=r0;//psrc
                r4=b0;//srcstep
                r5=[sp+12+64];//dststep (fourth argument)
                p0=8;//loop count: 8 rows
                
                
 /********************************************/               
                // Dispatch on bit 0 of dx (r7) and dy (r6):
                //   dx odd            -> HF_HH8 (then dy odd -> HH8)
                //   dx even, dy odd   -> FH8
                //   dx even, dy even  -> fall through to the ff_8x8 loop
                // NOTE(review): label names suggest F = full-pel, H = half-pel,
                // first letter horizontal, second vertical.
                cc=bittst(r7,0);//mv_off_x
                if cc jump HF_HH8;
                cc=bittst(r6,0);
                if cc jump FH8;
              
              // ---- dx even, dy even ----
              // i0 and i1 address the same data, so BYTEOP1P combines each
              // byte with itself.
              r5+=-4;
              r4+=-8;
              m0=r4;//src step modify: srcstep-8 (two 4-byte post-increments complete the row step)
              m1=r5;//dst step  modify: dststep-4 (one 4-byte post-increment completes the row step)
              i1=i0;//i1 already equals i0 here (both were loaded from r0 above)
             lsetup (ff_8x8_start,ff_8x8_end)lc0=p0;
                // Priming load before the loop body: first word of row 0.
                disalgnexcpt||r0=[i0++]||r2=[i1++];
      
              ff_8x8_start:
                   // second word of the row
                   disalgnexcpt||r1=[i0++]||r3=[i1++];
                   // first output word; also load the third word and step to the next row
                   r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];
                   // second output word; store the first; preload next row (i0 side).
                   // NOTE(review): (r) presumably reverses the register-pair order so the
                   // freshly loaded r0/r2 act as the high word - confirm.
                   r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];     
              ff_8x8_end:
                     // store the second word, step dst to the next row; preload next row (i1 side)
                     disalgnexcpt||[i3++m1]=r7||r2=[i1++];
                            
             jump halfcopyblock_end;
             
            // ---- dx even, dy odd ----
            // Same loop shape as ff_8x8, but i1 starts one source row below i0.
            FH8:
               m1=r4;//src step       
               r4+=-8;
               m0=r4;//src modify: srcstep-8
               r5+=-4;
               m2=r5; //dst modify: dststep-4
               lsetup (fh_8x8_start,fh_8x8_end)lc0=p0;
                   i1=i0;
                   i1+=m1;//i1 = i0 + srcstep
                  // priming load: first word of row 0 (i0) and of row 1 (i1)
                  disalgnexcpt||r0=[i0++]||r2=[i1++];
               fh_8x8_start:
                   disalgnexcpt||r1=[i0++]||r3=[i1++]; 
                   r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i1++m0];
                   r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];       
               fh_8x8_end:
                   disalgnexcpt||[i3++m2]=r7||r2=[i1++];         
               jump  halfcopyblock_end;
                
             // ---- dx odd ----
             HF_HH8:
                 cc=bittst(r6,0);
                 if cc jump HH8;//dy odd as well
                 
                 // ---- dx odd, dy even ----
                 // i1 is set to i0+1 but is never used as a load pointer in this
                 // path; both operand streams are loaded through i0 and i2.
                 // NOTE(review): presumably BYTEOP1P takes the byte alignment of
                 // its second operand from i1 - confirm against the Blackfin reference.
                 m1=1;//to make the align8    
                 r4+=-8;
                 m0=r4;//srcstep-8
                 i1=i0;
                 i1+=m1;//i1 = i0 + 1
                 r5+=-4;
                 m3=r5;//dststep-4
                 // This part handles the case where the low two bits of I0 are
                 // both 1. After I0+1 those two bits are all 0, so the register
                 // selected is no longer R3 but R2; that case is therefore
                 // handled separately (byte_align_HF).
                 r6=3;
                 r7=i0;
                 r7=r7&r6;
                 cc=r7==r6;
                 if cc jump byte_align_HF;
                lsetup(hf_8x8_start,hf_8x8_end)lc0=p0;
                i2=i0;//second stream reads the same addresses as i0
                disalgnexcpt||r0=[i0++]||r2=[i2++];
                hf_8x8_start:
 
                   
                   disalgnexcpt||r1=[i0++]||r3=[i2++];
                   r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];
                   r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
                   
              hf_8x8_end:     
                   disalgnexcpt||[i3++m3]=r7||r2=[i2++];                 
                jump halfcopyblock_end;
                
             byte_align_HF: //special case: (i0 & 3) == 3
                 lsetup(bytealign_start,bytealign_end) lc0=p0;
                 i2=i0;
                 i2+=4;//second stream starts 4 bytes after i0
                  disalgnexcpt||r0=[i0++]||r2=[i2++];
                
                bytealign_start:
                    disalgnexcpt||r1=[i0++]||r3=[i2++];
                    r6=byteop1p(r1:0,r3:2)||r0=[i0++m0]||r2=[i2++m0];       
                    r7=byteop1p(r1:0,r3:2)(r)||[i3++]=r6||r0=[i0++];
                bytealign_end:
                    disalgnexcpt||[i3++m3]=r7||r2=[i2++];
                    //jump halfcopyblock_end;
                   /*
                    Disabled alternative implementation, kept for reference.
                    
    //R0 = P5;                // Address of the best match
    //I0 = R0;                // Address of best match
    //R0 += -1;
    //I1 = R0;                // Address of best match - 1
    //I3 = SP;                // Output buffer
                      // Address of the best match
                    // Address of best match
    R0 += 1;                // Address of best match - 1
    I0 = R0;                
    P2 = 8 (Z);            
    R5 +=-4;
    M3 = R5;               
    R4 += -12;                
    M1 = R4;
    MNOP;

    LSETUP(AVG2_LR_ST, AVG2_LR_END) LC0 = P2;
    DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
AVG2_LR_ST:
        DISALGNEXCPT || R1 = [I0++] || R3  =[I1++]; 
        R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2  =[I1++]; 
        R6 = BYTEOP1P(R1:0,R3:2)(R) || [I3++] = R6 || R1 = [I0++M1];
        DISALGNEXCPT  || [I3++M3] = R6 || R3  =[I1++M1];                          
        //R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++];
AVG2_LR_END:
        DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
               */
        jump halfcopyblock_end;
               
                    
              // ---- dx odd, dy odd ----
              // r0 (pstart), r4 (srcstep) and r5 (dststep) are still unmodified
              // when control arrives here.
              HH8:
              /*
               Disabled earlier implementation of this path, kept for reference.
               r5+=-4;
               m3=r5; 
               m1=1;
               m2=r4;//src step
               r4+=-8;
               m0=r4;
               i1=i0;
                 r6=3;       //This part handles the case where the low
                 r7=i0;      //two bits of I0 are both 1: after I0+1 those
                 r7=r7&r6;   //bits are all 0, so the register selected is
                 cc=r7==r6;  //not R3 but R2; hence handled separately.
               if cc jump byte_align_HH;

              // p1=2;
               lsetup(hh_8x8_start,hh_8x8_end)lc0=p0;          
                    i2=i0;
                    i2+=m2;
                    disalgnexcpt||r0=[i0++]||r2=[i2++];
               hh_8x8_start:
                    
                    disalgnexcpt||r1=[i0++]||r3=[i2++];
            
                   
                    r6=byteop2p(r1:0,r3:2)(rndl);
                    i0+=m1;
                    i1+=m1;
                    r7=byteop2p(r1:0,r3:2)(rndh);
                    i0-=m1;
                    i1-=m1;
                    r7=r7+r6;
                    disalgnexcpt||r0=[i0++m0];
                    disalgnexcpt||[i3++]=r7||r2=[i2++m0];
                    r6=byteop2p(r1:0,r3:2)(rndl,r);
                    i1+=m1;
                    i0+=m1;
                    r7=byteop2p(r1:0,r3:2)(rndh,r);
                    i1-=m1;
                    i0-=m1;                    
                    disalgnexcpt||r0=[i0++];
                    r7=r7+r6;
              hh_8x8_end:
                   disalgnexcpt||[i3++m3]=r7||r2=[i2++];
                   jump  halfcopyblock_end;
             byte_align_HH://special case
                   lsetup(bytealign_hh_start,bytealign_hh_end)lc0=p0;
                    i2=i0;
                    i2+=m2;
                    disalgnexcpt||r0=[i0++]||r2=[i2++];
                  bytealign_hh_start:
                     disalgnexcpt||r1=[i0++]||r3=[i2++];  
                     r6=byteop2p(r1:0,r3:2)(rndl);
                     i0+=m1;
                     i1+=m1;
                     
                     r7=byteop2p(r1:0,r3:2)(rndh,r);
                     i0-=m1;
                     i1-=m1;
                     r7=r7+r6;
                     disalgnexcpt||r0=[i0++m0];
                     disalgnexcpt||[i3++]=r7||r2=[i2++m0];
                     r6=byteop2p(r1:0,r3:2)(rndl,r);
                     i0+=m1;
                     i1+=m1;
                     r7=byteop2p(r1:0,r3:2)(rndh);
                     i0-=m1;
                     i1-=m1;
                     disalgnexcpt||r0=[i0++];
                     r7=r7+r6;
                  bytealign_hh_end:
                   disalgnexcpt||[i3++m3]=r7||r2=[i2++];
                   //jump  halfcopyblock_end;                          
                    */   
         
                 
    // Active dx-odd / dy-odd implementation.
    // i2 walks the row at pstart; i1 walks the row at pstart + srcstep.
    // The modifiers +7 (m0), -3 (m3) and srcstep-5 (m1), together with the
    // plain +4 post-increments, sum to exactly srcstep per loop iteration
    // for both i1 and i2.
    M0 = 7;
    M3 = -3 (X);
    I2 = R0;                // pstart
    R1 = R0 + R4 (S);       // pstart + srcstep (saturating add)
    I0 = R1;                // NOTE(review): i0 is never used as a load pointer below;
                            // presumably it only supplies the BYTEOP2P byte alignment - confirm
    I1 = R1;
     
    R4+=-5;      // srcstep-5 (the "-17" noted here originally does not match this code)
    M1 = R4;
    
    R5 +=-4;
    M2=R5;       // dststep-4
    P2 = 8;      // 8 rows (the "17" noted here originally does not match this code)

    LSETUP(AVG4_ST, AVG4_END) LC0 = P2;
    // Three priming instructions run once before the loop body:
    // load the first two words of each row and form the RNDL partial result.
    DISALGNEXCPT || R0 = [I1++] || R2 = [I2++];
    DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
    R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || R0 = [I1++M0];
AVG4_ST:    
        // i0 is stepped by m3 (-3) before the RNDH op and restored right after it
        DISALGNEXCPT || I0 += M3 || R2 = [I2++M0];
        R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++M3] || R2 = [I2++M3];
        R7 = R6 + R7 (NS) || I0 -= M3;   // combine the RNDH and RNDL partial results
    
        // second output word of the row: same RNDL / RNDH sequence with the (R) option;
        // the first output word is stored in parallel
        R7 = BYTEOP2P(R1:0,R3:2) (RNDL, R) || [I3++] = R7 || R1 = [I1++M1];
        DISALGNEXCPT || I0 += M3 || R3 = [I2++M1];
        R6 = BYTEOP2P(R1:0,R3:2) (RNDH, R) || R0 = [I1++] || R2 = [I2++];
        R7 = R6 + R7 (NS) || I0 -= M3;
    
    
        // loads for the next iteration
        DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
    
        //R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++] = R7 || R0 = [I1++M1];
        //DISALGNEXCPT || I0 += M3 || R2 = [I2++M1];
        //R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++] || R2 = [I2++];
        //R7 = R6 + R7 (NS) || I0 -= M3;
                   
        //DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3];
AVG4_END:
        // RNDL partial result for the next row; store the second output word
        // and step dst to the next row (4 + dststep-4 = dststep per row)
        R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++M2] = R7 || R0 = [I1++M0]; 
        
        
                 
  // Common epilogue: pops in exact reverse order of the prologue pushes.
  // The dx-odd / dy-odd path falls through into it.
  halfcopyblock_end:
                  rets=[sp++];
                  l3=[sp++];
                  i3=[sp++];
                  l2=[sp++];
                  i2=[sp++];
                  l1=[sp++];
                  i1=[sp++];
                  l0=[sp++];
                  i0=[sp++];
                  (r7:4,p5:3)=[sp++];
                  rts;
                                                                                        
 _xhCopyBlockHalfpel_MPEG4_8u.end:
 
 
 
 /*************************************************************************
 ippiCopyMBHalfpel_MPEG4_8u (const Ipp8u* pSrc, int srcStep, Ipp8u*
         pDst, int dstStep, const IppMotionVector* pMV, int roundControl);
     performance:
                          ASM    C
                  cycle:  653  22389      
     
***************************************************************************/                                    
_xhCopyMBHalfpel_MPEG4_8u:
                                        
                
                [--sp]=(r7:4,p5:3);
                [--sp]=i0;
                [--sp]=l0;
                [--sp]=i1;
                [--sp]=l1;
                [--sp]=i2;
                [--sp]=l2; 
                [--sp]=i3;
                [--sp]=l3;
                [--sp]=rets;
                
                b0=r1;//srcstep
                
                l3=0;
                r7=[sp+16+64];//PMV
                
                i3=r7;
                r7.l=w[i3++];//pMV->dx
                r6.l=w[i3++];//pMV->dy
                r7=r7.l(x);
                r6=r6.l(x);
                r5=r7>>>1;//mv_off_x
                r4=r6>>>1;//mv_off_y
                r4*=r1;//mv_off_y*srcstep
                r4=r4+r5;//mv_off_y*srcstep+mv_off_x
                r0=r0+r4;//point to pstart
                
                i3=r2; //pDst
                i0=r0;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -