// asharp.cpp
// Listing taken from the project "H.264 decoder converted from FFmpeg, builds under VC" (C++ source, 432 lines).
//
//      asharp (version 0.95) - adaptive sharpening filter.
//
//      asharp engine implementation (C/MMX/ISSE)
//
//      Copyright (C) 2002 Marc Fauconneau
//
//  This program is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
//
//  Please contact me for any bugs or questions.
//  marc.fd@libertysurf.fr

//  Change log :
//         27 nov 2002 - ver 0.95  - First GPL release.

#include "stdafx.h"
#include "asharp.h"
#include "Tconfig.h"
#include "simd.h"

#pragma warning (push)
#pragma warning (disable:4799)

// Folds |sample-center| into the running maximum deviation `dev`.
static inline int asharp_fold_dev(int dev,int sample,int center)
{
 const int d=abs(sample-center);
 return d>dev ? d : dev;
}

// Portable (plain C) adaptive sharpening of one 8-bit plane, in place.
//
//  planeptr - first byte of the plane; interior pixels are overwritten
//  pitch    - distance in bytes between two consecutive rows
//  height   - number of rows; rows 0 and height-1 are left untouched
//  width    - number of columns; columns 0 and width-1 are left untouched
//  T        - maximum strength (T==512 adds the full pixel-minus-average
//             difference once)
//  D        - adaptive factor; when D>0 the strength of each pixel is derived
//             from the largest absolute difference to its 8 neighbours and is
//             clamped to [-32,T]; when D<=0 the fixed strength T is used
//  B, B2    - 8.8 fixed point factors applied to D on rows/columns whose
//             index modulo 8 is 7 or 0 (B) and 6 or 1 (B2)
//  bf       - non-zero: neighbours lying in a different 8x8 block are
//             ignored when measuring the deviation
//  lineptr  - scratch buffer of at least `width` bytes; it carries the
//             unfiltered copy of the previous row
static void asharp_run_c(unsigned char* planeptr, int pitch,
                         int height, int width,
                         int T,int D, int B, int B2, int bf,unsigned char* lineptr)
{
 // Without at least one interior pixel there is nothing to filter; bailing
 // out also keeps memcpy and the edge stores below inside the buffers.
 if (width<3 || height<3) return;

 unsigned char *cfp=planeptr+pitch; // row being filtered
 unsigned char *lp=lineptr;         // unfiltered previous row
 const int Da=-32+(D>>7);           // loop invariant offset of the adaptive strength
 memcpy(lp,planeptr,width);
 for (int y=1;y<height-1;y++)
  {
   const int ym=y%8;
   const unsigned char *below=cfp+pitch;
   int last=cfp[0]; // unfiltered left neighbour (cfp[x-1] is overwritten as we go)
   int x;
   for (x=1;x<width-1;x++)
    {
     const int xm=x%8;
     const int cur=cfp[x];

     // 3x3 box average; (65535/9)>>16 approximates the division by 9
     int avg=lp[x-1]+lp[x]+lp[x+1]
            +last+cur+cfp[x+1]
            +below[x-1]+below[x]+below[x+1];
     avg*=(65535/9);
     avg>>=16;

     // largest absolute difference between the pixel and its neighbours
     int dev=0;
     if (bf)
      {
       const int hasLeft=xm>0;
       const int hasRight=xm<7;
       if (ym>0)
        {
         if (hasLeft) dev=asharp_fold_dev(dev,lp[x-1],cur);
         dev=asharp_fold_dev(dev,lp[x],cur);
         if (hasRight) dev=asharp_fold_dev(dev,lp[x+1],cur);
        }
       if (hasLeft) dev=asharp_fold_dev(dev,last,cur);
       if (hasRight) dev=asharp_fold_dev(dev,cfp[x+1],cur);
       if (ym<7)
        {
         if (hasLeft) dev=asharp_fold_dev(dev,below[x-1],cur);
         dev=asharp_fold_dev(dev,below[x],cur);
         if (hasRight) dev=asharp_fold_dev(dev,below[x+1],cur);
        }
      }
     else
      {
       // The upper neighbours live in the line buffer (the plane row above
       // has already been filtered), so they are indexed without the pitch.
       dev=asharp_fold_dev(dev,lp[x-1],cur);
       dev=asharp_fold_dev(dev,lp[x],cur);
       dev=asharp_fold_dev(dev,lp[x+1],cur);
       dev=asharp_fold_dev(dev,last,cur);
       dev=asharp_fold_dev(dev,cfp[x+1],cur);
       dev=asharp_fold_dev(dev,below[x-1],cur);
       dev=asharp_fold_dev(dev,below[x],cur);
       dev=asharp_fold_dev(dev,below[x+1],cur);
      }

     // reduce the adaptive factor next to 8x8 block borders
     int D2=D;
     if (xm==6 || xm==1) D2=(D2*B2)>>8;
     if (xm==7 || xm==0) D2=(D2*B)>>8;
     if (ym==6 || ym==1) D2=(D2*B2)>>8;
     if (ym==7 || ym==0) D2=(D2*B)>>8;

     // Multiplications replace the original left shifts: the operands may
     // be negative and shifting a negative value left is undefined.
     int T2=T;
     if (D>0) T2=((((dev*128)*D2)>>16)+Da)*16;
     if (T2>T) T2=T;
     if (T2<-32) T2=-32;

     const int diff=cur-avg;
     int tmp=(((diff*128)*T2)>>16)+cur;
     if (tmp<0) tmp=0;
     if (tmp>255) tmp=255;

     lp[x-1]=(unsigned char)last; // column x-1 is no longer needed from the previous row
     last=cur;
     cfp[x]=(unsigned char)tmp;
    }
   // x==width-1 here: store the two rightmost unfiltered samples so that the
   // next row sees this row, not a stale one, at columns width-2 and width-1.
   lp[x-1]=(unsigned char)last;
   lp[x]=cfp[x];
   cfp+=pitch;
  }
}

// Adds the strength-scaled difference to the unpacked source words.
// For each 16-bit lane: dif = high word of ((dif<<7) * thr), src += dif.
// difl/difh are left holding the scaled correction.
static __forceinline void apply(__m64 &srch,__m64 &srcl,__m64 &difl,__m64 &difh,const __m64 &thrh,const __m64 &thrl)
{
 // high four words: psllw 7, pmulhw, paddw
 difh=_mm_mulhi_pi16(_mm_slli_pi16(difh,7),thrh);
 srch=_mm_add_pi16(srch,difh);

 // low four words: psllw 7, pmulhw, paddw
 difl=_mm_mulhi_pi16(_mm_slli_pi16(difl,7),thrl);
 srcl=_mm_add_pi16(srcl,difl);
}

// Turns the accumulated neighbourhood sums into averages and computes the
// signed difference between the centre pixels and those averages.
//  in : srch      - 8 packed centre pixels
//       acch/accl - word sums for the high/low four pixels
//  out: acch/accl - sums multiplied by c4w_inv9, high word kept
//       srch/srcl - centre pixels widened to words (high/low four)
//       difh/difl - srch/srcl minus acch/accl (signed saturating)
static __forceinline void diff(__m64 &srch,__m64 &srcl,__m64 &difl,__m64 &difh,__m64 &acch,__m64 &accl,const __m64 &c4w_inv9,const __m64 &zero)
{
 acch=_mm_mulhi_pi16(acch,c4w_inv9); // pmulhw
 accl=_mm_mulhi_pi16(accl,c4w_inv9); // pmulhw

 const __m64 packed=srch;
 srch=_mm_unpackhi_pi8(packed,zero); // punpckhbw
 srcl=_mm_unpacklo_pi8(packed,zero); // punpcklbw

 difh=_mm_subs_pi16(srch,acch);      // psubsw
 difl=_mm_subs_pi16(srcl,accl);      // psubsw
}

// Widens the 8 packed bytes held in srch to words (low four into srcl, high
// four back into srch) and adds them to the running sums accl/acch with
// unsigned saturation.
static __forceinline void acc8b(__m64 &srch,__m64 &srcl,__m64 &acch,__m64 &accl,const __m64 &zero)
{
 const __m64 packed=srch;
 srch=_mm_unpackhi_pi8(packed,zero); // punpckhbw
 srcl=_mm_unpacklo_pi8(packed,zero); // punpcklbw
 acch=_mm_adds_pu16(acch,srch);      // paddusw
 accl=_mm_adds_pu16(accl,srcl);      // paddusw
}

// Takes 8 packed bytes from `offset`, widens them to words (high four left in
// srch, low four in srcl) and adds them to acch/accl with unsigned saturation.
static __forceinline void loadAcc(const __m64 &offset,__m64 &srch,__m64 &srcl,__m64 &acch,__m64 &accl,const __m64 &zero)
{
 const __m64 packed=offset;
 srch=_mm_unpackhi_pi8(packed,zero); // punpckhbw
 srcl=_mm_unpacklo_pi8(packed,zero); // punpcklbw
 accl=_mm_adds_pu16(accl,srcl);      // paddusw
 acch=_mm_adds_pu16(acch,srch);      // paddusw
}

// Sums the 3x3 neighbourhood of 8 horizontally adjacent pixels into word
// accumulators (accl: first four pixels, acch: last four).
//  tmpp - line buffer position matching srcp; supplies the row above
//  srcp - first of the 8 pixels in the current row
//  last - caller-supplied vector used where a load from srcp-1 would go
//  srch/srcl are scratch and end up holding the last widened load.
static __forceinline void run_a(unsigned char *srcp,unsigned char *tmpp,__m64 &last,int stride,__m64 &srch,__m64 &srcl,__m64 &acch,__m64 &accl,const __m64 &zero)
{
 unsigned char *below=srcp+stride;

 accl=zero;
 acch=zero;

 // row above, read from the line buffer
 loadAcc(*(__m64*)(tmpp-1),srch,srcl,acch,accl,zero);
 loadAcc(*(__m64*)(tmpp),srch,srcl,acch,accl,zero);
 loadAcc(*(__m64*)(tmpp+1),srch,srcl,acch,accl,zero);

 // current row
 loadAcc(last,srch,srcl,acch,accl,zero);
 loadAcc(*(__m64*)(srcp),srch,srcl,acch,accl,zero);
 loadAcc(*(__m64*)(srcp+1),srch,srcl,acch,accl,zero);

 // row below
 loadAcc(*(__m64*)(below-1),srch,srcl,acch,accl,zero);
 loadAcc(*(__m64*)(below),srch,srcl,acch,accl,zero);
 loadAcc(*(__m64*)(below+1),srch,srcl,acch,accl,zero);
}

// Filters the 8 pixels at cfp+x with the constant strength c4w_T.
//  lineptr - line buffer carrying the unfiltered row above
//  mm7     - in: left-neighbour vector for this group of pixels;
//            out: the 8 bytes starting at cfp+x+7, read before the filtered
//            pixels are stored (left-neighbour vector for x+8)
//  mm1..mm6 are scratch registers shared with the caller.
static __forceinline void run1(unsigned char *cfp,int x,unsigned char *lineptr,__m64 &mm7,int stride,__m64 &mm2,__m64 &mm3,__m64 &mm4,__m64 &mm5,__m64 &mm6,const __m64 &c4w_inv9,const __m64 &c4w_T,__m64 &mm1)
{
 unsigned char *pixels=cfp+x;
 unsigned char *saved=lineptr+x;
 __m64 allZero=_mm_setzero_si64();

 // neighbourhood sums end up in mm4 (high) / mm5 (low)
 run_a(pixels,saved,mm7,stride,mm2,mm3,mm4,mm5,allZero);

 // mm7 has been consumed as the left-neighbour vector; park it in the line
 // buffer (one byte to the left) before diff() reuses mm7
 *(__m64*)(saved-1)=mm7;

 mm1=*(__m64*)pixels;
 diff(mm1,mm2,mm6,mm7,mm4,mm5,c4w_inv9,allZero);

 // same strength for both halves
 mm3=c4w_T;
 mm4=mm3;
 apply(mm1,mm2,mm6,mm7,mm3,mm4);

 mm2=_mm_packs_pu16(mm2,mm1); // back to 8 bytes with unsigned saturation

 // The load must precede the store: byte pixels[7] is part of both.
 mm7=*(__m64*)(pixels+7);
 *(__m64*)pixels=mm2;
}

// MMX implementation of the filter, processing 8 pixels per step.
// Tsimd supplies pmaxub and pminsw (Tmmx / Tmmxext are the instantiations
// used by getAsharp); presumably these are per-byte unsigned max and
// per-word signed min - confirm against simd.h.
template<class Tsimd> struct Tasharp
{
 // Per byte: tmp = |ref-cur| (two saturating subtractions OR'ed together),
 // masked with msk, then folded into max through Tsimd::pmaxub.
 // cur is clobbered.
 static __forceinline void check8bm(__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max,const __m64 &msk)
  {
   tmp=ref;                   //  movq    tmp,ref
   tmp=_mm_subs_pu8(tmp,cur); //  psubusb tmp,cur
   cur=_mm_subs_pu8(cur,ref); //  psubusb cur,ref
   tmp=_mm_or_si64(tmp,cur);  //  por     tmp,cur
   tmp=_mm_and_si64(tmp,msk); //  pand    tmp,msk
   Tsimd::pmaxub(max,tmp);
  }

 // Copies `offset` into cur and runs the masked check on it.
 static __forceinline void loadCheck2(const __m64 &offset,__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max,const __m64 &msk)
  {
   cur=offset;
   check8bm(cur,tmp,ref,max,msk);
  }

 // Same as check8bm but without a mask: every byte takes part.
 static __forceinline void check8b(__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max)
  {
   tmp=ref;                   //  movq    tmp,ref
   tmp=_mm_subs_pu8(tmp,cur); //  psubusb tmp,cur
   cur=_mm_subs_pu8(cur,ref); //  psubusb cur,ref
   tmp=_mm_or_si64(tmp,cur);  //  por     tmp,cur
   Tsimd::pmaxub(max,tmp);
  }

 // Copies `offset` into cur and runs the unmasked check on it.
 static __forceinline void loadCheck(const __m64 &offset,__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max)
  {
   cur=offset;
   check8b(cur,tmp,ref,max);
  }

 // Converts 8 packed deviations (in thrh) into per-pixel strengths:
 // widen to words (thrl = low four bytes, thrh = high four bytes), shift
 // left by 7, keep the high word of the product with c4w_D2 / c4w_D1,
 // pminsw against c4w_Td4, add c4w_Da with signed saturation, shift left by 4.
 // NOTE(review): thrh (bytes 4..7) is multiplied by c4w_D1, which asharp_run
 // loads from table words 0..3, and thrl by c4w_D2 (table words 4..7);
 // this looks swapped relative to the table layout - confirm.
 static __forceinline void atresh(__m64 &thrh,__m64 &thrl,const __m64 &c4w_D1,const __m64 &c4w_D2,const __m64 &c4w_Td4,const __m64 &c4w_Da,__m64 &zero)
  {
   thrl=thrh;                        //  movq      thrl,thrh
   thrh=_mm_unpackhi_pi8(thrh,zero); //  punpckhbw thrh,zero
   thrl=_mm_unpacklo_pi8(thrl,zero); //  punpcklbw thrl,zero
   thrh=_mm_slli_pi16(thrh,7);       //  psllw     thrh,7
   thrl=_mm_slli_pi16(thrl,7);       //  psllw     thrl,7
   thrh=_mm_mulhi_pi16(thrh,c4w_D1); //  pmulhw    thrh,c4w_D1
   thrl=_mm_mulhi_pi16(thrl,c4w_D2); //  pmulhw    thrl,c4w_D2
   Tsimd::pminsw(thrh,c4w_Td4);
   Tsimd::pminsw(thrl,c4w_Td4);
   thrh=_mm_adds_pi16(thrh,c4w_Da);  //  paddsw    thrh,c4w_Da
   thrl=_mm_adds_pi16(thrl,c4w_Da);  //  paddsw    thrl,c4w_Da
   thrh=_mm_slli_pi16(thrh,4);       //  psllw             thrh,4
   thrl=_mm_slli_pi16(thrl,4);       //  psllw             thrl,4
  }

 // Deviation kernels. Each one loads the 8 centre pixels into ref, clears max
 // and folds |neighbour-ref| of the neighbours it visits into max.
 // tmpp addresses the line buffer (row above), srcp the current row and
 // srcp+stride the row below; `last` is used in place of a load from srcp-1.

 // Masked kernel visiting all eight neighbours: -1 offsets are masked with
 // c8b_00l, +1 offsets with c8b_00r, the vertical neighbours are unmasked.
 struct Trun_c2
  {
   static __forceinline void run(unsigned char *srcp,unsigned char *tmpp,__m64 &last,int stride,__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max,const __m64 &c8b_00r,const __m64 &c8b_00l)
    {
     max=_mm_setzero_si64(); //  pxor      max,max
     ref=*(__m64*)srcp;      //  movq      ref,[srcp]
     loadCheck2(*(__m64*)(tmpp-1)       ,cur,tmp,ref,max,c8b_00l);
     loadCheck (*(__m64*)(tmpp  )       ,cur,tmp,ref,max);
     loadCheck2(*(__m64*)(tmpp+1)       ,cur,tmp,ref,max,c8b_00r);
     loadCheck2(last                    ,cur,tmp,ref,max,c8b_00l);
     loadCheck2(*(__m64*)(srcp+1)       ,cur,tmp,ref,max,c8b_00r);
     loadCheck2(*(__m64*)(srcp+stride-1),cur,tmp,ref,max,c8b_00l);
     loadCheck (*(__m64*)(srcp+stride  ),cur,tmp,ref,max);
     loadCheck2(*(__m64*)(srcp+stride+1),cur,tmp,ref,max,c8b_00r);
    }
  };

 // Masked kernel that skips the row below (row above and current row only).
 struct Trun_c2u
  {
   static __forceinline void run(unsigned char *srcp,unsigned char *tmpp,__m64 &last,int stride,__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max,const __m64 &c8b_00r,const __m64 &c8b_00l)
    {
     max=_mm_setzero_si64();
     ref=*(__m64*)srcp;
     loadCheck2(*(__m64*)(tmpp-1),cur,tmp,ref,max,c8b_00l);
     loadCheck (*(__m64*)(tmpp  ),cur,tmp,ref,max);
     loadCheck2(*(__m64*)(tmpp+1),cur,tmp,ref,max,c8b_00r);
     loadCheck2(last             ,cur,tmp,ref,max,c8b_00l);
     loadCheck2(*(__m64*)(srcp+1),cur,tmp,ref,max,c8b_00r);
    }
  };

 // Masked kernel that skips the row above (current row and row below only).
 struct Trun_c2d
  {
   static __forceinline void run(unsigned char *srcp,unsigned char *tmpp,__m64 &last,int stride,__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max,const __m64 &c8b_00r,const __m64 &c8b_00l)
    {
     max=_mm_setzero_si64();
     ref=*(__m64*)srcp;
     loadCheck2(last                    ,cur,tmp,ref,max,c8b_00l);
     loadCheck2(*(__m64*)(srcp+1)       ,cur,tmp,ref,max,c8b_00r);
     loadCheck2(*(__m64*)(srcp+stride-1),cur,tmp,ref,max,c8b_00l);
     loadCheck (*(__m64*)(srcp+stride  ),cur,tmp,ref,max);
     loadCheck2(*(__m64*)(srcp+stride+1),cur,tmp,ref,max,c8b_00r);
    }
  };

 // Unmasked kernel visiting all eight neighbours; the mask arguments are
 // accepted only to keep the signature uniform and are not used.
 struct Trun_c
  {
   static __forceinline void run(unsigned char *srcp,unsigned char *tmpp,__m64 &last,int stride,__m64 &cur,__m64 &tmp,__m64 &ref,__m64 &max,const __m64 &c8b_00r,const __m64 &c8b_00l)
    {
     max=_mm_setzero_si64();
     ref=*(__m64*)srcp;
     loadCheck(*(__m64*)(tmpp-1)       ,cur,tmp,ref,max);
     loadCheck(*(__m64*)(tmpp  )       ,cur,tmp,ref,max);
     loadCheck(*(__m64*)(tmpp+1)       ,cur,tmp,ref,max);
     loadCheck(last                    ,cur,tmp,ref,max);
     loadCheck(*(__m64*)(srcp+1)       ,cur,tmp,ref,max);
     loadCheck(*(__m64*)(srcp+stride-1),cur,tmp,ref,max);
     loadCheck(*(__m64*)(srcp+stride  ),cur,tmp,ref,max);
     loadCheck(*(__m64*)(srcp+stride+1),cur,tmp,ref,max);
    }
  };

 // Adaptive variant of run1: filters the 8 pixels at cfp+x with a per-pixel
 // strength derived from the deviation measured by Trun.
 // mm7 carries the left-neighbour vector in and the one for x+8 out;
 // the other mm* arguments are scratch shared with the caller.
 template<class Trun> static __forceinline void run2(unsigned char *cfp,int x,unsigned char *lineptr,__m64 &mm7,int stride,__m64 &mm2,__m64 &mm3,__m64 &mm4,__m64 &mm5,__m64 &mm6,const __m64 &c4w_inv9,__m64 &mm1,const __m64 &c8b_00r,const __m64 &c8b_00l,const __m64 &c4w_D1,const __m64 &c4w_D2,const __m64 &c4w_Td4,const __m64 &c4w_Da)
  {
   unsigned char *ptr=cfp+x;
   unsigned char *tmp=lineptr+x;
   __m64 mm0=_mm_setzero_si64();
   // neighbourhood sums -> mm4 (high) / mm5 (low)
   run_a(ptr,tmp,mm7,stride,mm2,mm3,mm4,mm5,mm0);
   // maximum deviation -> mm3; mm0 serves as scratch here and is re-zeroed below
   Trun::run(ptr,tmp,mm7,stride,mm2,mm0,mm6,mm3,c8b_00r,c8b_00l);
   // park the consumed left-neighbour vector in the line buffer before mm7 is reused
   *(__m64*)(tmp-1)=mm7;
   mm0=_mm_setzero_si64();
   mm1=*(__m64*)ptr;
   diff(mm1,mm2,mm6,mm7,mm4,mm5,c4w_inv9,mm0);
   // deviations in mm3 -> strengths in mm3 (high) / mm4 (low)
   atresh(mm3,mm4,c4w_D1,c4w_D2,c4w_Td4,c4w_Da,mm0);
   apply(mm1,mm2,mm6,mm7,mm3,mm4);
   mm2=_mm_packs_pu16(mm2,mm1);
   // load before store: byte ptr[7] belongs to both
   mm7=*(__m64*)(ptr+7);
   *(__m64*)ptr=mm2;
  }

 // MMX entry point; same parameter list as asharp_run_c.
 // lineptrna is the scratch line buffer before alignment. Rows 1..height-2
 // are filtered in place, 8 pixels per step for x=0,8,16,...<width, so the
 // first and last columns are processed as well.
 // NOTE(review): each step reads and writes 8 bytes and reads one byte to
 // either side; presumably the plane and the line buffer are padded
 // accordingly when width is not a multiple of 8 - confirm with the caller.
 static void asharp_run(unsigned char* planeptr, int pitch,
                        int height, int width,
                        int T,int D, int B, int B2, int bf, unsigned char* lineptrna)
  {
   // round up to the next multiple of 8; always advances by 1..8 bytes, which
   // leaves room for the stores at lineptr+x-1
   unsigned char* lineptr = (unsigned char*)(((intptr_t)lineptrna)+8-(((intptr_t)lineptrna)&7));

   memcpy(lineptr,planeptr,width);
   unsigned char* cfp = planeptr+pitch;

   // Three rows of eight per-column D factors; row 0 is selected for
   // y&7 in {0,7}, row 1 for y&7 in {1,6}, row 2 otherwise (see the loop below).
   // __align8 is a project macro, presumably declaring an 8-byte aligned object.
   __align8(uint16_t,c4w_Dtab[3*8]);
   __align8(uint16_t,*c4w_Dtabp) = c4w_Dtab;

   for (int i=0;i<3;i++)
    {
     int D2=D;
     int D3=D;
     int D4=D;
     // NOTE(review): the scaling is applied only when B<=128, whereas
     // asharp_run_c scales unconditionally - confirm that this is intended.
     if (B<=128)
      {
       if (i==0) D2=(D2*B)>>8;
       if (i==1) D2=(D2*B2)>>8;
       D3=D2;
       D4=D2;
       D3=(D2*B2)>>8;
       D4=(D2*B)>>8;
      }
     // column profile within an 8 pixel group: D4 D3 D2 D2 D2 D2 D3 D4
     c4w_Dtab[i*8+0] = (uint16_t)D4;
     c4w_Dtab[i*8+1] = (uint16_t)D3;
     c4w_Dtab[i*8+2] = (uint16_t)D2;
     c4w_Dtab[i*8+3] = (uint16_t)D2;
     c4w_Dtab[i*8+4] = (uint16_t)D2;
     c4w_Dtab[i*8+5] = (uint16_t)D2;
     c4w_Dtab[i*8+6] = (uint16_t)D3;
     c4w_Dtab[i*8+7] = (uint16_t)D4;
    }

   __m64 c4w_inv9=_mm_set1_pi16(65535/9);
   __m64 c4w_Da=_mm_set1_pi16(short(-32+(D>>7)));
   // _mm_set_pi8 takes the most significant byte first, so c8b_00r clears
   // byte 0 and c8b_00l clears byte 7.
   // NOTE(review): asharp_run_c ignores the left neighbour at x%8==0 and the
   // right neighbour at x%8==7; with these constants the left-neighbour mask
   // clears byte 7 and the right-neighbour mask clears byte 0 - confirm the
   // intended byte order.
   __m64 c8b_00r=_mm_set_pi8(-1,-1,-1,-1,-1,-1,-1,0);
   __m64 c8b_00l=_mm_set_pi8(0,-1,-1,-1,-1,-1,-1,-1);
   __m64 c4w_T=_mm_set1_pi16(short(T));
   __m64 c4w_Td4=_mm_set1_pi16(short(T>>4));
   __align8(uint8_t,v8b_first[8]);

   __m64 mm2,c4w_D1,c4w_D2;
   for (int y=1;y<height-1;y++)
    {
     // pick the D factors matching the row's position modulo 8
     if ((y&7)==0 || (y&7)==7)
      {
       c4w_D1=*(__m64*)(c4w_Dtabp+0);
       c4w_D2=mm2=*(__m64*)(c4w_Dtabp+4);
      }
     else if ((y&7)==1 || (y&7)==6)
      {
       c4w_D1=*(__m64*)(c4w_Dtabp+8);
       c4w_D2=mm2=*(__m64*)(c4w_Dtabp+12);
      }
     else
      {
       c4w_D1=*(__m64*)(c4w_Dtabp+16);
       c4w_D2=mm2=*(__m64*)(c4w_Dtabp+20);
      }

     // Left-neighbour vector for x==0: the first 8 pixels shifted up by one
     // byte, with pixel 0 duplicated into the vacated byte 0.
     *(__m64*)v8b_first=_mm_slli_si64(*(__m64*)cfp,8);
     v8b_first[0]=v8b_first[1];
     __m64 mm7=*(__m64*)v8b_first;

     __m64 mm1,mm3,mm4,mm5,mm6;
     // Kernel selection: block-aware adaptive (bf && D>0), plain adaptive
     // (D>0), or constant strength (run1).
     // NOTE(review): asharp_run_c skips the upper row when y%8==0, skips the
     // lower row when y%8==7 and uses both otherwise. Here (y&7)==0 selects
     // Trun_c2 (both rows) and rows 1..6 select Trun_c2d (no upper row); the
     // two look exchanged - confirm.
     if (bf && D>0)
      if ((y&7)==0)
       for (int x=0;x<width;x+=8)
        run2<Trun_c2>(cfp,x,lineptr,mm7,pitch,mm2,mm3,mm4,mm5,mm6,c4w_inv9,mm1,c8b_00r,c8b_00l,c4w_D1,c4w_D2,c4w_Td4,c4w_Da);
      else if ((y&7)==7)
       for (int x=0;x<width;x+=8)
        run2<Trun_c2u>(cfp,x,lineptr,mm7,pitch,mm2,mm3,mm4,mm5,mm6,c4w_inv9,mm1,c8b_00r,c8b_00l,c4w_D1,c4w_D2,c4w_Td4,c4w_Da);
      else
       for (int x=0;x<width;x+=8)
        run2<Trun_c2d>(cfp,x,lineptr,mm7,pitch,mm2,mm3,mm4,mm5,mm6,c4w_inv9,mm1,c8b_00r,c8b_00l,c4w_D1,c4w_D2,c4w_Td4,c4w_Da);
     else if (D>0)
      for (int x=0;x<width;x+=8)
       run2<Trun_c>(cfp,x,lineptr,mm7,pitch,mm2,mm3,mm4,mm5,mm6,c4w_inv9,mm1,c8b_00r,c8b_00l,c4w_D1,c4w_D2,c4w_Td4,c4w_Da);
     else
      for (int x=0;x<width;x+=8)
       run1(cfp,x,lineptr,mm7,pitch,mm2,mm3,mm4,mm5,mm6,c4w_inv9,c4w_T,mm1);
     cfp+=pitch;
    }
  }
};

// Returns the filter implementation matching Tconfig::cpu_flags:
// the MMXEXT build if FF_CPU_MMXEXT is set, else the MMX build if FF_CPU_MMX
// is set, else the plain C version.
asharp_run_fct* getAsharp(void)
{
 if (Tconfig::cpu_flags&FF_CPU_MMXEXT)
  return Tasharp<Tmmxext>::asharp_run;

 if (Tconfig::cpu_flags&FF_CPU_MMX)
  return Tasharp<Tmmx>::asharp_run;

 return asharp_run_c;
}

#pragma warning(pop)

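// Code-viewer keyboard shortcuts (web page residue, not part of the program):
//   copy code: Ctrl + C
//   search code: Ctrl + F
//   full screen: F11
//   larger font: Ctrl + =
//   smaller font: Ctrl + -
//   show shortcuts: ?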