⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 postprocess_altivec_template.c

📁 这是著名的TCPMP播放器在WINDOWS和WINCE下编译通过的源程序。笔者对其中的LIBMAD库做了针对ARM MPU的优化，并增加了字幕功能。
💻 C
📖 第 1 页 / 共 4 页
字号:
/*
    AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>

    based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/


/* Vector-literal syntax differs between compilers: Darwin's (Apple GCC)
   AltiVec extension writes vector constants with parentheses, while the
   FSF GCC syntax uses curly braces.  AVV(...) papers over that
   difference so the rest of the file can spell literals one way. */
#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif

/*
 * Transpose an 8x8 matrix of 16-bit elements held in eight vector
 * registers, in place: src_a..src_h are both inputs and outputs.
 * Classic three-pass merge network -- each pass interleaves pairs of
 * rows with vec_mergeh/vec_mergel, doubling the length of the
 * transposed runs (1 -> 2 -> 4 -> 8 elements).
 * Multi-statement macro wrapped in do { } while (0) so it expands to a
 * single statement; arguments are evaluated more than once, so callers
 * must pass plain variables, never expressions with side effects.
 */
#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
  do {									\
    __typeof__(src_a) tempA1, tempB1, tempC1, tempD1;			\
    __typeof__(src_a) tempE1, tempF1, tempG1, tempH1;			\
    __typeof__(src_a) tempA2, tempB2, tempC2, tempD2;			\
    __typeof__(src_a) tempE2, tempF2, tempG2, tempH2;			\
    tempA1 = vec_mergeh (src_a, src_e);					\
    tempB1 = vec_mergel (src_a, src_e);					\
    tempC1 = vec_mergeh (src_b, src_f);					\
    tempD1 = vec_mergel (src_b, src_f);					\
    tempE1 = vec_mergeh (src_c, src_g);					\
    tempF1 = vec_mergel (src_c, src_g);					\
    tempG1 = vec_mergeh (src_d, src_h);					\
    tempH1 = vec_mergel (src_d, src_h);					\
    tempA2 = vec_mergeh (tempA1, tempE1);				\
    tempB2 = vec_mergel (tempA1, tempE1);				\
    tempC2 = vec_mergeh (tempB1, tempF1);				\
    tempD2 = vec_mergel (tempB1, tempF1);				\
    tempE2 = vec_mergeh (tempC1, tempG1);				\
    tempF2 = vec_mergel (tempC1, tempG1);				\
    tempG2 = vec_mergeh (tempD1, tempH1);				\
    tempH2 = vec_mergel (tempD1, tempH1);				\
    src_a = vec_mergeh (tempA2, tempE2);				\
    src_b = vec_mergel (tempA2, tempE2);				\
    src_c = vec_mergeh (tempB2, tempF2);				\
    src_d = vec_mergel (tempB2, tempF2);				\
    src_e = vec_mergeh (tempC2, tempG2);				\
    src_f = vec_mergel (tempC2, tempG2);				\
    src_g = vec_mergeh (tempD2, tempH2);				\
    src_h = vec_mergel (tempD2, tempH2);				\
  } while (0)


/*
 * Classify the 8x8 block starting at src + 4*stride as flat or detailed,
 * so the caller can choose a deblocking strategy.  Counts, per column,
 * how many of the 7 vertical neighbour pairs differ by less than the DC
 * threshold; if the block is mostly flat, a second test checks the
 * boundary pixels against 4*QP for a large overall step.
 * Returns: 0 (flat but step exceeds 4*QP), 1 (flat), 2 (detailed).
 * NOTE(review): return-value semantics inferred from the threshold
 * logic below -- confirm against the scalar reference implementation.
 */
static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true.
  */
  register int y;  /* unused in this function */
  short __attribute__ ((aligned(16))) data[8];
  int numEq;
  uint8_t *src2 = src;
  vector signed short v_dcOffset;
  vector signed short v2QP;
  vector unsigned short v4QP;
  vector unsigned short v_dcThreshold;
  /* Remainders: non-zero exactly when stride / src2 are NOT multiples
     of 16 (the names read as the opposite -- see NOTE below). */
  const int properStride = (stride % 16);
  const int srcAlign = ((unsigned long)src2 % 16);
  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
  const vector signed int zero = vec_splat_s32(0);
  const vector signed short mask = vec_splat_s16(1);
  vector signed int v_numEq = vec_splat_s32(0);
	
  /* Build the per-block constants in an aligned scalar buffer, load it
     once with vec_ld, then splat each element into its own vector.
     Only data[0..3] are ever splatted; data[4..7] stay uninitialized
     but unread. */
  data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
  data[1] = data[0] * 2 + 1;
  data[2] = c->QP * 2;
  data[3] = c->QP * 4;
  vector signed short v_data = vec_ld(0, data);
  v_dcOffset = vec_splat(v_data, 0);
  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
  v2QP = vec_splat(v_data, 2);
  v4QP = (vector unsigned short)vec_splat(v_data, 3);

  src2 += stride * 4;

  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;

/* Unaligned load of row i: read the two 16-byte vectors straddling the
   row, realign with vec_perm, then zero-extend the low 8 bytes into
   8 signed shorts (merging with the zero vector).
   NOTE(review): when two_vectors is 0, v_srcA2##i reaches vec_perm
   uninitialized.  Harmless in practice because perm##i then selects
   only from the first vector, but technically unclean. */
#define LOAD_LINE(i)							\
  register int j##i = i * stride;					\
  vector unsigned char perm##i = vec_lvsl(j##i, src2);			\
  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);		\
  vector unsigned char v_srcA2##i;					\
  if (two_vectors)							\
    v_srcA2##i = vec_ld(j##i + 16, src2);				\
  const vector unsigned char v_srcA##i =				\
    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
  v_srcAss##i =                                                         \
    (vector signed short)vec_mergeh((vector signed char)zero,		\
				    (vector signed char)v_srcA##i)

/* Same as LOAD_LINE but with a single aligned vec_ld and no permute. */
#define LOAD_LINE_ALIGNED(i)                                            \
  register int j##i = i * stride;                                       \
  const vector unsigned char v_srcA##i = vec_ld(j##i, src2);            \
  v_srcAss##i =                                                         \
    (vector signed short)vec_mergeh((vector signed char)zero,		\
				    (vector signed char)v_srcA##i)

    // special casing the aligned case is worthwhile, as all calls from
    // the (transposed) horizontal deblocks will be aligned, in addition
    // to the naturally aligned vertical deblocks.
    // NOTE(review): properStride and srcAlign are remainders and are
    // non-zero when the data is NOT 16-byte aligned, so taking the
    // aligned-load path when both are non-zero looks inverted --
    // confirm against upstream libpostproc, where later versions test
    // the remainders for zero instead.
    if (properStride && srcAlign) {
      LOAD_LINE_ALIGNED(0);
      LOAD_LINE_ALIGNED(1);
      LOAD_LINE_ALIGNED(2);
      LOAD_LINE_ALIGNED(3);
      LOAD_LINE_ALIGNED(4);
      LOAD_LINE_ALIGNED(5);
      LOAD_LINE_ALIGNED(6);
      LOAD_LINE_ALIGNED(7);
    } else {
      LOAD_LINE(0);
      LOAD_LINE(1);
      LOAD_LINE(2);
      LOAD_LINE(3);
      LOAD_LINE(4);
      LOAD_LINE(5);
      LOAD_LINE(6);
      LOAD_LINE(7);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

/* For rows i and j = i+1: per column, test whether the difference is
   within +/- dcOffset.  Adding dcOffset and doing a single *unsigned*
   compare against dcThreshold (= 2*dcOffset - 1, splatted above)
   implements the absolute-value range test without a separate abs.
   Matching lanes contribute 1 (via the mask AND) and are accumulated
   into v_numEq with vec_sum4s. */
#define ITER(i, j)							\
  const vector signed short v_diff##i =					\
    vec_sub(v_srcAss##i, v_srcAss##j);					\
  const vector signed short v_sum##i =					\
    vec_add(v_diff##i, v_dcOffset);					\
  const vector signed short v_comp##i =					\
    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,	\
				   v_dcThreshold);			\
  const vector signed short v_part##i = vec_and(mask, v_comp##i);	\
  v_numEq = vec_sum4s(v_part##i, v_numEq);

  ITER(0, 1);
  ITER(1, 2);
  ITER(2, 3);
  ITER(3, 4);
  ITER(4, 5);
  ITER(5, 6);
  ITER(6, 7);
#undef ITER

  /* Reduce the four partial sums to one total (lands in element 3),
     broadcast it, and store a single 32-bit element into numEq.
     NOTE(review): vec_ste stores to a naturally aligned address derived
     from the pointer -- relies on &numEq being 4-byte aligned, which
     the ABI guarantees for an int local. */
  v_numEq = vec_sums(v_numEq, zero);
	
  v_numEq = vec_splat(v_numEq, 3);
  vec_ste(v_numEq, 0, &numEq);

  if (numEq > c->ppMode.flatnessThreshold)
    {
      /* Block is flat: check the step across the block boundary.
         mmoP1/mmoP2/mmoP gather pairs of boundary pixels from the
         loaded rows; 0x1f lanes are don't-cares discarded by the final
         vec_lvsl(8,0)-based permute. */
      const vector unsigned char mmoP1 = (const vector unsigned char)
	AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
	    0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
      const vector unsigned char mmoP2 = (const vector unsigned char)
	AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
	    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
      const vector unsigned char mmoP = (const vector unsigned char)
	vec_lvsl(8, (unsigned char*)0);
      
      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
      /* Same biased-unsigned-compare trick as ITER: (diff + 2QP) > 4QP
         unsigned means |diff| exceeds roughly 2QP in some lane. */
      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
      
      if (vec_any_gt(mmoSum, v4QP))
	return 0;
      else
	return 1;
    }
  else return 2; 
}

static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true. Quite a lot of load/stores
    can be removed by assuming proper alignement of
    src & stride :-(
  */
  uint8_t *src2 = src;
  const vector signed int zero = vec_splat_s32(0);
  const int properStride = (stride % 16);
  const int srcAlign = ((unsigned long)src2 % 16);
  short __attribute__ ((aligned(16))) qp[8];
  qp[0] = c->QP;
  vector signed short vqp = vec_ld(0, qp);
  vqp = vec_splat(vqp, 0);
	
  src2 += stride*3;

  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
  vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
  vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
	
#define LOAD_LINE(i)                                                    \
  const vector unsigned char perml##i =					\
    vec_lvsl(i * stride, src2);						\
  vbA##i = vec_ld(i * stride, src2);                                    \
  vbB##i = vec_ld(i * stride + 16, src2);                               \
  vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                          \
  vb##i =                                                               \
    (vector signed short)vec_mergeh((vector unsigned char)zero,		\
				    (vector unsigned char)vbT##i)

#define LOAD_LINE_ALIGNED(i)                                            \
  register int j##i = i * stride;                                       \
  vbT##i = vec_ld(j##i, src2);                                          \
  vb##i =                                                               \
    (vector signed short)vec_mergeh((vector signed char)zero,		\
				    (vector signed char)vbT##i)

    // special casing the aligned case is worthwhile, as all call from
    // the (transposed) horizontable deblocks will be aligned, in addition
    // to the naturraly aligned vertical deblocks.
    if (properStride && srcAlign) {
      LOAD_LINE_ALIGNED(0);
      LOAD_LINE_ALIGNED(1);
      LOAD_LINE_ALIGNED(2);
      LOAD_LINE_ALIGNED(3);
      LOAD_LINE_ALIGNED(4);
      LOAD_LINE_ALIGNED(5);
      LOAD_LINE_ALIGNED(6);
      LOAD_LINE_ALIGNED(7);
      LOAD_LINE_ALIGNED(8);
      LOAD_LINE_ALIGNED(9);
    } else {
      LOAD_LINE(0);
      LOAD_LINE(1);
      LOAD_LINE(2);
      LOAD_LINE(3);
      LOAD_LINE(4);
      LOAD_LINE(5);
      LOAD_LINE(6);
      LOAD_LINE(7);
      LOAD_LINE(8);
      LOAD_LINE(9);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

  const vector unsigned short v_1 = vec_splat_u16(1);
  const vector unsigned short v_2 = vec_splat_u16(2);
  const vector unsigned short v_4 = vec_splat_u16(4);

  const vector signed short v_diff01 = vec_sub(vb0, vb1);
  const vector unsigned short v_cmp01 =
    (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
  const vector signed short v_diff89 = vec_sub(vb8, vb9);
  const vector unsigned short v_cmp89 =
    (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
  
  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
  const vector signed short temp02 = vec_add(vb2, vb3);
  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
  const vector signed short v_sumsB0 = vec_add(temp02, temp03);

  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
  const vector signed short v_sumsB1 = vec_add(temp11, vb4);

  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
  const vector signed short v_sumsB2 = vec_add(temp21, vb5);

  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
  const vector signed short v_sumsB3 = vec_add(temp31, vb6);

  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
  const vector signed short v_sumsB4 = vec_add(temp41, vb7);

  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
  const vector signed short v_sumsB5 = vec_add(temp51, vb8);

  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
  const vector signed short v_sumsB6 = vec_add(temp61, v_last);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -