postprocess_altivec_template.c

来自「这是著名的TCPMP播放器在WINDOWS和WINCE下编译通过的源程序.笔」· C语言代码 · 共 1,197 行 · 第 1/4 页
1,197 行

  const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
  const vector signed short v_sumsB7 = vec_add(temp71, v_last);

  const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
  const vector signed short v_sumsB8 = vec_add(temp81, v_last);

  const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
  const vector signed short v_sumsB9 = vec_add(temp91, v_last);

#define COMPUTE_VR(i, j, k)						\
  const vector signed short temps1##i =					\
    vec_add(v_sumsB##i, v_sumsB##k);					\
  const vector signed short temps2##i =					\
    vec_mladd(vb##j, (vector signed short)v_2, temps1##i);		\
  const vector signed short  vr##j = vec_sra(temps2##i, v_4)

  COMPUTE_VR(0, 1, 2);
  COMPUTE_VR(1, 2, 3);
  COMPUTE_VR(2, 3, 4);
  COMPUTE_VR(3, 4, 5);
  COMPUTE_VR(4, 5, 6);
  COMPUTE_VR(5, 6, 7);
  COMPUTE_VR(6, 7, 8);
  COMPUTE_VR(7, 8, 9);

  const vector signed char neg1 = vec_splat_s8(-1);
  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
								      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define PACK_AND_STORE(i)					\
  const vector unsigned char perms##i =				\
    vec_lvsr(i * stride, src2);					\
  const vector unsigned char vf##i =				\
    vec_packsu(vr##i, (vector signed short)zero);		\
  const vector unsigned char vg##i =				\
    vec_perm(vf##i, vbT##i, permHH);				\
  const vector unsigned char mask##i =				\
    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i);	\
  const vector unsigned char vg2##i =				\
    vec_perm(vg##i, vg##i, perms##i);				\
  const vector unsigned char svA##i =				\
    vec_sel(vbA##i, vg2##i, mask##i);				\
  const vector unsigned char svB##i =				\
    vec_sel(vg2##i, vbB##i, mask##i);				\
  vec_st(svA##i, i * stride, src2);				\
  vec_st(svB##i, i * stride + 16, src2)

#define PACK_AND_STORE_ALIGNED(i)				\
  const vector unsigned char vf##i =				\
    vec_packsu(vr##i, (vector signed short)zero);		\
  const vector unsigned char vg##i =				\
    vec_perm(vf##i, vbT##i, permHH);				\
  vec_st(vg##i, i * stride, src2)

  // special casing the aligned case is worthwhile, as all call from
  // the (transposed) horizontable deblocks will be aligned, in addition
  // to the naturraly aligned vertical deblocks.
  if (properStride && srcAlign) {
    PACK_AND_STORE_ALIGNED(1);
    PACK_AND_STORE_ALIGNED(2);
    PACK_AND_STORE_ALIGNED(3);
    PACK_AND_STORE_ALIGNED(4);
    PACK_AND_STORE_ALIGNED(5);
    PACK_AND_STORE_ALIGNED(6);
    PACK_AND_STORE_ALIGNED(7);
    PACK_AND_STORE_ALIGNED(8);
  } else {
    PACK_AND_STORE(1);
    PACK_AND_STORE(2);
    PACK_AND_STORE(3);
    PACK_AND_STORE(4);
    PACK_AND_STORE(5);
    PACK_AND_STORE(6);
    PACK_AND_STORE(7);
    PACK_AND_STORE(8);
  }
#undef PACK_AND_STORE
#undef PACK_AND_STORE_ALIGNED
}



/**
 * Default vertical deblocking filter, AltiVec version.
 *
 * Reads the eight rows l1..l8 located at src + (3 + i) * stride (i = 1..8),
 * using the first 8 bytes of each row widened to 16-bit lanes, and
 * rewrites rows l4 and l5 only:
 *
 *   mE = 5*(l5 - l4) + 2*(l3 - l6)                 (middle energy)
 *   lE = 5*(l3 - l2) + 2*(l1 - l4)                 (left energy)
 *   rE = 5*(l7 - l6) + 2*(l5 - l8)                 (right energy)
 *   d  = (5 * max(|mE| - min(|lE|, |rE|), 0) + 32) >> 6
 *   d takes the sign opposite to mE
 *   q  = (l4 - l5) / 2, rounded toward zero
 *   d is clamped to [0, q] when q > 0, to [q, 0] otherwise
 *   d is forced to 0 in every lane where |mE| >= 8 * c->QP
 *   l4 -= d;  l5 += d                              (saturated to 8 bit)
 *
 * @param src    first row of the area; no alignment is assumed
 * @param stride byte distance between two consecutive rows; need not be
 *               a multiple of 16
 * @param c      context; only c->QP is read
 *
 * Each row is loaded as, and stored back as, two 16-byte aligned
 * vectors; bytes that are not part of the 8 filtered pixels are written
 * back with the values that were loaded.
 */
static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true. Quite a lot of loads/stores
    could be removed by assuming proper alignment of
    src & stride :-(
  */
  uint8_t *src2 = src;
  const vector signed int zero = vec_splat_s32(0);
  /* Zero-initialise the whole array: vec_ld() below reads all eight
     elements, and reading the seven that were never written would be a
     read of indeterminate values. Only element 0 is actually used. */
  short __attribute__ ((aligned(16))) qp[8] = { 0 };
  qp[0] = 8*c->QP;
  vector signed short vqp = vec_ld(0, qp);
  /* broadcast the threshold 8*QP to every 16-bit lane */
  vqp = vec_splat(vqp, 0);

  /* LOAD_LINE(i): unaligned load of row i.
       perm##i      - realignment permutation for the row address
       vbA##i/vbB##i - the two aligned vectors straddling the row
       vbT##i       - the 16 row bytes, realigned
       vb##i        - the high half of vbT##i merged with zero bytes,
                      viewed as eight signed 16-bit lanes */
#define LOAD_LINE(i)                                                    \
  const vector unsigned char perm##i =                                  \
    vec_lvsl(i * stride, src2);                                         \
  const vector unsigned char vbA##i =                                   \
    vec_ld(i * stride, src2);                                           \
  const vector unsigned char vbB##i =                                   \
    vec_ld(i * stride + 16, src2);                                      \
  const vector unsigned char vbT##i =                                   \
    vec_perm(vbA##i, vbB##i, perm##i);                                  \
  const vector signed short vb##i =                                     \
    (vector signed short)vec_mergeh((vector unsigned char)zero,         \
                                    (vector unsigned char)vbT##i)

  /* all row offsets below are relative to the fourth row of src */
  src2 += stride*3;

  LOAD_LINE(1);
  LOAD_LINE(2);
  LOAD_LINE(3);
  LOAD_LINE(4);
  LOAD_LINE(5);
  LOAD_LINE(6);
  LOAD_LINE(7);
  LOAD_LINE(8);
#undef LOAD_LINE

  const vector signed short v_1 = vec_splat_s16(1);
  const vector signed short v_2 = vec_splat_s16(2);
  const vector signed short v_5 = vec_splat_s16(5);
  /* 32 is built as 1 << 5 */
  const vector signed short v_32 = vec_sl(v_1,
                                          (vector unsigned short)v_5);
  /* middle energy: mE = 5*(l5 - l4) + 2*(l3 - l6) */
  const vector signed short l3minusl6 = vec_sub(vb3, vb6);
  const vector signed short l5minusl4 = vec_sub(vb5, vb4);
  const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
  const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
  const vector signed short absmE = vec_abs(mE);
  /* left & right energy:
     lE = 5*(l3 - l2) + 2*(l1 - l4), rE = 5*(l7 - l6) + 2*(l5 - l8) */
  const vector signed short l1minusl4 = vec_sub(vb1, vb4);
  const vector signed short l3minusl2 = vec_sub(vb3, vb2);
  const vector signed short l5minusl8 = vec_sub(vb5, vb8);
  const vector signed short l7minusl6 = vec_sub(vb7, vb6);
  const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
  const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
  const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
  const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
  /* d = (5 * max(|mE| - min(|lE|, |rE|), 0) + 32) >> 6 */
  const vector signed short ddiff = vec_sub(absmE,
                                            vec_min(vec_abs(lE),
                                                    vec_abs(rE)));
  const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
  const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
  const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
  const vector signed short minusd = vec_sub((vector signed short)zero, d);
  /* keep +d in the lanes where -mE > 0, use -d in all other lanes */
  const vector signed short finald = vec_sel(minusd,
                                             d,
                                             vec_cmpgt(vec_sub((vector signed short)zero, mE),
                                                       (vector signed short)zero));
  /* q = (l4 - l5) / 2 */
  const vector signed short qtimes2 = vec_sub(vb4, vb5);
  /* for a shift right to behave like /2, we need to add one
     to all negative integers */
  const vector signed short rounddown = vec_sel((vector signed short)zero,
                                                v_1,
                                                vec_cmplt(qtimes2, (vector signed short)zero));
  const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
  /* clamp: to [0, q] in lanes where q > 0, to [q, 0] elsewhere */
  const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
  const vector signed short dclamp_P = vec_min(dclamp_P1, q);
  const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
  const vector signed short dclamp_N = vec_max(dclamp_N1, q);

  const vector signed short dclampedfinal = vec_sel(dclamp_N,
                                                    dclamp_P,
                                                    vec_cmpgt(q, (vector signed short)zero));
  /* the correction is applied only in lanes where |mE| < 8*QP */
  const vector signed short dornotd = vec_sel((vector signed short)zero,
                                              dclampedfinal,
                                              vec_cmplt(absmE, vqp));
  /* add/subtract to l4 and l5 */
  const vector signed short vb4minusd = vec_sub(vb4, dornotd);
  const vector signed short vb5plusd = vec_add(vb5, dornotd);
  /* finally, stores: pack back to unsigned bytes with saturation */
  const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
  const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);

  const vector signed char neg1 = vec_splat_s8(-1);
  /* bytes 0-7 from the first operand, bytes 8-15 from the second one */
  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

  /* STORE(i): unaligned store of row i.
       vg##i          - 8 filtered bytes followed by the 8 untouched
                        bytes of the loaded row
       perms##i       - store-side realignment permutation
       mask##i        - selects which bytes of each aligned vector
                        belong to the row
       svA##i/svB##i  - the two aligned vectors with the row merged in;
                        remaining bytes keep the values loaded earlier */
#define STORE(i)                                                \
  const vector unsigned char perms##i =                         \
    vec_lvsr(i * stride, src2);                                 \
  const vector unsigned char vg##i =                            \
    vec_perm(st##i, vbT##i, permHH);                            \
  const vector unsigned char mask##i =                          \
    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
  const vector unsigned char vg2##i =                           \
    vec_perm(vg##i, vg##i, perms##i);                           \
  const vector unsigned char svA##i =                           \
    vec_sel(vbA##i, vg2##i, mask##i);                           \
  const vector unsigned char svB##i =                           \
    vec_sel(vg2##i, vbB##i, mask##i);                           \
  vec_st(svA##i, i * stride, src2);                             \
  vec_st(svB##i, i * stride + 16, src2)

  STORE(4);
  STORE(5);
}

static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true. Quite a lot of load/stores
    can be removed by assuming proper alignement of
    src & stride :-(
  */
  uint8_t *srcCopy = src;
  uint8_t __attribute__((aligned(16))) dt[16];
  const vector unsigned char vuint8_1 = vec_splat_u8(1);
  const vector signed int zero = vec_splat_s32(0);
  vector unsigned char v_dt;
  dt[0] = deringThreshold;
  v_dt = vec_splat(vec_ld(0, dt), 0);

#define LOAD_LINE(i)							\
  const vector unsigned char perm##i =					\
    vec_lvsl(i * stride, srcCopy);					\
  vector unsigned char sA##i = vec_ld(i * stride, srcCopy);		\
  vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);	\
  vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
	
  LOAD_LINE(0);
  LOAD_LINE(1);
  LOAD_LINE(2);
  LOAD_LINE(3);
  LOAD_LINE(4);
  LOAD_LINE(5);
  LOAD_LINE(6);
  LOAD_LINE(7);
  LOAD_LINE(8);
  LOAD_LINE(9);
#undef LOAD_LINE

  vector unsigned char v_avg;
  {
    const vector unsigned char trunc_perm = (vector unsigned char)
      AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
	  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
    const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
    const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
    const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
    const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
	  
#define EXTRACT(op) do {						\
      const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
      const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
      const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
      const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
      const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \
      const vector unsigned char s##op##_9 = vec_##op(s##op##_8h, s##op##_8l); \
      const vector unsigned char s##op##_9h = vec_mergeh(s##op##_9, s##op##_9); \
      const vector unsigned char s##op##_9l = vec_mergel(s##op##_9, s##op##_9); \
      const vector unsigned char s##op##_10 = vec_##op(s##op##_9h, s##op##_9l); \
      const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
      const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
      const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \
      const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
      const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
      v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)
	  
    vector unsigned char v_min;
    vector unsigned char v_max;
    EXTRACT(min);
    EXTRACT(max);
#undef EXTRACT
	  
    if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
      return;
	  
    v_avg = vec_avg(v_min, v_max);
  }
	
  signed int __attribute__((aligned(16))) S[8];
  {
    const vector unsigned short mask1 = (vector unsigned short)
      AVV(0x0001, 0x0002, 0x0004, 0x0008,
	  0x0010, 0x0020, 0x0040, 0x0080);
    const vector unsigned short mask2 = (vector unsigned short)
      AVV(0x0100, 0x0200, 0x0000, 0x0000,
	  0x0000, 0x0000, 0x0000, 0x0000);
	  
    const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
    const vector unsigned int vuint32_1 = vec_splat_u32(1);
	  
#define COMPARE(i)							\
    vector signed int sum##i;						\
    do {								\
      const vector unsigned char cmp##i =				\
	(vector unsigned char)vec_cmpgt(src##i, v_avg);			\
      const vector unsigned short cmpHi##i =				\
postprocess_altivec_template.c - 源码说明

本页面展示了「这是著名的TCPMP播放器在WINDOWS和WINCE下编译通过的源程序.笔者对其中的LIBMAD库做了针对ARM MPU的优化. 并增加了字幕功能.」中的 postprocess_altivec_template.c 源码文件，采用 C语言编程语言编写，共 1,197 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与WINDOWS相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?