
📄 postprocess_altivec_template.c

📁 FFmpeg source code analysis
💻 C
📖 Page 1 of 4
/*
    AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
    based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif

#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
  do {                                                                  \
    __typeof__(src_a) tempA1, tempB1, tempC1, tempD1;                   \
    __typeof__(src_a) tempE1, tempF1, tempG1, tempH1;                   \
    __typeof__(src_a) tempA2, tempB2, tempC2, tempD2;                   \
    __typeof__(src_a) tempE2, tempF2, tempG2, tempH2;                   \
    tempA1 = vec_mergeh (src_a, src_e);                                 \
    tempB1 = vec_mergel (src_a, src_e);                                 \
    tempC1 = vec_mergeh (src_b, src_f);                                 \
    tempD1 = vec_mergel (src_b, src_f);                                 \
    tempE1 = vec_mergeh (src_c, src_g);                                 \
    tempF1 = vec_mergel (src_c, src_g);                                 \
    tempG1 = vec_mergeh (src_d, src_h);                                 \
    tempH1 = vec_mergel (src_d, src_h);                                 \
    tempA2 = vec_mergeh (tempA1, tempE1);                               \
    tempB2 = vec_mergel (tempA1, tempE1);                               \
    tempC2 = vec_mergeh (tempB1, tempF1);                               \
    tempD2 = vec_mergel (tempB1, tempF1);                               \
    tempE2 = vec_mergeh (tempC1, tempG1);                               \
    tempF2 = vec_mergel (tempC1, tempG1);                               \
    tempG2 = vec_mergeh (tempD1, tempH1);                               \
    tempH2 = vec_mergel (tempD1, tempH1);                               \
    src_a = vec_mergeh (tempA2, tempE2);                                \
    src_b = vec_mergel (tempA2, tempE2);                                \
    src_c = vec_mergeh (tempB2, tempF2);                                \
    src_d = vec_mergel (tempB2, tempF2);                                \
    src_e = vec_mergeh (tempC2, tempG2);                                \
    src_f = vec_mergel (tempC2, tempG2);                                \
    src_g = vec_mergeh (tempD2, tempH2);                                \
    src_h = vec_mergel (tempD2, tempH2);                                \
  } while (0)

static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true.
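
    The loop below counts, over the eight columns of the block, how many
    vertically adjacent sample pairs differ by no more than the DC offset
    derived from nonBQP and baseDcDiff. If that count exceeds
    ppMode.flatnessThreshold, a second test checks that the remaining
    line-to-line differences stay within 2*QP: the function returns 1
    (flat block) when they do, 0 when they do not, and 2 when the block
    was not flat to begin with.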
  */
  register int y;
  short __attribute__ ((aligned(16))) data[8];
  int numEq;
  uint8_t *src2 = src;
  vector signed short v_dcOffset;
  vector signed short v2QP;
  vector unsigned short v4QP;
  vector unsigned short v_dcThreshold;
  const int properStride = (stride % 16);
  const int srcAlign = ((unsigned long)src2 % 16);
  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
  const vector signed int zero = vec_splat_s32(0);
  const vector signed short mask = vec_splat_s16(1);
  vector signed int v_numEq = vec_splat_s32(0);

  data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
  data[1] = data[0] * 2 + 1;
  data[2] = c->QP * 2;
  data[3] = c->QP * 4;
  vector signed short v_data = vec_ld(0, data);
  v_dcOffset = vec_splat(v_data, 0);
  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
  v2QP = vec_splat(v_data, 2);
  v4QP = (vector unsigned short)vec_splat(v_data, 3);

  src2 += stride * 4;

  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
                      v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;

#define LOAD_LINE(i)                                                    \
  register int j##i = i * stride;                                       \
  vector unsigned char perm##i = vec_lvsl(j##i, src2);                  \
  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);           \
  vector unsigned char v_srcA2##i;                                      \
  if (two_vectors)                                                      \
    v_srcA2##i = vec_ld(j##i + 16, src2);                               \
  const vector unsigned char v_srcA##i =                                \
    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                          \
  v_srcAss##i =                                                         \
    (vector signed short)vec_mergeh((vector signed char)zero,           \
                                    (vector signed char)v_srcA##i)

#define LOAD_LINE_ALIGNED(i)                                            \
  register int j##i = i * stride;                                       \
  const vector unsigned char v_srcA##i = vec_ld(j##i, src2);            \
  v_srcAss##i =                                                         \
    (vector signed short)vec_mergeh((vector signed char)zero,           \
                                    (vector signed char)v_srcA##i)

    // Special-casing the aligned case is worthwhile, as all calls from
    // the (transposed) horizontal deblocks will be aligned, in addition
    // to the naturally aligned vertical deblocks.
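    // LOAD_LINE handles arbitrary alignment: vec_lvsl() builds a permute
    // vector from the address, and vec_perm() shifts one or two aligned
    // vec_ld() results into place before the bytes are widened to shorts.
    // LOAD_LINE_ALIGNED assumes the line starts on a 16-byte boundary
    // and needs only a single vec_ld().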
    if (properStride && srcAlign) {
      LOAD_LINE_ALIGNED(0);
      LOAD_LINE_ALIGNED(1);
      LOAD_LINE_ALIGNED(2);
      LOAD_LINE_ALIGNED(3);
      LOAD_LINE_ALIGNED(4);
      LOAD_LINE_ALIGNED(5);
      LOAD_LINE_ALIGNED(6);
      LOAD_LINE_ALIGNED(7);
    } else {
      LOAD_LINE(0);
      LOAD_LINE(1);
      LOAD_LINE(2);
      LOAD_LINE(3);
      LOAD_LINE(4);
      LOAD_LINE(5);
      LOAD_LINE(6);
      LOAD_LINE(7);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

#define ITER(i, j)                                                      \
  const vector signed short v_diff##i =                                 \
    vec_sub(v_srcAss##i, v_srcAss##j);                                  \
  const vector signed short v_sum##i =                                  \
    vec_add(v_diff##i, v_dcOffset);                                     \
  const vector signed short v_comp##i =                                 \
    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,     \
                                   v_dcThreshold);                      \
  const vector signed short v_part##i = vec_and(mask, v_comp##i);       \
  v_numEq = vec_sum4s(v_part##i, v_numEq);

  ITER(0, 1);
  ITER(1, 2);
  ITER(2, 3);
  ITER(3, 4);
  ITER(4, 5);
  ITER(5, 6);
  ITER(6, 7);
#undef ITER

  v_numEq = vec_sums(v_numEq, zero);
  v_numEq = vec_splat(v_numEq, 3);
  vec_ste(v_numEq, 0, &numEq);

  if (numEq > c->ppMode.flatnessThreshold)
    {
      const vector unsigned char mmoP1 = (const vector unsigned char)
        AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
            0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
      const vector unsigned char mmoP2 = (const vector unsigned char)
        AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
            0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
      const vector unsigned char mmoP = (const vector unsigned char)
        vec_lvsl(8, (unsigned char*)0);
      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
      if (vec_any_gt(mmoSum, v4QP))
        return 0;
      else
        return 1;
    }
  else return 2;
}

static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true.
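    (The perm vector mentioned here is the vec_lvsl() result that the
    LOAD_LINE macro below recomputes for every line; AltiVec has no
    unaligned loads, so each line is fetched as two aligned vec_ld()s
    and shifted into place with vec_perm().)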
    Quite a lot of loads/stores
    can be removed by assuming proper alignment of
    src & stride :-(
  */
  uint8_t *src2 = src;
  const vector signed int zero = vec_splat_s32(0);
  const int properStride = (stride % 16);
  const int srcAlign = ((unsigned long)src2 % 16);
  short __attribute__ ((aligned(16))) qp[8];
  qp[0] = c->QP;
  vector signed short vqp = vec_ld(0, qp);
  vqp = vec_splat(vqp, 0);

  src2 += stride*3;

  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
  vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
  vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;

#define LOAD_LINE(i)                                                    \
  const vector unsigned char perml##i =                                 \
    vec_lvsl(i * stride, src2);                                         \
  vbA##i = vec_ld(i * stride, src2);                                    \
  vbB##i = vec_ld(i * stride + 16, src2);                               \
  vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                          \
  vb##i =                                                               \
    (vector signed short)vec_mergeh((vector unsigned char)zero,         \
                                    (vector unsigned char)vbT##i)

#define LOAD_LINE_ALIGNED(i)                                            \
  register int j##i = i * stride;                                       \
  vbT##i = vec_ld(j##i, src2);                                          \
  vb##i =                                                               \
    (vector signed short)vec_mergeh((vector signed char)zero,           \
                                    (vector signed char)vbT##i)

    // Special-casing the aligned case is worthwhile, as all calls from
    // the (transposed) horizontal deblocks will be aligned, in addition
    // to the naturally aligned vertical deblocks.
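    // Ten lines are loaded (vb0..vb9): the eight lines of the block plus
    // one extra line above and one below, which only supply the boundary
    // samples v_first and v_last selected further down.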
    if (properStride && srcAlign) {
      LOAD_LINE_ALIGNED(0);
      LOAD_LINE_ALIGNED(1);
      LOAD_LINE_ALIGNED(2);
      LOAD_LINE_ALIGNED(3);
      LOAD_LINE_ALIGNED(4);
      LOAD_LINE_ALIGNED(5);
      LOAD_LINE_ALIGNED(6);
      LOAD_LINE_ALIGNED(7);
      LOAD_LINE_ALIGNED(8);
      LOAD_LINE_ALIGNED(9);
    } else {
      LOAD_LINE(0);
      LOAD_LINE(1);
      LOAD_LINE(2);
      LOAD_LINE(3);
      LOAD_LINE(4);
      LOAD_LINE(5);
      LOAD_LINE(6);
      LOAD_LINE(7);
      LOAD_LINE(8);
      LOAD_LINE(9);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

  const vector unsigned short v_1 = vec_splat_u16(1);
  const vector unsigned short v_2 = vec_splat_u16(2);
  const vector unsigned short v_4 = vec_splat_u16(4);

  const vector signed short v_diff01 = vec_sub(vb0, vb1);
  const vector unsigned short v_cmp01 =
    (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
  const vector signed short v_diff89 = vec_sub(vb8, vb9);
  const vector unsigned short v_cmp89 =
    (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);

  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
  const vector signed short temp02 = vec_add(vb2, vb3);
  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
  const vector signed short v_sumsB0 = vec_add(temp02, temp03);

  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
  const vector signed short v_sumsB1 = vec_add(temp11, vb4);

  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
  const vector signed short v_sumsB2 = vec_add(temp21, vb5);

  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
  const vector signed short v_sumsB3 = vec_add(temp31, vb6);

  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
  const vector signed short v_sumsB4 = vec_add(temp41, vb7);

  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
  const vector signed short v_sumsB5 = vec_add(temp51, vb8);

  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
  const vector signed short v_sumsB6 = vec_add(temp61, v_last);
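  // The v_sumsB chain above builds the sliding-window sums for the
  // low-pass filter incrementally: each step adds the next input line
  // and drops the sample that leaves the window (v_first stands in for
  // the lines above the block, v_last for the lines below), with the
  // added constant 4 serving as a rounding term for the averaging that
  // follows on the next page of the listing.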
