📄 postprocess_altivec_template.c.svn-base

📁 ffmpeg最新源码
💻 SVN-BASE
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
/*
 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avutil.h"

/*
 * In-place transpose of an 8x8 matrix of 16-bit elements held in the eight
 * vectors src_a .. src_h, done as three rounds of vec_mergeh/vec_mergel.
 * Every argument is both read and written, so each must be a plain vector
 * lvalue; all eight must have the same type as src_a.
 */
#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
    do {                                                          \
        __typeof__(src_a) tempA1, tempB1, tempC1, tempD1;         \
        __typeof__(src_a) tempE1, tempF1, tempG1, tempH1;         \
        __typeof__(src_a) tempA2, tempB2, tempC2, tempD2;         \
        __typeof__(src_a) tempE2, tempF2, tempG2, tempH2;         \
        tempA1 = vec_mergeh (src_a, src_e);                       \
        tempB1 = vec_mergel (src_a, src_e);                       \
        tempC1 = vec_mergeh (src_b, src_f);                       \
        tempD1 = vec_mergel (src_b, src_f);                       \
        tempE1 = vec_mergeh (src_c, src_g);                       \
        tempF1 = vec_mergel (src_c, src_g);                       \
        tempG1 = vec_mergeh (src_d, src_h);                       \
        tempH1 = vec_mergel (src_d, src_h);                       \
        tempA2 = vec_mergeh (tempA1, tempE1);                     \
        tempB2 = vec_mergel (tempA1, tempE1);                     \
        tempC2 = vec_mergeh (tempB1, tempF1);                     \
        tempD2 = vec_mergel (tempB1, tempF1);                     \
        tempE2 = vec_mergeh (tempC1, tempG1);                     \
        tempF2 = vec_mergel (tempC1, tempG1);                     \
        tempG2 = vec_mergeh (tempD1, tempH1);                     \
        tempH2 = vec_mergel (tempD1, tempH1);                     \
        src_a = vec_mergeh (tempA2, tempE2);                      \
        src_b = vec_mergel (tempA2, tempE2);                      \
        src_c = vec_mergeh (tempB2, tempF2);                      \
        src_d = vec_mergel (tempB2, tempF2);                      \
        src_e = vec_mergeh (tempC2, tempG2);                      \
        src_f = vec_mergel (tempC2, tempG2);                      \
        src_g = vec_mergeh (tempD2, tempH2);                      \
        src_h = vec_mergel (tempD2, tempH2);                      \
    } while (0)

/**
 * Classify the 8 lines starting at src + 4*stride (8 bytes taken per line).
 *
 * For each of the 7 pairs of vertically adjacent lines, every column where
 * (line[i] - line[i+1] + dcOffset), viewed as unsigned, is below dcThreshold
 * contributes 1 to a counter (numEq).
 *
 * @param src    pointer to the pixel data; lines at src + 4*stride onwards are read
 * @param stride distance in bytes between two consecutive lines
 * @param c      context; reads nonBQP, QP, ppMode.baseDcDiff and
 *               ppMode.flatnessThreshold
 * @return 2 if numEq does not exceed c->ppMode.flatnessThreshold;
 *         otherwise 0 if any lane of (mmoL - mmoR + 2*QP), viewed as unsigned,
 *         exceeds 4*QP, and 1 if none does.
 */
static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
    vector by assuming (stride % 16) == 0, unfortunately
    this is not always true.
    */
    /* Computed up front: an initializer list must not read back an element
     * of the very object it is initializing (the evaluation order of the
     * initializers is not defined). Kept as short to match the element type. */
    const short dcOffset = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
    DECLARE_ALIGNED(16, short, data[8]) =
                    {
                        dcOffset,
                        dcOffset * 2 + 1,
                        c->QP * 2,
                        c->QP * 4
                    };
    int numEq;
    uint8_t *src2 = src;
    vector signed short v_dcOffset;
    vector signed short v2QP;
    vector unsigned short v4QP;
    vector unsigned short v_dcThreshold;
    /* Despite the names, both values are remainders: non-zero means
     * "NOT 16-byte aligned". */
    const int properStride = (stride % 16);
    const int srcAlign = ((unsigned long)src2 % 16);
    /* A second 16-byte load is only needed when the 8 wanted bytes may
     * straddle a 16-byte boundary. */
    const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
    const vector signed int zero = vec_splat_s32(0);
    const vector signed short mask = vec_splat_s16(1);
    vector signed int v_numEq = vec_splat_s32(0);
    vector signed short v_data = vec_ld(0, data);
    vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
                        v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
//FIXME avoid this mess if possible
    register int j0 = 0,
                 j1 = stride,
                 j2 = 2 * stride,
                 j3 = 3 * stride,
                 j4 = 4 * stride,
                 j5 = 5 * stride,
                 j6 = 6 * stride,
                 j7 = 7 * stride;
    vector unsigned char v_srcA0, v_srcA1, v_srcA2, v_srcA3,
                         v_srcA4, v_srcA5, v_srcA6, v_srcA7;

    v_dcOffset = vec_splat(v_data, 0);
    v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
    v2QP = vec_splat(v_data, 2);
    v4QP = (vector unsigned short)vec_splat(v_data, 3);

    src2 += stride * 4;

/* Unaligned load of line i: one or two aligned loads realigned with vec_perm,
 * then the first 8 bytes are interleaved with zero bytes into 16-bit lanes.
 * The second vector starts out as a copy of the first so that vec_perm never
 * reads an uninitialized value when only one load is performed. */
#define LOAD_LINE(i)                                                    \
    {                                                                   \
    vector unsigned char perm##i = vec_lvsl(j##i, src2);                \
    vector unsigned char v_srcA1##i = vec_ld(j##i, src2);               \
    vector unsigned char v_srcA2##i = v_srcA1##i;                       \
    if (two_vectors)                                                    \
        v_srcA2##i = vec_ld(j##i + 16, src2);                           \
    v_srcA##i =                                                         \
        vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                      \
    v_srcAss##i =                                                       \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)v_srcA##i); }

/* Aligned load of line i: a single vec_ld, valid only when the line address
 * is a multiple of 16 (vec_ld ignores the low 4 address bits). */
#define LOAD_LINE_ALIGNED(i)                                            \
    v_srcA##i = vec_ld(j##i, src2);                                     \
    v_srcAss##i =                                                       \
        (vector signed short)vec_mergeh((vector signed char)zero,       \
                                        (vector signed char)v_srcA##i)

    /* Special-casing the aligned case is worthwhile, as all calls from
     * the (transposed) horizontable deblocks will be aligned, in addition
     * to the naturally aligned vertical deblocks. */
    /* Take the fast path only when BOTH remainders are zero; the previous
     * test (properStride && srcAlign) selected it for the doubly misaligned
     * case instead, where the aligned loads fetch the wrong bytes. */
    if (!properStride && !srcAlign) {
        LOAD_LINE_ALIGNED(0);
        LOAD_LINE_ALIGNED(1);
        LOAD_LINE_ALIGNED(2);
        LOAD_LINE_ALIGNED(3);
        LOAD_LINE_ALIGNED(4);
        LOAD_LINE_ALIGNED(5);
        LOAD_LINE_ALIGNED(6);
        LOAD_LINE_ALIGNED(7);
    } else {
        LOAD_LINE(0);
        LOAD_LINE(1);
        LOAD_LINE(2);
        LOAD_LINE(3);
        LOAD_LINE(4);
        LOAD_LINE(5);
        LOAD_LINE(6);
        LOAD_LINE(7);
    }
#undef LOAD_LINE
#undef LOAD_LINE_ALIGNED

/* For the line pair (i, j): v_part##i holds 1 in every lane where
 * (line i - line j + dcOffset), compared as unsigned, is below dcThreshold,
 * and 0 elsewhere. */
#define ITER(i, j)                                                      \
    const vector signed short v_diff##i =                               \
        vec_sub(v_srcAss##i, v_srcAss##j);                              \
    const vector signed short v_sum##i =                                \
        vec_add(v_diff##i, v_dcOffset);                                 \
    const vector signed short v_comp##i =                               \
        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
                                       v_dcThreshold);                  \
    const vector signed short v_part##i = vec_and(mask, v_comp##i);

    {
        ITER(0, 1)
        ITER(1, 2)
        ITER(2, 3)
        ITER(3, 4)
        ITER(4, 5)
        ITER(5, 6)
        ITER(6, 7)

        /* Accumulate the per-lane 0/1 flags of all 7 pairs. */
        v_numEq = vec_sum4s(v_part0, v_numEq);
        v_numEq = vec_sum4s(v_part1, v_numEq);
        v_numEq = vec_sum4s(v_part2, v_numEq);
        v_numEq = vec_sum4s(v_part3, v_numEq);
        v_numEq = vec_sum4s(v_part4, v_numEq);
        v_numEq = vec_sum4s(v_part5, v_numEq);
        v_numEq = vec_sum4s(v_part6, v_numEq);
    }
#undef ITER

    /* Reduce to one total (vec_sums leaves it in element 3), broadcast it,
     * and store a single element into the scalar numEq. */
    v_numEq = vec_sums(v_numEq, zero);
    v_numEq = vec_splat(v_numEq, 3);
    vec_ste(v_numEq, 0, &numEq);

    if (numEq > c->ppMode.flatnessThreshold) {
        /* Permute tables gathering selected 16-bit lanes from pairs of lines
         * into mmoL / mmoR. */
        const vector unsigned char mmoP1 = (const vector unsigned char)
            AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
                0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
        const vector unsigned char mmoP2 = (const vector unsigned char)
            AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
                0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
        const vector unsigned char mmoP = (const vector unsigned char)
            vec_lvsl(8, (unsigned char*)0);

        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);

        if (vec_any_gt(mmoSum, v4QP)) {
            return 0;
        }
        return 1;
    }
    return 2;
}

static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {    /*    this code makes no assumption on src or stride.    One could remove the recomputation of the perm    vector by assuming (stride % 16) == 0, unfortunately    this is not always true. 
Quite a lot of load/stores    can be removed by assuming proper alignment of    src & stride :-(    */    uint8_t *src2 = src;    const vector signed int zero = vec_splat_s32(0);    const int properStride = (stride % 16);    const int srcAlign = ((unsigned long)src2 % 16);    DECLARE_ALIGNED(16, short, qp[8]) = {c->QP};    vector signed short vqp = vec_ld(0, qp);    vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;    vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;    vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;    vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;    vector unsigned char perml0, perml1, perml2, perml3, perml4,                         perml5, perml6, perml7, perml8, perml9;    register int j0 = 0,                 j1 = stride,                 j2 = 2 * stride,                 j3 = 3 * stride,                 j4 = 4 * stride,                 j5 = 5 * stride,                 j6 = 6 * stride,                 j7 = 7 * stride,                 j8 = 8 * stride,                 j9 = 9 * stride;    vqp = vec_splat(vqp, 0);    src2 += stride*3;#define LOAD_LINE(i)                                                    \    perml##i = vec_lvsl(i * stride, src2);                              \    vbA##i = vec_ld(i * stride, src2);                                  \    vbB##i = vec_ld(i * stride + 16, src2);                             \    vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                        \    vb##i =                                                             \        (vector signed short)vec_mergeh((vector unsigned char)zero,     \                                        (vector unsigned char)vbT##i)#define LOAD_LINE_ALIGNED(i)                                            \    vbT##i = vec_ld(j##i, src2);                                        \    vb##i =                                                             \        (vector 
signed short)vec_mergeh((vector signed char)zero,       \                                        (vector signed char)vbT##i)      /* Special-casing the aligned case is worthwhile, as all calls from       * the (transposed) horizontable deblocks will be aligned, in addition       * to the naturally aligned vertical deblocks. */    if (properStride && srcAlign) {          LOAD_LINE_ALIGNED(0);          LOAD_LINE_ALIGNED(1);          LOAD_LINE_ALIGNED(2);          LOAD_LINE_ALIGNED(3);          LOAD_LINE_ALIGNED(4);          LOAD_LINE_ALIGNED(5);          LOAD_LINE_ALIGNED(6);          LOAD_LINE_ALIGNED(7);          LOAD_LINE_ALIGNED(8);          LOAD_LINE_ALIGNED(9);    } else {          LOAD_LINE(0);          LOAD_LINE(1);          LOAD_LINE(2);          LOAD_LINE(3);          LOAD_LINE(4);          LOAD_LINE(5);          LOAD_LINE(6);          LOAD_LINE(7);          LOAD_LINE(8);          LOAD_LINE(9);    }#undef LOAD_LINE#undef LOAD_LINE_ALIGNED    {        const vector unsigned short v_2 = vec_splat_u16(2);        const vector unsigned short v_4 = vec_splat_u16(4);        const vector signed short v_diff01 = vec_sub(vb0, vb1);        const vector unsigned short v_cmp01 =            (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);        const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);        const vector signed short v_diff89 = vec_sub(vb8, vb9);        const vector unsigned short v_cmp89 =            (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);        const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);        const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -