postprocess_altivec_template.c.svn-base
From the latest ffmpeg sources (SVN base copy). Page 1 of 4 of the listing.
        /* Sliding-window line sums: each v_sumsB[n] is the previous sum
         * with the oldest line dropped and the next line added (the
         * rounding bias v_4 was folded in via temp03). */
        const vector signed short temp02 = vec_add(vb2, vb3);
        const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
        const vector signed short v_sumsB0 = vec_add(temp02, temp03);
        const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
        const vector signed short v_sumsB1 = vec_add(temp11, vb4);
        const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
        const vector signed short v_sumsB2 = vec_add(temp21, vb5);
        const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
        const vector signed short v_sumsB3 = vec_add(temp31, vb6);
        const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
        const vector signed short v_sumsB4 = vec_add(temp41, vb7);
        const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
        const vector signed short v_sumsB5 = vec_add(temp51, vb8);
        const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
        const vector signed short v_sumsB6 = vec_add(temp61, v_last);
        const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
        const vector signed short v_sumsB7 = vec_add(temp71, v_last);
        const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
        const vector signed short v_sumsB8 = vec_add(temp81, v_last);
        const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
        const vector signed short v_sumsB9 = vec_add(temp91, v_last);

#define COMPUTE_VR(i, j, k)                                             \
        const vector signed short temps1##i =                           \
            vec_add(v_sumsB##i, v_sumsB##k);                            \
        const vector signed short temps2##i =                           \
            vec_mladd(vb##j, (vector signed short)v_2, temps1##i);      \
        const vector signed short vr##j = vec_sra(temps2##i, v_4)

        COMPUTE_VR(0, 1, 2);
        COMPUTE_VR(1, 2, 3);
        COMPUTE_VR(2, 3, 4);
        COMPUTE_VR(3, 4, 5);
        COMPUTE_VR(4, 5, 6);
        COMPUTE_VR(5, 6, 7);
        COMPUTE_VR(6, 7, 8);
        COMPUTE_VR(7, 8, 9);

        const vector signed char neg1 = vec_splat_s8(-1);
        const vector unsigned char permHH = (const vector unsigned char)
            AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define PACK_AND_STORE(i)                                               \
{   const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2);}

#define PACK_AND_STORE_ALIGNED(i)                                       \
{   const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    vec_st(vg##i, i * stride, src2);}
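        /* A reading of the unaligned PACK_AND_STORE above: permHH keeps
         * the 8 filtered bytes of vf and the untouched upper 8 bytes of
         * the original line vbT; vec_lvsr gives the rotate for the
         * misaligned address, and the 0x00/0xFF mask built from neg1
         * lets vec_sel splice the rotated result into the two aligned
         * quadwords vbA/vbB loaded earlier, so the two vec_st calls are
         * an aligned read-modify-write that only changes this line's
         * bytes. */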
        /* Special-casing the aligned case is worthwhile, as all calls from
         * the (transposed) horizontal deblocks will be aligned, in addition
         * to the naturally aligned vertical deblocks. */
        if (properStride && srcAlign) {
            PACK_AND_STORE_ALIGNED(1)
            PACK_AND_STORE_ALIGNED(2)
            PACK_AND_STORE_ALIGNED(3)
            PACK_AND_STORE_ALIGNED(4)
            PACK_AND_STORE_ALIGNED(5)
            PACK_AND_STORE_ALIGNED(6)
            PACK_AND_STORE_ALIGNED(7)
            PACK_AND_STORE_ALIGNED(8)
        } else {
            PACK_AND_STORE(1)
            PACK_AND_STORE(2)
            PACK_AND_STORE(3)
            PACK_AND_STORE(4)
            PACK_AND_STORE(5)
            PACK_AND_STORE(6)
            PACK_AND_STORE(7)
            PACK_AND_STORE(8)
        }
#undef PACK_AND_STORE
#undef PACK_AND_STORE_ALIGNED
    }
}

static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
    /* This code makes no assumptions about src or stride.
     * The recomputation of the perm vector could be removed by assuming
     * (stride % 16) == 0, but unfortunately that is not always true.
     * Quite a lot of loads/stores could be removed by assuming proper
     * alignment of src & stride :-( */
    uint8_t *src2 = src + stride * 3;
    const vector signed int zero = vec_splat_s32(0);
    DECLARE_ALIGNED(16, short, qp[8]) = {8*c->QP};
    vector signed short vqp = vec_splat((vector signed short)vec_ld(0, qp), 0);

#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, src2);                                     \
    const vector unsigned char vbA##i =                                 \
        vec_ld(i * stride, src2);                                       \
    const vector unsigned char vbB##i =                                 \
        vec_ld(i * stride + 16, src2);                                  \
    const vector unsigned char vbT##i =                                 \
        vec_perm(vbA##i, vbB##i, perm##i);                              \
    const vector signed short vb##i =                                   \
        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
                                        (vector unsigned char)vbT##i)

    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
#undef LOAD_LINE

    const vector signed short v_1 = vec_splat_s16(1);
    const vector signed short v_2 = vec_splat_s16(2);
    const vector signed short v_5 = vec_splat_s16(5);
    const vector signed short v_32 = vec_sl(v_1,
                                            (vector unsigned short)v_5);

    /* middle energy */
    const vector signed short l3minusl6 = vec_sub(vb3, vb6);
    const vector signed short l5minusl4 = vec_sub(vb5, vb4);
    const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
    const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
    const vector signed short absmE = vec_abs(mE);
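    /* Deblock decision, following the scalar postprocess filter: the
     * edge sits between l4 and l5, mE = 5*(l5-l4) + 2*(l3-l6) is the
     * energy across it, and lE/rE below are the same measure one step
     * to either side.  The correction
     *     d = (5 * max(|mE| - min(|lE|, |rE|), 0) + 32) >> 6,
     * with its sign taken from -mE and clamped to q = (l4-l5)/2, is
     * applied only where |mE| < 8*QP (vqp above). */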
    /* left & right energy */
    const vector signed short l1minusl4 = vec_sub(vb1, vb4);
    const vector signed short l3minusl2 = vec_sub(vb3, vb2);
    const vector signed short l5minusl8 = vec_sub(vb5, vb8);
    const vector signed short l7minusl6 = vec_sub(vb7, vb6);
    const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
    const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
    const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
    const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);

    /* d */
    const vector signed short ddiff = vec_sub(absmE,
                                              vec_min(vec_abs(lE),
                                                      vec_abs(rE)));
    const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
    const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
    const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
    const vector signed short minusd = vec_sub((vector signed short)zero, d);
    const vector signed short finald = vec_sel(minusd,
                                               d,
                                               vec_cmpgt(vec_sub((vector signed short)zero, mE),
                                                         (vector signed short)zero));

    /* q */
    const vector signed short qtimes2 = vec_sub(vb4, vb5);
    /* for a shift right to behave like /2, we need to add one
       to all negative integers */
    const vector signed short rounddown = vec_sel((vector signed short)zero,
                                                  v_1,
                                                  vec_cmplt(qtimes2, (vector signed short)zero));
    const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));

    /* clamp */
    const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
    const vector signed short dclamp_P = vec_min(dclamp_P1, q);
    const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
    const vector signed short dclamp_N = vec_max(dclamp_N1, q);

    const vector signed short dclampedfinal = vec_sel(dclamp_N,
                                                      dclamp_P,
                                                      vec_cmpgt(q, (vector signed short)zero));
    const vector signed short dornotd = vec_sel((vector signed short)zero,
                                                dclampedfinal,
                                                vec_cmplt(absmE, vqp));

    /* add/subtract to l4 and l5 */
    const vector signed short vb4minusd = vec_sub(vb4, dornotd);
    const vector signed short vb5plusd  = vec_add(vb5, dornotd);

    /* finally, stores */
    const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
    const vector unsigned char st5 = vec_packsu(vb5plusd,  (vector signed short)zero);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define STORE(i)                                                        \
{   const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vg##i =                                  \
        vec_perm(st##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2);}
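    /* Only l4 and l5, the two rows adjacent to the edge, were modified
     * (l4 -= d, l5 += d above), so only those two lines are written
     * back; STORE uses the same lvsr/mask/vec_sel splice as
     * PACK_AND_STORE earlier to leave the surrounding bytes of each
     * aligned quadword untouched. */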
    STORE(4)
    STORE(5)
}

static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
    /* This code makes no assumptions about src or stride.
     * The recomputation of the perm vector could be removed by assuming
     * (stride % 16) == 0, but unfortunately that is not always true.
     * Quite a lot of loads/stores could be removed by assuming proper
     * alignment of src & stride :-( */
    uint8_t *srcCopy = src;
    DECLARE_ALIGNED(16, uint8_t, dt[16]);
    const vector signed int zero = vec_splat_s32(0);
    vector unsigned char v_dt;
    dt[0] = deringThreshold;
    v_dt = vec_splat(vec_ld(0, dt), 0);

#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, srcCopy);                                  \
    vector unsigned char sA##i = vec_ld(i * stride, srcCopy);           \
    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);      \
    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
    LOAD_LINE(9);
#undef LOAD_LINE

    vector unsigned char v_avg;
    {
    const vector unsigned char trunc_perm = (vector unsigned char)
        AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
            0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
    const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
    const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
    const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
    const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);

#define EXTRACT(op) do {                                                \
    const vector unsigned char s##op##_1   = vec_##op(trunc_src12, trunc_src34); \
    const vector unsigned char s##op##_2   = vec_##op(trunc_src56, trunc_src78); \
    const vector unsigned char s##op##_6   = vec_##op(s##op##_1, s##op##_2);     \
    const vector unsigned char s##op##_8h  = vec_mergeh(s##op##_6, s##op##_6);   \
    const vector unsigned char s##op##_8l  = vec_mergel(s##op##_6, s##op##_6);   \
    const vector unsigned char s##op##_9   = vec_##op(s##op##_8h, s##op##_8l);   \
    const vector unsigned char s##op##_9h  = vec_mergeh(s##op##_9, s##op##_9);   \
    const vector unsigned char s##op##_9l  = vec_mergel(s##op##_9, s##op##_9);   \
    const vector unsigned char s##op##_10  = vec_##op(s##op##_9h, s##op##_9l);   \
    const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
    const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
    const vector unsigned char s##op##_11  = vec_##op(s##op##_10h, s##op##_10l); \
    const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
    const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
    v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)
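    /* EXTRACT computes a block-wide min or max of the 8x8 block: the
     * first three vec_##op calls fold the eight truncated lines into
     * one vector, then each mergeh/mergel + vec_##op round halves the
     * number of distinct lanes, so after four rounds every byte of the
     * result holds the extremum of the whole block. */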
    vector unsigned char v_min;
    vector unsigned char v_max;
    EXTRACT(min);
    EXTRACT(max);
#undef EXTRACT

    /* if the block's dynamic range is below the threshold, there is
       nothing to dering */
    if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
        return;

    v_avg = vec_avg(v_min, v_max);
    }

    DECLARE_ALIGNED(16, signed int, S[8]);
    {
    const vector unsigned short mask1 = (vector unsigned short)
        AVV(0x0001, 0x0002, 0x0004, 0x0008,
