postprocess_altivec_template.c
    const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
    const vector signed short v_sumsB7 = vec_add(temp71, v_last);
    const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
    const vector signed short v_sumsB8 = vec_add(temp81, v_last);
    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
    const vector signed short v_sumsB9 = vec_add(temp91, v_last);

#define COMPUTE_VR(i, j, k)                                             \
    const vector signed short temps1##i =                               \
        vec_add(v_sumsB##i, v_sumsB##k);                                \
    const vector signed short temps2##i =                               \
        vec_mladd(vb##j, (vector signed short)v_2, temps1##i);          \
    const vector signed short vr##j = vec_sra(temps2##i, v_4)

    COMPUTE_VR(0, 1, 2);
    COMPUTE_VR(1, 2, 3);
    COMPUTE_VR(2, 3, 4);
    COMPUTE_VR(3, 4, 5);
    COMPUTE_VR(4, 5, 6);
    COMPUTE_VR(5, 6, 7);
    COMPUTE_VR(6, 7, 8);
    COMPUTE_VR(7, 8, 9);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                                                        0x04, 0x05, 0x06, 0x07,
                                                                        0x18, 0x19, 0x1A, 0x1B,
                                                                        0x1C, 0x1D, 0x1E, 0x1F);

#define PACK_AND_STORE(i)                                               \
    const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero,                            \
                 (vector unsigned char)neg1, perms##i);                 \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2)

#define PACK_AND_STORE_ALIGNED(i)                                      \
    const vector unsigned char vf##i =                                  \
        vec_packsu(vr##i, (vector signed short)zero);                   \
    const vector unsigned char vg##i =                                  \
        vec_perm(vf##i, vbT##i, permHH);                                \
    vec_st(vg##i, i * stride, src2)

    // Special-casing the aligned case is worthwhile, as all calls from
    // the (transposed) horizontal deblocks will be aligned, in addition
    // to the naturally aligned vertical deblocks.
    if (properStride && srcAlign) {
        PACK_AND_STORE_ALIGNED(1);
        PACK_AND_STORE_ALIGNED(2);
        PACK_AND_STORE_ALIGNED(3);
        PACK_AND_STORE_ALIGNED(4);
        PACK_AND_STORE_ALIGNED(5);
        PACK_AND_STORE_ALIGNED(6);
        PACK_AND_STORE_ALIGNED(7);
        PACK_AND_STORE_ALIGNED(8);
    } else {
        PACK_AND_STORE(1);
        PACK_AND_STORE(2);
        PACK_AND_STORE(3);
        PACK_AND_STORE(4);
        PACK_AND_STORE(5);
        PACK_AND_STORE(6);
        PACK_AND_STORE(7);
        PACK_AND_STORE(8);
    }
#undef PACK_AND_STORE
#undef PACK_AND_STORE_ALIGNED
}

static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
      This code makes no assumption on src or stride.
      One could remove the recomputation of the perm
      vector by assuming (stride % 16) == 0; unfortunately
      this is not always true. Quite a lot of loads/stores
      could be removed by assuming proper alignment of
      src & stride :-(
    */
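    /* The filter computes a middle energy mE = 2*(l3-l6) + 5*(l5-l4)
       across the block boundary, plus analogous left and right
       energies, and where |mE| is below 8*QP it moves lines 4 and 5
       toward each other by a clamped correction d, derived below. */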
    uint8_t *src2 = src;
    const vector signed int zero = vec_splat_s32(0);
    short __attribute__ ((aligned(16))) qp[8];
    qp[0] = 8*c->QP;
    vector signed short vqp = vec_ld(0, qp);
    vqp = vec_splat(vqp, 0);

#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, src2);                                     \
    const vector unsigned char vbA##i =                                 \
        vec_ld(i * stride, src2);                                       \
    const vector unsigned char vbB##i =                                 \
        vec_ld(i * stride + 16, src2);                                  \
    const vector unsigned char vbT##i =                                 \
        vec_perm(vbA##i, vbB##i, perm##i);                              \
    const vector signed short vb##i =                                   \
        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
                                        (vector unsigned char)vbT##i)

    src2 += stride*3;

    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
#undef LOAD_LINE

    const vector signed short v_1 = vec_splat_s16(1);
    const vector signed short v_2 = vec_splat_s16(2);
    const vector signed short v_5 = vec_splat_s16(5);
    const vector signed short v_32 = vec_sl(v_1,
                                            (vector unsigned short)v_5);
    /* middle energy */
    const vector signed short l3minusl6 = vec_sub(vb3, vb6);
    const vector signed short l5minusl4 = vec_sub(vb5, vb4);
    const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
    const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
    const vector signed short absmE = vec_abs(mE);
    /* left & right energy */
    const vector signed short l1minusl4 = vec_sub(vb1, vb4);
    const vector signed short l3minusl2 = vec_sub(vb3, vb2);
    const vector signed short l5minusl8 = vec_sub(vb5, vb8);
    const vector signed short l7minusl6 = vec_sub(vb7, vb6);
    const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
    const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
    const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
    const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
    /* d */
    const vector signed short ddiff = vec_sub(absmE,
                                              vec_min(vec_abs(lE),
                                                      vec_abs(rE)));
    const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
    const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
    const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
    const vector signed short minusd = vec_sub((vector signed short)zero, d);
    const vector signed short finald = vec_sel(minusd,
                                               d,
                                               vec_cmpgt(vec_sub((vector signed short)zero, mE),
                                                         (vector signed short)zero));
    /* q */
    const vector signed short qtimes2 = vec_sub(vb4, vb5);
    /* for a shift right to behave like /2, we need to add one
       to all negative integers */
    const vector signed short rounddown = vec_sel((vector signed short)zero,
                                                  v_1,
                                                  vec_cmplt(qtimes2, (vector signed short)zero));
    const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
    /* clamp */
    const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
    const vector signed short dclamp_P = vec_min(dclamp_P1, q);
    const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
    const vector signed short dclamp_N = vec_max(dclamp_N1, q);

    const vector signed short dclampedfinal = vec_sel(dclamp_N,
                                                      dclamp_P,
                                                      vec_cmpgt(q, (vector signed short)zero));
    const vector signed short dornotd = vec_sel((vector signed short)zero,
                                                dclampedfinal,
                                                vec_cmplt(absmE, vqp));
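    /* dornotd is the correction actually applied: d, clamped so its
       magnitude never exceeds |q| = |l4 - l5| / 2, and forced to zero
       wherever |mE| >= 8*QP, i.e. where the discontinuity looks like
       a real edge rather than a blocking artifact. */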
    /* add/subtract to l4 and l5 */
    const vector signed short vb4minusd = vec_sub(vb4, dornotd);
    const vector signed short vb5plusd  = vec_add(vb5, dornotd);

    /* finally, stores */
    const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
    const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                                                        0x04, 0x05, 0x06, 0x07,
                                                                        0x18, 0x19, 0x1A, 0x1B,
                                                                        0x1C, 0x1D, 0x1E, 0x1F);

#define STORE(i)                                                        \
    const vector unsigned char perms##i =                               \
        vec_lvsr(i * stride, src2);                                     \
    const vector unsigned char vg##i =                                  \
        vec_perm(st##i, vbT##i, permHH);                                \
    const vector unsigned char mask##i =                                \
        vec_perm((vector unsigned char)zero,                            \
                 (vector unsigned char)neg1, perms##i);                 \
    const vector unsigned char vg2##i =                                 \
        vec_perm(vg##i, vg##i, perms##i);                               \
    const vector unsigned char svA##i =                                 \
        vec_sel(vbA##i, vg2##i, mask##i);                               \
    const vector unsigned char svB##i =                                 \
        vec_sel(vg2##i, vbB##i, mask##i);                               \
    vec_st(svA##i, i * stride, src2);                                   \
    vec_st(svB##i, i * stride + 16, src2)

    STORE(4);
    STORE(5);
}

static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
      This code makes no assumption on src or stride.
      One could remove the recomputation of the perm
      vector by assuming (stride % 16) == 0; unfortunately
      this is not always true. Quite a lot of loads/stores
      could be removed by assuming proper alignment of
      src & stride :-(
    */
    uint8_t *srcCopy = src;
    uint8_t __attribute__((aligned(16))) dt[16];
    const vector unsigned char vuint8_1 = vec_splat_u8(1);
    const vector signed int zero = vec_splat_s32(0);
    vector unsigned char v_dt;
    dt[0] = deringThreshold;
    v_dt = vec_splat(vec_ld(0, dt), 0);

#define LOAD_LINE(i)                                                    \
    const vector unsigned char perm##i =                                \
        vec_lvsl(i * stride, srcCopy);                                  \
    vector unsigned char sA##i = vec_ld(i * stride, srcCopy);           \
    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);      \
    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
    LOAD_LINE(9);
#undef LOAD_LINE

    vector unsigned char v_avg;
    {
        const vector unsigned char trunc_perm = (vector unsigned char)
            AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
        const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
        const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
        const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
        const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
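        /* EXTRACT(op) is a horizontal reduction, with op either min or
           max: each mergeh/mergel pair followed by vec_##op halves the
           number of distinct byte values, so after the final step every
           lane of v_##op holds the global minimum/maximum of the
           truncated 8x8 block. */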
#define EXTRACT(op) do {                                                \
        const vector unsigned char s##op##_1   = vec_##op(trunc_src12, trunc_src34); \
        const vector unsigned char s##op##_2   = vec_##op(trunc_src56, trunc_src78); \
        const vector unsigned char s##op##_6   = vec_##op(s##op##_1, s##op##_2); \
        const vector unsigned char s##op##_8h  = vec_mergeh(s##op##_6, s##op##_6); \
        const vector unsigned char s##op##_8l  = vec_mergel(s##op##_6, s##op##_6); \
        const vector unsigned char s##op##_9   = vec_##op(s##op##_8h, s##op##_8l); \
        const vector unsigned char s##op##_9h  = vec_mergeh(s##op##_9, s##op##_9); \
        const vector unsigned char s##op##_9l  = vec_mergel(s##op##_9, s##op##_9); \
        const vector unsigned char s##op##_10  = vec_##op(s##op##_9h, s##op##_9l); \
        const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
        const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
        const vector unsigned char s##op##_11  = vec_##op(s##op##_10h, s##op##_10l); \
        const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
        const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
        v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)

        vector unsigned char v_min;
        vector unsigned char v_max;
        EXTRACT(min);
        EXTRACT(max);
#undef EXTRACT

        if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
            return;

        v_avg = vec_avg(v_min, v_max);
    }

    signed int __attribute__((aligned(16))) S[8];
    {
        const vector unsigned short mask1 = (vector unsigned short)
            AVV(0x0001, 0x0002, 0x0004, 0x0008,
                0x0010, 0x0020, 0x0040, 0x0080);
        const vector unsigned short mask2 = (vector unsigned short)
            AVV(0x0100, 0x0200, 0x0000, 0x0000,
                0x0000, 0x0000, 0x0000, 0x0000);

        const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
        const vector unsigned int vuint32_1 = vec_splat_u32(1);

#define COMPARE(i)                                                      \
        vector signed int sum##i;                                       \
        do {                                                            \
            const vector unsigned char cmp##i =                         \
                (vector unsigned char)vec_cmpgt(src##i, v_avg);         \
            const vector unsigned short cmpHi##i =                      \