📄 build_sub22_mests.c
字号:
/* pblk += rowstride; */ v8x1b = vec_ld(stride1, pblk); ld1 = vec_ld(stride1_16, pblk); /* align8x2_0 = align8x2 */ align8x2_0 = vec_sld(align8x2, align8x2, 0); align8x2_2 = vec_splat_u8(1); align8x2_2 = vec_add(align8x2, align8x2_2 /* (1) */ ); vref8x2 = vec_ld(0, pref); /* pref += rowstride; */ ld3 = vec_ld(stride1, pref); v8x1a = vec_perm(v8x1a, ld0, perm1); v8x1b = vec_perm(v8x1b, ld1, perm1); vref8x2 = vec_perm(vref8x2, ld3, perm2); i = ih; do { /* while (--i) */ /* load next row */ /* pblk += rowstride; */ pblk += stride2; ld0 = vec_ld(0, pblk); ld1 = vec_ld(16, pblk); /* calculate (0,0) */ vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0); max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); dif = vec_sub(max, min); sads = vec_sum4s(dif, sads); /* calculate (2,0) */ vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2); max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); dif = vec_sub(max, min); sad20 = vec_sum4s(dif, sad20); /* load into v8x1a, then v8x1b will be the top row */ v8x1a = vec_perm(ld0, ld1, perm1); /* load next row */ /* pblk += rowstride; */ ld0 = vec_ld(stride1, pblk); ld1 = vec_ld(stride1_16, pblk); /* calculate (0,2) */ vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_0); max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); dif = vec_sub(max, min); sad02 = vec_sum4s(dif, sad02); /* calculate (2,2) */ vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_2); max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); /* pref += rowstride; */ pref += stride2; vref8x2 = vec_ld(0, pref); /* pref += rowstride; */ ld3 = vec_ld(stride1, pref); dif = vec_sub(max, min); sad22 = vec_sum4s(dif, sad22); v8x1b = vec_perm(ld0, ld1, perm1); vref8x2 = vec_perm(vref8x2, ld3, perm2); } while (--i); /* load next row */ /* pblk += rowstride; */ pblk += stride2; ld0 = vec_ld(0, pblk); ld1 = vec_ld(16, pblk); /* calculate (0,0) */ vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0); max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); dif = vec_sub(max, min); sads = vec_sum4s(dif, sads); /* calculate (2,0) */ vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2); /* load into v8x1a, then v8x1b will be the top row */ v8x1a = vec_perm(ld0, ld1, perm1); /* }}} */ } /* calculate (2,0) */ max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); dif = vec_sub(max, min); sad20 = vec_sum4s(dif, sad20); /* calculate (0,2) */ vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_0); max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); dif = vec_sub(max, min); sad02 = vec_sum4s(dif, sad02); /* calculate (2,2) */ vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_2); max = vec_max(vblk8x2, vref8x2); min = vec_min(vblk8x2, vref8x2); dif = vec_sub(max, min); sad22 = vec_sum4s(dif, sad22); /* calculate final sums {{{ */ vs32(sads) = vec_sums(vs32(sads), vs32(zero)); vs32(sad20) = vec_sums(vs32(sad20), vs32(zero)); vs32(sad02) = vec_sums(vs32(sad02), vs32(zero)); vs32(sad22) = vec_sums(vs32(sad22), vs32(zero)); /* }}} */ /* sads = {sads, sad20, sad02, sad22} {{{ */ vu32(sads) = vec_mergel(vu32(sads), vu32(sad02)); vu32(sad20) = vec_mergel(vu32(sad20), vu32(sad22)); vu32(sads) = vec_mergel(vu32(sads), vu32(sad20)); /* }}} */ } /* }}} */#ifdef VERIFY_BUILD_SUB22_MESTS /* {{{ */ if (verify) verify_sads(s22orgblk, s22blk, rowstride, h, (int*)&sads, 4);#endif /* }}} */ /* add penalty, clip xy, arrange into me_result_s ... {{{ */ { vector signed char xy; xy = vec_ld(0, (signed char*) &vio.xy); vu32(xy) = vec_splat(vu32(xy), 0); /* splat vio.xy */ xy = vec_add(xy, xy22); /* adjust xy values for elements 1-3 */ /* add distance penalty {{{ */ /* penalty = (max(abs(x),abs(y))<<3) */ { vector signed char xyabs; vector unsigned int xxxx, yyyy; vector unsigned int xymax, penalty; /* (abs(x),abs(y)) */ xyabs = vec_subs(vs8(zero), xy); xyabs = vec_max(xyabs, xy); /* xxxx = (x, x, x, x), yyyy = (y, y, y, y) * (0,0,x,y, 0,0,x,y, 0,0,x,y, 0,0,x,y) |/- permute vector -\| * (0,0,0,x, 0,0,0,x, 0,0,0,x, 0,0,0,x) |lvsl+(0x0000000F,...)| * (0,0,0,y, 0,0,0,y, 0,0,0,y, 0,0,0,y) |lvsl+(0x00000010,...)| */ vs8(xxxx) = vec_perm(vs8(zero), xyabs, xint); vs8(yyyy) = vec_perm(vs8(zero), xyabs, yint); /* penalty = max(abs(x),abs(y)) << 3 */ xymax = vec_max(xxxx, yyyy); penalty = vec_splat_u32(3); penalty = vec_sl(xymax, penalty /* (3,...) */ ); sads = vec_add(sads, penalty); } /* }}} */ /* mask sads x <= (ihigh - i0) && y <= (jhigh - j0) {{{ */ /* the first cmpgt (s8) will flag any x and/or y coordinates... {{{ * as out of bounds. the second cmpgt (u32) will complete the * mask if the x or y flag for that result is set. * * Example: {{{ * X Y X Y X Y X Y * [0 0 < <] [0 0 < <] [0 0 > <] [0 0 < >] * vb8(xymask) = vec_cmpgt(vu8(xy), xylim) * [0 0 0 0] [0 0 0 0] [0 0 1 0] [0 0 0 1] * vb32(xymask) = vec_cmpgt(vu32(xymask), vu32(zero)) * [0 0 0 0] [0 0 0 0] [1 1 1 1] [1 1 1 1] * * Legend: 0=0x00 (<)=(xy[n] <= xymax[n]) * 1=0xff (>)=(xy[n] > xymax[n]) * }}} */ /* }}} */ { vector bool int xymask; vb8(xymask) = vec_cmpgt(xy, xylim); xymask = vec_cmpgt(vu32(xymask), zero); /* add (saturated) xymask to sads thereby forcing * masked values above the threshold. */ sads = vec_adds(sads, vu32(xymask)); } /* }}} */ /* arrange sad and xy into me_result_s form and store {{{ */ { vector unsigned int mests; /* mests = ( sad, xy, sad, xy, sad, xy, sad, xy ) {{{ * * ( 0, sad, 0, sad, 0, sad, 0, sad ) * ( sad, sad, sad, sad, sad, sad, sad, sad ) * * ( 0, xy, 0, xy, 0, xy, 0, xy ) * ( xy, xy, xy, xy, xy, xy, xy, xy ) * * ( sad, xy, sad, xy, sad, xy, sad, xy ) */ /* }}} */ vu16(xy) = vec_pack(vu32(xy), vu32(xy)); vu16(mests) = vec_pack(vu32(sads), vu32(sads)); vu16(mests) = vec_mergeh(vu16(mests), vu16(xy)); vec_st(mests, 0, (unsigned int*)&vio.mests); } /* }}} */ } /* }}} */ if (vec_any_lt(sads, vthreshold)) { me_result_s m0, m1, m2, m3; unsigned int w0, w1, w2, w3; m0 = vio.mests[0]; m1 = vio.mests[1]; m2 = vio.mests[2]; m3 = vio.mests[3]; w0 = m0.weight; w1 = m1.weight; w2 = m2.weight; w3 = m3.weight; if (w0 < threshold) *(++cres) = m0; if (w1 < threshold) *(++cres) = m1; if (w2 < threshold) *(++cres) = m2; if (w3 < threshold) *(++cres) = m3; } } while (--len); cres++; /* increment to account for earlier decrement */ len = cres - sub22set->mests; sub22set->len = len; AMBER_STOP;#ifdef USE_SMR_PPC if ((len | reduction) > 0) len = sub_mean_reduction_ppc(len, sub22set, reduction); return len;#else#if ALTIVEC_TEST_FUNCTION(sub_mean_reduction) ALTIVEC_TEST_SUFFIX(sub_mean_reduction)(sub22set, reduction, &min_weight);#else ALTIVEC_SUFFIX(sub_mean_reduction)(sub22set, reduction, &min_weight);#endif return sub22set->len;#endif}#if ALTIVEC_TEST_FUNCTION(build_sub22_mests) /* {{{ */#define BUILD_SUB22_MESTS_PFMT \ "sub44set=0x%X, sub22set=0x%X, i0=%d, j0=%d, ihigh=%d, jhigh=%d, " \ "null_ctl_sad=%d, s22org=0x%X, s22blk=0x%X, rowstride=%d, h=%d, " \ "reduction=%d"# ifdef ALTIVEC_VERIFYint build_sub22_mests_altivec_verify(BUILD_SUB22_MESTS_PDECL){ int i, len1, len2; unsigned long checksum1, checksum2; len1 = _build_sub22_mests_altivec(BUILD_SUB22_MESTS_ARGS, 1 /*verify*/); for (checksum1 = i = 0; i < len1; i++) { checksum1 += sub22set->mests[i].weight; checksum1 += abs(sub22set->mests[i].x); checksum1 += abs(sub22set->mests[i].y); } len2 = ALTIVEC_TEST_WITH(build_sub22_mests)(BUILD_SUB22_MESTS_ARGS); for (checksum2 = i = 0; i < len2; i++) { checksum2 += sub22set->mests[i].weight; checksum2 += abs(sub22set->mests[i].x); checksum2 += abs(sub22set->mests[i].y); } if (len1 != len2 || checksum1 != checksum2) { mjpeg_debug("build_sub22_mests(" BUILD_SUB22_MESTS_PFMT ")", BUILD_SUB22_MESTS_ARGS); mjpeg_debug("build_sub22_mests: sub44set->len=%d", sub44set->len); mjpeg_debug("build_sub22_mests: checksums differ %d[%d] != %d[%d]", checksum1, len1, checksum2, len2); }#if 0 else { mjpeg_info("build_sub22_mests(" BUILD_SUB22_MESTS_PFMT ")", BUILD_SUB22_MESTS_ARGS); mjpeg_info("build_sub22_mests: sub44set->len=%d", sub44set->len); mjpeg_info("build_sub22_mests: checksum %d[%d]", checksum1, len1); }#endif return len2;}static void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride, int h, int *sads, int count){ int i, s, s2; uint8_t *pblk; pblk = blk1; for (i = 0; i < count; i++) { s2 = sads[i]; /* s = sad_sub22(pblk, blk2, stride, h); {{{ */#if ALTIVEC_TEST_FUNCTION(sad_sub22) s = ALTIVEC_TEST_WITH(sad_sub22)(pblk, blk2, stride, h);#else s = sad_sub22(pblk, blk2, stride, h);#endif /* }}} */ if (s2 != s) { mjpeg_debug("build_sub22_mests: sads[%d]=%d != %d" "=sad_sub22(blk1=0x%X(0x%X), blk2=0x%X, " "stride=%d, h=%d)", i, s2, s, pblk, blk1, blk2, stride, h); } if (i == 1) pblk += stride - 1; else pblk += 1; }}# else#undef BENCHMARK_EPILOG#define BENCHMARK_EPILOG \ mjpeg_info("build_sub22_mests: sub44set->len=%d", sub44set->len); \ mjpeg_info("build_sub22_mests: sub22set->len=%d", sub22set->len);ALTIVEC_TEST(build_sub22_mests, int, (BUILD_SUB22_MESTS_PDECL), BUILD_SUB22_MESTS_PFMT, BUILD_SUB22_MESTS_ARGS);# endif#endif /* }}} *//* vim:set foldmethod=marker foldlevel=0: */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -