build_sub22_mests.c

来自「Motion JPEG编解码器源代码」· C语言代码 · 共 702 行 · 第 1/2 页
702 行
/* build_sub22_mests.c, this file is part of the * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder * Copyright (C) 2002  James Klicman <james@klicman.org> * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */#ifdef HAVE_CONFIG_H#include <config.h>#endif#include "altivec_motion.h"#include "vectorize.h"#include <math.h>#include "../mjpeg_logging.h"/* #define AMBER_ENABLE */#include "amber.h"#ifdef HAVE_ALTIVEC_H/* include last to ensure AltiVec type semantics, especially for bool. */#include <altivec.h>#endif#define USE_SMR_PPC#ifdef USE_SMR_PPCextern int sub_mean_reduction_ppc(int len, me_result_set *set, int reduction);#endif/* * Get SAD for 2*2 subsampled macroblocks: *  (0,0) (+2,0) (0,+2) (+2,+2) pixel-space coordinates *  (0,0) (+1,0) (0,+1) (+1,+1) 2*2 subsampled coordinates * *   blk         (blk) *   blk(+2,  0) (blk += 1) *   blk( 0, +2) (blk += rowstride-1) *   blk(+2, +2) (blk += 1) * * Iterate through all rows 2 at a time, calculating all 4 sads as we go. 
* * Hints regarding input: *   a) blk may be vector aligned, mostly not aligned *   b) ref is about 50% vector aligned and 50% 8 byte aligned *   c) rowstride is always a multiple of 16 *   d) h == 4 or 8 * * NOTES: Since ref is always 8 byte aligned and we are only interested in *        the first 8 bytes, the data can always be retrieved with one vec_ld. *        This "one vec_ld" optimization is also attempted for blk. * *        The permutation vectors only need to be calculated once since *        rowstride is always a multiple of 16. */#define BUILD_SUB22_MESTS_PDECL /* {{{ */                                    \  me_result_set *sub44set,                                                   \  me_result_set *sub22set,                                                   \  int i0,  int j0, int ihigh, int jhigh,                                     \  int null_ctl_sad,                                                          \  uint8_t *s22org,  uint8_t *s22blk,                                         \  int rowstride, int h,                                                      \  int reduction                                                              \  /* }}} */#define BUILD_SUB22_MESTS_ARGS /* {{{ */                                     \  sub44set, sub22set,                                                        \  i0,  j0, ihigh, jhigh,                                                     \  null_ctl_sad,                                                              \  s22org,  s22blk,                                                           \  rowstride, h,                                                              \  reduction                                                                  \  /* }}} *//* int build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL) {{{ */#if defined(ALTIVEC_VERIFY) && ALTIVEC_TEST_FUNCTION(build_sub22_mests)#define VERIFY_BUILD_SUB22_MESTSstatic void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride, int h,			int *sads, int 
count);static int _build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL, int verify);int build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL){  return _build_sub22_mests_altivec(BUILD_SUB22_MESTS_ARGS, 0 /* no verify */);}static int _build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL, int verify)#elseint build_sub22_mests_altivec(BUILD_SUB22_MESTS_PDECL)#endif /* }}} */{    int i, ih;    int x, y;    uint8_t *s22orgblk;    int len;    me_result_s *sub44mests;    me_result_s *cres;    me_result_s mres;    /* */    vector unsigned int  zero;    vector unsigned char lvsl;    vector unsigned char perm2;    vector unsigned char align8x2;    vector unsigned int  sads;    vector signed char   xy22,			 xylim;    vector unsigned char xint,			 yint;    vector unsigned int  vthreshold;    unsigned int         threshold;    int stride1, stride2, stride1_16, stride2_16;    union {	vector unsigned char _align16;	struct {	    me_result_s xylim;	    unsigned int threshold;	} init;	me_result_s xy;	me_result_s mests[4];    } vio;#ifdef ALTIVEC_DST    DataStreamControl dsc;#endif#ifndef USE_SMR_PPC    int min_weight;#endif#ifdef ALTIVEC_VERIFY /* {{{ */  if (((unsigned long)s22blk & 0x7) != 0)   mjpeg_error_exit1("build_sub22_mests: s22blk %% 8 != 0, (0x%X)", s22blk);  if (NOT_VECTOR_ALIGNED(rowstride))    mjpeg_error_exit1("build_sub22_mests: rowstride %% 16 != 0, (%d)",		rowstride);  if (h != 4 && h != 8)    mjpeg_error_exit1("build_sub22_mests: h != [4|8], (%d)", h);#if 0  if (NOT_VECTOR_ALIGNED(cres))    mjpeg_warn("build_sub22_mests: cres %% 16 != 0, (0x%X)",cres);#endif#endif /* }}} */    AMBER_START;    len = sub44set->len;    if (len < 1) {	    /* sub44set->len is sometimes zero. we can */	sub22set->len = 0;  /* save a lot of effort if we stop short.  
*/	return 0;    }#ifdef ALTIVEC_DST    dsc.control = DATA_STREAM_CONTROL(1,0,0);    dsc.block.count = h;    dsc.block.stride = rowstride;    vec_dst(s22blk, dsc.control, 0);    /* increase size to 2 and increment count */    dsc.control += DATA_STREAM_CONTROL(1,1,0);#endif    sub44mests = sub44set->mests;    cres = sub22set->mests;    cres--; /* decrement cres so all stores can be done with stwu */    /* execute instructions that are not dependent on pack_bits */    zero  = vec_splat_u32(0); /* initialize to zero */    /* lvsl = 0x(00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F) {{{ */    lvsl = vec_lvsl(0, (unsigned char*) 0);    /* }}} */    /* 8*8 or 8*4 calculated in 8*2 chunks */    /* align8x2 = 0x(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) {{{ */    align8x2 = vec_sld(lvsl, lvsl, 8);    perm2    = vec_lvsr(0, (unsigned char*)0);    align8x2 = vec_sld(align8x2, perm2, 8);    /* }}} */    mres.weight = 0;	    /* weight must be zero */    mres.x = ihigh - i0;    /* x <= (ihigh - i0) */    mres.y = jhigh - j0;    /* y <= (jhigh - j0) */    vio.init.xylim = mres;    threshold = 6 * null_ctl_sad / (reduction << 2);    vio.init.threshold = threshold;    xy22 = (vector signed char)VCONST(0,0,0,0, 0,0,2,0, 0,0,0,2, 0,0,2,2);    vu32(xint) = vec_splat_u32(0xf);    xint = vec_add(xint, lvsl);    vu32(yint) = vec_splat_u32(1);    yint = vec_add(yint, xint);    perm2 = vec_lvsl(0, s22blk);    perm2 = vec_splat(perm2, 0);    perm2 = vec_add(perm2, align8x2);    stride1 = rowstride;    stride2 = rowstride + rowstride;    stride1_16 = stride1 + 16;    stride2_16 = stride2 + 16;    ih = (h >> 1) - 1;    vthreshold = vec_ld(0, (unsigned int*) &vio.init);    vu32(xylim) = vec_splat(vu32(vthreshold), 0);      /* vio.init.xylim */    vu32(vthreshold) = vec_splat(vu32(vthreshold), 1); /* vio.init.threshold */    do { /* while (--len) */	mres = *sub44mests;	x = mres.x;	y = mres.y;	s22orgblk = s22org + ((y+j0)>>1)*rowstride + ((x+i0)>>1);#ifdef ALTIVEC_DST	vec_dst(s22orgblk, 
dsc.control, 1);#endif	mres.weight = 0; /* weight must be zero */	vio.xy = mres;	sub44mests++;	/* calculate SADs for 2*2 subsampled macroblocks: {{{ */	{	    vector unsigned int  sad20, sad02, sad22;	    vector unsigned char max, min, dif;	    vector unsigned char perm1;	    vector unsigned char align8x2_0, align8x2_2;	    vector unsigned char ld0, ld1, ld3;	    vector unsigned char v8x1a, v8x1b;	    vector unsigned char vblk8x2;	    vector unsigned char vref8x2;	    uint8_t *pblk, *pref;	    sads = zero;	    sad20 = zero;	    sad02 = zero;	    sad22 = zero;	    pblk = s22orgblk;	    pref = s22blk;	    perm1 = vec_lvsl(0, pblk); /* initialize permute vector */	    if (((unsigned long)pblk & 0xf) < 8) {		/* {{{ */		v8x1a = vec_ld(0, pblk);		/* pblk += rowstride; */		v8x1b = vec_ld(stride1, pblk);		vref8x2 = vec_ld(0, pref);		/* pref += rowstride; */		ld3 = vec_ld(stride1, pref);		align8x2_0 = vec_splat(perm1, 0);		align8x2_0 = vec_add(align8x2_0, align8x2);		align8x2_2 = vec_splat(perm1, 1);		align8x2_2 = vec_add(align8x2_2, align8x2);		vref8x2 = vec_perm(vref8x2, ld3, perm2);		i = ih;		do { /* while (--i) */		    /* load next row */		    /* pblk += rowstride; */		    pblk += stride2;		    ld0 = vec_ld(0, pblk);		    /* calculate (0,0) */		    vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0);		    max = vec_max(vblk8x2, vref8x2);		    min = vec_min(vblk8x2, vref8x2);		    dif = vec_sub(max, min);		    sads = vec_sum4s(dif, sads);		    /* calculate (2,0) */		    vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2);		    max = vec_max(vblk8x2, vref8x2);		    min = vec_min(vblk8x2, vref8x2);		    dif = vec_sub(max, min);		    sad20 = vec_sum4s(dif, sad20);		    /* load into v8x1a, then v8x1b will be the top row */		    v8x1a = vec_sld(ld0, ld0, 0); /* v8x1a = ld0; */		    /* load next row */		    /* pblk += rowstride; */		    ld0 = vec_ld(stride1, pblk);		    /* calculate (0,2) */		    vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_0);		    max = vec_max(vblk8x2, vref8x2);		    min = 
vec_min(vblk8x2, vref8x2);		    dif = vec_sub(max, min);		    sad02 = vec_sum4s(dif, sad02);		    /* calculate (2,2) */		    vblk8x2 = vec_perm(v8x1b, v8x1a, align8x2_2);		    max = vec_max(vblk8x2, vref8x2);		    min = vec_min(vblk8x2, vref8x2);		    /* pref += rowstride; */		    pref += stride2;		    vref8x2 = vec_ld(0, pref);		    /* pref += rowstride; */		    ld3 = vec_ld(stride1, pref);		    dif = vec_sub(max, min);		    sad22 = vec_sum4s(dif, sad22);		    v8x1b = vec_sld(ld0, ld0, 0); /* v8x1b = ld0; */		    vref8x2 = vec_perm(vref8x2, ld3, perm2);		} while (--i);		/* load next row */		/* pblk += rowstride; */		pblk += stride2;		ld0 = vec_ld(0, pblk);		/* calculate (0,0) */		vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_0);		max = vec_max(vblk8x2, vref8x2);		min = vec_min(vblk8x2, vref8x2);		dif = vec_sub(max, min);		sads = vec_sum4s(dif, sads);		/* calculate (2,0) */		vblk8x2 = vec_perm(v8x1a, v8x1b, align8x2_2);		/* load into v8x1a, then v8x1b will be the top row */		v8x1a = vec_sld(ld0, ld0, 0); /* v8x1a = ld0; */		/* }}} */	    } else {		/* {{{ */		v8x1a = vec_ld(0, pblk);		ld0 = vec_ld(16, pblk);
build_sub22_mests.c - 源码说明

本页面展示了「Motion JPEG编解码器源代码」中的 build_sub22_mests.c 源码文件，采用 C 语言编写，共 702 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Motion相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?