📄 sumsq.c
字号:
/* sumsq.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002  James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "altivec_motion.h"
#include "vectorize.h"
#include "../mjpeg_logging.h"

/* Uncomment to define AMBER_ENABLE before amber.h is included
 * (presumably this switches the AMBER_* macros on -- confirm in amber.h). */
/* #define AMBER_ENABLE */
#include "amber.h"

#ifdef HAVE_ALTIVEC_H
/* include last to ensure AltiVec type semantics, especially for bool. */
#include <altivec.h>
#endif

/*
 * Input requirements:
 *   b) blk2 is always vector aligned
 *   c) rowstride is a multiple of 16
 *   d) h is either 8 or 16
 *
 * (The labels are kept as found; the list has no item "a)".)
 *
 * The functions below rely on these: blk2 is only ever read with a single
 * vec_ld per row, the alignment permute vectors are computed once and reused
 * for every row, and the row loops are do/while loops that handle two rows
 * per pass with a counter starting at (h >> 1) - 1.
 */

/*
 * Parameter list shared by the sumsq_* functions in this file.
 *
 * Note that the last line ends in a line continuation, so the empty line
 * that follows it is spliced into the macro and must stay empty.
 */
#define SUMSQ_PDECL                                                          \
    uint8_t *blk1,                                                           \
    uint8_t *blk2,                                                           \
    int rowstride,                                                           \
    int hx,                                                                  \
    int hy,                                                                  \
    int h                                                                    \

/* The SUMSQ_PDECL parameter names as an argument list. */
#define SUMSQ_ARGS blk1, blk2, rowstride, hx, hy, h

/*
 * sumsq_00: sum of squared differences between two 16-byte-wide blocks.
 *
 * Reference implementation:
 *
 * for (j = 0; j < h; j++) {
 *     for (i = 0; i < 16; i++) {
 *         d = blk[i] - ref[i];
 *         sum += d * d;
 *     }
 * }
 *
 * Parameters:
 *   blk1      first block; may have any alignment (an unaligned pointer takes
 *             the two-loads-plus-vec_perm path below)
 *   blk2      second block; read with one vec_ld per row, so it has to be
 *             vector aligned (input requirement b)
 *   rowstride byte distance between successive rows of both blocks
 *   hx, hy    not used by this function
 *   h         number of rows; 2 * (h >> 1) rows are processed, and h must be
 *             at least 4 because the do/while loops execute once before the
 *             counter (h >> 1) - 1 is tested
 *
 * Returns the accumulated sum as a signed int.
 */
static int sumsq_00(SUMSQ_PDECL) /* {{{ */
{
    int i;
    unsigned char *pblk1, *pblk2;
    vector unsigned char blk1A, blk2A, blk1B, blk2B;
    vector unsigned char blk1A0, blk1B0;
    vector unsigned char blk1A1, blk1B1;
    vector unsigned char minA, minB;
    vector unsigned char maxA, maxB;
    vector unsigned char difA, difB;
    vector unsigned int sum;
    vector signed int zero;
    vector unsigned char perm;
    /* Gives scalar access to the last 32-bit element of a vector, which is
     * where vec_sums leaves its result. */
    union {
        vector signed int v;
        struct {
            signed int pad[3];
            signed int sum;
        } s;
    } vo;

    pblk1 = blk1;
    pblk2 = blk2;
    /* Two rows ("A" and "B") per loop pass; the last pair is handled after
     * the loop, hence the - 1. */
    i = (h >> 1) - 1;

    zero = vec_splat_s32(0);
    sum = vec_splat_u32(0);

    /* VECTOR_ALIGNED is defined outside this file (presumably a 16-byte
     * alignment test -- confirm in vectorize.h). */
    if (VECTOR_ALIGNED(pblk1)) {
        /* Aligned blk1: one load per row.  Preload rows 0 (A) and 1 (B). */
        blk1A = vec_ld(0, pblk1);
        pblk1 += rowstride;
        blk1B = vec_ld(0, pblk1);

        blk2A = vec_ld(0, pblk2);
        pblk2 += rowstride;
        blk2B = vec_ld(0, pblk2);

        do {
            /* |a - b| for unsigned bytes is max(a, b) - min(a, b).  The loads
             * for the next A row are issued between the compare and the
             * subtract of the current one. */
            maxA = vec_max(blk1A, blk2A);
            minA = vec_min(blk1A, blk2A);
            pblk1 += rowstride;
            blk1A = vec_ld(0, pblk1);
            pblk2 += rowstride;
            blk2A = vec_ld(0, pblk2);
            difA = vec_sub(maxA, minA);
            /* Multiply each byte by itself and add the products, four at a
             * time, into the four 32-bit elements of sum. */
            sum = vec_msum(difA, difA, sum);

            /* Same steps for the B row. */
            maxB = vec_max(blk1B, blk2B);
            minB = vec_min(blk1B, blk2B);
            pblk1 += rowstride;
            blk1B = vec_ld(0, pblk1);
            pblk2 += rowstride;
            blk2B = vec_ld(0, pblk2);
            difB = vec_sub(maxB, minB);
            sum = vec_msum(difB, difB, sum);
        } while (--i);
    } else {
        /* Unaligned blk1: load the two aligned vectors that straddle each
         * row and extract the 16 wanted bytes with vec_perm.  The permute
         * vector is computed once; it stays valid for every row because
         * rowstride is a multiple of 16 (input requirement c). */
        perm = vec_lvsl(0, pblk1);
        blk1A0 = vec_ld(0, pblk1);
        blk1A1 = vec_ld(16, pblk1);
        pblk1 += rowstride;
        blk1B0 = vec_ld(0, pblk1);
        blk1B1 = vec_ld(16, pblk1);

        blk2A = vec_ld(0, pblk2);
        pblk2 += rowstride;
        blk2B = vec_ld(0, pblk2);

        do {
            /* Realign the current A row, then start loading the next one. */
            blk1A = vec_perm(blk1A0, blk1A1, perm);
            pblk1 += rowstride;
            blk1A0 = vec_ld(0, pblk1);
            blk1A1 = vec_ld(16, pblk1);
            maxA = vec_max(blk1A, blk2A);
            minA = vec_min(blk1A, blk2A);
            pblk2 += rowstride;
            blk2A = vec_ld(0, pblk2);
            difA = vec_sub(maxA, minA);
            sum = vec_msum(difA, difA, sum);

            /* Same steps for the B row. */
            blk1B = vec_perm(blk1B0, blk1B1, perm);
            pblk1 += rowstride;
            blk1B0 = vec_ld(0, pblk1);
            blk1B1 = vec_ld(16, pblk1);
            maxB = vec_max(blk1B, blk2B);
            minB = vec_min(blk1B, blk2B);
            pblk2 += rowstride;
            blk2B = vec_ld(0, pblk2);
            difB = vec_sub(maxB, minB);
            sum = vec_msum(difB, difB, sum);
        } while (--i);

        /* Realign the final pair of rows for the shared tail below. */
        blk1A = vec_perm(blk1A0, blk1A1, perm);
        blk1B = vec_perm(blk1B0, blk1B1, perm);
    }

    /* Tail: the last two rows were loaded by the final loop pass but not yet
     * accumulated. */
    maxA = vec_max(blk1A, blk2A);
    minA = vec_min(blk1A, blk2A);
    difA = vec_sub(maxA, minA);
    sum = vec_msum(difA, difA, sum);

    maxB = vec_max(blk1B, blk2B);
    minB = vec_min(blk1B, blk2B);
    difB = vec_sub(maxB, minB);
    sum = vec_msum(difB, difB, sum);

    /* Add the four partial sums into the last element of vo.v.  vs32 is
     * defined outside this file (presumably a cast to vector signed int --
     * confirm in vectorize.h). */
    vo.v = vec_sums(vs32(sum), zero);

    /* NOTE(review): there is no AMBER_START in this function to pair with
     * this AMBER_STOP -- confirm against amber.h whether one is missing. */
    AMBER_STOP;
    return vo.s.sum;
} /* }}} */

/*
 * sumsq_10: sum of squared differences between blk2 and blk1 interpolated
 * half-way between horizontally adjacent bytes.
 *
 * Reference implementation:
 *
 * s = rowstride
 * for (j = 0; j < h; j++) {
 *     for (i = 0; i < 16; i++) {
 *         d = ((int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
 *         sum += d * d;
 *     }
 *     p1 += s;
 *     p2 += s;
 * }
 *
 * Parameters:
 *   blk1      block to interpolate (p1 above); any alignment is handled, as
 *             every row is fetched with two vec_ld calls and realigned with
 *             vec_perm.  Bytes 0..16 of each row are used.
 *   blk2      block to compare against (p2 above); read with one vec_ld per
 *             row, so it has to be vector aligned (input requirement b)
 *   rowstride byte distance between successive rows of both blocks
 *   hx, hy    not used by this function
 *   h         number of rows; 2 * (h >> 1) rows are processed, and h must be
 *             at least 4 because the do/while loop executes once before the
 *             counter (h >> 1) - 1 is tested
 *
 * Returns the accumulated sum as a signed int.
 */
static int sumsq_10(SUMSQ_PDECL) /* {{{ */
{
    int i;
    unsigned char *pB, *pR;
    vector unsigned char l0, l1, l2, l3, lR, lB0, lB1, perm0, perm1;
    vector unsigned short b0H, b0L, b1H, b1L;
    vector unsigned short bH, bL;
    vector unsigned char max, min, dif;
    vector unsigned int sum;
    vector unsigned char zero;
    vector unsigned short one;
    /* Gives scalar access to the last 32-bit element of a vector, which is
     * where vec_sums leaves its result. */
    union {
        vector signed int v;
        struct {
            signed int pad[3];
            signed int sum;
        } s;
    } vo;

/*
 * ISAD: accumulate one row.
 *
 * Reads the locals b0H/b0L (bytes i of the row, widened to 16 bits),
 * b1H/b1L (bytes i+1, widened), lR (the blk2 row) and the constant one;
 * adds the squared differences to sum; overwrites bH, bL, min, max, dif.
 *
 * The 16-bit sums are at most 255 + 255 + 1 = 511, so the arithmetic shift
 * vec_sra never sees a set sign bit, and the halved values are at most 255,
 * so vec_packsu never has to saturate.
 *
 * vu8 is defined outside this file (presumably a cast to vector unsigned
 * char that is also usable as an assignment target -- confirm in
 * vectorize.h).
 */
#define ISAD() /* {{{ */                                                     \
    /* pB[i] + pB[i+1] */                                                    \
    bH = vec_add(b0H, b1H);                                                  \
    bL = vec_add(b0L, b1L);                                                  \
                                                                             \
    /* (pB[i]+pB[i+1]) + 1 */                                                \
    bH = vec_add(bH, one);                                                   \
    bL = vec_add(bL, one);                                                   \
                                                                             \
    /* (pB[i]+pB[i+1]+1) >> 1 */                                             \
    bH = vec_sra(bH, one);                                                   \
    bL = vec_sra(bL, one);                                                   \
                                                                             \
    /* d = abs( ((pB[i]+pB[i+1]+1)>>1) - pR[i] ) */                          \
    vu8(bH) = vec_packsu(bH, bL);                                            \
    min = vec_min(vu8(bH), lR);                                              \
    max = vec_max(vu8(bH), lR);                                              \
    dif = vec_sub(max, min);                                                 \
                                                                             \
    /* sum += d * d */                                                       \
    sum = vec_msum(dif, dif, sum);                                           \
    /* }}} */

    pB = blk1, pR = blk2;

    /* Preload the two aligned vectors covering row 0 (l0, l1) ... */
    l0 = vec_ld(0, pB);
    l1 = vec_ld(16, pB);

    /* ... and row 1 (l2, l3). */
    pB += rowstride;
    l2 = vec_ld(0, pB);
    l3 = vec_ld(16, pB);

    lR = vec_ld(0, pR);

    /* initialize constants */
    zero = vec_splat_u8(0);
    one = vec_splat_u16(1);

    sum = vec_splat_u32(0);

    /* perm0 selects bytes 0..15 of a row and perm1 = perm0 + 1 selects bytes
     * 1..16.  pB already points at row 1 here; its alignment equals that of
     * blk1 because rowstride is a multiple of 16 (input requirement c). */
    perm0 = vec_lvsl(0, pB);
    perm1 = vec_splat_u8(1);
    perm1 = vec_add(perm0, perm1);

    /* Two rows per loop pass; the last pair is handled after the loop,
     * hence the - 1. */
    i = (h >> 1) - 1;
    do { /* while (--i) */

        /* Even row: realign from l0/l1, then reload l0/l1 with the row two
         * further down. */
        lB0 = vec_perm(l0, l1, perm0);
        lB1 = vec_perm(l0, l1, perm1);

        pB += rowstride;
        l0 = vec_ld(0, pB);
        l1 = vec_ld(16, pB);

        /* Merging with zero widens each byte to an unsigned short. */
        /* (unsigned short[]) pB[0-7] */
        vu8(b0H) = vec_mergeh(zero, lB0);

        /* (unsigned short[]) pB[8-15] */
        vu8(b0L) = vec_mergel(zero, lB0);

        /* (unsigned short[]) pB[1-8] */
        vu8(b1H) = vec_mergeh(zero, lB1);

        /* (unsigned short[]) pB[9-16] */
        vu8(b1L) = vec_mergel(zero, lB1);

        ISAD();

        /* Fetch the blk2 row for the next ISAD. */
        pR += rowstride;
        lR = vec_ld(0, pR);

        /* Odd row: same steps using l2/l3. */
        lB0 = vec_perm(l2, l3, perm0);
        lB1 = vec_perm(l2, l3, perm1);

        pB += rowstride;
        l2 = vec_ld(0, pB);
        l3 = vec_ld(16, pB);

        /* (unsigned short[]) pB[0-7] */
        vu8(b0H) = vec_mergeh(zero, lB0);

        /* (unsigned short[]) pB[8-15] */
        vu8(b0L) = vec_mergel(zero, lB0);

        /* (unsigned short[]) pB[1-8] */
        vu8(b1H) = vec_mergeh(zero, lB1);

        /* (unsigned short[]) pB[9-16] */
        vu8(b1L) = vec_mergel(zero, lB1);

        ISAD();

        pR += rowstride;
        lR = vec_ld(0, pR);

    } while (--i);

    /* Tail: the last two rows are already in l0..l3, so no further blk1
     * loads are issued. */
    lB0 = vec_perm(l0, l1, perm0);
    lB1 = vec_perm(l0, l1, perm1);

    /* (unsigned short[]) pB[0-7] */
    vu8(b0H) = vec_mergeh(zero, lB0);

    /* (unsigned short[]) pB[8-15] */
    vu8(b0L) = vec_mergel(zero, lB0);

    /* (unsigned short[]) pB[1-8] */
    vu8(b1H) = vec_mergeh(zero, lB1);

    /* (unsigned short[]) pB[9-16] */
    vu8(b1L) = vec_mergel(zero, lB1);

    ISAD();

    pR += rowstride;
    lR = vec_ld(0, pR);

    lB0 = vec_perm(l2, l3, perm0);
    lB1 = vec_perm(l2, l3, perm1);

    /* (unsigned short[]) pB[0-7] */
    vu8(b0H) = vec_mergeh(zero, lB0);

    /* (unsigned short[]) pB[8-15] */
    vu8(b0L) = vec_mergel(zero, lB0);

    /* (unsigned short[]) pB[1-8] */
    vu8(b1H) = vec_mergeh(zero, lB1);

    /* (unsigned short[]) pB[9-16] */
    vu8(b1L) = vec_mergel(zero, lB1);

    ISAD();

    /* Add the four partial sums into the last element of vo.v.  vs32 is
     * defined outside this file (presumably a cast to vector signed int --
     * confirm in vectorize.h). */
    vo.v = vec_sums(vs32(sum), vs32(zero));
    return vo.s.sum;

#undef ISAD
} /* }}} */

/* * s = rowstride * for (j = 0; j < h; j++) { * for (i = 0; i 
< 16; i++) { * d = ((int)(p1[i]+p1[i+s]+1)>>1) - p2[i]; * sum += d * d; * } * p1 += s; * p2 += s; * } */static int sumsq_01(SUMSQ_PDECL) /* {{{ */{ int i; unsigned char *pB, *pR; vector unsigned char l0, l1, lR, lB0, lB1, perm; vector unsigned short b0H, b0L, b1H, b1L; vector unsigned short bH, bL; vector unsigned char max, min, dif; vector unsigned int sum; vector unsigned char zero; vector unsigned short one; union { vector signed int v; struct { signed int pad[3]; signed int sum; } s; } vo;#define ISAD() /* {{{ */ \ /* pB[i] + pB[i+s] */ \ bH = vec_add(b0H, b1H); \ bL = vec_add(b0L, b1L); \ \ /* (pB[i]+pB[i+s]) + 1 */ \ bH = vec_add(bH, one); \ bL = vec_add(bL, one); \ \ /* (pB[i]+pB[i+s]+1) >> 1 */ \ bH = vec_sra(bH, one); \ bL = vec_sra(bL, one); \ \ /* d = abs( ((pB[i]+pB[i+s]+1)>>1) - pR[i] ) */ \ vu8(bH) = vec_packsu(bH, bL); \ min = vec_min(vu8(bH), lR); \ max = vec_max(vu8(bH), lR); \ dif = vec_sub(max, min); \ \ /* sum += d * d */ \ sum = vec_msum(dif, dif, sum); \ /* }}} */ pB = blk1, pR = blk2;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -