/*
 * vf_fspp.c — excerpt (page 1 of 5, file is 2,126 lines total) captured from a
 * code-viewer page. Origin note (translated from Chinese): "from an early
 * Ingenic uCOS-era system (only the early versions were not packaged as a
 * library): MPlayer, filesystem, graphics".
 */
/* Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA*//* * This implementation is based on an algorithm described in * "Aria Nosratinia Embedded Post-Processing for * Enhancement of Compressed Images (1999)" * (http://citeseer.nj.nec.com/nosratinia99embedded.html) * Futher, with splitting (i)dct into hor/ver passes, one of them can be * performed once per block, not pixel. This allows for much better speed. 
*//* Heavily optimized version of SPP filter by Nikolaj */ #include <uclib.h>#include <uclib.h>#include <uclib.h>#include <inttypes.h>#include <math.h>#include "config.h"#include "mp_msg.h"#include "cpudetect.h"#include "libavcodec/avcodec.h"#include "libavcodec/dsputil.h"#ifdef HAVE_MALLOC_H#include <uclib.h>#endif#include "img_format.h"#include "mp_image.h"#include "vf.h"#include "libvo/fastmemcpy.h"#undef memcpy#define memcpy uc_memcpy//===========================================================================//#define BLOCKSZ 12static const short custom_threshold[64]=// values (296) can't be too high// -it causes too big quant dependence// or maybe overflow(check), which results in some flashing{ 71, 296, 295, 237, 71, 40, 38, 19, 245, 193, 185, 121, 102, 73, 53, 27, 158, 129, 141, 107, 97, 73, 50, 26, 102, 116, 109, 98, 82, 66, 45, 23, 71, 94, 95, 81, 70, 56, 38, 20, 56, 77, 74, 66, 56, 44, 30, 15, 38, 53, 50, 45, 38, 30, 21, 11, 20, 27, 26, 23, 20, 15, 11, 5};static const uint8_t __attribute__((aligned(32))) dither[8][8]={ { 0, 48, 12, 60, 3, 51, 15, 63, }, { 32, 16, 44, 28, 35, 19, 47, 31, }, { 8, 56, 4, 52, 11, 59, 7, 55, }, { 40, 24, 36, 20, 43, 27, 39, 23, }, { 2, 50, 14, 62, 1, 49, 13, 61, }, { 34, 18, 46, 30, 33, 17, 45, 29, }, { 10, 58, 6, 54, 9, 57, 5, 53, }, { 42, 26, 38, 22, 41, 25, 37, 21, },};struct vf_priv_s { //align 16 ! 
uint64_t threshold_mtx_noq[8*2]; uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions int log2_count; int temp_stride; int qp; int mpeg2; int prev_q; uint8_t *src; int16_t *temp; int bframes; char *non_b_qp;};#ifndef HAVE_MMX//This func reads from 1 slice, 1 and clears 0 & 1static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale){int y, x;#define STORE(pos) \ temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale); \ src[x + pos]=src[x + pos - 8*src_stride]=0; \ if(temp & 0x100) temp= ~(temp>>31); \ dst[x + pos]= temp; for(y=0; y<height; y++){ const uint8_t *d= dither[y]; for(x=0; x<width; x+=8){ int temp; STORE(0); STORE(1); STORE(2); STORE(3); STORE(4); STORE(5); STORE(6); STORE(7); } src+=src_stride; dst+=dst_stride; }}//This func reads from 2 slices, 0 & 2 and clears 2-ndstatic void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale){int y, x;#define STORE2(pos) \ temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale); \ src[x + pos + 16*src_stride]=0; \ if(temp & 0x100) temp= ~(temp>>31); \ dst[x + pos]= temp; for(y=0; y<height; y++){ const uint8_t *d= dither[y]; for(x=0; x<width; x+=8){ int temp; STORE2(0); STORE2(1); STORE2(2); STORE2(3); STORE2(4); STORE2(5); STORE2(6); STORE2(7); } src+=src_stride; dst+=dst_stride; }}static void mul_thrmat_c(struct vf_priv_s *p,int q){ int a; for(a=0;a<64;a++) ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C}static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);static void row_idct_c(DCTELEM* workspace, int16_t* output_adr, int output_stride, int cnt);static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);//this is rather ugly, but there is no need for function pointers#define store_slice_s store_slice_c#define store_slice2_s 
store_slice2_c#define mul_thrmat_s mul_thrmat_c#define column_fidct_s column_fidct_c#define row_idct_s row_idct_c#define row_fdct_s row_fdct_c#else /* HAVE_MMX *///This func reads from 1 slice, 1 and clears 0 & 1static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale){ const uint8_t *od=&dither[0][0]; const uint8_t *end=&dither[height][0]; width = (width+7)&~7; dst_stride-=width; //src_stride=(src_stride-width)*2; asm volatile( "mov %5, %%"REG_d" \n\t" "mov %6, %%"REG_S" \n\t" "mov %7, %%"REG_D" \n\t" "mov %1, %%"REG_a" \n\t" "movd %%"REG_d", %%mm5 \n\t" "xor $-1, %%"REG_d" \n\t" "mov %%"REG_a", %%"REG_c" \n\t" "add $7, %%"REG_d" \n\t" "neg %%"REG_a" \n\t" "sub %0, %%"REG_c" \n\t" "add %%"REG_c", %%"REG_c" \n\t" "movd %%"REG_d", %%mm2 \n\t" "mov %%"REG_c", %1 \n\t" "mov %2, %%"REG_d" \n\t" "shl $4, %%"REG_a" \n\t" "2: \n\t" "movq (%%"REG_d"), %%mm3 \n\t" "movq %%mm3, %%mm4 \n\t" "pxor %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm3 \n\t" "punpckhbw %%mm7, %%mm4 \n\t" "mov %0, %%"REG_c" \n\t" "psraw %%mm5, %%mm3 \n\t" "psraw %%mm5, %%mm4 \n\t" "1: \n\t" "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t" "movq (%%"REG_S"), %%mm0 \n\t" "movq 8(%%"REG_S"), %%mm1 \n\t" "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t" "paddw %%mm3, %%mm0 \n\t" "paddw %%mm4, %%mm1 \n\t" "movq %%mm7, (%%"REG_S") \n\t" "psraw %%mm2, %%mm0 \n\t" "psraw %%mm2, %%mm1 \n\t" "movq %%mm7, 8(%%"REG_S") \n\t" "packuswb %%mm1, %%mm0 \n\t" "add $16, %%"REG_S" \n\t" "movq %%mm0, (%%"REG_D") \n\t" "add $8, %%"REG_D" \n\t" "sub $8, %%"REG_c" \n\t" "jg 1b \n\t" "add %1, %%"REG_S" \n\t" "add $8, %%"REG_d" \n\t" "add %3, %%"REG_D" \n\t" "cmp %4, %%"REG_d" \n\t" "jl 2b \n\t" : : "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (end), "m" (log2_scale), "m" (src), "m" (dst) //input : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D ); }//This func reads from 2 slices, 0 & 2 and clears 2-ndstatic void store_slice2_mmx(uint8_t *dst, int16_t 
*src, long dst_stride, long src_stride, long width, long height, long log2_scale){ const uint8_t *od=&dither[0][0]; const uint8_t *end=&dither[height][0]; width = (width+7)&~7; dst_stride-=width; //src_stride=(src_stride-width)*2; asm volatile( "mov %5, %%"REG_d" \n\t" "mov %6, %%"REG_S" \n\t" "mov %7, %%"REG_D" \n\t" "mov %1, %%"REG_a" \n\t" "movd %%"REG_d", %%mm5 \n\t" "xor $-1, %%"REG_d" \n\t" "mov %%"REG_a", %%"REG_c" \n\t" "add $7, %%"REG_d" \n\t" "sub %0, %%"REG_c" \n\t" "add %%"REG_c", %%"REG_c" \n\t" "movd %%"REG_d", %%mm2 \n\t" "mov %%"REG_c", %1 \n\t" "mov %2, %%"REG_d" \n\t" "shl $5, %%"REG_a" \n\t" "2: \n\t" "movq (%%"REG_d"), %%mm3 \n\t" "movq %%mm3, %%mm4 \n\t" "pxor %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm3 \n\t" "punpckhbw %%mm7, %%mm4 \n\t" "mov %0, %%"REG_c" \n\t" "psraw %%mm5, %%mm3 \n\t" "psraw %%mm5, %%mm4 \n\t" "1: \n\t" "movq (%%"REG_S"), %%mm0 \n\t" "movq 8(%%"REG_S"), %%mm1 \n\t" "paddw %%mm3, %%mm0 \n\t" "paddw (%%"REG_S",%%"REG_a",), %%mm0 \n\t" "paddw %%mm4, %%mm1 \n\t" "movq 8(%%"REG_S",%%"REG_a",), %%mm6 \n\t" "movq %%mm7, (%%"REG_S",%%"REG_a",) \n\t" "psraw %%mm2, %%mm0 \n\t" "paddw %%mm6, %%mm1 \n\t" "movq %%mm7, 8(%%"REG_S",%%"REG_a",) \n\t" "psraw %%mm2, %%mm1 \n\t" "packuswb %%mm1, %%mm0 \n\t" "movq %%mm0, (%%"REG_D") \n\t" "add $16, %%"REG_S" \n\t" "add $8, %%"REG_D" \n\t" "sub $8, %%"REG_c" \n\t" "jg 1b \n\t" "add %1, %%"REG_S" \n\t" "add $8, %%"REG_d" \n\t" "add %3, %%"REG_D" \n\t" "cmp %4, %%"REG_d" \n\t" "jl 2b \n\t" : : "m" (width), "m" (src_stride), "g" (od), "m" (dst_stride), "g" (end), "m" (log2_scale), "m" (src), "m" (dst) //input : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S ); }static void mul_thrmat_mmx(struct vf_priv_s *p, int q){ uint64_t *adr=&p->threshold_mtx_noq[0]; asm volatile( "movd %0, %%mm7 \n\t" "add $8*8*2, %%"REG_D" \n\t" "movq 0*8(%%"REG_S"), %%mm0 \n\t" "punpcklwd %%mm7, %%mm7 \n\t" "movq 1*8(%%"REG_S"), %%mm1 \n\t" "punpckldq %%mm7, %%mm7 \n\t" "pmullw %%mm7, %%mm0 \n\t" "movq 2*8(%%"REG_S"), 
%%mm2 \n\t" "pmullw %%mm7, %%mm1 \n\t" "movq 3*8(%%"REG_S"), %%mm3 \n\t" "pmullw %%mm7, %%mm2 \n\t" "movq %%mm0, 0*8(%%"REG_D") \n\t" "movq 4*8(%%"REG_S"), %%mm4 \n\t" "pmullw %%mm7, %%mm3 \n\t" "movq %%mm1, 1*8(%%"REG_D") \n\t" "movq 5*8(%%"REG_S"), %%mm5 \n\t" "pmullw %%mm7, %%mm4 \n\t" "movq %%mm2, 2*8(%%"REG_D") \n\t" "movq 6*8(%%"REG_S"), %%mm6 \n\t" "pmullw %%mm7, %%mm5 \n\t" "movq %%mm3, 3*8(%%"REG_D") \n\t" "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t" "pmullw %%mm7, %%mm6 \n\t" "movq %%mm4, 4*8(%%"REG_D") \n\t" "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t" "pmullw %%mm7, %%mm0 \n\t" "movq %%mm5, 5*8(%%"REG_D") \n\t" "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t" "pmullw %%mm7, %%mm1 \n\t" "movq %%mm6, 6*8(%%"REG_D") \n\t" "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t" "pmullw %%mm7, %%mm2 \n\t" "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t" "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t" "pmullw %%mm7, %%mm3 \n\t" "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t" "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t" "pmullw %%mm7, %%mm4 \n\t" "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t" "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t" "pmullw %%mm7, %%mm5 \n\t" "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t" "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t" "pmullw %%mm7, %%mm6 \n\t" "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t" "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t" "pmullw %%mm7, %%mm0 \n\t" "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t" "pmullw %%mm7, %%mm1 \n\t" "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t" "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t" "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t" : "+g" (q), "+S" (adr), "+D" (adr) : );}static void column_fidct_mmx(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);static void row_idct_mmx(DCTELEM* workspace, int16_t* output_adr, int output_stride, int cnt);static void row_fdct_mmx(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);#define store_slice_s store_slice_mmx#define store_slice2_s store_slice2_mmx#define mul_thrmat_s mul_thrmat_mmx#define column_fidct_s column_fidct_mmx#define row_idct_s row_idct_mmx#define row_fdct_s 
row_fdct_mmx#endif // HAVE_MMXstatic void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, uint8_t *qp_store, int qp_stride, int is_luma){ int x, x0, y, es, qy, t; const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15)) const int step=6-p->log2_count; const int qps= 3 + is_luma; int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ]; DCTELEM *block= (DCTELEM *)block_align; DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ); memset(block3, 0, 4*8*BLOCKSZ); //p->src=src-src_stride*8-8;//! if (!src || !dst) return; // HACK avoid crash for Y8 colourspace for(y=0; y<height; y++){ int index= 8 + 8*stride + y*stride; fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers for(x=0; x<8; x++){
/*
 * (end of scraped page — code-viewer keyboard-shortcut help removed:
 *  copy Ctrl+C, search Ctrl+F, fullscreen F11, larger font Ctrl+=,
 *  smaller font Ctrl+-, show shortcuts ?)
 */