📄 yuvscaler_bicubic.c
字号:
/* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.*/// Version of the 25/05/2003// Copyright X. Biquard xbiquard@free.fr// Great speed Warning : function mjpeg_debug implies an implicit test => may slow down a lot the execution// of the program.// SIMD accelerated multiplications with csplineh not possible since value to be multiply do not stand// in int16_t, but in int32_t// Maybe possible in SSE since xmm registers of 128 bits available// IL FAUT NETTOYER LE HEADER : PAS BESOIN DE TOUTES CES VARIABLES GLOBALES A LA CON#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <string.h>#include <math.h>#include <signal.h>#include "yuv4mpeg.h"#include "mjpeg_types.h"#include "yuvscaler.h"#include "attributes.h"#include "../utils/mmx.h"extern unsigned int output_active_width;extern unsigned int output_active_height;extern unsigned int output_width;extern unsigned int input_width;extern unsigned int input_useful_width;extern unsigned int input_useful_height;extern unsigned int specific;// Defines#define FLOAT2INTEGERPOWER 11#define FLOAT2INTOFFSET 1024#define DBLEFLOAT2INT 22#define DBLEFLOAT2INTOFFSET 2097152// MMX test#ifdef HAVE_ASM_MMXextern int32_t *mmx_res;extern int mmx; // =1 for mmx activated#endif// MMX testextern int32_t *intermediate;// *************************************************************************************intcubic_scale (uint8_t * padded_input, uint8_t * output, unsigned int *in_col, unsigned int *in_line, int16_t *cspline_w, uint16_t width_neighbors, uint8_t zero_width_neighbors, int16_t *cspline_h, uint16_t height_neighbors, uint8_t zero_height_neighbors, unsigned int half){ // Warning: because cubic-spline values may be <0 or >1.0, a range test on value is mandatory unsigned int local_output_active_width = output_active_width >> half; unsigned int local_output_active_height = output_active_height >> half; unsigned int local_output_width = output_width >> half; unsigned int local_input_useful_height = input_useful_height >> half; unsigned int local_input_useful_width = input_useful_width >> half; unsigned int local_padded_height = local_input_useful_height + height_neighbors -1; unsigned int local_padded_width = local_input_useful_width + width_neighbors -1; unsigned int out_line, out_col,w,h; unsigned int *in_line_p,*in_col_p; int16_t output_offset; uint8_t *output_p,*line,*line_begin,*line_0; int16_t *cspline_wp,*cspline_hp,*cspline_hp0; int32_t value=0,value1=0,*intermediate_p,*inter_begin;// mjpeg_debug ("Start of cubic_scale "); output_p = output; output_offset = local_output_width-local_output_active_width; /* *INDENT-OFF* */ switch(specific) { case 0: { // First scale along the Width, not the height width_neighbors-=zero_width_neighbors; height_neighbors-=zero_height_neighbors; intermediate_p=intermediate; line_0 = padded_input; for (out_line = 0; out_line < local_padded_height; out_line++) { cspline_wp=cspline_w; in_col_p=in_col; for (out_col = 0; out_col < local_output_active_width; out_col++) { line = line_0 + *(in_col_p++); value1=*(line++)*(*(cspline_wp++)); for (w=1;w<width_neighbors-1;w++) value1+=*(line++)*(*(cspline_wp++)); value1+=*(line)*(*(cspline_wp++)); if (zero_width_neighbors) cspline_wp++; *(intermediate_p++)=value1; } // a line of intermediate in now finished. Make line_0 points on the next line of padded_input line_0+=local_padded_width; } // Intermediate now contains an width-scaled frame // we now scale it along the height, not the width cspline_hp0=cspline_h; in_line_p=in_line; for (out_line = 0; out_line < local_output_active_height; out_line++) { inter_begin=intermediate + *(in_line_p++) * local_output_active_width; for (out_col = 0; out_col < local_output_active_width-1; out_col++) { cspline_hp=cspline_hp0; value1 = *(inter_begin)*(*(cspline_hp++)); for (h=1;h<height_neighbors-1;h++) value1 += (*(inter_begin+h*local_output_active_width))*(*(cspline_hp++)); value1 += (*((inter_begin++)+h*local_output_active_width))*(*(cspline_hp)); if (value1 < 0) *(output_p++) = 0; else { value =(value1 + DBLEFLOAT2INTOFFSET) >> DBLEFLOAT2INT; if (value > 255) *(output_p++) = 255; else *(output_p++) = (uint8_t) value; } } // last out_col to be treated => cspline_hp0 incremented => we use the C "++" facility value1 = *(inter_begin)*(*(cspline_hp0++)); for (h=1;h<height_neighbors;h++) value1 += (*(inter_begin+h*local_output_active_width))*(*(cspline_hp0++)); if (zero_height_neighbors) cspline_hp0++; if (value1 < 0) *(output_p++) = 0; else { value =(value1 + DBLEFLOAT2INTOFFSET) >> DBLEFLOAT2INT; if (value > 255) *(output_p++) = 255; else *(output_p++) = (uint8_t) value; } // a line on output is now finished. We jump to the beginning of the next line output_p+=output_offset; } } break; case 1:/* #ifdef HAVE_ASM_MMX if (mmx==1) { // We only downscale on width, not height emms(); pxor_r2r(mm7,mm7); // only zeros in mm7 line_0=padded_input; for (out_line = 0; out_line < local_output_active_height; out_line++) { cspline_wp=cspline_w; in_col_p=in_col; for (out_col = 0; out_col < local_output_active_width; out_col++) { line = line_0 + *(in_col_p++); switch(width_neighbors) { // NB : c'est l'op閞ation movq_r2m qui coute le plus en temps, seulement pour la premi鑢e !!! case 4: movq_m2r(*cspline_wp,mm0); movq_m2r(*(line),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm0,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1=mmx_res[0]+mmx_res[1]; break; case 6: // cspline_w in mm0 and mm1 movq_m2r(*cspline_wp,mm0); movq_m2r(*(cspline_wp+4),mm1); // 4 pixels in mm6 and next 4 in mm5 movq_m2r(*(line),mm6);// __m64 mm6= _mm_unpacklo_pi16(mm6, mm7); punpcklbw_r2r(mm7,mm6); movq_m2r(*(line+4),mm5); punpcklbw_r2r(mm7,mm5); // multiply and add => these take more than one cycle and may be done in parallel pmaddwd_r2r(mm0,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res);// movntq_r2m(mm6,*(mmx_t *)mmx_res); value1=mmx_res[0]+mmx_res[1]; pmaddwd_r2r(mm1,mm5); movq_r2m(mm5,*(mmx_t *)mmx_res);// movntq_r2m(mm5,*(mmx_t *)mmx_res); value1+=mmx_res[0]; break; case 8: movq_m2r(*cspline_wp,mm0); movq_m2r(*(cspline_wp+4),mm1); movq_m2r(*(line),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm0,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+4),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm1,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; break; case 10: movq_m2r(*cspline_wp,mm0); movq_m2r(*(cspline_wp+4),mm1); movq_m2r(*(cspline_wp+8),mm2); movq_m2r(*(line),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm0,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+4),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm1,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+8),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm2,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]; break; case 12: movq_m2r(*cspline_wp,mm0); movq_m2r(*(cspline_wp+4),mm1); movq_m2r(*(cspline_wp+8),mm2); movq_m2r(*(line),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm0,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+4),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm1,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+8),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm2,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; break; case 14: movq_m2r(*cspline_wp,mm0); movq_m2r(*(cspline_wp+4),mm1); movq_m2r(*(cspline_wp+8),mm2); movq_m2r(*(cspline_wp+12),mm3); movq_m2r(*(line),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm0,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+4),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm1,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+8),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm2,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+12),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm3,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]; break; case 16: movq_m2r(*cspline_wp,mm0); movq_m2r(*(cspline_wp+4),mm1); movq_m2r(*(cspline_wp+8),mm2); movq_m2r(*(cspline_wp+12),mm3); movq_m2r(*(line),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm0,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+4),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm1,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+8),mm6); punpcklbw_r2r(mm7,mm6); pmaddwd_r2r(mm2,mm6); movq_r2m(mm6,*(mmx_t *)mmx_res); value1+=mmx_res[0]+mmx_res[1]; movq_m2r(*(line+12),mm6); punpcklbw_r2r(mm7,mm6);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -