📄 jdsample.c
字号:
"paddb %%mm5,%%mm2 \n\t" // add in byte "punpckhbw %%mm6,%%mm1 \n\t" // unpack "punpckhbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data "paddw bias1w,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result from multiply to "next" data "paddw bias2w,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave "paddb %%mm1,%%mm2 \n\t" // do interleave "movq %%mm2,8(%%edi) \n\t" // write out results "addl $16,%%edi \n\t" // increment output buffer pointer "addl $8,%%esi \n\t" // increment input buffer pointer "subl $8,%%ecx \n\t" // increment column counter "cmpl $8,%%ecx \n\t" // cmp with 8 "jg col_loop_a \n\t" // if > 8 goto main loop "last_col: \n\t" // Special last column case - process low 8 bytes of mm7 "movq %%mm7,%%mm0 \n\t" // copy input data "movq %%mm7,%%mm1 \n\t" // copy input data "movq %%mm7,%%mm2 \n\t" // copy input data "punpcklbw %%mm6,%%mm0 \n\t" // unpack lo data "pmullw mul3w,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] "psllq $8,%%mm1 \n\t" // shift left to get previous byte "movq %%mm3,%%mm5 \n\t" // retrieve copy of "previous" state "psrlq $56,%%mm5 \n\t" // shift left for MSB "paddb %%mm5,%%mm1 \n\t" // add in byte "psrlq $8,%%mm2 \n\t" // shift rt for "next" state "punpcklbw %%mm6,%%mm1 \n\t" // unpack "punpcklbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data "paddw bias1w,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result from multiply to "next" data "paddw bias2w,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave "paddb %%mm1,%%mm2 \n\t" // do interleave "movq %%mm2,(%%edi) \n\t" // write out results // Special last column case - process hi 8 bytes of mm7 "movq %%mm7,%%mm0 \n\t" // copy input data "movq %%mm7,%%mm1 \n\t" // copy input data "movq %%mm7,%%mm2 \n\t" // copy input data "punpckhbw %%mm6,%%mm0 \n\t" // unpack hi data "pmullw mul3w,%%mm0 \n\t" // multiply by 3; i[0][1][2][3] "psllq $8,%%mm1 \n\t" // shift left to get previous byte "psrlq $8,%%mm2 \n\t" // shift rt for "next" state "pand mask1,%%mm7 \n\t" // mask out all but MSB "paddb %%mm7,%%mm2 \n\t" // add in byte "punpckhbw %%mm6,%%mm1 \n\t" // unpack "punpckhbw %%mm6,%%mm2 \n\t" // unpack "paddw %%mm0,%%mm1 \n\t" // add in result from multiply to "previous" data "paddw bias1w,%%mm1 \n\t" // add in bias "paddw %%mm0,%%mm2 \n\t" // add in result from multiply to "next" data "paddw bias2w,%%mm2 \n\t" // add in bias "psrlw $2,%%mm1 \n\t" // convert from word to byte "psrlw $2,%%mm2 \n\t" // convert from word to byte "psllq $8,%%mm2 \n\t" // prepare for interleave "paddb %%mm1,%%mm2 \n\t" // do interleave "movq %%mm2,8(%%edi) \n\t" // write out results "emms \n\t" : // no output regs // %0 %1 %2 %3 %4 : "m"(hsize), "m"(inptr), "m"(outptr) : "eax", "ecx", "edx", "esi", "edi", "memory", "cc", "st" );#endif }}METHODDEF(void)h2v1_fancy_upsample_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)#elseMETHODDEF(void)h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)#endif{ JSAMPARRAY output_data = *output_data_ptr; register JSAMPROW inptr, outptr; register int invalue; register JDIMENSION colctr; int inrow; for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) { inptr = input_data[inrow]; outptr = output_data[inrow]; /* Special case for first column */ invalue = GETJSAMPLE(*inptr++); *outptr++ = (JSAMPLE) invalue; *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2); for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) { /* General case: 3/4 * nearer pixel + 1/4 * further pixel */ invalue = GETJSAMPLE(*inptr++) * 3; *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2); *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2); } /* Special case for last column */ invalue = GETJSAMPLE(*inptr); *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2); *outptr++ = (JSAMPLE) invalue; }}/* * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. * Again a triangle filter; see comments for h2v1 case, above. * * It is OK for us to reference the adjacent input rows because we demanded * context from the main buffer controller (see initialization code). */#if defined(HAVE_MMX_INTEL_MNEMONICS) || defined(HAVE_MMX_ATT_MNEMONICS) /* Silly forward definitions */METHODDEF(void)h2v2_fancy_upsample_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);METHODDEF(void)h2v2_fancy_upsample_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr);METHODDEF(void)h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr){ if (MMXAvailable) h2v2_fancy_upsample_mmx(cinfo, compptr, input_data, output_data_ptr); else h2v2_fancy_upsample_orig(cinfo, compptr, input_data, output_data_ptr);}METHODDEF(void)h2v2_fancy_upsample_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr, JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr){ JSAMPARRAY output_data = *output_data_ptr;#if defined(HAVE_MMX_INTEL_MNEMONICS) /* I think that's needed for an Intel compiler */ register JSAMPROW inptr0, inptr1, inptr2, outptr, outptr2, save_val; /* pointers to unsigned char */#else /* but you get warnings on a GNU compiler */ JSAMPROW inptr0, inptr1, inptr2, outptr, outptr2, save_val; /* pointers to unsigned char */#endif int inrow, outrow, vsamp = cinfo->max_v_samp_factor, c = 0, dsamp = compptr->downsampled_width, out_offset = dsamp * 4, dsamp_2xw = dsamp * 2; inrow = outrow = 0; while (outrow < cinfo->max_v_samp_factor) { /* inptr0 points to nearest input row, inptr1 points to next nearest */ inptr0 = input_data[inrow]; inptr1 = input_data[inrow-1]; inptr2 = input_data[inrow+1]; outptr = output_data[outrow++]; save_val = outptr + out_offset; outptr2 = output_data[outrow++];#if defined(HAVE_MMX_INTEL_MNEMONICS) _asm { /* This is what we are trying to accomplish here mm0 mm2 mm1 mm3 o1 = (9 * i0[0] + 3 * i1[0] + 3 * i0[-1] + i1[-1] + 8) >> 4 o3 = (9 * i0[1] + 3 * i1[1] + 3 * i0[0] + i1[0] + 8) >> 4 o5 = (9 * i0[2] + 3 * i1[2] + 3 * i0[1] + i1[1] + 8) >> 4 o7 = (9 * i0[3] + 3 * i1[3] + 3 * i0[2] + i1[2] + 8) >> 4 mm0 mm2 mm1 mm3 o2 = (9 * i0[0] + 3 * i1[0] + 3 * i0[1] + i1[1] + 7) >> 4 o4 = (9 * i0[1] + 3 * i1[1] + 3 * i0[2] + i1[2] + 7) >> 4 o6 = (9 * i0[2] + 3 * i1[2] + 3 * i0[3] + i1[3] + 7) >> 4 o8 = (9 * i0[3] + 3 * i1[3] + 3 * i0[4] + i1[4] + 7) >> 4 output_buf = [o1 o2 o3 o4 o5 o6 o7 o8] NOTE: for special first and last column cases o1 = (12 * i0[0] + 4 * i1[0] + 3 * 0 + 0 + 8) >> 4 */ ;// Part 1 of the output - process lo data for o1 o3 o5 o7 mov ecx, dsamp ;// columns to process mov edx, inptr0 ;// input row1 mov esi, inptr1 ;// input row2 mov edi, outptr ;// output buffer mov eax, save_val movq mm0, [edx] ;// get data from input row 0 movq mm2, [esi] ;// get data from input row 1 movq mm4, mm0 ;// save to process hi half of input0 movq mm5, mm2 ;// save to process hi half of input1 punpcklbw mm0, noval ;// process inptr0 movq mm1, mm0 ;// copy inptr0 psllq mm1, 16 ;// shift for first column special case i0[-1] pmullw mm0, mul9ws ;// multiply by special case constant pmullw mm1, mul3w ;// multiply input1 by 3 punpcklbw mm2, noval ;// process inptr1 movq mm3, mm2 ;// copy inptr0 psllq mm3, 16 ;// shift for first column special case i1[-1] pmullw mm2, mul3ws ;// multiply by special case constant paddw mm1, mm0 ;// Add up results for movq [eax], mm1 movq mm6, mm1 ;// with the next results paddw mm3, mm2 ;// final o1 o3 o5 o7 paddw mm6, mm3 ;// output to be interleaved paddw mm6, bias8w ;// Add even bias psrlw mm6, 4 ;// convert from word to byte (truncate) ;// Part 2 of the output - process lo data for o2 o4 o6 o8 movq mm0, mm4 ;// get data from input row 0 movq mm2, mm5 ;// get data from input row 1 movq mm1, mm0 ;// copy inptr0 for unpack movq mm3, mm2 ;// copy inptr1 for unpack punpcklbw mm0, noval ;// process inptr1 psrlq mm1, 8 ;// shift right for i0[1][2][3][4] punpcklbw mm1, noval ;// process inptr1 pmullw mm0, mul9w ;// multiply by nearest point constant pmullw mm1, mul3w ;// multiply by next nearest constant punpcklbw mm2, noval ;// process inptr1 psrlq mm3, 8 ;// shift right for i1[1][2][3][4] punpcklbw mm3, noval ;// process inptr1 pmullw mm2, mul3w ;// multiply by next nearest constant paddw mm0, mm1 ;// Add up results for final o2 o4 o6 o8 movq [eax+8], mm0 paddw mm0, mm3 ;// previous results for o1 o3 o5 o7 paddw mm0, bias7w ;// Add odd bias paddw mm0, mm2 ;// output to be interleaved with the psrlw mm0, 4 ;// convert back to byte (with truncate) psllq mm0, 8 ;// prepare to interleave output results paddw mm6, mm0 ;// interleave results movq [edi], mm6 ;// write to output buffer add edi, 8 ;// increment output pointer add eax, 16 sub ecx, 8 cmp ecx, 0 jle last_column ;// End of special case. Now for generic loop col_loop: ;// Part 2 of the output movq mm0, mm4 ;// get data from input row 0 movq mm2, mm5 ;// get data from input row 1 movq mm1, mm0 ;// copy inptr0 for unpack movq mm3, mm2 ;// copy inptr1 for unpack movq input0, mm0 movq input1, mm2 punpckhbw mm0, noval ;// process inptr1[0] psllq mm1, 8 ;// shift for inptr0[-1] punpckhbw mm1, noval ;// process inptr1[1] pmullw mm0, mul9w ;// multiply by special case constant pmullw mm1, mul3w ;// multiply inptr1 by 3 punpckhbw mm2, noval ;// process inptr1[0] psllq mm3, 8 ;// shift for inptr1[-1] punpckhbw mm3, noval ;// process inptr1 pmullw mm2, mul3w ;// multiply by special case constant paddw mm1, mm0 ;// Add up results for movq [eax], mm1 movq mm6, mm1 ;// with the next results paddw mm6, bias8w ;// Add even bias paddw mm3, mm2 ;// final o1 o3 o5 o7 paddw mm6, mm3 ;// output to be interleaved psrlw mm6, 4 ;// convert from word to byte (truncate) ;// process hi data for o2 o4 o6 o8 movq mm1, mm4 ;// get data from input row 0 movq mm3, mm5 ;// copy inptr1 for unpack psrlq mm1, 8 ;// shift right for i0[1][2][3][4] movq mm4, [edx + 8] ;// need to add in a byte from the next column ;// load next inptr0 to mm4 for future use movq mm7, mm4 psllq mm7, 56 ;// shift for MSB paddb mm1, mm7 ;// add in MSB from next input0 column punpckhbw mm1, noval ;// process inptr0 pmullw mm1, mul3w ;// multiply by next nearest constant psrlq mm3, 8 ;// shift right for i1[1][2][3][4] movq mm5, [esi + 8] ;// need to add in a byte from the next column ;// load next inptr1 to mm5 for future use movq mm7, mm5 psllq mm7, 56 ;// shift for MSB paddb mm3, mm7 ;// add in MSB from next input1 column punpckhbw mm3, noval ;// process inptr1 paddw mm0, mm1 ;// Add odd bias movq [eax+8], mm0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -