📄 pnggccrd.c
字号:
"secondloop24: \n\t" "sall %%edx \n\t" // move high bit to CF "jnc skip24 \n\t" // if CF = 0 "movw (%%esi), %%ax \n\t" "movw %%ax, (%%edi) \n\t" "xorl %%eax, %%eax \n\t" "movb 2(%%esi), %%al \n\t" "movb %%al, 2(%%edi) \n\t" "skip24: \n\t" "addl $3, %%esi \n\t" "addl $3, %%edi \n\t" "decl %%ecx \n\t" "jnz secondloop24 \n\t" "end24: \n\t" "EMMS \n\t" // DONE : "=a" (dummy_value_a), // output regs (dummy) "=d" (dummy_value_d), "=c" (dummy_value_c), "=S" (dummy_value_S), "=D" (dummy_value_D) : "3" (srcptr), // esi // input regs "4" (dstptr), // edi "0" (diff), // eax// was (unmask) "b" RESERVED // ebx // Global Offset Table idx "2" (len), // ecx "1" (mask) // edx#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ : "%mm0", "%mm1", "%mm2" // clobber list , "%mm4", "%mm5", "%mm6", "%mm7"#endif ); } else /* mmx _not supported - Use modified C routine */#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ { register png_uint_32 i; png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass]; /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ register int stride = BPP3 * png_pass_inc[png_ptr->pass]; /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass]; /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ int diff = (int) (png_ptr->width & 7); /* amount lost */ register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */ srcptr = png_ptr->row_buf + 1 + initial_val; dstptr = row + initial_val; for (i = initial_val; i < final_val; i += stride) { png_memcpy(dstptr, srcptr, rep_bytes); srcptr += stride; dstptr += stride; } if (diff) /* number of leftover pixels: 3 for pngtest */ { final_val+=diff*BPP3; for (; i < final_val; i += stride) { if (rep_bytes > (int)(final_val-i)) rep_bytes = (int)(final_val-i); png_memcpy(dstptr, srcptr, rep_bytes); srcptr += stride; dstptr += stride; } } } /* end of else (_mmx_supported) */ break; } /* end 24 bpp */ case 32: /* png_ptr->row_info.pixel_depth */ { png_bytep srcptr; png_bytep dstptr;#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)#if !defined(PNG_1_0_X) if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) /* && _mmx_supported */ )#else if (_mmx_supported)#endif { png_uint_32 len; int diff; int dummy_value_a; // fix 'forbidden register spilled' error int dummy_value_d; int dummy_value_c; int dummy_value_S; int dummy_value_D; _unmask = ~mask; // global variable for -fPIC version srcptr = png_ptr->row_buf + 1; dstptr = row; len = png_ptr->width &~7; // reduce to multiple of 8 diff = (int) (png_ptr->width & 7); // amount lost // __asm__ __volatile__ ( "movd _unmask, %%mm7 \n\t" // load bit pattern "psubb %%mm6, %%mm6 \n\t" // zero mm6 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklwd %%mm7, %%mm7 \n\t" "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks "movq _mask32_0, %%mm0 \n\t" "movq _mask32_1, %%mm1 \n\t" "movq _mask32_2, %%mm2 \n\t" "movq _mask32_3, %%mm3 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm7, %%mm2 \n\t" "pand %%mm7, %%mm3 \n\t" "pcmpeqb %%mm6, %%mm0 \n\t" "pcmpeqb %%mm6, %%mm1 \n\t" "pcmpeqb %%mm6, %%mm2 \n\t" "pcmpeqb %%mm6, %%mm3 \n\t"// preload "movl len, %%ecx \n\t" // load length of line// preload "movl srcptr, %%esi \n\t" // load source// preload "movl dstptr, %%edi \n\t" // load dest "cmpl $0, %%ecx \n\t" // lcr "jz mainloop32end \n\t" "mainloop32: \n\t" "movq (%%esi), %%mm4 \n\t" "pand %%mm0, %%mm4 \n\t" "movq %%mm0, %%mm6 \n\t" "movq (%%edi), %%mm7 \n\t" "pandn %%mm7, %%mm6 \n\t" "por %%mm6, %%mm4 \n\t" "movq %%mm4, (%%edi) \n\t" "movq 8(%%esi), %%mm5 \n\t" "pand %%mm1, %%mm5 \n\t" "movq %%mm1, %%mm7 \n\t" "movq 8(%%edi), %%mm6 \n\t" "pandn %%mm6, %%mm7 \n\t" "por %%mm7, %%mm5 \n\t" "movq %%mm5, 8(%%edi) \n\t" "movq 16(%%esi), %%mm6 \n\t" "pand %%mm2, %%mm6 \n\t" "movq %%mm2, %%mm4 \n\t" "movq 16(%%edi), %%mm7 \n\t" "pandn %%mm7, %%mm4 \n\t" "por %%mm4, %%mm6 \n\t" "movq %%mm6, 16(%%edi) \n\t" "movq 24(%%esi), %%mm7 \n\t" "pand %%mm3, %%mm7 \n\t" "movq %%mm3, %%mm5 \n\t" "movq 24(%%edi), %%mm4 \n\t" "pandn %%mm4, %%mm5 \n\t" "por %%mm5, %%mm7 \n\t" "movq %%mm7, 24(%%edi) \n\t" "addl $32, %%esi \n\t" // inc by 32 bytes processed "addl $32, %%edi \n\t" "subl $8, %%ecx \n\t" // dec by 8 pixels processed "ja mainloop32 \n\t" "mainloop32end: \n\t"// preload "movl diff, %%ecx \n\t" // (diff is in eax) "movl %%eax, %%ecx \n\t" "cmpl $0, %%ecx \n\t" "jz end32 \n\t"// preload "movl mask, %%edx \n\t" "sall $24, %%edx \n\t" // low byte => high byte "secondloop32: \n\t" "sall %%edx \n\t" // move high bit to CF "jnc skip32 \n\t" // if CF = 0 "movl (%%esi), %%eax \n\t" "movl %%eax, (%%edi) \n\t" "skip32: \n\t" "addl $4, %%esi \n\t" "addl $4, %%edi \n\t" "decl %%ecx \n\t" "jnz secondloop32 \n\t" "end32: \n\t" "EMMS \n\t" // DONE : "=a" (dummy_value_a), // output regs (dummy) "=d" (dummy_value_d), "=c" (dummy_value_c), "=S" (dummy_value_S), "=D" (dummy_value_D) : "3" (srcptr), // esi // input regs "4" (dstptr), // edi "0" (diff), // eax// was (unmask) "b" RESERVED // ebx // Global Offset Table idx "2" (len), // ecx "1" (mask) // edx#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list , "%mm4", "%mm5", "%mm6", "%mm7"#endif ); } else /* mmx _not supported - Use modified C routine */#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ { register png_uint_32 i; png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass]; /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ register int stride = BPP4 * png_pass_inc[png_ptr->pass]; /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass]; /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ int diff = (int) (png_ptr->width & 7); /* amount lost */ register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */ srcptr = png_ptr->row_buf + 1 + initial_val; dstptr = row + initial_val; for (i = initial_val; i < final_val; i += stride) { png_memcpy(dstptr, srcptr, rep_bytes); srcptr += stride; dstptr += stride; } if (diff) /* number of leftover pixels: 3 for pngtest */ { final_val+=diff*BPP4; for (; i < final_val; i += stride) { if (rep_bytes > (int)(final_val-i)) rep_bytes = (int)(final_val-i); png_memcpy(dstptr, srcptr, rep_bytes); srcptr += stride; dstptr += stride; } } } /* end of else (_mmx_supported) */ break; } /* end 32 bpp */ case 48: /* png_ptr->row_info.pixel_depth */ { png_bytep srcptr; png_bytep dstptr;#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)#if !defined(PNG_1_0_X) if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) /* && _mmx_supported */ )#else if (_mmx_supported)#endif { png_uint_32 len; int diff; int dummy_value_a; // fix 'forbidden register spilled' error int dummy_value_d; int dummy_value_c; int dummy_value_S; int dummy_value_D; _unmask = ~mask; // global variable for -fPIC version srcptr = png_ptr->row_buf + 1; dstptr = row; len = png_ptr->width &~7; // reduce to multiple of 8 diff = (int) (png_ptr->width & 7); // amount lost // __asm__ __volatile__ ( "movd _unmask, %%mm7 \n\t" // load bit pattern "psubb %%mm6, %%mm6 \n\t" // zero mm6 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklwd %%mm7, %%mm7 \n\t" "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks "movq _mask48_0, %%mm0 \n\t" "movq _mask48_1, %%mm1 \n\t" "movq _mask48_2, %%mm2 \n\t" "movq _mask48_3, %%mm3 \n\t" "movq _mask48_4, %%mm4 \n\t" "movq _mask48_5, %%mm5 \n\t" "pand %%mm7, %%mm0 \n\t" "pand %%mm7, %%mm1 \n\t" "pand %%mm7, %%mm2 \n\t" "pand %%mm7, %%mm3 \n\t" "pand %%mm7, %%mm4 \n\t" "pand %%mm7, %%mm5 \n\t" "pcmpeqb %%mm6, %%mm0 \n\t" "pcmpeqb %%mm6, %%mm1 \n\t" "pcmpeqb %%mm6, %%mm2 \n\t" "pcmpeqb %%mm6, %%mm3 \n\t" "pcmpeqb %%mm6, %%mm4 \n\t" "pcmpeqb %%mm6, %%mm5 \n\t"// preload "movl len, %%ecx \n\t" // load length of line// preload "movl srcptr, %%esi \n\t" // load source// preload "movl dstptr, %%edi \n\t" // load dest "cmpl $0, %%ecx \n\t" "jz mainloop48end \n\t" "mainloop48: \n\t" "movq (%%esi), %%mm7 \n\t" "pand %%mm0, %%mm7 \n\t" "movq %%mm0, %%mm6 \n\t" "pandn (%%edi), %%mm6 \n\t" "por %%mm6, %%mm7 \n\t" "movq %%mm7, (%%edi) \n\t" "movq 8(%%esi), %%mm6 \n\t" "pand %%mm1, %%mm6 \n\t" "movq %%mm1, %%mm7 \n\t" "pandn 8(%%edi), %%mm7 \n\t" "por %%mm7, %%mm6 \n\t"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -