/* pnggccrd.c -- excerpt of a C source listing (1,563 lines in total; this is
 * page 1 of 5), taken from a code collection titled "A* algorithm".
 */
/* preload "movl srcptr, %%esi \n\t" // load source */
/* preload "movl dstptr, %%edi \n\t" // load dest */
"cmpl $0, %%ecx \n\t"
"jz mainloop24end \n\t"
"mainloop24: \n\t"
"movq (%%esi), %%mm4 \n\t"
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
"movq (%%edi), %%mm7 \n\t"
"pandn %%mm7, %%mm6 \n\t"
"por %%mm6, %%mm4 \n\t"
"movq %%mm4, (%%edi) \n\t"
"movq 8(%%esi), %%mm5 \n\t"
"pand %%mm1, %%mm5 \n\t"
"movq %%mm1, %%mm7 \n\t"
"movq 8(%%edi), %%mm6 \n\t"
"pandn %%mm6, %%mm7 \n\t"
"por %%mm7, %%mm5 \n\t"
"movq %%mm5, 8(%%edi) \n\t"
"movq 16(%%esi), %%mm6 \n\t"
"pand %%mm2, %%mm6 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq 16(%%edi), %%mm7 \n\t"
"pandn %%mm7, %%mm4 \n\t"
"por %%mm4, %%mm6 \n\t"
"movq %%mm6, 16(%%edi) \n\t"
"addl $24, %%esi \n\t" /* inc by 24 bytes processed */
"addl $24, %%edi \n\t"
"subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
"ja mainloop24 \n\t"
"mainloop24end: \n\t"
/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
"jz end24 \n\t"
/* preload "movl mask, %%edx \n\t" */
"sall $24, %%edx \n\t" /* make low byte, high byte */
"secondloop24: \n\t"
"sall %%edx \n\t" /* move high bit to CF */
"jnc skip24 \n\t" /* if CF = 0 */
"movw (%%esi), %%ax \n\t"
"movw %%ax, (%%edi) \n\t"
"xorl %%eax, %%eax \n\t"
"movb 2(%%esi), %%al \n\t"
"movb %%al, 2(%%edi) \n\t"
"skip24: \n\t"
"addl $3, %%esi \n\t"
"addl $3, %%edi \n\t"
"decl %%ecx \n\t"
"jnz secondloop24 \n\t"
"end24: \n\t"
"EMMS \n\t" /* DONE */
: "=a" (dummy_value_a), /* output regs (dummy) */
"=d" (dummy_value_d),
"=c" (dummy_value_c),
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "3" (srcptr), /* esi // input regs */
"4" (dstptr), /* edi */
"0" (diff), /* eax */
/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
"2" (len), /* ecx */
"1" (mask) /* edx */
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
: "%mm0", "%mm1", "%mm2" /* clobber list */
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
else /* mmx _not supported - Use modified C routine */
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
{
   /* Portable C path for 3-byte pixels.  The per-pass tables decide where
    * the first copied pixel sits, how far apart copied runs are and how
    * long each run is (all scaled to bytes by BPP3):
    *    png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
    *    png.c: png_pass_inc[]   = {8, 8, 4, 4, 2, 2, 1};
    *    png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
    */
   png_uint_32 first_byte = BPP3 * png_pass_start[png_ptr->pass];
   int step = BPP3 * png_pass_inc[png_ptr->pass];
   int run_bytes = BPP3 * png_pass_width[png_ptr->pass];
   png_uint_32 whole_pixels = png_ptr->width &~7; /* width rounded down to a multiple of 8 */
   int tail_pixels = (int) (png_ptr->width & 7);  /* pixels beyond that multiple */
   png_uint_32 limit = BPP3 * whole_pixels;       /* byte offset where the 8-pixel part ends */
   png_uint_32 pos = first_byte;                  /* byte offset of the current run */

   srcptr = png_ptr->row_buf + 1 + first_byte;
   dstptr = row + first_byte;

   /* Runs that lie inside the multiple-of-8 part of the row. */
   while (pos < limit)
   {
      png_memcpy(dstptr, srcptr, run_bytes);
      srcptr += step;
      dstptr += step;
      pos += step;
   }

   /* Leftover pixels (3 for pngtest): extend the limit to the true end of
    * the row and shorten any run that would otherwise cross it.
    */
   if (tail_pixels)
   {
      limit += tail_pixels*BPP3;
      while (pos < limit)
      {
         int room = (int)(limit - pos);

         if (run_bytes > room)
            run_bytes = room;
         png_memcpy(dstptr, srcptr, run_bytes);
         srcptr += step;
         dstptr += step;
         pos += step;
      }
   }
} /* end of else (_mmx_supported) */
break;
} /* end 24 bpp */
case 32: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
/* && _mmx_supported */ )
#else
if (_mmx_supported)
#endif
{
/* MMX branch for 32-bit pixels (4 bytes each).  Merges the source bytes at
 * png_ptr->row_buf + 1 into `row`: the main loop blends 8 pixels (32 bytes)
 * per iteration through byte-select masks held in mm0-mm3, and the tail loop
 * handles the remaining (width & 7) pixels one at a time, copying a pixel
 * only when the matching bit of `mask` is set.
 */
png_uint_32 len;
int diff;
/* Dummy outputs, one per general register the asm overwrites; each input
 * below is tied to one of them through a matching-number constraint.
 */
int dummy_value_a; /* fix 'forbidden register spilled' error */
int dummy_value_d;
int dummy_value_c;
int dummy_value_S;
int dummy_value_D;
/* The asm reads this by symbol name rather than as an operand.
 * NOTE(review): a shared global written on every call is presumably why
 * this branch is guarded by PNG_THREAD_UNSAFE_OK -- confirm.
 */
_unmask = ~mask; /* global variable for -fPIC version */
srcptr = png_ptr->row_buf + 1;
dstptr = row;
len = png_ptr->width &~7; /* reduce to multiple of 8 */
diff = (int) (png_ptr->width & 7); /* leftover pixels: width mod 8 */
__asm__ __volatile__ (
/* --- build the four select masks mm0..mm3 --- */
"movd _unmask, %%mm7 \n\t" /* load bit pattern */
"psubb %%mm6, %%mm6 \n\t" /* zero mm6 */
/* the three unpacks replicate the low byte of mm7 into all 8 bytes */
"punpcklbw %%mm7, %%mm7 \n\t"
"punpcklwd %%mm7, %%mm7 \n\t"
"punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */
/* _mask32_0.._mask32_3 are constants defined outside this excerpt;
 * presumably each byte carries the mask bit of the pixel it belongs to
 * -- TODO confirm against their definition */
"movq _mask32_0, %%mm0 \n\t"
"movq _mask32_1, %%mm1 \n\t"
"movq _mask32_2, %%mm2 \n\t"
"movq _mask32_3, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm3 \n\t"
/* compare with zero: a byte becomes 0xFF where the AND result was 0
 * and 0x00 elsewhere */
"pcmpeqb %%mm6, %%mm0 \n\t"
"pcmpeqb %%mm6, %%mm1 \n\t"
"pcmpeqb %%mm6, %%mm2 \n\t"
"pcmpeqb %%mm6, %%mm3 \n\t"
/* preload "movl len, %%ecx \n\t" // load length of line */
/* preload "movl srcptr, %%esi \n\t" // load source */
/* preload "movl dstptr, %%edi \n\t" // load dest */
"cmpl $0, %%ecx \n\t" /* lcr */
"jz mainloop32end \n\t" /* no full group of 8 pixels: skip main loop */
/* --- main loop: 8 pixels = 4 quadwords per pass; for each quadword
 *     dst = (src & mmN) | (dst & ~mmN) --- */
"mainloop32: \n\t"
/* quadword 0, select mask mm0 */
"movq (%%esi), %%mm4 \n\t"
"pand %%mm0, %%mm4 \n\t"
"movq %%mm0, %%mm6 \n\t"
"movq (%%edi), %%mm7 \n\t"
"pandn %%mm7, %%mm6 \n\t" /* mm6 = ~mm6 & mm7 */
"por %%mm6, %%mm4 \n\t"
"movq %%mm4, (%%edi) \n\t"
/* quadword 1, select mask mm1 */
"movq 8(%%esi), %%mm5 \n\t"
"pand %%mm1, %%mm5 \n\t"
"movq %%mm1, %%mm7 \n\t"
"movq 8(%%edi), %%mm6 \n\t"
"pandn %%mm6, %%mm7 \n\t"
"por %%mm7, %%mm5 \n\t"
"movq %%mm5, 8(%%edi) \n\t"
/* quadword 2, select mask mm2 */
"movq 16(%%esi), %%mm6 \n\t"
"pand %%mm2, %%mm6 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq 16(%%edi), %%mm7 \n\t"
"pandn %%mm7, %%mm4 \n\t"
"por %%mm4, %%mm6 \n\t"
"movq %%mm6, 16(%%edi) \n\t"
/* quadword 3, select mask mm3 */
"movq 24(%%esi), %%mm7 \n\t"
"pand %%mm3, %%mm7 \n\t"
"movq %%mm3, %%mm5 \n\t"
"movq 24(%%edi), %%mm4 \n\t"
"pandn %%mm4, %%mm5 \n\t"
"por %%mm5, %%mm7 \n\t"
"movq %%mm7, 24(%%edi) \n\t"
"addl $32, %%esi \n\t" /* inc by 32 bytes processed */
"addl $32, %%edi \n\t"
"subl $8, %%ecx \n\t" /* dec by 8 pixels processed */
"ja mainloop32 \n\t" /* repeat while the count is still above zero */
"mainloop32end: \n\t"
/* --- tail loop: the remaining diff pixels, one per iteration --- */
/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */
"movl %%eax, %%ecx \n\t"
"cmpl $0, %%ecx \n\t"
"jz end32 \n\t" /* width was a multiple of 8: nothing left */
/* preload "movl mask, %%edx \n\t" */
"sall $24, %%edx \n\t" /* low byte => high byte */
"secondloop32: \n\t"
"sall %%edx \n\t" /* move high bit to CF */
"jnc skip32 \n\t" /* if CF = 0 */
/* mask bit set: copy one 4-byte pixel (eax is free to use as scratch
 * because diff was already moved to ecx) */
"movl (%%esi), %%eax \n\t"
"movl %%eax, (%%edi) \n\t"
"skip32: \n\t"
"addl $4, %%esi \n\t"
"addl $4, %%edi \n\t"
"decl %%ecx \n\t"
"jnz secondloop32 \n\t"
"end32: \n\t"
"EMMS \n\t" /* DONE */
: "=a" (dummy_value_a), /* output regs (dummy) */
"=d" (dummy_value_d),
"=c" (dummy_value_c),
"=S" (dummy_value_S),
"=D" (dummy_value_D)
/* inputs: the digit names the output operand whose register is shared */
: "3" (srcptr), /* esi // input regs */
"4" (dstptr), /* edi */
"0" (diff), /* eax */
/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */
"2" (len), /* ecx */
"1" (mask) /* edx */
/* NOTE(review): the asm stores through %edi, yet no "memory" clobber is
 * declared and the MMX clobber list below is compiled out -- confirm this
 * is safe for the compilers being targeted.
 * NOTE(review): mainloop32, skip32 etc. are plain assembler labels; they
 * would presumably clash if this statement were ever emitted twice in one
 * object file -- confirm.
 */
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
: "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
else /* mmx _not supported - Use modified C routine */
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
{
   /* Portable C path for 4-byte pixels.  The per-pass tables decide where
    * the first copied pixel sits, how far apart copied runs are and how
    * long each run is (all scaled to bytes by BPP4):
    *    png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
    *    png.c: png_pass_inc[]   = {8, 8, 4, 4, 2, 2, 1};
    *    png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
    */
   png_uint_32 first_byte = BPP4 * png_pass_start[png_ptr->pass];
   int step = BPP4 * png_pass_inc[png_ptr->pass];
   int run_bytes = BPP4 * png_pass_width[png_ptr->pass];
   png_uint_32 whole_pixels = png_ptr->width &~7; /* width rounded down to a multiple of 8 */
   int tail_pixels = (int) (png_ptr->width & 7);  /* pixels beyond that multiple */
   png_uint_32 limit = BPP4 * whole_pixels;       /* byte offset where the 8-pixel part ends */
   png_uint_32 pos = first_byte;                  /* byte offset of the current run */

   srcptr = png_ptr->row_buf + 1 + first_byte;
   dstptr = row + first_byte;

   /* Runs that lie inside the multiple-of-8 part of the row. */
   while (pos < limit)
   {
      png_memcpy(dstptr, srcptr, run_bytes);
      srcptr += step;
      dstptr += step;
      pos += step;
   }

   /* Leftover pixels (3 for pngtest): extend the limit to the true end of
    * the row and shorten any run that would otherwise cross it.
    */
   if (tail_pixels)
   {
      limit += tail_pixels*BPP4;
      while (pos < limit)
      {
         int room = (int)(limit - pos);

         if (run_bytes > room)
            run_bytes = room;
         png_memcpy(dstptr, srcptr, run_bytes);
         srcptr += step;
         dstptr += step;
         pos += step;
      }
   }
} /* end of else (_mmx_supported) */
break;
} /* end 32 bpp */
case 48: /* png_ptr->row_info.pixel_depth */
{
png_bytep srcptr;
png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
/* && _mmx_supported */ )
#else
if (_mmx_supported)
#endif
{
png_uint_32 len;
int diff;
int dummy_value_a; /* fix 'forbidden register spilled' error */
/* [code-viewer keyboard shortcuts: copy code Ctrl + C, search code Ctrl + F,
 * full-screen mode F11, larger font Ctrl + =, smaller font Ctrl + -,
 * show shortcuts ?]
 */