📄 ckuusx.c
字号:
non-Unicode cases.*/intscanfile(name,flag,nscanfile) char * name; int * flag, nscanfile; { FILE * fp; /* File pointer */ unsigned char buf[SCANFILEBUF]; /* File data buffer for analysis */ int x, val = -1, count = 0; /* Workers */ int rc = -1; /* Return code */ int pv = -1; /* Pattern-match value */ int eof = 0; /* Flag for file EOF encountered */ int bytes = 0; /* Total byte count */#ifdef UNICODE unsigned int c0, c1; /* First 2 file bytes (for BOM) */#endif /* UNICODE */ extern int pipesend, filepeek; register int i; /* Loop control */ int readsize = 0; /* How much to read */ int eightbit = 0; /* Number of bytes with 8th bit on */ int c0controls = 0; /* C0 non-text control-char counter */ int c0noniso = 0; /* C0 non-ISO control-char counter */ int c1controls = 0; /* C1 control-character counter */ unsigned int c; /* Current character */ int runmax = 0; /* Longest run of 0 bytes */ int runzero = 0; /* Run of 0 bytes */ int pctzero = 0; /* Percentage of 0 bytes */ int txtcz = 0;#ifdef CK_CTRLZ extern int eofmethod;#endif /* CK_CTRLZ */#ifdef UNICODE int notutf8 = 0; /* Nonzero if definitely not UTF-8 */ int utf8state = 0; /* UTF-8 recognizer state */ int oddzero = 0; /* Number of 0 bytes in odd postions */ int evenzero = 0; /* and in even positions */ int lfnul = 0; /* Number of <LF><NUL> sequences */ int crlf = 0; /* Number of <CRLF> sequences */#else int notutf8 = 1;#endif /* UNICODE */#ifdef COMMENT#ifdef EVENMAX int oddrun = 0, oddmax = 0, oddbyte = 0, oddmaxbyte = 0; int evenrun = 0, evenmax = 0, evenbyte = 0, evenmaxbyte = 0;#endif /* EVENMAX */#endif /* COMMENT */#ifndef NOXFER if (pipesend || calibrate || sndarray) /* Only for real files */ return(-1);#endif /* NOXFER */ debug(F111,"scanfile",name,nscanfile);#ifdef PATTERNS if (!filepeek) { pv = matchname(name,1,-1); if (pv < 0) rc = -1; else rc = (pv == 1) ? FT_BIN : FT_TEXT; debug(F111,"scanfile !filepeek result",name,rc); return(rc); }#endif /* PATTERNS */#ifdef VMS/* We don't scan in VMS where text files have various record formats in *//* which record headers contain seemingly non-text bytes. So the best *//* we can do in VMS is tell whether the file is text or binary, period. */ { int b, x; b = binary; /* Save current binary setting */ if (zopeni(ZIFILE,name) > 0) { /* In VMS this sets binary */ x = binary; /* Get result */ zclose(ZIFILE); /* Close the file */ binary = b; /* Restore previous binary setting */ rc = x ? FT_BIN : FT_TEXT; val = 0; goto xscanfile; } }#endif /* VMS */ eof = 0; /* End-of-file reached indicator */#ifdef OS2 fp = fopen(name, "rb"); /* Open the file in binary mode */#else fp = fopen(name, "r");#endif /* OS2 */ if (!fp) /* Failed? */ return(-1); while (1) { /* One or more gulps from file */ if (eof) { /* EOF from last time? */ debug(F111,"scanfile at EOF",name,bytes); if (runzero > runmax) runmax = runzero; break; } if (nscanfile < 0) { /* Reading whole file */ readsize = SCANFILEBUF; } else { /* Reading first nscanfilee bytes */ readsize = nscanfile - bytes; if (readsize < 1) break; if (readsize > SCANFILEBUF) readsize = SCANFILEBUF; } debug(F101,"scanfile readsize","",readsize); count = fread(buf,1,readsize,fp); /* Read a buffer */ if (count == EOF || count == 0) { debug(F111,"scanfile EOF",name,count); break; } debug(F111,"scanfile buffer ok",name,count); if (bytes == 0 && count > 8) { /* PDF files can look like text in the beginning. */ if (!ckstrcmp((char *)buf,"%PDF-1.",7,1)) { if (isdigit(buf[7])) { if (buf[8] == '\015' || count > 9 && buf[8] == SP && buf[9] == '\015') {#ifdef DEBUG buf[8] = NUL; debug(F110,"scanfile PDF",buf,0);#endif /* DEBUG */ binary = 1; /* But they are binary. */ break; } } } else if (!ckstrcmp((char *)buf,"%!PS-Ado",8,1)) { /* Ditto for PostScript */#ifdef DEBUG int i; for (i = 8; i < count; i++) { if (buf[i] < '!') { buf[i] = NUL; break; } } debug(F110,"scanfile PostScript",buf,0);#endif /* DEBUG */ binary = 1; break;#ifndef NOPCLSCAN } else if (!ckstrcmp((char *)buf,") HP-PCL",8,1)) { /* HP PCL printer language */#ifdef DEBUG int i; for (i = 8; i < count; i++) { if (buf[i] < '!') { buf[i] = NUL; break; } } debug(F110,"scanfile PCL",buf,0);#endif /* DEBUG */ binary = 1; break; } #endif /* NOPCLSCAN */#ifndef NOPJLSCAN else if (buf[0] == '\033' && (buf[1] == 'E' || buf[1] == '%')) { /* Ditto for PJL Job printer header */#ifdef DEBUG int i; for (i = 2; i < count; i++) { if (buf[i] < '!') { buf[i] = NUL; break; } } debug(F110,"scanfile PJL Job printer header",buf,0);#endif /* DEBUG */ binary = 1; break;#endif /* NOPJLSCAN */ } }#ifdef UNICODE if (bytes == 0 && count > 1) { int incl_cnt = 0; /* First look for BOM */ c0 = (unsigned)((unsigned)buf[0]&0xFF); /* First file byte */ c1 = (unsigned)((unsigned)buf[1]&0xFF); /* Second byte */ if (c0 == 0xFE && c1 == 0xFF) { /* UCS-2 BE */ rc = FT_UCS2; val = 0; debug(F111,"scanfile UCS2 BOM BE",ckitoa(val),rc); incl_cnt++; } else if (c0 == 0xFF && c1 == 0xFE) { /* UCS-2 LE */ rc = FT_UCS2; val = 1; debug(F111,"scanfile UCS2 BOM LE",ckitoa(val),rc); incl_cnt++; } else if (count > 2) if (c0 == 0xEF && c1 == 0xBB && (unsigned)((unsigned)buf[2]&0xFF) == 0xBF) { rc = FT_UTF8; debug(F111,"scanfile UTF8 BOM",ckitoa(val),rc); incl_cnt++; } if (incl_cnt) { /* Have BOM */ bytes += count; goto xscanfile; } }#endif /* UNICODE */ bytes += count; /* Count bytes read */ eof = feof(fp); /* Flag for at EOF */ for (i = 0; i < count; i++) { /* For each byte... */ c = (unsigned)buf[i]; /* For ease of reference */ if (!c) { /* Zero byte? */#ifdef EVENMAX if (i&1) /* In odd position */ oddzero++; else evenzero++; /* In even position */#endif /* EVENMAX */ runzero++; } else { /* Not a zero byte */ if (runzero > runmax) runmax = runzero; if (runmax > 2) /* That's all we need to be certain */ break; /* it's a binary file. */ runzero = 0; }#ifdef COMMENT#ifdef EVENMAX/* This is to catch UCS-2 with a non-ASCII, non-Latin-1 repertoire */ if (i > 1) { /* Look for runs of alternating chars */ if (i&1) { if (c == buf[i-2]) { /* In odd positions */ oddrun++; oddbyte = c; } else { oddmax = oddrun; oddmaxbyte = oddbyte; } } else { /* and even positions */ if (c == buf[i-2]) { evenrun++; evenbyte = c; } else { evenmax = evenrun; evenmaxbyte = evenbyte; } } }#endif /* EVENMAX */#endif /* COMMENT */ if ((c & 0x80) == 0) { /* We have a 7-bit byte */#ifdef UNICODE if (i > 0 && c == 10) { /* Linefeed */ if (buf[i-1] == 0) lfnul++; /* Preceded by NUL */ else if (buf[i-1] == 13) crlf++; /* or by CR... */ }#endif /* UNICODE */ if (c < ' ') { /* Check for CO controls */ if (c != LF && c != CR && c != HT && c != FF) { c0controls++; if (c != ESC && c != SO && c != SI) c0noniso++; } if ((c == '\032') /* Ctrl-Z */#ifdef COMMENT && eof && (i >= count - 2)#endif /* COMMENT */ ) { c0controls--; c0noniso--;#ifdef CK_CTRLZ if (eofmethod == XYEOF_Z && txtcz == 0) { if (c0controls == 0) /* All text prior to Ctrl-Z */ txtcz = 1; }#endif /* CK_CTRLZ */ } }#ifdef UNICODE if (!notutf8 && utf8state) { /* In UTF-8 sequence? */ utf8state = 0; debug(F000,"scanfile","7-bit byte in UTF8 sequence",c); notutf8++; /* Then it's not UTF-8 */ continue; }#endif /* UNICODE */ } else { /* We have an 8-bit byte */ eightbit++; /* Count it */ if (c >= 0x80 && c < 0xA0) /* Check for C1 controls */ c1controls++;#ifdef UNICODE if (!notutf8) { /* If it might still be UTF8... */ switch (utf8state) { /* Enter the UTF-8 state machine */ case 0: /* First byte... */ if ((c & 0xE0) == 0xC0) { /* Tells number of */ utf8state = 1; /* subsequent bytes */ } else if ((c & 0xF0) == 0xE0) { utf8state = 2; } else if ((c & 0xF8) == 0xF0) { utf8state = 3; } else { notutf8++; } break; case 1: /* Subsequent byte */ case 2: case 3: if ((c & 0xC0) != 0x80) { /* Must start with 10 */ debug(F000,"scanfile", "bad byte in UTF8 sequence",c); notutf8++; break; } utf8state--; /* Good, one less in this sequence */ break; default: /* Shouldn't happen */ debug(F111,"scanfile","bad UTF8 state",utf8state); notutf8++; } }#endif /* UNICODE */ } } } fclose(fp); /* Close the file */ debug(F101,"scanfile bytes","",bytes); if (bytes == 0) /* If nothing was read */ return(-1); /* we're done. */#ifdef EVENMAX /* In case we had a run that never broke... */#ifdef COMMENT if (oddmax == 0) { oddmax = oddrun; oddmaxbyte = oddbyte; } if (evenmax == 0) { evenmax = evenrun; evenmaxbyte = evenbyte; }#endif /* COMMENT */ if (runmax == 0) { runmax = runzero; }#endif /* EVENMAX */#ifdef UNICODE if (bytes > 100) /* Bytes is not 0 */ pctzero = (evenzero + oddzero) / (bytes / 100); else pctzero = ((evenzero + oddzero) * 100) / bytes;#endif /* UNICODE */#ifdef DEBUG if (deblog) { /* If debugging, dump statistics */ debug(F101,"scanfile c0controls ","",c0controls); debug(F101,"scanfile c0noniso ","",c0noniso); debug(F101,"scanfile c1controls ","",c1controls); debug(F101,"scanfile eightbit ","",eightbit);#ifdef UNICODE debug(F101,"scanfile crlf ","",crlf); debug(F101,"scanfile lfnul ","",lfnul); debug(F101,"scanfile notutf8 ","",notutf8); debug(F101,"scanfile evenzero ","",evenzero); debug(F101,"scanfile oddzero ","",oddzero); debug(F101,"scanfile even/odd ","",(evenzero / (oddzero + 1))); debug(F101,"scanfile odd/even ","",(oddzero / (evenzero + 1))); debug(F101,"scanfile pctzero ","",pctzero);#endif /* UNICODE */#ifdef COMMENT#ifdef EVENMAX debug(F101,"scanfile oddmax ","",oddmax); debug(F101,"scanfile oddmaxbyte ","",oddmaxbyte); debug(F101,"scanfile evenmax ","",evenmax); debug(F101,"scanfile evenmaxbyte","",evenmaxbyte);#endif /* EVENMAX */#endif /* COMMENT */ debug(F101,"scanfile runmax ","",runmax); }#endif /* DEBUG */#ifdef UNICODE x = eightbit ? bytes / 20 : bytes / 4; /* For UCS-2... */ if (runmax > 2) { /* File has run of more than 2 NULs */ debug(F100,"scanfile BIN runmax","",0); rc = FT_BIN; /* so it can't be any kind of text. */ goto xscanfile; } else if (rc == FT_UCS2 || (rc == FT_UTF8 && runmax == 0)) { goto xscanfile; /* File starts with a BOM */ } else if (eightbit > 0 && !notutf8) { /* File has 8-bit data */ if (runmax > 0) { /* and runs of NULs */ debug(F100,"scanfile BIN (nnUTF8) runmax","",0); rc = FT_BIN; /* UTF-8 doesn't have NULs */ } else { /* No NULs */ debug(F100,"scanfile UTF8 (nnUTF8 + runmax == 0)","",0); rc = FT_UTF8; /* and not not UTF-8, so is UTF-8 */ } goto xscanfile; }/* For UCS-2 detection, see if the text contains lines delimited by ASCII controls and containing spaces, ASCII digits, or other ASCII characters, thus forcing the presence of a certain percentage of zero bytes. For this purpose require 20% zero bytes, with at least six times as many in even (odd) positions as in odd (even) positions.*/ if ((evenzero >= x && oddzero == 0) || ((((evenzero / (oddzero + 1)) > 6) && (pctzero > 20)) && (crlf == 0) && (lfnul > 1)) ) { debug(F100,"scanfile UCS2 noBOM BE (even/oddzero)","",0); rc = FT_UCS2; val = 0; } else if ((evenzero == 0 && oddzero >= x) || ((((oddzero / (evenzero + 1)) > 6) && (pctzero > 20)) && (crlf == 0) && (lfnul > 1)) ) { debug(F100,"scanfile UCS2 noBOM LE (even/oddzero)","",0); rc = FT_UCS2; val = 1;#ifdef COMMENT#ifdef EVENMAX/* If the tests above fail, we still might have UCS-2 if there are significant runs of identical bytes in alternating positions, but only if it also has unusual C0 controls (otherwise we'd pick up hex files here). NOTE: We don't actually do this -- EVENMAX is not defined (see comments above at first occurrence of EVENMAX).*/ } else if (c0noniso && evenmax > bytes / 4) { debug(F100,"scanfile UCS2 BE (evenmax)","",0); rc = FT_UCS2; val = 0; } else if (c0noniso && oddmax > bytes / 4) { debug(F100,"scanfile UCS2 LE (evenmax)","",0); rc = FT_UCS2; val = 1;#endif /* EVENMAX */#endif /* COMMENT */ }/* It seems to be UCS-2 but let's be more certain since there is no BOM... If the number of 7- and 8-bit characters is approximately equal, it might be a compressed file. In this case we decide based on the name.*/ if (rc == FT_UCS2) { if (eightbit > 0) { int j, k; j = (c1controls * 100) / (c0controls + 1); debug(F101,"scanfile c1/c0 ","",j); k = (bytes * 100) / eightbit; debug(F101,"scanfile pct 8bit ","",k); if (k > 40 && k < 60 && j > 60) { if (ckmatch("{*.Z,*.gz,*.zip,*.ZIP}",name,1,1)) { debug(F110,"scanfile 8-bit BIN compressed",name,0); rc = FT_BIN;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -