📄 sort5.c
字号:
#ifndef lintstatic char *sccsid = "@(#)sort5.c 4.1 (ULTRIX) 7/2/90";#endif lint/************************************************************************ * * * Copyright (c) 1984,1988 by * * Digital Equipment Corporation, Maynard, MA * * Bull, France * * Siemens AG, FR Germany * * All rights reserved. * * * * This software is furnished under a license and may be used and * * copied only in accordance with the terms of such license and * * with the inclusion of the above copyright notice. This * * software or any other copies thereof may not be provided or * * otherwise made available to any other person. No title to and * * ownership of the software is hereby transferred. * * * * This software is derived from software received from the * * University of California, Berkeley, and from Bell * * Laboratories. Use, duplication, or disclosure is subject to * * restrictions under license agreements with University of * * California and with AT&T. * * * * The information in this software is subject to change without * * notice and should not be construed as a commitment by Digital * * Equipment Corporation. * * * * Digital assumes no responsibility for the use or reliability * * of its software on equipment which is not supplied by Digital. * * * ************************************************************************//* * Modification History: * * 08-Jun-88 Mark Parenti * Changed signal handlers to void. *//* * source code for the sort program. * * For internationalization a (partial) rewrite was necessary to allow for * 1) the dynamic memory management used by the internationalization * routines * 2) To take out all the ascii codeset dependent tables/parts of * code. * * NAME: * sort -- sort and/or merge files * * SYNOPSIS: * sort [-cmu] [-o output] [-y kmem] [-z recsz] [-tx] [-T dir] [-bdfinrM] * [+pos[-pos]] [file...] * * DESCRIPTION: * The command sort sorts lines of all the named files together * and writes the result on stdout. Stdin is read when '-' or no * file arguments are given. * * Comparisons are based on one or more sort keys extracted from * each line of input. By default the entire input line is the sort * key and ordering is lexicographic by default collation sequence * taken from the current International UNIX database. * * When the last line of an input file is missing a newline, sort * appends the newline, issues a diagnostic message and continues. * * OPTIONS: * The following options alter the default behaviour: * * -T dir Use the named directory for temporary files * * -c Check that the input file is sorted according to the * ordering rules. No output is given unless the file is * out of sort. * * -m Do only the merge part. All input files are assumed to be * already sorted. * * -u Supress all but one in each set of lines having equal keys. * This also works with the -c option. * * -o output Use the named file for output of the sort instead of stdout. * Output may be the same as one of the input files. * * -y kmem Set the amount of memory used to kmem kilo bytes. If this * option is ommitted, sort begins with a default size and * continues to use more space as needed. If no kmem argument * is given, sort will use the maximum. A kmem value of 0 * will start using minimal memory. The maximum and minimum * values are set at compile time and values outside this range * are mapped onto max or min respectivly. * * -z recsz Take the maximum length of an input line to be recsz bytes. * Sort will determine the maximal size automatically when * the sort phase is run through, so this option only has * to be specified when either -c or -m is in effect. Lines * longer than recsz will cause sort to terminate abnormally. * * The following options override the default ordering rules: * * -d Only letters, digits and white space is significant in * comparisons. * * -f Fold lower case letters to upper case. * * -i Ignore control characters in comparisons. * * -n Sort numerically on an initial numeric string, consisting * of optional leading white space, optional minus sign and * zero or more digits with optional radix character. * * -r reverse the sense of comparisons. * * -M Use numerical values of monthnames for comparisons * * When given before a restricted sort key, the ordering rules apply * globally to all sort keys. All of these options may also be * attached to a specific sort key. In this case the attached options * take precedence. * * The notation +pos1 -pos2 restricts a sort key to the field beginning * at pos1 and ending at pos2 inclusive. A missing pos2 means the end * of the line. * * Pos1 and pos2 have the form m[.n][bdfinrM]. * where m designates the m+1th field, * n designates the n+1th character in this field and * [bdfinrM] is the ordering rule to apply to the field. * * A missing .n in pos1 denotes the first character of the field. * A missing .n in pos2 denotes the last character of the field, * values for n larger than zero specify characters after the end of * the field. * * A field is starts with optional white space and ends with the first * white space character after a non white space sequence. * * The following options influence the interpretation of fields: * * -b Ignore leading blanks when determining the starting and ending * positions of a restricted sort key. * * -tx Use x as the field separation character. x is not considered * to be part of a field. Each occurence of x is significant and * starts a new field. * * When there are multiple sort keys, later keys are compared only * after all earlier keys compare equal. Lines that otherwise compare * equal are ordered with all codes significant. * * STATUS: * != 0 for various trouble conditions and for disorder under the -c * option, * 0 otherwise. *//* * include files */#include <stdio.h>#include <ctype.h>#include <signal.h>#include <sys/types.h>#include <sys/stat.h>#include <values.h>#include <locale.h>#include <langinfo.h>#include <nl_types.h>/* * message handling */#ifdef INTLM /************** X/OPEN message handling ************/#define NL_SETN 1 /* set number */nl_catd _m_catd;nl_catd catopen();char *catgets();#else#define catgets(a,b,c,d) d#endif/* * The following constant _NFILE comes from stdio.h and is the number * of file pointers available to a program. */#define NFILES _NFILE#define N NFILES - 5 /* stdin, stdout, stderr, output, database */#define NF 10 /* max number of field specifications */#define MTHRESH 8 /* threshhold for median of 3 qksort */#if (NFILES > 32)#define TREEZ 64 /* no less than N and best pow of 2 */#else#define TREEZ 32 /* no less than N and best pow of 2 */#endif/* * Memory administration: * * Using a lot of memory is great when sorting a lot of data. * Using a megabyte to sort the output of `who' loses big. * MAXMEM, MINMEM and DEFMEM define the absolute maximum, * minimum and default memory requirements. Administrators * can override any or all of these via defines at compile time. * Users can override the amount allocated (within the limits * of MAXMEM and MINMEM) on the command line. * * For PDP-11s, memory is limited by the maximum unsigned number, 2^16-1. * Administrators can override this too. * Arguments to core getting routines must be unsigned. * Unsigned long not supported on 11s. */#ifndef MAXMEM#ifdef pdp11#define MAXMEM ((1L << 16)-1) /* 64 kB maximum */#else#define MAXMEM 1048576 /* 1 MB maximum */#endif#endif#ifndef MINMEM#define MINMEM 16384 /* 16 kB minimum */#endif#ifndef DEFMEM#define DEFMEM 32768 /* 32 kB default */#endif/* * First parameter passed to diag function determines the action taken * after the diagnostic message has been output. */#define WARN 0 /* warning msg only: return to caller */#define TERM 1 /* fatal error: call term() */#define EXIT 2 /* fatal error: call exit() *//* * comparison routines available */int asciicmp(); /* use ascii comparison */int numcmp(); /* use numeric comparison */int monthcmp(); /* use monthname comparison */int intlcmp(); /* external comparison from database */int tagcmp(); /* compare tag fields */void disorder(); /* disorder message and exit */#define USE catgets(_m_catd, NL_SETN, 1, "invalid use of command line options")#define blank(c) (c == ' ' || c == '\t')/* * globals for file handling */#define MAXPLEN 100 /* max length of a temp file pathname */#define NAMEOHD 12 /* length of tmp suffix "/stm00000aa" */FILE *os; /* output file descriptor */char *dirtry[] = {"/usr/tmp", "/tmp", NULL}; /* temp file area */char *file; /* pointer to tempfile name */int nfiles; /* number of files to sort/merge */int maxrec; /* size of one record (-z option) */int cmp(); /* compare field routine */int cmpa(); /* compare whole line routines */int (*compare)() = cmpa; /* compare routine to use */char *eol(); /* skip to eol in a string */char *skip(); /* skip according to field specs */void term(); /* terminate sort *//* * globals for option handling */int mflg; /* merge only flag */int nway; /* ??? */int cflg; /* check only flag */int uflg; /* unique flag */char *outfil = "-"; /* output file name, if any */int unsafeout; /* kludge to assure -m -o works */char tabchar; /* field separator character given */int tag = 0; /* TRUE if doing a tag sort *//* * globals for memory management */unsigned tryfor; /* memory we want to have */unsigned getmem(); /* memory allocator during sort */char *malloc(); /* memory allocator */char *realloc(); /* memory allocator */long memmoved = 0; /* amount moved during realloc */ /* state of last getline */#define OKAY 0#define INHEAD 1#define INREAD 2#define INTAG 3int getstate = OKAY; /* used in sort/getline */char *recstart = (char *)0; /* used if getline fails *//* * Below earg[cv] replace arg[cv] in place while command arguments are * scanned. */char **eargv; /* pointer to file argument vector */int eargc; /* argument count for file arguments *//* * The following is used to implement tree sort/insert */struct btree { /* binary tree construct */ char *rp; /* contents of line */ int rn; /* diverse uses ?? */};struct btree tree[TREEZ];struct btree *treep[TREEZ];int blkcnt[TREEZ];char **blkcur[TREEZ];long wasfirst = 0;long notfirst = 0;int bonus;/* * tables used in field specification. * see specification of struct field below * The interesting point is that always table[128] is used as starting * point (sign extension for characters is assumed!). * * NAME TYPE FUNCTION * fold: code used to fold upper to lower case * nofold: code used to differ between upper and lower case * zero: ignore used to ignore no characters * nonprint: ignore used to ignore non printable characters * dict: ignore used to ignore characters not in the dictionary */char fold[256] = { 0200,0201,0202,0203,0204,0205,0206,0207, 0210,0211,0212,0213,0214,0215,0216,0217, 0220,0221,0222,0223,0224,0225,0226,0227, 0230,0231,0232,0233,0234,0235,0236,0237, 0240,0241,0242,0243,0244,0245,0246,0247, 0250,0251,0252,0253,0254,0255,0256,0257, 0260,0261,0262,0263,0264,0265,0266,0267, 0270,0271,0272,0273,0274,0275,0276,0277, 0300,0301,0302,0303,0304,0305,0306,0307, 0310,0311,0312,0313,0314,0315,0316,0317, 0320,0321,0322,0323,0324,0325,0326,0327, 0330,0331,0332,0333,0334,0335,0336,0337, 0340,0341,0342,0343,0344,0345,0346,0347, 0350,0351,0352,0353,0354,0355,0356,0357, 0360,0361,0362,0363,0364,0365,0366,0367, 0370,0371,0372,0373,0374,0375,0376,0377, 0000,0001,0002,0003,0004,0005,0006,0007, 0010,0011,0012,0013,0014,0015,0016,0017, 0020,0021,0022,0023,0024,0025,0026,0027, 0030,0031,0032,0033,0034,0035,0036,0037, 0040,0041,0042,0043,0044,0045,0046,0047, 0050,0051,0052,0053,0054,0055,0056,0057, 0060,0061,0062,0063,0064,0065,0066,0067, 0070,0071,0072,0073,0074,0075,0076,0077, 0100,0101,0102,0103,0104,0105,0106,0107, 0110,0111,0112,0113,0114,0115,0116,0117, 0120,0121,0122,0123,0124,0125,0126,0127, 0130,0131,0132,0133,0134,0135,0136,0137, 0140,0101,0102,0103,0104,0105,0106,0107, 0110,0111,0112,0113,0114,0115,0116,0117, 0120,0121,0122,0123,0124,0125,0126,0127, 0130,0131,0132,0173,0174,0175,0176,0177};char nofold[256] = { 0200,0201,0202,0203,0204,0205,0206,0207, 0210,0211,0212,0213,0214,0215,0216,0217, 0220,0221,0222,0223,0224,0225,0226,0227, 0230,0231,0232,0233,0234,0235,0236,0237, 0240,0241,0242,0243,0244,0245,0246,0247, 0250,0251,0252,0253,0254,0255,0256,0257, 0260,0261,0262,0263,0264,0265,0266,0267, 0270,0271,0272,0273,0274,0275,0276,0277, 0300,0301,0302,0303,0304,0305,0306,0307, 0310,0311,0312,0313,0314,0315,0316,0317, 0320,0321,0322,0323,0324,0325,0326,0327, 0330,0331,0332,0333,0334,0335,0336,0337, 0340,0341,0342,0343,0344,0345,0346,0347, 0350,0351,0352,0353,0354,0355,0356,0357, 0360,0361,0362,0363,0364,0365,0366,0367, 0370,0371,0372,0373,0374,0375,0376,0377, 0000,0001,0002,0003,0004,0005,0006,0007, 0010,0011,0012,0013,0014,0015,0016,0017, 0020,0021,0022,0023,0024,0025,0026,0027, 0030,0031,0032,0033,0034,0035,0036,0037, 0040,0041,0042,0043,0044,0045,0046,0047, 0050,0051,0052,0053,0054,0055,0056,0057, 0060,0061,0062,0063,0064,0065,0066,0067, 0070,0071,0072,0073,0074,0075,0076,0077, 0100,0101,0102,0103,0104,0105,0106,0107, 0110,0111,0112,0113,0114,0115,0116,0117, 0120,0121,0122,0123,0124,0125,0126,0127, 0130,0131,0132,0133,0134,0135,0136,0137, 0140,0141,0142,0143,0144,0145,0146,0147, 0150,0151,0152,0153,0154,0155,0156,0157, 0160,0161,0162,0163,0164,0165,0166,0167, 0170,0171,0172,0173,0174,0175,0176,0177};char zero[256];char nonprint[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};char dict[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1};/* * structure used to hold a field specification * The array used all have two elements. * Element 1 (index 0) holds the information for +pos * Element 2 (index 1) holds the information for -pos */struct field { char *code; /* pointer to table fold/nofold */ char *ignore; /* pointer to table zero/nonprint/dict */ int (*fcmp)(); /* pointer to comparison routine */ int rflg; /* sign of comparison 1 or -1 */ int bflg[2]; /* ignore leading/trailing blanks */ int m[2]; /* field number for + and - pos */ int n[2]; /* byte number for + and - pos */};/* * specification of fields. * fields[0] is used if no fields are specified and for the * global bnifrM options in case of additional field specifications */struct field fields[NF]; /* The array of field specifications *//* * prototype field specification. This is also the default. */struct field proto = { nofold+128, /* upper and lower case are distinct */ zero+128, /* do not ignore any characters */ asciicmp, /* use ASCII native comparison */ 1, /* ordinary sense of comparison */ 0,0, /* no not ignore blanks */ 0,-1, /* take whole line */ 0,0 /* byte positions are ignored */};int nfields; /* number of fields to sort */char *setfil(); /* generate temp file names */FILE *output(); /* open an output file */FILE *input(); /* open an input file */void doclose(); /* close a file used by sort *//* * main -- sort/merge files */main(argc, argv)int argc;char **argv;{ register a; char *arg; /* pointer into argument */ struct field *p; struct field *q; int i; /* temporary uses */ /* * close any file descriptors that may have been left open * sort/merge may need them all. */ for (i = 3; i < NFILES; i++) (void) close(i); /* * open corresponding catalogue file */#ifdef INTLM _m_catd = catopen("sort", 0);#endif if (setlocale(LC_ALL, "")) { proto.fcmp = intlcmp; compare = cmp; } fields[nfields] = proto; initree(); eargv = argv; tryfor = DEFMEM; /* * scan arguments of sort command */ while (--argc > 0) { if (**++argv == '-') { /* * option or field specification */ for (arg = *argv; ; ) { switch (*++arg) { case '\0': /* end of option list or lone - */ if (arg[-1] == '-') eargv[eargc++] = "-"; break; case 'o': /* output file specified */ if (*(arg + 1) != '\0') outfil = arg + 1; else if(--argc > 0) outfil = *++argv; else diag(EXIT, catgets(_m_catd, NL_SETN, 2, "can't identify output file"),""); break; case 'T': /* location of temp files */ if (--argc > 0) { if ((strlen(*++argv) + NAMEOHD) > MAXPLEN) diag(EXIT, catgets(_m_catd, NL_SETN, 3, "path name too long:"), *argv); else dirtry[0] = *argv; } break; case 'c': /* only check whether sorted */ cflg = 1; continue; case 'm': /* merge only */ mflg = 1; continue; case 'y': /* memory to use for sort */ if (*++arg) { if (isdigit(*arg)) tryfor = number(&arg) * 1024; else diag(EXIT, USE,""); } else { --arg; tryfor = MAXMEM; } continue; case 'z': /* max linelength for merge */ if (*++arg && isdigit(*arg)) maxrec = number(&arg); else diag(EXIT, USE,""); continue; case 'X': /* do a TAG sort */ tag = 1; compare = tagcmp; continue; default: /* field specification */ field(++*argv, nfields > 0); break; } break; } } else if (**argv == '+')
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -