📄 tr.c
字号:
if (s2)
{
get_s2_spec_stats (s2, s1->length);
if (s2->n_indefinite_repeats > 1)
{
error (EXIT_FAILURE, 0,
_("only one [c*] repeat construct may appear in string2"));
}
if (translating)
{
if (s2->has_equiv_class)
{
error (EXIT_FAILURE, 0,
_("[=c=] expressions may not appear in string2 \
when translating"));
}
if (s1->length > s2->length)
{
if (!truncate_set1)
{
/* string2 must be non-empty unless --truncate-set1 is
given or string1 is empty. */
if (s2->length == 0)
error (EXIT_FAILURE, 0,
_("when not truncating set1, string2 must be non-empty"));
string2_extend (s1, s2);
}
}
if (complement && s1->has_char_class
&& ! (s2->length == s1->length && homogeneous_spec_list (s2)))
{
error (EXIT_FAILURE, 0,
_("when translating with complemented character classes,\
\nstring2 must map all characters in the domain to one"));
}
if (s2->has_restricted_char_class)
{
error (EXIT_FAILURE, 0,
_("when translating, the only character classes that may \
appear in\nstring2 are `upper' and `lower'"));
}
}
else
/* Not translating. */
{
if (s2->n_indefinite_repeats > 0)
error (EXIT_FAILURE, 0,
_("the [c*] construct may appear in string2 only \
when translating"));
}
}
}
/* Copy input to stdout, replacing every run of consecutive identical
   characters that are members of the squeeze set (the global array
   `in_squeeze_set') with a single occurrence of that character.

   BUF is a work buffer of SIZE bytes.  When READER is NULL, BUF is
   filled directly from stdin with safe_read; otherwise it is filled by
   calling (*READER) (BUF, SIZE, NULL).  When non-NULL, READER is either
   read_and_delete or read_and_xlate.  A fill of zero bytes is treated
   as end of input and ends the function.

   Read and write failures are reported with error (EXIT_FAILURE, ...).
   A write is considered failed whenever fwrite transfers fewer bytes
   than requested, not only when it transfers none.  */
static void
squeeze_filter (unsigned char *buf, size_t size, Filter reader)
{
  /* The character whose run is currently being collapsed, or
     NOT_A_CHAR when no run is in progress.  It is kept across buffer
     refills so that a run spanning two buffers is still squeezed.  */
  unsigned int char_to_squeeze = NOT_A_CHAR;
  size_t i = 0;			/* index of the next byte of BUF to examine */
  size_t nr = 0;		/* number of valid bytes currently in BUF */

  for (;;)
    {
      size_t begin;

      /* Refill BUF once everything from the previous fill is consumed.  */
      if (i >= nr)
	{
	  if (reader == NULL)
	    {
	      nr = safe_read (0, (char *) buf, size);
	      if (nr == SAFE_READ_ERROR)
		error (EXIT_FAILURE, errno, _("read error"));
	    }
	  else
	    {
	      nr = (*reader) (buf, size, NULL);
	    }

	  if (nr == 0)
	    break;
	  i = 0;
	}

      begin = i;

      if (char_to_squeeze == NOT_A_CHAR)
	{
	  size_t out_len;
	  /* Here, by being a little tricky, we can get a significant
	     performance increase in most cases when the input is
	     reasonably large.  Since tr will modify the input only
	     if two consecutive (and identical) input characters are
	     in the squeeze set, we can step by two through the data
	     when searching for a character in the squeeze set.  This
	     means there may be a little more work in a few cases and
	     perhaps twice as much work in the worst cases where most
	     of the input is removed by squeezing repeats.  But most
	     uses of this functionality seem to remove less than 20-30%
	     of the input.  */
	  for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
	    ;			/* empty */

	  /* There is a special case when i == nr and we've just
	     skipped a character (the last one in buf) that is in
	     the squeeze set.  */
	  if (i == nr && in_squeeze_set[buf[i - 1]])
	    --i;

	  if (i >= nr)
	    {
	      /* No squeezable character found: emit the rest of BUF.  */
	      out_len = nr - begin;
	    }
	  else
	    {
	      char_to_squeeze = buf[i];
	      /* We're about to output buf[begin..i].  */
	      out_len = i - begin + 1;

	      /* But since we stepped by 2 in the loop above,
	         out_len may be one too large.  */
	      if (i > 0 && buf[i - 1] == char_to_squeeze)
		--out_len;

	      /* Advance i to the index of first character to be
	         considered when looking for a char different from
	         char_to_squeeze.  */
	      ++i;
	    }

	  /* fwrite returns the number of items written; anything short
	     of out_len means the output is incomplete.  */
	  if (out_len > 0
	      && fwrite ((char *) &buf[begin], 1, out_len, stdout) != out_len)
	    error (EXIT_FAILURE, errno, _("write error"));
	}

      if (char_to_squeeze != NOT_A_CHAR)
	{
	  /* Advance i to index of first char != char_to_squeeze
	     (or to nr if all the rest of the characters in this
	     buffer are the same as char_to_squeeze).  */
	  for (; i < nr && buf[i] == char_to_squeeze; i++)
	    ;			/* empty */
	  if (i < nr)
	    char_to_squeeze = NOT_A_CHAR;
	  /* If (i >= nr) we've squeezed the last character in this buffer.
	     So now we have to read a new buffer and continue comparing
	     characters against char_to_squeeze.  */
	}
    }
}
/* Fill BUF (capacity SIZE bytes) from stdin, drop every byte that is a
   member of the delete set (the global array `in_delete_set'), and
   pack the survivors at the front of BUF.  Buffers in which every byte
   is deleted are skipped, so the return value is the number of bytes
   kept from the first buffer that has any, or 0 once EOF is reached.
   After EOF has been seen, later calls return 0 without reading.
   NOT_USED must be NULL.  */
static size_t
read_and_delete (unsigned char *buf, size_t size, Filter not_used)
{
  static int hit_eof = 0;
  size_t kept = 0;

  assert (not_used == NULL);

  if (hit_eof)
    return 0;

  /* A return of zero is reserved for EOF, so keep reading while the
     current buffer contributed nothing.  */
  while (kept == 0)
    {
      size_t src;
      size_t n_read = safe_read (0, (char *) buf, size);

      if (n_read == SAFE_READ_ERROR)
	error (EXIT_FAILURE, errno, _("read error"));

      if (n_read == 0)
	{
	  hit_eof = 1;
	  return 0;
	}

      /* Bytes ahead of the first deleted one are already in their
         final position; just step over them without copying.  */
      src = 0;
      while (src < n_read && !in_delete_set[buf[src]])
	src++;
      kept = src;

      /* buf[src], if it exists, is a deleted byte.  Compact whatever
         follows it down over the gap.  */
      for (src++; src < n_read; src++)
	{
	  if (in_delete_set[buf[src]])
	    continue;
	  buf[kept++] = buf[src];
	}
    }

  return kept;
}
/* Read up to SIZE bytes from stdin into BUF and replace each byte, in
   place, with its image in the global translation table `xlate'.
   Return the number of bytes read, or 0 at EOF.  After EOF has been
   seen, later calls return 0 without reading.  NOT_USED must be NULL.  */
static size_t
read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
{
  static int hit_eof = 0;
  size_t n_read;
  unsigned char *p;
  unsigned char *end;

  assert (not_used == NULL);

  if (hit_eof)
    return 0;

  n_read = safe_read (0, (char *) buf, size);
  if (n_read == SAFE_READ_ERROR)
    error (EXIT_FAILURE, errno, _("read error"));

  if (n_read == 0)
    {
      hit_eof = 1;
      return 0;
    }

  /* One table lookup per byte; the mapping is one-to-one in length.  */
  end = buf + n_read;
  for (p = buf; p < end; p++)
    *p = xlate[*p];

  return n_read;
}
/* Build the membership table IN_SET (N_CHARS entries) for the
   characters described by the spec list S.  S is rewound to
   BEGIN_STATE and walked with `get_next' until it returns -1; each
   value returned is marked as a member.  If COMPLEMENT_THIS_SET is
   nonzero, every entry of the finished table is then inverted.  */
static void
set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
{
  size_t idx;

  /* Begin with the empty set.  */
  memset (in_set, 0, N_CHARS * sizeof (in_set[0]));

  s->state = BEGIN_STATE;
  for (;;)
    {
      int ch = get_next (s, NULL);

      if (ch == -1)
	break;
      in_set[ch] = 1;
    }

  if (!complement_this_set)
    return;

  for (idx = 0; idx < N_CHARS; idx++)
    in_set[idx] = (!in_set[idx]);
}
/* Program entry point.  Parse the options -c, -d, -s and -t and up to
   two string operands, check that the combination is allowed, parse
   and validate the strings, then filter stdin to stdout in one of
   four modes:

     -s with one string       squeeze repeats of characters in string1;
     -d with one string       delete characters in string1;
     -d -s with two strings   delete characters in string1, then
                              squeeze repeats of characters in string2;
     two strings without -d   translate string1 to string2 and, with
                              -s, squeeze repeats of characters in
                              string2.

   Usage, read and write failures are reported with
   error (EXIT_FAILURE, ...) or usage (2).  A write is considered
   failed whenever fwrite transfers fewer bytes than requested.  */
int
main (int argc, char **argv)
{
  int c;
  int non_option_args;
  struct Spec_list buf1, buf2;
  struct Spec_list *s1 = &buf1;
  struct Spec_list *s2 = &buf2;

  program_name = argv[0];
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  atexit (close_stdout);

  while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
    {
      switch (c)
	{
	case 0:
	  break;

	case 'c':
	  complement = 1;
	  break;

	case 'd':
	  delete = 1;
	  break;

	case 's':
	  squeeze_repeats = 1;
	  break;

	case 't':
	  truncate_set1 = 1;
	  break;

	case_GETOPT_HELP_CHAR;

	case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

	default:
	  usage (2);
	  break;
	}
    }

  posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);

  non_option_args = argc - optind;
  translating = (non_option_args == 2 && !delete);

  /* Change this test if it is valid to give tr no options and
     no args at all.  POSIX doesn't specifically say anything
     either way, but it looks like they implied it's invalid
     by omission.  If you want to make tr do a slow imitation
     of `cat' use `tr a a'.  */
  if (non_option_args > 2)
    {
      error (0, 0, _("too many arguments"));
      usage (2);
    }

  if (!delete && !squeeze_repeats && non_option_args != 2)
    error (EXIT_FAILURE, 0, _("two strings must be given when translating"));

  if (delete && squeeze_repeats && non_option_args != 2)
    error (EXIT_FAILURE, 0, _("two strings must be given when both \
deleting and squeezing repeats"));

  /* If --delete is given without --squeeze-repeats, then
     only one string argument may be specified.  But POSIX
     says to ignore any string2 in this case, so if POSIXLY_CORRECT
     is set, pretend we never saw string2.  But I think
     this deserves a fatal error, so that's the default.  */
  if ((delete && !squeeze_repeats) && non_option_args != 1)
    {
      if (posix_pedantic && non_option_args == 2)
	--non_option_args;
      else
	error (EXIT_FAILURE, 0,
	       _("only one string may be given when deleting \
without squeezing repeats"));
    }

  if (squeeze_repeats && non_option_args == 0)
    error (EXIT_FAILURE, 0,
	   _("at least one string must be given when squeezing repeats"));

  /* Every combination with zero operands has been rejected above, so
     argv[optind] is a real operand here.  */
  spec_init (s1);
  if (parse_str ((unsigned char *) argv[optind], s1))
    exit (EXIT_FAILURE);

  if (non_option_args == 2)
    {
      spec_init (s2);
      if (parse_str ((unsigned char *) argv[optind + 1], s2))
	exit (EXIT_FAILURE);
    }
  else
    s2 = NULL;

  validate (s1, s2);

  /* Use binary I/O, since `tr' is sometimes used to transliterate
     non-printable characters, or characters which are stripped away
     by text-mode reads (like CR and ^Z).  */
  SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);

  if (squeeze_repeats && non_option_args == 1)
    {
      /* Squeeze only.  */
      set_initialize (s1, complement, in_squeeze_set);
      squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
    }
  else if (delete && non_option_args == 1)
    {
      /* Delete only.  */
      size_t nr;

      set_initialize (s1, complement, in_delete_set);
      do
	{
	  nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
	  /* fwrite returns the number of items written; anything short
	     of nr means the output is incomplete.  */
	  if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) != nr)
	    error (EXIT_FAILURE, errno, _("write error"));
	}
      while (nr > 0);
    }
  else if (squeeze_repeats && delete && non_option_args == 2)
    {
      /* Delete with string1, then squeeze with string2.  */
      set_initialize (s1, complement, in_delete_set);
      set_initialize (s2, 0, in_squeeze_set);
      squeeze_filter (io_buf, IO_BUF_SIZE, read_and_delete);
    }
  else if (translating)
    {
      if (complement)
	{
	  int i;
	  /* in_delete_set is otherwise unused when translating, so it
	     is borrowed here to hold string1's membership table.  */
	  SET_TYPE *in_s1 = in_delete_set;

	  set_initialize (s1, 0, in_s1);
	  s2->state = BEGIN_STATE;
	  /* Start from the identity mapping.  */
	  for (i = 0; i < N_CHARS; i++)
	    xlate[i] = i;
	  /* Map each character NOT in string1, in increasing order, to
	     the next character produced by string2.  */
	  for (i = 0; i < N_CHARS; i++)
	    {
	      if (!in_s1[i])
		{
		  int ch = get_next (s2, NULL);
		  assert (ch != -1 || truncate_set1);
		  if (ch == -1)
		    {
		      /* This will happen when tr is invoked like e.g.
		         tr -cs A-Za-z0-9 '\012'.  */
		      break;
		    }
		  xlate[i] = ch;
		}
	    }
	  assert (get_next (s2, NULL) == -1 || truncate_set1);
	}
      else
	{
	  int c1, c2;
	  int i;
	  enum Upper_Lower_class class_s1;
	  enum Upper_Lower_class class_s2;

	  /* Start from the identity mapping.  */
	  for (i = 0; i < N_CHARS; i++)
	    xlate[i] = i;
	  s1->state = BEGIN_STATE;
	  s2->state = BEGIN_STATE;
	  /* Walk string1 and string2 in lock step, pairing elements.  */
	  for (;;)
	    {
	      c1 = get_next (s1, &class_s1);
	      c2 = get_next (s2, &class_s2);
	      if (!class_ok[(int) class_s1][(int) class_s2])
		error (EXIT_FAILURE, 0,
		       _("misaligned [:upper:] and/or [:lower:] construct"));

	      if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
		{
		  for (i = 0; i < N_CHARS; i++)
		    if (ISLOWER (i))
		      xlate[i] = toupper (i);
		}
	      else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
		{
		  for (i = 0; i < N_CHARS; i++)
		    if (ISUPPER (i))
		      xlate[i] = tolower (i);
		}
	      else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
		       || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
		{
		  /* By default, GNU tr permits the identity mappings: from
		     [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
		     when POSIXLY_CORRECT is set, those evoke diagnostics.  */
		  if (posix_pedantic)
		    {
		      error (EXIT_FAILURE, 0,
			     _("\
invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
construct in string1 must be aligned with a corresponding construct\n\
([:upper:] or [:lower:], respectively) in string2"));
		    }
		}
	      else
		{
		  /* The following should have been checked by validate...  */
		  if (c1 == -1 || c2 == -1)
		    break;
		  xlate[c1] = c2;
		}
	    }
	  assert (c1 == -1 || truncate_set1);
	}

      if (squeeze_repeats)
	{
	  /* Translate, then squeeze with string2.  */
	  set_initialize (s2, 0, in_squeeze_set);
	  squeeze_filter (io_buf, IO_BUF_SIZE, read_and_xlate);
	}
      else
	{
	  /* Translate only.  */
	  size_t bytes_read;

	  do
	    {
	      bytes_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
	      /* A short write is a failure, not just a write of zero.  */
	      if (bytes_read > 0
		  && fwrite ((char *) io_buf, 1, bytes_read, stdout)
		     != bytes_read)
		error (EXIT_FAILURE, errno, _("write error"));
	    }
	  while (bytes_read > 0);
	}
    }

  if (close (STDIN_FILENO) != 0)
    error (EXIT_FAILURE, errno, _("standard input"));

  exit (EXIT_SUCCESS);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -