📄 regx.c
字号:
/*
* These functions compile a regular expression into an NFA and recognize
* a pattern.
*
* Regular expressions are often used to describe a pattern that might
* match several different or similar words. An example of a regular
* expression subset is the wild card characters '*' and '?' used to list
* files in a directory.
*
* Might as well read the books and papers written by those who first wrote
* the various [a-z]?greps. Ken Thompson was the author of grep. In one
* weekend, Al Aho wrote egrep and fgrep. Incidentally, when Dr. Aho was
* the guest speaker at the Distinguished Lecture Series at Georgia Tech,
* he personally autographed my copy of his book _Compilers: Principles,
* Techniques, and Tools_, aka the "Dragon Book."
*
* See:
*
* Ken Thompson, "Regular Expression Search Algorithm." _Communications
* of the ACM_, 11 (No. 6): 419-422, 1968.
*
* Alfred V. Aho, Ravi Sethi, and Jeffrey D. Ullman, _Compilers: Principles,
* Techniques, and Tools_, Addison-Wesley, Reading, Mass., 1986,
* Chapter 3, "Lexical Analysis", pp 83-158. ISBN 0-201-10088-6.
*
* Robert Sedgewick, _Algorithms in C_, Addison-Wesley, Reading, Mass.,
* 1990, Chapter 20, "Pattern Matching", pp. 293-303, and Chapter 21,
* "Parsing", pp. 305-317. ISBN 0-201-51425-7.
*
* Andrew Hume, "A Tale of Two Greps." _Software--Practice and Experience_,
* 18 (No. 11): 1063-1072, 1988.
*
*
* Regular Expressions in TDE
*
* The core of the regular expression search algorithm in TDE is based on
* the nondeterministic finite-state machine in Dr. Segdewick's book.
* Additional parsing, operator precedence, and nfa construction and
* simulation info is from Dr. Aho's book. The nfa node naming convention
* and additional nfa construction guidelines are from Ken Thompson's paper.
* I'm much too stupid to compile a regular expression into assembly language
* as suggested in Ken Thompson's paper, but I did get the idea to do the
* next best thing and added C library functions as NNODE terminal types.
*
* Searching for regular expressions in TDE is not very fast. The worst
* case search, .*x, where x is any word or letter, is probably the slowest
* of any implementation with a regular expression search function. However,
* TDE can handle a large regular expression subset.
*
*
* New editor name: TDE, the Thomson-Davis Editor.
* Author: Frank Davis
* Date: June 5, 1991, version 1.0
* Date: July 29, 1991, version 1.1
* Date: October 5, 1991, version 1.2
* Date: January 20, 1992, version 1.3
* Date: February 17, 1992, version 1.4
* Date: April 1, 1992, version 1.5
* Date: June 5, 1992, version 2.0
* Date: October 31, 1992, version 2.1
* Date: April 1, 1993, version 2.2
* Date: June 5, 1993, version 3.0
*
* This code is released into the public domain, Frank Davis.
* You may use and distribute it freely.
*/
#include "tdestr.h"
#include "common.h"
#include "tdefunc.h"
#include "define.h"
/*
* types of nodes in the NFA.
*
* let's use the node naming convention in Ken Thompson's reg ex paper.
*/
#define CNODE 0
#define NNODE 1
#define SCAN -1
/*
* types of terminals in NNODEs.
*
* starting with simple ASCII terminals, it's easy to build in other types of
* terminals, e.g. wild chars, BOL, EOL, and character classes. actually,
* we can easily build ANSI C ctype library function NNODEs.
*/
#define STRAIGHT_ASCII 0
#define IGNORE_ASCII 1
#define WILD 2
#define BOL 3
#define EOL 4
#define CLASS 5
#define NOTCLASS 6
#define WHITESPACE 7
#define ALPHA 8
#define UPPER 9
#define LOWER 10
#define ALPHANUM 11
#define DECIMAL 12
#define HEX 13
/*
* types of terminals in CNODEs.
*/
#define START 0
#define ACCEPT 1
#define OR_NODE 2
#define JUXTA 3
#define CLOSURE 4
#define ZERO_OR_ONE 5
int lookahead;
int regx_rc;
int reg_len;
int parser_state;
char class_bits[32];
int bit[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
int c1;
int c2;
int ttype;
int regx_error_line;
NFA_TYPE *nfa_pointer;
int nfa_status;
int search_len;
int search_col;
text_ptr search_string;
int queued_states[REGX_SIZE+2];
int deque[REGX_SIZE*2];
int dq_head;
int dq_tail;
int stacked_node_count;
int reset_count;
/*
* Name: find_regx
* Purpose: To set up and find a regular expression.
* Date: June 5, 1993
* Passed: window: pointer to current window
* Notes: Keep the regular expression until changed.
*/
int find_regx( WINDOW *window )
{
int direction;
int new_string;
char pattern[MAX_COLS]; /* text to be found */
long found_line;
long bin_offset;
line_list_ptr ll;
register WINDOW *win; /* put window pointer in a register */
int rc;
int old_rcol;
int rcol;
switch (g_status.command) {
case FindRegX :
new_string = TRUE;
direction = FORWARD;
break;
case RepeatFindRegX :
new_string = FALSE;
direction = FORWARD;
break;
case RepeatFindRegXBackward :
new_string = FALSE;
direction = BACKWARD;
break;
default :
new_string = 0;
direction = 0;
assert( FALSE );
break;
}
win = window;
entab_linebuff( );
if (un_copy_line( win->ll, win, TRUE ) == ERROR)
return( ERROR );
regx_error_line = win->bottom_line;
/*
* get search text, using previous as default
*/
rc = OK;
if (new_string == TRUE) {
*pattern = '\0';
if (regx.search_defined == OK) {
assert( strlen( (char *)regx.pattern ) < MAX_COLS );
strcpy( pattern, (char *)regx.pattern );
}
/*
* string to find:
*/
if (get_name( reg1, win->bottom_line, pattern,
g_display.message_color ) != OK || *pattern == '\0')
return( ERROR );
regx.search_defined = OK;
assert( strlen( pattern ) < MAX_COLS );
strcpy( (char *)regx.pattern, pattern );
rc = build_nfa( );
if (rc == ERROR)
regx.search_defined = ERROR;
}
if (regx.search_defined == OK) {
old_rcol = win->rcol;
if (mode.inflate_tabs)
win->rcol = entab_adjust_rcol( win->ll->line, win->ll->len, win->rcol);
update_line( win );
show_search_message( SEARCHING, g_display.diag_color );
bin_offset = win->bin_offset;
if (direction == FORWARD) {
if ((ll = forward_regx_search( win, &found_line, &rcol )) != NULL) {
if (g_status.wrapped && g_status.macro_executing)
rc = ask_wrap_replace( win, &new_string );
if (rc == OK)
find_adjust( win, ll, found_line, rcol );
else
win->bin_offset = bin_offset;
}
} else {
if ((ll = backward_regx_search( win, &found_line, &rcol )) != NULL) {
if (g_status.wrapped && g_status.macro_executing)
rc = ask_wrap_replace( win, &new_string );
if (rc == OK)
find_adjust( win, ll, found_line, rcol );
else
win->bin_offset = bin_offset;
}
}
if (g_status.wrapped)
show_search_message( WRAPPED, g_display.diag_color );
else {
if (nfa_status == OK)
show_search_message( CLR_SEARCH, g_display.mode_color );
else
g_status.wrapped = TRUE;
}
if (ll == NULL) {
/*
* string not found
*/
if (mode.inflate_tabs)
win->rcol = old_rcol;
combine_strings( pattern, find5a, (char *)regx.pattern, find5b );
error( WARNING, win->bottom_line, pattern );
rc = ERROR;
}
show_curl_line( win );
make_ruler( win );
show_ruler( win );
} else {
/*
* find pattern not defined
*/
error( WARNING, win->bottom_line, find6 );
rc = ERROR;
}
return( rc );
}
/*
* Name: forward_regx_search
* Purpose: search forward for regular expression
* Date: June 5, 1993
* Passed: window: pointer to current window
* rline: pointer to real line counter
* rcol: pointer to real column variable
* Returns: position in file if found or NULL if not found
* Notes: Start searching from cursor position to end of file. If we hit
* end of file without a match, start searching from the beginning
* of file to cursor position. (do wrapped searches)
*/
line_list_ptr forward_regx_search( WINDOW *window, long *rline, int *rcol )
{
register int len;
int i;
int end;
register WINDOW *win; /* put window pointer in a register */
line_list_ptr ll;
/*
* if cursor is beyond end of line then start at end of line
*/
win = window;
*rline = win->rline;
i = win->rcol + 1;
ll = win->ll;
len = ll->len;
if (i > len && len != EOF) {
ll = ll->next;
++*rline;
i = 0;
}
if (i < 0)
i = 0;
*rcol = i;
ll = regx_search_forward( ll, rline, rcol );
if (ll == NULL) {
end = 0;
if (win->ll->next != NULL) {
end = win->ll->next->len;
win->ll->next->len = EOF;
}
/*
* haven't found pattern yet - now search from beginning of file
*/
g_status.wrapped = TRUE;
*rcol = 0;
*rline = 1L;
ll = regx_search_forward( win->file_info->line_list, rline, rcol );
if (ll == win->ll && *rcol >= win->rcol)
ll = NULL;
if (win->ll->next != NULL)
win->ll->next->len = end;
}
flush_keyboard( );
if (ll != NULL)
bin_offset_adjust( win, *rline );
return( ll );
}
/*
* Name: regx_search_forward
* Purpose: search forward for pattern using nfa
* Date: June 5, 1993
* Passed: ll: pointer to current node in linked list
* rline: pointer to real line counter
* col: column in ll->line to begin search
* Returns: position in file if found or NULL if not found
* Notes: run the nfa machine on each character in each line
*/
line_list_ptr regx_search_forward( line_list_ptr ll, long *rline, int *col )
{
if (ll->len == EOF)
return( NULL );
else {
switch (g_status.command) {
case DefineRegXGrep :
case RepeatGrep :
nfa_pointer = &sas_nfa;
stacked_node_count = sas_regx.node_count;
break;
case FindRegX :
case RepeatFindRegX :
nfa_pointer = &nfa;
stacked_node_count = regx.node_count;
break;
default :
assert( FALSE );
break;
}
nfa_status = OK;
search_string = ll->line;
search_len = ll->len;
search_col = *col;
reset_count = stacked_node_count * sizeof(int);
for (; !g_status.control_break;) {
for (; search_col <= search_len; search_col++) {
if (nfa_match( ) != ERROR) {
*col = search_col;
return( ll );
}
}
++*rline;
ll = ll->next;
search_string = ll->line;
if (ll->len == EOF)
return( NULL );
search_len = ll->len;
search_col = 0;
}
return( NULL );
}
}
/*
* Name: backward_regx_search
* Purpose: search backward for regular expression
* Date: June 5, 1993
* Passed: window: pointer to current window
* rline: pointer to real line counter
* rcol: pointer to real column variable
* Returns: position in file if found or NULL if not found
* Notes: Start searching from cursor position to end of file. If we hit
* end of file without a match, start searching from the beginning
* of file to cursor position. (do wrapped searches)
*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -