📄 affixmgr.cxx
字号:
#include "license.readme"
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include "affixmgr.hxx"
#include "affentry.hxx"
//using namespace std;
#pragma warning(disable:4244)
#pragma warning(disable:4514)
#pragma warning(disable:4706)
// First some base level utility routines
extern void mychomp(char * s);
extern char * mystrdup(const char * s);
extern char * myrevstrdup(const char * s);
extern char * mystrsep(char ** sptr, const char delim);
extern int isSubset(const char * s1, const char * s2);
extern int isRevSubset(const char * s1, const char * end_of_s2, int len_s2);
AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
{
// register hash manager and load affix data from aff file
pHMgr = ptr;
trystring = NULL;
encoding=NULL;
reptable = NULL;
numrep = 0;
maptable = NULL;
nummap = 0;
compound=NULL;
nosplitsugs= (0==1);
cpdmin = 3; // default value
for (int i=0; i < SETSIZE; i++) {
pStart[i] = NULL;
sStart[i] = NULL;
pFlag[i] = NULL;
sFlag[i] = NULL;
}
if (parse_file(affpath)) {
fprintf(stderr,"Failure loading aff file %s\n",affpath);
fflush(stderr);
}
}
AffixMgr::~AffixMgr()
{
// pass through linked prefix entries and clean up
for (int i=0; i < SETSIZE ;i++) {
pFlag[i] = NULL;
PfxEntry * ptr = (PfxEntry *)pStart[i];
PfxEntry * nptr = NULL;
while (ptr) {
nptr = ptr->getNext();
delete(ptr);
ptr = nptr;
nptr = NULL;
}
}
// pass through linked suffix entries and clean up
for (int j=0; j < SETSIZE ; j++) {
sFlag[j] = NULL;
SfxEntry * ptr = (SfxEntry *)sStart[j];
SfxEntry * nptr = NULL;
while (ptr) {
nptr = ptr->getNext();
delete(ptr);
ptr = nptr;
nptr = NULL;
}
}
if (trystring) free(trystring);
trystring=NULL;
if (encoding) free(encoding);
encoding=NULL;
if (maptable) {
for (int j=0; j < nummap; j++) {
free(maptable[j].set);
maptable[j].set = NULL;
maptable[j].len = 0;
}
free(maptable);
maptable = NULL;
}
nummap = 0;
if (reptable) {
for (int j=0; j < numrep; j++) {
free(reptable[j].pattern);
free(reptable[j].replacement);
reptable[j].pattern = NULL;
reptable[j].replacement = NULL;
}
free(reptable);
reptable = NULL;
}
numrep = 0;
if (compound) free(compound);
compound=NULL;
pHMgr = NULL;
cpdmin = 0;
}
// read in aff file and build up prefix and suffix entry objects
int AffixMgr::parse_file(const char * affpath)
{
// io buffers
char line[MAXLNLEN+1];
// affix type
char ft;
// open the affix file
FILE * afflst;
afflst = fopen(affpath,"r");
if (!afflst) {
fprintf(stderr,"Error - could not open affix description file %s\n",affpath);
return 1;
}
// step one is to parse the affix file building up the internal
// affix data structures
// read in each line ignoring any that do not
// start with a known line type indicator
while (fgets(line,MAXLNLEN,afflst)) {
mychomp(line);
/* parse in the try string */
if (strncmp(line,"TRY",3) == 0) {
if (parse_try(line)) {
return 1;
}
}
/* parse in the name of the character set used by the .dict and .aff */
if (strncmp(line,"SET",3) == 0) {
if (parse_set(line)) {
return 1;
}
}
/* parse in the flag used by the controlled compound words */
if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
if (parse_cpdflag(line)) {
return 1;
}
}
/* parse in the flag used by the controlled compound words */
if (strncmp(line,"COMPOUNDMIN",11) == 0) {
if (parse_cpdmin(line)) {
return 1;
}
}
/* parse in the typical fault correcting table */
if (strncmp(line,"REP",3) == 0) {
if (parse_reptable(line, afflst)) {
return 1;
}
}
/* parse in the related character map table */
if (strncmp(line,"MAP",3) == 0) {
if (parse_maptable(line, afflst)) {
return 1;
}
}
// parse this affix: P - prefix, S - suffix
ft = ' ';
if (strncmp(line,"PFX",3) == 0) ft = 'P';
if (strncmp(line,"SFX",3) == 0) ft = 'S';
if (ft != ' ') {
if (parse_affix(line, ft, afflst)) {
return 1;
}
}
// handle NOSPLITSUGS
if (strncmp(line,"NOSPLITSUGS",11) == 0)
nosplitsugs=(0==0);
}
fclose(afflst);
// convert affix trees to sorted list
process_pfx_tree_to_list();
process_sfx_tree_to_list();
// now we can speed up performance greatly taking advantage of the
// relationship between the affixes and the idea of "subsets".
// View each prefix as a potential leading subset of another and view
// each suffix (reversed) as a potential trailing subset of another.
// To illustrate this relationship if we know the prefix "ab" is found in the
// word to examine, only prefixes that "ab" is a leading subset of need be examined.
// Furthermore is "ab" is not present then none of the prefixes that "ab" is
// is a subset need be examined.
// The same argument goes for suffix string that are reversed.
// Then to top this off why not examine the first char of the word to quickly
// limit the set of prefixes to examine (i.e. the prefixes to examine must
// be leading supersets of the first character of the word (if they exist)
// To take advantage of this "subset" relationship, we need to add two links
// from entry. One to take next if the current prefix is found (call it nexteq)
// and one to take next if the current prefix is not found (call it nextne).
// Since we have built ordered lists, all that remains is to properly intialize
// the nextne and nexteq pointers that relate them
process_pfx_order();
process_sfx_order();
return 0;
}
// we want to be able to quickly access prefix information
// both by prefix flag, and sorted by prefix string itself
// so we need to set up two indexes
int AffixMgr::build_pfxtree(AffEntry* pfxptr)
{
PfxEntry * ptr;
PfxEntry * pptr;
PfxEntry * ep = (PfxEntry*) pfxptr;
// get the right starting points
const char * key = ep->getKey();
const unsigned char flg = ep->getFlag();
// first index by flag which must exist
ptr = (PfxEntry*)pFlag[flg];
ep->setFlgNxt(ptr);
pFlag[flg] = (AffEntry *) ep;
// handle the special case of null affix string
if (strlen(key) == 0) {
// always inset them at head of list at element 0
ptr = (PfxEntry*)pStart[0];
ep->setNext(ptr);
pStart[0] = (AffEntry*)ep;
return 0;
}
// now handle the normal case
ep->setNextEQ(NULL);
ep->setNextNE(NULL);
unsigned char sp = *((const unsigned char *)key);
ptr = (PfxEntry*)pStart[sp];
// handle the first insert
if (!ptr) {
pStart[sp] = (AffEntry*)ep;
return 0;
}
// otherwise use binary tree insertion so that a sorted
// list can easily be generated later
pptr = NULL;
for (;;) {
pptr = ptr;
if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
ptr = ptr->getNextEQ();
if (!ptr) {
pptr->setNextEQ(ep);
break;
}
} else {
ptr = ptr->getNextNE();
if (!ptr) {
pptr->setNextNE(ep);
break;
}
}
}
return 0;
}
// we want to be able to quickly access suffix information
// both by suffix flag, and sorted by the reverse of the
// suffix string itself; so we need to set up two indexes
int AffixMgr::build_sfxtree(AffEntry* sfxptr)
{
SfxEntry * ptr;
SfxEntry * pptr;
SfxEntry * ep = (SfxEntry *) sfxptr;
/* get the right starting point */
const char * key = ep->getKey();
const unsigned char flg = ep->getFlag();
// first index by flag which must exist
ptr = (SfxEntry*)sFlag[flg];
ep->setFlgNxt(ptr);
sFlag[flg] = (AffEntry *) ep;
// next index by affix string
// handle the special case of null affix string
if (strlen(key) == 0) {
// always inset them at head of list at element 0
ptr = (SfxEntry*)sStart[0];
ep->setNext(ptr);
sStart[0] = (AffEntry*)ep;
return 0;
}
// now handle the normal case
ep->setNextEQ(NULL);
ep->setNextNE(NULL);
unsigned char sp = *((const unsigned char *)key);
ptr = (SfxEntry*)sStart[sp];
// handle the first insert
if (!ptr) {
sStart[sp] = (AffEntry*)ep;
return 0;
}
// otherwise use binary tree insertion so that a sorted
// list can easily be generated later
pptr = NULL;
for (;;) {
pptr = ptr;
if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
ptr = ptr->getNextEQ();
if (!ptr) {
pptr->setNextEQ(ep);
break;
}
} else {
ptr = ptr->getNextNE();
if (!ptr) {
pptr->setNextNE(ep);
break;
}
}
}
return 0;
}
// convert from binary tree to sorted list
int AffixMgr::process_pfx_tree_to_list()
{
for (int i=1; i< SETSIZE; i++) {
pStart[i] = process_pfx_in_order(pStart[i],NULL);
}
return 0;
}
AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
{
if (ptr) {
nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
}
return nptr;
}
// convert from binary tree to sorted list
int AffixMgr:: process_sfx_tree_to_list()
{
for (int i=1; i< SETSIZE; i++) {
sStart[i] = process_sfx_in_order(sStart[i],NULL);
}
return 0;
}
AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
{
if (ptr) {
nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
}
return nptr;
}
// reinitialize the PfxEntry links NextEQ and NextNE to speed searching
// using the idea of leading subsets this time
int AffixMgr::process_pfx_order()
{
PfxEntry* ptr;
// loop through each prefix list starting point
for (int i=1; i < SETSIZE; i++) {
ptr = (PfxEntry*)pStart[i];
// look through the remainder of the list
// and find next entry with affix that
// the current one is not a subset of
// mark that as destination for NextNE
// use next in list that you are a subset
// of as NextEQ
for (; ptr != NULL; ptr = ptr->getNext()) {
PfxEntry * nptr = ptr->getNext();
for (; nptr != NULL; nptr = nptr->getNext()) {
if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
}
ptr->setNextNE(nptr);
ptr->setNextEQ(NULL);
if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey()))
ptr->setNextEQ(ptr->getNext());
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -