📄 stopper.cpp
字号:
/******************************************************************************
 **	Filename:    stopper.c
 **	Purpose:     Stopping criteria for word classifier.
 **	Author:      Dan Johnson
 **	History:     Mon Apr 29 14:56:49 1991, DSJ, Created.
 **
 **	(c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
/**----------------------------------------------------------------------------
          Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "stopper.h"
#include "emalloc.h"
#include "matchdefs.h"
#include "debug.h"
#include "callcpp.h"
#include "permute.h"
#include "context.h"
#include "permnum.h"
#include "danerror.h"
#include "const.h"
#include "freelist.h"
#include "efio.h"
#include "globals.h"
#include "scanutils.h"

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#ifdef __UNIX__
#include <assert.h>
#endif

/* these are kludges - add appropriate .h file later */
extern float CertaintyScale;     /* from subfeat.h */

#define MAX_WERD_SIZE   100
#define MAX_AMBIG_SIZE    3
#define DANGEROUS_AMBIGS  "tessdata/DangAmbigs"

typedef LIST AMBIG_TABLE;

/* Per-blob record stored inside a viable choice. */
typedef struct
{
  UINT8 Class;
  UINT16 NumChunks;
  float Certainty;
}
CHAR_CHOICE;

/* A candidate word: overall rating/certainty, the adjust factor it was
   stored with, and Length per-blob entries.  Blob is declared with one
   element; NOTE(review): presumably over-allocated to Length entries by
   NewViableChoice - confirm against its definition. */
typedef struct
{
  float Rating;
  float Certainty;
  FLOAT32 AdjustFactor;
  int Length;
  CHAR_CHOICE Blob[1];
} VIABLE_CHOICE_STRUCT;
typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;

/* A viable choice together with per-chunk certainty and class arrays. */
typedef struct
{
  VIABLE_CHOICE Choice;
  float ChunkCertainty[MAX_NUM_CHUNKS];
  UINT8 ChunkClass[MAX_NUM_CHUNKS];
}
EXPANDED_CHOICE;

/**----------------------------------------------------------------------------
          Macros
----------------------------------------------------------------------------**/
/* Accessors for the first (best) entry of a list of viable choices. */
#define BestCertainty(Choices)  (((VIABLE_CHOICE) first (Choices))->Certainty)
#define BestRating(Choices) (((VIABLE_CHOICE) first (Choices))->Rating)
#define BestFactor(Choices) (((VIABLE_CHOICE) first (Choices))->AdjustFactor)

#define AmbigThreshold(F1,F2) (((F2) - (F1)) * AmbigThresholdGain - \
        AmbigThresholdOffset)

/*---------------------------------------------------------------------------
          Private Function Prototypes
----------------------------------------------------------------------------*/
void AddNewChunk(VIABLE_CHOICE Choice, int Blob);

int AmbigsFound(char *Word,
                char *CurrentChar,
                const char *Tail,
                LIST Ambigs,
                DANGERR *fixpt);

int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice);

int CmpChoiceRatings(void *arg1,    /* VIABLE_CHOICE Choice1 */
                     void *arg2);   /* VIABLE_CHOICE Choice2 */

void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);

AMBIG_TABLE *FillAmbigTable();

int FreeBadChoice(void *item1,    /* VIABLE_CHOICE Choice */
                  void *item2);   /* EXPANDED_CHOICE *BestChoice */

int LengthOfShortestAlphaRun(register char *Word);

VIABLE_CHOICE NewViableChoice (A_CHOICE * Choice,
                               FLOAT32 AdjustFactor,
                               float Certainties[]);

void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);

void ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
                             A_CHOICE * NewChoice,
                             FLOAT32 AdjustFactor,
                             float Certainties[]);

int StringSameAs(const char *String, VIABLE_CHOICE ViableChoice);

int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice);

/**----------------------------------------------------------------------------
        Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
/* Name of file containing potentially dangerous ambiguities */
static const char *DangerousAmbigs = DANGEROUS_AMBIGS;

/* Word for which stopper debug information should be printed to stdout */
static char *WordToDebug = NULL;

/* flag used to disable accumulation of word choices during compound word
  permutation */
BOOL8 KeepWordChoices = TRUE;

/* additional certainty padding allowed before a word is rejected */
static FLOAT32 RejectOffset = 0.0;

/* structures to keep track of viable word choices */
static VIABLE_CHOICE BestRawChoice = NULL;
static LIST BestChoices = NIL;
static PIECES_STATE CurrentSegmentation;

make_float_var (NonDictCertainty, -2.50, MakeNonDictCertainty,
17, 2, SetNonDictCertainty,
"Certainty threshold for non-dict words");

make_float_var (RejectCertaintyOffset, 1.0, MakeRejectCertaintyOffset,
17, 3, SetRejectCertaintyOffset, "Reject certainty offset");

make_int_var (SmallWordSize, 2, MakeSmallWordSize,
17, 4, SetSmallWordSize,
"Size of dict word to be treated as non-dict word");

make_float_var (CertaintyPerChar, -0.50, MakeCertaintyPerChar,
17, 5, SetCertaintyPerChar,
"Certainty to add for each dict char above SmallWordSize");

make_float_var (CertaintyVariation, 3.0, MakeCertaintyVariation,
17, 6, SetCertaintyVariation,
"Max certaintly variation allowed in a word (in sigma)");

make_int_var (StopperDebugLevel, 0, MakeStopperDebugLevel,
17, 7, SetStopperDebugLevel, "Stopper debug level");

make_float_var (AmbigThresholdGain, 8.0, MakeAmbigThresholdGain,
17, 8, SetAmbigThresholdGain,
"Gain factor for ambiguity threshold");

make_float_var (AmbigThresholdOffset, 1.5, MakeAmbigThresholdOffset,
17, 9, SetAmbigThresholdOffset,
"Certainty offset for ambiguity threshold");

/* extern char *demodir; */
extern int first_pass;
INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");

/**----------------------------------------------------------------------------
              Public Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
int AcceptableChoice(CHOICES_LIST Choices,
                     A_CHOICE *BestChoice,
                     A_CHOICE *RawChoice,
                     DANGERR *fixpt) {
/*
 **	Parameters:
 **		Choices		choices for current segmentation
 **		BestChoice	best choice for current segmentation
 **		RawChoice	best raw choice for current segmentation
 **				(not referenced by this routine)
 **		fixpt		if non-NULL, its index field is reset to -1
 **				before any test is made; it is then passed
 **				on to NoDangerousAmbig
 **	Globals:
 **		NonDictCertainty	certainty for a non-dict word
 **		SmallWordSize		size of word to be treated as non-word
 **		CertaintyPerChar	certainty to add for each dict char
 **	Operation: Return TRUE if the results from this segmentation are
 **		good enough to stop.  Otherwise return FALSE.
 **		The threshold starts at NonDictCertainty.  For a word that
 **		passes valid_word, case_ok and punctuation_ok it is moved by
 **		CertaintyPerChar for each character of the shortest alpha
 **		run beyond SmallWordSize.  Otherwise, when stopper_numbers_on
 **		is set and the string is a valid number, it is moved by
 **		stopper_numbers_on * CertaintyPerChar.
 **	Return: TRUE or FALSE.
 **	Exceptions: none
 **	History: Mon Apr 29 14:57:32 1991, DSJ, Created.
 */
  float CertaintyThreshold = NonDictCertainty;
  int WordSize;

  if (fixpt != NULL)
    fixpt->index = -1;
  if ((BestChoice == NULL) || (class_string (BestChoice) == NULL))
    return (FALSE);

  if (StopperDebugLevel >= 1)
    cprintf ("\nStopper: %s (word=%c, case=%c, punct=%c)\n",
      class_string (BestChoice),
      (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
      (case_ok (class_string (BestChoice)) ? 'y' : 'n'),
      ((punctuation_ok (class_string (BestChoice)) != -1) ? 'y' : 'n'));

  if (valid_word (class_string (BestChoice)) &&
    case_ok (class_string (BestChoice)) &&
  punctuation_ok (class_string (BestChoice)) != -1) {
    WordSize = LengthOfShortestAlphaRun (class_string (BestChoice));
    WordSize -= SmallWordSize;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * CertaintyPerChar;
  }
  else if (stopper_numbers_on && valid_number (class_string (BestChoice))) {
    CertaintyThreshold += stopper_numbers_on * CertaintyPerChar;
  }

  if (StopperDebugLevel >= 1)
    cprintf ("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
      class_certainty (BestChoice), CertaintyThreshold);

  /* NoDangerousAmbig is evaluated first so that fixpt is handed to it
     regardless of the outcome of the certainty tests. */
  if (NoDangerousAmbig (class_string (BestChoice), fixpt)
    && class_certainty (BestChoice) > CertaintyThreshold &&
    UniformCertainties (Choices, BestChoice))
    return (TRUE);
  else
    return (FALSE);

}                                /* AcceptableChoice */


/*---------------------------------------------------------------------------*/
int AcceptableResult(A_CHOICE *BestChoice, A_CHOICE *RawChoice) {
/*
 **	Parameters:
 **		BestChoice	best choice for current word
 **		RawChoice	best raw choice for current word
 **				(not referenced by this routine)
 **	Globals:
 **		NonDictCertainty	certainty for a non-dict word
 **		SmallWordSize		size of word to be treated as non-word
 **		CertaintyPerChar	certainty to add for each dict char
 **		BestChoices		list of all good choices found
 **		RejectOffset		allowed offset before a word is rejected
 **	Operation: Return FALSE if the best choice for the current word
 **		is questionable and should be tried again on the second
 **		pass or should be flagged to the user.
 **		A NULL choice, a choice with a NULL string, or a word with
 **		more than one entry in BestChoices is always rejected.
 **	Return: TRUE or FALSE.
 **	Exceptions: none
 **	History: Thu May 9 14:05:05 1991, DSJ, Created.
 */
  float CertaintyThreshold = NonDictCertainty - RejectOffset;
  int WordSize;

  /* Reject a missing choice before anything reads from it: the debug
     print below calls class_string (BestChoice). */
  if ((BestChoice == NULL) || (class_string (BestChoice) == NULL))
    return (FALSE);

  if (StopperDebugLevel >= 1)
    cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n",
      class_string (BestChoice),
      (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
      (case_ok (class_string (BestChoice)) ? 'y' : 'n'),
      ((punctuation_ok (class_string (BestChoice)) != -1) ? 'y' : 'n'),
      ((rest (BestChoices) != NIL) ? 'n' : 'y'));

  /* Tested after the debug print so that ambiguous words are still
     reported when debugging is enabled. */
  if (CurrentWordAmbig ())
    return (FALSE);

  if (valid_word (class_string (BestChoice)) &&
    case_ok (class_string (BestChoice)) &&
  punctuation_ok (class_string (BestChoice)) != -1) {
    WordSize = LengthOfShortestAlphaRun (class_string (BestChoice));
    WordSize -= SmallWordSize;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * CertaintyPerChar;
  }

  if (StopperDebugLevel >= 1)
    cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
      class_certainty (BestChoice), CertaintyThreshold);

  if (class_certainty (BestChoice) > CertaintyThreshold) {
    if (StopperDebugLevel >= 1)
      cprintf ("ACCEPTED\n");
    return (TRUE);
  }
  else {
    if (StopperDebugLevel >= 1)
      cprintf ("REJECTED\n");
    return (FALSE);
  }
}                                /* AcceptableResult */


/*---------------------------------------------------------------------------*/
int AlternativeChoicesWorseThan(FLOAT32 Threshold) {
/*
 **	Parameters:
 **		Threshold	minimum adjust factor for alternative choices
 **	Globals:
 **		BestChoices	alternative choices for current word
 **	Operation: This routine returns TRUE if there are no alternative
 **		choices for the current word OR if all alternatives have
 **		an adjust factor worse than Threshold.  "Worse" means
 **		strictly greater: an alternative whose AdjustFactor is
 **		<= Threshold makes the routine return FALSE.
 **	Return: TRUE or FALSE.
 **	Exceptions: none
 **	History: Mon Jun 3 09:36:31 1991, DSJ, Created.
 */
  LIST Alternatives;
  VIABLE_CHOICE Choice;

  /* skip the best choice itself; only the alternatives are examined */
  Alternatives = rest (BestChoices);
  iterate(Alternatives) {
    Choice = (VIABLE_CHOICE) first (Alternatives);
    if (Choice->AdjustFactor <= Threshold)
      return (FALSE);
  }

  return (TRUE);

}                                /* AlternativeChoicesWorseThan */


/*---------------------------------------------------------------------------*/
int CurrentBestChoiceIs(const char *Word) {
/*
 **	Parameters:
 **		Word	string to compare to current best choice
 **	Globals:
 **		BestChoices	set of best choices for current word
 **	Operation: Returns TRUE if Word is the same as the current best
 **		choice, FALSE otherwise.  An empty BestChoices list yields
 **		FALSE.
 **	Return: TRUE or FALSE
 **	Exceptions: none
 **	History: Thu May 30 14:44:22 1991, DSJ, Created.
 */
  return (BestChoices != NIL &&
    StringSameAs (Word, (VIABLE_CHOICE) first (BestChoices)));

}                                /* CurrentBestChoiceIs */


/*---------------------------------------------------------------------------*/
FLOAT32 CurrentBestChoiceAdjustFactor() {
/*
 **	Parameters: none
 **	Globals:
 **		BestChoices	set of best choices for current word
 **	Operation: Return the adjustment factor for the best choice for
 **		the current word.  MAX_FLOAT32 is returned when
 **		BestChoices is empty.
 **	Return: Adjust factor for current best choice.
 **	Exceptions: none
 **	History: Thu May 30 14:48:24 1991, DSJ, Created.
 */
  VIABLE_CHOICE BestChoice;

  if (BestChoices == NIL)
    return (MAX_FLOAT32);

  BestChoice = (VIABLE_CHOICE) first (BestChoices);
  return (BestChoice->AdjustFactor);

}                                /* CurrentBestChoiceAdjustFactor */


/*---------------------------------------------------------------------------*/
int CurrentWordAmbig() {
/*
 **	Parameters: none
 **	Globals:
 **		BestChoices	set of best choices for current word
 **	Operation: This routine returns TRUE if there are multiple good
 **		choices for the current word and FALSE otherwise.
 **	Return: TRUE or FALSE
 **	Exceptions: none
 **	History: Wed May 22 15:38:38 1991, DSJ, Created.
 */
  return (rest (BestChoices) != NIL);

}                                /* CurrentWordAmbig */


/*---------------------------------------------------------------------------*/
void DebugWordChoices() {
/*
 **	Parameters: none
 **	Globals:
 **		BestRawChoice
 **		BestChoices
 **		StopperDebugLevel
 **		WordToDebug
 **	Operation: Print the current choices for this word to stdout.
 **		Output is produced when StopperDebugLevel >= 1, or when
 **		WordToDebug is set and matches the current best choice.
 **	Return: none
 **	Exceptions: none
 **	History: Wed May 15 13:52:08 1991, DSJ, Created.
 */
  LIST Choices;
  int i;
  char LabelString[80];

  if (StopperDebugLevel >= 1 ||
    (WordToDebug && BestChoices &&
    StringSameAs (WordToDebug, (VIABLE_CHOICE) first (BestChoices)))) {
    if (BestRawChoice)
      PrintViableChoice (stdout, "\nBest Raw Choice: ", BestRawChoice);

    i = 1;
    Choices = BestChoices;
    if (Choices)
      cprintf ("\nBest Cooked Choices:\n");
    iterate(Choices) {
      /* label text is far shorter than LabelString for any int value */
      sprintf (LabelString, "Cooked Choice #%d: ", i);
      PrintViableChoice (stdout, LabelString,
        (VIABLE_CHOICE) first (Choices));
      i++;
    }
  }
}                                /* DebugWordChoices */


/*---------------------------------------------------------------------------*/
void FilterWordChoices() {
/*
 **	Parameters: none
 **	Globals:
 **		BestChoices	set of choices for current word
 **	Operation: This routine removes from BestChoices all choices which
 **		are not within a reasonable range of the best choice.
 **	Return: none
 **	Exceptions: none
 **	History: Wed May 15 13:08:24 1991, DSJ, Created.
 */
  EXPANDED_CHOICE BestChoice;

  if (BestChoices == NIL || second (BestChoices) == NIL)
    return;

  /* compute certainties and class for each chunk in best choice */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -