stopper.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 1,459 行 · 第 1/4 页
CPP
1,459 行
strcat(TestString_lengths, lengths); if (!unicharset.contains_unichar(buffer)) illegal_char = true; } fscanf (AmbigFile, "%d", &AmbigPartSize); for (i = 0; i < AmbigPartSize; ++i) { fscanf (AmbigFile, "%s", buffer); strcat(ReplacementString, buffer); lengths[0] = strlen(buffer); strcat(ReplacementString_lengths, lengths); if (!unicharset.contains_unichar(buffer)) illegal_char = true; } if (strlen (TestString_lengths) > MAX_AMBIG_SIZE || strlen (ReplacementString_lengths) > MAX_AMBIG_SIZE) DoError (0, "Illegal ambiguity specification!"); if (illegal_char) { continue; } AmbigSpec = (AMBIG_SPEC *) Emalloc (sizeof (AMBIG_SPEC)); strcpy(AmbigSpec->ambig, TestString + TestString_lengths[0]); strcat(AmbigSpec->ambig, " "); strcat(AmbigSpec->ambig, ReplacementString); strcpy(AmbigSpec->lengths, TestString_lengths + 1); lengths[0] = 1; strcat(AmbigSpec->lengths, lengths); strcat(AmbigSpec->lengths, ReplacementString_lengths); unichar_id = unicharset.unichar_to_id(TestString, TestString_lengths[0]); NewTable[unichar_id] = push_last (NewTable[unichar_id], AmbigSpec); } fclose(AmbigFile); return (NewTable);} /* FillAmbigTable *//*---------------------------------------------------------------------------*/int FreeBadChoice(void *item1, //VIABLE_CHOICE Choice, void *item2) { //EXPANDED_CHOICE *BestChoice)/* ** Parameters: ** Choice choice to be tested ** BestChoice best choice found ** Globals: ** AmbigThresholdGain ** AmbigThresholdOffset ** Operation: If the certainty of any chunk in Choice is not ambiguous ** with the corresponding chunk in the best choice, free ** Choice and return TRUE. Otherwise, return FALSE. ** Return: TRUE or FALSE. ** Exceptions: none ** History: Wed May 15 13:20:26 1991, DSJ, Created. */ int i, j, Chunk; FLOAT32 Threshold; VIABLE_CHOICE Choice; EXPANDED_CHOICE *BestChoice; Choice = (VIABLE_CHOICE) item1; BestChoice = (EXPANDED_CHOICE *) item2; Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor, Choice->AdjustFactor); for (i = 0, Chunk = 0; i < Choice->Length; i++) for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] && Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] < Threshold) { memfree(Choice); return (TRUE); } return (FALSE);} /* FreeBadChoice *//*---------------------------------------------------------------------------*/int LengthOfShortestAlphaRun(register char *Word, const char *Word_lengths) {/* ** Parameters: ** Word word to be tested ** Word_lengths lengths of the unichars in Word ** Globals: none ** Operation: Return the length of the shortest alpha run in Word. ** Return: Return the length of the shortest alpha run in Word. ** Exceptions: none ** History: Tue May 14 07:50:45 1991, DSJ, Created. */ register int Shortest = MAXINT; register int Length; for (; *Word; Word += *(Word_lengths++)) if (unicharset.get_isalpha(Word, *Word_lengths)) { for (Length = 1, Word += *(Word_lengths++); *Word && unicharset.get_isalpha(Word, *Word_lengths); Word += *(Word_lengths++), Length++); if (Length < Shortest) Shortest = Length; if (*Word == 0) break; } if (Shortest == MAXINT) Shortest = 0; return (Shortest);} /* LengthOfShortestAlphaRun *//*---------------------------------------------------------------------------*/VIABLE_CHOICENewViableChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {/* ** Parameters: ** Choice choice to be converted to a viable choice ** AdjustFactor factor used to adjust ratings for Choice ** Certainties certainty for each character in Choice ** Globals: ** CurrentSegmentation segmentation corresponding to Choice ** Operation: Allocate a new viable choice data structure, copy ** Choice, Certainties, and CurrentSegmentation into it, ** and return a pointer to it. ** Return: Ptr to new viable choice. ** Exceptions: none ** History: Thu May 16 15:28:29 1991, DSJ, Created. */ VIABLE_CHOICE NewChoice; int Length; char *Word; char *Word_lengths; CHAR_CHOICE *NewChar; BLOB_WIDTH *BlobWidth; Length = strlen (class_lengths (Choice)); assert (Length <= MAX_NUM_CHUNKS && Length > 0); NewChoice = (VIABLE_CHOICE) Emalloc (sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE)); NewChoice->Rating = class_probability (Choice); NewChoice->Certainty = class_certainty (Choice); NewChoice->AdjustFactor = AdjustFactor; NewChoice->Length = Length; for (Word = class_string (Choice), Word_lengths = class_lengths (Choice), NewChar = &(NewChoice->Blob[0]), BlobWidth = CurrentSegmentation; *Word; Word += *(Word_lengths++), NewChar++, Certainties++, BlobWidth++) { NewChar->Class = unicharset.unichar_to_id(Word, *Word_lengths); NewChar->NumChunks = *BlobWidth; NewChar->Certainty = *Certainties; } return (NewChoice);} /* NewViableChoice *//*---------------------------------------------------------------------------*/void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {/* ** Parameters: ** File open text file to print Choice to ** Label text label to be printed with Choice ** Choice choice to be printed ** Globals: none ** Operation: This routine dumps a text representation of the ** specified Choice to File. ** Return: none ** Exceptions: none ** History: Mon May 20 11:16:44 1991, DSJ, Created. */ int i, j; fprintf (File, "%s", Label); fprintf (File, "(R=%5.1f, C=%4.1f, F=%4.2f) ", Choice->Rating, Choice->Certainty, Choice->AdjustFactor); for (i = 0; i < Choice->Length; i++) fprintf (File, "%s", unicharset.id_to_unichar(Choice->Blob[i].Class)); fprintf (File, "\n"); for (i = 0; i < Choice->Length; i++) { fprintf (File, " %s", unicharset.id_to_unichar(Choice->Blob[i].Class)); for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++) fprintf (File, " "); } fprintf (File, "\n"); for (i = 0; i < Choice->Length; i++) { for (j = 0; j < Choice->Blob[i].NumChunks; j++) fprintf (File, "%3d", (int) (Choice->Blob[i].Certainty * -10.0)); } fprintf (File, "\n");} /* PrintViableChoice *//*---------------------------------------------------------------------------*/voidReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,A_CHOICE * NewChoice,FLOAT32 AdjustFactor, float Certainties[]) {/* ** Parameters: ** OldChoice existing viable choice to be replaced ** NewChoice choice to replace OldChoice with ** AdjustFactor factor used to adjust ratings for OldChoice ** Certainties certainty for each character in OldChoice ** Globals: ** CurrentSegmentation segmentation for NewChoice ** Operation: This routine is used whenever a better segmentation (or ** contextual interpretation) is found for a word which already ** exists. The OldChoice is updated with the relevant ** information from the new choice. The text string itself ** does not need to be copied since, by definition, has not ** changed. ** Return: none ** Exceptions: none ** History: Fri May 17 13:35:58 1991, DSJ, Created. */ char *Word; char *Word_lengths; CHAR_CHOICE *NewChar; BLOB_WIDTH *BlobWidth; OldChoice->Rating = class_probability (NewChoice); OldChoice->Certainty = class_certainty (NewChoice); OldChoice->AdjustFactor = AdjustFactor; for (Word = class_string (NewChoice), Word_lengths = class_lengths (NewChoice), NewChar = &(OldChoice->Blob[0]), BlobWidth = CurrentSegmentation; *Word; Word += *(Word_lengths++), NewChar++, Certainties++, BlobWidth++) { NewChar->NumChunks = *BlobWidth; NewChar->Certainty = *Certainties; }} /* ReplaceDuplicateChoice *//*---------------------------------------------------------------------------*/int StringSameAs(const char *String, const char *String_lengths, VIABLE_CHOICE ViableChoice) {/* ** Parameters: ** String string to compare to ViableChoice ** String_lengths lengths of unichars in String ** ViableChoice viable choice to compare to String ** Globals: none ** Operation: This routine compares String to ViableChoice and ** returns TRUE if they are the same, FALSE otherwise. ** Return: TRUE or FALSE. ** Exceptions: none ** History: Fri May 17 08:48:04 1991, DSJ, Created. */ CHAR_CHOICE *Char; int i; int current_unichar_length; for (Char = &(ViableChoice->Blob[0]), i = 0; i < ViableChoice->Length; String += *(String_lengths++), Char++, i++) { current_unichar_length = strlen(unicharset.id_to_unichar(Char->Class)); if (current_unichar_length != *String_lengths || strncmp(String, unicharset.id_to_unichar(Char->Class), current_unichar_length) != 0) return (FALSE); } if (*String == 0) return (TRUE); else return (FALSE);} /* StringSameAs *//*---------------------------------------------------------------------------*/int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice) {/* ** Parameters: ** Choices choices for current segmentation ** BestChoice best choice for current segmentation ** Globals: ** CertaintyVariation max allowed certainty variation ** Operation: This routine returns TRUE if the certainty of the ** BestChoice word is within a reasonable range of the average ** certainties for the best choices for each character in ** the segmentation. This test is used to catch words in which ** one character is much worse than the other characters in ** the word (i.e. FALSE will be returned in that case). ** The algorithm computes the mean and std deviation of the ** certainties in the word with the worst certainty thrown out. ** Return: TRUE or FALSE. ** Exceptions: none ** History: Tue May 14 08:23:21 1991, DSJ, Created. */ int i; CHOICES CharChoices; float Certainty; float WorstCertainty = MAX_FLOAT32; float CertaintyThreshold; FLOAT64 TotalCertainty; FLOAT64 TotalCertaintySquared; FLOAT64 Variance; FLOAT32 Mean, StdDev; int WordLength; WordLength = array_count (Choices); if (WordLength < 3) return (TRUE); TotalCertainty = TotalCertaintySquared = 0.0; for_each_choice(Choices, i) { CharChoices = (CHOICES) array_index (Choices, i); Certainty = best_certainty (CharChoices); TotalCertainty += Certainty; TotalCertaintySquared += Certainty * Certainty; if (Certainty < WorstCertainty) WorstCertainty = Certainty; } /* subtract off worst certainty from statistics */ WordLength--; TotalCertainty -= WorstCertainty; TotalCertaintySquared -= WorstCertainty * WorstCertainty; Mean = TotalCertainty / WordLength; Variance = ((WordLength * TotalCertaintySquared - TotalCertainty * TotalCertainty) / (WordLength * (WordLength - 1))); if (Variance < 0.0) Variance = 0.0; StdDev = sqrt (Variance); CertaintyThreshold = Mean - CertaintyVariation * StdDev; if (CertaintyThreshold > NonDictCertainty) CertaintyThreshold = NonDictCertainty; if (class_certainty (BestChoice) < CertaintyThreshold) { if (StopperDebugLevel >= 1) cprintf ("Stopper: Non-uniform certainty = %4.1f (m=%4.1f, s=%4.1f, t=%4.1f)\n", class_certainty (BestChoice), Mean, StdDev, CertaintyThreshold); return (FALSE); } else return (TRUE);} /* UniformCertainties */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?