mftraining.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 1,342 行 · 第 1/3 页

CPP
1,342
字号
/******************************************************************************
**	Filename:	mfTraining.c
**	Purpose:	Separates training pages into files for each character.
**				Strips from files only the features and their parameters of
**				the feature type mf.
**	Author:		Dan Johnson
**	Revisment:	Christy Russon
**	Environment: HPUX 6.5
**	Library:     HPUX 6.5
**	History:     Fri Aug 18 08:53:50 1989, DSJ, Created.
**		     5/25/90, DSJ, Adapted to multiple feature types.
**				Tuesday, May 17, 1998 Changes made to make feature specific and
**				simplify structures. First step in simplifying training process.
**
 **	(c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
******************************************************************************/

/**----------------------------------------------------------------------------
					Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "mf.h"
#include "general.h"
#include "clusttool.h"
#include "cluster.h"
#include "protos.h"
#include "minmax.h"
#include "debug.h"
#include "tprintf.h"
#include "const.h"
#include "mergenf.h"
#include "name2char.h"
#include "intproto.h"
#include "variables.h"
#include "freelist.h"
#include "efio.h"
#include "danerror.h"
#include "globals.h"

#include <string.h>
#include <stdio.h>
#define _USE_MATH_DEFINES
#include <math.h>
#ifdef WIN32
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#endif

#define MAXNAMESIZE	80
#define MAX_NUM_SAMPLES	10000
#define PROGRAM_FEATURE_TYPE "mf"
#define MINSD (1.0f / 128.0f)
#define MINSD_ANGLE (1.0f / 64.0f)

int	row_number;						/* cjn: fixes link problem */

/* A character label together with the list of samples read for it. */
typedef struct
{
  char		*Label;
  int       SampleCount;
  LIST		List;
}
LABELEDLISTNODE, *LABELEDLIST;

/* A character label together with its merged class; NumMerged[Pid] counts
   how many prototypes have been merged into proto Pid of Class. */
typedef struct
{
	char* Label;
	int	NumMerged[MAX_NUM_PROTOS];
	CLASS_TYPE Class;
}
MERGE_CLASS_NODE;
typedef MERGE_CLASS_NODE* MERGE_CLASS;

/* Rounds x to the nearest multiple of frag.  Arguments are parenthesized so
   that expressions such as round(a+b, c) expand correctly. */
#define round(x,frag) (floor((x)/(frag)+.5)*(frag))

/**----------------------------------------------------------------------------
					Public Function Prototypes
----------------------------------------------------------------------------**/
int main (
     int	argc,
     char	**argv);

/**----------------------------------------------------------------------------
					Private Function Prototypes
----------------------------------------------------------------------------**/
void ParseArguments(
	int	argc,
	char	**argv);

char *GetNextFilename ();

LIST ReadTrainingSamples (
     FILE	*File);

LABELEDLIST FindList (
     LIST	List,
     char	*Label);

MERGE_CLASS FindClass (
     LIST	List,
     char	*Label);

LABELEDLIST NewLabeledList (
     char	*Label);

MERGE_CLASS NewLabeledClass (
     char	*Label);

void WriteTrainingSamples (
     char	*Directory,
     LIST	CharList);

void WriteClusteredTrainingSamples (
     char	*Directory,
     LIST	ProtoList,
	 CLUSTERER *Clusterer,
	 LABELEDLIST CharSample);
/**/
void WriteMergedTrainingSamples(
    char	*Directory,
	LIST ClassList);

void WriteMicrofeat(
    char	*Directory,
	LIST	ClassList);

void WriteProtos(
	FILE* File,
	MERGE_CLASS MergeClass);

void WriteConfigs(
	FILE* File,
	CLASS_TYPE Class);

void FreeTrainingSamples (
     LIST	CharList);

void FreeLabeledClassList (
     LIST	ClassList);

void FreeLabeledList (
     LABELEDLIST	LabeledList);

CLUSTERER *SetUpForClustering(
     LABELEDLIST	CharSample);
/*
PARAMDESC *ConvertToPARAMDESC(
	PARAM_DESC* Param_Desc,
	int N);
*/

void MergeInsignificantProtos(LIST ProtoList, const char* label,
                              CLUSTERER	*Clusterer, CLUSTERCONFIG *Config);

LIST RemoveInsignificantProtos(
	LIST ProtoList,
	BOOL8 KeepSigProtos,
	BOOL8 KeepInsigProtos,
	int N);

void CleanUpUnusedData(
	LIST ProtoList);

void Normalize (
   float  *Values);

void SetUpForFloat2Int(
	LIST LabeledClassList);

void WritePFFMTable(INT_TEMPLATES Templates, const char* filename);

//--------------Global Data Definitions and Declarations--------------
static char FontName[MAXNAMESIZE];
// globals used for parsing command line arguments
static char	*Directory = NULL;
static int	MaxNumSamples = MAX_NUM_SAMPLES;
static int	Argc;
static char	**Argv;

// globals used to control what information is saved in the output file
static BOOL8		ShowAllSamples = FALSE;
static BOOL8		ShowSignificantProtos = TRUE;
static BOOL8		ShowInsignificantProtos = FALSE;

// global variable to hold configuration parameters to control clustering
// -M 0.40   -B 0.05   -I 1.0   -C 1e-6.
static CLUSTERCONFIG Config =
{ elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };

static FLOAT32 RoundingAccuracy = 0.0f;

// The unicharset used during mftraining
static UNICHARSET unicharset_mftraining;

// Label whose prototype list main() displays via DisplayProtoList; the empty
// default only matches a sample whose label is the empty string.
const char* test_ch = "";

/*----------------------------------------------------------------------------
						Public Code
-----------------------------------------------------------------------------*/
/*
**	Draws every prototype of protolist as a line segment in a new window and
**	prints a one-line summary for the significant ("Green") and the
**	unmerged, non-empty insignificant ("Red") prototypes.
**	Parameters:
**		ch		label of the character being shown (currently unused)
**		protolist	list of PROTOTYPE* to draw
**	Colour coding: Green = significant, Blue = no samples,
**	Magenta = merged, Red = everything else.
*/
void DisplayProtoList(const char* ch, LIST protolist) {
  void* window = c_create_window("Char samples", 50, 200,
                                 520, 520, -130.0, 130.0, -130.0, 130.0);
  LIST proto = protolist;
  iterate(proto) {
    PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
    if (prototype->Significant)
      c_line_color_index(window, Green);
    else if (prototype->NumSamples == 0)
      c_line_color_index(window, Blue);
    else if (prototype->Merged)
      c_line_color_index(window, Magenta);
    else
      c_line_color_index(window, Red);
    float x = CenterX(prototype->Mean);
    float y = CenterY(prototype->Mean);
    // The orientation is scaled by 2*pi before being used as an angle.
    double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
    // (dx, dy) is half of the segment, so the segment is centred on (x, y).
    float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
    float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
    c_move(window, (x - dx) * 256, (y - dy) * 256);
    c_draw(window, (x + dx) * 256, (y + dy) * 256);
    if (prototype->Significant)
      tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
              x, y, dx, dy, prototype->NumSamples);
    else if (prototype->NumSamples > 0 && !prototype->Merged)
      tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
              x, y, dx, dy, prototype->NumSamples);
  }
  c_make_current(window);
}

/*---------------------------------------------------------------------------*/
/*
**	Builds "<Dir>/<Name>" (or just "<Name>" when Dir is NULL) in Path.
**	Parameters:
**		Path		destination buffer
**		PathSize	size of Path in bytes, including the terminator
**		Dir		optional output directory, may be NULL
**		Name		file name to append
**	Return: TRUE on success; FALSE, with Path untouched, if the result
**		(including the terminating NUL) would not fit in PathSize bytes.
*/
static BOOL8 BuildOutputPath(char *Path, size_t PathSize,
                             const char *Dir, const char *Name) {
  size_t Needed = strlen(Name) + 1;     // file name plus terminating NUL
  if (Dir != NULL)
    Needed += strlen(Dir) + 1;          // directory plus the '/' separator
  if (Needed > PathSize)
    return FALSE;

  // The length was checked above, so these copies cannot overrun Path.
  Path[0] = '\0';
  if (Dir != NULL) {
    strcat (Path, Dir);
    strcat (Path, "/");
  }
  strcat (Path, Name);
  return TRUE;
}

/*---------------------------------------------------------------------------*/
int main (int argc, char **argv) {
/*
**	Parameters:
**		argc	number of command line arguments
**		argv	array of command line arguments
**	Globals:
**		Directory		output directory (NULL = current directory)
**		Config			clustering parameters; MagicSamples is set per character
**		ShowSignificantProtos	passed to RemoveInsignificantProtos
**		ShowInsignificantProtos	passed to RemoveInsignificantProtos
**		test_ch			label whose protos are displayed
**		unicharset_mftraining	cleared, then seeded with " "
**	Operation:
**		This program reads in a text file consisting of feature
**		samples from a training page in the following format:
**
**			FontName CharName NumberOfFeatureTypes(N)
**			   FeatureTypeName1 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   FeatureTypeName2 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   ...
**			   FeatureTypeNameN NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			FontName CharName ...
**
**		The result of this program is a binary inttemp file used by
**		the OCR engine.
**	Return: 0 on success, 1 if an output file name does not fit in
**		MAXNAMESIZE characters.
**	Exceptions: none
**	History:	Fri Aug 18 08:56:17 1989, DSJ, Created.
**				Mon May 18 1998, Christy Russson, Revision started.
*/
  char	*PageName;
  FILE	*TrainingPage;
  FILE	*OutFile;
  LIST	CharList;
  CLUSTERER	*Clusterer = NULL;
  LIST		ProtoList = NIL;
  LABELEDLIST CharSample;
  PROTOTYPE	*Prototype;
  LIST   	ClassList = NIL;
  int		Cid, Pid;
  PROTO		Proto;
  PROTO_STRUCT	DummyProto;
  BIT_VECTOR	Config2;
  MERGE_CLASS	MergeClass;
  INT_TEMPLATES	IntTemplates;
  LIST pCharList, pProtoList;
  char Filename[MAXNAMESIZE];

  // Clean the unichar set
  unicharset_mftraining.clear();
  // Space character needed to represent NIL classification
  unicharset_mftraining.unichar_insert(" ");

  ParseArguments (argc, argv);
  InitFastTrainerVars ();
  InitSubfeatureVars ();
  while ((PageName = GetNextFilename()) != NULL) {
    printf ("Reading %s ...\n", PageName);
    TrainingPage = Efopen (PageName, "r");
    CharList = ReadTrainingSamples (TrainingPage);
    fclose (TrainingPage);
    //WriteTrainingSamples (Directory, CharList);
    pCharList = CharList;
    iterate(pCharList) {
      //Cluster
      CharSample = (LABELEDLIST) first_node (pCharList);
//    printf ("\nClustering %s ...", CharSample->Label);
      Clusterer = SetUpForClustering(CharSample);
      Config.MagicSamples = CharSample->SampleCount;
      ProtoList = ClusterSamples(Clusterer, &Config);
      CleanUpUnusedData(ProtoList);

      //Merge
      MergeInsignificantProtos(ProtoList, CharSample->Label,
                               Clusterer, &Config);
      if (strcmp(test_ch, CharSample->Label) == 0)
        DisplayProtoList(test_ch, ProtoList);
      ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos,
                                            ShowInsignificantProtos,
                                            Clusterer->SampleSize);
      FreeClusterer(Clusterer);

      // Each page contributes one new config to the class of this label.
      MergeClass = FindClass (ClassList, CharSample->Label);
      if (MergeClass == NULL) {
        MergeClass = NewLabeledClass (CharSample->Label);
        ClassList = push (ClassList, MergeClass);
      }
      Cid = AddConfigToClass(MergeClass->Class);
      pProtoList = ProtoList;
      iterate (pProtoList) {
        Prototype = (PROTOTYPE *) first_node (pProtoList);

        // see if proto can be approximated by existing proto
        Pid = FindClosestExistingProto(MergeClass->Class,
                                       MergeClass->NumMerged, Prototype);
        if (Pid == NO_PROTO) {
          Pid = AddProtoToClass (MergeClass->Class);
          Proto = ProtoIn (MergeClass->Class, Pid);
          MakeNewFromOld (Proto, Prototype);
          MergeClass->NumMerged[Pid] = 1;
        }
        else {
          // Fold the new proto into the existing one, weighting the existing
          // proto by the number of protos already merged into it.
          MakeNewFromOld (&DummyProto, Prototype);
          ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto,
              (FLOAT32) MergeClass->NumMerged[Pid], 1.0,
              ProtoIn (MergeClass->Class, Pid));
          MergeClass->NumMerged[Pid] ++;
        }
        Config2 = ConfigIn (MergeClass->Class, Cid);
        AddProtoToConfig (Pid, Config2);
      }
      FreeProtoList (&ProtoList);
    }
    FreeTrainingSamples (CharList);
  }
  //WriteMergedTrainingSamples(Directory,ClassList);
  WriteMicrofeat(Directory, ClassList);
  InitIntProtoVars ();
  InitPrototypes ();
  SetUpForFloat2Int(ClassList);
  IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining);

  // Directory comes from the command line, so its length must be checked
  // before it is copied into the fixed-size Filename buffer.
  if (!BuildOutputPath (Filename, sizeof (Filename), Directory, "inttemp")) {
    fprintf (stderr, "Output path for %s is longer than %d characters\n",
             "inttemp", MAXNAMESIZE - 1);
    FreeLabeledClassList (ClassList);
    return 1;
  }
#ifdef __UNIX__
  OutFile = Efopen (Filename, "w");
#else
  OutFile = Efopen (Filename, "wb");
#endif
  WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining);
  fclose (OutFile);

  if (!BuildOutputPath (Filename, sizeof (Filename), Directory, "pffmtable")) {
    fprintf (stderr, "Output path for %s is longer than %d characters\n",
             "pffmtable", MAXNAMESIZE - 1);
    FreeLabeledClassList (ClassList);
    return 1;
  }
  // Now create pffmtable.
  WritePFFMTable(IntTemplates, Filename);
  printf ("Done!\n"); /**/
  FreeLabeledClassList (ClassList);
  return 0;
}	/* main */

/**----------------------------------------------------------------------------
							Private Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
void ParseArguments(
	int	argc,
	char	**argv)
/*
**	Parameters:
**		argc	number of command line arguments to parse
**		argv	command line arguments
**	Globals:
**		ShowAllSamples		flag controlling samples display
**		ShowSignificantProtos	flag controlling proto display
**		ShowInsignificantProtos	flag controlling proto display
**		Config			current clustering parameters
**		tessoptarg, tessoptind		defined by tessopt sys call
**		Argc, Argv		global copies of argc and argv
**	Operation:
**		This routine parses the command line arguments that were
**		passed to the program.  The legal arguments are:
**			-d		"turn off display of samples"
**			-p		"turn off significant protos"
**			-n		"turn off insignificant proto"
**			-S [ spherical | elliptical | mixed | automatic ]
**			-M MinSamples	"min samples per prototype (%)"
**			-B MaxIllegal	"max illegal chars per cluster (%)"
**			-I Independence	"0 to 1"
**			-C Confidence	"1e-200 to 1.0"
**			-D Directory
**			-N MaxNumSamples
**			-R RoundingAccuracy
**	Return: none
**	Exceptions: Illegal options terminate the program.
**	History: 7/24/89, DSJ, Created.
*/
{
	int		Option;
	int		ParametersRead;
	BOOL8		Error;

	Error = FALSE;
	Argc = argc;
	Argv = argv;
	while (( Option = tessopt( argc, argv, "R:N:D:C:I:M:B:S:d:n:p" )) != EOF )
	{
		switch ( Option )
		{
		case 'n':
			ShowInsignificantProtos = FALSE;
			break;
		case 'p':
			ShowSignificantProtos = FALSE;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?