dlltest.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 164 行

CPP
164
字号
/********************************************************************** * File:        dlltest.cpp * Description: Main program to test the tessdll interface. * Author:      Ray Smith * Created:     Wed May 16 15:17:46 PDT 2007 * * (C) Copyright 2007, Google Inc. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/#define _UNICODE#include "stdafx.h"#include "imgs.h"#include "unichar.h"#include "tessdll.h"/********************************************************************** *  main() * **********************************************************************/static wchar_t *make_unicode_string(const char *utf8){  int size = 0, out_index = 0;  wchar_t *out;  /* first calculate the size of the target string */  int used = 0;  int utf8_len = strlen(utf8);  while (used < utf8_len) {    int step = UNICHAR::utf8_step(utf8 + used);    if (step == 0)      break;    used += step;    ++size;  }  out = (wchar_t *) malloc((size + 1) * sizeof(wchar_t));  if (out == NULL)      return NULL;  /* now convert to Unicode */  used = 0;  while (used < utf8_len) {    int step = UNICHAR::utf8_step(utf8 + used);    if (step == 0)      break;    UNICHAR ch(utf8 + used, step);    out[out_index++] = ch.first_uni();    used += step;  }  out[out_index] = 0;  return out;}int main(int argc, char **argv) {  if (argc < 3 || argc > 4) {    fprintf(stderr, "Usage:%s imagename outputname [lang]\n", argv[0]);    exit(1);  }  IMAGE image;  if (image.read_header(argv[1]) < 0) {    fprintf(stderr, "Can't open %s\n", argv[1]);    exit(1);  }  if (image.read(image.get_ysize ()) < 0) {    fprintf(stderr, "Can't read %s\n", argv[1]);    exit(1);  }   TessDllAPI api(argc > 3 ? argv[3] : "eng");  api.BeginPageUpright(image.get_xsize(), image.get_ysize(), image.get_buffer(),		       image.get_bpp());  ETEXT_DESC* output = api.Recognize_all_Words();  FILE* fp = fopen(argv[2],"w");  if (fp == NULL) {    fprintf(stderr, "Can't create %s\n", argv[2]);    exit(1);  }  // It should be noted that the format for char_code for version 2.0 and beyond is UTF8  // which means that ASCII characters will come out as one structure but other characters  // will be returned in two or more instances of this structure with a single byte of the  // UTF8 code in each, but each will have the same bounding box.  // Programs which want to handle languagues with different characters sets will need to  // handle extended characters appropriately, but *all* code needs to be prepared to  // receive UTF8 coded characters for characters such as bullet and fancy quotes.  int j;  for (int i = 0; i < output->count; i = j) {    const EANYCODE_CHAR* ch = &output->text[i];	  unsigned char unistr[UNICHAR_LEN];		    for (int b = 0; b < ch->blanks; ++b)      fprintf(fp, "\n");    for (j = i; j < output->count; j++)	  {		  const EANYCODE_CHAR* unich = &output->text[j];		  if (ch->left != unich->left || ch->right != unich->right ||          ch->top != unich->top || ch->bottom != unich->bottom)			  break;		  unistr[j - i] = static_cast<unsigned char>(unich->char_code);	  }    unistr[j - i] = '\0';		      wchar_t *utf16ch=make_unicode_string(reinterpret_cast<const char*>(unistr));#ifndef _UNICODE    // If we aren't in _UNICODE mode, print string only if ascii.    if (ch->char_code <= 0x7f) {      fprintf(fp, "%s", unistr);#else    // %S is a microsoft-special. Attempts to translate the Unicode    // back to the current locale to print in 8 bit    fprintf(fp, "%S", utf16ch);#endif    // Print the hex codes of the utf8 code.    for (int x = 0; unistr[x] != '\0'; ++x)      fprintf(fp, "[%x]", unistr[x]);		fprintf(fp, "->");    // Print the hex codes of the unicode.    for (int y = 0; utf16ch[y] != 0; ++y)      fprintf(fp, "[%x]", utf16ch[y]);    // Print the coords.    fprintf(fp, "(%d,%d)->(%d,%d)\n",      ch->left, ch->bottom, ch->right, ch->top);    if (ch->formatting & 64)      fprintf(fp, "<nl>\n\n");    if (ch->formatting & 128)      fprintf(fp, "<para>\n\n");	  free(utf16ch);  }  fclose(fp);  return 0;}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?