ITEEDU

Tesseract-ocr项目功能模块的实现

在测试过程中，由于还处于摸索和学习的阶段，图片文字识别功能的实现先后经历了好几个版本，下面简单地加以讲解。

第一阶段：通过命令端测试

安装完毕后，需要设置好环境变量。我建立了一个ocr.sh文件，每次打开新终端后执行一次 source ocr.sh 即可。

#!/bin/bash
# Environment setup for a Tesseract install under /home/administrator/tesseract-ocr.
# Load it into the current shell with:  source ocr.sh
# Prepend the install's bin directory so the `tesseract` command is found.
export PATH="/home/administrator/tesseract-ocr/bin${PATH:+:$PATH}"
# Prepend the install's lib directory so its shared libraries are found at run time.
# The ${VAR:+...} form avoids a trailing ':' (which would mean "current directory") when the variable is unset.
export LD_LIBRARY_PATH="/home/administrator/tesseract-ocr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

在终端输入

tesseract   picture1.tif  5   -l   chi_sim

即可将图片picture1.tif中的文字信息提取出来，并放在5.txt文件中。

第二阶段：结果输出到终端

在测试阶段，我在eclipse中修改了tesseract-ocr源码中的tesseractmain.cpp文件，得到了自己的程序代码，具体代码如下：

//读取图片，分析图片，提取其中的文字内容，输出到终端上来
  #include <ctype.h>
  #include "applybox.h"
  #include "control.h"
  #include "tessvars.h"
  #include "tessedit.h"
  #include "baseapi.h"
  #include "thresholder.h"
  #include "pageres.h"
  #include "imgs.h"
  #include "varabled.h"
  #include "tprintf.h"
  #include "tesseractmain.h"
  #include "stderr.h"
  #include "notdll.h"
  #include "mainblk.h"
  #include "output.h"
  #include "globals.h"
  #include "helpers.h"
  #include "blread.h"
  #include "tfacep.h"
  #include "callnet.h"
  #include "strings.h"
  #include   "varable.h"
  #include   "tessclas.h"
  #include   "notdll.h"
  #ifdef USING_GETTEXT
  #include <libintl.h>
  #include <locale.h>
  #define _(x) gettext(x)
  #else
  #define _(x) (x)
  #endif
  #ifdef HAVE_LIBTIFF
  #include "tiffio.h"
  #endif
  #ifdef HAVE_LIBLEPT
  #include "allheaders.h"
  #else
  class Pix;
  #endif
  #ifdef _TIFFIO_
  void read_tiff_image(TIFF* tif, IMAGE* image);
  #endif
  // Size of the char buffer that Ocr() uses to format a page number as text.
  const int kMaxIntSize = 22;
  char szAppName[] = "Tessedit";   //app name
  // EXTERN is defined as empty ahead of the variable macros below.
  // NOTE(review): presumably this makes the *_VAR macros emit definitions
  // instead of extern declarations -- confirm against varable.h.
  #define EXTERN
  // When set, TesseractImage() returns the result of GetBoxText().
  BOOL_VAR(tessedit_create_boxfile, FALSE, "Output text with boxes");
  // When set (and no earlier output mode applies), TesseractImage() returns
  // the result of GetHOCRText().
  BOOL_VAR(tessedit_create_hocr, FALSE, "Output HTML with hOCR markup");
  // Not read anywhere in the code shown here.
  BOOL_VAR(tessedit_read_image, TRUE, "Ensure the image is read");
  // 0 makes TesseractImage() recognize the whole page in one call; any other
  // value makes it process the blocks read from a UNLV zone file one by one,
  // and the value 1 additionally clears the adaptive classifier after each block.
  INT_VAR(tessedit_serial_unlv, 0,
  "0->Whole page, 1->serial no adapt, 2->serial with adapt");
  // Page at which Ocr() starts; a negative value is treated as page 0 and
  // lets the multi-page loops continue past the first page.
  INT_VAR(tessedit_page_number, -1,
  "-1 -> All pages, else specific page to process");
  // When set, TesseractImage() writes page_image to "tessinput.tif".
  BOOL_VAR(tessedit_write_images, FALSE, "Capture the image from the IPE");
  // Not read anywhere in the code shown here.
  BOOL_VAR(tessedit_debug_to_screen, FALSE, "Dont use debug file");

/*
  TesseractImage: recognizes one page and appends its text to *text_out.

  input_file  name handed to the API via SetInputName(); in UNLV zone mode it
              is also the zone file name once its last extension is removed.
  image       page as a Tesseract IMAGE. Must be non-NULL unless the build has
              HAVE_LIBLEPT and pix is supplied.
  pix         page as a Leptonica Pix (only used when HAVE_LIBLEPT is
              defined); takes precedence over image when non-NULL.
  page_index  zero-based page number forwarded to the box/hOCR text getters.
  api         an already initialized TessBaseAPI.
  text_out    receives the recognized text (appended, never cleared).

  The output format is chosen by the tessedit_* variables. When
  tessedit_serial_unlv is non-zero the page is processed block by block using
  the zones read by read_unlv_file(); if that call fails an error is printed
  and nothing is appended.
  */

  void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
                      tesseract::TessBaseAPI* api, STRING* text_out) {
    api->SetInputName(input_file);

    // Page size in pixels, captured from whichever representation was
    // supplied. The zone branch below used to read it from |image|, which is
    // NULL for every caller that passes a Pix.
    int page_width = 0;
    int page_height = 0;
  #ifdef HAVE_LIBLEPT
    if (pix != NULL) {
      api->SetImage(pix);
      page_width = pixGetWidth(pix);
      page_height = pixGetHeight(pix);
    } else {
  #endif
      page_width = image->get_xsize();
      page_height = image->get_ysize();
      int bytes_per_line = check_legal_image_size(page_width, page_height,
                                                  image->get_bpp());
      // get_bpp() is bits per pixel; SetImage() wants bytes per pixel.
      api->SetImage(image->get_buffer(), page_width, page_height,
                    image->get_bpp() / 8, bytes_per_line);
  #ifdef HAVE_LIBLEPT
    }
  #endif

    if (tessedit_serial_unlv == 0) {
      // Whole-page mode: one call, format picked by the first flag that is set.
      char* text;
      if (tessedit_create_boxfile)
        text = api->GetBoxText(page_index);
      else if (tessedit_write_unlv)
        text = api->GetUNLVText();
      else if (tessedit_create_hocr)
        text = api->GetHOCRText(page_index + 1);
      else
        text = api->GetUTF8Text();
      // The getters hand back a new[] buffer, or NULL when recognition fails.
      if (text != NULL) {
        *text_out += text;
        delete [] text;
      }
    } else {
      // Zone mode: the zone file name is the input name without its extension.
      BLOCK_LIST blocks;
      STRING filename = input_file;
      const char* lastdot = strrchr(filename.string(), '.');
      if (lastdot != NULL) {
        filename[lastdot - filename.string()] = '\0';
      }
      if (!read_unlv_file(filename, page_width, page_height, &blocks)) {
        fprintf(stderr, _("Error: Must have a unlv zone file %s to read!\n"),
                filename.string());
        return;
      }
      BLOCK_IT b_it = &blocks;
      for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
        BLOCK* block = b_it.data();
        TBOX box = block->bounding_box();
        // The rectangle's top is given as page height minus the box top.
        api->SetRectangle(box.left(), page_height - box.top(),
                          box.width(), box.height());
        char* text = api->GetUNLVText();
        if (text != NULL) {
          *text_out += text;
          delete [] text;
        }
        if (tessedit_serial_unlv == 1)
          api->ClearAdaptiveClassifier();
      }
    }

    if (tessedit_write_images) {
      page_image.write("tessinput.tif");
    }
  }
/*end TesseractImage*/
/*
  Ocr: recognizes the text in the image file named by input.

  input   path of the image. In a HAVE_LIBLEPT build, a non-TIFF file that
          pixRead() cannot decode is read as a text file listing one image
          path per line.
  output  ignored; kept only so existing call sites keep compiling.
  lang    language passed to TessBaseAPI::Init(); NULL selects "eng".

  Returns a NUL-terminated copy of the recognized text. The buffer belongs to
  this function and stays valid until the next call, so callers must not free
  it. A file that cannot be opened or read ends the process with exit(1).
  */
  #if !defined(HAVE_LIBLEPT) && defined(_TIFFIO_)
  // Returns true when name ends in ".tif" or ".tiff", ignoring case.
  static bool HasTiffExtension(const char* name) {
    const char* dot = strrchr(name, '.');
    if (dot == NULL)
      return false;
    const char* ext = dot + 1;
    const size_t ext_len = strlen(ext);
    if (ext_len != 3 && ext_len != 4)
      return false;
    // "tif" is a prefix of "tiff", so one reference string covers both.
    static const char kTiff[] = "tiff";
    for (size_t i = 0; i < ext_len; ++i) {
      if (tolower(static_cast<unsigned char>(ext[i])) != kTiff[i])
        return false;
    }
    return true;
  }
  #endif

  char* Ocr(const char *input,char *output,const char* lang){
    (void)output;  // see header comment: the result is returned, not written here
    // Honour the caller's language; fall back to English only when none is given.
    if (lang == NULL)
      lang = "eng";
    tesseract::TessBaseAPI  api;
    api.Init(input,lang, 0, 0, false);
    STRING text_out;
    int page_number = tessedit_page_number;
    if (page_number < 0){
      page_number = 0;
    }
    FILE* fp = fopen(input, "rb");
    if (fp == NULL) {
      // Nothing to close here: fclose() on a NULL stream is undefined.
      tprintf(_("Image file %s cannot be opened!\n"),input);
      exit(1);
    }
  #ifdef HAVE_LIBLEPT
    int page = page_number;
    int npages = 0;
    bool is_tiff = fileFormatIsTiff(fp);
    if (is_tiff) {
      int tiffstat = tiffGetCount(fp, &npages);
      if (tiffstat == 1) {
        fprintf (stderr, _("Error reading file %s!\n"),input);
        fclose(fp);
        exit(1);
      }
    }
    fclose(fp);
    fp = NULL;
    if (is_tiff) {
      // Read page after page until pixReadTiff() runs out, or stop after one
      // page when a specific page was requested or the file has a single page.
      Pix *pix;
      for (; (pix = pixReadTiff(input, page)) != NULL; ++page) {
        if (page > 0)
          tprintf(_("Page %d\n"), page);
        char page_str[kMaxIntSize];
        snprintf(page_str, kMaxIntSize - 1, "%d", page);
        api.SetVariable("applybox_page", page_str);
        // Run tesseract on the page!
        TesseractImage(input, NULL, pix, page, &api, &text_out);
        pixDestroy(&pix);
        if (tessedit_page_number >= 0 || npages == 1) {
          break;
        }
      }
    } else {
      // The file is not a tiff file, so use the general pixRead function.
      // If the image fails to read, try it as a list of filenames.
      PIX* pix = pixRead(input);
      if (pix == NULL) {
        FILE* fimg = fopen(input, "r");
        if (fimg == NULL) {
          tprintf(_("File %s cannot be opened!\n"), input);
          exit(1);
        }
        char filename[MAX_PATH];
        while (fgets(filename, sizeof(filename), fimg) != NULL) {
          chomp_string(filename);
          pix = pixRead(filename);
          if (pix == NULL) {
            tprintf(_("Image file %s cannot be read!\n"), filename);
            fclose(fimg);
            exit(1);
          }
          tprintf(_("Page %d : %s\n"), page, filename);
          TesseractImage(filename, NULL, pix, page, &api, &text_out);
          pixDestroy(&pix);
          ++page;
        }
        fclose(fimg);
      } else {
        TesseractImage(input, NULL, pix, 0, &api, &text_out);
        pixDestroy(&pix);
      }
    }
  #else
    // The readers below open the file themselves; the handle above only
    // proved that the file can be opened, so release it now.
    fclose(fp);
    fp = NULL;
    bool use_builtin_reader = true;
  #ifdef _TIFFIO_
    if (HasTiffExtension(input)) {
      use_builtin_reader = false;
      TIFF* archive = NULL;
      do {
        // Since libtiff keeps all read images in memory we have to close the
        // file and reopen it for every page, and seek to the appropriate page.
        if (archive != NULL)
          TIFFClose(archive);
        archive = TIFFOpen(input, "r");
        if (archive == NULL) {
          tprintf(_("Read of file %s failed.\n"), input);
          exit(1);
        }
        if (page_number > 0)
          tprintf(_("Page %d\n"), page_number);
        // Seek to the appropriate page.
        for (int i = 0; i < page_number; ++i) {
          TIFFReadDirectory(archive);
        }
        char page_str[kMaxIntSize];
        snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
        api.SetVariable("applybox_page", page_str);
        // Read the current page into the Tesseract image.
        IMAGE image;
        read_tiff_image(archive, &image);
        // Run tesseract on the page!
        TesseractImage(input, &image, NULL, page_number, &api, &text_out);
        ++page_number;
        // Do this while there are more pages in the tiff file.
      } while (TIFFReadDirectory(archive) &&
               (page_number <= tessedit_page_number || tessedit_page_number < 0));
      TIFFClose(archive);
    }
  #endif
    if (use_builtin_reader) {
      // Using built-in image library to read bmp, or tiff without libtiff.
      IMAGE image;
      if (image.read_header(input) < 0) {
        tprintf(_("Read of file %s failed.\n"), input);
        exit(1);
      }
      if (image.read(image.get_ysize ()) < 0)
        MEMORY_OUT.error("error", EXIT, _("Read of image %s"), input);
      invert_image(&image);
      TesseractImage(input, &image, NULL, 0, &api, &text_out);
    }
  #endif  // HAVE_LIBLEPT

    // Copy the text into a buffer sized to fit and owned by this function.
    // A local array would dangle after return, and a fixed 900-byte buffer
    // overflows on longer pages.
    static char* result = NULL;
    delete [] result;
    const char* recognized = text_out.string();
    const size_t length = strlen(recognized);
    result = new char[length + 1];
    memcpy(result, recognized, length + 1);
    return result;                      //Normal exit
  }
// main: runs Ocr() on a fixed sample image and prints the recognized text to
// standard output. Command-line arguments are not used.
  int main(int argc,char ** argv){
    const char* input="/home/administrator/donate.tif";
    char * outs= 0;
    outs=Ocr(input,outs,"eng");
    // Print up to the terminating NUL. The previous loop tested the pointer
    // itself against NULL, which never becomes false while incrementing, so
    // it ran off the end of the text.
    if (outs != NULL)
      printf("%s", outs);
    return 0;
  }

第三阶段：通过其他途径传递过来的参数

在最后一个阶段，终于实现了我们想要的功能，而且只需要简单的几行代码：

//初始化，读取图片，生成image对象，通过image对象获取图片的相关参数，调用提取图片文字信息的接口函数
  #include "tessedit.h"
  #include "baseapi.h"
  #include "imgs.h"
  #include "varabled.h"
  #include "tprintf.h"
  #include "tesseractmain.h"
  #include "stderr.h"
  #include "tessvars.h"
  #include <MagickWand.h>
  //#include  <MagickCore.h>
  #include "convert.h"
#define _(x) (x)
  char *ocr(char *input,char *output)
  {
  tesseract::TessBaseAPI  api;
  const char* lang = "eng";          //eng为英文包，chi_sim为汉文包 
  api.Init("/tmp",lang, 0, 0, false);//init the  language
  api.SetPageSegMode(tesseract::PSM_AUTO);//设置自动进行版面分析 
  IMAGE image;  //在这里只是为了测试，所以还是从读取图片开始 
  if (image.read_header(input) < 0) {//读取文件中的元信息 
  tprintf(_("Read of file %s  failed.\n"), input);
  exit(1);
  }
  if (image.read(image.get_ysize ()) <  0)
  MEMORY_OUT.error("test", EXIT, _("Read of image  %s"),  input);
  invert_image(&image);
//图片读取结束，获取调用ocr接口所需要的参数 
  const unsigned char* imagedata =  image.get_buffer();
  int bits_per_pixel =  image.get_bpp()/8;
  int bytes_per_line = check_legal_image_size(image.get_xsize(),  image.get_ysize(),
  image.get_bpp());
  int xsize =  image.get_xsize();
  int ysize =  image.get_ysize();
  //所需要的参数中left和top为0，代表从最左，最顶开始转换图片 
  //get.bpp(),return bits  per pixel
  //get.bpp()/8,return bytes per pixel
  //调用ocr接口函数，并输出我们需要的字符串 
  output=api.TesseractRect(imagedata,  bits_per_pixel,
  bytes_per_line, 0, 0,
  xsize,  ysize);
  return output;
  }
  // main: recognizes a fixed sample bitmap with ocr() and prints the text.
  // Command-line arguments are not used.
  int main(int argc,char **argv) {
    // A writable array instead of a pointer to the literal: ocr() takes a
    // non-const char*, and converting a string literal to char* is not valid
    // in C++11 and later.
    char input[] = "/home/administrator/tmp.bmp";
    char *output =NULL;
    output = ocr(input,output);
    // Passing NULL to printf's %s is undefined, so print only a real result.
    if (output != NULL)
      printf("%s",output);
    // ocr() returns the buffer obtained from TesseractRect(), which is
    // released with delete[]; delete[] on NULL is a no-op.
    delete []output;
    return 0;                      //Normal exit
  }
  /* Prototype of the API call used by ocr(), for reference:
  * char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
  int bytes_per_pixel,
  int bytes_per_line,
  int left, int top,
  int width, int height); */

tesseract-ocr源码官方网站

http://tesseract-ocr.googlecode.com/svn/trunk/api/
有用的api函数的实现，呵呵
http://tesseract-ocr.repairfaq.org/main.html
其他api查找首页