ITEEDU

Tesseract-ocr的JNI测试

(1) 编写Main.java接口具体代码如下：

  import java.awt.Image;    
  import java.io.File;    
  import java.io.FileInputStream;    
  import java.io.FileNotFoundException;    
  import java.io.IOException;    
  import java.io.InputStream;    
  import javax.imageio.ImageIO;    
  /**
   * Minimal driver for the Tesseract JNI experiment: loads the native library,
   * reads a TIFF file and hands the data to the native {@link #OCR} method.
   */
  public class Main {
      static {
          // Looked up as libtesseractjni.so on the library search path.
          System.loadLibrary("tesseractjni");
      }

      /**
       * Native OCR entry point implemented in libtesseractjni.so.
       *
       * @param input  the image handed to the native side
       * @param output placeholder for the result; the recognised text is the return value
       * @param lang   Tesseract language pack name, e.g. "eng" or "chi_sim"
       * @return the text recognised on the image
       */
      public native String OCR(String input, String output, String lang);

      /**
       * Reads /home/administrator/5.tif and runs it through {@link #OCR} with the
       * Simplified Chinese language pack.
       *
       * @param args unused
       * @throws FileNotFoundException if the image file does not exist
       * @throws IOException if reading the image file fails
       */
      public static void main(String[] args) throws InstantiationException, IllegalAccessException, FileNotFoundException, IOException {
          Main m = new Main();
          StringBuilder out = new StringBuilder();
          // A constructor never yields null; a missing file surfaces as FileNotFoundException.
          InputStream inputFile = new FileInputStream(new File("/home/administrator/5.tif"));
          try {
              byte[] b = new byte[4096];
              // Bytes are decoded with the platform default charset, as before.
              for (int n; (n = inputFile.read(b)) != -1;) {
                  out.append(new String(b, 0, n));
              }
          } finally {
              // Close the stream even when read() throws.
              inputFile.close();
          }
          // NOTE(review): the C++ implementation shown in this article passes `input`
          // to fopen(), i.e. treats it as a file path, while this driver passes the
          // decoded file content - confirm which of the two is intended.
          String input = out.toString();
          String lang = "chi_sim";
          String output = "";
          m.OCR(input, output, lang);
      }
  }

该代码的功能是：由Java读入图片内容（以字符串形式），连同所要加载的语言包名称一起传给OCR接口；OCR接口识别图片上的文字，并返回output的值，即图片上的文字信息。

(2)编译Main.java

命令：javac Main.java

生成Main.class

(3)生成Main.h

命令:javah Main

Main.h的内容：

/* DO NOT EDIT THIS FILE - it is machine generated */  
 #include "jni.h"
/* Header for class Main */ 
#ifndef _Included_Main  
#define _Included_Main  
#ifdef __cplusplus  
extern "C" {  
#endif  
/* * Class:     Main  
 * Method:    OCR  
 * Signature: (Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;)Ljava/lang/String; 
   */  
JNIEXPORT jstring JNICALL Java_Main_OCR  
  (JNIEnv *, jobject, jstring, jstring, jstring);  
  #ifdef __cplusplus 
  } #endif  
#endif

(4)在eclipse中实现OCR接口函数

首先，确保tesseract-ocr在eclipse中配置正确；

其次，将jni.h和jni_md.h这两个文件从系统中所安装的java-jdk中拷贝出来，放到当前工程目录下边；

然后，将Main.h也拷贝到当前目录下边；

最后，编写OCR接口函数。

//测试，输入一个字符串，输出一个字符串的main函数的测试  
//input:为指向图片所在的绝对路径的字符指针；  
//output：为指向存放处理完毕后，所得字符串的指针；  
//返回output字符指针  
//lang参数在此初始化为“eng”,则为英文包；初始化为"chi_sim",则为汉语包  
//这里对output采用的是动态分配内存，由于存在版面等的分析，实际存储字符串所占用的空间并不多，造成内存分配浪费  
  //添加了jni机制，目标：用Main.class来调用libtesseractjni.so #include "Main.h"    
  #include <ctype.h>    
  #include "applybox.h"    
  #include "control.h"    
  #include "tessvars.h"    
  #include "tessedit.h"    
  #include "baseapi.h"    
  #include "thresholder.h"    
  #include "pageres.h"    
  #include "imgs.h"    
  #include "varabled.h"    
  #include "tprintf.h"    
  #include "tesseractmain.h"    
  #include "stderr.h"   
  #include "notdll.h"    
  #include "mainblk.h"    
  #include "output.h"    
  #include "globals.h"    
  #include "helpers.h"    
  #include "blread.h"    
  #include "tfacep.h"    
  #include "callnet.h"     
  #include "strings.h"    
  #include   "varable.h"    
  #include   "tessclas.h"    
  #include   "notdll.h"    
  #ifdef USING_GETTEXT    
  #include <libintl.h>    
  #include <locale.h>    
  #define _(x) gettext(x)    
  #else    
  #define _(x) (x)    
  #endif    
  #ifdef HAVE_LIBTIFF    
  #include "tiffio.h"    
  #endif    
  #ifdef HAVE_LIBLEPT    
  #include "allheaders.h"    
  #else    
  class Pix;    
  #endif    
  #ifdef _TIFFIO_    
  void read_tiff_image(TIFF* tif, IMAGE* image);    
  #endif    
  // Size of the char buffer used below to format a page number as text (page_str).
  const int kMaxIntSize = 22;
  // EXTERN is defined as empty before the variable macros are used.
  // NOTE(review): presumably this makes BOOL_VAR/INT_VAR define the variables in
  // this translation unit instead of declaring them extern - confirm in varable.h.
  #define EXTERN
  // Output-format switches read by TesseractImage().
  BOOL_VAR(tessedit_create_boxfile, FALSE, "Output text with boxes");
  BOOL_VAR(tessedit_create_hocr, FALSE, "Output HTML with hOCR markup");
  // Not referenced by the code in this listing.
  BOOL_VAR(tessedit_read_image, TRUE, "Ensure the image is read");
  // 0 selects whole-page recognition; any other value selects the UNLV zone-file
  // path in TesseractImage(), and 1 additionally clears the adaptive classifier
  // after every zone.
  INT_VAR(tessedit_serial_unlv, 0,
  "0->Whole page, 1->serial no adapt, 2->serial with adapt");
  // Negative means "all pages"; Java_Main_OCR starts at page 0 in that case.
  INT_VAR(tessedit_page_number, -1,
          "-1 -> All pages, else specific page to process");
  // Only referenced from commented-out code in TesseractImage().
  BOOL_VAR(tessedit_write_images, FALSE, "Capture the image from the IPE");
  // Not referenced by the code in this listing.
  BOOL_VAR(tessedit_debug_to_screen, FALSE, "Dont use debug file");
  /* 
  convert the input_file into the STRING*,and put it into the text_out 
    */ 
  void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index, 
   	                    tesseract::TessBaseAPI* api, STRING* text_out) { 
    	  api->SetInputName(input_file); 
  #ifdef HAVE_LIBLEPT 
  if (pix != NULL) {    
  	    api->SetImage(pix);    
  	  } else {    
  	#endif    
  	    int bytes_per_line = check_legal_image_size(image->get_xsize(),    
  	                                                image->get_ysize(),    
  	                                                image->get_bpp());    
  	    api->SetImage(image->get_buffer(), image->get_xsize(), image->get_ysize(),    
  	                  image->get_bpp() / 8, bytes_per_line);    
  	#ifdef HAVE_LIBLEPT    
  	  }    
  	#endif    
  	  if (tessedit_serial_unlv == 0) {   
  	    char* text;    
  	    if (tessedit_create_boxfile)    
  	      text = api->GetBoxText(page_index);    
  	    else if (tessedit_write_unlv)    
  	      text = api->GetUNLVText();    
  	    else if (tessedit_create_hocr)    
  	      text = api->GetHOCRText(page_index + 1);    
  	    else    
  	      text = api->GetUTF8Text();    
  	    *text_out += text;    
  	    delete [] text;    
  	  } else {    
  	    BLOCK_LIST blocks;    
  	    STRING filename = input_file;    
  	    const char* lastdot = strrchr(filename.string(), '.');    
  	    if (lastdot != NULL) {    
  	      filename[lastdot - filename.string()] = '\0';
    	}    
  	    if (!read_unlv_file(filename, image->get_xsize(), image->get_ysize(),    
  	                        &blocks)) {    
  	      fprintf(stderr, _("Error: Must have a unlv zone file %s to read!\n"),    
  	              filename.string());    
  	      return;    
  	    }    
  	    BLOCK_IT b_it = &blocks;    
  	    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {    
  	      BLOCK* block = b_it.data();    
  	      TBOX box = block->bounding_box();    
  	      api->SetRectangle(box.left(), image->get_ysize() - box.top(),    
  	                        box.width(), box.height());    
  	      char* text = api->GetUNLVText();    
  	      *text_out += text;    
  	      delete [] text;    
  	      if (tessedit_serial_unlv == 1)    
  	        api->ClearAdaptiveClassifier();    
  	    } 
  	  }    
  	 /* if (tessedit_write_images) {    
  	    page_image.write("tessinput.tif");    
  	  }*/    
  }    
  /*end TesseractImage*/    
  /*Java_Main_OCR Native funciton   
   * */    
  JNIEXPORT jstring JNICALL Java_Main_OCR(JNIEnv *env, jobject jo, jstring jinput, jstring joutput, jstring jlang){ 
  	const char *input;    
  	char *output;    
  	const char *lang;    
  	input = env->GetStringUTFChars(jinput, 0);    
  	//output =env->GetStringUTFChars(joutput, 0);    
  	lang =  env->GetStringUTFChars(jlang, 0);    
  	tesseract::TessBaseAPI  api; 
  	api.Init(input,lang, 0, 0, false); 
  	IMAGE image;    
  	STRING text_out;    
  	int page_number = tessedit_page_number;    
  	if (page_number < 0){    
  	    page_number = 0;    
  	}    
  	FILE* fp = fopen(input, "rb");    
  	if (fp == NULL) {    
  	    tprintf(_("Image file %s cannot be opened!\n"),input);    
  	    fclose(fp);    
  	    exit(1);    
  	}    
  	#ifdef HAVE_LIBLEPT    
  	  int page = page_number;    
  	  int npages = 0;    
  	  bool is_tiff = fileFormatIsTiff(fp);    
  	  if (is_tiff) {    
  	    int tiffstat = tiffGetCount(fp, &npages);    
  	    if (tiffstat == 1) {    
  	      fprintf (stderr, _("Error reading file %s!\n"),input);    
  	      fclose(fp);    
  	      exit(1);    
  	    }    
  	    //fprintf (stderr, "%d pages\n", npages);    
  	  }    
  	  fclose(fp);    
  	  fp = NULL;    
  	  Pix *pix;    
  	    if (is_tiff) {    
  	      for (; (pix = pixReadTiff(input, page)) != NULL; ++page) {    
  	        if (page > 0)    
  	          tprintf(_("Page %d\n"), page);    
  	        char page_str[kMaxIntSize];    
  	        snprintf(page_str, kMaxIntSize - 1, "%d", page);    
  	        api.SetVariable("applybox_page", page_str);    
  	        // Run tesseract on the page!    
  	        TesseractImage(input, NULL, pix, page, &api, &text_out);    
  	        pixDestroy(&pix);    
  	        if (tessedit_page_number >= 0 || npages == 1) {    
  	          break;    
  	        }    
  	      } 
  	    }    
  	    else {    
  	    	      // The file is not a tiff file, so use the general pixRead function. 
  	    	      // If the image fails to read, try it as a list of filenames.    
  	    	      PIX* pix = pixRead(input);    
  	    	      if (pix == NULL) {    
  	    	        FILE* fimg = fopen(input, "r");    
  	    	        if (fimg == NULL) {    
  	    	          tprintf(_("File %s cannot be opened!\n"), input);    
  	    	          fclose(fimg);    
  	    	          exit(1);    
  	    	        }    
  	    	        char filename[MAX_PATH];    
  	    	        while (fgets(filename, sizeof(filename), fimg) != NULL) {    
  	    	          chomp_string(filename);    
  	    	          pix = pixRead(filename);    
  	    	          if (pix == NULL) {    
  	    	            tprintf(_("Image file %s cannot be read!\n"), filename);   
  	    	            fclose(fimg);    
  	    	            exit(1);    
  	    	          }    
  	    	          tprintf(_("Page %d : %s\n"), page, filename);    
  	    	          TesseractImage(filename, NULL, pix, page, &api, &text_out);    
  	    	          pixDestroy(&pix);    
  	    	          ++page;    
  	    	        }    
  	    	        fclose(fimg);    
  	    	      } else {    
  	    	        TesseractImage(input, NULL, pix, 0, &api, &text_out);    
  	    	        pixDestroy(&pix);    
  	    	      }    
  	    	    }    
  	    #else    
  	    #ifdef _TIFFIO_    
  	      int len = strlen(input);    
  	      TIFF* archive = NULL;   
  	        do {    
  	          // Since libtiff keeps all read images in memory we have to close the    
  	          // file and reopen it for every page, and seek to the appropriate page.    
  	          if (archive != NULL)    
  	            TIFFClose(archive);    
  	          archive = TIFFOpen(input, "r");    
  	          if (archive == NULL) {    
  	            tprintf(_("Read of file %s failed.\n"), input);    
  	            exit(1);    
  	          } 
             if (page_number > 0)    
  	            tprintf(_("Page %d\n"), page_number);    
  	          // Seek to the appropriate page.    
  	          for (int i = 0; i < page_number; ++i) {    
  	            TIFFReadDirectory(archive);    
  	          }    
  	          char page_str[kMaxIntSize];    
  	          snprintf(page_str, kMaxIntSize - 1, "%d", page_number);    
  	          api.SetVariable("applybox_page", page_str);    
  	          // Read the current page into the Tesseract image.    
  	          IMAGE image;    
  	          read_tiff_image(archive, &image);    
  	          // Run tesseract on the page!    
  	          TesseractImage(input, &image, NULL, page_number, &api, &text_out); 
   	      ++page_number;    
  	        // Do this while there are more pages in the tiff file.    
  	        } while (TIFFReadDirectory(archive) && 
     	                 (page_number <= tessedit_page_number || tessedit_page_number < 0));    
  	        TIFFClose(archive);    
  	      } else {    
  	    #endif  
   
      // Using built-in image library to read bmp, or tiff without libtiff.    
      if (image.read_header(input) < 0) {    
        tprintf(_("Read of file %s failed.\n"), input);    
        exit(1);    
      }    
      if (image.read(image.get_ysize ()) < 0)    
        MEMORY_OUT.error("error", EXIT, _("Read of image %s"), input);    
      invert_image(&image);    
      TesseractImage(input, &image, NULL, 0, &api, &text_out);    
  #ifdef _TIFFIO_    
    }    
    delete[] ext;    
  #endif    
  #endif  // HAVE_LIBLEPT   
       output =(char*)malloc(strlen(text_out.string())+1);    
       memset(output,0,sizeof(output));   
       strcpy(output,text_out.string());    
       joutput = env->NewStringUTF(output);    
       free(output);    
     /* while(output!=NULL){    
            	printf("%c",*output);    
            	output++;    
      }*/    
     return joutput;    //Normal exit    
  }    
  /* * end Java_Main_OCR */

（5）编译OCR

第一：在命令行去操作：

g++ -fPIC -D_REENTRANT -I/usr/lib/jvm/java-1.5.0-sun-1.5.0.19/include -I/usr/lib/jvm/java-1.5.0-sun-1.5.0.19/include/linux -c tesseractmain.cpp

或者

g++ -fPIC -D_REENTRANT -I/usr/lib/jvm/java-6-openjdk/include -c tesseractmain.cpp

其中，-I所加载的路径，是jni.h和jni_md.h所在的目录。

最终生成tesseractmain.o文件。

第二：在eclipse中操作：

需要将jni.h和jni_md.h以及Main.h均放在所测试的工程目录下。

直接build即可，生成.o文件。

（6）利用.o文件生成libtesseractjni.so文件

     g++ -shared tesseractmain.o -o libtesseractjni.so

至此，之后，我们所需要的libtesseractjni.so生成。

确保libtesseractjni.so文件的路径添加到LD_LIBRARY_PATH中。

运行Main.class，命令：java Main

注意：在Main.java中，System.loadLibrary("tesseractjni");，其中添加绝对路径，或者添加.so文件的全称都无法识别，这一点与在eclipse下的Lib寻找路径类似。

这样设置之后，虽然可以找到libtesseractjni.so文件，但加载时却无法解析其中所引用的Tesseract符号（见下文“问题总结”中的undefined symbol错误）：上面的g++ -shared命令只链接了tesseractmain.o，并没有把Tesseract自身的库链接进来。

当然，在失败之前，还进行了其他版本的测试：

Main.java函数：

  import java.awt.Image; 
  import java.io.File; 
  import java.io.FileInputStream; 
  import java.io.FileNotFoundException; 
  import java.io.IOException; 
  import java.io.InputStream; 
  import javax.imageio.ImageIO; 
   
  /**
   * Second variant of the JNI test driver: the native method additionally takes
   * the image geometry (bytes per pixel / per line and the rectangle to process).
   */
  public class Main {
      static {
          System.loadLibrary("tesseractjni");
          // Alternative that takes an absolute path to the library file:
          // System.load("/home/administrator/workspace/testorc2/libtesseractjni.so");
      }

      /**
       * Native OCR entry point implemented in libtesseractjni.so.
       *
       * @param input           the image data handed to the native side
       * @param output          not read by the native implementation in this article
       * @param lang            not read by the native implementation in this article
       * @param bytes_per_pixel bytes per pixel; 0 for a binary stream
       * @param bytes_per_line  bytes per image line
       * @param left            left edge of the rectangle to recognise
       * @param top             top edge of the rectangle to recognise
       * @param width           width of the rectangle to recognise
       * @param height          height of the rectangle to recognise
       * @return the text recognised inside the rectangle
       */
      public native String OCR(String input,String output,String lang,int bytes_per_pixel,int bytes_per_line,int left,int top,int width, int height);

      /**
       * Reads /home/administrator/5.tif and passes its content to {@link #OCR}
       * with fixed geometry values.
       *
       * @param args unused
       * @throws FileNotFoundException if the image file does not exist
       * @throws IOException if reading the image file fails
       */
      public static void main(String[] args) throws InstantiationException, IllegalAccessException, FileNotFoundException, IOException {
          Main m = new Main();
          StringBuilder out = new StringBuilder();
          // A constructor never yields null; a missing file surfaces as FileNotFoundException.
          InputStream inputFile = new FileInputStream(new File("/home/administrator/5.tif"));
          try {
              byte[] b = new byte[4096];
              // Bytes are decoded with the platform default charset, as before.
              for (int n; (n = inputFile.read(b)) != -1;) {
                  out.append(new String(b, 0, n));
              }
          } finally {
              // Close the stream even when read() throws.
              inputFile.close();
          }
          String input = out.toString();

          int bytes_per_pixel = 0;   // 0 for a binary stream
          int bytes_per_line = 50;
          int left = 0;
          int top = 0;
          int width = 1024;
          int height = 800;

          // The same string is passed for input, output and lang, as in the original test.
          m.OCR(input, input, input, bytes_per_pixel, bytes_per_line, left, top, width, height);
      }
  }

Main.h函数

/* DO NOT EDIT THIS FILE - it is machine generated */ 
  #include "jni.h" 
  /* Header for class Main */ 
   
  #ifndef _Included_Main 
  #define _Included_Main 
  #ifdef __cplusplus 
  extern "C" { 
  #endif 
  /* 
   * Class:     Main 
   * Method:    OCR 
   * Signature: (Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IIIIII)Ljava/lang/String; 
   */ 
  JNIEXPORT jstring JNICALL Java_Main_OCR 
    (JNIEnv *, jobject, jstring, jstring, jstring, jint, jint, jint, jint, jint, jint); 
   
  #ifdef __cplusplus 
  } 
  #endif 
  #endif

ocr.cpp函数

/*测试，输入一个字符串，输出一个字符串的main函数的测试

input:为指向图片所在的绝对路径的字符指针；

output：为指向存放处理完毕后，所得字符串的指针；

返回output字符指针

lang参数在此初始化为“eng”,则为英文包；初始化为"chi_sim",则为汉语包

这里对output采用的是动态分配内存，由于存在版面等的分析，实际存储字符串所占用的空间并不多，造成内存分配浪费

添加了jni机制，目标：用Main.class来调用libtesseractjni.so*/

#include "Main.h" 
  #include "tessedit.h" 
  #include "baseapi.h" 
  #include "tesseractmain.h" 
  #include "strings.h" 
   
   
  /* 
  convert the input_file into the STRING*,and put it into the text_out 
   */ 
  JNIEXPORT jstring JNICALL Java_Main_OCR(JNIEnv *env, jobject jo, jstring jinput, jstring joutput, jstring jlang,jint jbytes_per_pixel,jint jbytes_per_line,jint jleft,jint jtop,jint jwidth,jint jheight){ 
   
     const unsigned char* imagedata; 
     int bytes_per_pixel = jbytes_per_pixel;//二进制流时，为0 
     int bytes_per_line = jbytes_per_line; 
     int left = jleft;//默认为0 
     int top = jtop;//默认为0 
     int width = jwidth; 
     int height = jheight; 
     tesseract::TessBaseAPI* api; 
     char *input = (char *)malloc(strlen(env->GetStringUTFChars(jinput, 0))+1); 
     strcpy(input,env->GetStringUTFChars(jinput, 0)); 
     imagedata = (unsigned char*)input; 
     char *output; 
     output = (char*)malloc(strlen(api->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width, height))); 
      //output=api->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width, height); 
      strcpy(output,api->TesseractRect(imagedata, bytes_per_pixel, bytes_per_line, left, top, width, height)); 
      //编码格式是UTF8 
      joutput = env->NewStringUTF(output); 
      free(input); 
      free(output); 
      return joutput; 
   }

问题总结

1.测试运行时的错误

java Main时，一直：

java Main 
   
  Exception in thread "main" java.lang.UnsatisfiedLinkError: /home/administrator/workspace/testorc2/libtesseractjni.so: /home/administrator/workspace/testorc2/libtesseractjni.so: undefined symbol: _ZN9tesseract11TessBaseAPI12SetInputNameEPKc 
  	at java.lang.ClassLoader$NativeLibrary.load(Native Method) 
  	at java.lang.ClassLoader.loadLibrary0(ClassLoader.java:1750) 
  	at java.lang.ClassLoader.loadLibrary(ClassLoader.java:1675) 
  	at java.lang.Runtime.loadLibrary0(Runtime.java:840) 
  	at java.lang.System.loadLibrary(System.java:1047) 
  	at Main.<clinit>(Main.java:10) 
  Could not find the main class: Main. Program will exit.

2.代码编写中的不足

在所实现的native函数中JNIEXPORT jstring JNICALL Java_Main_OCR(JNIEnv *env, jobject jo, jstring jinput, jstring joutput, jstring jlang)；

参数jinput转换成了只读的const char* input,jlang也相应做了转换；

参数joutput由于要指向一个字符串，且作为返回值，需要在程序中给其分配内存：

在测试工程中所采用的两种分配方式的测试结果都不是很理想。

3.最主要的是GCC命令行选项

JNI编译时，需要注意GCC或者G++命令行的参数：

（1）-shared 该选项指定生成动态连接库（让连接器生成T类型的导出符号表，有时候也生成弱连接W类型的导出符号），不用该标志外部程序无法连接。相当于一个可执行文件

（2）-fPIC：表示编译为位置独立的代码，不用此选项的话编译后的代码是位置相关的所以动态载入时是通过代码拷贝的方式来满足不同进程的需要，而不能达到真正代码段共享的目的。

（3）-L.：表示要连接的库在当前目录中

（4）-ltest：编译器查找动态连接库时有隐含的命名规则，即在给出的名字前面加上lib，后面加上.so来确定库的名称

（5）LD_LIBRARY_PATH：这个环境变量指示动态连接器可以装载动态库的路径。

当然如果有root权限的话，可以修改/etc/ld.so.conf文件，然后调用 /sbin/ldconfig来达到同样的目的，不过如果没有root权限，那么只能采用输出LD_LIBRARY_PATH的方法了。

JNI调用动态库时的问题

调用动态库的时候有几个问题会经常碰到，有时，明明已经将库的头文件所在目录通过 “-I” include进来了，库所在文件通过 “-L”参数引导，并指定了“-l”的库名，但通过ldd命令察看时，就是死活找不到你指定链接的so文件，这时你要作的就是通过修改LD_LIBRARY_PATH或者/etc/ld.so.conf文件来指定动态库的目录。通常这样做就可以解决库无法链接的问题了。