站长原创,版权所有ITEEDU,2011-07-16
在进行测试的过程中,由于处于摸索和学习的阶段,所以在实现图片文字识别的过程中,曾经历了好几个版本,下面简单地加以讲解。
#!/bin/bash
export PATH=/home/administrator/tesseract-ocr/bin:$PATH
export LD_LIBRARY_PATH=/home/administrator/tesseract-ocr/lib:$LD_LIBRARY_PATH
在终端输入
tesseract picture1.tif 5 -l chi_sim
即可将图片picture1.tif中的文字信息提取出来,并放在5.txt文件中。
在测试阶段,我在eclipse中修改了tesseract-ocr源码中的tesseractmain.cpp文件,得出了自己的程序代码,具体代码如下:
//读取图片,分析图片,提取其中的文字内容,输出到终端上来
#include <ctype.h>
#include "applybox.h"
#include "control.h"
#include "tessvars.h"
#include "tessedit.h"
#include "baseapi.h"
#include "thresholder.h"
#include "pageres.h"
#include "imgs.h"
#include "varabled.h"
#include "tprintf.h"
#include "tesseractmain.h"
#include "stderr.h"
#include "notdll.h"
#include "mainblk.h"
#include "output.h"
#include "globals.h"
#include "helpers.h"
#include "blread.h"
#include "tfacep.h"
#include "callnet.h"
#include "strings.h"
#include "varable.h"
#include "tessclas.h"
#include "notdll.h"
#ifdef USING_GETTEXT
#include <libintl.h>
#include <locale.h>
#define _(x) gettext(x)
#else
#define _(x) (x)
#endif
#ifdef HAVE_LIBTIFF
#include "tiffio.h"
#endif
#ifdef HAVE_LIBLEPT
#include "allheaders.h"
#else
class Pix;
#endif
#ifdef _TIFFIO_
void read_tiff_image(TIFF* tif, IMAGE* image);
#endif
// Size of the scratch buffer used when a page number is formatted as text.
const int kMaxIntSize = 22;
char szAppName[] = "Tessedit"; // application name
// EXTERN is defined empty before the *_VAR lines below.
// NOTE(review): presumably this makes the macros emit definitions rather than
// extern declarations - confirm against the project's variable headers.
#define EXTERN
// Run-time settings declared through the project's BOOL_VAR / INT_VAR macros.
// Each invocation passes a variable name, its default value and a description.
BOOL_VAR(tessedit_create_boxfile, FALSE, "Output text with boxes");
BOOL_VAR(tessedit_create_hocr, FALSE, "Output HTML with hOCR markup");
BOOL_VAR(tessedit_read_image, TRUE, "Ensure the image is read");
INT_VAR(tessedit_serial_unlv, 0,
"0->Whole page, 1->serial no adapt, 2->serial with adapt");
INT_VAR(tessedit_page_number, -1,
"-1 -> All pages, else specific page to process");
BOOL_VAR(tessedit_write_images, FALSE, "Capture the image from the IPE");
BOOL_VAR(tessedit_debug_to_screen, FALSE, "Dont use debug file");
/*
 * Recognizes one page and appends the resulting text to *text_out.
 *
 * input_file  Name handed to the API via SetInputName. In serial-UNLV mode the
 *             extension is stripped and the remainder is passed to
 *             read_unlv_file as the zone-file name.
 * image       The page as a Tesseract IMAGE. Only read when pix is NULL or the
 *             build has no HAVE_LIBLEPT; may be NULL when pix is supplied.
 * pix         The page as a Leptonica Pix, or NULL.
 * page_index  Page index forwarded to GetBoxText and (plus one) to GetHOCRText.
 * api         The TessBaseAPI instance that performs the recognition.
 * text_out    Accumulator; recognized text is appended, never replaced.
 *
 * The output format is selected by tessedit_create_boxfile, tessedit_write_unlv
 * and tessedit_create_hocr (checked in that order); plain UTF-8 otherwise.
 */
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
                    tesseract::TessBaseAPI* api, STRING* text_out) {
  api->SetInputName(input_file);

  // Record the page size once, from whichever representation was supplied.
  // The serial-UNLV branch below needs it, and on the Leptonica path `image`
  // is NULL, so it must not be dereferenced there.
  int page_width = 0;
  int page_height = 0;
#ifdef HAVE_LIBLEPT
  if (pix != NULL) {
    api->SetImage(pix);
    page_width = pixGetWidth(pix);
    page_height = pixGetHeight(pix);
  } else {
#endif
    page_width = image->get_xsize();
    page_height = image->get_ysize();
    int bytes_per_line = check_legal_image_size(page_width, page_height,
                                                image->get_bpp());
    api->SetImage(image->get_buffer(), page_width, page_height,
                  image->get_bpp() / 8, bytes_per_line);
#ifdef HAVE_LIBLEPT
  }
#endif

  if (tessedit_serial_unlv == 0) {
    // Whole-page mode: a single recognition call in the selected format.
    char* text;
    if (tessedit_create_boxfile)
      text = api->GetBoxText(page_index);
    else if (tessedit_write_unlv)
      text = api->GetUNLVText();
    else if (tessedit_create_hocr)
      text = api->GetHOCRText(page_index + 1);
    else
      text = api->GetUTF8Text();
    *text_out += text;
    delete [] text;
  } else {
    // Serial mode: recognize each block listed in the UNLV zone file in turn.
    BLOCK_LIST blocks;
    STRING filename = input_file;
    const char* lastdot = strrchr(filename.string(), '.');
    if (lastdot != NULL) {
      // Cut the name at its last dot to drop the extension.
      filename[lastdot - filename.string()] = '\0';
    }
    if (!read_unlv_file(filename, page_width, page_height, &blocks)) {
      fprintf(stderr, _("Error: Must have a unlv zone file %s to read!\n"),
              filename.string());
      return;
    }
    BLOCK_IT b_it = &blocks;
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
      BLOCK* block = b_it.data();
      TBOX box = block->bounding_box();
      // The rectangle's top is measured from the top of the page, hence the
      // subtraction from the page height.
      api->SetRectangle(box.left(), page_height - box.top(),
                        box.width(), box.height());
      char* text = api->GetUNLVText();
      *text_out += text;
      delete [] text;
      // Mode 1 is "serial no adapt": reset the adaptive classifier per block.
      if (tessedit_serial_unlv == 1)
        api->ClearAdaptiveClassifier();
    }
  }
  if (tessedit_write_images) {
    page_image.write("tessinput.tif");
  }
}
/*end TesseractImage*/
#if !defined(HAVE_LIBLEPT) && defined(_TIFFIO_)
/* Returns true when path ends in ".tif" or ".tiff", ignoring letter case. */
static bool HasTiffExtension(const char* path) {
  const char* dot = strrchr(path, '.');
  if (dot == NULL)
    return false;
  char ext[5];
  int used = 0;
  for (const char* p = dot + 1; *p != '\0'; ++p) {
    if (used == 4)
      return false;  // longer than "tiff"
    ext[used++] = static_cast<char>(tolower(static_cast<unsigned char>(*p)));
  }
  ext[used] = '\0';
  return strcmp(ext, "tif") == 0 || strcmp(ext, "tiff") == 0;
}
#endif

/*
 * Ocr: recognizes the text in an image file and returns it as a C string.
 *
 * input   Path of the image to read. In a Leptonica build a file that cannot
 *         be read as an image is treated as a list of image file names, one
 *         per line.
 * output  Ignored on entry; kept for interface compatibility only.
 * lang    Language passed to TessBaseAPI::Init; NULL selects "eng".
 *
 * Returns a NUL-terminated buffer allocated with new[]; the caller owns it and
 * must release it with delete[]. Every failure is reported through tprintf or
 * fprintf and ends the process with exit(1).
 */
char* Ocr(const char *input,char *output,const char* lang){
  // Respect the caller's language; fall back to English only when none given.
  if (lang == NULL)
    lang = "eng";
  tesseract::TessBaseAPI api;
  if (api.Init(input, lang, 0, 0, false) != 0) {
    tprintf(_("Could not initialize tesseract for language %s!\n"), lang);
    exit(1);
  }
  STRING text_out;
  int page_number = tessedit_page_number;
  if (page_number < 0) {
    page_number = 0;
  }
  FILE* fp = fopen(input, "rb");
  if (fp == NULL) {
    // fp is NULL here, so there is nothing to close.
    tprintf(_("Image file %s cannot be opened!\n"), input);
    exit(1);
  }
#ifdef HAVE_LIBLEPT
  int page = page_number;
  int npages = 0;
  bool is_tiff = fileFormatIsTiff(fp);
  if (is_tiff) {
    int tiffstat = tiffGetCount(fp, &npages);
    if (tiffstat == 1) {
      fprintf(stderr, _("Error reading file %s!\n"), input);
      fclose(fp);
      exit(1);
    }
  }
  fclose(fp);
  fp = NULL;
  if (is_tiff) {
    // Possibly multi-page tiff: read and recognize one page at a time.
    Pix* pix;
    for (; (pix = pixReadTiff(input, page)) != NULL; ++page) {
      if (page > 0)
        tprintf(_("Page %d\n"), page);
      char page_str[kMaxIntSize];
      snprintf(page_str, kMaxIntSize - 1, "%d", page);
      api.SetVariable("applybox_page", page_str);
      // Run tesseract on the page!
      TesseractImage(input, NULL, pix, page, &api, &text_out);
      pixDestroy(&pix);
      // Stop after one page when a specific page was requested or the file
      // holds a single page.
      if (tessedit_page_number >= 0 || npages == 1) {
        break;
      }
    }
  } else {
    // The file is not a tiff file, so use the general pixRead function.
    // If the image fails to read, try it as a list of filenames.
    Pix* pix = pixRead(input);
    if (pix == NULL) {
      FILE* fimg = fopen(input, "r");
      if (fimg == NULL) {
        // fimg is NULL here, so there is nothing to close.
        tprintf(_("File %s cannot be opened!\n"), input);
        exit(1);
      }
      char filename[MAX_PATH];
      while (fgets(filename, sizeof(filename), fimg) != NULL) {
        chomp_string(filename);
        pix = pixRead(filename);
        if (pix == NULL) {
          tprintf(_("Image file %s cannot be read!\n"), filename);
          fclose(fimg);
          exit(1);
        }
        tprintf(_("Page %d : %s\n"), page, filename);
        TesseractImage(filename, NULL, pix, page, &api, &text_out);
        pixDestroy(&pix);
        ++page;
      }
      fclose(fimg);
    } else {
      TesseractImage(input, NULL, pix, 0, &api, &text_out);
      pixDestroy(&pix);
    }
  }
#else
  // The handle was only an existence check on this path; the readers below
  // open the file themselves.
  fclose(fp);
  fp = NULL;
#ifdef _TIFFIO_
  if (HasTiffExtension(input)) {
    // Use libtiff so that multi-page tiff files can be handled.
    TIFF* archive = NULL;
    do {
      // Since libtiff keeps all read images in memory we have to close the
      // file and reopen it for every page, and seek to the appropriate page.
      if (archive != NULL)
        TIFFClose(archive);
      archive = TIFFOpen(input, "r");
      if (archive == NULL) {
        tprintf(_("Read of file %s failed.\n"), input);
        exit(1);
      }
      if (page_number > 0)
        tprintf(_("Page %d\n"), page_number);
      // Seek to the appropriate page.
      for (int i = 0; i < page_number; ++i) {
        TIFFReadDirectory(archive);
      }
      char page_str[kMaxIntSize];
      snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
      api.SetVariable("applybox_page", page_str);
      // Read the current page into the Tesseract image.
      IMAGE image;
      read_tiff_image(archive, &image);
      // Run tesseract on the page!
      TesseractImage(input, &image, NULL, page_number, &api, &text_out);
      ++page_number;
      // Do this while there are more pages in the tiff file.
    } while (TIFFReadDirectory(archive) &&
             (page_number <= tessedit_page_number || tessedit_page_number < 0));
    TIFFClose(archive);
  } else {
#endif
    // Using built-in image library to read bmp, or tiff without libtiff.
    IMAGE image;
    if (image.read_header(input) < 0) {
      tprintf(_("Read of file %s failed.\n"), input);
      exit(1);
    }
    if (image.read(image.get_ysize ()) < 0)
      MEMORY_OUT.error("error", EXIT, _("Read of image %s"), input);
    invert_image(&image);
    TesseractImage(input, &image, NULL, 0, &api, &text_out);
#ifdef _TIFFIO_
  }
#endif
#endif  // HAVE_LIBLEPT
  // Hand the text back in a heap buffer sized to fit. A local array would be
  // destroyed on return, and a fixed size would overflow on long pages.
  const char* recognized = text_out.string();
  const size_t length = strlen(recognized);
  output = new char[length + 1];
  memcpy(output, recognized, length + 1);
  return output;  // Normal exit
}
//main
/*
 * Runs Ocr on a fixed sample image and writes the recognized text to stdout.
 * The command-line arguments are not used. The returned text is not released
 * here; the process ends immediately afterwards.
 */
int main(int argc,char ** argv){
  const char* input="/home/administrator/donate.tif";
  char* outs = Ocr(input, NULL, "eng");
  // Print up to the terminating NUL. The end of the text is marked by the
  // character '\0', not by the pointer becoming NULL.
  if (outs != NULL) {
    printf("%s", outs);
  }
  return 0;
}
//初始化,读取图片,生成image对象,通过image对象获取图片的相关参数,调用//提取图片文字信息的接口函数 #include "tessedit.h" #include "baseapi.h" #include "imgs.h" #include "varabled.h" #include "tprintf.h" #include "tesseractmain.h" #include "stderr.h" #include "tessvars.h" #include <MagickWand.h> //#include <MagickCore.h> #include "convert.h"#define _(x) (x) char *ocr(char *input,char *output) { tesseract::TessBaseAPI api; const char* lang = "eng"; //eng为英文包,chi_sim为汉文包 api.Init("/tmp",lang, 0, 0, false);//init the language api.SetPageSegMode(tesseract::PSM_AUTO);//设置自动进行版面分析 IMAGE image; //在这里只是为了测试,所以还是从读取图片开始 if (image.read_header(input) < 0) {//读取文件中的元信息 tprintf(_("Read of file %s failed.\n"), input); exit(1); } if (image.read(image.get_ysize ()) < 0) MEMORY_OUT.error("test", EXIT, _("Read of image %s"), input); invert_image(&image); //图片读取结束,获取调用ocr接口所需要的参数 const unsigned char* imagedata = image.get_buffer(); int bits_per_pixel = image.get_bpp()/8; int bytes_per_line = check_legal_image_size(image.get_xsize(), image.get_ysize(), image.get_bpp()); int xsize = image.get_xsize(); int ysize = image.get_ysize(); //所需要的参数中left和top为0,代表从最左,最顶开始转换图片 //get.bpp(),return bits per pixel //get.bpp()/8,return bytes per pixel //调用ocr接口函数,并输出我们需要的字符串 output=api.TesseractRect(imagedata, bits_per_pixel, bytes_per_line, 0, 0, xsize, ysize); return output; } int main(int argc,char **argv) { char *input = "/home/administrator/tmp.bmp"; char *output =NULL; output = ocr(input,output); printf("%s",output); delete []output; return 0; //Normal exit } /*函数原型 * char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height);*/