i used this attached finetune.sh file ... but that raised error. could u 
help me ?

thanks
 

> ###### MAKING TRAINING DATA ######
>
>
>> === Starting training for language 'eng'
>
> [Tue, May 15, 2018 11:42:36 AM] /c/Program Files 
>> (x86)/Tesseract-OCR/text2image --fonts_dir=C:WindowsFonts --font=Arial 
>> --outputbase=/tmp/font_tmp.CpgpM0lbxD/sample_text.txt 
>> --text=/tmp/font_tmp.CpgpM0lbxD/sample_text.txt 
>> --fontconfig_tmpdir=/tmp/font_tmp.CpgpM0lbxD
>
> Rendered page 0 to file 
>> C:/Users/asus/AppData/Local/Temp/font_tmp.CpgpM0lbxD/sample_text.txt.tif
>
>
>> === Phase I: Generating training images ===
>
> Rendering using Arial
>
> Rendering using Corbel
>
> [Tue, May 15, 2018 11:42:37 AM] /c/Program Files 
>> (x86)/Tesseract-OCR/text2image --fontconfig_tmpdir=/tmp/font_tmp.CpgpM0lbxD 
>> --fonts_dir=C:WindowsFonts --strip_unrenderable_words --leading=32 
>> --char_spacing=0.0 --exposure=0 
>> --outputbase=/tmp/tmp.6m4B2TUln1/eng/eng.Arial.exp0 --max_pages=3 
>> --font=Arial --text=./langdata/eng/eng.training_text
>
> [Tue, May 15, 2018 11:42:37 AM] /c/Program Files 
>> (x86)/Tesseract-OCR/text2image --fontconfig_tmpdir=/tmp/font_tmp.CpgpM0lbxD 
>> --fonts_dir=C:WindowsFonts --strip_unrenderable_words --leading=32 
>> --char_spacing=0.0 --exposure=0 
>> --outputbase=/tmp/tmp.6m4B2TUln1/eng/eng.Corbel.exp0 --max_pages=3 
>> --font=Corbel --text=./langdata/eng/eng.training_text
>
> Stripped 2 unrenderable words
>
> Rendered page 0 to file 
>> C:/Users/asus/AppData/Local/Temp/tmp.6m4B2TUln1/eng/eng.Arial.exp0.tif
>
> Stripped 1 unrenderable words
>
> Rendered page 1 to file 
>> C:/Users/asus/AppData/Local/Temp/tmp.6m4B2TUln1/eng/eng.Arial.exp0.tif
>
> Stripped 2 unrenderable words
>
> Rendered page 0 to file 
>> C:/Users/asus/AppData/Local/Temp/tmp.6m4B2TUln1/eng/eng.Corbel.exp0.tif
>
> Stripped 1 unrenderable words
>
> Rendered page 1 to file 
>> C:/Users/asus/AppData/Local/Temp/tmp.6m4B2TUln1/eng/eng.Corbel.exp0.tif
>
>
>> === Phase UP: Generating unicharset and unichar properties files ===
>
> [Tue, May 15, 2018 11:42:39 AM] /c/Program Files 
>> (x86)/Tesseract-OCR/unicharset_extractor --output_unicharset 
>> /tmp/tmp.6m4B2TUln1/eng/eng.unicharset --norm_mode 1 
>> /tmp/tmp.6m4B2TUln1/eng/eng.Arial.exp0.box 
>> /tmp/tmp.6m4B2TUln1/eng/eng.Corbel.exp0.box
>
> Extracting unicharset from box file 
>> C:/Users/asus/AppData/Local/Temp/tmp.6m4B2TUln1/eng/eng.Arial.exp0.box
>
> Extracting unicharset from box file 
>> C:/Users/asus/AppData/Local/Temp/tmp.6m4B2TUln1/eng/eng.Corbel.exp0.box
>
> ICU ERROR: U_FILE_ACCESS_ERRORERROR: 
>> /tmp/tmp.6m4B2TUln1/eng/eng.unicharset does not exist or is not readable
>
> ###### MAKING EVAL DATA ######
>
>
>> === Starting training for language 'eng'
>
> [Tue, May 15, 2018 11:42:40 AM] /c/Program Files 
>> (x86)/Tesseract-OCR/text2image --fonts_dir=C:WindowsFonts --font=Calibri 
>> --outputbase=/tmp/font_tmp.n0qq4iJk4q/sample_text.txt 
>> --text=/tmp/font_tmp.n0qq4iJk4q/sample_text.txt 
>> --fontconfig_tmpdir=/tmp/font_tmp.n0qq4iJk4q
>
> Rendered page 0 to file 
>> C:/Users/asus/AppData/Local/Temp/font_tmp.n0qq4iJk4q/sample_text.txt.tif
>
>
>> === Phase I: Generating training images ===
>
> Rendering using Calibri
>
> [Tue, May 15, 2018 11:42:40 AM] /c/Program Files 
>> (x86)/Tesseract-OCR/text2image --fontconfig_tmpdir=/tmp/font_tmp.n0qq4iJk4q 
>> --fonts_dir=C:WindowsFonts --strip_unrenderable_words --leading=32 
>> --char_spacing=0.0 --exposure=0 
>> --outputbase=/tmp/tmp.h0l64TAxEq/eng/eng.Calibri.exp0 --max_pages=3 
>> --font=Calibri --text=./langdata/eng/eng.training_text
>
> Stripped 2 unrenderable words
>
> Rendered page 0 to file 
>> C:/Users/asus/AppData/Local/Temp/tmp.h0l64TAxEq/eng/eng.Calibri.exp0.tif
>
> Stripped 1 unrenderable words
>
> Rendered page 1 to file 
>> C:/Users/asus/AppData/Local/Temp/tmp.h0l64TAxEq/eng/eng.Calibri.exp0.tif
>
>
>> === Phase UP: Generating unicharset and unichar properties files ===
>
> [Tue, May 15, 2018 11:42:42 AM] /c/Program Files 
>> (x86)/Tesseract-OCR/unicharset_extractor --output_unicharset 
>> /tmp/tmp.h0l64TAxEq/eng/eng.unicharset --norm_mode 1 
>> /tmp/tmp.h0l64TAxEq/eng/eng.Calibri.exp0.box
>
> Extracting unicharset from box file 
>> C:/Users/asus/AppData/Local/Temp/tmp.h0l64TAxEq/eng/eng.Calibri.exp0.box
>
> ICU ERROR: U_FILE_ACCESS_ERRORERROR: 
>> /tmp/tmp.h0l64TAxEq/eng/eng.unicharset does not exist or is not readable
>
> #### combine_tessdata to extract lstm model from previous trained set ####
>
> Extracting tessdata components from ./tessdata_best/eng.traineddata
>
> Wrote ./trained_plus_chars/eng.lstm
>
> Version string:4.00.00alpha:eng:synth20170629
>
> 17:lstm:size=401636, offset=192
>
> 18:lstm-punc-dawg:size=4322, offset=401828
>
> 19:lstm-word-dawg:size=3694794, offset=406150
>
> 20:lstm-number-dawg:size=4738, offset=4100944
>
> 21:lstm-unicharset:size=6360, offset=4105682
>
> 22:lstm-recoder:size=1012, offset=4112042
>
> 23:version:size=30, offset=4113054
>
> #### training from previous optimum  #####
>
> finetune.sh: line 119: 11664 Segmentation fault      lstmtraining 
>> --model_output $train_output_dir/pluschars --continue_from 
>> $train_output_dir/$Lang.lstm --old_traineddata 
>> $tessdata_dir/$Lang.traineddata --traineddata 
>> $train_output_dir/$Lang/$Lang.traineddata --max_iterations $MaxIterations 
>> --debug_interval -1 --eval_listfile 
>> $eval_output_dir/$Lang.training_files.txt --train_listfile 
>> $train_output_dir/$Lang.training_files.txt
>
> #### Building final trained file ./trained_plus_chars/eng_NEW.traineddata 
>> d####
>
> finetune.sh: line 130: 11320 Segmentation fault      lstmtraining 
>> --stop_training --continue_from $train_output_dir/pluschars_checkpoint 
>> --traineddata $train_output_dir/$Lang/$Lang.traineddata --model_output 
>> $final_trained_data_file
>
>

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To post to this group, send email to tesseract-ocr@googlegroups.com.
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/7c46c196-e08d-4541-9f3b-b8a768792c9a%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
#!/bin/bash

# original script by J Klein <jetm...@gmail.com> - https://pastebin.com/gNLvXkiM
# based on https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters

# Language
Lang=eng

# Number of Iterations
MaxIterations=3000

# directory with training scripts - this is not the usual place
#   because they are not installed by default
tesstrain_dir=./tesseract-training

# directory with the old 'best' training set
tessdata_dir=./tessdata_best

# downloaded directory with language data -
# IMPORTANT - ADD THE NEW CHARS TO langdata/$Lang/$Lang.training_text with
#    about 15 instances per char
langdata_dir=./langdata

# fonts directory for this system  /mnt/c/Windows/Fonts
fonts_dir=C:\Windows\Fonts

# fonts to use for training - a minimal set for fast tests
fonts_for_training="'Arial' \
  'Corbel'"

  
# fonts for computing evals of best fit model
fonts_for_eval="Calibri"

# output directories for this run
train_output_dir=./trained_plus_chars
eval_output_dir=./eval_plus_chars

# the output trained data file to drop into tesseract
final_trained_data_file=$train_output_dir/{$Lang}_NEW.traineddata

# fatal bug workaround for pango
#export  PANGOCAIRO_BACKEND=fc 

################################################################
# variables to set tasks performed
MakeTraining=yes
MakeEval=yes
MakeLSTM=yes
RunTraining=yes
BuildFinalTrainedFile=yes
################################################################

if [ $MakeTraining = "yes" ]; then
    echo "###### MAKING TRAINING DATA ######"
    rm -rf $train_output_dir
    mkdir $train_output_dir

# the EVAL handles the quotes in the font list
eval $tesstrain_dir/tesstrain.sh  \
     --fonts_dir $fonts_dir \
     --fontlist $fonts_for_training \
     --lang $Lang \
     --linedata_only\
     --noextract_font_properties \
     --exposures "0" \
     --langdata_dir $langdata_dir \
     --tessdata_dir $tessdata_dir \
     --output_dir $train_output_dir
fi

# at this point, $train_output_dir should have $Lang.FontX.exp0.lstmf
# and $Lang.training_files.txt


# eval data
if [ $MakeEval = "yes" ]; then
    echo "###### MAKING EVAL DATA ######"
    rm -rf $eval_output_dir
    mkdir $eval_output_dir
    
eval $tesstrain_dir/tesstrain.sh \
     --fonts_dir $fonts_dir\
     --fontlist $fonts_for_eval \
     --lang $Lang \
     --linedata_only \
     --noextract_font_properties \
     --langdata_dir  $langdata_dir \
     --tessdata_dir  $tessdata_dir \
     --output_dir $eval_output_dir
 
fi

# at this point, $eval_output_dir should have similar files as
# $train_output_dir but for different font set

if [ $MakeLSTM = "yes" ]; then
    echo "#### combine_tessdata to extract lstm model from previous trained set ####"
    
    combine_tessdata \
	  -e $tessdata_dir/$Lang.traineddata  \
	   $train_output_dir/$Lang.lstm
fi

# at this point, we should have $train_output_dir/$Lang.lstm

if [ $RunTraining = "yes" ]; then
    echo "#### training from previous optimum  #####"
    
    lstmtraining \
	--model_output    $train_output_dir/pluschars \
	--continue_from   $train_output_dir/$Lang.lstm \
	--old_traineddata $tessdata_dir/$Lang.traineddata \
	--traineddata     $train_output_dir/$Lang/$Lang.traineddata \
	--max_iterations $MaxIterations \
	--debug_interval -1 \
	--eval_listfile $eval_output_dir/$Lang.training_files.txt \
	--train_listfile $train_output_dir/$Lang.training_files.txt 
fi


if [ $BuildFinalTrainedFile = "yes" ] ; then
    echo "#### Building final trained file $final_trained_data_file d####"
    
    lstmtraining \
	--stop_training \
	--continue_from $train_output_dir/pluschars_checkpoint \
	--traineddata $train_output_dir/$Lang/$Lang.traineddata \
	--model_output $final_trained_data_file
fi



# now  $final_trained_data_file is substituted for installed

##################### added by shree for testing the new traineddata 

cp $train_output_dir/{$Lang}_NEW.traineddata $tessdata_dir/{$Lang}_NEW.traineddata

# now run OCR on ${img_file} and compare output from $Lang and {$Lang}_NEW

img_files=$(ls ./testimage*.png)

for img_file in ${img_files}; do
  echo "****************************" ${img_file} "**********************************"
    time tesseract --tessdata-dir $tessdata_dir   ${img_file} ${img_file%.*}-$Lang  --oem 1 --psm 6 -l $Lang
    time tesseract --tessdata-dir $tessdata_dir   ${img_file} ${img_file%.*}-{$Lang}_NEW  --oem 1 --psm 6 -l {$Lang}_NEW
done


Reply via email to