Fine-tune with your specific font - see the example below, which uses the Impact font.

#!/bin/bash

# Run tesstrain.sh in --linedata_only mode for the "Impact Condensed" font.
# Both invocations below share every option except the training text and
# the output directory, so those two are the only parameters.
#   $1 - file passed to --training_text
#   $2 - directory passed to --output_dir
run_tesstrain() {
  local training_text=$1
  local output_dir=$2
  local -a opts=(
    --fonts_dir /usr/share/fonts
    --lang eng --linedata_only
    --noextract_font_properties
    --langdata_dir ~/langdata
    --tessdata_dir ~/tessdata
    --fontlist "Impact Condensed"
    --training_text "${training_text}"
    --workspace_dir ~/tmp/
    --save_box_tiff
    --output_dir "${output_dir}"
  )
  ~/tesseract/src/training/tesstrain.sh "${opts[@]}"
}

# Data generated from the stock English training text.
time run_tesstrain \
  ~/langdata/eng/eng.training_text \
  ~/tesstutorial/engtrainfont

# Data generated from the custom word-list training text.
time run_tesstrain \
  ~/langdata/eng/eng.mywordlist.training_text \
  ~/tesstutorial/engevalwordlist

# Reference:
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for-impact

# The URL above must stay on a comment line; on a line of its own the shell
# would try to execute it. The banner uses `echo -e` and "\n" (not "/n") so
# it prints a leading blank line like the other banners in this script, and
# it is kept on a single line so the message is not split in the output.
echo -e "\n ****** Finetune one of the fully-trained existing models: ***********"

# Paths used by the fine-tuning step.
finetune_dir=~/tesstutorial/impact_from_full
base_traineddata=~/tessdata_best/eng.traineddata

mkdir -p "${finetune_dir}"

# Run combine_tessdata -e on the base traineddata, naming eng.lstm in the
# fine-tuning directory; that file is what --continue_from reads below.
combine_tessdata -e "${base_traineddata}" "${finetune_dir}/eng.lstm"

# Continue training from eng.lstm on the list file written to
# ~/tesstutorial/engtrainfont, capped at 400 iterations.
time ~/tesseract/src/training/lstmtraining \
  --model_output "${finetune_dir}/impact" \
  --continue_from "${finetune_dir}/eng.lstm" \
  --traineddata "${base_traineddata}" \
  --train_listfile ~/tesstutorial/engtrainfont/eng.training_files.txt \
  --debug_interval -1 \
  --max_iterations 400

# Print a banner, then time an lstmeval run of the impact checkpoint.
#   $1 - banner text, printed with `echo -e`
#   $2 - list file passed to --eval_listfile
eval_checkpoint() {
  local banner=$1
  local listfile=$2

  echo -e "${banner}"

  time ~/tesseract/src/training/lstmeval \
    --model ~/tesstutorial/impact_from_full/impact_checkpoint \
    --traineddata ~/tessdata_best/eng.traineddata \
    --eval_listfile "${listfile}"
}

# First against the list file from the engtrainfont directory ...
eval_checkpoint "\n*********** eval on training data ******\n" \
  ~/tesstutorial/engtrainfont/eng.training_files.txt

# ... then against the list file from the engevalwordlist directory.
eval_checkpoint "\n***********eval on eval data ******\n" \
  ~/tesstutorial/engevalwordlist/eng.training_files.txt

echo -e "\n*********** convert to traineddata  ******\n"

# Run lstmtraining with --stop_training on the impact checkpoint, writing
# the result to the --model_output path.
# The binary is addressed via ~/tesseract like every other tool call in this
# script; the previous ../tesseract path only resolved when the script was
# started from a directory that is a sibling of the tesseract checkout.
time ~/tesseract/src/training/lstmtraining \
  --stop_training \
  --continue_from ~/tesstutorial/impact_from_full/impact_checkpoint \
  --traineddata ~/tessdata_best/eng.traineddata \
  --model_output ~/tesstutorial/engtrainfont/eng.traineddata


On Mon, Jan 28, 2019 at 9:37 PM Daniel Ferenc <voo...@gmail.com> wrote:

> Hi,
>
> I need to train Tesseract for only a specific wordlist (about 13600 words)
> and one specific font. I tried following the training tutorial on the Wiki
> but I'm not sure if I'm doing anything wrong - the traineddata file is
> about 7 megabytes and I combined it with the eng.traineddata to get any
> traineddata file because after finishing the training I had no traineddata
> file at all. Can anyone please help me?
>
> --
> You received this message because you are subscribed to the Google Groups
> "tesseract-ocr" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to tesseract-ocr+unsubscr...@googlegroups.com.
> To post to this group, send email to tesseract-ocr@googlegroups.com.
> Visit this group at https://groups.google.com/group/tesseract-ocr.
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/tesseract-ocr/1909bad8-d28d-4660-812d-47d0310e67c2%40googlegroups.com
> <https://groups.google.com/d/msgid/tesseract-ocr/1909bad8-d28d-4660-812d-47d0310e67c2%40googlegroups.com?utm_medium=email&utm_source=footer>
> .
> For more options, visit https://groups.google.com/d/optout.
>


-- 

____________________________________________________________
भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To post to this group, send email to tesseract-ocr@googlegroups.com.
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduUK-CQcgDBYyJWk67okigxGVaNbG8UVAFehDUZXox-zNQ%40mail.gmail.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to