Finetune with your specific font - see e.g. the script below, which uses the Impact font.

#!/bin/bash
#
# Fine-tune the stock English LSTM model for a single font ("Impact Condensed").
#
# Steps:
#   1. Render line training data from the standard English training text.
#   2. Render line evaluation data from a custom word-list training text.
#   3. Extract the LSTM component from tessdata_best/eng.traineddata and
#      continue training it on the data from step 1.
#   4. Evaluate the resulting checkpoint on both data sets.
#   5. Convert the checkpoint into a .traineddata file.
#
# Paths used by the script (all relative to $HOME):
#   ~/tesseract      - tesseract source tree providing the training tools
#   ~/langdata       - langdata repository (training texts)
#   ~/tessdata       - tessdata used by tesstrain.sh
#   ~/tessdata_best  - tessdata_best, source of the model being fine-tuned
#   ~/tesstutorial   - output area
# combine_tessdata is invoked without a path and must be on PATH.

# Every step consumes the output of the previous one, so stop at the first
# failure instead of training/evaluating on missing files.
set -euo pipefail

# Step 1: training data, rendered in the target font from the standard text.
time ~/tesseract/src/training/tesstrain.sh \
  --fonts_dir /usr/share/fonts \
  --lang eng --linedata_only \
  --noextract_font_properties \
  --langdata_dir ~/langdata \
  --tessdata_dir ~/tessdata \
  --fontlist "Impact Condensed" \
  --training_text ~/langdata/eng/eng.training_text \
  --workspace_dir ~/tmp/ \
  --save_box_tiff \
  --output_dir ~/tesstutorial/engtrainfont

# Step 2: evaluation data, same font but rendered from the custom word list.
time ~/tesseract/src/training/tesstrain.sh \
  --fonts_dir /usr/share/fonts \
  --lang eng --linedata_only \
  --noextract_font_properties \
  --langdata_dir ~/langdata \
  --tessdata_dir ~/tessdata \
  --fontlist "Impact Condensed" \
  --training_text ~/langdata/eng/eng.mywordlist.training_text \
  --workspace_dir ~/tmp/ \
  --save_box_tiff \
  --output_dir ~/tesstutorial/engevalwordlist

# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for-impact

# Step 3: extract the LSTM model from the existing traineddata and continue
# training it on the Impact training data.
echo -e "\n ****** Finetune one of the fully-trained existing models: ***********"

mkdir -p ~/tesstutorial/impact_from_full

combine_tessdata -e ~/tessdata_best/eng.traineddata \
  ~/tesstutorial/impact_from_full/eng.lstm

time ~/tesseract/src/training/lstmtraining \
  --model_output ~/tesstutorial/impact_from_full/impact \
  --continue_from ~/tesstutorial/impact_from_full/eng.lstm \
  --traineddata ~/tessdata_best/eng.traineddata \
  --train_listfile ~/tesstutorial/engtrainfont/eng.training_files.txt \
  --debug_interval -1 \
  --max_iterations 400

# Step 4a: evaluate the checkpoint on the data it was trained on.
echo -e "\n*********** eval on training data ******\n"

time ~/tesseract/src/training/lstmeval \
  --model ~/tesstutorial/impact_from_full/impact_checkpoint \
  --traineddata ~/tessdata_best/eng.traineddata \
  --eval_listfile ~/tesstutorial/engtrainfont/eng.training_files.txt

# Step 4b: evaluate the checkpoint on the word-list data from step 2.
echo -e "\n***********eval on eval data ******\n"

time ~/tesseract/src/training/lstmeval \
  --model ~/tesstutorial/impact_from_full/impact_checkpoint \
  --traineddata ~/tessdata_best/eng.traineddata \
  --eval_listfile ~/tesstutorial/engevalwordlist/eng.training_files.txt

# Step 5: convert the checkpoint into a .traineddata file.
echo -e "\n*********** convert to traineddata ******\n"

time ~/tesseract/src/training/lstmtraining \
  --stop_training \
  --continue_from ~/tesstutorial/impact_from_full/impact_checkpoint \
  --traineddata ~/tessdata_best/eng.traineddata \
  --model_output ~/tesstutorial/engtrainfont/eng.traineddata

On Mon, Jan 28, 2019 at 9:37 PM Daniel Ferenc <voo...@gmail.com> wrote: > Hi, > > I need to train Tesseract for only a specific wordlist (about 13600 words) > and one specific font. I tried following the training tutorial on the Wiki > but I'm not sure if i'm doing anything wrong - the traineddata file is > about 7 megabytes and i combined it with the eng.traineddata to get any > traineddata file because after finishing the training I had no traineddata > file at all. Can anyone please help me? > > -- > You received this message because you are subscribed to the Google Groups > "tesseract-ocr" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to tesseract-ocr+unsubscr...@googlegroups.com. > To post to this group, send email to tesseract-ocr@googlegroups.com. > Visit this group at https://groups.google.com/group/tesseract-ocr. > To view this discussion on the web visit > https://groups.google.com/d/msgid/tesseract-ocr/1909bad8-d28d-4660-812d-47d0310e67c2%40googlegroups.com > <https://groups.google.com/d/msgid/tesseract-ocr/1909bad8-d28d-4660-812d-47d0310e67c2%40googlegroups.com?utm_medium=email&utm_source=footer> > . > For more options, visit https://groups.google.com/d/optout. > -- ____________________________________________________________ भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To post to this group, send email to tesseract-ocr@googlegroups.com. Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduUK-CQcgDBYyJWk67okigxGVaNbG8UVAFehDUZXox-zNQ%40mail.gmail.com. For more options, visit https://groups.google.com/d/optout.