#=== VERIFY THAT TESSERACT AND ITS TRAINING TOOLS ARE INSTALLED
#=== (each tool is invoked with -v; a tool that is not installed shows up
#=== as a "command not found" error from the shell).

for tool in tesseract text2image unicharset_extractor \
    set_unicharset_properties combine_lang_model lstmtraining lstmeval
do
  "$tool" -v
done

#=== MAKE DIRECTORIES AND DOWNLOAD REQUIRED FILES

# All relative paths used from here on resolve inside ~/tessscratch.
mkdir -p ~/tessscratch
cd ~/tessscratch

# Each download is ONE command: the URL is wget's argument, so the line
# break between "-O <file>" and the URL must be escaped with a backslash.
# Without it wget runs with no URL and the URL is executed as a command.
wget -O lstm.train \
  https://raw.githubusercontent.com/tesseract-ocr/tesseract/master/tessdata/configs/lstm.train
wget -O radical-stroke.txt \
  https://raw.githubusercontent.com/tesseract-ocr/langdata_lstm/master/radical-stroke.txt

# -p creates mylangdata as well as mylangdata/foo.
mkdir -p mylangdata/foo

#=== CREATE YOUR TRAINING TEXT FOR NEW LANGUAGE foo.
#=== FOR TRAINING FROM SCRATCH, IT SHOULD BE THOUSANDS OF LINES.
#=== HERE A COPY OF ENGLISH TRAINING TEXT (72 LINES) IS MADE AS AN
#=== ILLUSTRATION.

# The URL is wget's argument; the backslash keeps it on the same logical
# line as "-O <file>".
wget -O mylangdata/foo/foo.training_text \
  https://raw.githubusercontent.com/tesseract-ocr/langdata/master/eng/eng.training_text

#=== MAKE BOX/TIFF PAIRS USING TRAINING TEXT AND TWO FONTS.

# One text2image run per font. Every line of a command except the last must
# end in a backslash; otherwise the remaining options are executed as a
# separate command and text2image runs without --text/--font/--outputbase.
# NOTE(review): both fonts are assumed to be installed under
# /usr/share/fonts -- adjust --fonts_dir / --font for your system.

# Arial Unicode MS -> outputs are written under the base name foo.Arial.exp0
text2image --strip_unrenderable_words --leading=32 --xsize=3600 \
  --char_spacing=0.0 --exposure=0 --max_pages=0 \
  --fonts_dir=/usr/share/fonts \
  --font="Arial Unicode MS" \
  --text=mylangdata/foo/foo.training_text \
  --outputbase=foo.Arial.exp0

# Courier New -> outputs are written under the base name foo.Courier.exp0
text2image --strip_unrenderable_words --leading=32 --xsize=3600 \
  --char_spacing=0.0 --exposure=0 --max_pages=0 \
  --fonts_dir=/usr/share/fonts \
  --font="Courier New" \
  --text=mylangdata/foo/foo.training_text \
  --outputbase=foo.Courier.exp0

#=== EXTRACT UNICHARSET & SET PROPERTIES FROM BOX FILES.

# The two .box files are arguments of unicharset_extractor, so they must stay
# on the same logical line as the command (note the trailing backslash).
unicharset_extractor --output_unicharset foo.unicharset --norm_mode 1 \
  foo.Arial.exp0.box foo.Courier.exp0.box

# Reads foo.unicharset and writes it back in place (-U input, -O output);
# -X names the xheights file and --script_dir=. points at the current
# directory.
set_unicharset_properties -U foo.unicharset -O foo.unicharset \
  -X foo.xheights --script_dir=.

#=== CREATE LSTMF FILES.

# Run tesseract with the lstm.train config once per rendered font image.
for font in Arial Courier
do
  tesseract "foo.${font}.exp0.tif" "foo.${font}.exp0" --psm 6 lstm.train
done

# Collect every .lstmf file in the current directory, one name per line,
# into the list file.
ls -1 *.lstmf > foo.training_files.txt

#=== CREATE STARTER TRAINEDDATA

# Output directory handed to combine_lang_model below.
mkdir -p fooscratch

# Build the starter model for language "foo" from the unicharset, using the
# current directory as the script directory.
combine_lang_model \
  --lang foo \
  --input_unicharset foo.unicharset \
  --script_dir . \
  --output_dir fooscratch

#=== RUN LSTM TRAINING -
#=== hundreds of thousands of iterations may be needed for real
#=== training_text.

# Run 1: a short run (100 iterations) with --debug_interval -1.
lstmtraining \
  --model_output fooscratch/LAYER \
  --net_spec '[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c111]' \
  --learning_rate 20e-4 \
  --traineddata fooscratch/foo/foo.traineddata \
  --train_listfile foo.training_files.txt \
  --debug_interval -1 \
  --max_iterations 100

# Run 2: same network spec and inputs, --debug_interval 0, up to 5000
# iterations. Both runs share --model_output fooscratch/LAYER.
# NOTE(review): lstmtraining is expected to pick up the checkpoint written by
# run 1 under that prefix rather than start over -- confirm against the
# lstmtraining documentation.
lstmtraining \
  --model_output fooscratch/LAYER \
  --net_spec '[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c111]' \
  --learning_rate 20e-4 \
  --traineddata fooscratch/foo/foo.traineddata \
  --train_listfile foo.training_files.txt \
  --debug_interval 0 \
  --max_iterations 5000





On Wed, Apr 3, 2019 at 8:16 PM Shobhit Kapil <shobhitka...@gmail.com> wrote:

> Hi Team,
>
> I am not at all familiar with training Tesseract 4. Is there any way to
> learn how to train Tesseract 4?
> Even after reading the documentation, I cannot work out where to start or
> what to start with.
>
> Thanks,
> Shobhit
>
> --
> You received this message because you are subscribed to the Google Groups
> "tesseract-ocr" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to tesseract-ocr+unsubscr...@googlegroups.com.
> To post to this group, send email to tesseract-ocr@googlegroups.com.
> Visit this group at https://groups.google.com/group/tesseract-ocr.
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/tesseract-ocr/39ad00ed-c9f7-42dd-896b-ae0dfbd58dbd%40googlegroups.com
> <https://groups.google.com/d/msgid/tesseract-ocr/39ad00ed-c9f7-42dd-896b-ae0dfbd58dbd%40googlegroups.com?utm_medium=email&utm_source=footer>
> .
> For more options, visit https://groups.google.com/d/optout.
>


-- 

____________________________________________________________
भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To post to this group, send email to tesseract-ocr@googlegroups.com.
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduVupBTyHTo1E40C7DigbrCRo4xnupi8x3TYM3OQS5R9Pw%40mail.gmail.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to