[tesseract-ocr] Re: trainning question

Ali hussain Tue, 25 Jul 2023 01:47:05 -0700

 
# import subprocess

# # List of font names
# font_names = ['ben']


# for font in font_names:
#     command = f"lstmtraining --continue_from 
data/ben/checkpoints/ben_19.535_298_300.checkpoint --traineddata 
data/ben/ben.traineddata --model_output data/ben/checkpoints/ben 
--train_listfile data/ben/list.train --eval_listfile data/ben/list.eval 
--max_iterations 1000"
#     subprocess.run(command, shell=True)


I fixed the problem: the code above works by continuing training from the selected checkpoint.
On Thursday, 20 July, 2023 at 7:51:43 am UTC+6 Ali hussain wrote:

> I'm new to Tesseract and am trying to train my own fonts on Tesseract 5.3.2. 
> I need to know: if the training process is interrupted (for example, the 
> electricity is cut off or I close VS Code) and I then run the training 
> command again, does it start from the beginning or resume from where it was 
> interrupted?
>
> I have already tested it, and every time it starts from the beginning. So I 
> need to know of any method to handle this problem, because training takes a 
> lot of time and it should not be necessary to start from the beginning every 
> time. Or is this normal?
>
>
> I use this script to create image (.tif) files from text for multiple fonts 
> in Tesseract 5.3.2: 
>
> import os
> import random
> import pathlib
> import subprocess
>
> training_text_file = 'langdata/ben.training_text'
> font_list = ['FL Badhon Ansari Rh. Unicode',
>              'F Khairuddin Barbarusa Rah. Uni',
>              'F Mahfuj Art Unicode Italic',
>              'F Mahfuj Art Unicode',
>              'FL Niribili Plain Unicode',
>              'FL Niribili Plain Unicode Itali Italic'
>              ]  # Add more fonts as needed
>
> lines = []
>
> with open(training_text_file, 'r') as input_file:
>     for line in input_file.readlines():
>         lines.append(line.strip())
>
> output_directory = 'tesstrain/data/ben-ground-truth'
>
> if not os.path.exists(output_directory):
>     os.mkdir(output_directory)
>
> random.shuffle(lines)
>
> count = 100
>
> lines = lines[:count]
>
> line_count = 0
> for line in lines:
>     for font in font_list:
>         training_text_file_name = pathlib.Path(training_text_file).stem
>         line_training_text = os.path.join(
>             output_directory, 
> f'{training_text_file_name}_{line_count}.gt.txt')
>         with open(line_training_text, 'w') as output_file:
>             output_file.writelines([line])
>
>         file_base_name = f'ben_{line_count}'
>
>         subprocess.run([
>             'text2image',
>             f'--font={font}',
>             f'--text={line_training_text}',
>             f'--outputbase={output_directory}/{file_base_name}',
>             '--max_pages=1',
>             '--strip_unrenderable_words',
>             '--leading=32',
>             '--xsize=3600',
>             '--ysize=350',
>             '--char_spacing=1.0',
>             '--exposure=0',
>             '--unicharset_file=langdata/ben.unicharset'
>         ])
>
>         line_count += 1
>
>
>
> And this command is for training:
>
> import subprocess
>
> # List of font names
> font_names = ['ben']
>
> for font in font_names:
>     command = f"TESSDATA_PREFIX=../tesseract/tessdata make training 
> MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata 
> MAX_ITERATIONS=10000 LANG_TYPE=Indic"
>     subprocess.run(command, shell=True)

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/b13ba048-7eb8-41fc-82cc-fdc1ec2247e4n%40googlegroups.com.

[tesseract-ocr] Re: trainning question

Reply via email to