Thank you man. This is very useful. On Tuesday, July 25, 2023 at 12:01:20 PM UTC+3 mdalihu...@gmail.com wrote:
> Make sure the training script is run from inside the tesstrain folder. > Run the first command to train on your data, and if you are training from a > checkpoint, then run the second posted command. > On Tuesday, 25 July, 2023 at 2:48:25 pm UTC+6 Ali hussain wrote: > >> import subprocess >> >> # List of font names >> font_names = ['ben'] >> >> for font in font_names: >> command = f"lstmtraining --continue_from >> data/ben/checkpoints/ben_19.535_298_300.checkpoint --traineddata >> data/ben/ben.traineddata --model_output data/ben/checkpoints/ben >> --train_listfile data/ben/list.train --eval_listfile data/ben/list.eval >> --max_iterations 1000" >> subprocess.run(command, shell=True) >> I fixed the problem, and this code works for me after adding the checkpoint. >> On Thursday, 20 July, 2023 at 7:51:43 am UTC+6 Ali hussain wrote: >> >>> I'm new to Tesseract and am trying to train my own fonts on Tesseract 5.3.2, >>> but I need to know: if the electricity is cut off, or if I close VS Code or >>> something like that during the training process, and I then run the training >>> command again, does it start from the beginning or from where the electricity cut >>> off? >>> >>> I have already tested it, but every time it starts from the beginning. So I >>> need to know if there is any method to handle this problem, because it >>> takes a lot of time and it should not be necessary to start from the beginning every time. Or >>> is that normal? >>> >>> >>> I use this command to create text-to-image.tif files for multiple fonts >>> in Tesseract 5.3.2: >>> >>> import os >>> import random >>> import pathlib >>> import subprocess >>> >>> training_text_file = 'langdata/ben.training_text' >>> font_list = ['FL Badhon Ansari Rh. Unicode', >>> 'F Khairuddin Barbarusa Rah. 
Uni', >>> 'F Mahfuj Art Unicode Italic', >>> 'F Mahfuj Art Unicode', >>> 'FL Niribili Plain Unicode', >>> 'FL Niribili Plain Unicode Itali Italic' >>> ] # Add more fonts as needed >>> >>> lines = [] >>> >>> with open(training_text_file, 'r') as input_file: >>> for line in input_file.readlines(): >>> lines.append(line.strip()) >>> >>> output_directory = 'tesstrain/data/ben-ground-truth' >>> >>> if not os.path.exists(output_directory): >>> os.mkdir(output_directory) >>> >>> random.shuffle(lines) >>> >>> count = 100 >>> >>> lines = lines[:count] >>> >>> line_count = 0 >>> for line in lines: >>> for font in font_list: >>> training_text_file_name = pathlib.Path(training_text_file).stem >>> line_training_text = os.path.join( >>> output_directory, >>> f'{training_text_file_name}_{line_count}.gt.txt') >>> with open(line_training_text, 'w') as output_file: >>> output_file.writelines([line]) >>> >>> file_base_name = f'ben_{line_count}' >>> >>> subprocess.run([ >>> 'text2image', >>> f'--font={font}', >>> f'--text={line_training_text}', >>> f'--outputbase={output_directory}/{file_base_name}', >>> '--max_pages=1', >>> '--strip_unrenderable_words', >>> '--leading=32', >>> '--xsize=3600', >>> '--ysize=350', >>> '--char_spacing=1.0', >>> '--exposure=0', >>> '--unicharset_file=langdata/ben.unicharset' >>> ]) >>> >>> line_count += 1 >>> >>> >>> >>> and this command is for training : >>> >>> import subprocess >>> >>> # List of font names >>> font_names = ['ben'] >>> >>> for font in font_names: >>> command = f"TESSDATA_PREFIX=../tesseract/tessdata make training >>> MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata >>> MAX_ITERATIONS=10000 LANG_TYPE=Indic" >>> subprocess.run(command, shell=True) >> >> -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. 
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/bfccab71-1bc4-46f2-8301-38732db5b73an%40googlegroups.com.