[tesseract-ocr] Re: the unexpected result after trained data

Ali hussain Wed, 09 Aug 2023 11:24:47 -0700

And one more thing: I have used langdata_best.
On Wednesday, 9 August, 2023 at 11:39:21 pm UTC+6 Ali hussain wrote:


> I have trained some new fonts using the fine-tuning method for the Bengali 
> language in Tesseract 5, and I used all of the official training text and the 
> other official files as well. Everything works, but the problem is that the 
> default font, which the model had already been trained on, is no longer 
> recognized as well as it was previously, while my new fonts work well. I don't 
> understand why this is happening. I am sharing my code below to help explain 
> what is going on.
>
>
> *Code for creating the .tif, .gt.txt, and .box files:*
> import os
> import random
> import pathlib
> import subprocess
> import argparse
> from FontList import FontList
>
> def read_line_count():
>     if os.path.exists('line_count.txt'):
>         with open('line_count.txt', 'r') as file:
>             return int(file.read())
>     return 0
>
> def write_line_count(line_count):
>     with open('line_count.txt', 'w') as file:
>         file.write(str(line_count))
>
> def create_training_data(training_text_file, font_list, output_directory, 
> start_line=None, end_line=None):
>     lines = []
>     with open(training_text_file, 'r') as input_file:
>         for line in input_file.readlines():
>             lines.append(line.strip())
>     
>     if not os.path.exists(output_directory):
>         os.mkdir(output_directory)
>     
>     random.shuffle(lines)
>     
>     if start_line is None:
>         line_count = read_line_count()  # Set the starting line_count 
> from the file
>     else:
>         line_count = start_line
>     
>     if end_line is None:
>         end_line_count = len(lines) - 1  # Set the ending line_count
>     else:
>         end_line_count = min(end_line, len(lines) - 1)
>     
>     for font in font_list.fonts:  # Iterate through all the fonts in the 
> font_list
>         font_serial = 1
>         for line in lines:
>             training_text_file_name = pathlib.Path(training_text_file
> ).stem
>             
>             # Generate a unique serial number for each line
>             line_serial = f"{line_count:d}"
>             
>             # GT (Ground Truth) text filename
>             line_gt_text = os.path.join(output_directory, f'{
> training_text_file_name}_{line_serial}.gt.txt')
>             with open(line_gt_text, 'w') as output_file:
>                 output_file.writelines([line])
>             
>             # Image filename
>             file_base_name = f'ben_{line_serial}'  # Unique filename for 
> each font
>             subprocess.run([
>                 'text2image',
>                 f'--font={font}',
>                 f'--text={line_gt_text}',
>                 f'--outputbase={output_directory}/{file_base_name}',
>                 '--max_pages=1',
>                 '--strip_unrenderable_words',
>                 '--leading=36',
>                 '--xsize=3600',
>                 '--ysize=350',
>                 '--char_spacing=1.0',
>                 '--exposure=0',
>                 '--unicharset_file=langdata/ben.unicharset',
>             ])
>             
>             line_count += 1
>             font_serial += 1
>         
>         # Reset font_serial for the next font iteration
>         font_serial = 1
>     
>     write_line_count(line_count)  # Update the line_count in the file
>
> if __name__ == "__main__":
>     parser = argparse.ArgumentParser()
>     parser.add_argument('--start', type=int, help='Starting line count 
> (inclusive)')
>     parser.add_argument('--end', type=int, help='Ending line count 
> (inclusive)')
>     args = parser.parse_args()
>     
>     training_text_file = 'langdata/ben.training_text'
>     output_directory = 'tesstrain/data/ben-ground-truth'
>     
>     # Create an instance of the FontList class
>     font_list = FontList()
>      
>     create_training_data(training_text_file, font_list, output_directory, 
> args.start, args.end)
>
>
> *And the training code:*
>
> import subprocess
>
> # List of font names
> font_names = ['ben']
>
> for font in font_names:
>     command = f"TESSDATA_PREFIX=../tesseract/tessdata make training 
> MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata 
> MAX_ITERATIONS=10000 LANG_TYPE=Indic"
>     subprocess.run(command, shell=True)
>
>
> Any suggestions on how to identify the cause of the problem would be appreciated.
> Thanks, everyone.
>
>
>
>
>
>
>
>

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/a8422cae-bfd9-4e29-b8d5-3a9ac9fe623fn%40googlegroups.com.

[tesseract-ocr] Re: the unexpected result after trained data

Reply via email to