[tesseract-ocr] accuracy problem after trained in fine-tune

Ali hussain Wed, 09 Aug 2023 11:27:02 -0700

I have fine-tuned Tesseract 5 on some new fonts for the Bengali language, 
using the official training_text, tessdata_best, and the other standard 
resources. Everything works, but there is one problem: text in the default 
font, which the model was trained on before, is no longer recognized as well 
as it was previously, while my new fonts work well. I don't understand why 
this is happening. I am sharing my code below to show what is going on.



*Code for creating the .tif, .gt.txt, and .box files:*
import os
import random
import pathlib
import subprocess
import argparse
from FontList import FontList

def read_line_count():
    """Return the integer stored in ``line_count.txt``.

    The file is looked up relative to the current working directory.
    When it does not exist, 0 is returned.
    """
    counter_path = 'line_count.txt'
    if not os.path.exists(counter_path):
        return 0
    with open(counter_path, 'r') as counter_file:
        stored = counter_file.read()
    return int(stored)

def write_line_count(line_count):
    """Overwrite ``line_count.txt`` with the string form of *line_count*.

    The file is created in (or replaced within) the current working
    directory; no trailing newline is written.
    """
    with open('line_count.txt', mode='w') as counter_file:
        serialized = str(line_count)
        counter_file.write(serialized)

def create_training_data(training_text_file, font_list, output_directory,
                         start_line=None, end_line=None):
    """Render every training-text line in every font via ``text2image``.

    For each font in ``font_list.fonts`` and each (shuffled, stripped) line
    of ``training_text_file``, this writes a ``<stem>_<serial>.gt.txt`` file
    into ``output_directory`` and invokes ``text2image`` with the matching
    ``<stem>_<serial>`` output base, where ``<stem>`` is the training text
    file name without its final extension. The serial number keeps
    increasing across fonts, and its final value is persisted with
    ``write_line_count``.

    Args:
        training_text_file: Path to the text file with one sample per line.
        font_list: Object exposing an iterable ``fonts`` attribute of font
            names understood by ``text2image``.
        output_directory: Directory that receives the generated files; it
            is created (including parents) when missing.
        start_line: First serial number to use. When ``None`` the value is
            taken from ``read_line_count()``.
        end_line: Accepted for interface compatibility but NOT applied.
            NOTE(review): the original computed an end bound and never used
            it, so every line is always rendered; the intended semantics
            (index into the shuffled lines vs. serial number) should be
            confirmed before wiring it in.
    """
    # Training text is Bengali; read and write it explicitly as UTF-8 so the
    # result does not depend on the platform's default locale encoding.
    with open(training_text_file, 'r', encoding='utf-8') as input_file:
        lines = [line.strip() for line in input_file]

    # makedirs also creates missing parents such as 'tesstrain/data'.
    os.makedirs(output_directory, exist_ok=True)

    random.shuffle(lines)

    # Set the starting line_count from the file unless one was given.
    line_count = read_line_count() if start_line is None else start_line

    # Loop-invariant: shared base name for ground-truth and image files.
    training_text_file_name = pathlib.Path(training_text_file).stem

    for font in font_list.fonts:  # Iterate through all the fonts
        for line in lines:
            # Generate a unique serial number for each line
            line_serial = f"{line_count:d}"

            # Use the same base name for the transcription and the image so
            # the two can be paired by name (previously the image prefix was
            # hard-coded to 'ben_', which only matched a 'ben.*' text file).
            file_base_name = f'{training_text_file_name}_{line_serial}'

            # GT (Ground Truth) text filename
            line_gt_text = os.path.join(
                output_directory, f'{file_base_name}.gt.txt')
            with open(line_gt_text, 'w', encoding='utf-8') as output_file:
                output_file.write(line)

            # Return code is intentionally not checked (best effort, as in
            # the original): one unrenderable line does not abort the run.
            subprocess.run([
                'text2image',
                f'--font={font}',
                f'--text={line_gt_text}',
                f'--outputbase={output_directory}/{file_base_name}',
                '--max_pages=1',
                '--strip_unrenderable_words',
                '--leading=36',
                '--xsize=3600',
                '--ysize=350',
                '--char_spacing=1.0',
                '--exposure=0',
                '--unicharset_file=langdata/ben.unicharset',
            ])

            line_count += 1

    write_line_count(line_count)  # Update the line_count in the file

if __name__ == "__main__":
    # Command-line interface: optional integer bounds forwarded unchanged
    # to create_training_data.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--start',
        type=int,
        help='Starting line count (inclusive)',
    )
    cli.add_argument(
        '--end',
        type=int,
        help='Ending line count (inclusive)',
    )
    options = cli.parse_args()

    # Fixed input text and output location for the Bengali ground truth;
    # FontList() supplies the fonts to render.
    create_training_data(
        'langdata/ben.training_text',
        FontList(),
        'tesstrain/data/ben-ground-truth',
        options.start,
        options.end,
    )


*And the training code:*

import subprocess

# Model names to train; each one is passed to the Makefile as MODEL_NAME.
font_names = ['ben']

for model_name in font_names:
    # One shell command line: TESSDATA_PREFIX is set for the `make` process
    # via the shell's VAR=value prefix syntax, hence shell=True.
    make_command = (
        "TESSDATA_PREFIX=../tesseract/tessdata make training "
        f"MODEL_NAME={model_name} START_MODEL=ben TESSDATA=../tesseract/tessdata "
        "MAX_ITERATIONS=10000 LANG_TYPE=Indic"
    )
    subprocess.run(make_command, shell=True)


Any suggestions on how to identify the cause of the problem would be appreciated.
Thanks, everyone.

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/406cd733-b265-4118-a7ca-de75871cac39n%40googlegroups.com.

[tesseract-ocr] accuracy problem after trained in fine-tune

Reply via email to