I have fine-tuned Tesseract 5 on some new fonts for the Bengali language, using the official training text, tessdata_best, and the other standard resources. Everything works, except for one problem: text in the default fonts, which the original model was trained on, is no longer recognized as well as before, while my new fonts work well. I don't understand why this is happening. I am sharing my code below to help show what is going on.
*codes for creating tif, gt.txt, .box files:* import os import random import pathlib import subprocess import argparse from FontList import FontList def read_line_count(): if os.path.exists('line_count.txt'): with open('line_count.txt', 'r') as file: return int(file.read()) return 0 def write_line_count(line_count): with open('line_count.txt', 'w') as file: file.write(str(line_count)) def create_training_data(training_text_file, font_list, output_directory, start_line=None, end_line=None): lines = [] with open(training_text_file, 'r') as input_file: for line in input_file.readlines(): lines.append(line.strip()) if not os.path.exists(output_directory): os.mkdir(output_directory) random.shuffle(lines) if start_line is None: line_count = read_line_count() # Set the starting line_count from the file else: line_count = start_line if end_line is None: end_line_count = len(lines) - 1 # Set the ending line_count else: end_line_count = min(end_line, len(lines) - 1) for font in font_list.fonts: # Iterate through all the fonts in the font_list font_serial = 1 for line in lines: training_text_file_name = pathlib.Path(training_text_file).stem # Generate a unique serial number for each line line_serial = f"{line_count:d}" # GT (Ground Truth) text filename line_gt_text = os.path.join(output_directory, f'{ training_text_file_name}_{line_serial}.gt.txt') with open(line_gt_text, 'w') as output_file: output_file.writelines([line]) # Image filename file_base_name = f'ben_{line_serial}' # Unique filename for each font subprocess.run([ 'text2image', f'--font={font}', f'--text={line_gt_text}', f'--outputbase={output_directory}/{file_base_name}', '--max_pages=1', '--strip_unrenderable_words', '--leading=36', '--xsize=3600', '--ysize=350', '--char_spacing=1.0', '--exposure=0', '--unicharset_file=langdata/ben.unicharset', ]) line_count += 1 font_serial += 1 # Reset font_serial for the next font iteration font_serial = 1 write_line_count(line_count) # Update the line_count in the file if 
__name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--start', type=int, help='Starting line count (inclusive)') parser.add_argument('--end', type=int, help='Ending line count (inclusive)') args = parser.parse_args() training_text_file = 'langdata/ben.training_text' output_directory = 'tesstrain/data/ben-ground-truth' # Create an instance of the FontList class font_list = FontList() create_training_data(training_text_file, font_list, output_directory, args.start, args.end) *and for training code:* import subprocess # List of font names font_names = ['ben'] for font in font_names: command = f"TESSDATA_PREFIX=../tesseract/tessdata make training MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata MAX_ITERATIONS=10000 LANG_TYPE=Indic" subprocess.run(command, shell=True) any suggestion to identify to extract the problem. thanks, everyone -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/406cd733-b265-4118-a7ca-de75871cac39n%40googlegroups.com.