Also include the default fonts in your fine-tuning font list, and see if that helps.
On Wed, Aug 9, 2023, 2:27 PM Ali hussain <mdalihussain...@gmail.com> wrote: > I have trained some new fonts by fine-tune methods for the Bengali > language in Tesseract 5 and I have used all official trained_text and > tessdata_best and other things also. everything is good but the problem is > the default font which was trained before that does not convert text like > prev but my new fonts work well. I don't understand why it's happening. I > share code based to understand what going on. > > > *codes for creating tif, gt.txt, .box files:* > import os > import random > import pathlib > import subprocess > import argparse > from FontList import FontList > > def read_line_count(): > if os.path.exists('line_count.txt'): > with open('line_count.txt', 'r') as file: > return int(file.read()) > return 0 > > def write_line_count(line_count): > with open('line_count.txt', 'w') as file: > file.write(str(line_count)) > > def create_training_data(training_text_file, font_list, output_directory, > start_line=None, end_line=None): > lines = [] > with open(training_text_file, 'r') as input_file: > for line in input_file.readlines(): > lines.append(line.strip()) > > if not os.path.exists(output_directory): > os.mkdir(output_directory) > > random.shuffle(lines) > > if start_line is None: > line_count = read_line_count() # Set the starting line_count > from the file > else: > line_count = start_line > > if end_line is None: > end_line_count = len(lines) - 1 # Set the ending line_count > else: > end_line_count = min(end_line, len(lines) - 1) > > for font in font_list.fonts: # Iterate through all the fonts in the > font_list > font_serial = 1 > for line in lines: > training_text_file_name = pathlib.Path(training_text_file > ).stem > > # Generate a unique serial number for each line > line_serial = f"{line_count:d}" > > # GT (Ground Truth) text filename > line_gt_text = os.path.join(output_directory, f'{ > training_text_file_name}_{line_serial}.gt.txt') > with open(line_gt_text, 'w') 
as output_file: > output_file.writelines([line]) > > # Image filename > file_base_name = f'ben_{line_serial}' # Unique filename for > each font > subprocess.run([ > 'text2image', > f'--font={font}', > f'--text={line_gt_text}', > f'--outputbase={output_directory}/{file_base_name}', > '--max_pages=1', > '--strip_unrenderable_words', > '--leading=36', > '--xsize=3600', > '--ysize=350', > '--char_spacing=1.0', > '--exposure=0', > '--unicharset_file=langdata/ben.unicharset', > ]) > > line_count += 1 > font_serial += 1 > > # Reset font_serial for the next font iteration > font_serial = 1 > > write_line_count(line_count) # Update the line_count in the file > > if __name__ == "__main__": > parser = argparse.ArgumentParser() > parser.add_argument('--start', type=int, help='Starting line count > (inclusive)') > parser.add_argument('--end', type=int, help='Ending line count > (inclusive)') > args = parser.parse_args() > > training_text_file = 'langdata/ben.training_text' > output_directory = 'tesstrain/data/ben-ground-truth' > > # Create an instance of the FontList class > font_list = FontList() > > create_training_data(training_text_file, font_list, output_directory, > args.start, args.end) > > > *and for training code:* > > import subprocess > > # List of font names > font_names = ['ben'] > > for font in font_names: > command = f"TESSDATA_PREFIX=../tesseract/tessdata make training > MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata > MAX_ITERATIONS=10000 LANG_TYPE=Indic" > subprocess.run(command, shell=True) > > > any suggestion to identify to extract the problem. > thanks, everyone > > -- > You received this message because you are subscribed to the Google Groups > "tesseract-ocr" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to tesseract-ocr+unsubscr...@googlegroups.com. 
> To view this discussion on the web visit > https://groups.google.com/d/msgid/tesseract-ocr/406cd733-b265-4118-a7ca-de75871cac39n%40googlegroups.com > <https://groups.google.com/d/msgid/tesseract-ocr/406cd733-b265-4118-a7ca-de75871cac39n%40googlegroups.com?utm_medium=email&utm_source=footer> > . > -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduX7zV88zLZzVaWDMgjX_4AxhLi710u44WqOsBVxaTyL6Q%40mail.gmail.com.