One more thing: I have used langdata_best. On Wednesday, 9 August, 2023 at 11:39:21 pm UTC+6 Ali hussain wrote:
> I have trained some new fonts using the fine-tuning method for the Bengali > language in Tesseract 5, and I have used the official training_text and the other > files as well. Everything works, but the problem is that the default font, which > was trained before, no longer converts text as well as it did previously, while my new fonts > work well. I don't understand why this is happening. I am sharing the code to > help understand what is going on. > > > *Code for creating the .tif, .gt.txt, and .box files:* > import os > import random > import pathlib > import subprocess > import argparse > from FontList import FontList > > def read_line_count(): > if os.path.exists('line_count.txt'): > with open('line_count.txt', 'r') as file: > return int(file.read()) > return 0 > > def write_line_count(line_count): > with open('line_count.txt', 'w') as file: > file.write(str(line_count)) > > def create_training_data(training_text_file, font_list, output_directory, > start_line=None, end_line=None): > lines = [] > with open(training_text_file, 'r') as input_file: > for line in input_file.readlines(): > lines.append(line.strip()) > > if not os.path.exists(output_directory): > os.mkdir(output_directory) > > random.shuffle(lines) > > if start_line is None: > line_count = read_line_count() # Set the starting line_count > from the file > else: > line_count = start_line > > if end_line is None: > end_line_count = len(lines) - 1 # Set the ending line_count > else: > end_line_count = min(end_line, len(lines) - 1) > > for font in font_list.fonts: # Iterate through all the fonts in the > font_list > font_serial = 1 > for line in lines: > training_text_file_name = pathlib.Path(training_text_file > ).stem > > # Generate a unique serial number for each line > line_serial = f"{line_count:d}" > > # GT (Ground Truth) text filename > line_gt_text = os.path.join(output_directory, f'{ > training_text_file_name}_{line_serial}.gt.txt') > with open(line_gt_text, 'w') as output_file: > output_file.writelines([line]) > > # Image filename > file_base_name = 
f'ben_{line_serial}' # Unique filename for > each font > subprocess.run([ > 'text2image', > f'--font={font}', > f'--text={line_gt_text}', > f'--outputbase={output_directory}/{file_base_name}', > '--max_pages=1', > '--strip_unrenderable_words', > '--leading=36', > '--xsize=3600', > '--ysize=350', > '--char_spacing=1.0', > '--exposure=0', > '--unicharset_file=langdata/ben.unicharset', > ]) > > line_count += 1 > font_serial += 1 > > # Reset font_serial for the next font iteration > font_serial = 1 > > write_line_count(line_count) # Update the line_count in the file > > if __name__ == "__main__": > parser = argparse.ArgumentParser() > parser.add_argument('--start', type=int, help='Starting line count > (inclusive)') > parser.add_argument('--end', type=int, help='Ending line count > (inclusive)') > args = parser.parse_args() > > training_text_file = 'langdata/ben.training_text' > output_directory = 'tesstrain/data/ben-ground-truth' > > # Create an instance of the FontList class > font_list = FontList() > > create_training_data(training_text_file, font_list, output_directory, > args.start, args.end) > > > *And the code for training:* > > import subprocess > > # List of font names > font_names = ['ben'] > > for font in font_names: > command = f"TESSDATA_PREFIX=../tesseract/tessdata make training > MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata > MAX_ITERATIONS=10000 LANG_TYPE=Indic" > subprocess.run(command, shell=True) > > > Any suggestions for identifying the cause of the problem would be appreciated. > Thanks, everyone. > > > > > > > > -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/a8422cae-bfd9-4e29-b8d5-3a9ac9fe623fn%40googlegroups.com.