OK, I will try as you said. One more thing: what is the role of the trained_text lines? I have seen that Bengali text has long lines of words, so I want to know how many words or characters per line would be the better choice for training. Also, should '--xsize=3600', '--ysize=350' be set according to the number of words per line?
On Thursday, 10 August, 2023 at 1:10:14 am UTC+6 shree wrote: > Include the default fonts also in your fine-tuning list of fonts and see > if that helps. > > On Wed, Aug 9, 2023, 2:27 PM Ali hussain <mdalihu...@gmail.com> wrote: > >> I have trained some new fonts by fine-tune methods for the Bengali >> language in Tesseract 5 and I have used all official trained_text and >> tessdata_best and other things also. everything is good but the problem is >> the default font which was trained before that does not convert text like >> prev but my new fonts work well. I don't understand why it's happening. I >> share code based to understand what going on. >> >> >> *codes for creating tif, gt.txt, .box files:* >> import os >> import random >> import pathlib >> import subprocess >> import argparse >> from FontList import FontList >> >> def read_line_count(): >> if os.path.exists('line_count.txt'): >> with open('line_count.txt', 'r') as file: >> return int(file.read()) >> return 0 >> >> def write_line_count(line_count): >> with open('line_count.txt', 'w') as file: >> file.write(str(line_count)) >> >> def create_training_data(training_text_file, font_list, output_directory, >> start_line=None, end_line=None): >> lines = [] >> with open(training_text_file, 'r') as input_file: >> for line in input_file.readlines(): >> lines.append(line.strip()) >> >> if not os.path.exists(output_directory): >> os.mkdir(output_directory) >> >> random.shuffle(lines) >> >> if start_line is None: >> line_count = read_line_count() # Set the starting line_count >> from the file >> else: >> line_count = start_line >> >> if end_line is None: >> end_line_count = len(lines) - 1 # Set the ending line_count >> else: >> end_line_count = min(end_line, len(lines) - 1) >> >> for font in font_list.fonts: # Iterate through all the fonts in the >> font_list >> font_serial = 1 >> for line in lines: >> training_text_file_name = pathlib.Path(training_text_file >> ).stem >> >> # Generate a unique serial number for each 
line >> line_serial = f"{line_count:d}" >> >> # GT (Ground Truth) text filename >> line_gt_text = os.path.join(output_directory, f'{ >> training_text_file_name}_{line_serial}.gt.txt') >> with open(line_gt_text, 'w') as output_file: >> output_file.writelines([line]) >> >> # Image filename >> file_base_name = f'ben_{line_serial}' # Unique filename for >> each font >> subprocess.run([ >> 'text2image', >> f'--font={font}', >> f'--text={line_gt_text}', >> f'--outputbase={output_directory}/{file_base_name}', >> '--max_pages=1', >> '--strip_unrenderable_words', >> '--leading=36', >> '--xsize=3600', >> '--ysize=350', >> '--char_spacing=1.0', >> '--exposure=0', >> '--unicharset_file=langdata/ben.unicharset', >> ]) >> >> line_count += 1 >> font_serial += 1 >> >> # Reset font_serial for the next font iteration >> font_serial = 1 >> >> write_line_count(line_count) # Update the line_count in the file >> >> if __name__ == "__main__": >> parser = argparse.ArgumentParser() >> parser.add_argument('--start', type=int, help='Starting line count >> (inclusive)') >> parser.add_argument('--end', type=int, help='Ending line count >> (inclusive)') >> args = parser.parse_args() >> >> training_text_file = 'langdata/ben.training_text' >> output_directory = 'tesstrain/data/ben-ground-truth' >> >> # Create an instance of the FontList class >> font_list = FontList() >> >> create_training_data(training_text_file, font_list, >> output_directory, args.start, args.end) >> >> >> *and for training code:* >> >> import subprocess >> >> # List of font names >> font_names = ['ben'] >> >> for font in font_names: >> command = f"TESSDATA_PREFIX=../tesseract/tessdata make training >> MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata >> MAX_ITERATIONS=10000 LANG_TYPE=Indic" >> subprocess.run(command, shell=True) >> >> >> any suggestion to identify to extract the problem. 
>> thanks, everyone >> >> -- >> You received this message because you are subscribed to the Google Groups >> "tesseract-ocr" group. >> To unsubscribe from this group and stop receiving emails from it, send an >> email to tesseract-oc...@googlegroups.com. >> To view this discussion on the web visit >> https://groups.google.com/d/msgid/tesseract-ocr/406cd733-b265-4118-a7ca-de75871cac39n%40googlegroups.com >> >> <https://groups.google.com/d/msgid/tesseract-ocr/406cd733-b265-4118-a7ca-de75871cac39n%40googlegroups.com?utm_medium=email&utm_source=footer> >> . >> > -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/7521396f-908e-446b-bc3f-9f833dda997bn%40googlegroups.com.