It's okay, I managed to solve the issue; don't worry about this anymore. On Monday, 28 April 2025 at 15:59:37 UTC+8 zdenop wrote:
> > > β Training failed: Command '['make', 'unicharset', 'lists', >> 'proto-model', 'tesseract-langdata', 'training', 'MODEL_NAME=jpn1', >> 'START_MODEL=jpn', 'TESSDATA=C:/Users/Chan Jian >> Sen/Documents/TesseractFineT Training failed: Command > > > Why are you showing us so much python code if your shell command fails? Or > does it work in the terminal? > > ) BTW: > >> for fname in os.listdir(tessdata_path): >> if fname.startswith(model_prefix) and fname.endswith( >> ".traineddata"): >> suffix = fname[len(model_prefix):-len(".traineddata")] > > > What is your python version? 2.x? Have you heard about `pathlib` (or > `glob`)? > > Zdenko > > > po 28. 4. 2025 o 9:43 Jiansen Chan <jian...@gmail.com> napΓsal(a): > >> My goal is to automate model training in tesseract OCR for Japanese >> words. The user should just paste ground truth files and picture files into >> a particular folder, and then use that data to train a new model. this >> process should be able to be carried out multiple times. Every single time >> data is added to the folder I expect an automated model training. >> >> However, this is the error that i run into when I try to run automated >> tesseract training on VSCode. What I did is that I had a script that uses >> watchdog to detect newly added .tif/.png files alongside their >> corresponding .gt.txt files into a particular folder (from which the model >> is supposed to treat as training data and use it to train). 
The watcher >> file looks something like this: >> >> (watcher.py) >> >> import time >> import os >> from watchdog.observers import Observer >> from watchdog.events import FileSystemEventHandler >> from pathlib import Path >> from training.tesseract_training import run_tesseract_training >> from training.training_model_utils import get_latest_and_next_model >> WATCHED_FOLDER = r"C:\Users\Chan Jian Sen\Documents\ocr-japanese\I >> NPUT_TRAINING_DATA" #ground truth put here >> tesstrain_dir = r"C:\Users\Chan Jian Sen\Documents\T >> esseractFineTuningJpn5\tesstrain" >> >> class TrainingInputHandler(FileSystemEventHandler): >> >> def on_modified(self, event): >> self.check_and_trigger_training() >> >> def on_created(self, event): >> self.check_and_trigger_training() >> >> def check_and_trigger_training(self): >> files = os.listdir(WATCHED_FOLDER) >> pngs = {Path(f).stem for f in files if f.endswith('.png')} >> gts = {Path(f).stem for f in files if f.endswith('.gt.txt')} >> common = pngs & gts >> >> if len(common) == 0: >> print("β³ Waiting for matching .png and .gt.txt pairs...") >> >> >> tessdata_path = r"C:\Users\Chan Jian Sen\Documents\T >> esseractFineTuningJpn5\tessdata" >> start_model, new_model = get_latest_and_next_model(tessdata_path) >> >> print(f"π Using {start_model} as base, training new model: { >> new_model}") #problem here is the the old model they saw it as jpn and >> the new model as jpn1 >> >> run_tesseract_training(tesstrain_dir, new_model, start_model) #the >> first parameter MUST be your tesstrain folder >> observer.stop() >> >> if __name__ == "__main__": >> print(f"π Watching training data folder: {WATCHED_FOLDER}") >> event_handler = TrainingInputHandler() >> observer = Observer() >> observer.schedule(event_handler, WATCHED_FOLDER, recursive=False) >> observer.start() >> >> try: >> while observer.is_alive(): >> time.sleep(1) >> except KeyboardInterrupt: >> observer.stop() >> observer.join() >> >> >> >> >> To generate a new model name (since I want 
to automate model training), i >> also have these functions here: >> (training_model_utils.py) >> import os >> >> def get_model_names(tessdata_path, model_prefix="jpn"): >> models = [] >> for fname in os.listdir(tessdata_path): >> if fname.startswith(model_prefix) and fname.endswith( >> ".traineddata"): >> suffix = fname[len(model_prefix):-len(".traineddata")] >> if suffix == "": >> models.append((0, "jpn")) >> elif suffix.isdigit(): >> models.append((int(suffix), f"{model_prefix}{suffix}")) >> models.sort() >> return models >> >> def get_latest_and_next_model(tessdata_path, model_prefix="jpn"): >> models = get_model_names(tessdata_path, model_prefix) >> if not models: >> return model_prefix, f"{model_prefix}2" >> latest = models[-1][1] >> next_num = models[-1][0] + 1 >> next_model = f"{model_prefix}{next_num}" if next_num > 0 else f"{ >> model_prefix}2" >> return latest, next_model >> >> I also coded the make training procedure into VSCode, with a python >> script that calls for it. This code snippet below is meant to run the >> tesseract training. >> (tesseract_training.py) >> import subprocess >> import os >> >> def run_tesseract_training(training_dir, model_name, start_model, >> max_iterations=4000): #previously start model is jpn >> """ >> Run the full Tesseract tesstrain workflow including unicharset and >> langdata. 
>> """ >> tessdata_path = r"C:\Users\Chan Jian Sen\Documents\T >> esseractFineTuningJpn5\tessdata" >> # Important: replace backslashes with forward slashes >> tessdata_path = tessdata_path.replace("\\", "/") >> command = [ >> "make", >> "unicharset", "lists", "proto-model", "tesseract-langdata", >> "training", >> f"MODEL_NAME={model_name}", >> f"START_MODEL={start_model}", >> f"TESSDATA={tessdata_path}", # Adjust path depending on where >> your .traineddata are >> f"GROUND_TRUTH_DIR={training_dir}", >> f"MAX_ITERATIONS={max_iterations}", >> "LEARNING_RATE=0.001" >> ] >> >> print("π Running full Tesseract training pipeline...") >> try: >> subprocess.run(command, cwd=r"C:\Users\Chan Jian Sen\Documents\T >> esseractFineTuningJpn5\tesstrain", shell=True, check=True) >> print(f"β Training complete: {model_name}.traineddata generated." >> ) >> except subprocess.CalledProcessError as e: >> print(f"β Training failed: {e}") >> >> However, when I run the code an issue appears, and I'm not sure how to >> deal with it: >> >> >> PS C:\Users\Chan Jian Sen\Documents\ocr-japanese> c:; cd 'c:\Users\Chan >> Jian Sen\Documents\ocr-japanese'; & 'c:\Users\Chan J >> Sen\.vscode\extensions\ms-python.debugpy-2025.6.0-win32-x64\bundled\libs\debugpy\launcher' >> >> '58725' '--' 'C:\Users\Chan Jian S >> π Watching training data folder: C:\Users\Chan Jian >> Sen\Documents\ocr-japanese\INPUT_TRAINING_DATA >> β³ Waiting for matching .png and .gt.txt pairs... >> π Using jpn as base, training new model: jpn1 >> π Running full Tesseract training pipeline... 
>> You are using make version: 4.4.1 >> Makefile:438: *** mixed implicit and normal rules: deprecated syntax >> combine_tessdata -u C:/Users/Chan Jian >> Sen/Documents/TesseractFineTuningJpn5/tessdata/jpn.traineddata data/jpn/jpn1 >> π Watching training data folder: C:\Users\Chan Jian >> Sen\Documents\ocr-japanπ Watching training data folder: C:\Users\Chan Jian >> Sen\Documents\ocr-japanese\INPUT_TRAINING_DATA >> π Watching training data folder: C:\Users\Chan Jian >> Sen\Documents\ocr-japanπ Watching training data folder: C:\Users\Chan Jian >> Sen\Documents\ocr-japanese\INPUT_TRAINING_DATA >> β³ Waiting for matching .png and .gt.txt pairs... >> π Using jpn as base, training new model: jpn1 >> π Running full Tesseract training pipeline... >> You are using make version: 4.4.1 >> Makefile:438: *** mixed implicit and normal rules: deprecated syntax >> combine_tessdata -u C:/Users/Chan Jian >> Sen/Documents/TesseractFineTuningJpn5/tessdata/jpn.traineddata data/jpn/jpn1 >> Failed to read C:/Users/Chan >> make: *** [Makefile:207: data/jpn/jpn1.lstm-unicharset] Error 1 >> β Training failed: Command '['make', 'unicharset', 'lists', >> 'proto-model', 'tesseract-langdata', 'training', 'MODEL_NAME=jpn1', >> 'START_MODEL=jpn', 'TESSDATA=C:/Users/Chan Jian >> Sen/Documents/TesseractFineT Training failed: Command '['make', >> 'unicharset', 'lists', 'proto-model', ATIONS=4000', 'LEARNING_RATE=0.001']' >> returned no >> uningJpn5/tessdata', 'GROUND_TRUTH_DIR=C:\\Users\\Chan Jian >> Sen\\Documents\\TesseractFineTuningJpn5\\tesstrain', 'MAX_ITERATIONS=4000', >> 'LEARNING_RATE=0.001']' returned non-zero exit status 2. >> >> (Yellow parts is the error). Would greatly appreciate for any help given! >> Sorry if it looks complicated hahah >> >> -- >> You received this message because you are subscribed to the Google Groups >> "tesseract-ocr" group. >> To unsubscribe from this group and stop receiving emails from it, send an >> email to tesseract-oc...@googlegroups.com. 
>> To view this discussion visit >> https://groups.google.com/d/msgid/tesseract-ocr/1d1b27e3-fd8d-43c5-a801-50cfcaa196efn%40googlegroups.com >> >> <https://groups.google.com/d/msgid/tesseract-ocr/1d1b27e3-fd8d-43c5-a801-50cfcaa196efn%40googlegroups.com?utm_medium=email&utm_source=footer> >> . >> > -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion visit https://groups.google.com/d/msgid/tesseract-ocr/2aecc2d6-a549-4063-b652-e017456eed59n%40googlegroups.com.