It's okay, I've solved the issue. I hadn't put enough training data into the folder, so there was no training data for the model to train on.
On Monday, 28 April 2025 at 10:43:05 UTC+8 Jiansen Chan wrote: > My goal is to automate model training in tesseract OCR for Japanese words. > The user should just paste ground truth files and picture files into a > particular folder, and then use that data to train a new model. this > process should be able to be carried out multiple times. Every single time > data is added to the folder I expect an automated model training. > > However, this is the error that i run into when I try to run automated > tesseract training on VSCode. What I did is that I had a script that uses > watchdog to detect newly added .tif/.png files alongside their > corresponding .gt.txt files into a particular folder (from which the model > is supposed to treat as training data and use it to train). The watcher > file looks something like this: > > (watcher_trainng.py) > import time > import os > from watchdog.observers import Observer > from watchdog.events import FileSystemEventHandler > from pathlib import Path > from training.tesseract_training import run_tesseract_training > from training.training_model_utils import get_latest_and_next_model > WATCHED_FOLDER = r"C:\Users\Chan Jian Sen\Documents\ocr-japanese\I > NPUT_TRAINING_DATA" #ground truth put here > tesstrain_dir = r"C:\Users\Chan Jian Sen\Documents\TesseractFineTuningJpn5 > \tesstrain" > > > class TrainingInputHandler(FileSystemEventHandler): > > def on_modified(self, event): > self.check_and_trigger_training() > > def on_created(self, event): > self.check_and_trigger_training() > > def check_and_trigger_training(self): > files = os.listdir(WATCHED_FOLDER) > pngs = {Path(f).stem for f in files if f.endswith('.png')} > gts = {Path(f).stem for f in files if f.endswith('.gt.txt')} > common = pngs & gts > > if len(common) == 0: > print("⏳ Waiting for matching .png and .gt.txt pairs...") > > tessdata_path = r"C:\Users\Chan Jian Sen\Documents\T > esseractFineTuningJpn5\tessdata" > start_model, new_model = 
get_latest_and_next_model(tessdata_path) > > print(f"🔁 Using {start_model} as base, training new model: { > new_model}") #problem here is the the old model they saw it as jpn and > the new model as jpn1 > > run_tesseract_training(tesstrain_dir, new_model, start_model) #the > first parameter MUST be your tesstrain folder > observer.stop() > > if __name__ == "__main__": > print(f"👀 Watching training data folder: {WATCHED_FOLDER}") > event_handler = TrainingInputHandler() > observer = Observer() > observer.schedule(event_handler, WATCHED_FOLDER, recursive=False) > observer.start() > > try: > while observer.is_alive(): > time.sleep(1) > except KeyboardInterrupt: > observer.stop() > observer.join() > > To generate a new model name (since I want to automate model training), i > also have these functions here: > (training_model_utils.py) > import os > > def get_model_names(tessdata_path, model_prefix="jpn"): > models = [] > for fname in os.listdir(tessdata_path): > if fname.startswith(model_prefix) and fname.endswith( > ".traineddata"): > suffix = fname[len(model_prefix):-len(".traineddata")] > if suffix == "": > models.append((0, "jpn")) > elif suffix.isdigit(): > models.append((int(suffix), f"{model_prefix}{suffix}")) > models.sort() > return models > > def get_latest_and_next_model(tessdata_path, model_prefix="jpn"): > models = get_model_names(tessdata_path, model_prefix) > if not models: > return model_prefix, f"{model_prefix}2" > latest = models[-1][1] > next_num = models[-1][0] + 1 > next_model = f"{model_prefix}{next_num}" if next_num > 0 else f"{ > model_prefix}2" > return latest, next_model > > > > > I also coded the make training procedure into VSCode, with a python script > that calls for it. This code snippet below is meant to run the tesseract > training. 
> (tesseract_training.py) > import subprocess > import os > > def run_tesseract_training(training_dir, model_name, start_model, > max_iterations=4000): #previously start model is jpn > """ > Run the full Tesseract tesstrain workflow including unicharset and > langdata. > """ > tessdata_path = r"C:\Users\Chan Jian Sen\Documents\T > esseractFineTuningJpn5\tessdata" > # Important: replace backslashes with forward slashes > tessdata_path = tessdata_path.replace("\\", "/") > env = os.environ.copy() > env["TESSDATA_PREFIX"] = tessdata_path > command = [ > "make", > "unicharset", "lists", "proto-model", "tesseract-langdata", > "training", > f"MODEL_NAME={model_name}", > f"START_MODEL={start_model}", > f"TESSDATA={tessdata_path}", # Adjust path depending on where > your .traineddata are > f"GROUND_TRUTH_DIR={training_dir}", > f"MAX_ITERATIONS={max_iterations}", > "LEARNING_RATE=0.001" > ] > > print("🚀 Running full Tesseract training pipeline...") > try: > subprocess.run(command, cwd=r"C:\Users\Chan Jian Sen\Documents\T > esseractFineTuningJpn5\tesstrain", shell=True, check=True, env=env) > print(f"✅ Training complete: {model_name}.traineddata generated.") > except subprocess.CalledProcessError as e: > print(f"❌ Training failed: {e}") > > > However this is my terminal output when I run the watcher file. 
> PS C:\Users\Chan Jian Sen\Documents\ocr-japanese> c:; cd 'c:\Users\Chan > Jian Sen\Documents\ocr-japanese'; & 'c:\Users\Chan Jian > Sen\AppData\Local\Programs\Python\Python39\python.exe' 'c:\Users\Chan Jian > Sen\.vscode\extensions\ms-python.debugpy-2025.6.0-win32-x64\bundled\libs\debugpy\launcher' > > '56444' '--' 'C:\Users\Chan Jian > Sen\Documents\ocr-japanese\watcher_training.py' > gpy-2025.6.0-win32-x64\x5cbundled\x5clibs\x5cdebugpy\x5clauncher' '56444' > '--' 'C:\x5cUsers\x5cChan Jian > Sen\x5cDocuments\x5cocr-japanese\x5cwatcher_training.py' > ;0a5d0c8e-f6f4-44db-b1ea-a49791670afe👀 Watching training data folder: > C:\Users\Chan Jian Sen\Documents\ocr-japanese\INPUT_TRAINING_DATA > > ⏳ Waiting for matching .png and .gt.txt pairs... > 🔁 Using jpn as base, training new model: jpn1 > 🚀 Running full Tesseract training pipeline... > You are using make version: 4.4.1 > Makefile:438: *** mixed implicit and normal rules: deprecated syntax > combine_tessdata -u C:/Users/Chan Jian > Sen/Documents/TesseractFineTuningJpn5/tessdata/jpn.traineddata data/jpn/jpn1 > Failed to read C:/Users/Chan > make: *** [Makefile:207: data/jpn/jpn1.lstm-unicharset] Error 1 > > > I would greatly appreciate any help. Sorry if it's quite a lot to > digest. > -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion visit https://groups.google.com/d/msgid/tesseract-ocr/eaf6bc12-bcca-4ee4-b3d6-0ce684537fe6n%40googlegroups.com.