Hi, Several have commented on how hard it is to test an OSIS xml file against v11ns, especially since it goes off into an infinite loop (I’ve posted a patch that fixes that). But it is still a process of trial and error to find an appropriate v11n. So, I’ve been iterating with ChatGPT to create a Python script to find a best fit v11n. Since I don’t know Python, I can’t vouch for the script beyond it worked for a simple test case that had an extra chapter for Genesis and had some extra verses at the end of a chapter in that book. I offer it as a starting place. See the attached file. It has a --debug flag. The first argument is expected to be the OSIS xml file. The second argument is optional and gives the location of the include directory of svn/sword/trunk/include with all the canon*.h files. If you don’t supply the argument, it uses the web to load the canon*.h files from https://www.crosswire.org/svn/sword/trunk/include. It will score the fitness of each of the v11ns. It gives the score as a %, but I don’t know what that means. I told it that it should prioritize book matches, then chapter matches and finally verse matches. I don’t know how well it did that scoring. I didn’t test for that. The output is alphabetized. If more than one v11n has the same high score, they are listed. In His Service, DM |
import argparse
import re
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict
from pathlib import Path

import requests
DEBUG = False
REMOTE_URL = "https://www.crosswire.org/svn/sword/trunk/include/"

# Seconds to wait on any single HTTP request before giving up.
HTTP_TIMEOUT = 30

# Substrings of canon header file names that are skipped when scanning.
EXCLUDED_CANON_MARKERS = ("canon_null", "canon_abbrevs")

# Matches C block comments (/* ... */) and C++ line comments (// ...).
_C_COMMENT_RE = re.compile(r'/\*.*?\*/|//[^\n]*', re.DOTALL)


def _debug(message):
    """Print *message* only while the module-level DEBUG flag is set."""
    if DEBUG:
        print(message)


def _is_url(location):
    """Return True when *location* is an http:// or https:// URL."""
    return location.startswith(("http://", "https://"))


def _is_excluded(file_name):
    """Return True when *file_name* names a header that must be skipped."""
    return any(marker in file_name for marker in EXCLUDED_CANON_MARKERS)


def _strip_c_comments(text):
    """Return *text* with C/C++ comments removed.

    Each comment is replaced by a single space so that tokens on either
    side of a block comment are not glued together.
    """
    return _C_COMMENT_RE.sub(' ', text)


def parse_osis(file_path):
    """Collect the verses named by ``osisID`` attributes in an OSIS file.

    Every element whose tag ends in ``verse`` and that carries an
    ``osisID`` attribute is inspected.  The attribute is split on
    whitespace and each ``Book.Chapter.Verse`` reference in it is
    recorded; references that do not have exactly three dot-separated
    parts, or whose chapter/verse part contains no digits, are ignored.

    Args:
        file_path: Anything ``xml.etree.ElementTree.iterparse`` accepts
            as a source (a file name or a binary file object).

    Returns:
        A nested ``defaultdict`` mapping book -> chapter number -> set of
        verse numbers.
    """
    osis_structure = defaultdict(lambda: defaultdict(set))
    # 'end' events are used so that clear() releases a finished element;
    # attributes are still available at that point.
    for _event, elem in ET.iterparse(file_path, events=('end',)):
        if elem.tag.endswith('verse'):
            osisid = elem.attrib.get('osisID')
            # One osisID may list several references separated by spaces.
            for reference in (osisid.split() if osisid else ()):
                parts = reference.split('.')
                if len(parts) != 3:
                    continue
                book, chapter, verse = parts
                try:
                    # Keep only the digits of each part before converting.
                    chapter_num = int(re.sub(r'\D', '', chapter))
                    verse_num = int(re.sub(r'\D', '', verse))
                except ValueError:
                    continue
                osis_structure[book][chapter_num].add(verse_num)
        elem.clear()

    if DEBUG:
        print("Parsed OSIS structure:")
        for book in osis_structure:
            print(f"  {book}: {sorted(osis_structure[book].keys())}")
    return osis_structure


def load_canon_file(canon_path):
    """Return the text of a canon header from a URL or a local file.

    Args:
        canon_path: An http(s) URL or a local file path, as a string.

    Returns:
        The content of the header as a string.

    Raises:
        ValueError: If the download answers with a status other than 200.
    """
    if _is_url(canon_path):
        _debug(f"Downloading: {canon_path}")
        response = requests.get(canon_path, timeout=HTTP_TIMEOUT)
        if response.status_code != 200:
            raise ValueError(
                f"Failed to download {canon_path} (status {response.status_code})"
            )
        _debug(f"Downloaded {len(response.text)} bytes from {canon_path}")
        return response.text

    _debug(f"Reading local file: {canon_path}")
    with open(canon_path, encoding='utf-8') as f:
        content = f.read()
    _debug(f"Loaded {len(content)} bytes from local file")
    return content


def parse_books_array(content, array_name):
    """Extract the entries of a ``struct sbook <array_name>[]`` array.

    Comments are removed first so that commented-out entries are not
    picked up.  Entries with an empty string field (such as an
    all-empty terminator row) do not match and are left out.

    Args:
        content: Text of a canon header.
        array_name: Name of the array to look for (matched
            case-insensitively).

    Returns:
        A list of ``(full_name, abbreviation, osis_name, chapter_count)``
        tuples, or an empty list when the array is not found.
    """
    pattern = re.compile(
        rf'struct\s+sbook\s+{re.escape(array_name)}\s*\[\s*\]\s*=\s*\{{(.*?)\}};',
        re.DOTALL | re.IGNORECASE
    )
    match = pattern.search(_strip_c_comments(content))
    if not match:
        _debug(f"No {array_name} array found")
        return []

    entries = re.findall(
        r'\{\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*(\d+)\s*\}',
        match.group(1)
    )
    _debug(f"Parsed {len(entries)} entries from {array_name}")
    return [(full, abbrev, osis, int(ch)) for full, abbrev, osis, ch in entries]


def parse_vm_array(content, v11n_name):
    """Extract the integers of the verse-maximum array for a versification.

    For ``"KJV"`` the arrays ``vm`` and then ``vm_kjv`` are tried; for any
    other name the array ``vm_<v11n_name>`` is used.  Comments are removed
    before the numbers are read, because a number inside a comment would
    otherwise be taken for an array element and shift every later value.

    Args:
        content: Text of a canon header.
        v11n_name: Versification name derived from the header file name.

    Returns:
        The list of integers in the array, or an empty list when no
        matching array is found.
    """
    flags = re.DOTALL | re.IGNORECASE
    if v11n_name == "KJV":
        array_names = ["vm", "vm_kjv"]
    else:
        array_names = ["vm_" + re.escape(v11n_name)]

    code = _strip_c_comments(content)
    for array_name in array_names:
        pattern = re.compile(
            r'int\s+' + array_name + r'\s*\[\s*\]\s*=\s*\{(.*?)\};', flags
        )
        vm_match = pattern.search(code)
        if vm_match:
            vm_entries = [int(number) for number in re.findall(r'\d+', vm_match.group(1))]
            _debug(f"Parsed {len(vm_entries)} verse max entries for {v11n_name}")
            return vm_entries

    _debug(f"No vm array found for {v11n_name}")
    return []


def parse_canon_file(file_path, base_ot=None, base_nt=None):
    """Build the book/chapter/max-verse table described by one canon header.

    The versification name is the file stem with ``canon`` and leading
    underscores removed; an empty result (``canon.h``) is called ``KJV``.
    The verse-maximum array is consumed in order, one value per chapter of
    each book, OT books first and NT books second.

    Args:
        file_path: URL or local path of the header, as a string.
        base_ot: Book list used when the header defines no OT book array.
        base_nt: Book list used when the header defines no NT book array.

    Returns:
        ``(None, {})`` for excluded headers, ``(name, {})`` when no verse
        array is found, otherwise ``(name, structure)`` where *structure*
        maps OSIS book name -> chapter number -> maximum verse number.
    """
    fname = file_path.split("/")[-1] if _is_url(file_path) else Path(file_path).name
    if _is_excluded(fname):
        _debug(f"Skipping {fname} (excluded file)")
        return None, {}

    v11n_name = Path(fname).stem.replace("canon", "").lstrip('_') or "KJV"
    content = load_canon_file(file_path)
    _debug(f"Processing {fname} as versification '{v11n_name}'")

    is_kjv = v11n_name == "KJV"
    ot_books = parse_books_array(content, "otbooks" if is_kjv else f"otbooks_{v11n_name}")
    nt_books = parse_books_array(content, "ntbooks" if is_kjv else f"ntbooks_{v11n_name}")

    if not ot_books and base_ot is not None:
        _debug(f"Falling back to base OT books for {v11n_name}")
        ot_books = base_ot
    if not nt_books and base_nt is not None:
        _debug(f"Falling back to base NT books for {v11n_name}")
        nt_books = base_nt

    vm_entries = parse_vm_array(content, v11n_name)
    if not vm_entries:
        _debug(f"No vm array found in {v11n_name}")
        return v11n_name, {}

    structure = defaultdict(dict)
    remaining = iter(vm_entries)
    for _full, _abbrev, osis, chapters in ot_books + nt_books:
        # zip stops as soon as the verse array runs out of values.
        for chapter, max_verse in zip(range(1, chapters + 1), remaining):
            structure[osis][chapter] = max_verse

    _debug(f"Parsed structure for {v11n_name} with {len(structure)} books")
    return v11n_name, structure


def score_v11n(osis_structure, v11n_structure):
    """Score how well a versification covers the verses found in an OSIS file.

    Each chapter present in the OSIS data is worth 2 points and each verse
    1 point.  A chapter earns its 2 points when the versification has that
    chapter, and a verse earns its point when its number does not exceed
    the chapter's maximum.  Chapters and verses of a book that is missing
    from the versification earn nothing but still count towards the
    maximum, the same way a missing chapter does.

    Args:
        osis_structure: book -> chapter -> iterable of verse numbers.
        v11n_structure: book -> chapter -> maximum verse number.

    Returns:
        ``(fit_percent, mismatch_details)``: the score as a percentage of
        the maximum (0 when there is nothing to score) and a list of
        human-readable mismatch descriptions.
    """
    score = 0
    max_score = 0
    mismatch_details = []

    for book, chapters in osis_structure.items():
        max_score += len(chapters) * 2
        max_score += sum(len(verses) for verses in chapters.values())

        if book not in v11n_structure:
            mismatch_details.append(f'Missing book: {book}')
            continue

        v11n_chapters = v11n_structure[book]
        for ch_num, verses in chapters.items():
            if ch_num not in v11n_chapters:
                mismatch_details.append(f'{book} missing chapter {ch_num}')
                continue
            score += 2
            max_verse = v11n_chapters[ch_num]
            matching_verses = sum(1 for v in verses if v <= max_verse)
            score += matching_verses
            missed = len(verses) - matching_verses
            if missed > 0:
                mismatch_details.append(f'{book} {ch_num}: {missed} verse(s) too many')

    fit_percent = (score / max_score * 100) if max_score else 0
    return fit_percent, mismatch_details


def _list_canon_files(canon_dir):
    """Return ``(canon_files, base_canon_path)`` for a directory or URL.

    Both values are strings.  ``canon.h`` is always part of the list, and
    for a remote directory the excluded headers are filtered out here.

    Raises:
        ValueError: If a remote directory listing cannot be fetched.
    """
    if _is_url(canon_dir):
        index_url = canon_dir.rstrip("/") + "/"
        _debug(f"Fetching directory listing from {index_url}")
        response = requests.get(index_url, timeout=HTTP_TIMEOUT)
        if response.status_code != 200:
            raise ValueError(
                f"Failed to download {index_url} (status {response.status_code})"
            )
        names = re.findall(r'href=[\'"]?(canon[^\'"]+\.h)[\'"]?', response.text)
        base_canon_path = index_url + "canon.h"
        # The pattern above needs at least one character between "canon"
        # and ".h", so canon.h itself is added explicitly.
        candidates = [index_url + name for name in names if not _is_excluded(name)]
        candidates.append(base_canon_path)
        # dict.fromkeys drops duplicates while keeping the listing order.
        canon_files = list(dict.fromkeys(candidates))
        if DEBUG:
            print(f"Found {len(canon_files)} canon files in remote directory:")
            for canon_file in canon_files:
                print(f"  - {canon_file}")
        return canon_files, base_canon_path

    canon_root = Path(canon_dir)
    kjv_path = canon_root / "canon.h"
    local_files = sorted(canon_root.glob("canon*.h"))
    if kjv_path not in local_files:
        local_files.append(kjv_path)
    return [str(path) for path in local_files], str(kjv_path)


def find_best_fit(osis_file, canon_dir):
    """Score every versification against an OSIS file and print the results.

    The headers are taken from *canon_dir*, which may be a local directory
    or an http(s) URL of a directory listing.  ``canon.h`` supplies the
    book lists used as a fallback for headers that define none of their
    own.  A header that fails to parse is reported and skipped.  Results
    are printed ordered by descending score and then by name, followed by
    the name(s) sharing the highest score.

    Args:
        osis_file: Path of the OSIS XML file.
        canon_dir: Directory path or URL holding the ``canon*.h`` headers.
    """
    osis_structure = parse_osis(osis_file)
    canon_files, base_canon_path = _list_canon_files(canon_dir)

    _debug(f"Loading base canon file from {base_canon_path}")
    base_content = load_canon_file(base_canon_path)
    base_ot = parse_books_array(base_content, "otbooks")
    base_nt = parse_books_array(base_content, "ntbooks")

    results = []
    for canon_file in canon_files:
        # Best effort: one unreadable header must not abort the whole run.
        try:
            name, structure = parse_canon_file(str(canon_file), base_ot, base_nt)
            if name is None:
                continue
            fit, mismatches = score_v11n(osis_structure, structure)
            results.append((fit, name, mismatches))
        except Exception as e:
            print(f"Failed to parse {canon_file}: {e}")

    results.sort(key=lambda r: (-r[0], r[1].lower()))

    for fit, name, mismatches in results:
        print(f"{name}: {fit:.2f}% fit")
        for m in mismatches:
            print(f"  - {m}")

    if results:
        best_score = results[0][0]
        print(f"\nBest match(es) ({best_score:.2f}%):")
        for fit, name, _ in results:
            if fit == best_score:
                print(f"  - {name}")


def main():
    """Parse the command line and run the best-fit search."""
    global DEBUG
    parser = argparse.ArgumentParser(description="Find best-fit versification for an OSIS file.")
    parser.add_argument("osis_file", help="Path to OSIS XML file")
    parser.add_argument("canon_path", nargs="?", default=REMOTE_URL,
                        help="Path or URL to canon headers (default: remote CrossWire URL)")
    parser.add_argument("--debug", action="store_true", help="Enable debug output")
    args = parser.parse_args()

    DEBUG = args.debug
    if DEBUG:
        print("Debug mode enabled\n")

    find_best_fit(args.osis_file, args.canon_path)


if __name__ == "__main__":
    main()
_______________________________________________ sword-devel mailing list: sword-devel@crosswire.org http://crosswire.org/mailman/listinfo/sword-devel Instructions to unsubscribe/change your settings at above page