#!/usr/bin/env python3
"""Analyze all JSON files in packs-src/ for text quality issues.

For every JSON document under ``PACKS`` the script inspects a fixed set of
text fields and records an issue when a field looks truncated, contains raw
newlines, has unbalanced HTML tags, carries HTML in a plain-text field, starts
with text outside any tag, or holds a suspiciously long technique. For
truncated fields it also tries to locate the end of the text inside
``regles.txt`` so the report can show how the text continues.

Two reports are written into ``BASE``: ``compendium-issues.json`` (the raw
issue records) and ``compendium-issues.txt`` (a human-readable summary).
"""
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
from html.parser import HTMLParser

BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
PACKS = BASE / "packs-src"
REGLES = BASE / "regles.txt"

# Reference text extracted from the rulebook PDF. Populated by
# load_pdf_text() so that importing this module performs no file I/O.
pdf_lines = []
pdf_text = ""

# Issue records accumulated by main().
issues = []

# ---------- helpers ----------

_TAG_RE = re.compile(r'<[^>]+>')
_OPEN_TAG_RE = re.compile(r'<([a-zA-Z][a-zA-Z0-9]*)[^>]*>')
_CLOSE_TAG_RE = re.compile(r'</([a-zA-Z][a-zA-Z0-9]*)\s*>')
_WHITESPACE_RE = re.compile(r'\s+')

# Tags whose opening/closing counts are compared by check_unclosed_tags().
_BALANCED_TAGS = ('ul', 'ol', 'li', 'p', 'div', 'strong', 'em', 'b', 'i')

# Characters accepted as a proper end of text by the loose truncation check.
_SENTENCE_END = '.!?»)'

# Tail lengths tried, longest first, when looking a text up in the PDF.
_PDF_KEY_LENGTHS = (30, 20, 15)


def load_pdf_text(path=REGLES):
    """Read the reference text once and publish it in the module globals.

    Sets ``pdf_text`` (the whole file) and ``pdf_lines`` (its lines) and
    returns the text.
    """
    global pdf_text, pdf_lines
    pdf_text = path.read_text(encoding="utf-8")
    pdf_lines = pdf_text.splitlines()
    return pdf_text


def strip_html(html):
    """Remove HTML tags and return plain text (``None`` yields ``''``)."""
    return _TAG_RE.sub('', html or '')


def check_unclosed_tags(html):
    """Return a list of messages for tags whose open/close counts differ.

    Only the tags in ``_BALANCED_TAGS`` are considered and only counts are
    compared; nesting order is not validated. Tag names are compared
    case-insensitively.
    """
    html = html or ''
    open_tags = [name.lower() for name in _OPEN_TAG_RE.findall(html)]
    close_tags = [name.lower() for name in _CLOSE_TAG_RE.findall(html)]
    issues_found = []
    for tag in _BALANCED_TAGS:
        opens = open_tags.count(tag)
        closes = close_tags.count(tag)
        if opens != closes:
            issues_found.append(f"<{tag}>: {opens} open, {closes} close")
    return issues_found


def has_bad_newlines(html):
    """Return True when the string contains a raw newline character.

    In the JSON source ``\\n`` decodes to a real newline, which can render as
    an unwanted break inside an HTML string.
    """
    return '\n' in html


def looks_truncated(text):
    """Loose heuristic: text does not end with sentence-ending punctuation."""
    if not text:
        return False
    plain = strip_html(text).strip()
    if not plain:
        return False
    return plain[-1] not in _SENTENCE_END


def looks_truncated_strict(text):
    """Strict heuristic: text ends on a letter or on one of ``,;:-(``."""
    if not text:
        return False
    plain = strip_html(text).strip()
    if not plain:
        return False
    last_char = plain[-1]
    return last_char.isalpha() or last_char in ',;:-('


def get_field(data, path):
    """Get a nested value by dot-path; return None if any step is missing."""
    cur = data
    for part in path.split('.'):
        if not isinstance(cur, dict):
            return None
        cur = cur.get(part)
        if cur is None:
            return None
    return cur


def search_pdf(keyword, context=300, text=None):
    """Find the tail of ``keyword`` in the reference text and return context.

    The keyword is stripped of HTML; its last 30, then 20, then 15 characters
    (whitespace-normalized) are searched for in turn. On a hit, the returned
    excerpt starts up to 50 characters before the match and extends
    ``context`` characters past it, with newlines replaced by spaces.

    ``text`` overrides the module-level ``pdf_text`` when given. Returns None
    when the plain keyword is shorter than 10 characters or nothing matches.
    """
    haystack = pdf_text if text is None else text
    kw = strip_html(keyword).strip()
    if len(kw) < 10:
        return None
    for size in _PDF_KEY_LENGTHS:
        key = _WHITESPACE_RE.sub(' ', kw[-size:].strip())
        idx = haystack.find(key)
        if idx != -1:
            start = max(0, idx - 50)
            end = min(len(haystack), idx + len(key) + context)
            return haystack[start:end].replace('\n', ' ')
    return None


def get_all_html_fields(data, prefix=""):
    """Recursively yield (field_path, value) for HTML-looking string fields.

    A dict value is yielded when it is a string that contains ``<`` or is
    longer than 50 characters. Nested dicts and lists are descended into;
    strings that are direct list elements are not yielded.
    """
    if isinstance(data, dict):
        for k, v in data.items():
            path = f"{prefix}.{k}" if prefix else k
            if isinstance(v, str) and ('<' in v or len(v) > 50):
                yield path, v
            elif isinstance(v, (dict, list)):
                yield from get_all_html_fields(v, path)
    elif isinstance(data, list):
        for i, v in enumerate(data):
            yield from get_all_html_fields(v, f"{prefix}[{i}]")


# ---------- fields to check ----------

IMPORTANT_FIELDS = [
    "system.description",
    "system.effects",
    "system.examples",
    "system.components",
    "system.notes",
    "system.style",
    "system.techniques.technique1.technique",
    "system.techniques.technique2.technique",
    "system.techniques.technique3.technique",
]

# Fields expected to hold plain text (no markup).
_PLAIN_FIELDS = ("system.style", "system.reference", "system.speciality")

# Fields expected to be fully wrapped in HTML tags.
_HTML_BODY_FIELDS = (
    "system.description",
    "system.effects",
    "system.examples",
    "system.components",
    "system.notes",
)

_TECHNIQUE_KEYS = ("technique1", "technique2", "technique3")


# ---------- per-document scan ----------

def scan_document(data, rel):
    """Return the issue records found in one parsed JSON document.

    ``data`` is the decoded JSON value and ``rel`` the file path reported in
    each record. A document that is not a JSON object yields no issues.
    """
    found = []
    item_name = data.get("name", "(unnamed)") if isinstance(data, dict) else "(unnamed)"
    strict_truncated = set()

    for field in IMPORTANT_FIELDS:
        val = get_field(data, field)
        if not val or not isinstance(val, str):
            continue
        plain = strip_html(val).strip()

        # 1. Check truncation (strict)
        if looks_truncated_strict(val):
            strict_truncated.add(field)
            found.append({
                "file": rel,
                "field": field,
                "issue": "truncated",
                "item_name": item_name,
                "current_end": f"...{plain[-100:]}",
                "current_full_preview": f"{plain[:200]}",
                "correct_continuation": search_pdf(val),
            })

        # 2. Check bad newlines in HTML strings
        if has_bad_newlines(val):
            found.append({
                "file": rel,
                "field": field,
                "issue": "unwanted_newlines",
                "item_name": item_name,
                "current_text": val[:300],
                "correct_continuation": None,
            })

        # 3. Check malformed HTML
        html_errors = check_unclosed_tags(val)
        if html_errors:
            found.append({
                "file": rel,
                "field": field,
                "issue": "malformed_html",
                "item_name": item_name,
                "html_errors": html_errors,
                "current_text": val[:300],
                "correct_continuation": None,
            })

    # 4. Check system.style (plain text field, can also be truncated).
    # system.style is also in IMPORTANT_FIELDS; skip it here when the strict
    # check already reported it, so the same truncation is not listed twice.
    style_val = get_field(data, "system.style")
    if (
        style_val
        and isinstance(style_val, str)
        and "system.style" not in strict_truncated
    ):
        plain_style = style_val.strip()
        if plain_style and plain_style[-1] not in _SENTENCE_END:
            found.append({
                "file": rel,
                "field": "system.style",
                "issue": "truncated",
                "item_name": item_name,
                "current_end": f"...{plain_style[-100:]}",
                "current_full_preview": f"{plain_style[:200]}",
                "correct_continuation": search_pdf(plain_style),
            })

    # 5. Bleeding content: look for HTML tags in non-HTML fields
    for field in _PLAIN_FIELDS:
        val = get_field(data, field)
        if val and isinstance(val, str) and '<' in val:
            found.append({
                "file": rel,
                "field": field,
                "issue": "html_in_plain_field",
                "item_name": item_name,
                "current_text": val[:300],
                "correct_continuation": None,
            })

    # 6. Check for text outside HTML tags in description-like fields
    # (bleeding), e.g. "<p>foo</p>some leaked text<p>bar</p>".
    # Only text before the first tag is detected.
    for field in _HTML_BODY_FIELDS:
        val = get_field(data, field)
        if not val or not isinstance(val, str):
            continue
        stripped = val.strip()
        if stripped and not stripped.startswith('<'):
            found.append({
                "file": rel,
                "field": field,
                "issue": "text_outside_html_tags",
                "item_name": item_name,
                "current_text": val[:300],
                "correct_continuation": None,
            })

    # 7. Check technique fields for bleeding (multiple paragraphs that
    # shouldn't be there). Techniques with a few <p> blocks may be fine, so
    # only those with more than three are flagged.
    for tkey in _TECHNIQUE_KEYS:
        tech = get_field(data, f"system.techniques.{tkey}")
        if not isinstance(tech, dict):
            continue
        tech_text = tech.get("technique", "")
        if not tech_text or not isinstance(tech_text, str):
            continue
        p_count = tech_text.count('<p>')
        if p_count > 3:
            found.append({
                "file": rel,
                "field": f"system.techniques.{tkey}.technique",
                "issue": "possible_bleeding_content",
                "item_name": item_name,
                "paragraph_count": p_count,
                "current_text": tech_text[:400],
                "correct_continuation": None,
            })

    return found


# ---------- output ----------

def write_reports(all_issues, files_scanned):
    """Write the JSON and text reports into BASE and return both paths."""
    out_json = BASE / "compendium-issues.json"
    out_txt = BASE / "compendium-issues.txt"

    with open(out_json, 'w', encoding='utf-8') as f:
        json.dump(all_issues, f, ensure_ascii=False, indent=2)

    # Group by issue type for the summary and by file for the details.
    by_type = defaultdict(list)
    by_file = defaultdict(list)
    for issue in all_issues:
        by_type[issue['issue']].append(issue)
        by_file[issue['file']].append(issue)

    with open(out_txt, 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("COMPENDIUM TEXT QUALITY REPORT\n")
        f.write("Les Chroniques de l'Étrange — FoundryVTT\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Total files scanned: {files_scanned}\n")
        f.write(f"Total issues found: {len(all_issues)}\n\n")
        f.write("SUMMARY BY ISSUE TYPE:\n")
        for itype, ilist in sorted(by_type.items()):
            f.write(f" {itype}: {len(ilist)}\n")
        f.write("\n")
        f.write("=" * 80 + "\n")
        f.write("DETAILED ISSUES BY FILE\n")
        f.write("=" * 80 + "\n\n")
        for fpath in sorted(by_file):
            f.write(f"\n--- {fpath} ---\n")
            for issue in by_file[fpath]:
                f.write(f" FIELD: {issue['field']}\n")
                f.write(f" ISSUE: {issue['issue']}\n")
                if issue.get('item_name'):
                    f.write(f" ITEM: {issue['item_name']}\n")
                if issue.get('current_end'):
                    f.write(f" END: {issue['current_end']}\n")
                if issue.get('current_full_preview'):
                    f.write(f" TEXT: {issue['current_full_preview'][:200]}\n")
                if issue.get('current_text'):
                    f.write(f" TEXT: {issue['current_text'][:200]}\n")
                if issue.get('html_errors'):
                    f.write(f" HTML ERRORS: {issue['html_errors']}\n")
                if issue.get('correct_continuation'):
                    f.write(f" PDF: {issue['correct_continuation'][:300]}\n")
                f.write("\n")

    return out_json, out_txt


# ---------- main scan ----------

def main():
    """Scan every JSON file under PACKS and write both reports."""
    load_pdf_text()

    json_files = sorted(PACKS.rglob("*.json"))
    print(f"Scanning {len(json_files)} JSON files...", flush=True)

    for jf in json_files:
        rel = str(jf.relative_to(PACKS))
        try:
            data = json.loads(jf.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # A file that is not valid UTF-8 cannot be parsed either; report
            # it like a JSON syntax error instead of aborting the whole scan.
            issues.append({
                "file": rel,
                "field": "(file)",
                "issue": "json_parse_error",
                "current_text": str(e),
                "correct_continuation": None,
            })
            continue
        issues.extend(scan_document(data, rel))

    print(f"Found {len(issues)} potential issues.", flush=True)

    out_json, out_txt = write_reports(issues, len(json_files))
    print(f"Reports written to:\n {out_json}\n {out_txt}", flush=True)


if __name__ == "__main__":
    main()