Correction compendiums

2026-04-27 17:49:00 +02:00
parent d12a7debdf
commit 1e252ff6f2
136 changed files with 38971 additions and 345 deletions
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+"""Analyze all JSON files in packs-src/ for text quality issues."""
+
+import json
+import os
+import re
+import sys
+from pathlib import Path
+from html.parser import HTMLParser
+
+# NOTE(review): the project root is a hard-coded absolute path; the script
+# only runs unmodified on a machine that has this exact checkout location.
+BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
+PACKS = BASE / "packs-src"
+REGLES = BASE / "regles.txt"
+
+# Load the rulebook text once and derive the per-line view from it, instead
+# of reading and decoding the same file twice.
+pdf_text = REGLES.read_text(encoding="utf-8")
+pdf_lines = pdf_text.splitlines()
+
+# Accumulator for every issue record produced by the scan below.
+issues = []
+
+# ---------- helpers ----------
+
+def strip_html(html):
+    """Return *html* with every ``<...>`` tag deleted.
+
+    A falsy argument (``None`` or an empty string) yields an empty string.
+    """
+    source = html if html else ''
+    tag_pattern = re.compile(r'<[^>]+>')
+    return tag_pattern.sub('', source)
+
+def check_unclosed_tags(html):
+    """Report tracked tags whose opening and closing counts differ.
+
+    Only counts are compared; nesting order is not validated. Tag names are
+    matched case-insensitively, a closing tag may carry whitespace before
+    its ``>``, and self-closed tags (``<p/>``) are not counted as opens.
+
+    Args:
+        html: The markup to inspect; a falsy value is treated as ''.
+
+    Returns:
+        A list of strings of the form ``"<tag>: N open, M close"``, one per
+        tracked tag whose counts disagree, in the fixed order of the tracked
+        tag list. The list is empty when every tracked tag is balanced.
+    """
+    html = html or ''
+    # Lower-case the names so that <P>...</p> is seen as one balanced pair.
+    open_tags = [
+        name.lower()
+        for name, attrs in re.findall(r'<([a-zA-Z][a-zA-Z0-9]*)([^>]*)>', html)
+        # A trailing "/" marks a self-closed tag, which needs no closing tag.
+        if not attrs.rstrip().endswith('/')
+    ]
+    close_tags = [
+        name.lower()
+        for name in re.findall(r'</([a-zA-Z][a-zA-Z0-9]*)\s*>', html)
+    ]
+    issues_found = []
+    # basic: count opens vs closes for the tags that must come in pairs
+    for tag in ['ul', 'ol', 'li', 'p', 'div', 'strong', 'em', 'b', 'i']:
+        opens = open_tags.count(tag)
+        closes = close_tags.count(tag)
+        if opens != closes:
+            issues_found.append(f"<{tag}>: {opens} open, {closes} close")
+    return issues_found
+
+def has_bad_newlines(html):
+    """Tell whether *html* contains at least one newline character.
+
+    A JSON escape for a newline decodes to a real line break, and a raw line
+    break inside an HTML string can render as an unwanted break.
+    """
+    newline = '\n'
+    found = newline in html
+    return found
+
+def looks_truncated(text):
+    """Loose truncation heuristic.
+
+    Returns True when the tag-stripped, whitespace-trimmed text is non-empty
+    and its last character is not one of ``. ! ? » )``; otherwise False.
+    """
+    plain = strip_html(text).strip() if text else ''
+    if not plain:
+        return False
+    # Anything other than sentence-ending punctuation counts as cut off.
+    return plain[-1] not in '.!?»)'
+
+def looks_truncated_strict(text):
+    """Strict truncation heuristic.
+
+    Returns True when the tag-stripped, whitespace-trimmed text is non-empty
+    and ends on a letter or on one of ``, ; : - (``; otherwise False.
+    """
+    plain = strip_html(text).strip() if text else ''
+    if not plain:
+        return False
+    tail = plain[-1]
+    # A trailing letter or an "open" punctuation mark suggests the text
+    # stops in the middle of a word or sentence.
+    return tail.isalpha() or tail in ',;:-('
+
+def get_field(data, path):
+    """Look up a nested value using a dot-separated key path.
+
+    Walks *data* one key at a time. Returns None as soon as the current
+    node is not a dict, a key is missing, or a stored value is None;
+    otherwise returns the value found at the end of the path.
+    """
+    node = data
+    for key in path.split('.'):
+        if not isinstance(node, dict):
+            return None
+        node = node.get(key)
+        if node is None:
+            return None
+    return node
+
+# Whitespace-collapsed copy of pdf_text, built on first use by search_pdf().
+_pdf_text_normalized = None
+
+def search_pdf(keyword, context=300):
+    """Search the rulebook text for the tail of *keyword* and return context.
+
+    The keyword is stripped of HTML tags and its whitespace runs are
+    collapsed to single spaces. Its last 30, then 20, then 15 characters are
+    searched for in a whitespace-collapsed copy of ``pdf_text``, so a phrase
+    that is wrapped across lines in the rulebook text can still be found.
+
+    Args:
+        keyword: Text (possibly containing HTML tags) whose ending is looked up.
+        context: Number of characters to return after the matched key.
+
+    Returns:
+        A slice of the collapsed rulebook text starting up to 50 characters
+        before the match and ending *context* characters after it, or None
+        when the cleaned keyword is shorter than 10 characters or no key
+        matches.
+    """
+    global _pdf_text_normalized
+
+    # clean keyword for searching
+    kw = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', keyword)).strip()
+    if len(kw) < 10:
+        return None
+
+    # Collapse the haystack the same way as the key; done once and cached
+    # because the rulebook text does not change during a run.
+    if _pdf_text_normalized is None:
+        _pdf_text_normalized = re.sub(r'\s+', ' ', pdf_text)
+    haystack = _pdf_text_normalized
+
+    # Try progressively shorter tails of the keyword.
+    for tail_len in (30, 20, 15):
+        search_key = kw[-tail_len:].strip()
+        idx = haystack.find(search_key)
+        if idx != -1:
+            start = max(0, idx - 50)
+            end = min(len(haystack), idx + len(search_key) + context)
+            return haystack[start:end]
+
+    return None
+
+def get_all_html_fields(data, prefix=""):
+    """Recursively yield ``(field_path, value)`` for HTML-like string fields.
+
+    A string stored under a dict key is yielded when it contains ``<`` or is
+    longer than 50 characters. Nested dicts and lists are descended into;
+    dict keys are joined with ``.`` and list positions appear as ``[i]``.
+    Strings that sit directly inside a list are not yielded.
+    """
+    if isinstance(data, list):
+        for index, item in enumerate(data):
+            yield from get_all_html_fields(item, f"{prefix}[{index}]")
+        return
+    if not isinstance(data, dict):
+        return
+    for key, value in data.items():
+        path = f"{prefix}.{key}" if prefix else key
+        if isinstance(value, str):
+            if '<' in value or len(value) > 50:
+                yield path, value
+        elif isinstance(value, (dict, list)):
+            yield from get_all_html_fields(value, path)
+
+# ---------- fields to check ----------
+
+# Dot-paths of the text fields inspected by the main scan: the general
+# descriptive fields first, then the text of each of the three techniques.
+IMPORTANT_FIELDS = [
+    "system.description",
+    "system.effects",
+    "system.examples",
+    "system.components",
+    "system.notes",
+    "system.style",
+    *(f"system.techniques.technique{n}.technique" for n in (1, 2, 3)),
+]
+
+# ---------- main scan ----------
+
+def _scan_item(rel, data):
+    """Run every text-quality check on one parsed pack document.
+
+    Args:
+        rel: Path of the source file relative to the packs directory; copied
+            into each issue record.
+        data: The decoded JSON value of that file.
+
+    Returns:
+        A list of issue dicts, in the order the checks are run.
+    """
+    found = []
+    # A top-level JSON array or scalar has no "name". get_field() already
+    # returns None for such values, so only this lookup needs a guard.
+    item_name = data.get("name", "(unnamed)") if isinstance(data, dict) else "(unnamed)"
+    # Fields already reported as truncated by the strict check (check 1).
+    strict_truncated = set()
+
+    # Check all relevant fields
+    for field in IMPORTANT_FIELDS:
+        val = get_field(data, field)
+        if not val or not isinstance(val, str):
+            continue
+
+        plain = strip_html(val).strip()
+
+        # 1. Check truncation (strict)
+        if looks_truncated_strict(val):
+            strict_truncated.add(field)
+            found.append({
+                "file": rel,
+                "field": field,
+                "issue": "truncated",
+                "item_name": item_name,
+                "current_end": f"...{plain[-100:]}",
+                "current_full_preview": f"{plain[:200]}",
+                "correct_continuation": search_pdf(val),
+            })
+
+        # 2. Check bad newlines in HTML strings
+        if has_bad_newlines(val):
+            found.append({
+                "file": rel,
+                "field": field,
+                "issue": "unwanted_newlines",
+                "item_name": item_name,
+                "current_text": val[:300],
+                "correct_continuation": None,
+            })
+
+        # 3. Check malformed HTML
+        html_errors = check_unclosed_tags(val)
+        if html_errors:
+            found.append({
+                "file": rel,
+                "field": field,
+                "issue": "malformed_html",
+                "item_name": item_name,
+                "html_errors": html_errors,
+                "current_text": val[:300],
+                "correct_continuation": None,
+            })
+
+    # 4. Check system.style (plain text field) with the looser rule: it must
+    #    end on sentence punctuation. Skipped when check 1 already reported
+    #    this field, so one truncation is not recorded twice.
+    style_val = get_field(data, "system.style")
+    if (
+        style_val
+        and isinstance(style_val, str)
+        and "system.style" not in strict_truncated
+    ):
+        plain_style = style_val.strip()
+        if plain_style and plain_style[-1] not in '.!?»)':
+            found.append({
+                "file": rel,
+                "field": "system.style",
+                "issue": "truncated",
+                "item_name": item_name,
+                "current_end": f"...{plain_style[-100:]}",
+                "current_full_preview": f"{plain_style[:200]}",
+                "correct_continuation": search_pdf(plain_style),
+            })
+
+    # 5. Bleeding content: look for HTML tags in non-HTML fields
+    for field in ["system.style", "system.reference", "system.speciality"]:
+        val = get_field(data, field)
+        if val and isinstance(val, str) and '<' in val:
+            found.append({
+                "file": rel,
+                "field": field,
+                "issue": "html_in_plain_field",
+                "item_name": item_name,
+                "current_text": val[:300],
+                "correct_continuation": None,
+            })
+
+    # 6. Check for text before the first tag in description-like fields
+    #    (only the start of the value is examined).
+    for field in ["system.description", "system.effects", "system.examples", "system.components", "system.notes"]:
+        val = get_field(data, field)
+        if not val or not isinstance(val, str):
+            continue
+        stripped = val.strip()
+        if stripped and not stripped.startswith('<'):
+            found.append({
+                "file": rel,
+                "field": field,
+                "issue": "text_outside_html_tags",
+                "item_name": item_name,
+                "current_text": val[:300],
+                "correct_continuation": None,
+            })
+
+    # 7. Check technique fields for bleeding: more than three closed <p>
+    #    blocks in one technique text is flagged as suspicious.
+    for tkey in ["technique1", "technique2", "technique3"]:
+        field = f"system.techniques.{tkey}.technique"
+        # get_field() tolerates a technique entry that is not a dict.
+        tech_text = get_field(data, field)
+        if not tech_text or not isinstance(tech_text, str):
+            continue
+        p_count = tech_text.count('</p>')
+        if p_count > 3:
+            found.append({
+                "file": rel,
+                "field": field,
+                "issue": "possible_bleeding_content",
+                "item_name": item_name,
+                "paragraph_count": p_count,
+                "current_text": tech_text[:400],
+                "correct_continuation": None,
+            })
+
+    return found
+
+
+json_files = sorted(PACKS.rglob("*.json"))
+print(f"Scanning {len(json_files)} JSON files...", flush=True)
+
+for jf in json_files:
+    rel = str(jf.relative_to(PACKS))
+    try:
+        data = json.loads(jf.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, UnicodeDecodeError) as e:
+        # A file that is not valid UTF-8 or not valid JSON is recorded and
+        # skipped so that one bad file does not abort the whole scan.
+        issues.append({
+            "file": rel,
+            "field": "(file)",
+            "issue": "json_parse_error",
+            "current_text": str(e),
+            "correct_continuation": None,
+        })
+        continue
+
+    issues.extend(_scan_item(rel, data))
+
+print(f"Found {len(issues)} potential issues.", flush=True)
+
+# ---------- output ----------
+
+from collections import defaultdict
+
+out_json = BASE / "compendium-issues.json"
+out_txt = BASE / "compendium-issues.txt"
+
+# Machine-readable dump of every issue record.
+with open(out_json, 'w', encoding='utf-8') as f:
+    json.dump(issues, f, ensure_ascii=False, indent=2)
+
+# Index the issues by type (for the summary) and by file (for the details).
+by_type = defaultdict(list)
+by_file = defaultdict(list)
+for issue in issues:
+    by_type[issue['issue']].append(issue)
+    by_file[issue['file']].append(issue)
+
+# Optional detail lines: (record key, line prefix, max length or None).
+# A line is emitted only when the record holds a truthy value for the key.
+_detail_rows = (
+    ('item_name', "  ITEM:  ", None),
+    ('current_end', "  END:   ", None),
+    ('current_full_preview', "  TEXT:  ", 200),
+    ('current_text', "  TEXT:  ", 200),
+    ('html_errors', "  HTML ERRORS: ", None),
+    ('correct_continuation', "  PDF:   ", 300),
+)
+
+rule = "=" * 80
+report = [
+    rule + "\n",
+    "COMPENDIUM TEXT QUALITY REPORT\n",
+    "Les Chroniques de l'Étrange — FoundryVTT\n",
+    rule + "\n\n",
+    f"Total files scanned: {len(json_files)}\n",
+    f"Total issues found: {len(issues)}\n\n",
+    "SUMMARY BY ISSUE TYPE:\n",
+]
+for itype in sorted(by_type):
+    report.append(f"  {itype}: {len(by_type[itype])}\n")
+report.append("\n")
+report.append(rule + "\n")
+report.append("DETAILED ISSUES BY FILE\n")
+report.append(rule + "\n\n")
+
+for fpath in sorted(by_file):
+    report.append(f"\n--- {fpath} ---\n")
+    for issue in by_file[fpath]:
+        report.append(f"  FIELD: {issue['field']}\n")
+        report.append(f"  ISSUE: {issue['issue']}\n")
+        for key, label, limit in _detail_rows:
+            value = issue.get(key)
+            if not value:
+                continue
+            shown = value if limit is None else value[:limit]
+            report.append(f"{label}{shown}\n")
+        report.append("\n")
+
+# Human-readable report, written in one pass.
+with open(out_txt, 'w', encoding='utf-8') as f:
+    f.writelines(report)
+
+print(f"Reports written to:\n  {out_json}\n  {out_txt}", flush=True)