Correction compendiums

2026-04-27 17:49:00 +02:00
parent d12a7debdf
commit 1e252ff6f2
136 changed files with 38971 additions and 345 deletions
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+import json, re
+from pathlib import Path
+from collections import defaultdict
+
# Root directory of the project checkout; every input and output path of this
# script is derived from it.
BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")

# Directory that is scanned recursively for compendium JSON sources.
PACKS = BASE.joinpath("packs-src")

# Whole rules text, read once at start-up. pdf_search() looks field endings up
# in it (presumably a text extraction of the rulebook PDF - confirm).
pdf_text = BASE.joinpath("regles.txt").read_text(encoding="utf-8")

# Case-insensitive alternatives that has_watermark_bleed() treats as watermark
# residue inside a field's text.
WATERMARK_RE = re.compile(
    r's\s*c\s*r\s*a\s*l\s*e\s*l'
    r'|les\s+chroniqu'
    r'|de\s+l.etrange'
    r'|chr.niqu'
    r'|hr\s+ng',
    flags=re.IGNORECASE,
)
+
def strip_html(html):
    """Return *html* without its ``<...>`` tags and without outer whitespace.

    A falsy argument (``None``, ``""``) yields ``""``. Only tags are removed;
    character entities such as ``&nbsp;`` are left as they are.
    """
    if not html:
        return ''
    without_tags = re.sub(r'<[^>]+>', '', html)
    return without_tags.strip()
+
def has_watermark_bleed(text):
    """Return True when the tag-stripped *text* contains a WATERMARK_RE match."""
    return WATERMARK_RE.search(strip_html(text)) is not None
+
def has_bad_newlines(text):
    """Report whether a multi-line *text* has a line break inside running text.

    Text without any ``'\\n'`` is never flagged. Otherwise the result is True
    as soon as one line, once stripped, is longer than 3 characters, does not
    begin like a tag (``<`` followed by ``/`` or a letter) and does not end
    with ``>``.
    """
    if '\n' not in text:
        return False

    def is_loose(fragment):
        # A "loose" line is neither opened nor closed by markup.
        stripped = fragment.strip()
        if len(stripped) <= 3:
            return False
        if stripped.endswith('>'):
            return False
        return re.match(r'^<[/a-zA-Z]', stripped) is None

    return any(is_loose(piece) for piece in text.split('\n'))
+
def looks_truncated(text):
    """Heuristically tell whether *text* seems to stop mid-sentence.

    The tags are stripped, then a trailing run that starts with three
    whitespace-separated groups of 1, 1 and 1-2 lowercase letters is removed
    (presumably spaced-out watermark residue - confirm). The text counts as
    truncated when the last remaining character is a letter or one of
    ``, ; : - (``.

    Returns False for empty input and for input that holds markup only
    (e.g. ``"<p></p>"``), because there is no text that could be cut off.
    """
    if not text:
        return False
    plain = strip_html(text).strip()
    trimmed = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', plain).strip()
    # If the clean-up pattern swallowed everything, judge the untrimmed text.
    candidate = trimmed or plain
    if not candidate:
        # Without this guard the last "character" would be '' and
        # `'' in ',;:-('` is True, so empty markup was reported as truncated.
        return False
    last = candidate[-1]
    return last.isalpha() or last in ',;:-('
+
def get_field(data, path):
    """Follow the dot-separated *path* through nested dicts starting at *data*.

    Returns the value found at the end of the path, or None as soon as a step
    lands on something that is not a dict, on a missing key, or on a None
    value.
    """
    node = data
    for key in path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
        if node is None:
            return None
    return node
+
def pdf_search(keyword_text, context=500):
    """Find the end of *keyword_text* in ``pdf_text`` and return what follows.

    The tags are stripped and the same trailing spaced-letter run that
    looks_truncated() ignores is removed. Then the last 40, 30, 20 and 15
    characters are tried in turn as a search key, skipping keys shorter than
    10 characters once whitespace is collapsed.

    The key is matched word by word with any run of whitespace allowed
    between words. ``pdf_text`` keeps its original line breaks, so an exact
    substring search could never match a key that spans a line wrap.

    Args:
        keyword_text: Field value (HTML or plain text) whose ending is looked up.
        context: Number of characters of ``pdf_text`` to include after the match.

    Returns:
        The matched text plus up to *context* following characters, with
        newlines turned into spaces and long whitespace runs shortened, cut
        to at most 600 characters; or None when the cleaned text is shorter
        than 15 characters or no key matches.
    """
    plain = strip_html(keyword_text)
    plain = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', plain).strip()
    if len(plain) < 15:
        return None
    for suffix_len in (40, 30, 20, 15):
        words = plain[-suffix_len:].split()
        if len(' '.join(words)) < 10:
            continue
        # Escape each word so punctuation in the text is matched literally.
        pattern = r'\s+'.join(re.escape(word) for word in words)
        hit = re.search(pattern, pdf_text)
        if hit is None:
            continue
        snippet = pdf_text[hit.start():hit.end() + context]
        snippet = re.sub(r'\n+', ' ', snippet)
        snippet = re.sub(r'\s{3,}', '  ', snippet)
        # Hard cap kept from the original so report entries stay bounded.
        return snippet[:600]
    return None
+
# One dict per finding; this list is what ends up in the JSON and text reports.
issues = []

# Every JSON file below PACKS, in sorted path order.
all_files = list(PACKS.rglob("*.json"))
all_files.sort()
file_count = len(all_files)
print(f"Scanning {file_count} files...", flush=True)

# Dotted paths (resolved with get_field) of the text fields that are checked
# in every item. The HTML and plain lists are concatenated by the scan loop
# and receive the same checks.
_TECHNIQUE_FIELDS = [
    f"system.techniques.technique{slot}.technique" for slot in (1, 2, 3)
]
HTML_FIELDS = [
    "system.description",
    "system.effects",
    "system.examples",
    "system.components",
    "system.notes",
    *_TECHNIQUE_FIELDS,
]
PLAIN_FIELDS = ["system.style"]
+
# Scan every compendium file and record findings in `issues`.
# `all_files` is already sorted where it is built, so it is not sorted again.
for jf in all_files:
    rel = str(jf.relative_to(PACKS))
    try:
        data = json.loads(jf.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
                       "item_name": "?", "current_text": str(e)})
        continue

    if not isinstance(data, dict):
        # Valid JSON whose root is a list/string/number: the field lookups
        # below need an object, so report the file instead of crashing on
        # `data.get`.
        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
                       "item_name": "?",
                       "current_text": "top-level JSON value is "
                                       f"{type(data).__name__}, expected an object"})
        continue

    name = data.get("name", "?")

    def add_issue(field, issue_type, **kwargs):
        """Append one finding for the file/item currently being scanned."""
        issues.append({"file": rel, "field": field, "issue": issue_type,
                       "item_name": name, **kwargs})

    # Generic text checks, applied identically to every configured field.
    for field in HTML_FIELDS + PLAIN_FIELDS:
        val = get_field(data, field)
        if not isinstance(val, str) or not val.strip():
            continue
        plain = strip_html(val)

        # A watermark hit takes precedence: the truncation check is skipped
        # for such a field.
        if has_watermark_bleed(val):
            pdf_ctx = pdf_search(val)
            add_issue(field, "bleeding_watermark",
                      current_text=val[:400],
                      plain_text=plain[:300],
                      pdf_context=pdf_ctx)

        elif looks_truncated(val):
            is_ingredient = 'cde-ingredients' in rel
            if is_ingredient and len(plain) < 30:
                add_issue(field, "truncated_or_short",
                          current_text=plain,
                          note="May be legitimate (ingredient quantity)",
                          pdf_context=pdf_search(plain))
            else:
                pdf_ctx = pdf_search(val)
                add_issue(field, "truncated",
                          current_end=plain[-120:],
                          current_preview=plain[:200],
                          pdf_context=pdf_ctx)

        # Checked independently of the two findings above.
        if has_bad_newlines(val):
            add_issue(field, "unwanted_newlines",
                      current_text=val[:400],
                      plain_text=plain[:300])

    # Technique-specific checks: text of several entries merged into one slot.
    for tkey in ('technique1', 'technique2', 'technique3'):
        tech_field = f"system.techniques.{tkey}.technique"
        # get_field returns None for a missing slot and for a slot that is
        # not a dict; a non-string text is skipped too, so malformed entries
        # no longer abort the scan.
        t_text = get_field(data, tech_field)
        if not t_text or not isinstance(t_text, str):
            continue
        plain_t = strip_html(t_text)
        activation_count = plain_t.count("Activation :")
        if activation_count > 1:
            add_issue(tech_field,
                      "bleeding_multiple_techniques",
                      activation_count=activation_count,
                      current_text=t_text[:500],
                      note=f"{activation_count} 'Activation :' markers found")
        if ("Style" in plain_t or "Orientation :" in plain_t) and len(plain_t) > 300:
            add_issue(tech_field,
                      "bleeding_style_or_orientation",
                      current_text=t_text[:500],
                      note="Contains 'Style' or 'Orientation' markers inside technique text")
+
issue_total = len(issues)
print(f"Found {issue_total} issues.", flush=True)

# Both reports are written next to the project sources.
out_json = BASE.joinpath("compendium-issues.json")
out_txt = BASE.joinpath("compendium-issues.txt")

# Machine-readable report: the raw list of findings.
with out_json.open('w', encoding='utf-8') as json_handle:
    json.dump(issues, json_handle, ensure_ascii=False, indent=2)

# Index the findings by issue type and by file for the text report.
by_type = defaultdict(list)
by_file = defaultdict(list)
for entry in issues:
    for index, key in ((by_type, 'issue'), (by_file, 'file')):
        index[entry[key]].append(entry)
+
# Human-readable report: header, per-type summary, per-file summary, then one
# detailed section per issue type.
with open(out_txt, 'w', encoding='utf-8') as f:
    w = f.write
    w("=" * 80 + "\n")
    w("COMPENDIUM TEXT QUALITY REPORT\n")
    w("Les Chroniques de l'Etrange — FoundryVTT\n")
    w("=" * 80 + "\n\n")
    w(f"Files scanned: {len(all_files)}\n")
    w(f"Files with issues: {len(by_file)}\n")
    w(f"Total issues: {len(issues)}\n\n")
    w("SUMMARY BY ISSUE TYPE:\n")
    # Most frequent issue type first.
    for itype, ilist in sorted(by_type.items(), key=lambda x: -len(x[1])):
        w(f"  {itype:50s}  {len(ilist):3d}\n")
    w("\nFILES WITH ISSUES:\n")
    for fpath in sorted(by_file.keys()):
        types = sorted(set(i['issue'] for i in by_file[fpath]))
        w(f"  {fpath}  [{', '.join(types)}]\n")
    w("\n")
    w("=" * 80 + "\n")
    w("DETAILED ISSUES\n")
    w("=" * 80 + "\n")
    # Preferred section order for the known issue types.
    detail_order = ['bleeding_watermark', 'bleeding_multiple_techniques',
                    'bleeding_style_or_orientation', 'truncated',
                    'unwanted_newlines', 'truncated_or_short']
    # Any other recorded type (e.g. 'json_error') follows, so every issue
    # counted in the summary also gets a detailed entry.
    detail_order += sorted(t for t in by_type if t not in detail_order)
    for itype in detail_order:
        ilist = by_type.get(itype, [])
        if not ilist:
            continue
        w(f"\n{'─'*80}\n")
        w(f"ISSUE TYPE: {itype}  ({len(ilist)} occurrences)\n")
        w(f"{'─'*80}\n")
        for iss in ilist:
            w(f"\n  File:  {iss['file']}\n")
            w(f"  Item:  {iss.get('item_name','?')}\n")
            w(f"  Field: {iss['field']}\n")
            # Optional keys: each issue type fills a different subset.
            if iss.get('note'):
                w(f"  Note:  {iss['note']}\n")
            if iss.get('current_end'):
                w(f"  Ends:  ...{iss['current_end']}\n")
            if iss.get('current_preview'):
                w(f"  Text:  {iss['current_preview'][:200]}\n")
            if iss.get('current_text'):
                ct = iss['current_text']
                w(f"  Text:  {ct[:300]}\n")
            if iss.get('pdf_context'):
                w(f"  PDF>>: {iss['pdf_context'][:400]}\n")

print(f"Written: {out_json}\n        {out_txt}", flush=True)