Avec initiative

2026-04-29 22:21:41 +02:00
parent 64ab54daf3
commit 1fa6dbf2f8
6 changed files with 0 additions and 4533 deletions
@@ -1,330 +0,0 @@
-#!/usr/bin/env python3
-"""Analyze all JSON files in packs-src/ for text quality issues."""
-
-import json
-import os
-import re
-import sys
-from pathlib import Path
-from html.parser import HTMLParser
-
-BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
-PACKS = BASE / "packs-src"
-REGLES = BASE / "regles.txt"
-
-# Load PDF text
-pdf_lines = REGLES.read_text(encoding="utf-8").splitlines()
-pdf_text = REGLES.read_text(encoding="utf-8")
-
-issues = []
-
-# ---------- helpers ----------
-
-def strip_html(html):
-    """Remove HTML tags and return plain text."""
-    return re.sub(r'<[^>]+>', '', html or '')
-
-def check_unclosed_tags(html):
-    """Returns list of unclosed/mismatched tags."""
-    open_tags = re.findall(r'<([a-zA-Z][a-zA-Z0-9]*)[^>]*>', html)
-    close_tags = re.findall(r'</([a-zA-Z][a-zA-Z0-9]*)>', html)
-    issues_found = []
-    # basic: count opens vs closes for block-level tags
-    for tag in ['ul', 'ol', 'li', 'p', 'div', 'strong', 'em', 'b', 'i']:
-        opens = open_tags.count(tag)
-        closes = close_tags.count(tag)
-        if opens != closes:
-            issues_found.append(f"<{tag}>: {opens} open, {closes} close")
-    return issues_found
-
-def has_bad_newlines(html):
-    """Check for literal \\n inside HTML strings that would render as bad breaks."""
-    # In JSON, \n is a newline. In HTML strings, raw newlines can be bad.
-    return '\n' in html
-
-def looks_truncated(text):
-    """Heuristics for truncation - text ends without proper punctuation."""
-    if not text:
-        return False
-    plain = strip_html(text).strip()
-    if not plain:
-        return False
-    # ends without sentence-ending punctuation
-    if plain and plain[-1] not in '.!?»)':
-        return True
-    return False
-
-def looks_truncated_strict(text):
-    """Stricter: ends mid-word or mid-sentence."""
-    if not text:
-        return False
-    plain = strip_html(text).strip()
-    if not plain:
-        return False
-    # ends mid-word (no space before end, no punctuation)
-    last_char = plain[-1] if plain else ''
-    if last_char.isalpha() or last_char in ',;:-(':
-        return True
-    return False
-
-def get_field(data, path):
-    """Get nested field value by dot-path."""
-    parts = path.split('.')
-    cur = data
-    for p in parts:
-        if isinstance(cur, dict):
-            cur = cur.get(p)
-        else:
-            return None
-        if cur is None:
-            return None
-    return cur
-
-def search_pdf(keyword, context=300):
-    """Search PDF text for a keyword and return surrounding context."""
-    # clean keyword for searching
-    kw = re.sub(r'<[^>]+>', '', keyword).strip()
-    if len(kw) < 10:
-        return None
-    # take last 30 chars of plain text as search key
-    search_key = kw[-30:].strip()
-    # normalize whitespace
-    search_key_norm = re.sub(r'\s+', ' ', search_key)
-    
-    # Try to find in PDF
-    idx = pdf_text.find(search_key_norm)
-    if idx == -1:
-        # try shorter
-        search_key_norm = re.sub(r'\s+', ' ', kw[-20:].strip())
-        idx = pdf_text.find(search_key_norm)
-    if idx == -1:
-        # try even shorter
-        search_key_norm = re.sub(r'\s+', ' ', kw[-15:].strip())
-        idx = pdf_text.find(search_key_norm)
-    
-    if idx == -1:
-        return None
-    
-    start = max(0, idx - 50)
-    end = min(len(pdf_text), idx + len(search_key_norm) + context)
-    return pdf_text[start:end].replace('\n', ' ')
-
-def get_all_html_fields(data, prefix=""):
-    """Recursively yield (field_path, value) for all string fields containing HTML."""
-    if isinstance(data, dict):
-        for k, v in data.items():
-            path = f"{prefix}.{k}" if prefix else k
-            if isinstance(v, str) and ('<' in v or len(v) > 50):
-                yield path, v
-            elif isinstance(v, (dict, list)):
-                yield from get_all_html_fields(v, path)
-    elif isinstance(data, list):
-        for i, v in enumerate(data):
-            yield from get_all_html_fields(v, f"{prefix}[{i}]")
-
-# ---------- fields to check ----------
-
-IMPORTANT_FIELDS = [
-    "system.description",
-    "system.effects",
-    "system.examples",
-    "system.components",
-    "system.notes",
-    "system.style",
-    "system.techniques.technique1.technique",
-    "system.techniques.technique2.technique",
-    "system.techniques.technique3.technique",
-]
-
-# ---------- main scan ----------
-
-json_files = sorted(PACKS.rglob("*.json"))
-print(f"Scanning {len(json_files)} JSON files...", flush=True)
-
-for jf in json_files:
-    rel = str(jf.relative_to(PACKS))
-    try:
-        data = json.loads(jf.read_text(encoding="utf-8"))
-    except json.JSONDecodeError as e:
-        issues.append({
-            "file": rel,
-            "field": "(file)",
-            "issue": "json_parse_error",
-            "current_text": str(e),
-            "correct_continuation": None,
-        })
-        continue
-
-    item_name = data.get("name", "(unnamed)")
-
-    # Check all relevant fields
-    for field in IMPORTANT_FIELDS:
-        val = get_field(data, field)
-        if not val or not isinstance(val, str):
-            continue
-
-        plain = strip_html(val).strip()
-        
-        # 1. Check truncation (strict)
-        if looks_truncated_strict(val):
-            pdf_context = search_pdf(val)
-            issues.append({
-                "file": rel,
-                "field": field,
-                "issue": "truncated",
-                "item_name": item_name,
-                "current_end": f"...{plain[-100:]}",
-                "current_full_preview": f"{plain[:200]}",
-                "correct_continuation": pdf_context,
-            })
-        
-        # 2. Check bad newlines in HTML strings
-        if has_bad_newlines(val):
-            issues.append({
-                "file": rel,
-                "field": field,
-                "issue": "unwanted_newlines",
-                "item_name": item_name,
-                "current_text": val[:300],
-                "correct_continuation": None,
-            })
-        
-        # 3. Check malformed HTML
-        html_errors = check_unclosed_tags(val)
-        if html_errors:
-            issues.append({
-                "file": rel,
-                "field": field,
-                "issue": "malformed_html",
-                "item_name": item_name,
-                "html_errors": html_errors,
-                "current_text": val[:300],
-                "correct_continuation": None,
-            })
-
-    # 4. Check system.style (plain text field, can also be truncated)
-    style_val = get_field(data, "system.style")
-    if style_val and isinstance(style_val, str):
-        plain_style = style_val.strip()
-        if plain_style and plain_style[-1] not in '.!?»)':
-            pdf_context = search_pdf(plain_style)
-            issues.append({
-                "file": rel,
-                "field": "system.style",
-                "issue": "truncated",
-                "item_name": item_name,
-                "current_end": f"...{plain_style[-100:]}",
-                "current_full_preview": f"{plain_style[:200]}",
-                "correct_continuation": pdf_context,
-            })
-
-    # 5. Bleeding content: look for HTML tags in non-HTML fields  
-    for field in ["system.style", "system.reference", "system.speciality"]:
-        val = get_field(data, field)
-        if val and isinstance(val, str) and '<' in val:
-            issues.append({
-                "file": rel,
-                "field": field,
-                "issue": "html_in_plain_field",
-                "item_name": item_name,
-                "current_text": val[:300],
-                "correct_continuation": None,
-            })
-
-    # 6. Check for text outside HTML tags in description-like fields (bleeding)
-    for field in ["system.description", "system.effects", "system.examples", "system.components", "system.notes"]:
-        val = get_field(data, field)
-        if not val or not isinstance(val, str):
-            continue
-        # Strip all HTML and check if leading text is outside tags
-        # e.g., "<p>foo</p> some leaked text <p>bar</p>"
-        # Check if there's text before the first tag
-        stripped = val.strip()
-        if stripped and not stripped.startswith('<'):
-            issues.append({
-                "file": rel,
-                "field": field,
-                "issue": "text_outside_html_tags",
-                "item_name": item_name,
-                "current_text": val[:300],
-                "correct_continuation": None,
-            })
-
-    # 7. Check technique fields for bleeding (multiple paragraphs that shouldn't be there)
-    for tkey in ["technique1", "technique2", "technique3"]:
-        tech = get_field(data, f"system.techniques.{tkey}")
-        if not tech:
-            continue
-        tech_text = tech.get("technique", "")
-        if tech_text:
-            plain = strip_html(tech_text).strip()
-            # Check for suspiciously long techniques that might have bled content
-            # Techniques with multiple <p> blocks may be fine, but flag very long ones
-            p_count = tech_text.count('</p>')
-            if p_count > 3:
-                issues.append({
-                    "file": rel,
-                    "field": f"system.techniques.{tkey}.technique",
-                    "issue": "possible_bleeding_content",
-                    "item_name": item_name,
-                    "paragraph_count": p_count,
-                    "current_text": tech_text[:400],
-                    "correct_continuation": None,
-                })
-
-print(f"Found {len(issues)} potential issues.", flush=True)
-
-# ---------- output ----------
-
-out_json = BASE / "compendium-issues.json"
-out_txt = BASE / "compendium-issues.txt"
-
-with open(out_json, 'w', encoding='utf-8') as f:
-    json.dump(issues, f, ensure_ascii=False, indent=2)
-
-# Group by issue type for summary
-from collections import defaultdict
-by_type = defaultdict(list)
-by_file = defaultdict(list)
-for issue in issues:
-    by_type[issue['issue']].append(issue)
-    by_file[issue['file']].append(issue)
-
-with open(out_txt, 'w', encoding='utf-8') as f:
-    f.write("=" * 80 + "\n")
-    f.write("COMPENDIUM TEXT QUALITY REPORT\n")
-    f.write("Les Chroniques de l'Étrange — FoundryVTT\n")
-    f.write("=" * 80 + "\n\n")
-    
-    f.write(f"Total files scanned: {len(json_files)}\n")
-    f.write(f"Total issues found: {len(issues)}\n\n")
-    
-    f.write("SUMMARY BY ISSUE TYPE:\n")
-    for itype, ilist in sorted(by_type.items()):
-        f.write(f"  {itype}: {len(ilist)}\n")
-    f.write("\n")
-    
-    f.write("=" * 80 + "\n")
-    f.write("DETAILED ISSUES BY FILE\n")
-    f.write("=" * 80 + "\n\n")
-    
-    for fpath in sorted(by_file.keys()):
-        f.write(f"\n--- {fpath} ---\n")
-        for issue in by_file[fpath]:
-            f.write(f"  FIELD: {issue['field']}\n")
-            f.write(f"  ISSUE: {issue['issue']}\n")
-            if issue.get('item_name'):
-                f.write(f"  ITEM:  {issue['item_name']}\n")
-            if issue.get('current_end'):
-                f.write(f"  END:   {issue['current_end']}\n")
-            if issue.get('current_full_preview'):
-                f.write(f"  TEXT:  {issue['current_full_preview'][:200]}\n")
-            if issue.get('current_text'):
-                f.write(f"  TEXT:  {issue['current_text'][:200]}\n")
-            if issue.get('html_errors'):
-                f.write(f"  HTML ERRORS: {issue['html_errors']}\n")
-            if issue.get('correct_continuation'):
-                f.write(f"  PDF:   {issue['correct_continuation'][:300]}\n")
-            f.write("\n")
-
-print(f"Reports written to:\n  {out_json}\n  {out_txt}", flush=True)
@@ -1,213 +0,0 @@
-#!/usr/bin/env python3
-import json, re
-from pathlib import Path
-from collections import defaultdict
-
-BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
-PACKS = BASE / "packs-src"
-pdf_text = (BASE / "regles.txt").read_text(encoding="utf-8")
-
-WATERMARK_RE = re.compile(
-    r's\s*c\s*r\s*a\s*l\s*e\s*l|les\s+chroniqu|de\s+l.etrange|chr.niqu|hr\s+ng',
-    re.IGNORECASE)
-
-def strip_html(html):
-    return re.sub(r'<[^>]+>', '', html or '').strip()
-
-def has_watermark_bleed(text):
-    plain = strip_html(text)
-    return bool(WATERMARK_RE.search(plain))
-
-def has_bad_newlines(text):
-    lines = text.split('\n')
-    if len(lines) <= 1:
-        return False
-    for line in lines:
-        s = line.strip()
-        if s and not re.match(r'^<[/a-zA-Z]', s) and not s.endswith('>') and len(s) > 3:
-            return True
-    return False
-
-def looks_truncated(text):
-    if not text:
-        return False
-    plain = strip_html(text).strip()
-    plain_clean = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', plain).strip()
-    if not plain_clean:
-        plain_clean = plain
-    last = plain_clean[-1] if plain_clean else ''
-    return last.isalpha() or last in ',;:-('
-
-def get_field(data, path):
-    parts = path.split('.')
-    cur = data
-    for p in parts:
-        if isinstance(cur, dict):
-            cur = cur.get(p)
-        else:
-            return None
-        if cur is None:
-            return None
-    return cur
-
-def pdf_search(keyword_text, context=500):
-    plain = strip_html(keyword_text)
-    plain = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', plain).strip()
-    if len(plain) < 15:
-        return None
-    for suffix_len in [40, 30, 20, 15]:
-        suffix = re.sub(r'\s+', ' ', plain[-suffix_len:]).strip()
-        if len(suffix) < 10:
-            continue
-        idx = pdf_text.find(suffix)
-        if idx != -1:
-            snippet = pdf_text[idx:min(len(pdf_text), idx + len(suffix) + context)]
-            snippet = re.sub(r'\n+', ' ', snippet)
-            snippet = re.sub(r'\s{3,}', '  ', snippet)
-            return snippet[:600]
-    return None
-
-issues = []
-all_files = sorted(PACKS.rglob("*.json"))
-print(f"Scanning {len(all_files)} files...", flush=True)
-
-HTML_FIELDS = [
-    "system.description",
-    "system.effects",
-    "system.examples",
-    "system.components",
-    "system.notes",
-    "system.techniques.technique1.technique",
-    "system.techniques.technique2.technique",
-    "system.techniques.technique3.technique",
-]
-PLAIN_FIELDS = ["system.style"]
-
-for jf in sorted(all_files):
-    rel = str(jf.relative_to(PACKS))
-    try:
-        data = json.loads(jf.read_text(encoding="utf-8"))
-    except Exception as e:
-        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
-                       "item_name": "?", "current_text": str(e)})
-        continue
-
-    name = data.get("name", "?")
-
-    def add_issue(field, issue_type, **kwargs):
-        issues.append({"file": rel, "field": field, "issue": issue_type,
-                       "item_name": name, **kwargs})
-
-    for field in HTML_FIELDS + PLAIN_FIELDS:
-        val = get_field(data, field)
-        if not val or not isinstance(val, str) or not val.strip():
-            continue
-        plain = strip_html(val).strip()
-
-        if has_watermark_bleed(val):
-            pdf_ctx = pdf_search(val)
-            add_issue(field, "bleeding_watermark",
-                      current_text=val[:400],
-                      plain_text=plain[:300],
-                      pdf_context=pdf_ctx)
-
-        elif looks_truncated(val):
-            is_ingredient = 'cde-ingredients' in rel
-            if is_ingredient and len(plain) < 30:
-                add_issue(field, "truncated_or_short",
-                          current_text=plain,
-                          note="May be legitimate (ingredient quantity)",
-                          pdf_context=pdf_search(plain))
-            else:
-                pdf_ctx = pdf_search(val)
-                add_issue(field, "truncated",
-                          current_end=plain[-120:],
-                          current_preview=plain[:200],
-                          pdf_context=pdf_ctx)
-
-        if has_bad_newlines(val):
-            add_issue(field, "unwanted_newlines",
-                      current_text=val[:400],
-                      plain_text=plain[:300])
-
-    for tkey in ['technique1', 'technique2', 'technique3']:
-        tech = get_field(data, f"system.techniques.{tkey}")
-        if not tech:
-            continue
-        t_text = tech.get("technique", "")
-        if not t_text:
-            continue
-        plain_t = strip_html(t_text)
-        activation_count = plain_t.count("Activation :")
-        if activation_count > 1:
-            add_issue(f"system.techniques.{tkey}.technique",
-                      "bleeding_multiple_techniques",
-                      activation_count=activation_count,
-                      current_text=t_text[:500],
-                      note=f"{activation_count} 'Activation :' markers found")
-        if ("Style" in plain_t or "Orientation :" in plain_t) and len(plain_t) > 300:
-            add_issue(f"system.techniques.{tkey}.technique",
-                      "bleeding_style_or_orientation",
-                      current_text=t_text[:500],
-                      note="Contains 'Style' or 'Orientation' markers inside technique text")
-
-print(f"Found {len(issues)} issues.", flush=True)
-
-out_json = BASE / "compendium-issues.json"
-out_txt = BASE / "compendium-issues.txt"
-
-with open(out_json, 'w', encoding='utf-8') as f:
-    json.dump(issues, f, ensure_ascii=False, indent=2)
-
-by_type = defaultdict(list)
-by_file = defaultdict(list)
-for iss in issues:
-    by_type[iss['issue']].append(iss)
-    by_file[iss['file']].append(iss)
-
-with open(out_txt, 'w', encoding='utf-8') as f:
-    w = f.write
-    w("=" * 80 + "\n")
-    w("COMPENDIUM TEXT QUALITY REPORT\n")
-    w("Les Chroniques de l'Etrange — FoundryVTT\n")
-    w("=" * 80 + "\n\n")
-    w(f"Files scanned: {len(all_files)}\n")
-    w(f"Files with issues: {len(by_file)}\n")
-    w(f"Total issues: {len(issues)}\n\n")
-    w("SUMMARY BY ISSUE TYPE:\n")
-    for itype, ilist in sorted(by_type.items(), key=lambda x: -len(x[1])):
-        w(f"  {itype:50s}  {len(ilist):3d}\n")
-    w("\nFILES WITH ISSUES:\n")
-    for fpath in sorted(by_file.keys()):
-        types = sorted(set(i['issue'] for i in by_file[fpath]))
-        w(f"  {fpath}  [{', '.join(types)}]\n")
-    w("\n")
-    w("=" * 80 + "\n")
-    w("DETAILED ISSUES\n")
-    w("=" * 80 + "\n")
-    for itype in ['bleeding_watermark', 'bleeding_multiple_techniques',
-                  'bleeding_style_or_orientation', 'truncated',
-                  'unwanted_newlines', 'truncated_or_short']:
-        ilist = by_type.get(itype, [])
-        if not ilist:
-            continue
-        w(f"\n{'─'*80}\n")
-        w(f"ISSUE TYPE: {itype}  ({len(ilist)} occurrences)\n")
-        w(f"{'─'*80}\n")
-        for iss in ilist:
-            w(f"\n  File:  {iss['file']}\n")
-            w(f"  Item:  {iss.get('item_name','?')}\n")
-            w(f"  Field: {iss['field']}\n")
-            if iss.get('note'):
-                w(f"  Note:  {iss['note']}\n")
-            if iss.get('current_end'):
-                w(f"  Ends:  ...{iss['current_end']}\n")
-            if iss.get('current_preview'):
-                w(f"  Text:  {iss['current_preview'][:200]}\n")
-            if iss.get('current_text'):
-                ct = iss['current_text']
-                w(f"  Text:  {ct[:300]}\n")
-            if iss.get('pdf_context'):
-                w(f"  PDF>>: {iss['pdf_context'][:400]}\n")
-
-print(f"Written: {out_json}\n        {out_txt}", flush=True)
@@ -1,317 +0,0 @@
-#!/usr/bin/env python3
-"""Final comprehensive analysis including missing beginnings and garbled content."""
-
-import json, re
-from pathlib import Path
-from collections import defaultdict
-
-BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
-PACKS = BASE / "packs-src"
-pdf_text = (BASE / "regles.txt").read_text(encoding="utf-8")
-
-WATERMARK_RE = re.compile(
-    r's\s*c\s*r\s*a\s*l\s*e\s*l|les\s+chroniqu|de\s+l.etrange|chr.niqu|hr\s+ng',
-    re.IGNORECASE)
-
-def strip_html(html):
-    return re.sub(r'<[^>]+>', '', html or '').strip()
-
-def has_watermark_bleed(text):
-    plain = strip_html(text)
-    return bool(WATERMARK_RE.search(plain))
-
-def has_bad_newlines(text):
-    lines = text.split('\n')
-    if len(lines) <= 1:
-        return False
-    for line in lines:
-        s = line.strip()
-        if s and not re.match(r'^<[/a-zA-Z]', s) and not s.endswith('>') and len(s) > 3:
-            return True
-    return False
-
-def looks_truncated(text):
-    """Text appears cut off at the end."""
-    if not text:
-        return False
-    plain = strip_html(text).strip()
-    plain_clean = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', plain).strip()
-    if not plain_clean:
-        plain_clean = plain
-    last = plain_clean[-1] if plain_clean else ''
-    return last.isalpha() or last in ',;:-('
-
-def looks_missing_beginning(text):
-    """Text starts mid-sentence (lowercase, or starts with punctuation)."""
-    if not text:
-        return False
-    plain = strip_html(text).strip()
-    if not plain:
-        return False
-    first_char = plain[0]
-    # Starts with lowercase letter (unlikely to be intentional)
-    if first_char.islower():
-        return True
-    # Starts with a bullet/list item that makes no sense
-    if re.match(r'^(et|ou|de|du|des|les|la|le|un|une|à|au|aux|mais|car|si|que)\s', plain, re.IGNORECASE):
-        return True
-    return False
-
-def is_garbled_page_layout(text):
-    """Detects when text is broken into single-letter paragraphs (PDF artifact)."""
-    # Pattern: multiple single-letter <p> tags = garbled page layout
-    single_p = re.findall(r'<p>([a-zA-Z0-9])</p>', text)
-    if len(single_p) >= 5:
-        return True
-    return False
-
-def get_field(data, path):
-    parts = path.split('.')
-    cur = data
-    for p in parts:
-        if isinstance(cur, dict):
-            cur = cur.get(p)
-        else:
-            return None
-        if cur is None:
-            return None
-    return cur
-
-def pdf_search(keyword_text, context=500):
-    """Search PDF text after the given keyword."""
-    plain = strip_html(keyword_text)
-    plain = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', plain).strip()
-    if len(plain) < 15:
-        return None
-    for suffix_len in [40, 30, 20, 15]:
-        suffix = re.sub(r'\s+', ' ', plain[-suffix_len:]).strip()
-        if len(suffix) < 10:
-            continue
-        idx = pdf_text.find(suffix)
-        if idx != -1:
-            snippet = pdf_text[idx:min(len(pdf_text), idx + len(suffix) + context)]
-            snippet = re.sub(r'\n+', ' ', snippet)
-            snippet = re.sub(r'\s{3,}', '  ', snippet)
-            return snippet[:600]
-    return None
-
-def pdf_search_forward(keyword_text, context=500):
-    """Search PDF text BEFORE the start of the given text (find what precedes it)."""
-    plain = strip_html(keyword_text)
-    plain = re.sub(r'\s+', ' ', plain[:60]).strip()
-    if len(plain) < 15:
-        return None
-    # Search for the prefix
-    for prefix_len in [50, 40, 30, 20]:
-        prefix = re.sub(r'\s+', ' ', plain[:prefix_len]).strip()
-        if len(prefix) < 10:
-            continue
-        idx = pdf_text.find(prefix)
-        if idx != -1:
-            # Get text before this position
-            start = max(0, idx - context)
-            snippet = pdf_text[start:idx + len(prefix)]
-            snippet = re.sub(r'\n+', ' ', snippet)
-            snippet = re.sub(r'\s{3,}', '  ', snippet)
-            return snippet[-400:]
-    return None
-
-issues = []
-all_files = sorted(PACKS.rglob("*.json"))
-print(f"Scanning {len(all_files)} files...", flush=True)
-
-HTML_FIELDS = [
-    "system.description",
-    "system.effects",
-    "system.examples",
-    "system.components",
-    "system.notes",
-    "system.techniques.technique1.technique",
-    "system.techniques.technique2.technique",
-    "system.techniques.technique3.technique",
-]
-PLAIN_FIELDS = ["system.style"]
-
-for jf in sorted(all_files):
-    rel = str(jf.relative_to(PACKS))
-    try:
-        data = json.loads(jf.read_text(encoding="utf-8"))
-    except Exception as e:
-        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
-                       "item_name": "?", "current_text": str(e)})
-        continue
-
-    name = data.get("name", "?")
-
-    def add_issue(field, issue_type, **kwargs):
-        issues.append({"file": rel, "field": field, "issue": issue_type,
-                       "item_name": name, **kwargs})
-
-    for field in HTML_FIELDS + PLAIN_FIELDS:
-        val = get_field(data, field)
-        if not val or not isinstance(val, str) or not val.strip():
-            continue
-        plain = strip_html(val).strip()
-
-        # --- Garbled page layout ---
-        if is_garbled_page_layout(val):
-            add_issue(field, "garbled_page_layout",
-                      current_text=val[:400],
-                      note="Text broken into single-character <p> tags — PDF layout artifact")
-            continue  # other checks not useful
-
-        # --- Watermark bleeding ---
-        if has_watermark_bleed(val):
-            pdf_ctx = pdf_search(val)
-            add_issue(field, "bleeding_watermark",
-                      current_text=val[:400],
-                      plain_text=plain[:300],
-                      pdf_context=pdf_ctx)
-
-        # --- Missing beginning ---
-        if looks_missing_beginning(val):
-            pdf_ctx = pdf_search_forward(val)
-            add_issue(field, "missing_beginning",
-                      current_start=plain[:150],
-                      pdf_context_before=pdf_ctx)
-
-        # --- Truncation ---
-        if looks_truncated(val):
-            is_ingredient = 'cde-ingredients' in rel
-            if is_ingredient and len(plain) < 30:
-                add_issue(field, "truncated_or_short",
-                          current_text=plain,
-                          note="May be legitimate (ingredient quantity/name)")
-            else:
-                pdf_ctx = pdf_search(val)
-                add_issue(field, "truncated",
-                          current_end=plain[-120:],
-                          current_preview=plain[:200],
-                          pdf_context=pdf_ctx)
-
-        # --- Unwanted newlines ---
-        if has_bad_newlines(val):
-            add_issue(field, "unwanted_newlines",
-                      current_text=val[:400],
-                      plain_text=plain[:300])
-
-    # --- Technique-level checks ---
-    for tkey in ['technique1', 'technique2', 'technique3']:
-        tech = get_field(data, f"system.techniques.{tkey}")
-        if not tech:
-            continue
-        t_text = tech.get("technique", "")
-        if not t_text:
-            continue
-        plain_t = strip_html(t_text)
-        activation_count = plain_t.count("Activation :")
-        if activation_count > 1:
-            add_issue(f"system.techniques.{tkey}.technique",
-                      "bleeding_multiple_techniques",
-                      activation_count=activation_count,
-                      current_text=t_text[:500],
-                      note=f"{activation_count} 'Activation :' markers — multiple techniques merged")
-        if ("Style" in plain_t or "Orientation :" in plain_t) and len(plain_t) > 300:
-            add_issue(f"system.techniques.{tkey}.technique",
-                      "bleeding_style_or_orientation",
-                      current_text=t_text[:500],
-                      note="Contains 'Style' or 'Orientation' markers — extra text from page layout")
-
-print(f"Found {len(issues)} issues.", flush=True)
-
-# Deduplicate (same file+field+issue_type)
-seen = set()
-deduped = []
-for iss in issues:
-    key = (iss['file'], iss['field'], iss['issue'])
-    if key not in seen:
-        seen.add(key)
-        deduped.append(iss)
-issues = deduped
-print(f"After dedup: {len(issues)} issues.", flush=True)
-
-out_json = BASE / "compendium-issues.json"
-out_txt = BASE / "compendium-issues.txt"
-
-with open(out_json, 'w', encoding='utf-8') as f:
-    json.dump(issues, f, ensure_ascii=False, indent=2)
-
-by_type = defaultdict(list)
-by_file = defaultdict(list)
-for iss in issues:
-    by_type[iss['issue']].append(iss)
-    by_file[iss['file']].append(iss)
-
-PRIORITY_ORDER = [
-    'garbled_page_layout',
-    'missing_beginning',
-    'bleeding_watermark',
-    'bleeding_multiple_techniques',
-    'bleeding_style_or_orientation',
-    'truncated',
-    'unwanted_newlines',
-    'truncated_or_short',
-]
-
-with open(out_txt, 'w', encoding='utf-8') as f:
-    w = f.write
-    w("=" * 80 + "\n")
-    w("COMPENDIUM TEXT QUALITY REPORT\n")
-    w("Les Chroniques de l'Etrange — FoundryVTT\n")
-    w("=" * 80 + "\n\n")
-    w(f"Files scanned: {len(all_files)}\n")
-    w(f"Files with issues: {len(by_file)}\n")
-    w(f"Total issues: {len(issues)}\n\n")
-    
-    w("SUMMARY BY ISSUE TYPE:\n")
-    for itype in PRIORITY_ORDER:
-        ilist = by_type.get(itype, [])
-        if ilist:
-            w(f"  {itype:50s}  {len(ilist):3d}\n")
-    other_types = set(by_type.keys()) - set(PRIORITY_ORDER)
-    for itype in sorted(other_types):
-        ilist = by_type.get(itype, [])
-        if ilist:
-            w(f"  {itype:50s}  {len(ilist):3d}\n")
-    w("\n")
-    
-    w("FILES WITH ISSUES:\n")
-    for fpath in sorted(by_file.keys()):
-        types = sorted(set(i['issue'] for i in by_file[fpath]))
-        w(f"  {fpath}\n    [{', '.join(types)}]\n")
-    w("\n")
-    
-    w("=" * 80 + "\n")
-    w("DETAILED ISSUES (by priority)\n")
-    w("=" * 80 + "\n")
-    
-    for itype in PRIORITY_ORDER + sorted(set(by_type.keys()) - set(PRIORITY_ORDER)):
-        ilist = by_type.get(itype, [])
-        if not ilist:
-            continue
-        w(f"\n{'─'*80}\n")
-        w(f"ISSUE TYPE: {itype}  ({len(ilist)} occurrences)\n")
-        w(f"{'─'*80}\n")
-        for iss in ilist:
-            w(f"\n  File:  {iss['file']}\n")
-            w(f"  Item:  {iss.get('item_name','?')}\n")
-            w(f"  Field: {iss['field']}\n")
-            if iss.get('note'):
-                w(f"  Note:  {iss['note']}\n")
-            if iss.get('current_start'):
-                w(f"  Starts: {iss['current_start'][:150]}\n")
-            if iss.get('current_end'):
-                w(f"  Ends:   ...{iss['current_end']}\n")
-            if iss.get('current_preview'):
-                w(f"  Text:   {iss['current_preview'][:200]}\n")
-            if iss.get('current_text'):
-                ct = iss['current_text']
-                w(f"  Text:   {ct[:300]}\n")
-            if iss.get('plain_text'):
-                w(f"  Plain:  {iss['plain_text'][:200]}\n")
-            if iss.get('pdf_context'):
-                w(f"  PDF>>:  {iss['pdf_context'][:400]}\n")
-            if iss.get('pdf_context_before'):
-                w(f"  <<PDF:  {iss['pdf_context_before'][:400]}\n")
-
-print(f"Written: {out_json}\n        {out_txt}", flush=True)
@@ -1,306 +0,0 @@
-#!/usr/bin/env python3
-"""Final comprehensive analysis — clean version with bug fixes."""
-
-import json, re
-from pathlib import Path
-from collections import defaultdict
-
-# Root of the project checkout.
-# NOTE(review): hard-coded absolute path — the script only runs on a machine
-# where this directory exists.
-BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
-# Directory that is searched recursively for the *.json files to analyse.
-PACKS = BASE / "packs-src"
-# Entire content of regles.txt, read once when the script starts.
-# pdf_search_after() and pdf_search_before() run substring searches in it.
-# Presumably a text export of the rulebook PDF — confirm.
-pdf_text = (BASE / "regles.txt").read_text(encoding="utf-8")
-
-# Case-insensitive pattern used by has_watermark_bleed(). The middle
-# alternatives match pieces such as "les chroniqu", "de l<any char>etrange"
-# and "chr<any char>niqu"; the report text calls these hits
-# '"Les Chroniques de l'Étrange" watermark fragments'.
-# NOTE(review): the first alternative (letters s-c-r-a-l-e-l with optional
-# whitespace between them) and the last one ("hr ng") look like other
-# extraction fragments — confirm what they are meant to catch.
-WATERMARK_RE = re.compile(
-    r's\s*c\s*r\s*a\s*l\s*e\s*l|les\s+chroniqu|de\s+l.etrange|chr.niqu|hr\s+ng',
-    re.IGNORECASE)
-
-def strip_html(html):
-    """Return *html* without its ``<...>`` tags and without outer whitespace.
-
-    Falsy input (``None`` or ``""``) gives an empty string. Only tags are
-    removed; everything else, including a lone ``<`` that is never closed by
-    ``>``, is kept as it is.
-    """
-    source = html if html else ''
-    without_tags = re.sub(r'<[^>]+>', '', source)
-    return without_tags.strip()
-
-def has_watermark_bleed(text):
-    """Tell whether the tag-free form of *text* matches ``WATERMARK_RE``.
-
-    Returns ``True`` when the pattern is found anywhere in the text left after
-    ``strip_html``, ``False`` otherwise.
-    """
-    return WATERMARK_RE.search(strip_html(text)) is not None
-
-def has_bad_newlines(text):
-    """Tell whether *text* has a line break in the middle of running text.
-
-    A text without any newline is never flagged. Otherwise every line is
-    trimmed and the text is flagged as soon as one line is longer than three
-    characters, does not begin with an opening or closing tag and does not
-    end with ``>``.
-    """
-    if '\n' not in text:
-        return False
-    for fragment in map(str.strip, text.split('\n')):
-        if len(fragment) <= 3:
-            # Empty and very short lines are ignored.
-            continue
-        if fragment.endswith('>') or re.match(r'^<[/a-zA-Z]', fragment):
-            # Lines that start or end at a tag are treated as markup layout.
-            continue
-        return True
-    return False
-
-def looks_truncated(text):
-    """Tell whether *text* seems to stop before its sentence is finished.
-
-    Tags are stripped first, then a trailing watermark-like run (whitespace
-    separated one- and two-letter lowercase groups and whatever follows them)
-    is cut off. The text counts as truncated when the last remaining
-    character is a letter or one of ``, ; : - (``. Empty input, or input that
-    is empty after cleaning, is not flagged.
-    """
-    if not text:
-        return False
-    visible = strip_html(text).strip()
-    if not visible:
-        return False
-    # Drop watermark residue at the end so it does not decide the verdict.
-    cleaned = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', visible).strip()
-    if not cleaned:
-        # Nothing is left once the residue is gone: there is no ending to judge.
-        return False
-    final_char = cleaned[-1]
-    if final_char.isalpha():
-        return True
-    return final_char in ',;:-('
-
-def looks_missing_beginning(text):
-    """Tell whether *text* seems to start in the middle of a sentence.
-
-    The only criterion is that the first character of the tag-free, trimmed
-    text is a lowercase letter; no word list is consulted. Empty input is not
-    flagged.
-    """
-    if not text:
-        return False
-    visible = strip_html(text).strip()
-    return bool(visible) and visible[0].islower()
-
-def is_garbled_page_layout(text):
-    """Tell whether *text* holds at least five one-character paragraphs.
-
-    Only bare ``<p>X</p>`` elements whose content is a single ASCII letter or
-    digit are counted; a ``<p>`` tag carrying attributes does not match.
-    """
-    lone_char_paragraphs = sum(
-        1 for _ in re.finditer(r'<p>([a-zA-Z0-9])</p>', text))
-    return lone_char_paragraphs >= 5
-
-def get_field(data, path):
-    """Follow the dot-separated *path* through nested dicts inside *data*.
-
-    Returns the value found at the end of the path. Returns ``None`` as soon
-    as a step hits something that is not a dict, a missing key, or a stored
-    ``None``. Other falsy values (``0``, ``""``, ``{}``) are returned as is.
-    """
-    node = data
-    for key in path.split('.'):
-        if not isinstance(node, dict):
-            return None
-        node = node.get(key)
-        if node is None:
-            return None
-    return node
-
-def pdf_search_after(keyword_text, context=500):
-    """Find the end of *keyword_text* in ``pdf_text`` and return what follows it.
-
-    Tags are stripped and a trailing watermark-like run is cut off. The last
-    40, 30, 20 and then 15 characters of the result (whitespace collapsed to
-    single spaces) are tried in turn with a plain substring search.
-
-    Returns the matched tail plus up to *context* following characters, with
-    newline runs turned into one space and runs of three or more whitespace
-    characters reduced to two spaces, capped at 600 characters. Returns
-    ``None`` when the cleaned text is shorter than 15 characters or when no
-    candidate is found.
-    """
-    cleaned = strip_html(keyword_text)
-    cleaned = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', cleaned).strip()
-    if len(cleaned) < 15:
-        return None
-    for tail_len in (40, 30, 20, 15):
-        needle = re.sub(r'\s+', ' ', cleaned[-tail_len:]).strip()
-        if len(needle) < 10:
-            continue
-        pos = pdf_text.find(needle)
-        if pos == -1:
-            continue
-        # Slicing stops at the end of the text by itself.
-        window = pdf_text[pos:pos + len(needle) + context]
-        window = re.sub(r'\s{3,}', '  ', re.sub(r'\n+', ' ', window))
-        return window[:600]
-    return None
-
-def pdf_search_before(keyword_text, context=400):
-    """Find the start of *keyword_text* in ``pdf_text`` and return what precedes it.
-
-    Tags are stripped and the first 60 characters are whitespace-collapsed.
-    Prefixes of 50, 40, 30 and then 20 characters of that head are tried in
-    turn with a plain substring search.
-
-    Returns up to *context* characters before the match followed by the
-    matched prefix, with newline runs turned into one space and runs of three
-    or more whitespace characters reduced to two spaces, keeping at most the
-    last 400 characters. Returns ``None`` when the head is shorter than 15
-    characters or when no candidate is found.
-    """
-    text_only = strip_html(keyword_text)
-    head = re.sub(r'\s+', ' ', text_only[:60]).strip()
-    if len(head) < 15:
-        return None
-    for head_len in (50, 40, 30, 20):
-        needle = re.sub(r'\s+', ' ', head[:head_len]).strip()
-        if len(needle) < 10:
-            continue
-        pos = pdf_text.find(needle)
-        if pos == -1:
-            continue
-        window = pdf_text[max(0, pos - context):pos + len(needle)]
-        window = re.sub(r'\s{3,}', '  ', re.sub(r'\n+', ' ', window))
-        return window[-400:]
-    return None
-
-# Findings collected by the scan; each entry is a dict.
-issues = []
-# Every *.json file below PACKS, in sorted order.
-all_files = list(PACKS.rglob("*.json"))
-all_files.sort()
-print(f"Scanning {len(all_files)} files...", flush=True)
-
-# Dotted paths of the item fields that are examined. The technique entries
-# cover the slots technique1 to technique3.
-HTML_FIELDS = [
-    "system.description",
-    "system.effects",
-    "system.examples",
-    "system.components",
-    "system.notes",
-    *(f"system.techniques.technique{slot}.technique" for slot in (1, 2, 3)),
-]
-PLAIN_FIELDS = ["system.style"]
-
-# Scan every pack file and record one entry in `issues` per finding.
-# all_files is sorted when it is built, so it is not sorted a second time.
-for jf in all_files:
-    rel = str(jf.relative_to(PACKS))
-    try:
-        data = json.loads(jf.read_text(encoding="utf-8"))
-    except Exception as e:
-        # Best effort: a file that cannot be read or parsed becomes a report
-        # entry instead of stopping the whole scan.
-        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
-                       "item_name": "?", "current_text": str(e)})
-        continue
-
-    if not isinstance(data, dict):
-        # Valid JSON whose top-level value is a list, string, number...:
-        # there are no fields to look up, so report it rather than crash
-        # on data.get() below.
-        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
-                       "item_name": "?",
-                       "current_text": f"top-level JSON value is "
-                                       f"{type(data).__name__}, expected an object"})
-        continue
-
-    name = data.get("name", "?")
-
-    def add(field, issue_type, **kwargs):
-        """Append one finding for the file and item currently being scanned."""
-        issues.append({"file": rel, "field": field, "issue": issue_type,
-                       "item_name": name, **kwargs})
-
-    for field in HTML_FIELDS + PLAIN_FIELDS:
-        val = get_field(data, field)
-        if not val or not isinstance(val, str) or not val.strip():
-            continue
-        plain = strip_html(val).strip()
-
-        # Garbled page layout: the text is unusable, so the other checks
-        # are skipped for this field.
-        if is_garbled_page_layout(val):
-            add(field, "garbled_page_layout",
-                current_text=val[:400],
-                note="Text broken into single-character <p> tags — PDF layout artifact")
-            continue
-
-        # Watermark bleeding
-        if has_watermark_bleed(val):
-            add(field, "bleeding_watermark",
-                current_text=val[:400],
-                plain_text=plain[:300],
-                pdf_context=pdf_search_after(val))
-
-        # Missing beginning (visible text starts with a lowercase letter)
-        if looks_missing_beginning(val):
-            add(field, "missing_beginning",
-                current_start=plain[:150],
-                pdf_context_before=pdf_search_before(val))
-
-        # Truncation
-        if looks_truncated(val):
-            if 'cde-ingredients' in rel:
-                # Ingredient files get their own issue type, and only when
-                # the visible text is shorter than 30 characters.
-                if plain and len(plain) < 30:
-                    add(field, "empty_or_short_ingredient",
-                        current_text=plain,
-                        note="Short ingredient description — check if intentional")
-            else:
-                add(field, "truncated",
-                    current_end=plain[-120:],
-                    current_preview=plain[:200],
-                    pdf_context=pdf_search_after(val))
-
-        # Unwanted newlines
-        if has_bad_newlines(val):
-            add(field, "unwanted_newlines",
-                current_text=val[:400],
-                plain_text=plain[:200])
-
-    # Technique cross-checks
-    for tkey in ('technique1', 'technique2', 'technique3'):
-        tech_field = f"system.techniques.{tkey}.technique"
-        # get_field() returns None when the technique slot is missing or is
-        # not a dict, so a malformed slot is skipped instead of raising.
-        t_text = get_field(data, tech_field)
-        if not t_text or not isinstance(t_text, str):
-            continue
-        plain_t = strip_html(t_text)
-        activation_count = plain_t.count("Activation :")
-        if activation_count > 1:
-            add(tech_field,
-                "bleeding_multiple_techniques",
-                activation_count=activation_count,
-                current_text=t_text[:500],
-                note=f"{activation_count} 'Activation :' markers — multiple techniques merged")
-        # Each (file, technique slot) pair is visited exactly once and this
-        # is the only place that records this issue type, so no duplicate
-        # check against the already collected issues is needed.
-        if ("Style" in plain_t or "Orientation :" in plain_t) and len(plain_t) > 300:
-            add(tech_field,
-                "bleeding_style_or_orientation",
-                current_text=t_text[:500],
-                note="Contains 'Style' or 'Orientation' — extra text from PDF page layout")
-print(f"Found {len(issues)} issues.", flush=True)
-
-# Destinations of the machine-readable and the human-readable report.
-out_json = BASE / "compendium-issues.json"
-out_txt = BASE / "compendium-issues.txt"
-
-# Dump every finding as indented JSON, keeping non-ASCII characters readable.
-out_json.write_text(json.dumps(issues, ensure_ascii=False, indent=2),
-                    encoding='utf-8')
-
-# Index the findings twice: by issue type and by file.
-by_type = defaultdict(list)
-by_file = defaultdict(list)
-for record in issues:
-    for index, key in ((by_type, 'issue'), (by_file, 'file')):
-        index[record[key]].append(record)
-
-# Order in which issue types are listed in the text report.
-PRIORITY_ORDER = [
-    'garbled_page_layout',
-    'missing_beginning',
-    'bleeding_watermark',
-    'bleeding_multiple_techniques',
-    'bleeding_style_or_orientation',
-    'truncated',
-    'unwanted_newlines',
-    'empty_or_short_ingredient',
-]
-
-# One-line explanation per issue type, shown in the summary table.
-# Built once instead of on every loop iteration.
-issue_descriptions = {
-    'garbled_page_layout': 'text broken into single-char HTML tags (PDF artifact)',
-    'missing_beginning': 'field starts mid-word (lowercase start = truncated at front)',
-    'bleeding_watermark': '"Les Chroniques de l\'Étrange" watermark fragments in text',
-    'bleeding_multiple_techniques': 'multiple techniques merged into one field',
-    'bleeding_style_or_orientation': 'Style/Orientation text bled into technique field',
-    'truncated': 'field ends mid-sentence without proper punctuation',
-    'unwanted_newlines': 'raw newlines inside HTML string values',
-    'empty_or_short_ingredient': 'ingredient has empty or very short description',
-    'json_error': 'file could not be read or parsed as JSON',
-}
-
-# Known types first, in priority order, then every other recorded type
-# (for example json_error) alphabetically. Without the second part such
-# findings are counted in the totals but never summarised or detailed.
-report_order = list(PRIORITY_ORDER) + sorted(set(by_type) - set(PRIORITY_ORDER))
-
-# (key in the finding, printed label, maximum characters or None for no cut),
-# in the order the lines appear under each finding.
-detail_rows = [
-    ('note', 'Note:  ', None),
-    ('current_start', 'Starts: ', 160),
-    ('current_end', 'Ends:   ...', None),
-    ('current_preview', 'Text:   ', 200),
-    ('current_text', 'Text:   ', 300),
-    ('plain_text', 'Plain:  ', 200),
-    ('pdf_context', 'PDF>>:  ', 400),
-    ('pdf_context_before', '<<PDF:  ', 400),
-]
-
-# Write the human-readable report: header, summary per type, files with
-# issues, then every finding grouped by type.
-with open(out_txt, 'w', encoding='utf-8') as f:
-    w = f.write
-    w("=" * 80 + "\n")
-    w("COMPENDIUM TEXT QUALITY REPORT\n")
-    w("Les Chroniques de l'Etrange — FoundryVTT\n")
-    w("=" * 80 + "\n\n")
-    w(f"Files scanned: {len(all_files)}\n")
-    w(f"Files with issues: {len(by_file)}\n")
-    w(f"Total issues: {len(issues)}\n\n")
-
-    w("SUMMARY BY ISSUE TYPE:\n")
-    for itype in report_order:
-        ilist = by_type.get(itype, [])
-        if ilist:
-            desc = issue_descriptions.get(itype, '(no description)')
-            w(f"  {itype:45s}  {len(ilist):3d}  — {desc}\n")
-
-    w("\nFILES WITH ISSUES:\n")
-    for fpath in sorted(by_file):
-        types = sorted({i['issue'] for i in by_file[fpath]})
-        w(f"  {fpath}\n    [{', '.join(types)}]\n")
-
-    w("\n")
-    w("=" * 80 + "\n")
-    w("DETAILED ISSUES (by priority)\n")
-    w("=" * 80 + "\n")
-
-    for itype in report_order:
-        ilist = by_type.get(itype, [])
-        if not ilist:
-            continue
-        w(f"\n{'─'*80}\n")
-        w(f"ISSUE TYPE: {itype}  ({len(ilist)} occurrences)\n")
-        w(f"{'─'*80}\n")
-        for iss in ilist:
-            w(f"\n  File:  {iss['file']}\n")
-            w(f"  Item:  {iss.get('item_name','?')}\n")
-            w(f"  Field: {iss['field']}\n")
-            for key, label, limit in detail_rows:
-                value = iss.get(key)
-                if value:
-                    # value[:None] keeps the whole string.
-                    w(f"  {label}{value[:limit]}\n")
-
-print(f"Written: {out_json}\n        {out_txt}", flush=True)