# fvtt-chroniques-de-l-etrange/analyze_final.py

#!/usr/bin/env python3
"""Final comprehensive analysis — clean version with bug fixes."""

import json, re
from pathlib import Path
from collections import defaultdict

# Root of the system checkout (hard-coded absolute path); every input and
# output file of this script lives under it.
BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
# Directory that is searched recursively for the *.json files to analyse.
PACKS = BASE / "packs-src"
# Whole content of regles.txt, read once when the script starts; the
# pdf_search_after / pdf_search_before helpers look substrings up in it.
pdf_text = (BASE / "regles.txt").read_text(encoding="utf-8")

# Case-insensitive pattern for page-watermark fragments that leaked into field
# text. The alternatives are matched anywhere in the tag-stripped text
# (see has_watermark_bleed).
WATERMARK_RE = re.compile(
    r's\s*c\s*r\s*a\s*l\s*e\s*l|les\s+chroniqu|de\s+l.etrange|chr.niqu|hr\s+ng',
    re.IGNORECASE)

def strip_html(html):
    """Return *html* with every ``<...>`` tag removed and outer whitespace trimmed.

    Falsy input (``None``, ``""``) yields an empty string. Only tags are
    removed; character entities are left untouched.
    """
    if not html:
        return ''
    without_tags = re.sub(r'<[^>]+>', '', html)
    return without_tags.strip()

def has_watermark_bleed(text):
    """Return True when the tag-stripped *text* matches ``WATERMARK_RE``."""
    return WATERMARK_RE.search(strip_html(text)) is not None

def has_bad_newlines(text):
    """Return True when *text* contains a line break inside running text.

    A value without any ``\\n`` is never flagged. Otherwise each line is
    stripped and flagged when it is longer than three characters, does not
    begin like an HTML tag (``<`` followed by ``/`` or a letter) and does not
    end with ``>``.
    """
    segments = text.split('\n')
    if len(segments) < 2:
        return False
    stripped_lines = (segment.strip() for segment in segments)
    return any(
        len(line) > 3
        and not line.endswith('>')
        and re.match(r'^<[/a-zA-Z]', line) is None
        for line in stripped_lines
    )

def looks_truncated(text):
    """Return True when the visible text appears to be cut off at the end.

    Empty input, or input with no visible text once tags are stripped, is
    never flagged. The text is flagged when its last character is a letter
    or one of ``, ; : - (``.
    """
    visible = strip_html(text).strip() if text else ''
    if not visible:
        return False
    # Cut everything from the first run of whitespace-separated one/two-letter
    # lowercase tokens onwards (watermark residue) before looking at the
    # last character.
    core = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', visible).strip()
    if not core:
        # Nothing but residue was left, so there is no real text to judge.
        return False
    final_char = core[-1]
    return final_char in ',;:-(' or final_char.isalpha()

def looks_missing_beginning(text):
    """Return True when the visible text starts with a lowercase letter.

    Empty input, or input with no visible text once tags are stripped, is
    never flagged. Only the very first visible character is inspected;
    digits, punctuation and uppercase letters are not flagged.
    """
    visible = strip_html(text).strip() if text else ''
    if not visible:
        return False
    return visible[0].islower()

def is_garbled_page_layout(text):
    """Return True when *text* holds five or more single-character paragraphs.

    A single-character paragraph is exactly ``<p>X</p>`` where ``X`` is one
    ASCII letter or digit.
    """
    single_char_paragraphs = sum(
        1 for _ in re.finditer(r'<p>([a-zA-Z0-9])</p>', text)
    )
    return single_char_paragraphs >= 5

def get_field(data, path):
    """Resolve a dotted *path* such as ``"system.notes"`` inside nested dicts.

    Returns the value found at the end of the path, or ``None`` as soon as a
    key is missing, a value is ``None``, or a non-dict is reached before the
    path is exhausted.
    """
    node = data
    for key in path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
        if node is None:
            break
    return node

def pdf_search_after(keyword_text, context=500):
    """Find the end of *keyword_text* in ``pdf_text`` and return what follows.

    The tag-stripped text, minus trailing watermark residue, must be at least
    15 characters long. Its last 40, 30, 20 and then 15 characters
    (whitespace runs collapsed to one space) are searched for in turn; a
    candidate shorter than 10 characters is skipped. On the first hit the
    result is the matched text plus up to *context* following characters,
    with newlines and long whitespace runs collapsed, capped at 600
    characters. Returns ``None`` when nothing matches.
    """
    needle_source = re.sub(
        r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', strip_html(keyword_text)
    ).strip()
    if len(needle_source) < 15:
        return None
    for tail_len in (40, 30, 20, 15):
        tail = re.sub(r'\s+', ' ', needle_source[-tail_len:]).strip()
        if len(tail) < 10:
            continue
        pos = pdf_text.find(tail)
        if pos == -1:
            continue
        # Slicing past the end of a string is clamped automatically.
        excerpt = pdf_text[pos:pos + len(tail) + context]
        excerpt = re.sub(r'\s{3,}', '  ', re.sub(r'\n+', ' ', excerpt))
        return excerpt[:600]
    return None

def pdf_search_before(keyword_text, context=400):
    """Find the start of *keyword_text* in ``pdf_text`` and return what precedes it.

    The first 60 tag-stripped characters (whitespace runs collapsed to one
    space) must be at least 15 characters long. Their first 50, 40, 30 and
    then 20 characters are searched for in turn; a candidate shorter than 10
    characters is skipped. On the first hit the result is up to *context*
    preceding characters plus the matched text, with newlines and long
    whitespace runs collapsed, keeping only the last 400 characters.
    Returns ``None`` when nothing matches.
    """
    opening = re.sub(r'\s+', ' ', strip_html(keyword_text)[:60]).strip()
    if len(opening) < 15:
        return None
    for head_len in (50, 40, 30, 20):
        head = re.sub(r'\s+', ' ', opening[:head_len]).strip()
        if len(head) < 10:
            continue
        pos = pdf_text.find(head)
        if pos == -1:
            continue
        # Clamp at 0 so a hit near the start does not wrap to a negative index.
        window_start = max(0, pos - context)
        excerpt = pdf_text[window_start:pos + len(head)]
        excerpt = re.sub(r'\s{3,}', '  ', re.sub(r'\n+', ' ', excerpt))
        return excerpt[-400:]
    return None

# Every finding of the scan is appended here as a dict carrying at least the
# "file", "field", "issue" and "item_name" keys.
issues = []
# All *.json files below PACKS (recursive), in sorted path order.
all_files = sorted(PACKS.rglob("*.json"))
print(f"Scanning {len(all_files)} files...", flush=True)

# Dotted paths, resolved with get_field, of the string fields to check.
HTML_FIELDS = [
    "system.description",
    "system.effects",
    "system.examples",
    "system.components",
    "system.notes",
    "system.techniques.technique1.technique",
    "system.techniques.technique2.technique",
    "system.techniques.technique3.technique",
]
# Additional field paths; the scan loop runs the same checks on
# HTML_FIELDS + PLAIN_FIELDS.
PLAIN_FIELDS = ["system.style"]

# Scan phase: load every pack source file, run the text-quality heuristics on
# each configured field, and append one dict per finding to ``issues``.
# ``all_files`` is already sorted where it is built, so it is not re-sorted.
for jf in all_files:
    rel = str(jf.relative_to(PACKS))
    try:
        data = json.loads(jf.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # OSError: the file could not be read. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
                       "item_name": "?", "current_text": str(e)})
        continue

    if not isinstance(data, dict):
        # Valid JSON whose top-level value is not an object has no fields to
        # inspect; report it rather than crash on ``data.get`` below.
        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
                       "item_name": "?",
                       "current_text": "top-level JSON value is "
                                       f"{type(data).__name__}, expected an object"})
        continue

    name = data.get("name", "?")

    def add(field, issue_type, **kwargs):
        """Record one finding for the file currently being scanned."""
        issues.append({"file": rel, "field": field, "issue": issue_type,
                       "item_name": name, **kwargs})

    for field in HTML_FIELDS + PLAIN_FIELDS:
        val = get_field(data, field)
        if not val or not isinstance(val, str) or not val.strip():
            continue
        plain = strip_html(val).strip()

        # Garbled page layout: the field is unusable, so skip the other checks.
        if is_garbled_page_layout(val):
            add(field, "garbled_page_layout",
                current_text=val[:400],
                note="Text broken into single-character <p> tags — PDF layout artifact")
            continue

        # Watermark bleeding
        if has_watermark_bleed(val):
            add(field, "bleeding_watermark",
                current_text=val[:400],
                plain_text=plain[:300],
                pdf_context=pdf_search_after(val))

        # Missing beginning (visible text starts with a lowercase letter)
        if looks_missing_beginning(val):
            add(field, "missing_beginning",
                current_start=plain[:150],
                pdf_context_before=pdf_search_before(val))

        # Truncation
        if looks_truncated(val):
            is_ingredient = 'cde-ingredients' in rel
            if is_ingredient:
                # Ingredient descriptions are often short on purpose: only
                # report the short ones, under a dedicated issue type.
                if plain and len(plain) < 30:
                    add(field, "empty_or_short_ingredient",
                        current_text=plain,
                        note="Short ingredient description — check if intentional")
            else:
                add(field, "truncated",
                    current_end=plain[-120:],
                    current_preview=plain[:200],
                    pdf_context=pdf_search_after(val))

        # Unwanted newlines
        if has_bad_newlines(val):
            add(field, "unwanted_newlines",
                current_text=val[:400],
                plain_text=plain[:200])

    # Technique cross-checks
    for tkey in ('technique1', 'technique2', 'technique3'):
        tech_field = f"system.techniques.{tkey}.technique"
        # get_field returns None for a missing or non-dict technique entry, so
        # malformed data is skipped instead of raising.
        t_text = get_field(data, tech_field)
        if not t_text or not isinstance(t_text, str):
            continue
        plain_t = strip_html(t_text)
        activation_count = plain_t.count("Activation :")
        if activation_count > 1:
            add(tech_field,
                "bleeding_multiple_techniques",
                activation_count=activation_count,
                current_text=t_text[:500],
                note=f"{activation_count} 'Activation :' markers — multiple techniques merged")
        # Each (file, technique) pair is visited exactly once, so no
        # duplicate check against ``issues`` is needed before recording.
        if ("Style" in plain_t or "Orientation :" in plain_t) and len(plain_t) > 300:
            add(tech_field,
                "bleeding_style_or_orientation",
                current_text=t_text[:500],
                note="Contains 'Style' or 'Orientation' — extra text from PDF page layout")

print(f"Found {len(issues)} issues.", flush=True)

# Report phase: dump the raw findings as JSON, then write a human-readable
# text report grouped by issue type.
out_json = BASE / "compendium-issues.json"
out_txt = BASE / "compendium-issues.txt"

with open(out_json, 'w', encoding='utf-8') as f:
    json.dump(issues, f, ensure_ascii=False, indent=2)

by_type = defaultdict(list)
by_file = defaultdict(list)
for iss in issues:
    by_type[iss['issue']].append(iss)
    by_file[iss['file']].append(iss)

# Order in which issue types appear in the report. 'json_error' is listed so
# that files that failed to load show up in the summary and details instead of
# only being counted in the totals.
PRIORITY_ORDER = [
    'json_error',
    'garbled_page_layout',
    'missing_beginning',
    'bleeding_watermark',
    'bleeding_multiple_techniques',
    'bleeding_style_or_orientation',
    'truncated',
    'unwanted_newlines',
    'empty_or_short_ingredient',
]

# One-line explanation printed next to each issue type in the summary.
ISSUE_DESCRIPTIONS = {
    'json_error': 'file could not be loaded as JSON',
    'garbled_page_layout': 'text broken into single-char HTML tags (PDF artifact)',
    'missing_beginning': 'field starts mid-word (lowercase start = truncated at front)',
    'bleeding_watermark': '"Les Chroniques de l\'Étrange" watermark fragments in text',
    'bleeding_multiple_techniques': 'multiple techniques merged into one field',
    'bleeding_style_or_orientation': 'Style/Orientation text bled into technique field',
    'truncated': 'field ends mid-sentence without proper punctuation',
    'unwanted_newlines': 'raw newlines inside HTML string values',
    'empty_or_short_ingredient': 'ingredient has empty or very short description',
}

# Any issue type that was recorded but is not listed above is appended after
# the prioritised ones, so no finding is silently left out of the report.
report_order = PRIORITY_ORDER + sorted(
    itype for itype in by_type if itype not in PRIORITY_ORDER
)

with open(out_txt, 'w', encoding='utf-8') as f:
    w = f.write
    w("=" * 80 + "\n")
    w("COMPENDIUM TEXT QUALITY REPORT\n")
    w("Les Chroniques de l'Etrange — FoundryVTT\n")
    w("=" * 80 + "\n\n")
    w(f"Files scanned: {len(all_files)}\n")
    w(f"Files with issues: {len(by_file)}\n")
    w(f"Total issues: {len(issues)}\n\n")

    w("SUMMARY BY ISSUE TYPE:\n")
    for itype in report_order:
        ilist = by_type.get(itype, [])
        if ilist:
            desc = ISSUE_DESCRIPTIONS.get(itype, '')
            w(f"  {itype:45s}  {len(ilist):3d}  — {desc}\n")

    w("\nFILES WITH ISSUES:\n")
    for fpath in sorted(by_file):
        types = sorted({i['issue'] for i in by_file[fpath]})
        w(f"  {fpath}\n    [{', '.join(types)}]\n")

    w("\n")
    w("=" * 80 + "\n")
    w("DETAILED ISSUES (by priority)\n")
    w("=" * 80 + "\n")

    for itype in report_order:
        ilist = by_type.get(itype, [])
        if not ilist:
            continue
        w(f"\n{'─'*80}\n")
        w(f"ISSUE TYPE: {itype}  ({len(ilist)} occurrences)\n")
        w(f"{'─'*80}\n")
        for iss in ilist:
            w(f"\n  File:  {iss['file']}\n")
            w(f"  Item:  {iss.get('item_name','?')}\n")
            w(f"  Field: {iss['field']}\n")
            # Optional keys: each is printed only when present and non-empty.
            if iss.get('note'):
                w(f"  Note:  {iss['note']}\n")
            if iss.get('current_start'):
                w(f"  Starts: {iss['current_start'][:160]}\n")
            if iss.get('current_end'):
                w(f"  Ends:   ...{iss['current_end']}\n")
            if iss.get('current_preview'):
                w(f"  Text:   {iss['current_preview'][:200]}\n")
            if iss.get('current_text'):
                ct = iss['current_text']
                w(f"  Text:   {ct[:300]}\n")
            if iss.get('plain_text'):
                w(f"  Plain:  {iss['plain_text'][:200]}\n")
            if iss.get('pdf_context'):
                w(f"  PDF>>:  {iss['pdf_context'][:400]}\n")
            if iss.get('pdf_context_before'):
                w(f"  <<PDF:  {iss['pdf_context_before'][:400]}\n")

print(f"Written: {out_json}\n        {out_txt}", flush=True)