#!/usr/bin/env python3
"""Final comprehensive analysis — clean version with bug fixes.

Scans every ``*.json`` document under ``PACKS`` for text-quality problems in
a fixed list of fields (watermark fragments, truncated or front-truncated
text, stray newlines, single-character ``<p>`` runs, merged techniques) and
writes two reports under ``BASE``: ``compendium-issues.json`` and
``compendium-issues.txt``.  Where possible, the surrounding rule-book text
from ``regles.txt`` is attached to an issue as context.
"""
import json
import re
from collections import defaultdict
from pathlib import Path

BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
PACKS = BASE / "packs-src"

# Rule-book text used for context look-ups.  Filled on first use by
# _load_pdf_text() so that importing this module does not touch the disk.
pdf_text = None

WATERMARK_RE = re.compile(
    r's\s*c\s*r\s*a\s*l\s*e\s*l|les\s+chroniqu|de\s+l.etrange|chr.niqu|hr\s+ng',
    re.IGNORECASE)

# Trailing "x y zz ..." run of one/two-letter tokens, stripped before looking
# at how a text ends or searching for it in the rule book.
_WATERMARK_TAIL_RE = re.compile(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$')
# A paragraph holding exactly one alphanumeric character.
_SINGLE_CHAR_P_RE = re.compile(r'<p>([a-zA-Z0-9])</p>')
# A line that begins with an opening or closing tag.
_TAG_START_RE = re.compile(r'^<[/a-zA-Z]')

HTML_FIELDS = [
    "system.description",
    "system.effects",
    "system.examples",
    "system.components",
    "system.notes",
    "system.techniques.technique1.technique",
    "system.techniques.technique2.technique",
    "system.techniques.technique3.technique",
]
PLAIN_FIELDS = ["system.style"]

PRIORITY_ORDER = [
    'json_error',
    'garbled_page_layout',
    'missing_beginning',
    'bleeding_watermark',
    'bleeding_multiple_techniques',
    'bleeding_style_or_orientation',
    'truncated',
    'unwanted_newlines',
    'empty_or_short_ingredient',
]

ISSUE_DESCRIPTIONS = {
    'json_error': 'file could not be read or parsed as JSON',
    'garbled_page_layout': 'text broken into single-char HTML tags (PDF artifact)',
    'missing_beginning': 'field starts mid-word (lowercase start = truncated at front)',
    'bleeding_watermark': '"Les Chroniques de l\'Étrange" watermark fragments in text',
    'bleeding_multiple_techniques': 'multiple techniques merged into one field',
    'bleeding_style_or_orientation': 'Style/Orientation text bled into technique field',
    'truncated': 'field ends mid-sentence without proper punctuation',
    'unwanted_newlines': 'raw newlines inside HTML string values',
    'empty_or_short_ingredient': 'ingredient has empty or very short description',
}


def _load_pdf_text():
    """Return the rule-book text, reading ``BASE / "regles.txt"`` once."""
    global pdf_text
    if pdf_text is None:
        pdf_text = (BASE / "regles.txt").read_text(encoding="utf-8")
    return pdf_text


def strip_html(html):
    """Return *html* with every ``<...>`` tag removed and ends stripped.

    ``None`` or an empty value yields ``''``.
    """
    return re.sub(r'<[^>]+>', '', html or '').strip()


def has_watermark_bleed(text):
    """Return True if the tag-stripped *text* matches ``WATERMARK_RE``."""
    plain = strip_html(text)
    return bool(WATERMARK_RE.search(plain))


def has_bad_newlines(text):
    """Return True if *text* has a raw newline splitting running text.

    A line counts as "bad" when, after stripping, it is longer than three
    characters, does not begin with a tag and does not end with ``>``.
    Single-line values are never flagged.
    """
    lines = text.split('\n')
    if len(lines) <= 1:
        return False
    for line in lines:
        s = line.strip()
        if len(s) > 3 and not _TAG_START_RE.match(s) and not s.endswith('>'):
            return True
    return False


def looks_truncated(text):
    """Text appears cut off at the end.

    True when the last character of the tag-stripped text (after removing a
    trailing watermark-like run) is a letter or one of ``,;:-(``.
    """
    if not text:
        return False
    plain = strip_html(text).strip()
    if not plain:
        return False
    # Remove watermark garbage from end before checking
    plain_clean = _WATERMARK_TAIL_RE.sub('', plain).strip()
    if not plain_clean:
        return False
    last = plain_clean[-1]
    return last.isalpha() or last in ',;:-('


def looks_missing_beginning(text):
    """Text starts mid-sentence (truly lowercase first char only)."""
    if not text:
        return False
    plain = strip_html(text).strip()
    if not plain:
        return False
    return plain[0].islower()


def is_garbled_page_layout(text):
    """Multiple single-letter <p> tags = garbled PDF artifact.

    True when *text* contains at least five ``<p>X</p>`` paragraphs whose
    content is a single ASCII letter or digit.
    """
    single_p = _SINGLE_CHAR_P_RE.findall(text)
    return len(single_p) >= 5


def get_field(data, path):
    """Return the value at dotted *path* inside nested dicts, else None.

    None is returned as soon as a path component is missing, maps to None,
    or has to be looked up on something that is not a dict.
    """
    cur = data
    for part in path.split('.'):
        if not isinstance(cur, dict):
            return None
        cur = cur.get(part)
        if cur is None:
            return None
    return cur


def pdf_search_after(keyword_text, context=500, source=None):
    """Return PDF text after the given keyword.

    The end of the tag-stripped *keyword_text* (last 40, 30, 20, then 15
    characters) is searched for in the rule-book text.  On the first hit the
    matched text plus up to *context* following characters is returned, with
    newline runs replaced by a space and capped at 600 characters.  Returns
    None when the keyword is shorter than 15 characters or nothing matches.

    *source* overrides the text to search; by default ``regles.txt`` is used.
    """
    plain = strip_html(keyword_text)
    plain = _WATERMARK_TAIL_RE.sub('', plain).strip()
    if len(plain) < 15:
        return None
    haystack = _load_pdf_text() if source is None else source
    for suffix_len in (40, 30, 20, 15):
        suffix = re.sub(r'\s+', ' ', plain[-suffix_len:]).strip()
        if len(suffix) < 10:
            continue
        idx = haystack.find(suffix)
        if idx != -1:
            snippet = haystack[idx:idx + len(suffix) + context]
            snippet = re.sub(r'\n+', ' ', snippet)
            snippet = re.sub(r'\s{3,}', ' ', snippet)
            return snippet[:600]
    return None


def pdf_search_before(keyword_text, context=400, source=None):
    """Return PDF text before the given keyword.

    The start of the tag-stripped *keyword_text* (first 50, 40, 30, then 20
    characters of its first 60) is searched for in the rule-book text.  On
    the first hit up to *context* preceding characters plus the matched text
    are returned, with newline runs replaced by a space and capped to the
    last 400 characters.  Returns None when the keyword start is shorter
    than 15 characters or nothing matches.

    *source* overrides the text to search; by default ``regles.txt`` is used.
    """
    plain = strip_html(keyword_text)
    plain_start = re.sub(r'\s+', ' ', plain[:60]).strip()
    if len(plain_start) < 15:
        return None
    haystack = _load_pdf_text() if source is None else source
    for prefix_len in (50, 40, 30, 20):
        prefix = re.sub(r'\s+', ' ', plain_start[:prefix_len]).strip()
        if len(prefix) < 10:
            continue
        idx = haystack.find(prefix)
        if idx != -1:
            start = max(0, idx - context)
            snippet = haystack[start:idx + len(prefix)]
            snippet = re.sub(r'\n+', ' ', snippet)
            snippet = re.sub(r'\s{3,}', ' ', snippet)
            return snippet[-400:]
    return None


def scan_document(rel, data, source=None):
    """Return the list of issue records found in one parsed document.

    *rel* is the document's path relative to ``PACKS`` (stored in each record
    and used to recognise ingredient packs), *data* the decoded JSON value.
    *source* is forwarded to the ``pdf_search_*`` helpers.
    """
    # A top-level JSON value that is not an object has no name and no fields.
    name = data.get("name", "?") if isinstance(data, dict) else "?"
    found = []

    def add(field, issue_type, **details):
        found.append({"file": rel, "field": field, "issue": issue_type,
                      "item_name": name, **details})

    for field in HTML_FIELDS + PLAIN_FIELDS:
        val = get_field(data, field)
        if not val or not isinstance(val, str) or not val.strip():
            continue
        plain = strip_html(val).strip()

        # Garbled page layout (skip other checks)
        if is_garbled_page_layout(val):
            add(field, "garbled_page_layout", current_text=val[:400],
                note="Text broken into single-character <p> tags — PDF layout artifact")
            continue

        # Watermark bleeding
        if has_watermark_bleed(val):
            add(field, "bleeding_watermark", current_text=val[:400],
                plain_text=plain[:300],
                pdf_context=pdf_search_after(val, source=source))

        # Missing beginning (only truly lowercase-starting)
        if looks_missing_beginning(val):
            add(field, "missing_beginning", current_start=plain[:150],
                pdf_context_before=pdf_search_before(val, source=source))

        # Truncation
        if looks_truncated(val):
            if 'cde-ingredients' in rel:
                # Ingredient entries are only reported when the text is short.
                if plain and len(plain) < 30:
                    add(field, "empty_or_short_ingredient", current_text=plain,
                        note="Short ingredient description — check if intentional")
            else:
                add(field, "truncated", current_end=plain[-120:],
                    current_preview=plain[:200],
                    pdf_context=pdf_search_after(val, source=source))

        # Unwanted newlines
        if has_bad_newlines(val):
            add(field, "unwanted_newlines", current_text=val[:400],
                plain_text=plain[:200])

    # Technique cross-checks
    for tkey in ('technique1', 'technique2', 'technique3'):
        tech = get_field(data, f"system.techniques.{tkey}")
        # A technique slot that is not a dict has no "technique" text to check.
        if not isinstance(tech, dict):
            continue
        t_text = tech.get("technique", "")
        if not t_text or not isinstance(t_text, str):
            continue
        t_field = f"system.techniques.{tkey}.technique"
        plain_t = strip_html(t_text)

        activation_count = plain_t.count("Activation :")
        if activation_count > 1:
            add(t_field, "bleeding_multiple_techniques",
                activation_count=activation_count, current_text=t_text[:500],
                note=f"{activation_count} 'Activation :' markers — multiple techniques merged")

        # Each file/technique pair is visited once, so no duplicate check is
        # needed before recording this issue.
        if ("Style" in plain_t or "Orientation :" in plain_t) and len(plain_t) > 300:
            add(t_field, "bleeding_style_or_orientation", current_text=t_text[:500],
                note="Contains 'Style' or 'Orientation' — extra text from PDF page layout")

    return found


def scan_files(files, source=None):
    """Scan each path in *files* and return all issue records, in order.

    A file that cannot be read or decoded produces a single ``json_error``
    record and is otherwise skipped.
    """
    issues = []
    for jf in files:
        rel = str(jf.relative_to(PACKS))
        try:
            data = json.loads(jf.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers both JSON and UTF-8 decoding failures.
            issues.append({"file": rel, "field": "(file)", "issue": "json_error",
                           "item_name": "?", "current_text": str(e)})
            continue
        issues.extend(scan_document(rel, data, source=source))
    return issues


def write_report(f, issues, files_scanned):
    """Write the human-readable report for *issues* to text stream *f*.

    *files_scanned* is the number printed in the header.  Issue types are
    listed in ``PRIORITY_ORDER``.
    """
    by_type = defaultdict(list)
    by_file = defaultdict(list)
    for iss in issues:
        by_type[iss['issue']].append(iss)
        by_file[iss['file']].append(iss)

    w = f.write
    w("=" * 80 + "\n")
    w("COMPENDIUM TEXT QUALITY REPORT\n")
    w("Les Chroniques de l'Etrange — FoundryVTT\n")
    w("=" * 80 + "\n\n")
    w(f"Files scanned: {files_scanned}\n")
    w(f"Files with issues: {len(by_file)}\n")
    w(f"Total issues: {len(issues)}\n\n")

    w("SUMMARY BY ISSUE TYPE:\n")
    for itype in PRIORITY_ORDER:
        ilist = by_type.get(itype, [])
        if ilist:
            desc = ISSUE_DESCRIPTIONS.get(itype, '')
            w(f" {itype:45s} {len(ilist):3d} — {desc}\n")

    w("\nFILES WITH ISSUES:\n")
    for fpath in sorted(by_file):
        types = sorted({i['issue'] for i in by_file[fpath]})
        w(f" {fpath}\n [{', '.join(types)}]\n")

    w("\n")
    w("=" * 80 + "\n")
    w("DETAILED ISSUES (by priority)\n")
    w("=" * 80 + "\n")

    for itype in PRIORITY_ORDER:
        ilist = by_type.get(itype, [])
        if not ilist:
            continue
        w(f"\n{'─'*80}\n")
        w(f"ISSUE TYPE: {itype} ({len(ilist)} occurrences)\n")
        w(f"{'─'*80}\n")
        for iss in ilist:
            w(f"\n File: {iss['file']}\n")
            w(f" Item: {iss.get('item_name','?')}\n")
            w(f" Field: {iss['field']}\n")
            if iss.get('note'):
                w(f" Note: {iss['note']}\n")
            if iss.get('current_start'):
                w(f" Starts: {iss['current_start'][:160]}\n")
            if iss.get('current_end'):
                w(f" Ends: ...{iss['current_end']}\n")
            if iss.get('current_preview'):
                w(f" Text: {iss['current_preview'][:200]}\n")
            if iss.get('current_text'):
                ct = iss['current_text']
                w(f" Text: {ct[:300]}\n")
            if iss.get('plain_text'):
                w(f" Plain: {iss['plain_text'][:200]}\n")
            if iss.get('pdf_context'):
                w(f" PDF>>: {iss['pdf_context'][:400]}\n")
            if iss.get('pdf_context_before'):
                # NOTE(review): the original statement is cut off right after
                # the opening '<'; label and slice are reconstructed to mirror
                # the "PDF>>:" line above — confirm against the real file.
                w(f" <<PDF: {iss['pdf_context_before'][-400:]}\n")


def main():
    """Scan the compendium sources and write the JSON and text reports."""
    # Read the rule book first so a missing file aborts before any scanning.
    _load_pdf_text()

    all_files = sorted(PACKS.rglob("*.json"))
    print(f"Scanning {len(all_files)} files...", flush=True)

    issues = scan_files(all_files)
    print(f"Found {len(issues)} issues.", flush=True)

    out_json = BASE / "compendium-issues.json"
    out_txt = BASE / "compendium-issues.txt"

    with open(out_json, 'w', encoding='utf-8') as f:
        json.dump(issues, f, ensure_ascii=False, indent=2)

    with open(out_txt, 'w', encoding='utf-8') as f:
        write_report(f, issues, len(all_files))


if __name__ == "__main__":
    main()