Correction compendiums
This commit is contained in:
@@ -0,0 +1,330 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyze all JSON files in packs-src/ for text quality issues."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from html.parser import HTMLParser
|
||||
|
||||
# NOTE(review): absolute, machine-specific checkout path — the script only
# works where this directory exists; consider making it configurable.
BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")
# Directory that is scanned recursively for *.json files.
PACKS = BASE / "packs-src"
# Reference text that search_pdf() looks excerpts up in.
REGLES = BASE / "regles.txt"

# Load PDF text (read and decode the file once, then derive the line view
# from the same string instead of reading the file a second time).
pdf_text = REGLES.read_text(encoding="utf-8")
pdf_lines = pdf_text.splitlines()

# Accumulator for every problem found; one dict per issue.
issues = []
|
||||
|
||||
# ---------- helpers ----------
|
||||
|
||||
def strip_html(html):
    """Remove HTML tags and return plain text.

    Tags are removed first, then character references such as ``&nbsp;`` or
    ``&eacute;`` are decoded, so the result holds the characters a reader
    would see rather than markup. Decoding happens after tag removal so an
    escaped ``&lt;b&gt;`` is kept as literal text and not treated as a tag.

    Args:
        html: HTML fragment; ``None`` or an empty string yields ``''``.

    Returns:
        The text with tags removed and entities decoded.
    """
    # Local import: the parameter name shadows the stdlib ``html`` module
    # inside this function, so only the needed function is bound.
    from html import unescape

    return unescape(re.sub(r'<[^>]+>', '', html or ''))
|
||||
|
||||
def check_unclosed_tags(html):
    """Return a list of tags whose opening and closing counts differ.

    This is a counting heuristic, not a parser: it does not check nesting
    order. Only a fixed set of tags is inspected (``ul``, ``ol``, ``li``,
    ``p``, ``div``, ``strong``, ``em``, ``b``, ``i``).

    Tag names are compared case-insensitively, a closing tag may contain
    whitespace before ``>`` (``</p >``), and self-closing tags (``<div/>``)
    are not counted as opens.

    Args:
        html: HTML fragment; ``None`` is treated as an empty string.

    Returns:
        One message per unbalanced tag, formatted as
        ``"<tag>: N open, M close"``, in the order of the inspected tag set.
        Empty when every inspected tag is balanced.
    """
    from collections import Counter

    html = html or ''
    opened = Counter(
        m.group(1).lower()
        for m in re.finditer(r'<([a-zA-Z][a-zA-Z0-9]*)[^>]*>', html)
        # A self-closing tag has no matching close tag by construction.
        if not m.group(0).endswith('/>')
    )
    closed = Counter(
        name.lower()
        for name in re.findall(r'</([a-zA-Z][a-zA-Z0-9]*)\s*>', html)
    )
    checked = ('ul', 'ol', 'li', 'p', 'div', 'strong', 'em', 'b', 'i')
    return [
        f"<{tag}>: {opened[tag]} open, {closed[tag]} close"
        for tag in checked
        if opened[tag] != closed[tag]
    ]
|
||||
|
||||
def has_bad_newlines(html):
    """Tell whether the string contains at least one newline character.

    The values checked come from decoded JSON, where an escaped ``\\n`` has
    already become a real line feed; such a character inside an HTML string
    is reported by the caller as unwanted.
    """
    return html.find('\n') != -1
|
||||
|
||||
def looks_truncated(text):
    """Lenient truncation heuristic.

    The text is considered truncated when, after removing HTML tags and
    surrounding whitespace, its last character is not one of ``. ! ? » )``.
    Empty input, or input that is empty once tags are removed, is never
    reported.
    """
    visible = strip_html(text).strip() if text else ''
    return bool(visible) and visible[-1] not in '.!?»)'
|
||||
|
||||
def looks_truncated_strict(text):
    """Strict truncation heuristic.

    Reports truncation only when the visible text (HTML tags removed,
    whitespace trimmed) ends in a letter or in one of ``, ; : - (``, i.e.
    in the middle of a word or clause. Empty input is never reported.
    """
    if text:
        visible = strip_html(text).strip()
        if visible:
            tail = visible[-1]
            return tail in ',;:-(' or tail.isalpha()
    return False
|
||||
|
||||
def get_field(data, path):
    """Look up a nested value using a dot-separated key path.

    Walks ``data`` one key at a time. Returns ``None`` as soon as the
    current node is not a dict, a key is missing, or a value is ``None``;
    otherwise returns the value found at the end of the path.
    """
    node = data
    for key in path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
        if node is None:
            return None
    return node
|
||||
|
||||
def search_pdf(keyword, context=300):
    """Search the reference text for the tail of ``keyword``.

    HTML tags are removed from ``keyword`` and entities are decoded, then
    the last 30, 20 and finally 15 characters are tried in turn as the
    search key. Whitespace runs are collapsed to single spaces in both the
    key and the reference text, so a key still matches when the reference
    text wraps lines in the middle of the phrase.

    Args:
        keyword: Text (possibly HTML) whose ending should be located.
        context: Number of characters returned after the matched key.

    Returns:
        The excerpt from 50 characters before the match to ``context``
        characters after it (whitespace-normalised), or ``None`` when the
        cleaned keyword is shorter than 10 characters or no key matches.
    """
    # Local import: avoids touching the file-level import block.
    from html import unescape

    kw = unescape(re.sub(r'<[^>]+>', '', keyword)).strip()
    if len(kw) < 10:
        return None

    # Normalise the reference text once and cache it on the function so
    # repeated calls do not re-scan the whole document.
    haystack = getattr(search_pdf, "_haystack", None)
    if haystack is None:
        haystack = re.sub(r'\s+', ' ', pdf_text)
        search_pdf._haystack = haystack

    # Progressively shorter tails: a long key is more specific, a short one
    # tolerates small differences earlier in the sentence.
    for tail_len in (30, 20, 15):
        needle = re.sub(r'\s+', ' ', kw[-tail_len:].strip())
        idx = haystack.find(needle)
        if idx != -1:
            start = max(0, idx - 50)
            end = min(len(haystack), idx + len(needle) + context)
            return haystack[start:end]
    return None
|
||||
|
||||
def get_all_html_fields(data, prefix=""):
    """Recursively yield ``(field_path, value)`` for text-like string fields.

    A string is yielded when it contains ``<`` or is longer than 50
    characters. Dict keys are joined with ``.`` and list positions are
    written as ``[i]``, e.g. ``system.items[2].description``.

    Strings that are direct elements of a list are yielded too, under the
    same rule as dict values.

    Args:
        data: Decoded JSON value. Anything other than a dict or list yields
            nothing.
        prefix: Path of ``data`` within the enclosing document.

    Yields:
        ``(path, string)`` tuples in document order.
    """
    if isinstance(data, dict):
        children = (
            (f"{prefix}.{k}" if prefix else k, v) for k, v in data.items()
        )
    elif isinstance(data, list):
        children = ((f"{prefix}[{i}]", v) for i, v in enumerate(data))
    else:
        return

    for path, value in children:
        if isinstance(value, str):
            if '<' in value or len(value) > 50:
                yield path, value
        elif isinstance(value, (dict, list)):
            yield from get_all_html_fields(value, path)
|
||||
|
||||
# ---------- fields to check ----------
|
||||
|
||||
# Dot-paths (as understood by get_field) of the text fields that the main
# scan inspects on every document, in the order they are checked.
IMPORTANT_FIELDS = [
    f"system.{name}"
    for name in (
        "description",
        "effects",
        "examples",
        "components",
        "notes",
        "style",
    )
] + [
    f"system.techniques.technique{slot}.technique"
    for slot in (1, 2, 3)
]
|
||||
|
||||
# ---------- main scan ----------
|
||||
|
||||
# Scan every JSON document under PACKS and append one dict per detected
# problem to ``issues``. Each dict carries "file", "field", "issue" and
# "correct_continuation", plus check-specific keys.
json_files = sorted(PACKS.rglob("*.json"))
print(f"Scanning {len(json_files)} JSON files...", flush=True)

for jf in json_files:
    rel = str(jf.relative_to(PACKS))
    try:
        data = json.loads(jf.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A file that cannot be decoded or parsed is reported, not fatal.
        issues.append({
            "file": rel,
            "field": "(file)",
            "issue": "json_parse_error",
            "current_text": str(e),
            "correct_continuation": None,
        })
        continue

    # Every check below addresses fields by key; a top-level array or
    # scalar would otherwise abort the whole scan on ``data.get``.
    if not isinstance(data, dict):
        issues.append({
            "file": rel,
            "field": "(file)",
            "issue": "unexpected_json_structure",
            "current_text": (
                f"top-level JSON value is {type(data).__name__}, "
                "expected an object"
            ),
            "correct_continuation": None,
        })
        continue

    item_name = data.get("name", "(unnamed)")
    # "system.style" is covered both by the loop below and by check 4;
    # remember a hit here so the same truncation is not reported twice.
    style_flagged_truncated = False

    # Check all relevant fields
    for field in IMPORTANT_FIELDS:
        val = get_field(data, field)
        if not val or not isinstance(val, str):
            continue

        plain = strip_html(val).strip()

        # 1. Check truncation (strict)
        if looks_truncated_strict(val):
            pdf_context = search_pdf(val)
            issues.append({
                "file": rel,
                "field": field,
                "issue": "truncated",
                "item_name": item_name,
                "current_end": f"...{plain[-100:]}",
                "current_full_preview": plain[:200],
                "correct_continuation": pdf_context,
            })
            if field == "system.style":
                style_flagged_truncated = True

        # 2. Check bad newlines in HTML strings
        if has_bad_newlines(val):
            issues.append({
                "file": rel,
                "field": field,
                "issue": "unwanted_newlines",
                "item_name": item_name,
                "current_text": val[:300],
                "correct_continuation": None,
            })

        # 3. Check malformed HTML
        html_errors = check_unclosed_tags(val)
        if html_errors:
            issues.append({
                "file": rel,
                "field": field,
                "issue": "malformed_html",
                "item_name": item_name,
                "html_errors": html_errors,
                "current_text": val[:300],
                "correct_continuation": None,
            })

    # 4. Check system.style (plain text field, can also be truncated).
    #    Lenient rule: any ending other than . ! ? » ) counts. Skipped when
    #    the strict check above already reported this field.
    style_val = get_field(data, "system.style")
    if style_val and isinstance(style_val, str) and not style_flagged_truncated:
        plain_style = style_val.strip()
        if plain_style and plain_style[-1] not in '.!?»)':
            pdf_context = search_pdf(plain_style)
            issues.append({
                "file": rel,
                "field": "system.style",
                "issue": "truncated",
                "item_name": item_name,
                "current_end": f"...{plain_style[-100:]}",
                "current_full_preview": plain_style[:200],
                "correct_continuation": pdf_context,
            })

    # 5. Bleeding content: look for HTML tags in non-HTML fields
    for field in ["system.style", "system.reference", "system.speciality"]:
        val = get_field(data, field)
        if val and isinstance(val, str) and '<' in val:
            issues.append({
                "file": rel,
                "field": field,
                "issue": "html_in_plain_field",
                "item_name": item_name,
                "current_text": val[:300],
                "correct_continuation": None,
            })

    # 6. Check for text outside HTML tags in description-like fields
    #    (bleeding). Only text before the first tag is detected: the value
    #    is flagged when it does not start with '<'.
    for field in ["system.description", "system.effects", "system.examples",
                  "system.components", "system.notes"]:
        val = get_field(data, field)
        if not val or not isinstance(val, str):
            continue
        stripped = val.strip()
        if stripped and not stripped.startswith('<'):
            issues.append({
                "file": rel,
                "field": field,
                "issue": "text_outside_html_tags",
                "item_name": item_name,
                "current_text": val[:300],
                "correct_continuation": None,
            })

    # 7. Check technique fields for bleeding: more than three closed <p>
    #    blocks is treated as suspicious.
    for tkey in ["technique1", "technique2", "technique3"]:
        tech = get_field(data, f"system.techniques.{tkey}")
        # Skip missing entries and anything that is not an object, instead
        # of failing on ``.get``.
        if not isinstance(tech, dict):
            continue
        tech_text = tech.get("technique", "")
        if not tech_text or not isinstance(tech_text, str):
            continue
        p_count = tech_text.count('</p>')
        if p_count > 3:
            issues.append({
                "file": rel,
                "field": f"system.techniques.{tkey}.technique",
                "issue": "possible_bleeding_content",
                "item_name": item_name,
                "paragraph_count": p_count,
                "current_text": tech_text[:400],
                "correct_continuation": None,
            })

print(f"Found {len(issues)} potential issues.", flush=True)
|
||||
|
||||
# ---------- output ----------
|
||||
|
||||
# Write the findings twice: the raw list as JSON, and a human-readable text
# report with a per-type summary followed by the details grouped by file.
from collections import defaultdict

out_json = BASE / "compendium-issues.json"
out_txt = BASE / "compendium-issues.txt"

with out_json.open('w', encoding='utf-8') as f:
    json.dump(issues, f, ensure_ascii=False, indent=2)

# Group by issue type for summary
by_type = defaultdict(list)
by_file = defaultdict(list)
for issue in issues:
    by_type[issue['issue']].append(issue)
    by_file[issue['file']].append(issue)

# Optional detail rows, in output order: (issue key, label, max length).
# A row is emitted only when the key holds a truthy value.
_detail_rows = (
    ("item_name", " ITEM: ", None),
    ("current_end", " END: ", None),
    ("current_full_preview", " TEXT: ", 200),
    ("current_text", " TEXT: ", 200),
    ("html_errors", " HTML ERRORS: ", None),
    ("correct_continuation", " PDF: ", 300),
)

_rule = "=" * 80
_report = [
    _rule + "\n",
    "COMPENDIUM TEXT QUALITY REPORT\n",
    "Les Chroniques de l'Étrange — FoundryVTT\n",
    _rule + "\n\n",
    f"Total files scanned: {len(json_files)}\n",
    f"Total issues found: {len(issues)}\n\n",
    "SUMMARY BY ISSUE TYPE:\n",
]
_report.extend(
    f" {itype}: {len(ilist)}\n" for itype, ilist in sorted(by_type.items())
)
_report.append("\n")
_report.append(_rule + "\n")
_report.append("DETAILED ISSUES BY FILE\n")
_report.append(_rule + "\n\n")

for fpath in sorted(by_file.keys()):
    _report.append(f"\n--- {fpath} ---\n")
    for issue in by_file[fpath]:
        _report.append(f" FIELD: {issue['field']}\n")
        _report.append(f" ISSUE: {issue['issue']}\n")
        for _key, _label, _limit in _detail_rows:
            _value = issue.get(_key)
            if not _value:
                continue
            if _limit is not None:
                _value = _value[:_limit]
            _report.append(f"{_label}{_value}\n")
        _report.append("\n")

with out_txt.open('w', encoding='utf-8') as f:
    f.write("".join(_report))

print(f"Reports written to:\n {out_json}\n {out_txt}", flush=True)
|
||||
Reference in New Issue
Block a user