Correction compendiums
This commit is contained in:
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
import json, re
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Root of the working copy being audited.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs as-is on a host where this directory exists.
BASE = Path("/home/morr/work/uberwald/fvtt-chroniques-de-l-etrange")

# Directory holding the JSON pack sources that get scanned.
PACKS = BASE.joinpath("packs-src")

# Full contents of the rules text file, loaded once at import time and used
# by pdf_search() as the haystack for context lookups.
pdf_text = BASE.joinpath("regles.txt").read_text(encoding="utf-8")
|
||||
|
||||
# Case-insensitive pattern used to detect stray fragments in item text.
# Alternatives, in order:
#   - the letters "s c r a l e l" with optional whitespace between them
#   - "les" + whitespace + "chroniqu"
#   - "de" + whitespace + "l" + any one character + "etrange"
#   - "chr" + any one character + "niqu"
#   - "hr" + whitespace + "ng"
# NOTE(review): presumably these are pieces of a PDF watermark/header that
# leaked into extracted text -- confirm against the source PDF.
WATERMARK_RE = re.compile(
    r's\s*c\s*r\s*a\s*l\s*e\s*l|les\s+chroniqu|de\s+l.etrange|chr.niqu|hr\s+ng',
    flags=re.IGNORECASE,
)
|
||||
|
||||
def strip_html(html):
    """Return *html* with every ``<...>`` tag removed and outer whitespace trimmed.

    A falsy argument (``None``, ``""``) is treated as the empty string.
    Only tags are removed; character entities are left untouched.
    """
    source = html if html else ''
    without_tags = re.sub(r'<[^>]+>', '', source)
    return without_tags.strip()
|
||||
|
||||
def has_watermark_bleed(text):
    """Return True when the tag-stripped *text* matches ``WATERMARK_RE``.

    Tags are removed with ``strip_html`` first, so the pattern is only
    matched against the visible text, not against markup.
    """
    return WATERMARK_RE.search(strip_html(text)) is not None
|
||||
|
||||
def has_bad_newlines(text):
    """Return True when *text* has a line break in the middle of running text.

    Single-line text is never flagged.  For multi-line text, a line is
    considered suspicious when, after trimming, it is longer than three
    characters, does not begin with an opening or closing tag, and does not
    end with ``>``.  One suspicious line is enough.
    """
    segments = [segment.strip() for segment in text.split('\n')]
    if len(segments) < 2:
        return False
    return any(
        len(segment) > 3
        and not segment.endswith('>')
        and re.match(r'^<[/a-zA-Z]', segment) is None
        for segment in segments
    )
|
||||
|
||||
def looks_truncated(text):
    """Heuristically decide whether *text* stops in the middle of a sentence.

    The text is stripped of tags, then everything from the first run of
    "whitespace, one lowercase letter, whitespace, one lowercase letter,
    whitespace, one or two lowercase letters, whitespace" to the end of the
    string is dropped (presumably spaced-out watermark letters -- confirm).
    If that leaves nothing, the un-trimmed plain text is used instead.

    Returns:
        True when the last remaining character is a letter or one of
        ``, ; : - (``; False otherwise, including when *text* is falsy or
        contains no visible characters at all (markup only).
    """
    if not text:
        return False
    plain = strip_html(text)
    plain_clean = re.sub(r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', plain).strip()
    if not plain_clean:
        plain_clean = plain
    if not plain_clean:
        # Markup-only input such as "<p></p>".  Without this guard the
        # membership test below would run on '' and report True, because the
        # empty string is a substring of every string.
        return False
    last = plain_clean[-1]
    return last.isalpha() or last in ',;:-('
|
||||
|
||||
def get_field(data, path):
    """Look up a dotted *path* (e.g. ``"system.notes"``) inside nested dicts.

    Returns the value found at the end of the path, or ``None`` as soon as a
    segment is missing, maps to ``None``, or has to be looked up in something
    that is not a dict.
    """
    node = data
    for key in path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
        if node is None:
            return None
    return node
|
||||
|
||||
def pdf_search(keyword_text, context=500):
    """Find where the end of *keyword_text* occurs in ``pdf_text``.

    The input is tag-stripped and has the same trailing-letters pattern
    removed that ``looks_truncated`` removes.  The last 40, 30, 20 and then
    15 characters (whitespace runs collapsed to one space) are tried in turn
    as an exact substring of ``pdf_text``.

    Args:
        keyword_text: HTML or plain text whose tail should be located.
        context: Number of characters to include after the matched tail.

    Returns:
        The matched tail plus following context, with newline runs turned
        into a space, runs of three or more whitespace characters collapsed,
        and the result capped at 600 characters.  ``None`` when the cleaned
        text is shorter than 15 characters or no tail is found.
    """
    cleaned = re.sub(
        r'\s+[a-z]\s+[a-z]\s+[a-z]{1,2}\s+.*$', '', strip_html(keyword_text)
    ).strip()
    if len(cleaned) < 15:
        return None

    for tail_size in (40, 30, 20, 15):
        tail = re.sub(r'\s+', ' ', cleaned[-tail_size:]).strip()
        if len(tail) < 10:
            # Too short to be a meaningful search key.
            continue
        start = pdf_text.find(tail)
        if start == -1:
            continue
        # Slicing clamps at the end of pdf_text, so no explicit bound is needed.
        excerpt = pdf_text[start:start + len(tail) + context]
        excerpt = re.sub(r'\n+', ' ', excerpt)
        excerpt = re.sub(r'\s{3,}', ' ', excerpt)
        return excerpt[:600]

    return None
|
||||
|
||||
# Dotted paths of item fields checked by the scan; per the name, these are
# expected to hold HTML.
HTML_FIELDS = [
    "system.description",
    "system.effects",
    "system.examples",
    "system.components",
    "system.notes",
    "system.techniques.technique1.technique",
    "system.techniques.technique2.technique",
    "system.techniques.technique3.technique",
]

# Dotted paths of item fields checked by the scan; per the name, these are
# expected to hold plain text.
PLAIN_FIELDS = ["system.style"]

# Accumulates one dict per problem found.
issues = []

# Every JSON file under PACKS (recursively), in sorted path order.
all_files = list(PACKS.rglob("*.json"))
all_files.sort()
print(f"Scanning {len(all_files)} files...", flush=True)
|
||||
|
||||
# Scan every pack file and append one dict to `issues` per problem found.
# all_files is already sorted where it is built, so it is not re-sorted here.
for jf in all_files:
    rel = str(jf.relative_to(PACKS))
    try:
        data = json.loads(jf.read_text(encoding="utf-8"))
    except Exception as e:
        # Deliberately broad: an unreadable or malformed file is recorded as
        # an issue and the scan carries on with the next file.
        issues.append({"file": rel, "field": "(file)", "issue": "json_error",
                       "item_name": "?", "current_text": str(e)})
        continue

    # A JSON root that is not an object has no "name"; get_field() returns
    # None for such data, so the checks below simply find nothing instead of
    # the whole scan crashing on `.get`.
    name = data.get("name", "?") if isinstance(data, dict) else "?"

    def add_issue(field, issue_type, **kwargs):
        """Record an issue for the file/item currently being scanned."""
        issues.append({"file": rel, "field": field, "issue": issue_type,
                       "item_name": name, **kwargs})

    # --- Per-field text checks -------------------------------------------
    for field in HTML_FIELDS + PLAIN_FIELDS:
        val = get_field(data, field)
        if not val or not isinstance(val, str) or not val.strip():
            continue
        plain = strip_html(val).strip()

        # Watermark and truncation are mutually exclusive reports; the
        # newline check below is independent of both.
        if has_watermark_bleed(val):
            pdf_ctx = pdf_search(val)
            add_issue(field, "bleeding_watermark",
                      current_text=val[:400],
                      plain_text=plain[:300],
                      pdf_context=pdf_ctx)

        elif looks_truncated(val):
            is_ingredient = 'cde-ingredients' in rel
            if is_ingredient and len(plain) < 30:
                # Short ingredient texts get a softer issue type with a note.
                add_issue(field, "truncated_or_short",
                          current_text=plain,
                          note="May be legitimate (ingredient quantity)",
                          pdf_context=pdf_search(plain))
            else:
                pdf_ctx = pdf_search(val)
                add_issue(field, "truncated",
                          current_end=plain[-120:],
                          current_preview=plain[:200],
                          pdf_context=pdf_ctx)

        if has_bad_newlines(val):
            add_issue(field, "unwanted_newlines",
                      current_text=val[:400],
                      plain_text=plain[:300])

    # --- Technique-specific checks ---------------------------------------
    for tkey in ('technique1', 'technique2', 'technique3'):
        tech = get_field(data, f"system.techniques.{tkey}")
        # Skip missing, empty or unexpectedly-shaped technique entries.
        if not isinstance(tech, dict):
            continue
        t_text = tech.get("technique", "")
        if not isinstance(t_text, str) or not t_text:
            continue
        plain_t = strip_html(t_text)

        # More than one "Activation :" marker in a single technique text is
        # reported as several techniques having run together.
        activation_count = plain_t.count("Activation :")
        if activation_count > 1:
            add_issue(f"system.techniques.{tkey}.technique",
                      "bleeding_multiple_techniques",
                      activation_count=activation_count,
                      current_text=t_text[:500],
                      note=f"{activation_count} 'Activation :' markers found")

        if ("Style" in plain_t or "Orientation :" in plain_t) and len(plain_t) > 300:
            add_issue(f"system.techniques.{tkey}.technique",
                      "bleeding_style_or_orientation",
                      current_text=t_text[:500],
                      note="Contains 'Style' or 'Orientation' markers inside technique text")
|
||||
|
||||
print(f"Found {len(issues)} issues.", flush=True)

# Output files are written next to the pack sources, under BASE.
out_json = BASE / "compendium-issues.json"
out_txt = BASE / "compendium-issues.txt"

# Machine-readable dump of every issue.
with open(out_json, 'w', encoding='utf-8') as f:
    json.dump(issues, f, ensure_ascii=False, indent=2)

# Index the issues by type and by file for the text report.
by_type = defaultdict(list)
by_file = defaultdict(list)
for iss in issues:
    by_type[iss['issue']].append(iss)
    by_file[iss['file']].append(iss)

# Preferred order of the detailed sections.  Any issue type not listed here
# (e.g. "json_error") is appended afterwards in alphabetical order, so every
# recorded issue shows up in the detailed part of the report, not only in
# the summary counts.
detail_order = ['bleeding_watermark', 'bleeding_multiple_techniques',
                'bleeding_style_or_orientation', 'truncated',
                'unwanted_newlines', 'truncated_or_short']
detail_order += sorted(t for t in by_type if t not in detail_order)

# Human-readable report.
with open(out_txt, 'w', encoding='utf-8') as f:
    w = f.write
    w("=" * 80 + "\n")
    w("COMPENDIUM TEXT QUALITY REPORT\n")
    w("Les Chroniques de l'Etrange — FoundryVTT\n")
    w("=" * 80 + "\n\n")
    w(f"Files scanned: {len(all_files)}\n")
    w(f"Files with issues: {len(by_file)}\n")
    w(f"Total issues: {len(issues)}\n\n")

    # Issue types, most frequent first.
    w("SUMMARY BY ISSUE TYPE:\n")
    for itype, ilist in sorted(by_type.items(), key=lambda x: -len(x[1])):
        w(f" {itype:50s} {len(ilist):3d}\n")

    w("\nFILES WITH ISSUES:\n")
    for fpath in sorted(by_file):
        types = sorted({i['issue'] for i in by_file[fpath]})
        w(f" {fpath} [{', '.join(types)}]\n")
    w("\n")

    w("=" * 80 + "\n")
    w("DETAILED ISSUES\n")
    w("=" * 80 + "\n")
    for itype in detail_order:
        ilist = by_type.get(itype, [])
        if not ilist:
            continue
        w(f"\n{'─'*80}\n")
        w(f"ISSUE TYPE: {itype} ({len(ilist)} occurrences)\n")
        w(f"{'─'*80}\n")
        for iss in ilist:
            w(f"\n File: {iss['file']}\n")
            w(f" Item: {iss.get('item_name','?')}\n")
            w(f" Field: {iss['field']}\n")
            # Optional keys: only the ones present on this issue are printed.
            if iss.get('note'):
                w(f" Note: {iss['note']}\n")
            if iss.get('current_end'):
                w(f" Ends: ...{iss['current_end']}\n")
            if iss.get('current_preview'):
                w(f" Text: {iss['current_preview'][:200]}\n")
            if iss.get('current_text'):
                ct = iss['current_text']
                w(f" Text: {ct[:300]}\n")
            if iss.get('pdf_context'):
                w(f" PDF>>: {iss['pdf_context'][:400]}\n")

print(f"Written: {out_json}\n {out_txt}", flush=True)
|
||||
Reference in New Issue
Block a user