Files
2026-04-27 17:49:00 +02:00

339 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Fix OCR corruption artifacts in cde-kungfus and cde-spells JSON packs.
Handles:
- OCR watermark artifact <p> fragments
- Page number <p> tags
- </li></ul><p>continuation</p> structure in spell components/effects/examples
- \n characters inside HTML strings (kung fu files)
- Bleed text from adjacent PDF columns
- Wrong content in style fields
"""
import json
import re
import os
from pathlib import Path
# Directory holding the JSON pack sources: the "packs-src" folder that sits
# next to this script's parent directory.  The script path is resolved first
# so the result does not depend on the current working directory when
# ``__file__`` happens to be a relative path.
PACKS_DIR = Path(__file__).resolve().parent.parent / "packs-src"
# ─────────────────────────────────────────
# Patterns for OCR artifact <p> blocks
# ─────────────────────────────────────────
# Matches a standalone <p>...</p> whose entire content (ignoring surrounding
# whitespace) is one of the known artifact/junk fragments listed below.
# Matching is case-insensitive.  Paragraphs containing any other text are
# left untouched because the alternation must be followed directly by </p>.
ARTIFACT_PARA_RE = re.compile(
    r'<p>\s*(?:'
    r'\d{2,3}' # page numbers: 272, 302, 303 …
    r'|s\b' # single letter 's'
    r'|ue\b'
    r'|o\b'
    r'|de\b'
    r'|怪' # Chinese character used as bullet marker leftover
    r'|niq\s+e'
    r'|hr\s+ng'
    # The next four alternatives tolerate up to 30 trailing non-'<' characters.
    r'|s\s+c\s+ra[^<]{0,30}'
    r'|le\s+l[\'\u2019]ét[^<]{0,30}'
    r'|les\s+chroniqu[^<]{0,30}'
    r'|de\s+l[\'\u2019]ét[^<]{0,30}'
    # NOTE: the two alternatives below are already covered by the
    # "les chroniqu…" and "de l'ét…" alternatives above.
    r'|les\s+chroniques'
    r'|de\s+l[\'\u2019]étrange'
    r')\s*</p>',
    re.IGNORECASE
)
# OCR artifact text that can appear inline inside a <p> (before </p>).
# The match starts at the whitespace preceding a known fragment and runs over
# every following non-'<' character; the lookahead requires that run to end
# exactly at a closing </p>, which itself is not consumed.
# Matching is case-insensitive.
INLINE_ARTIFACT_RE = re.compile(
    r'\s+(?:'
    r's\s+c\s+ra\s+le\s+l[\'\u2019]ét[^<]*'
    r'|les\s+chroniqu[^<]*'
    r'|niq\s+e[^<]*'
    r'|hr\s+ng[^<]*'
    r'|de\s+l[\'\u2019]ét[^<]*'
    # NOTE: already covered by the "de l'ét…" alternative just above.
    r'|de\s+l[\'\u2019]étrange[^<]*'
    r')(?=</p>)',
    re.IGNORECASE
)
# Trailing page numbers at end of text node inside <p>: whitespace followed by
# a 2- or 3-digit number (and optional whitespace) sitting directly before
# </p>.  The closing tag is only looked ahead at, not consumed.
TRAILING_PAGENO_RE = re.compile(r'\s+\d{2,3}\s*(?=</p>)')
def strip_artifact_paragraphs(html: str) -> str:
    """Drop every standalone ``<p>`` block made up solely of OCR junk.

    The substitution is repeated until a pass removes nothing, because
    deleting one artifact paragraph can expose another one.
    ``None`` is passed through untouched.
    """
    if html is None:
        return html
    remaining, removed = ARTIFACT_PARA_RE.subn('', html)
    # Every match is non-empty, so "something was removed" is the same
    # condition as "the string changed".
    while removed:
        remaining, removed = ARTIFACT_PARA_RE.subn('', remaining)
    return remaining
def strip_inline_artifacts(html: str) -> str:
    """Strip OCR junk that trails the real text inside ``<p>...</p>``.

    Known artifact fragments are removed first, then any page number left
    dangling right before a closing ``</p>``.
    """
    without_fragments = INLINE_ARTIFACT_RE.sub('', html)
    return TRAILING_PAGENO_RE.sub('', without_fragments)
def fix_ul_continuation(html: str) -> str:
    """Fold a tag-free paragraph that follows a list back into the last item.

    Rewrites ``</li></ul><p>continuation</p>`` as
    ``<br>continuation</li></ul>``.  The rewrite is applied repeatedly so a
    chain of continuation paragraphs ends up inside the same ``<li>``.
    ``None`` is passed through untouched.
    """
    if html is None:
        return html
    continuation = re.compile(r'</li></ul>\s*<p>([^<]+)</p>')
    merged, hits = continuation.subn(r'<br>\1</li></ul>', html)
    # A replacement always alters the text, so keep going while any was made.
    while hits:
        merged, hits = continuation.subn(r'<br>\1</li></ul>', merged)
    return merged
def strip_trailing_bleed_markers(html: str) -> str:
    """Strip known bleed-start markers and everything after them.

    These appear when the PDF's adjacent column/next section intrudes
    at the end of a field.  Each marker is applied in turn, case-insensitively
    and with ``.`` matching newlines, so the rest of the field is dropped from
    the first occurrence of a marker onwards.

    Args:
        html: HTML field value to truncate.

    Returns:
        ``html`` cut at the earliest match of each marker; unchanged when no
        marker is present.

    NOTE: markers that start in the middle of a paragraph (the last three)
    also swallow that paragraph's closing ``</p>``.
    NOTE(review): because IGNORECASE applies to every marker, the "DOMINER"
    marker also matches a paragraph starting with lower-case "dominer" —
    confirm this is intended.
    """
    # Header of the next kung fu item bleeding in: "<item name> Orientation : …".
    # Several names are listed by their leading words only, so a short lazy gap
    # (no ':', '<' or sentence punctuation) lets the rest of the name sit
    # between the listed prefix and "Orientation".  Without it, headers such as
    # "Jeet Kune Do Orientation :" could never match.
    next_item_header = (
        r'(?:Les\s+Trois\s+Joyaux|Double-peau|La\s+Lance\s+du'
        r'|Le\s+Fa[iî]te\s+Suprême|Le\s+Seot\s+Gaau|Les\s+Mille\s+Frelons'
        r'|Les\s+Poignards|Wing\s+Chun|Hung\s+Gar|Jeet\s+Kune)'
        r'[^:<.!?]{0,60}?\s+Orientation\s*:.*'
    )
    markers = [
        # "la magie" header that separates spell sections
        r'<p>\s*la\s+magie\s*</p>.*',
        # section markers in Caps
        r'<p>\s*DOMINER[^<]+</p>.*',
        # next-spell metadata bleeds in (怪 icon = bullet point)
        r'<p>\s*怪\s+Temps\s+de\s+réalisation.*',
        r'<p>\s*怪\s+Élément.*',
        r'<p>\s*怪\s+Hei\s*:.*',
        r'<p>\s*Effets\s*:\s*</p>.*',
        # next kung fu item header bleed
        next_item_header,
        # garbled column text seen in the extracted data
        r'de\s+de\s+Tigre\s+la\s+Bâton.*',
        r'loyale\s+du\s+Lance\s+Général\s+Yue\s+Fei.*',
    ]
    for marker in markers:
        html = re.sub(marker, '', html, flags=re.DOTALL | re.IGNORECASE)
    return html
def clean_html_field(html: str) -> str:
    """Run the full generic cleanup pipeline over one HTML field value.

    Falsy input (``None`` or an empty string) is returned as-is.  Otherwise
    the steps run in a fixed order and the result is whitespace-trimmed.
    """
    if not html:
        return html
    pipeline = (
        strip_artifact_paragraphs,
        strip_inline_artifacts,
        fix_ul_continuation,
        strip_trailing_bleed_markers,
        # Second artifact sweep: earlier removals can expose new junk.
        strip_artifact_paragraphs,
        strip_inline_artifacts,
    )
    for step in pipeline:
        html = step(html)
    return html.strip()
# ─────────────────────────────────────────
# Kung Fu specific fixups
# ─────────────────────────────────────────
# OCR artifacts + bleed that appear inline in technique <p> text.
# Like the generic inline pattern, a match starts at the whitespace before a
# known fragment, covers every following non-'<' character and must end right
# before a closing </p> (looked ahead at, not consumed).  Case-insensitive.
KUNGFU_INLINE_BLEED_RE = re.compile(
    r'\s+(?:'
    r'de\s+l[\'\u2019]étrange\s+Techniques[^<]*'
    r'|Techniques\s+s[^<]*' # "Techniques s\n\n\nhr ng..." separator garbage
    r'|Style\s+Le\s+d[^<]*' # "Style Le déséquilibre..." bleed
    r'|niq\s+e[^<]*'
    r'|hr\s+ng[^<]*'
    r'|s\s+c\s+ra[^<]*'
    r'|le\s+l[\'\u2019]ét[^<]*'
    r')(?=</p>)',
    re.IGNORECASE
)
# "Orientation : ..." headers from next item in technique fields.
# Requires one header keyword followed by ':' and then a second header keyword
# later in the same text run (no '<' in between); the whole run up to the
# closing </p> is matched.  A single keyword on its own is not enough.
TECHNIQUE_ITEM_BLEED_RE = re.compile(
    r'\s+(?:Orientation|Aspect|Compétence|Spécialité)\s*:[^<]*'
    r'(?:Orientation|Aspect|Compétence|Spécialité)[^<]*(?=</p>)',
    re.IGNORECASE
)
def clean_kungfu_technique(html: str) -> str:
    """Clean the HTML of a single kung fu technique.

    Falsy input is returned as-is.  Otherwise literal newlines left by the
    PDF extraction become spaces, runs of spaces are collapsed, the generic
    HTML cleanup runs, and finally the kung-fu-specific bleed patterns are
    removed.
    """
    if not html:
        return html
    # Newlines first, then collapse the space runs that this may create.
    flattened = re.sub(r' +', ' ', html.replace('\n', ' '))
    cleaned = clean_html_field(flattened)
    for bleed_pattern in (KUNGFU_INLINE_BLEED_RE, TECHNIQUE_ITEM_BLEED_RE):
        cleaned = bleed_pattern.sub('', cleaned)
    return cleaned.strip()
# Plain-text artifact pattern (for style fields which are not HTML)
PLAINTEXT_ARTIFACT_RE = re.compile(
    r'\s+(?:s\s+)?ue\s+niq\s+e\s+o\s+hr\s+ng\s+s\s+c\s+ra\s+le\s+l[\'\u2019]ét\s+de.*$',
    re.DOTALL | re.IGNORECASE
)
# Also match partial artifact runs at end of plain text
PLAINTEXT_ARTIFACT_RE2 = re.compile(
    r'\s+s\s+ue\s+niq\s+e.*$',
    re.DOTALL | re.IGNORECASE
)
# Trailing " Techniques" (common bleed marker at end of style)
_STYLE_TRAILING_TECHNIQUES_RE = re.compile(r'\s+Techniques\s*$')
# Bleed from the next item (plain text): "Le Gun-fu Orientation : ...".
# Names are listed by their leading words; the short lazy gap (no ':' and no
# sentence punctuation) covers the rest of the name, e.g. the "-fu" of
# "Le Gun-fu", which otherwise prevents "Orientation" from matching.
_STYLE_NEXT_ITEM_BLEED_RE = re.compile(
    r'\s+(?:Le\s+Gun|Les\s+Trois\s+Joyaux|Double-peau|Wing\s+Chun|Hung\s+Gar|Le\s+Seot)'
    r'[^:.!?]{0,60}?\s+Orientation\s*:.*$',
    re.DOTALL | re.IGNORECASE
)


def clean_kungfu_style(style: str, filename: str) -> str:
    """Clean the plain-text ``style`` field of a kung fu item.

    Handles the 'wrong content' case for la_boxe_de_livresse, then strips the
    trailing "Techniques" marker, OCR artifact runs and next-item header
    bleed.  The stripping steps are repeated until the text stops changing, so
    a marker uncovered by a later step (e.g. "Techniques" left at the end once
    the artifact run after it is gone) is removed as well.

    Args:
        style: Plain-text style value; falsy values are returned as-is.
        filename: Name of the JSON file the value came from.

    Returns:
        The cleaned text, or ``''`` when the la_boxe_de_livresse file carries
        Wing Chun's style text.
    """
    if not style:
        return style
    # la_boxe_de_livresse has Wing Chun's style text — must be cleared
    if 'la_boxe_de_livresse' in filename and (
        'Wing Chun' in style or 'système de combat' in style
    ):
        return ''
    steps = (
        _STYLE_TRAILING_TECHNIQUES_RE,
        PLAINTEXT_ARTIFACT_RE,
        PLAINTEXT_ARTIFACT_RE2,
        _STYLE_NEXT_ITEM_BLEED_RE,
    )
    previous = None
    # Every step only removes text, so this loop always terminates.
    while previous != style:
        previous = style
        for pattern in steps:
            style = pattern.sub('', style).strip()
    return style
def process_kungfu_file(path: Path) -> dict:
    """Clean one kung fu JSON file in place and report what changed.

    The ``description`` and ``style`` fields of ``data['system']`` and the
    ``technique`` text of every entry under ``system['techniques']`` are
    cleaned.  Empty or non-string values are skipped, as is a missing or
    malformed ``techniques`` mapping.  The file is rewritten (UTF-8,
    2-space indent, trailing newline) only when at least one value changed.

    Args:
        path: Path of the JSON file to process.

    Returns:
        ``{path.name: changes}`` where ``changes`` is a list of
        human-readable descriptions (empty when nothing was modified).

    Raises:
        KeyError: if the document has no ``system`` key.
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data['system']
    changes = []

    for field in ('description', 'style'):
        original = system.get(field, '')
        if not original or not isinstance(original, str):
            continue
        if field == 'style':
            cleaned = clean_kungfu_style(original, path.name)
        else:
            cleaned = clean_html_field(original)
        if cleaned != original:
            system[field] = cleaned
            changes.append(
                f'{field}: modified (len {len(original)} -> {len(cleaned)})'
            )

    # "techniques" may be absent, null or not a mapping in damaged files.
    techniques = system.get('techniques')
    if not isinstance(techniques, dict):
        techniques = {}
    for tkey, tval in techniques.items():
        if not isinstance(tval, dict):
            continue
        for subfield in ('technique',):
            original = tval.get(subfield, '')
            if not original or not isinstance(original, str):
                continue
            cleaned = clean_kungfu_technique(original)
            if cleaned != original:
                tval[subfield] = cleaned
                changes.append(
                    f'{tkey}.{subfield}: modified '
                    f'(len {len(original)} -> {len(cleaned)})'
                )

    if changes:
        path.write_text(
            json.dumps(data, ensure_ascii=False, indent=2) + '\n',
            encoding='utf-8'
        )
    return {path.name: changes}
# ─────────────────────────────────────────
# Spell specific fixups
# ─────────────────────────────────────────
def process_spell_file(path: Path) -> dict:
    """Clean one spell JSON file in place and report what changed.

    The ``description``, ``effects``, ``examples`` and ``components`` fields
    of ``data['system']`` are run through the generic HTML cleanup.  Empty or
    non-string values are skipped.  The file is rewritten (UTF-8, 2-space
    indent, trailing newline) only when at least one field changed.

    Args:
        path: Path of the JSON file to process.

    Returns:
        ``{path.name: changes}`` where ``changes`` is a list of
        human-readable descriptions (empty when nothing was modified).

    Raises:
        KeyError: if the document has no ``system`` key.
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data['system']
    changes = []

    for field in ('description', 'effects', 'examples', 'components'):
        original = system.get(field, '')
        # Only non-empty strings can hold HTML to clean.
        if not original or not isinstance(original, str):
            continue
        cleaned = clean_html_field(original)
        if cleaned != original:
            system[field] = cleaned
            changes.append(
                f'{field}: modified (len {len(original)} -> {len(cleaned)})'
            )

    if changes:
        path.write_text(
            json.dumps(data, ensure_ascii=False, indent=2) + '\n',
            encoding='utf-8'
        )
    return {path.name: changes}
# ─────────────────────────────────────────
# Main
# ─────────────────────────────────────────
def main():
    """Clean every kung fu and spell pack file, then print a report.

    Each JSON file under the two pack folders is processed in sorted order
    and its changes are listed.  A summary with the number of modified and
    unchanged files follows, plus a fixed reminder of fields that still need
    manual restoration.
    """
    all_changes = {}

    def run_pack(header, folder, processor):
        # Process one pack folder and print the per-file outcome.
        print(header)
        for json_path in sorted((PACKS_DIR / folder).glob('*.json')):
            result = processor(json_path)
            all_changes.update(result)
            fname = list(result)[0]
            file_changes = result[fname]
            if not file_changes:
                print(f" {fname}: no changes")
                continue
            print(f" {fname}:")
            for change in file_changes:
                print(f" {change}")

    run_pack("=== Processing kung fu files ===", 'cde-kungfus', process_kungfu_file)
    run_pack("\n=== Processing spell files ===", 'cde-spells', process_spell_file)

    # Summary
    modified_count = sum(1 for entries in all_changes.values() if entries)
    unchanged_count = len(all_changes) - modified_count
    print("\n=== Summary ===")
    print(f" Files modified: {modified_count}")
    print(f" Files unchanged: {unchanged_count}")
    print()
    print("NOTE: Truncated sentences (where content was cut mid-word before bleed)")
    print("still need manual restoration from the PDF rulebook.")
    print("Fields likely still truncated:")
    truncated_hints = {
        'wing_chun_wing_ceon.json': ['description (ends mid-sentence)', 'technique1 (content from wrong item — needs full replacement)'],
        'la_boxe_de_livresse_zeoi_kyun.json': ['style (cleared — needs correct Zeoi Kyun style text)', 'technique1,2,3 (truncated)'],
        'le_double_peau.json': ['technique1, technique2 (truncated)'],
        'les_trois_joyaux.json': ['technique1,2,3 (truncated)'],
        'le_faite_supreme_de_lepee_taaigik_gim.json': ['technique1,2 (truncated)'],
        'la_paume_des_huit_trigrammes_baatgwaa_zoeng.json': ['technique1,2,3 (truncated)'],
        'alchemy_acupuncture.json': ['effects (starts with partial sentence from components bleed)'],
        'alchemy_poisons.json': ['effects (starts with partial sentence from components bleed)'],
    }
    for hinted_file, hints in truncated_hints.items():
        print(f" {hinted_file}:")
        for hint in hints:
            print(f" - {hint}")
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()