fvtt-chroniques-de-l-etrange/tools/fix_corrupted_packs.py

#!/usr/bin/env python3
"""
Fix OCR corruption artifacts in cde-kungfus and cde-spells JSON packs.

Handles:
  - OCR watermark artifact <p> fragments
  - Page number <p> tags
  - </li></ul><p>continuation</p> structure in spell components/effects/examples
  - \n characters inside HTML strings (kung fu files)
  - Bleed text from adjacent PDF columns
  - Wrong content in style fields
"""

import json
import re
import os
from pathlib import Path

# Directory holding the unpacked compendium sources: "packs-src", located in
# the parent of the directory that contains this script.  main() looks for the
# 'cde-kungfus' and 'cde-spells' sub-directories underneath it.
PACKS_DIR = Path(__file__).parent.parent / "packs-src"

# ─────────────────────────────────────────
# Patterns for OCR artifact <p> blocks
# ─────────────────────────────────────────
# Bodies of standalone <p> elements that are nothing but OCR residue.  A
# paragraph is matched only when one of these fragments is its *entire*
# content (surrounding whitespace aside).  Matching is case-insensitive.
_ARTIFACT_PARA_BODIES = (
    r'\d{2,3}',                      # page numbers: 272, 302, 303 …
    r's\b',                          # single letter 's'
    r'ue\b',
    r'o\b',
    r'de\b',
    r'怪',                           # Chinese character used as bullet marker leftover
    r'niq\s+e',
    r'hr\s+ng',
    r's\s+c\s+ra[^<]{0,30}',
    r'le\s+l[\'\u2019]ét[^<]{0,30}',
    r'les\s+chroniqu[^<]{0,30}',
    r'de\s+l[\'\u2019]ét[^<]{0,30}',
    r'les\s+chroniques',
    r'de\s+l[\'\u2019]étrange',
)

# <p> + optional whitespace + one residue fragment + optional whitespace + </p>
ARTIFACT_PARA_RE = re.compile(
    r'<p>\s*(?:' + '|'.join(_ARTIFACT_PARA_BODIES) + r')\s*</p>',
    flags=re.IGNORECASE,
)

# OCR residue that trails the real text of a paragraph.  Each fragment runs up
# to the next '<', and the whole match (leading whitespace included) must be
# immediately followed by </p>.
# NOTE(review): the "de l'ét…" fragment also matches ordinary French text that
# starts the same way (e.g. "de l'été …") — confirm this is acceptable for the
# packs being cleaned.
_INLINE_ARTIFACT_TAILS = (
    r's\s+c\s+ra\s+le\s+l[\'\u2019]ét[^<]*',
    r'les\s+chroniqu[^<]*',
    r'niq\s+e[^<]*',
    r'hr\s+ng[^<]*',
    r'de\s+l[\'\u2019]ét[^<]*',
    r'de\s+l[\'\u2019]étrange[^<]*',
)

INLINE_ARTIFACT_RE = re.compile(
    r'\s+(?:' + '|'.join(_INLINE_ARTIFACT_TAILS) + r')(?=</p>)',
    flags=re.IGNORECASE,
)

# A 2- or 3-digit number (with its leading whitespace) sitting right before </p>
TRAILING_PAGENO_RE = re.compile(r'\s+\d{2,3}\s*(?=</p>)')


def strip_artifact_paragraphs(html: str) -> str:
    """Delete every <p>…</p> block whose whole content is OCR residue.

    ARTIFACT_PARA_RE is applied repeatedly; the loop stops as soon as a pass
    leaves the text exactly as it found it, and that text is returned.
    """
    before = None
    after = html
    while after != before:
        # Remember the input of this pass, then run one more substitution.
        before, after = after, ARTIFACT_PARA_RE.sub('', after)
    return after


def strip_inline_artifacts(html: str) -> str:
    """Trim OCR residue from the tail of <p>…</p> contents.

    Two substitutions run in order: first the known residue fragments
    (INLINE_ARTIFACT_RE), then any 2-3 digit number left dangling right
    before a closing </p> (TRAILING_PAGENO_RE).
    """
    for residue_re in (INLINE_ARTIFACT_RE, TRAILING_PAGENO_RE):
        html = residue_re.sub('', html)
    return html


# A closed list immediately followed (whitespace allowed) by a tag-free paragraph
_UL_CONTINUATION_RE = re.compile(r'</li></ul>\s*<p>([^<]+)</p>')


def fix_ul_continuation(html: str) -> str:
    """
    Fold a paragraph that follows a list back into the list's last item.

        </li></ul><p>continuation</p>   →   <br>continuation</li></ul>

    Only paragraphs without nested tags qualify.  The rewrite is repeated
    until nothing changes, so a run of several continuation paragraphs ends
    up inside the same <li>, each introduced by a <br>.
    """
    def _fold_into_last_item(match):
        # Same text the template r'<br>\1</li></ul>' would produce.
        return '<br>' + match.group(1) + '</li></ul>'

    settled = None
    while settled != html:
        settled = html
        html = _UL_CONTINUATION_RE.sub(_fold_into_last_item, settled)
    return html


# Start-of-bleed patterns.  Every pattern ends in '.*' and is compiled with
# DOTALL, so a match always extends to the end of the field.
_BLEED_MARKER_RES = tuple(
    re.compile(pattern, flags=re.DOTALL | re.IGNORECASE)
    for pattern in (
        # "la magie" header that separates spell sections
        r'<p>\s*la\s+magie\s*</p>.*',
        # section markers in Caps
        r'<p>\s*DOMINER[^<]+</p>.*',
        # next-spell metadata bleeds in (怪 icon = bullet point)
        r'<p>\s*怪\s+Temps\s+de\s+réalisation.*',
        r'<p>\s*怪\s+Élément.*',
        r'<p>\s*怪\s+Hei\s*:.*',
        r'<p>\s*Effets\s*:\s*</p>.*',
        # next kung fu item header bleed
        r'(?:Les\s+Trois\s+Joyaux|Double-peau|La\s+Lance\s+du|Le\s+Faite\s+Suprême|Le\s+Seot\s+Gaau|Les\s+Mille\s+Frelons|Les\s+Poignards|Wing\s+Chun|Hung\s+Gar|Jeet\s+Kune)\s+Orientation\s*:.*',
        r'de\s+de\s+Tigre\s+la\s+Bâton.*',
        r'loyale\s+du\s+Lance\s+Général\s+Yue\s+Fei.*',
    )
)

# Elements that never have a closing tag; they must not be counted as "open".
_VOID_TAGS = frozenset({
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'source', 'track', 'wbr',
})

# Any opening or closing tag: group 1 is '/' for a closing tag, group 2 the name.
_TAG_RE = re.compile(r'<(/?)([a-zA-Z][a-zA-Z0-9]*)[^<>]*>')


def _rebalance_truncated_html(html: str) -> str:
    """Repair *html* after its tail was cut off in the middle of an element.

    Elements that were opened right at the cut (so they have no content left)
    are dropped; every other element still open gets its closing tag appended.
    """
    html = html.rstrip()
    open_tags = []  # (name, start offset, end offset) of elements not yet closed
    for match in _TAG_RE.finditer(html):
        name = match.group(2).lower()
        if name in _VOID_TAGS or match.group(0).endswith('/>'):
            continue
        if not match.group(1):
            open_tags.append((name, match.start(), match.end()))
        elif any(entry[0] == name for entry in open_tags):
            # Pop back to the matching opener (anything opened inside it and
            # never closed is implicitly closed by this tag).
            while open_tags.pop()[0] != name:
                pass

    # An opener that is the very last thing in the text encloses nothing.
    while open_tags and open_tags[-1][2] == len(html):
        html = html[:open_tags.pop()[1]].rstrip()

    return html + ''.join(f'</{name}>' for name, _, _ in reversed(open_tags))


def strip_trailing_bleed_markers(html: str) -> str:
    """
    Strip known bleed-start markers and everything after them.

    These appear when the PDF's adjacent column/next section intrudes
    at the end of a field.

    Several markers begin in the middle of a text node, so cutting from the
    marker to the end of the field also removes the closing tags of the
    element(s) the marker sat in.  When a cut happened, the remaining markup
    is therefore re-balanced: trailing whitespace is removed, elements left
    empty by the cut are dropped, and elements still open are closed.  This
    keeps the result well-formed and lets later passes that look for a
    closing </p> work on the shortened paragraph.

    Args:
        html: HTML fragment to clean.

    Returns:
        ``html`` unchanged (same string) when no marker matches; otherwise
        the text before the first marker, with its tags re-balanced.
    """
    truncated = html
    for marker_re in _BLEED_MARKER_RES:
        truncated = marker_re.sub('', truncated)

    if truncated == html:
        # Nothing was cut: leave the input exactly as it was.
        return html
    return _rebalance_truncated_html(truncated)


def clean_html_field(html: str) -> str:
    """Run the complete generic cleanup over one HTML field value.

    Falsy input (empty string, None) is returned untouched.  Otherwise the
    steps below run in order and the result is stripped of leading/trailing
    whitespace.
    """
    if not html:
        return html

    pipeline = (
        strip_artifact_paragraphs,
        strip_inline_artifacts,
        fix_ul_continuation,
        strip_trailing_bleed_markers,
        # Second round: catch artifacts revealed by the removals above.
        strip_artifact_paragraphs,
        strip_inline_artifacts,
    )
    for step in pipeline:
        html = step(html)
    return html.strip()


# ─────────────────────────────────────────
# Kung Fu specific fixups
# ─────────────────────────────────────────

# Residue / bleed fragments found at the tail of technique paragraphs.  As a
# whole, a match is: whitespace, one fragment running up to the next '<', and
# then a closing </p> right behind it (not consumed).
_KUNGFU_BLEED_TAILS = (
    r'de\s+l[\'\u2019]étrange\s+Techniques[^<]*',
    r'Techniques\s+s[^<]*',          # "Techniques  s\n\n\nhr ng..." separator garbage
    r'Style\s+Le\s+d[^<]*',          # "Style Le déséquilibre..." bleed
    r'niq\s+e[^<]*',
    r'hr\s+ng[^<]*',
    r's\s+c\s+ra[^<]*',
    r'le\s+l[\'\u2019]ét[^<]*',
)

KUNGFU_INLINE_BLEED_RE = re.compile(
    r'\s+(?:' + '|'.join(_KUNGFU_BLEED_TAILS) + r')(?=</p>)',
    flags=re.IGNORECASE,
)

# Header labels of a kung fu item ("Orientation : …", "Aspect : …", …)
_ITEM_HEADER_LABEL = r'(?:Orientation|Aspect|Compétence|Spécialité)'

# The header of the *next* item bleeding into a technique paragraph: a label
# followed by ':', then at least one further label, all before the closing </p>.
TECHNIQUE_ITEM_BLEED_RE = re.compile(
    r'\s+' + _ITEM_HEADER_LABEL + r'\s*:[^<]*'
    + _ITEM_HEADER_LABEL + r'[^<]*(?=</p>)',
    flags=re.IGNORECASE,
)

def clean_kungfu_technique(html: str) -> str:
    """Clean the HTML of a single kung fu technique.

    Falsy input is returned untouched.  Otherwise line breaks are turned into
    spaces, runs of spaces are collapsed, the generic field cleanup is
    applied, and finally the kung-fu-specific bleed patterns are removed.
    """
    if not html:
        return html

    # Literal newlines are a PDF extraction artifact; flatten them and squeeze
    # the runs of spaces this leaves behind.
    flattened = re.sub(r'  +', ' ', html.replace('\n', ' '))
    cleaned = clean_html_field(flattened)
    for bleed_re in (KUNGFU_INLINE_BLEED_RE, TECHNIQUE_ITEM_BLEED_RE):
        cleaned = bleed_re.sub('', cleaned)
    return cleaned.strip()


# Plain-text artifact pattern (for style fields which are not HTML): the full
# OCR residue run, with everything after it up to the end of the text.
PLAINTEXT_ARTIFACT_RE = re.compile(
    r'\s+(?:s\s+)?ue\s+niq\s+e\s+o\s+hr\s+ng\s+s\s+c\s+ra\s+le\s+l[\'\u2019]ét\s+de.*$',
    re.DOTALL | re.IGNORECASE
)
# Also match partial artifact runs at end of plain text
PLAINTEXT_ARTIFACT_RE2 = re.compile(
    r'\s+s\s+ue\s+niq\s+e.*$',
    re.DOTALL | re.IGNORECASE
)

# Trailing " Techniques" (common bleed marker at end of style)
_STYLE_TRAILING_TECHNIQUES_RE = re.compile(r'\s+Techniques\s*$')

# Header of the next item bleeding in (plain text): "Le Gun-fu Orientation : ..."
_STYLE_NEXT_ITEM_BLEED_RE = re.compile(
    r'\s+(?:Le\s+Gun|Les\s+Trois\s+Joyaux|Double-peau|Wing\s+Chun|Hung\s+Gar|Le\s+Seot)\s+Orientation\s*:.*$',
    flags=re.DOTALL | re.IGNORECASE
)


def clean_kungfu_style(style: str, filename: str) -> str:
    """Clean the plain-text style field of a kung fu item.

    Handles the 'wrong content' case for la_boxe_de_livresse, then strips a
    trailing "Techniques" marker, OCR residue runs and the header of the next
    item.

    The stripping steps are repeated until the text stops changing.  A single
    pass is not enough: removing a residue run or a next-item header can
    expose a "Techniques" marker that was not at the end of the text when the
    first step looked for it (e.g. "… Techniques s ue niq e …").

    Args:
        style: Raw style text; falsy values are returned untouched.
        filename: Name of the JSON file the value comes from, used only to
            recognise the la_boxe_de_livresse special case.

    Returns:
        The cleaned text, or '' when the la_boxe_de_livresse file carries
        Wing Chun's style text.
    """
    if not style:
        return style

    # la_boxe_de_livresse has Wing Chun's style text — must be cleared
    if 'la_boxe_de_livresse' in filename:
        if 'Wing Chun' in style or 'système de combat' in style:
            return ''

    previous = None
    while style != previous:
        previous = style
        style = _STYLE_TRAILING_TECHNIQUES_RE.sub('', style).strip()
        # Plain-text OCR artifact sequences (e.g. "s ue niq e o hr ng ...")
        style = PLAINTEXT_ARTIFACT_RE.sub('', style).strip()
        style = PLAINTEXT_ARTIFACT_RE2.sub('', style).strip()
        style = _STYLE_NEXT_ITEM_BLEED_RE.sub('', style).strip()

    return style


def process_kungfu_file(path: Path) -> dict:
    """Clean one kung fu JSON file in place and report what changed.

    The fields ``system.description`` (HTML), ``system.style`` (plain text)
    and ``system.techniques.<key>.technique`` (HTML) are cleaned.  The file is
    rewritten (UTF-8, 2-space indent, trailing newline) only when at least
    one value changed.

    Entries that do not have the expected shape are skipped instead of
    aborting the whole run: a document without a ``system`` object, a
    ``techniques`` value that is not an object, a technique that is not an
    object, and any field whose value is not a string.

    Args:
        path: Location of the JSON file.

    Returns:
        ``{path.name: changes}`` where ``changes`` is a list of
        human-readable descriptions, empty when nothing was modified.

    Raises:
        OSError: if the file cannot be read or written.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data.get('system') if isinstance(data, dict) else None
    if not isinstance(system, dict):
        # Not an item document; nothing to clean.
        return {path.name: []}

    changes = []

    for field in ('description', 'style'):
        original = system.get(field, '')
        if not original or not isinstance(original, str):
            continue
        if field == 'style':
            cleaned = clean_kungfu_style(original, path.name)
        else:
            cleaned = clean_html_field(original)
        if cleaned != original:
            system[field] = cleaned
            changes.append(f'{field}: modified (len {len(original)}→{len(cleaned)})')

    techniques = system.get('techniques')
    if isinstance(techniques, dict):
        for tkey, tval in techniques.items():
            if not isinstance(tval, dict):
                continue
            original = tval.get('technique', '')
            if not original or not isinstance(original, str):
                continue
            cleaned = clean_kungfu_technique(original)
            if cleaned != original:
                tval['technique'] = cleaned
                changes.append(f'{tkey}.technique: modified (len {len(original)}→{len(cleaned)})')

    if changes:
        path.write_text(
            json.dumps(data, ensure_ascii=False, indent=2) + '\n',
            encoding='utf-8'
        )

    return {path.name: changes}


# ─────────────────────────────────────────
# Spell specific fixups
# ─────────────────────────────────────────

def process_spell_file(path: Path) -> dict:
    """Clean one spell JSON file in place and report what changed.

    The HTML fields ``description``, ``effects``, ``examples`` and
    ``components`` of the ``system`` object are cleaned.  The file is
    rewritten (UTF-8, 2-space indent, trailing newline) only when at least
    one value changed.

    A document without a ``system`` object and fields whose value is not a
    string are skipped instead of aborting the whole run.

    Args:
        path: Location of the JSON file.

    Returns:
        ``{path.name: changes}`` where ``changes`` is a list of
        human-readable descriptions, empty when nothing was modified.

    Raises:
        OSError: if the file cannot be read or written.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data.get('system') if isinstance(data, dict) else None
    if not isinstance(system, dict):
        # Not an item document; nothing to clean.
        return {path.name: []}

    changes = []

    for field in ('description', 'effects', 'examples', 'components'):
        original = system.get(field, '')
        if not original or not isinstance(original, str):
            continue
        cleaned = clean_html_field(original)
        if cleaned != original:
            system[field] = cleaned
            changes.append(f'{field}: modified (len {len(original)}→{len(cleaned)})')

    if changes:
        path.write_text(
            json.dumps(data, ensure_ascii=False, indent=2) + '\n',
            encoding='utf-8'
        )

    return {path.name: changes}


# ─────────────────────────────────────────
# Main
# ─────────────────────────────────────────

def main():
    """Clean every kung fu and spell JSON file under PACKS_DIR and print a report.

    For each pack the files are visited in sorted order; the changes made to
    each file (or "no changes") are printed, followed by a summary count and
    a fixed list of fields that are known to need manual restoration.
    """
    all_changes = {}

    # (banner, pack sub-directory, per-file processor)
    jobs = (
        ("=== Processing kung fu files ===", 'cde-kungfus', process_kungfu_file),
        ("\n=== Processing spell files ===", 'cde-spells', process_spell_file),
    )
    for banner, pack_name, process_file in jobs:
        print(banner)
        for json_path in sorted((PACKS_DIR / pack_name).glob('*.json')):
            report = process_file(json_path)
            all_changes.update(report)
            # Each report holds exactly one entry: file name -> list of changes.
            fname = next(iter(report))
            file_changes = report[fname]
            if not file_changes:
                print(f"  {fname}: no changes")
                continue
            print(f"  {fname}:")
            for change in file_changes:
                print(f"    {change}")

    # Summary
    modified_count = sum(1 for file_changes in all_changes.values() if file_changes)
    unchanged_count = len(all_changes) - modified_count
    print("\n=== Summary ===")
    print(f"  Files modified: {modified_count}")
    print(f"  Files unchanged: {unchanged_count}")
    print()
    print("NOTE: Truncated sentences (where content was cut mid-word before bleed)")
    print("still need manual restoration from the PDF rulebook.")
    print("Fields likely still truncated:")
    truncated_hints = {
        'wing_chun_wing_ceon.json': [
            'description (ends mid-sentence)',
            'technique1 (content from wrong item — needs full replacement)',
        ],
        'la_boxe_de_livresse_zeoi_kyun.json': [
            'style (cleared — needs correct Zeoi Kyun style text)',
            'technique1,2,3 (truncated)',
        ],
        'le_double_peau.json': ['technique1, technique2 (truncated)'],
        'les_trois_joyaux.json': ['technique1,2,3 (truncated)'],
        'le_faite_supreme_de_lepee_taaigik_gim.json': ['technique1,2 (truncated)'],
        'la_paume_des_huit_trigrammes_baatgwaa_zoeng.json': ['technique1,2,3 (truncated)'],
        'alchemy_acupuncture.json': ['effects (starts with partial sentence from components bleed)'],
        'alchemy_poisons.json': ['effects (starts with partial sentence from components bleed)'],
    }
    for fname, hints in truncated_hints.items():
        print(f"  {fname}:")
        for hint in hints:
            print(f"    - {hint}")


if __name__ == '__main__':
    main()