#!/usr/bin/env python3
"""
Fix OCR corruption artifacts in the cde-kungfus and cde-spells JSON packs.

Handles:
- OCR watermark artifact paragraph fragments
- Page-number paragraphs
- List continuations that ended up in a separate paragraph
  (spell components/effects/examples)
- Literal newline characters inside HTML strings (kung fu files)
- Bleed text from adjacent PDF columns
- Wrong content in style fields

NOTE(review): the HTML tag literals inside the patterns of this module
(``<p>``, ``</p>``, ``<br>``, ``</li></ul>``) were reconstructed after the
markup had been lost from the file.  Confirm them against version control
before running the script on real pack data.
"""

import json
import re
import os
from pathlib import Path

PACKS_DIR = Path(__file__).parent.parent / "packs-src"

# ─────────────────────────────────────────
# Patterns for OCR artifact <p> blocks
# ─────────────────────────────────────────

# These are standalone <p> tags that contain ONLY artifact/junk text.
# NOTE(review): opening/closing tag assumed to be a bare <p>...</p> — confirm.
ARTIFACT_PARA_RE = re.compile(
    r'<p>\s*(?:'
    r'\d{2,3}'                          # page numbers: 272, 302, 303 …
    r'|s\b'                             # single letter 's'
    r'|ue\b'
    r'|o\b'
    r'|de\b'
    r'|怪'                              # Chinese character used as bullet marker leftover
    r'|niq\s+e'
    r'|hr\s+ng'
    r'|s\s+c\s+ra[^<]{0,30}'
    r'|le\s+l[\'\u2019]ét[^<]{0,30}'
    r'|les\s+chroniqu[^<]{0,30}'
    r'|de\s+l[\'\u2019]ét[^<]{0,30}'
    r'|les\s+chroniques'
    r'|de\s+l[\'\u2019]étrange'
    r')\s*</p>',
    re.IGNORECASE
)

# OCR artifact text that can appear inline inside a <p> (before </p>).
# NOTE(review): look-ahead target assumed to be </p> — confirm.
INLINE_ARTIFACT_RE = re.compile(
    r'\s+(?:'
    r's\s+c\s+ra\s+le\s+l[\'\u2019]ét[^<]*'
    r'|les\s+chroniqu[^<]*'
    r'|niq\s+e[^<]*'
    r'|hr\s+ng[^<]*'
    r'|de\s+l[\'\u2019]ét[^<]*'
    r'|de\s+l[\'\u2019]étrange[^<]*'
    r')(?=</p>)',
    re.IGNORECASE
)

# Trailing page numbers at the end of a text node inside <p>.
# Any whitespace-separated 2–3 digit number directly before </p> is removed.
TRAILING_PAGENO_RE = re.compile(r'\s+\d{2,3}\s*(?=</p>)')


def strip_artifact_paragraphs(html: str) -> str:
    """Remove standalone <p> blocks that contain only OCR artifact text.

    The substitution is repeated until the string stops changing.
    """
    prev = None
    while prev != html:
        prev = html
        html = ARTIFACT_PARA_RE.sub('', html)
    return html


def strip_inline_artifacts(html: str) -> str:
    """Remove inline OCR artifact text and trailing page numbers inside <p>...</p>."""
    html = INLINE_ARTIFACT_RE.sub('', html)
    html = TRAILING_PAGENO_RE.sub('', html)
    return html


def fix_ul_continuation(html: str) -> str:
    """
    Fold a plain-text paragraph that directly follows a list back into the
    last list item, as a <br>-separated continuation.

    Repeats until stable (handles chains of continuations).

    NOTE(review): only the <p>...</p> and <br> parts of this pattern were
    recoverable with confidence; the surrounding ``</li></ul>`` is an
    assumption based on the function's purpose — confirm.
    """
    prev = None
    while prev != html:
        prev = html
        html = re.sub(
            r'</li></ul>\s*<p>([^<]+)</p>',
            r'<br>\1</li></ul>',
            html
        )
    return html


def strip_trailing_bleed_markers(html: str) -> str:
    """
    Strip known bleed-start markers and everything after them.

    These appear when the PDF's adjacent column/next section intrudes at the
    end of a field.  Every pattern ends in ``.*`` and is applied with DOTALL,
    so the rest of the string (closing tags included) is dropped once a
    marker is found.
    """
    # NOTE(review): the leading <p> / trailing </p> in the first six markers
    # are reconstructed — confirm.
    markers = [
        # "la magie" header that separates spell sections
        r'<p>\s*la\s+magie\s*</p>.*',
        # section markers in Caps
        r'<p>\s*DOMINER[^<]+</p>.*',
        # next-spell metadata bleeds in (怪 icon = bullet point)
        r'<p>\s*怪\s+Temps\s+de\s+réalisation.*',
        r'<p>\s*怪\s+Élément.*',
        r'<p>\s*怪\s+Hei\s*:.*',
        r'<p>\s*Effets\s*:\s*</p>.*',
        # next kung fu item header bleed; each name must be directly followed
        # by "Orientation :" for the marker to fire
        r'(?:Les\s+Trois\s+Joyaux|Double-peau|La\s+Lance\s+du|Le\s+Faite\s+Suprême|Le\s+Seot\s+Gaau|Les\s+Mille\s+Frelons|Les\s+Poignards|Wing\s+Chun|Hung\s+Gar|Jeet\s+Kune)\s+Orientation\s*:.*',
        r'de\s+de\s+Tigre\s+la\s+Bâton.*',
        r'loyale\s+du\s+Lance\s+Général\s+Yue\s+Fei.*',
    ]
    for marker in markers:
        html = re.sub(marker, '', html, flags=re.DOTALL | re.IGNORECASE)
    return html


def clean_html_field(html: str) -> str:
    """Apply all cleanup steps to an HTML field value.

    Falsy input (``''`` or ``None``) is returned unchanged; otherwise the
    cleaned string is returned with surrounding whitespace stripped.
    """
    if not html:
        return html
    html = strip_artifact_paragraphs(html)
    html = strip_inline_artifacts(html)
    html = fix_ul_continuation(html)
    html = strip_trailing_bleed_markers(html)
    # Final pass to catch any artifact paragraphs revealed by previous removals
    html = strip_artifact_paragraphs(html)
    html = strip_inline_artifacts(html)
    return html.strip()


# ─────────────────────────────────────────
# Kung Fu specific fixups
# ─────────────────────────────────────────

# OCR artifacts + bleed that appear inline in technique <p> text.
# NOTE(review): look-ahead target assumed to be </p> — confirm.
KUNGFU_INLINE_BLEED_RE = re.compile(
    r'\s+(?:'
    r'de\s+l[\'\u2019]étrange\s+Techniques[^<]*'
    r'|Techniques\s+s[^<]*'             # "Techniques s\n\n\nhr ng..." separator garbage
    r'|Style\s+Le\s+d[^<]*'             # "Style Le déséquilibre..." bleed
    r'|niq\s+e[^<]*'
    r'|hr\s+ng[^<]*'
    r'|s\s+c\s+ra[^<]*'
    r'|le\s+l[\'\u2019]ét[^<]*'
    r')(?=</p>)',
    re.IGNORECASE
)

# "Orientation : ..." headers from the next item in technique fields.
# NOTE(review): look-ahead target assumed to be </p> — confirm.
TECHNIQUE_ITEM_BLEED_RE = re.compile(
    r'\s+(?:Orientation|Aspect|Compétence|Spécialité)\s*:[^<]*'
    r'(?:Orientation|Aspect|Compétence|Spécialité)[^<]*(?=</p>)',
    re.IGNORECASE
)


def clean_kungfu_technique(html: str) -> str:
    """Clean a technique HTML field.

    Newlines are flattened to spaces before the generic HTML cleanup runs,
    then the kung-fu-specific inline bleed patterns are removed.
    """
    if not html:
        return html
    # Remove literal newlines (PDF extraction artifact)
    html = html.replace('\n', ' ')
    # Collapse multiple spaces
    html = re.sub(r' +', ' ', html)
    html = clean_html_field(html)
    html = KUNGFU_INLINE_BLEED_RE.sub('', html)
    html = TECHNIQUE_ITEM_BLEED_RE.sub('', html)
    return html.strip()


# Plain-text artifact pattern (for style fields which are not HTML)
PLAINTEXT_ARTIFACT_RE = re.compile(
    r'\s+(?:s\s+)?ue\s+niq\s+e\s+o\s+hr\s+ng\s+s\s+c\s+ra\s+le\s+l[\'\u2019]ét\s+de.*$',
    re.DOTALL | re.IGNORECASE
)
# Also match partial artifact runs at end of plain text
PLAINTEXT_ARTIFACT_RE2 = re.compile(
    r'\s+s\s+ue\s+niq\s+e.*$',
    re.DOTALL | re.IGNORECASE
)


def clean_kungfu_style(style: str, filename: str) -> str:
    """Clean the plain-text style field.

    Handles the 'wrong content' case for la_boxe_de_livresse: when *filename*
    contains ``la_boxe_de_livresse`` and the text mentions Wing Chun, the
    field is cleared.  Otherwise trailing bleed and OCR artifact runs are
    stripped and the remaining text is returned.
    """
    if not style:
        return style

    # la_boxe_de_livresse has Wing Chun's style text — must be cleared
    if 'la_boxe_de_livresse' in filename:
        if 'Wing Chun' in style or 'système de combat' in style:
            return ''

    # Strip trailing " Techniques" (common bleed marker at end of style)
    style = re.sub(r'\s+Techniques\s*$', '', style).strip()

    # Strip plain-text OCR artifact sequences (e.g. "s ue niq e o hr ng ...")
    style = PLAINTEXT_ARTIFACT_RE.sub('', style).strip()
    style = PLAINTEXT_ARTIFACT_RE2.sub('', style).strip()

    # Strip bleed from next item (plain text): "Le Gun-fu Orientation : ..."
    # The "-fu" suffix is optional so that both "Le Gun Orientation" and the
    # hyphenated "Le Gun-fu Orientation" header are recognised; without it
    # the hyphenated form could never match.
    style = re.sub(
        r'\s+(?:Le\s+Gun(?:-fu)?|Les\s+Trois\s+Joyaux|Double-peau|Wing\s+Chun|Hung\s+Gar|Le\s+Seot)\s+Orientation\s*:.*$',
        '',
        style,
        flags=re.DOTALL | re.IGNORECASE
    ).strip()

    return style


def _write_json(path: Path, data: dict) -> None:
    """Write *data* to *path* as UTF-8 JSON (2-space indent, trailing newline)."""
    path.write_text(
        json.dumps(data, ensure_ascii=False, indent=2) + '\n',
        encoding='utf-8'
    )


def process_kungfu_file(path: Path) -> dict:
    """Process a kung fu JSON file; return a dict of changes made.

    The file is rewritten in place only when at least one field changed.
    The return value maps ``path.name`` to a list of change descriptions
    (empty when nothing was modified).
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data['system']
    changes = []

    for field in ('description', 'style'):
        original = system.get(field, '')
        if not original:
            continue
        # 'style' is plain text; 'description' is HTML
        if field == 'style':
            cleaned = clean_kungfu_style(original, path.name)
        else:
            cleaned = clean_html_field(original)
        if cleaned != original:
            system[field] = cleaned
            changes.append(f'{field}: modified (len {len(original)}→{len(cleaned)})')

    for tkey, tval in system.get('techniques', {}).items():
        for subfield in ('technique',):
            original = tval.get(subfield, '')
            if not original:
                continue
            cleaned = clean_kungfu_technique(original)
            if cleaned != original:
                tval[subfield] = cleaned
                changes.append(f'{tkey}.{subfield}: modified (len {len(original)}→{len(cleaned)})')

    if changes:
        _write_json(path, data)

    return {path.name: changes}


# ─────────────────────────────────────────
# Spell specific fixups
# ─────────────────────────────────────────

def process_spell_file(path: Path) -> dict:
    """Process a spell JSON file; return a dict of changes made.

    The file is rewritten in place only when at least one field changed.
    The return value maps ``path.name`` to a list of change descriptions
    (empty when nothing was modified).
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data['system']
    changes = []

    for field in ('description', 'effects', 'examples', 'components'):
        original = system.get(field, '')
        if not original:
            continue
        cleaned = clean_html_field(original)
        if cleaned != original:
            system[field] = cleaned
            changes.append(f'{field}: modified (len {len(original)}→{len(cleaned)})')

    if changes:
        _write_json(path, data)

    return {path.name: changes}


# ─────────────────────────────────────────
# Main
# ─────────────────────────────────────────
def main():
    """Clean every kung fu and spell pack file, then print a report.

    Each ``*.json`` file under ``PACKS_DIR / 'cde-kungfus'`` and
    ``PACKS_DIR / 'cde-spells'`` is processed in sorted order.  Per-file
    changes are printed as they happen, followed by a summary and a fixed
    list of fields that still need manual restoration.
    """
    all_changes = {}

    def run_pack(folder, handler):
        # Process one pack directory and echo the per-file outcome.
        for json_path in sorted((PACKS_DIR / folder).glob('*.json')):
            outcome = handler(json_path)
            all_changes.update(outcome)
            name = list(outcome)[0]
            notes = outcome[name]
            if not notes:
                print(f" {name}: no changes")
                continue
            print(f" {name}:")
            for note in notes:
                print(f" {note}")

    print("=== Processing kung fu files ===")
    run_pack('cde-kungfus', process_kungfu_file)

    print("\n=== Processing spell files ===")
    run_pack('cde-spells', process_spell_file)

    # Summary
    modified_count = sum(1 for notes in all_changes.values() if notes)
    untouched_count = len(all_changes) - modified_count
    print("\n=== Summary ===")
    print(f" Files modified: {modified_count}")
    print(f" Files unchanged: {untouched_count}")
    print()
    print("NOTE: Truncated sentences (where content was cut mid-word before bleed)")
    print("still need manual restoration from the PDF rulebook.")
    print("Fields likely still truncated:")

    # Hand-maintained list of fields this script cannot repair automatically.
    truncated_hints = {
        'wing_chun_wing_ceon.json': [
            'description (ends mid-sentence)',
            'technique1 (content from wrong item — needs full replacement)',
        ],
        'la_boxe_de_livresse_zeoi_kyun.json': [
            'style (cleared — needs correct Zeoi Kyun style text)',
            'technique1,2,3 (truncated)',
        ],
        'le_double_peau.json': ['technique1, technique2 (truncated)'],
        'les_trois_joyaux.json': ['technique1,2,3 (truncated)'],
        'le_faite_supreme_de_lepee_taaigik_gim.json': ['technique1,2 (truncated)'],
        'la_paume_des_huit_trigrammes_baatgwaa_zoeng.json': ['technique1,2,3 (truncated)'],
        'alchemy_acupuncture.json': [
            'effects (starts with partial sentence from components bleed)',
        ],
        'alchemy_poisons.json': [
            'effects (starts with partial sentence from components bleed)',
        ],
    }
    for fname, hints in truncated_hints.items():
        print(f" {fname}:")
        for hint in hints:
            print(f" - {hint}")


if __name__ == '__main__':
    main()