339 lines
12 KiB
Python
339 lines
12 KiB
Python
#!/usr/bin/env python3
"""
Fix OCR corruption artifacts in the cde-kungfus and cde-spells JSON packs.

Handles:
- OCR watermark artifact <p> fragments
- Page number <p> tags
- </li></ul><p>continuation</p> structure in spell components/effects/examples
- Literal newline characters inside HTML strings (kung fu files)
- Bleed text from adjacent PDF columns
- Wrong content in style fields
"""
|
|
|
|
import json
|
|
import re
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Pack sources live in "packs-src", a sibling of the directory that contains
# this script (script dir -> its parent -> "packs-src").
PACKS_DIR = Path(__file__).parent.parent / "packs-src"
|
|
|
|
# ─────────────────────────────────────────
|
|
# Patterns for OCR artifact <p> blocks
|
|
# ─────────────────────────────────────────
|
|
# Standalone <p> tags whose ENTIRE content is artifact/junk text.
# The match always runs from "<p>" to the next "</p>" and can never cross
# another tag, because every open-ended alternative uses [^<].
#
# The former trailing alternatives `les\s+chroniques` and
# `de\s+l['’]étrange` were unreachable: the bounded `[^<]{0,30}` forms below
# already accept exactly those paragraphs, so they have been dropped.
ARTIFACT_PARA_RE = re.compile(
    r'<p>\s*(?:'
    r'\d{2,3}'                          # page numbers: 272, 302, 303 …
    r'|s\b'                             # single letter 's'
    r'|ue\b'
    r'|o\b'
    r'|de\b'
    r'|怪'                              # Chinese character used as bullet marker leftover
    r'|niq\s+e'
    r'|hr\s+ng'
    r'|s\s+c\s+ra[^<]{0,30}'
    r'|le\s+l[\'\u2019]ét[^<]{0,30}'
    r'|les\s+chroniqu[^<]{0,30}'        # also covers "les chroniques"
    r'|de\s+l[\'\u2019]ét[^<]{0,30}'    # also covers "de l'étrange"
    r')\s*</p>',
    re.IGNORECASE,
)
|
|
|
|
# OCR artifact text that trails real content inside a <p>: everything from
# the artifact start up to (but not including) the closing </p> is matched.
#
# The former last alternative `de\s+l['’]étrange[^<]*` was unreachable: the
# shorter `de\s+l['’]ét[^<]*` already matches the same text, so it is gone.
INLINE_ARTIFACT_RE = re.compile(
    r'\s+(?:'
    r's\s+c\s+ra\s+le\s+l[\'\u2019]ét[^<]*'
    r'|les\s+chroniqu[^<]*'
    r'|niq\s+e[^<]*'
    r'|hr\s+ng[^<]*'
    r'|de\s+l[\'\u2019]ét[^<]*'         # also covers "de l'étrange…"
    r')(?=</p>)',
    re.IGNORECASE,
)

# Trailing page numbers (2-3 digits) at the end of a text node inside <p>.
# NOTE(review): this also removes a legitimate 2-3 digit number that happens
# to end a paragraph — confirm that is acceptable for the pack data.
TRAILING_PAGENO_RE = re.compile(r'\s+\d{2,3}\s*(?=</p>)')
|
|
|
|
|
|
def strip_artifact_paragraphs(html: str) -> str:
    """Drop every standalone <p> block matched by ARTIFACT_PARA_RE.

    The substitution is re-applied until a pass leaves the string unchanged,
    and that stable string is returned.
    """
    while True:
        stripped = ARTIFACT_PARA_RE.sub('', html)
        if stripped == html:
            return stripped
        html = stripped
|
|
|
|
|
|
def strip_inline_artifacts(html: str) -> str:
    """Remove artifact tails, then trailing page numbers, from <p> content.

    INLINE_ARTIFACT_RE is applied first and TRAILING_PAGENO_RE second, each
    exactly once.
    """
    without_artifacts = INLINE_ARTIFACT_RE.sub('', html)
    return TRAILING_PAGENO_RE.sub('', without_artifacts)
|
|
|
|
|
|
def fix_ul_continuation(html: str) -> str:
    """
    Fold a paragraph that directly follows a list back into the last item.

    Rewrites  </li></ul><p>continuation</p>
          to  <br>continuation</li></ul>

    Only paragraphs without nested tags are folded. The rewrite is repeated
    until nothing matches any more, so a chain of continuation paragraphs
    ends up in the same list item.
    """
    continuation_re = re.compile(r'</li></ul>\s*<p>([^<]+)</p>')
    replaced = 1
    # Every substitution shortens the string, so "no match" and "string
    # unchanged" are the same stopping condition.
    while replaced:
        html, replaced = continuation_re.subn(r'<br>\1</li></ul>', html)
    return html
|
|
|
|
|
|
# Known bleed-start markers, compiled once at import time. Each pattern ends
# in ".*" and is compiled with DOTALL, so a match swallows everything from the
# marker to the end of the field.
# NOTE(review): IGNORECASE applies to every marker, so the "DOMINER" marker
# (described as a caps section marker) also matches a paragraph that merely
# starts with "Dominer …" — confirm that is intended.
# NOTE(review): the item-header alternatives must be followed directly by
# "Orientation :"; prefix-looking entries such as "La Lance du" or
# "Jeet Kune" will not match a longer title — verify against the pack data.
_BLEED_MARKER_RES = tuple(
    re.compile(pattern, re.DOTALL | re.IGNORECASE)
    for pattern in (
        # "la magie" header that separates spell sections
        r'<p>\s*la\s+magie\s*</p>.*',
        # section markers in Caps
        r'<p>\s*DOMINER[^<]+</p>.*',
        # next-spell metadata bleeds in (怪 icon = bullet point)
        r'<p>\s*怪\s+Temps\s+de\s+réalisation.*',
        r'<p>\s*怪\s+Élément.*',
        r'<p>\s*怪\s+Hei\s*:.*',
        r'<p>\s*Effets\s*:\s*</p>.*',
        # next kung fu item header bleed
        r'(?:Les\s+Trois\s+Joyaux|Double-peau|La\s+Lance\s+du|Le\s+Faite\s+Suprême|Le\s+Seot\s+Gaau|Les\s+Mille\s+Frelons|Les\s+Poignards|Wing\s+Chun|Hung\s+Gar|Jeet\s+Kune)\s+Orientation\s*:.*',
        r'de\s+de\s+Tigre\s+la\s+Bâton.*',
        r'loyale\s+du\s+Lance\s+Général\s+Yue\s+Fei.*',
    )
)


def _close_dangling_paragraph(html: str) -> str:
    """Re-balance a trailing <p> whose closing tag was cut off.

    If the last "<p>" comes after the last "</p>", the paragraph is either
    closed again (when it still has text) or removed (when it is empty).
    Anything else is returned untouched.
    """
    open_at = html.rfind('<p>')
    if open_at <= html.rfind('</p>'):
        return html
    if not html[open_at + len('<p>'):].strip():
        return html[:open_at]
    return html.rstrip() + '</p>'


def strip_trailing_bleed_markers(html: str) -> str:
    """
    Strip known bleed-start markers and everything after them.

    These appear when the PDF's adjacent column/next section intrudes
    at the end of a field.

    Markers that start inside a paragraph (the kung fu item headers) cut the
    paragraph before its closing tag. When that happens the dangling <p> is
    closed again, or dropped if nothing is left in it. This keeps the HTML
    balanced and lets the later passes in this file, which all anchor on
    "</p>", still see the end of the paragraph. Plain-text input and input
    in which no marker matched are returned exactly as before.

    Args:
        html: field content, HTML or plain text.

    Returns:
        The content truncated at the first occurrence of each marker.
    """
    truncated = False
    for marker_re in _BLEED_MARKER_RES:
        html, hits = marker_re.subn('', html)
        truncated = truncated or hits > 0
    # Only touch tag balance when this function itself removed something.
    return _close_dangling_paragraph(html) if truncated else html
|
|
|
|
|
|
def clean_html_field(html: str) -> str:
    """Run the full cleanup pipeline over one HTML field value.

    Falsy input (empty string, None) is returned untouched. Otherwise the
    steps run in a fixed order and the result is stripped of surrounding
    whitespace.
    """
    if not html:
        return html
    pipeline = (
        strip_artifact_paragraphs,
        strip_inline_artifacts,
        fix_ul_continuation,
        strip_trailing_bleed_markers,
        # Second round: earlier removals can expose further artifacts.
        strip_artifact_paragraphs,
        strip_inline_artifacts,
    )
    for step in pipeline:
        html = step(html)
    return html.strip()
|
|
|
|
|
|
# ─────────────────────────────────────────
|
|
# Kung Fu specific fixups
|
|
# ─────────────────────────────────────────
|
|
|
|
# OCR artifacts and bleed that trail the real text of a technique <p>.
# Each fragment runs up to the next "<"; the joined pattern only matches when
# that position is the closing </p>.
_KUNGFU_BLEED_FRAGMENTS = (
    r'de\s+l[\'\u2019]étrange\s+Techniques[^<]*',
    r'Techniques\s+s[^<]*',     # "Techniques s\n\n\nhr ng..." separator garbage
    r'Style\s+Le\s+d[^<]*',     # "Style Le déséquilibre..." bleed
    r'niq\s+e[^<]*',
    r'hr\s+ng[^<]*',
    r's\s+c\s+ra[^<]*',
    r'le\s+l[\'\u2019]ét[^<]*',
)
KUNGFU_INLINE_BLEED_RE = re.compile(
    r'\s+(?:' + '|'.join(_KUNGFU_BLEED_FRAGMENTS) + r')(?=</p>)',
    re.IGNORECASE,
)
|
|
|
|
# Header lines of the NEXT item that leak into a technique field, e.g.
# "Orientation : … Aspect : …". A labelled header ("Label :") must be followed
# by at least one more label before the closing </p> for the tail to match.
_ITEM_LABEL = r'(?:Orientation|Aspect|Compétence|Spécialité)'
TECHNIQUE_ITEM_BLEED_RE = re.compile(
    r'\s+' + _ITEM_LABEL + r'\s*:[^<]*' + _ITEM_LABEL + r'[^<]*(?=</p>)',
    re.IGNORECASE,
)
|
|
|
|
def clean_kungfu_technique(html: str) -> str:
    """Clean the HTML of a single kung fu technique.

    Falsy input is returned untouched. Otherwise: newlines become spaces,
    runs of spaces collapse to one, the generic HTML cleanup runs, and the
    kung-fu-specific bleed patterns are removed last.
    """
    if not html:
        return html
    # Literal newlines are a PDF extraction artifact; flatten them first.
    flattened = re.sub(r' +', ' ', html.replace('\n', ' '))
    cleaned = clean_html_field(flattened)
    for bleed_re in (KUNGFU_INLINE_BLEED_RE, TECHNIQUE_ITEM_BLEED_RE):
        cleaned = bleed_re.sub('', cleaned)
    return cleaned.strip()
|
|
|
|
|
|
|
|
# Plain-text artifact patterns (style fields are plain text, not HTML).
# Full watermark run, optionally led by a lone "s", through the end of text.
PLAINTEXT_ARTIFACT_RE = re.compile(
    r'\s+(?:s\s+)?ue\s+niq\s+e\s+o\s+hr\s+ng\s+s\s+c\s+ra\s+le\s+l[\'\u2019]ét\s+de.*$',
    re.DOTALL | re.IGNORECASE
)
# Partial artifact run at the end of plain text.
PLAINTEXT_ARTIFACT_RE2 = re.compile(
    r'\s+s\s+ue\s+niq\s+e.*$',
    re.DOTALL | re.IGNORECASE
)

# Header of the next item bleeding into the style text, e.g.
# "Le Gun-fu Orientation : ...". The optional "-fu" / " Gaau" suffixes let the
# pattern match the full item names; without them "Le Gun" and "Le Seot" had
# to be followed directly by "Orientation" and could not match those titles.
_STYLE_NEXT_ITEM_BLEED_RE = re.compile(
    r'\s+(?:Le\s+Gun(?:-fu)?|Les\s+Trois\s+Joyaux|Double-peau|Wing\s+Chun|Hung\s+Gar'
    r'|Le\s+Seot(?:\s+Gaau)?)\s+Orientation\s*:.*$',
    re.DOTALL | re.IGNORECASE
)


def clean_kungfu_style(style: str, filename: str) -> str:
    """Clean the plain-text style field of a kung fu item.

    Args:
        style: raw style text; falsy values are returned untouched.
        filename: name of the JSON file being processed, used to detect the
            la_boxe_de_livresse item that carries another item's style text.

    Returns:
        The style text with OCR artifact runs, next-item header bleed and a
        trailing "Techniques" marker removed. An empty string is returned for
        la_boxe_de_livresse when the text is recognisably Wing Chun's.
    """
    if not style:
        return style

    # la_boxe_de_livresse has Wing Chun's style text — must be cleared
    if 'la_boxe_de_livresse' in filename:
        if 'Wing Chun' in style or 'système de combat' in style:
            return ''

    # Strip plain-text OCR artifact sequences (e.g. "s ue niq e o hr ng ...")
    style = PLAINTEXT_ARTIFACT_RE.sub('', style).strip()
    style = PLAINTEXT_ARTIFACT_RE2.sub('', style).strip()

    # Strip bleed from next item (plain text): "Le Gun-fu Orientation : ..."
    style = _STYLE_NEXT_ITEM_BLEED_RE.sub('', style).strip()

    # Strip trailing " Techniques" (common bleed marker at end of style).
    # This runs LAST: the marker is usually followed by artifact/bleed text,
    # so it only reaches the end of the string once that text is gone.
    style = re.sub(r'\s+Techniques\s*$', '', style).strip()

    return style
|
|
|
|
|
|
def process_kungfu_file(path: Path) -> dict:
    """Clean one kung fu JSON file in place.

    The 'description' and 'style' entries of data['system'] and the
    'technique' entry of every item under system['techniques'] are cleaned.
    The file is rewritten only when at least one value changed.

    Returns:
        A one-entry dict mapping the file name to the list of change
        descriptions (empty when nothing changed).
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data['system']
    changes = []

    top_level_cleaners = (
        ('description', clean_html_field),
        ('style', lambda text: clean_kungfu_style(text, path.name)),
    )
    for field, cleaner in top_level_cleaners:
        original = system.get(field, '')
        if original:
            cleaned = cleaner(original)
            if cleaned != original:
                system[field] = cleaned
                changes.append(f'{field}: modified (len {len(original)}→{len(cleaned)})')

    subfield = 'technique'
    for tkey, tval in system.get('techniques', {}).items():
        original = tval.get(subfield, '')
        if original:
            cleaned = clean_kungfu_technique(original)
            if cleaned != original:
                tval[subfield] = cleaned
                changes.append(f'{tkey}.{subfield}: modified (len {len(original)}→{len(cleaned)})')

    if changes:
        serialized = json.dumps(data, ensure_ascii=False, indent=2) + '\n'
        path.write_text(serialized, encoding='utf-8')

    return {path.name: changes}
|
|
|
|
|
|
# ─────────────────────────────────────────
|
|
# Spell specific fixups
|
|
# ─────────────────────────────────────────
|
|
|
|
def process_spell_file(path: Path) -> dict:
    """Clean one spell JSON file in place.

    The 'description', 'effects', 'examples' and 'components' entries of
    data['system'] go through clean_html_field. The file is rewritten only
    when at least one of them changed.

    Returns:
        A one-entry dict mapping the file name to the list of change
        descriptions (empty when nothing changed).
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    system = data['system']
    changes = []

    for field in ('description', 'effects', 'examples', 'components'):
        original = system.get(field, '')
        if original:
            cleaned = clean_html_field(original)
            if cleaned != original:
                system[field] = cleaned
                changes.append(f'{field}: modified (len {len(original)}→{len(cleaned)})')

    if changes:
        serialized = json.dumps(data, ensure_ascii=False, indent=2) + '\n'
        path.write_text(serialized, encoding='utf-8')

    return {path.name: changes}
|
|
|
|
|
|
# ─────────────────────────────────────────
|
|
# Main
|
|
# ─────────────────────────────────────────
|
|
|
|
def _report_pack(header: str, pack_name: str, process_file) -> tuple:
    """Process every *.json file of one pack and print what changed.

    Args:
        header: banner line printed before the per-file report.
        pack_name: sub-directory of PACKS_DIR holding the pack's JSON files.
        process_file: callable taking a Path and returning a one-entry dict
            {file name: list of change descriptions}.

    Returns:
        (modified, unchanged) file counts for this pack.
    """
    print(header)
    modified = unchanged = 0
    for json_path in sorted((PACKS_DIR / pack_name).glob('*.json')):
        fname, changes = next(iter(process_file(json_path).items()))
        if changes:
            modified += 1
            print(f" {fname}:")
            for change in changes:
                print(f" {change}")
        else:
            unchanged += 1
            print(f" {fname}: no changes")
    return modified, unchanged


def main():
    """Clean both packs, then print a summary and the manual follow-up list.

    Files are counted per pack rather than collected in a dict keyed by bare
    file name, so a file name that exists in both packs is counted twice
    instead of overwriting the earlier entry.
    """
    kungfu_modified, kungfu_unchanged = _report_pack(
        "=== Processing kung fu files ===", 'cde-kungfus', process_kungfu_file
    )
    spell_modified, spell_unchanged = _report_pack(
        "\n=== Processing spell files ===", 'cde-spells', process_spell_file
    )

    # Summary
    print("\n=== Summary ===")
    print(f" Files modified: {kungfu_modified + spell_modified}")
    print(f" Files unchanged: {kungfu_unchanged + spell_unchanged}")
    print()
    print("NOTE: Truncated sentences (where content was cut mid-word before bleed)")
    print("still need manual restoration from the PDF rulebook.")
    print("Fields likely still truncated:")
    # Hand-maintained list of fields this script cannot repair on its own.
    truncated_hints = {
        'wing_chun_wing_ceon.json': ['description (ends mid-sentence)', 'technique1 (content from wrong item — needs full replacement)'],
        'la_boxe_de_livresse_zeoi_kyun.json': ['style (cleared — needs correct Zeoi Kyun style text)', 'technique1,2,3 (truncated)'],
        'le_double_peau.json': ['technique1, technique2 (truncated)'],
        'les_trois_joyaux.json': ['technique1,2,3 (truncated)'],
        'le_faite_supreme_de_lepee_taaigik_gim.json': ['technique1,2 (truncated)'],
        'la_paume_des_huit_trigrammes_baatgwaa_zoeng.json': ['technique1,2,3 (truncated)'],
        'alchemy_acupuncture.json': ['effects (starts with partial sentence from components bleed)'],
        'alchemy_poisons.json': ['effects (starts with partial sentence from components bleed)'],
    }
    for fname, hints in truncated_hints.items():
        print(f" {fname}:")
        for hint in hints:
            print(f" - {hint}")
|
|
|
|
|
|
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|