37 lines
983 B
Python
37 lines
983 B
Python
#!/usr/bin/env python3
|
|
"""Extract Machine Gods of the Noxian Expanse rules PDF to plain text."""
|
|
|
|
import fitz
|
|
import sys
|
|
import re
|
|
from pathlib import Path
|
|
|
|
# Input: the rules PDF to read, given relative to the current working directory.
RULES_PDF = Path(
    "rules/Machine Gods of the Noxian Expanse - Core Rules BETA 3.pdf"
)

# Output: the plain-text file that receives the extracted text.
OUTPUT_TXT = Path(
    "rules/rules_full.txt"
)
|
|
|
|
|
|
def extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page in *pdf_path* as one string.

    Args:
        pdf_path: Path of the PDF to open with ``fitz`` (PyMuPDF).

    Returns:
        The per-page text, each page prefixed by a ``===== PAGE n =====``
        line (``n`` counts from 1), with the pages joined by two newlines.

    Any exception raised by ``fitz`` while opening or reading the document
    propagates to the caller; the document is closed first.
    """
    doc = fitz.open(str(pdf_path))
    try:
        pages = [
            f"===== PAGE {number} =====\n{page.get_text('text')}"
            for number, page in enumerate(doc, start=1)
        ]
    finally:
        # Close the document even when a page fails to extract, so the
        # handle is not leaked on the error path.
        doc.close()
    return "\n\n".join(pages)
|
|
|
|
|
|
def main():
    """Extract RULES_PDF to OUTPUT_TXT and print a short summary.

    Prints an error to stderr and exits with status 1 when RULES_PDF does
    not exist. Otherwise writes the extracted text as UTF-8 and reports the
    character count and the number of page banners found in the text.
    """
    pdf_missing = not RULES_PDF.exists()
    if pdf_missing:
        print(f"Error: {RULES_PDF} not found", file=sys.stderr)
        sys.exit(1)

    print(f"Extracting {RULES_PDF}...")
    text = extract_pdf_text(RULES_PDF)
    OUTPUT_TXT.write_text(text, encoding="utf-8")

    # The page total is derived by counting banner prefixes in the output.
    summary = (
        f"Done — {len(text)} chars written to {OUTPUT_TXT}",
        f"Total pages extracted: {text.count('===== PAGE')}",
    )
    for line in summary:
        print(line)
|
|
|
|
|
|
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if "__main__" == __name__:
    main()
|