Various fixes for official release
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract Machine Gods of the Noxian Expanse rules PDF to plain text."""
|
||||
|
||||
import fitz
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
# Source rulebook PDF; a relative path, so it resolves against the current
# working directory when the script runs.
RULES_PDF = Path("rules/Machine Gods of the Noxian Expanse - Core Rules BETA 3.pdf")
# Destination file for the extracted plain text (also a relative path).
OUTPUT_TXT = Path("rules/rules_full.txt")
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_path: Path) -> str:
    """Return the plain text of every page in *pdf_path* as a single string.

    Each page's text is prefixed with a ``===== PAGE n =====`` header line,
    where ``n`` is the 1-based page number, and the per-page chunks are
    joined with two newlines.

    Args:
        pdf_path: Location of the PDF document to read.

    Returns:
        The concatenated, page-delimited text of the document.
    """
    doc = fitz.open(str(pdf_path))
    try:
        pages = [
            f"===== PAGE {number} =====\n{page.get_text('text')}"
            for number, page in enumerate(doc, start=1)
        ]
    finally:
        # Release the document handle even when a page fails to extract.
        doc.close()
    return "\n\n".join(pages)
|
||||
|
||||
|
||||
def main():
    """Extract the rules PDF to a text file and print a short summary.

    Prints an error to stderr and exits with status 1 when ``RULES_PDF``
    does not exist. Otherwise writes the extracted text to ``OUTPUT_TXT``
    as UTF-8 and reports the character count and the number of page
    markers found in the output.
    """
    pdf_missing = not RULES_PDF.exists()
    if pdf_missing:
        print(f"Error: {RULES_PDF} not found", file=sys.stderr)
        sys.exit(1)

    print(f"Extracting {RULES_PDF}...")
    extracted = extract_pdf_text(RULES_PDF)
    OUTPUT_TXT.write_text(extracted, encoding="utf-8")

    # Summary figures are derived from the extracted text itself.
    char_count = len(extracted)
    page_count = extracted.count('===== PAGE')
    print(f"Done — {char_count} chars written to {OUTPUT_TXT}")
    print(f"Total pages extracted: {page_count}")
|
||||
|
||||
|
||||
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user