"""Extract pre-generated character sheets from the Célestopol 1922 PDF.

Each character occupies four consecutive pages in the PDF:
name page, anomalie description, stats, and aspects.  This script scrapes
skill ratings (counted as filled green dots), resistances, the anomalie
name/level and raw descriptive text, then prints one summary per character.

Usage:  python extract_sheets.py [pdf_path]
The original hard-coded path is kept as the default for compatibility.
"""
import html
import json
import re
import sys

import fitz  # PyMuPDF

DEFAULT_PDF_PATH = '/home/morr/work/uberwald/fvtt-celestopol/__regles/Célestopol 1922 Fiches de prêts à jouer v1_cdjdr.pdf'

# RGB fill colour of the filled skill dots on the stats page.
_DOT_GREEN = (0.131, 0.284, 0.160)


def is_green(color, tol=0.008):
    """Return True if *color* (RGB triple or None) matches the skill-dot
    green within *tol* per channel."""
    if not color:
        return False
    return all(abs(c - ref) < tol for c, ref in zip(color, _DOT_GREEN))


SKILL_LIST = ['ARTIFICE', 'ATTRACTION', 'COERCITION', 'FAVEUR',
              'ÉCHAUFFOURÉE', 'EFFACEMENT', 'MOBILITÉ', 'PROUESSE',
              'APPRÉCIATION', 'ARTS', 'INSPIRATION', 'TRAQUE',
              'INSTRUCTION', 'MERV.TECH.', 'RAISONNEMENT', 'TRAITEMENT']

# Accent folding + dot removal done in one C-level pass.
_NORM_TABLE = str.maketrans({'É': 'E', 'È': 'E', 'Ê': 'E', 'Ô': 'O',
                             'Â': 'A', 'Î': 'I', 'Œ': 'OE', '.': None})


def norm(s):
    """Uppercase *s*, fold the accents used on the sheets and drop dots,
    so label text can be compared against SKILL_LIST entries."""
    return s.strip().upper().translate(_NORM_TABLE)


def get_skill_values(page):
    """Count the filled (green) dots next to each skill label on *page*.

    Returns a dict mapping every SKILL_LIST name to an int (0 when the
    label was not found).  Labels live in the left column (x < 430); a
    dot is attributed to a skill when its centre y is within 7 units of
    the label's centre y.
    """
    words = page.get_text("words")  # (x0, y0, x1, y1, text, ...)
    skill_y = {}
    for w in words:
        wn = norm(w[4])
        for sk in SKILL_LIST:
            if wn == norm(sk) and w[0] < 430:
                skill_y[sk] = (w[1] + w[3]) / 2
    # 'MERV.TECH.' can be split or abbreviated by the extractor, so
    # match on the 'MERV' fragment as a fallback.
    for w in words:
        if 'MERV' in w[4].upper() and w[0] < 430:
            skill_y['MERV.TECH.'] = (w[1] + w[3]) / 2

    green_circles = []
    for d in page.get_drawings():
        if is_green(d.get('fill')):
            rect = d['rect']
            width = rect.x1 - rect.x0
            # Skill dots are ~7 units wide; filter out other green shapes.
            if 5.5 < width < 8.5:
                green_circles.append(((rect.x0 + rect.x1) / 2,
                                      (rect.y0 + rect.y1) / 2))

    skills = {}
    for sk in SKILL_LIST:
        sy = skill_y.get(sk)
        if sy is None:
            skills[sk] = 0
        else:
            skills[sk] = sum(1 for _cx, cy in green_circles
                             if abs(cy - sy) < 7)
    return skills


def get_resistances(page):
    """Read the four resistance scores from the right-hand column.

    Domain labels (ÂME/CORPS/CŒUR/ESPRIT) sit at 300 < x < 500; the
    first digit word found at x > 480 within 35 units of a label's y is
    taken as that domain's score.  Returns {'ame': int, ...} for the
    domains actually found.
    """
    words = page.get_text("words")
    domain_y = {}
    # Scan top-to-bottom; if a label appears twice the lower one wins,
    # matching the original behaviour.
    for w in sorted(words, key=lambda x: x[1]):
        t = norm(w[4])
        x0 = w[0]
        if t == 'AME' and 300 < x0 < 500:
            domain_y['ame'] = w[1]
        elif t == 'CORPS' and 300 < x0 < 500:
            domain_y['corps'] = w[1]
        elif t == 'COEUR' and 300 < x0 < 500:
            domain_y['coeur'] = w[1]
        elif t == 'ESPRIT' and 300 < x0 < 500:
            domain_y['esprit'] = w[1]
    res = {}
    for dom, dy in domain_y.items():
        for w in words:
            if w[4].strip().isdigit() and w[0] > 480 and abs(w[1] - dy) < 35:
                res[dom] = int(w[4].strip())
                break
    return res


def get_anomalie_name(stats_page):
    """Extract the anomalie name from the bottom-right block of the page.

    After an 'ANOMALIE'/'NV' marker line, the first following line that
    is not boilerplate (skip lists below), starts with an uppercase
    letter and is longer than one character is taken as the name.
    Returns '?' when nothing matches.
    """
    text = stats_page.get_text("text")
    lines = [l.strip() for l in text.split('\n') if l.strip()]
    skip_words = {'ANOMALIE', 'NV', 'RÉSISTANCE', 'RESISTANCE'}
    # Lines starting with these fragments are rule text, not the name.
    skip_starts = ['pour ', 'lors ', 'gagner ', 'trouver ', 'éviter ',
                   'sortir ', 'obtenir ', 'lors d', 'Vider ', 'Gain ',
                   'en pui', 'pour ne', 'pour ré']
    for i, line in enumerate(lines):
        if 'ANOMALIE' in line.upper() or 'NV' in line:
            # Look in the next few lines for the name.
            for j in range(i + 1, min(i + 10, len(lines))):
                l = lines[j]
                if (not any(l.startswith(s) for s in skip_starts)
                        and l not in skip_words):
                    if l and l[0].isupper() and len(l) > 1:
                        return l
    return "?"


def get_anomalie_niveau(stats_page):
    """Return the anomalie level: the first digit word found reading
    top-to-bottom at x > 480, y > 650, or None if absent."""
    words = stats_page.get_text("words")
    for w in sorted(words, key=lambda x: (x[1], x[0])):
        if w[4].strip().isdigit() and w[0] > 480 and w[1] > 650:
            return int(w[4].strip())
    return None


def get_char_base_info(stats_page):
    """Find the character name: the last bold span of size > 11 whose
    baseline sits in the 150–250 y band.  Returns {'name': str | None}."""
    blocks = stats_page.get_text("dict")["blocks"]
    name = None
    for block in blocks:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                if span.get("size", 0) > 11 and 'Bold' in span.get("font", ""):
                    y = span["origin"][1]
                    t = span["text"].strip()
                    if t and len(t) > 3 and 150 < y < 250:
                        name = t
    return {'name': name}


def get_raw_text(page):
    """Return the page's text extracted via HTML, tags stripped,
    entities decoded and whitespace collapsed to single spaces."""
    raw = page.get_text("html")
    clean = re.sub(r'<[^>]+>', ' ', raw)
    # html.unescape decodes numeric (&#xNN;/&#NN;) AND named (&amp;)
    # entities; the previous regexes missed the named ones.
    clean = html.unescape(clean)
    return re.sub(r'\s+', ' ', clean).strip()


def parse_aspects_page(asp_page):
    """Return the page's non-empty text lines, stripped."""
    text = asp_page.get_text("text")
    return [l.strip() for l in text.split('\n') if l.strip()]


# Characters: (name_idx, anom_desc_idx, stats_idx, aspects_idx) —
# 8 characters, 4 consecutive pages each.
CHARACTERS = [(i, i + 1, i + 2, i + 3) for i in range(0, 32, 4)]


def main(argv=None):
    """Open the PDF (CLI arg 1 or the default path) and print every
    character's extracted data to stdout."""
    args = sys.argv[1:] if argv is None else argv
    pdf_path = args[0] if args else DEFAULT_PDF_PATH
    doc = fitz.open(pdf_path)
    try:
        for n_idx, a_idx, s_idx, asp_idx in CHARACTERS:
            sp = doc[s_idx]
            skills = get_skill_values(sp)
            res = get_resistances(sp)
            anom_name = get_anomalie_name(sp)
            anom_nv = get_anomalie_niveau(sp)
            char_info = get_char_base_info(sp)
            anom_desc = get_raw_text(doc[a_idx])
            asp_lines = parse_aspects_page(doc[asp_idx])
            print(f"\n{'='*70}")
            # '?' when no name was detected ('name' is always present,
            # so dict.get's default would never fire).
            print(f"NAME: {char_info['name'] or '?'}")
            print(f"SKILLS: {json.dumps(skills, ensure_ascii=False)}")
            print(f"RESISTANCES: {res}")
            print(f"ANOMALIE: {anom_name} nv{anom_nv}")
            print(f"ANOM DESC (first 300 chars): {anom_desc[:300]}")
            print("ASPECTS LINES:")
            for i, l in enumerate(asp_lines[:60]):
                print(f" {i:2d}: {l}")
    finally:
        doc.close()
    print("\nDONE")


if __name__ == "__main__":
    main()