Files
fvtt-prism-rpg/tools/scrapeWikiToMarkdown.mjs
2025-11-06 00:01:59 +01:00

172 lines
4.5 KiB
JavaScript

#!/usr/bin/env node
import puppeteer from 'puppeteer';
import TurndownService from 'turndown';
import fs from 'fs';
import path from 'path';
// Root URL of the wiki; each page path is appended to it to form the address to visit.
const BASE_URL = 'https://corvanis.wiki';
// Text file that lists the page paths to scrape.
const PAGES_LIST_FILE = './tools/wikiPagesList.txt';
// Destination of the single combined Markdown document.
const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md';

// Make sure the folder that will receive OUTPUT_FILE exists before any scraping starts.
const outputDir = path.dirname(OUTPUT_FILE);
const outputDirExists = fs.existsSync(outputDir);
if (!outputDirExists) {
  fs.mkdirSync(outputDir, { recursive: true });
}

// Shared HTML-to-Markdown converter, configured for ATX headings and fenced code blocks.
const turndownOptions = { headingStyle: 'atx', codeBlockStyle: 'fenced' };
const turndownService = new TurndownService(turndownOptions);
/**
 * Pause for a given duration.
 * @param {number} ms Delay in milliseconds
 * @returns {Promise<void>} Promise that resolves once the timer has fired
 */
function wait(ms) {
  return new Promise((done) => setTimeout(done, ms));
}
/**
 * Derive a human-readable heading from a wiki page path.
 *
 * The leading "/Prism+(Testing)/Prism" segment (and an optional slash after
 * it) is dropped, every "+" becomes a space and surrounding whitespace is
 * trimmed. When nothing is left, the page is treated as the home page.
 *
 * Note: inside this function the `path` parameter shadows the imported
 * `path` module.
 *
 * @param {string} path Page path
 * @returns {string} Clean title, or 'Prism Home' when the result is empty
 */
function getPageTitle(path) {
  const withoutPrefix = path.replace(/^\/Prism\+\(Testing\)\/Prism\/?/, '');
  const title = withoutPrefix.replace(/\+/g, ' ').trim();
  if (title.length > 0) {
    return title;
  }
  return 'Prism Home';
}
/**
 * Load one wiki page in the browser and return its main content as Markdown.
 *
 * Failures (navigation, missing selector, conversion) are logged and reported
 * to the caller as an empty string rather than thrown, so one bad page does
 * not abort the whole run.
 *
 * @param {object} page Puppeteer page object
 * @param {string} pagePath Page path to scrape, appended to BASE_URL
 * @returns {Promise<string>} Markdown content, or '' when nothing could be extracted
 */
async function scrapePage(page, pagePath) {
  const targetUrl = `${BASE_URL}${pagePath}`;
  console.log(`📄 Scraping: ${pagePath}`);

  try {
    // Open the page and wait for the network to go idle.
    await page.goto(targetUrl, { waitUntil: 'networkidle0', timeout: 30000 });

    // The wiki content lives inside this container; wait until it is present.
    await page.waitForSelector('.published-container', { timeout: 10000 });

    // Extra pause to give dynamically inserted content a chance to appear.
    await wait(2000);

    // This callback runs inside the browser page, so it must be self-contained.
    const html = await page.evaluate(() => {
      const root = document.querySelector('.published-container');
      if (root === null) {
        return null;
      }

      // Drop the left-hand navigation column so it does not end up in the output.
      const sidebar = root.querySelector('.site-body-left-column');
      if (sidebar !== null) {
        sidebar.remove();
      }

      // Prefer the centre column; fall back to the whole container.
      const centerColumn = root.querySelector('.site-body-center-column');
      if (centerColumn === null) {
        return root.innerHTML;
      }
      return centerColumn.innerHTML;
    });

    if (html) {
      // Convert the extracted HTML to Markdown.
      const markdown = turndownService.turndown(html);
      console.log(`  ✓ Converti (${markdown.length} caractères)`);
      return markdown;
    }

    console.log(`  ⚠️  Pas de contenu trouvé pour ${pagePath}`);
    return '';
  } catch (error) {
    console.error(`  ❌ Erreur: ${error.message}`);
    return '';
  }
}
/**
 * Entry point: scrape every page listed in PAGES_LIST_FILE and write the
 * combined Markdown document to OUTPUT_FILE.
 *
 * Exits the process with code 1 when the list file does not exist. The
 * browser is always closed, even when creating the page or scraping throws,
 * so no browser instance is left behind on failure.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du scraping du wiki Prism...\n');

  // Without the list file there is nothing to scrape.
  if (!fs.existsSync(PAGES_LIST_FILE)) {
    console.error(`❌ Fichier de liste introuvable: ${PAGES_LIST_FILE}`);
    process.exit(1);
  }

  // One page path per line; blank lines and lines starting with "#" are skipped.
  const pages = fs
    .readFileSync(PAGES_LIST_FILE, 'utf-8')
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line && !line.startsWith('#'));

  console.log(`📋 ${pages.length} page(s) à scraper\n`);

  // Launch browser
  console.log('🌐 Lancement du navigateur...\n');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Document header: title, provenance line and a horizontal rule.
  let allMarkdown = '# Prism RPG - Wiki Complete\n\n';
  allMarkdown += `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`;
  allMarkdown += '---\n\n';

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });

    // Pages are scraped one after another, reusing the same browser page.
    for (const pagePath of pages) {
      const title = getPageTitle(pagePath);
      // The wiki root page gets a level-2 heading, every other page level-3.
      const level = pagePath === '/Prism+(Testing)/Prism' ? '##' : '###';
      // The heading is written even when the page yields no content.
      allMarkdown += `${level} ${title}\n\n`;

      const markdown = await scrapePage(page, pagePath);
      if (markdown) {
        allMarkdown += `${markdown}\n\n`;
        allMarkdown += '---\n\n';
      }

      // Wait between requests
      await wait(1000);
    }
  } finally {
    // Release the browser no matter how the scraping loop ended.
    await browser.close();
  }

  // Save the combined Markdown file
  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');

  console.log(`\n✅ Scraping terminé!`);
  console.log(`   Pages scrapées: ${pages.length}`);
  console.log(`   Fichier généré: ${OUTPUT_FILE}`);
  console.log(`   Taille: ${(allMarkdown.length / 1024).toFixed(2)} KB`);
}
// Run the scraper; any error that escapes main() is logged and turned into a
// non-zero exit code.
main().catch((fatalError) => {
  console.error('❌ Erreur fatale:', fatalError);
  process.exit(1);
});