#!/usr/bin/env node
import https from 'https';
import http from 'http';
import fs from 'fs';
import path from 'path';
import { URL } from 'url';

const BASE_URL = 'https://corvanis.wiki';
const START_PAGE = '/Prism+(Testing)/Prism';
const OUTPUT_DIR = './wiki-downloads';

// Path fragment every crawled page must contain; links without it are ignored.
const PRISM_PATH_MARKER = 'Prism+(Testing)/Prism';

// Abort a request whose socket stays idle this long, so a single stalled
// connection cannot hang the whole crawl.
const REQUEST_TIMEOUT_MS = 30000;

// Pause after every request attempt (successful or not) to avoid
// overwhelming the server.
const REQUEST_DELAY_MS = 500;

// Known Prism pages structure - will be expanded as we find more.
// NOTE: not referenced by the crawler code in this file.
const KNOWN_SECTIONS = [
  '',
  '/Rules',
  '/Character+Creation',
  '/Combat',
  '/Equipment',
  '/Magic',
  '/Spells',
  '/Miracles',
  '/Monsters',
  '/Bestiary',
];

// Pages for which a download was attempted (successfully or not).
const visitedPages = new Set();
// Pages that were actually fetched with status 200 and written to disk.
const downloadedPages = new Set();
// FIFO queue of pages still to fetch.
const pagesToVisit = [START_PAGE];
// Every page ever placed in the queue, so the same link is not queued twice.
const queuedPages = new Set(pagesToVisit);

// Create the output directory; `recursive: true` makes this a no-op when it
// already exists.
fs.mkdirSync(OUTPUT_DIR, { recursive: true });

/**
 * Resolve after the given number of milliseconds.
 * @param {number} ms Delay in milliseconds
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Make an HTTP(S) GET request and collect the whole response body as text.
 *
 * The promise rejects when the URL is invalid, when the request or the
 * response stream emits an error, or when the socket stays idle for
 * REQUEST_TIMEOUT_MS. Redirects are not followed: the status code of the
 * first response is returned as-is.
 *
 * @param {string} url The URL to request
 * @returns {Promise<{data: string, statusCode: number}>} The response data and status code
 */
function makeRequest(url) {
  return new Promise((resolve, reject) => {
    // An invalid URL throws here, which rejects the promise.
    const { protocol } = new URL(url);
    const client = protocol === 'https:' ? https : http;

    const request = client.get(url, (res) => {
      // Decode at the stream level so a multi-byte UTF-8 character split
      // across two chunks is not corrupted by per-chunk string conversion.
      res.setEncoding('utf8');

      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => {
        resolve({ data, statusCode: res.statusCode });
      });
      res.on('error', reject);
    });

    request.setTimeout(REQUEST_TIMEOUT_MS, () => {
      // Destroying with an error makes the 'error' handler below reject.
      request.destroy(new Error(`Délai dépassé après ${REQUEST_TIMEOUT_MS} ms`));
    });
    request.on('error', reject);
  });
}

/**
 * Extract links to Prism pages from HTML content.
 *
 * Only `href` values containing the Prism path marker are considered.
 * Anchors and query strings are stripped. Root-relative links are kept
 * unchanged; absolute and protocol-relative links are reduced to their
 * pathname, but only when they point at the same host as BASE_URL (a path
 * taken from another host would otherwise be fetched from the wrong site).
 * Other relative links are skipped.
 *
 * @param {string} html The HTML content to parse
 * @param {string} currentPath The current page path (currently unused; kept
 *   so existing callers keep working)
 * @returns {string[]} Array of unique extracted paths, in order of first appearance
 */
function extractLinks(html, currentPath) {
  const siteHostname = new URL(BASE_URL).hostname;
  // Match links that contain Prism+(Testing)/Prism in the path
  const hrefPattern = /href=["']([^"']*Prism\+\(Testing\)\/Prism[^"']*)["']/gi;
  const links = new Set();

  for (const [, href] of html.matchAll(hrefPattern)) {
    // Remove anchors, then query params
    let link = href.split('#')[0].split('?')[0];

    if (link.startsWith('//') || /^https?:\/\//i.test(link)) {
      let parsed;
      try {
        parsed = new URL(link, BASE_URL);
      } catch {
        continue;
      }
      if (parsed.hostname !== siteHostname) {
        continue;
      }
      link = parsed.pathname;
    } else if (!link.startsWith('/')) {
      // Relative links are deliberately not resolved.
      continue;
    }

    // Only keep links under Prism+(Testing)/Prism
    if (link.includes(PRISM_PATH_MARKER)) {
      links.add(link);
    }
  }

  return [...links];
}

/**
 * Turn an arbitrary string (typically a page path) into a safe file name.
 *
 * Every character outside `a-z`, `0-9`, `_`, `-`, `+`, `(`, `)` (case
 * insensitive) becomes `_`, runs of `_` are collapsed, and one leading and
 * one trailing `_` are removed. If nothing is left, `index` is returned so
 * the caller never produces an extension-only file name.
 *
 * @param {string} str The string to sanitize
 * @returns {string} Sanitized filename (never empty)
 */
function sanitizeFilename(str) {
  const cleaned = str
    .replace(/[^a-z0-9_\-+()]/gi, '_')
    .replace(/_+/g, '_')
    .replace(/^_|_$/g, '');
  return cleaned === '' ? 'index' : cleaned;
}

/**
 * Download a page, save it under OUTPUT_DIR and queue the Prism links it
 * contains.
 *
 * A page is attempted at most once. Failures (non-200 status, network or
 * file-system errors) are logged and do not throw. After every attempt the
 * function waits REQUEST_DELAY_MS before resolving.
 *
 * @param {string} pagePath The path of the page to download
 * @returns {Promise<void>}
 */
async function downloadPage(pagePath) {
  if (visitedPages.has(pagePath)) {
    return;
  }
  visitedPages.add(pagePath);
  console.log(`Téléchargement: ${pagePath}`);

  try {
    const { data, statusCode } = await makeRequest(`${BASE_URL}${pagePath}`);
    if (statusCode !== 200) {
      console.error(` ❌ Erreur ${statusCode} pour ${pagePath}`);
      return;
    }

    // Save the HTML file
    const filename = `${sanitizeFilename(pagePath)}.html`;
    fs.writeFileSync(path.join(OUTPUT_DIR, filename), data, 'utf-8');
    downloadedPages.add(pagePath);
    console.log(` ✓ Sauvegardé: ${filename}`);

    // Extract and queue new links
    for (const link of extractLinks(data, pagePath)) {
      if (!visitedPages.has(link) && !queuedPages.has(link)) {
        queuedPages.add(link);
        pagesToVisit.push(link);
      }
    }
  } catch (error) {
    console.error(` ❌ Erreur lors du téléchargement de ${pagePath}:`, error.message);
  } finally {
    // Runs on success, on early return and on error alike, so failing pages
    // do not cause back-to-back requests.
    await delay(REQUEST_DELAY_MS);
  }
}

/**
 * Main function: crawl from START_PAGE until the queue is empty, then write
 * the sorted list of successfully downloaded pages to `_pages-list.txt`.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du téléchargement des pages wiki...');
  console.log(` Base URL: ${BASE_URL}`);
  console.log(` Page de départ: ${START_PAGE}`);
  console.log(` Répertoire de sortie: ${OUTPUT_DIR}\n`);

  while (pagesToVisit.length > 0) {
    const page = pagesToVisit.shift();
    await downloadPage(page);
  }

  console.log(`\n✅ Téléchargement terminé!`);
  console.log(` Pages téléchargées: ${downloadedPages.size}`);
  console.log(` Pages en échec: ${visitedPages.size - downloadedPages.size}`);
  console.log(` Fichiers sauvegardés dans: ${OUTPUT_DIR}`);

  // Save a list of downloaded pages
  const listPath = path.join(OUTPUT_DIR, '_pages-list.txt');
  fs.writeFileSync(listPath, [...downloadedPages].sort().join('\n'));
  console.log(` Liste des pages: ${listPath}`);
}

main().catch((error) => {
  console.error(error);
  // Signal the failure to the calling shell instead of exiting with 0.
  process.exitCode = 1;
});