fvtt-prism-rpg/tools/downloadWikiPages.mjs

#!/usr/bin/env node

import https from 'https';
import http from 'http';
import fs from 'fs';
import path from 'path';
import { URL } from 'url';

// Origin of the wiki to crawl; page paths are appended to it to build request URLs.
const BASE_URL = 'https://corvanis.wiki';
// Path of the first page placed in the download queue.
const START_PAGE = '/Prism+(Testing)/Prism';
// Directory that receives the saved HTML files and the final page list
// (a relative path, so it is resolved against the current working directory).
const OUTPUT_DIR = './wiki-downloads';

// Known Prism pages structure - will be expanded as we find more
// NOTE(review): not referenced anywhere in the visible part of this file —
// confirm whether it is still needed.
const KNOWN_SECTIONS = [
  '',
  '/Rules',
  '/Character+Creation',
  '/Combat',
  '/Equipment',
  '/Magic',
  '/Spells',
  '/Miracles',
  '/Monsters',
  '/Bestiary',
];

// Page paths already handed to downloadPage; prevents fetching a page twice.
const visitedPages = new Set();
// FIFO work queue of page paths still to download, seeded with the start page.
const pagesToVisit = [START_PAGE];

// Create the output directory (including missing parents) at module load
// when it does not exist yet.
if (!fs.existsSync(OUTPUT_DIR)) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}

/**
 * Perform an HTTP(S) GET request and collect the whole response body as text.
 *
 * The body is decoded as UTF-8 by the response stream itself, so multi-byte
 * characters that straddle two network chunks are reassembled correctly.
 * Redirects are not followed: the status code of the first response is
 * returned as-is and it is up to the caller to interpret it.
 *
 * @param {string} url The absolute URL to request (http: or https:)
 * @param {{timeoutMs?: number}} [options] Optional settings
 * @param {number} [options.timeoutMs=30000] Socket inactivity timeout in
 *   milliseconds; 0 disables the timeout
 * @returns {Promise<{data: string, statusCode: number}>} The response data and status code.
 *   The promise rejects on an invalid URL, a network error, a broken
 *   response stream, or a timeout.
 */
function makeRequest(url, { timeoutMs = 30000 } = {}) {
  return new Promise((resolve, reject) => {
    // A malformed URL throws here; inside the executor that becomes a rejection.
    const target = new URL(url);
    const client = target.protocol === 'https:' ? https : http;

    const request = client.get(target, (res) => {
      // Let the stream decode UTF-8: concatenating raw Buffers into a string
      // would corrupt any character split across a chunk boundary.
      res.setEncoding('utf8');

      let data = '';

      res.on('data', (chunk) => {
        data += chunk;
      });

      res.on('end', () => {
        resolve({ data, statusCode: res.statusCode });
      });

      // A connection dropped mid-body never emits 'end'; without this handler
      // the promise would stay pending forever.
      res.on('error', reject);
    });

    request.on('error', reject);

    // Abort stalled requests so a single unresponsive server cannot hang the crawl.
    request.setTimeout(timeoutMs, () => {
      request.destroy(new Error(`Request timed out after ${timeoutMs} ms: ${url}`));
    });
  });
}

/**
 * Extract the wiki page paths linked from a piece of HTML.
 *
 * Only `href` attributes whose value contains `Prism+(Testing)/Prism` are
 * considered. Fragments and query strings are stripped, then the target is
 * reduced to a path:
 *   - root-relative links (`/a/b`) are kept as written;
 *   - absolute (`http://…`, `https://…`) and protocol-relative (`//host/…`)
 *     links are reduced to their pathname;
 *   - any other form (document-relative links, other schemes) is skipped.
 *
 * @param {string} html The HTML content to parse
 * @param {string} currentPath The current page path (currently unused; kept
 *   so existing callers keep working)
 * @returns {string[]} Array of extracted links, de-duplicated, in order of
 *   first appearance
 */
function extractLinks(html, currentPath) {
  const sectionMarker = 'Prism+(Testing)/Prism';
  // Match links that contain Prism+(Testing)/Prism in the path
  const hrefPattern = /href=["']([^"']*Prism\+\(Testing\)\/Prism[^"']*)["']/gi;
  const found = new Set();

  for (const [, rawHref] of String(html).matchAll(hrefPattern)) {
    // Drop the fragment first, then the query string.
    const [withoutFragment] = rawHref.split('#');
    const [target] = withoutFragment.split('?');

    let pagePath;
    if (target.startsWith('//')) {
      // Protocol-relative URL: it starts with '/', but it names a host, not a
      // path. Borrow a scheme so it can be parsed and keep only the pathname.
      try {
        pagePath = new URL(`https:${target}`).pathname;
      } catch {
        continue;
      }
    } else if (target.startsWith('/')) {
      pagePath = target;
    } else if (/^https?:/i.test(target)) {
      try {
        pagePath = new URL(target).pathname;
      } catch {
        continue;
      }
    } else {
      // Document-relative links and other schemes are not crawled.
      continue;
    }

    // The marker may have been only in the removed query/fragment or in the
    // host part, so re-check the final path.
    if (pagePath.includes(sectionMarker)) {
      found.add(pagePath);
    }
  }

  return [...found];
}

/**
 * Turn an arbitrary string (typically a page path) into a safe file name.
 *
 * Every character other than ASCII letters, digits, `_`, `-`, `+`, `(` and
 * `)` becomes an underscore, runs of underscores are collapsed to one, and a
 * leading or trailing underscore is removed.
 *
 * @param {string} str The string to sanitize
 * @returns {string} Sanitized filename (may be empty)
 */
function sanitizeFilename(str) {
  const substituted = str.replace(/[^a-z0-9_\-+()]/gi, '_');
  const collapsed = substituted.replace(/_+/g, '_');

  // After collapsing, each end carries at most one underscore, so trimming
  // is a matter of slicing off zero or one character per side.
  const from = collapsed.startsWith('_') ? 1 : 0;
  const to = collapsed.endsWith('_') ? collapsed.length - 1 : collapsed.length;
  return collapsed.slice(from, to);
}

/**
 * Download a single wiki page, save it to disk and queue the pages it links to.
 *
 * The page is marked as visited before the request is made, so it is never
 * retried, even when the download fails. Failures (non-200 status, network
 * or file-system errors) are logged and swallowed so the crawl can continue.
 * Every attempted download is followed by a short pause, whatever its
 * outcome, to avoid hammering the server.
 *
 * @param {string} pagePath The path of the page to download
 * @returns {Promise<void>}
 */
async function downloadPage(pagePath) {
  if (visitedPages.has(pagePath)) {
    return;
  }

  visitedPages.add(pagePath);
  console.log(`Téléchargement: ${pagePath}`);

  try {
    const url = BASE_URL + pagePath;
    const { data, statusCode } = await makeRequest(url);

    if (statusCode !== 200) {
      console.error(`  ❌ Erreur ${statusCode} pour ${pagePath}`);
      return;
    }

    // Save the HTML file
    const filename = `${sanitizeFilename(pagePath)}.html`;
    const filepath = path.join(OUTPUT_DIR, filename);
    fs.writeFileSync(filepath, data, 'utf-8');
    console.log(`  ✓ Sauvegardé: ${filename}`);

    // Queue newly discovered links, skipping pages that were already
    // downloaded or are already waiting in the queue.
    for (const link of extractLinks(data, pagePath)) {
      if (!visitedPages.has(link) && !pagesToVisit.includes(link)) {
        pagesToVisit.push(link);
      }
    }
  } catch (error) {
    console.error(`  ❌ Erreur lors du téléchargement de ${pagePath}:`, error.message);
  } finally {
    // Pause after every request, including failed ones: an error response
    // is exactly when immediate follow-up requests hurt the server most.
    await new Promise((resolve) => {
      setTimeout(resolve, 500);
    });
  }
}

/**
 * Entry point: drain the page queue one download at a time, then print a
 * summary and write the sorted list of visited page paths to
 * `_pages-list.txt` in the output directory.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const banner = [
    '🚀 Début du téléchargement des pages wiki...',
    `   Base URL: ${BASE_URL}`,
    `   Page de départ: ${START_PAGE}`,
    `   Répertoire de sortie: ${OUTPUT_DIR}\n`,
  ];
  for (const line of banner) {
    console.log(line);
  }

  // The queue grows while it is being processed: each download may append
  // newly discovered pages.
  while (pagesToVisit.length !== 0) {
    await downloadPage(pagesToVisit.shift());
  }

  console.log(`\n✅ Téléchargement terminé!`);
  console.log(`   Pages téléchargées: ${visitedPages.size}`);
  console.log(`   Fichiers sauvegardés dans: ${OUTPUT_DIR}`);

  // Persist the list of visited pages, one path per line, sorted.
  const sortedPages = [...visitedPages].sort();
  const listPath = path.join(OUTPUT_DIR, '_pages-list.txt');
  fs.writeFileSync(listPath, sortedPages.join('\n'));
  console.log(`   Liste des pages: ${listPath}`);
}

main().catch(console.error);