First adaptation pass

2025-11-06 00:01:59 +01:00
parent 5b1fd847c2
commit 6b883f8126
112 changed files with 44142 additions and 953 deletions
@@ -0,0 +1,277 @@
+#!/usr/bin/env node
+
+import puppeteer from 'puppeteer';
+import TurndownService from 'turndown';
+import fs from 'fs';
+import path from 'path';
+
+const BASE_URL = 'https://corvanis.wiki'; // Origin prepended to every page path before navigation
+const START_PAGE = '/Prism+(Testing)/Prism'; // Path the crawl queue is seeded with
+const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md'; // Combined Markdown document written by main()
+const MAX_PAGES = 500; // main() stops crawling once this many paths have been visited
+
+// Create the output directory (including missing parents) when it does not exist yet
+const outputDir = path.dirname(OUTPUT_FILE);
+if (!fs.existsSync(outputDir)) {
+  fs.mkdirSync(outputDir, { recursive: true });
+}
+
+// Initialize Turndown for HTML to Markdown conversion
+const turndownService = new TurndownService({
+  headingStyle: 'atx', // ATX heading style
+  codeBlockStyle: 'fenced', // fenced code block style
+});
+
+// Crawl state shared by scrapePage() and main()
+const visitedUrls = new Set(); // page paths scrapePage() has already been asked to fetch
+const pagesContent = []; // { path, title, level, content } records pushed by main()
+
+/**
+ * Wait for a specific time
+ * @param {number} ms Milliseconds to wait
+ * @returns {Promise<void>}
+ */
+function wait(ms) {
+  return new Promise((resolve) => {
+    setTimeout(resolve, ms);
+  });
+}
+
+/**
+ * Extract Prism-related links from the page
+ * @param {object} page Puppeteer page object
+ * @returns {Promise<string[]>} Array of URLs
+ */
+async function extractPrismLinks(page) {
+  return await page.evaluate(() => {
+    const links = [];
+    const anchors = document.querySelectorAll('a[href*="Prism"]');
+
+    anchors.forEach((anchor) => {
+      const href = anchor.getAttribute('href');
+      if (href && href.includes('Prism')) {
+        // Extract the path
+        const url = new URL(href, window.location.href);
+        const path = url.pathname;
+
+        // Only keep Prism Testing paths
+        if (path.includes('Prism+(Testing)') || path.includes('Prism+%28Testing%29')) {
+          links.push(path);
+        }
+      }
+    });
+
+    return [...new Set(links)];
+  });
+}
+
+/**
+ * Sanitize page title for heading
+ * @param {string} urlPath Page path
+ * @returns {string} Clean title
+ */
+function getPageTitle(urlPath) {
+  return urlPath
+    .replace(/.*\/([^/]+)$/, '$1')
+    .replace(/\+/g, ' ')
+    .replace(/%28/g, '(')
+    .replace(/%29/g, ')')
+    .replace(/_/g, ' ')
+    .trim();
+}
+
+/**
+ * Get heading level based on URL depth
+ * @param {string} urlPath Page path
+ * @returns {string} Markdown heading level
+ */
+function getHeadingLevel(urlPath) {
+  const depth = urlPath.split('/').filter((s) => s).length;
+  return '#'.repeat(Math.min(depth + 1, 6));
+}
+
+/**
+ * Scrape a page and convert to Markdown
+ * @param {object} page Puppeteer page object
+ * @param {string} pagePath Page path to scrape
+ * @returns {Promise<{content: string, links: string[]}>} Markdown content and found links
+ */
+async function scrapePage(page, pagePath) {
+  if (visitedUrls.has(pagePath)) {
+    return { content: '', links: [] };
+  }
+
+  visitedUrls.add(pagePath);
+  const url = BASE_URL + pagePath;
+  console.log(`📄 Scraping [${visitedUrls.size}]: ${pagePath}`);
+
+  try {
+    // Navigate to the page
+    const response = await page.goto(url, {
+      waitUntil: 'networkidle0',
+      timeout: 30000,
+    });
+
+    // Check if page exists
+    if (response.status() === 404) {
+      console.log(`  ⚠️  Page non trouvée (404)`);
+      return { content: '', links: [] };
+    }
+
+    // Wait for the main content to load
+    await page.waitForSelector('.published-container', { timeout: 10000 });
+
+    // Wait a bit more for dynamic content
+    await wait(2000);
+
+    // Extract links for crawling
+    const links = await extractPrismLinks(page);
+
+    // Extract the main content
+    const content = await page.evaluate(() => {
+      // Try different selectors for content
+      let mainContent = document.querySelector('.markdown-preview-view');
+      if (!mainContent) {
+        mainContent = document.querySelector('.site-body-center-column');
+      }
+      if (!mainContent) {
+        mainContent = document.querySelector('.published-container');
+      }
+
+      if (!mainContent) {
+        return null;
+      }
+
+      // Clone to avoid modifying the page
+      const clone = mainContent.cloneNode(true);
+
+      // Remove navigation and UI elements
+      const removeSelectors = [
+        '.site-body-left-column',
+        '.site-body-right-column',
+        '.tree-item-self',
+        '.search-input-container',
+        '.graph-view',
+        'nav',
+        '.frontmatter-container',
+      ];
+
+      removeSelectors.forEach((selector) => {
+        const elements = clone.querySelectorAll(selector);
+        elements.forEach((el) => {
+          el.remove();
+        });
+      });
+
+      return clone.innerHTML;
+    });
+
+    if (!content || content.trim().length < 50) {
+      console.log(`  ⚠️  Contenu vide ou trop court`);
+      return { content: '', links };
+    }
+
+    // Convert HTML to Markdown
+    const markdown = turndownService.turndown(content);
+    console.log(`  ✓ Converti (${markdown.length} caractères, ${links.length} liens)`);
+
+    return { content: markdown, links };
+  } catch (error) {
+    console.error(`  ❌ Erreur: ${error.message}`);
+    return { content: '', links: [] };
+  }
+}
+
+/**
+ * Main function
+ */
+async function main() {
+  console.log('🚀 Début du scraping automatique du wiki Prism...\n');
+
+  // Launch browser
+  console.log('🌐 Lancement du navigateur...\n');
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox'],
+  });
+
+  const page = await browser.newPage();
+  await page.setViewport({ width: 1280, height: 800 });
+
+  // Queue of pages to visit
+  const toVisit = [START_PAGE];
+
+  // Crawl pages
+  while (toVisit.length > 0 && visitedUrls.size < MAX_PAGES) {
+    const currentPath = toVisit.shift();
+
+    const { content, links } = await scrapePage(page, currentPath);
+
+    if (content) {
+      const title = getPageTitle(currentPath);
+      const level = getHeadingLevel(currentPath);
+
+      pagesContent.push({
+        path: currentPath,
+        title,
+        level,
+        content,
+      });
+    }
+
+    // Add new links to queue
+    for (const link of links) {
+      if (!visitedUrls.has(link) && !toVisit.includes(link)) {
+        toVisit.push(link);
+      }
+    }
+
+    // Wait between requests
+    await wait(1000);
+  }
+
+  // Close browser
+  await browser.close();
+
+  // Build the final Markdown document
+  let allMarkdown = '# Prism RPG - Wiki Complete\n\n';
+  allMarkdown += `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`;
+  allMarkdown += `*${pagesContent.length} pages crawled*\n\n`;
+  allMarkdown += '---\n\n';
+
+  // Sort pages by path depth for better organization
+  pagesContent.sort((a, b) => {
+    const depthA = a.path.split('/').length;
+    const depthB = b.path.split('/').length;
+    if (depthA !== depthB) {
+      return depthA - depthB;
+    }
+    return a.path.localeCompare(b.path);
+  });
+
+  for (const pageData of pagesContent) {
+    allMarkdown += `${pageData.level} ${pageData.title}\n\n`;
+    allMarkdown += `*Source: ${BASE_URL}${pageData.path}*\n\n`;
+    allMarkdown += `${pageData.content}\n\n`;
+    allMarkdown += '---\n\n';
+  }
+
+  // Save the combined Markdown file
+  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');
+
+  console.log(`\n✅ Scraping terminé!`);
+  console.log(`   Pages visitées: ${visitedUrls.size}`);
+  console.log(`   Pages avec contenu: ${pagesContent.length}`);
+  console.log(`   Fichier généré: ${OUTPUT_FILE}`);
+  console.log(`   Taille: ${(allMarkdown.length / 1024).toFixed(2)} KB`);
+
+  // Save a list of crawled pages
+  const listPath = path.join(outputDir, 'pages-crawled.txt');
+  fs.writeFileSync(listPath, Array.from(visitedUrls).sort().join('\n'));
+  console.log(`   Liste des pages: ${listPath}`);
+}
+
+main().catch((error) => {
+  console.error('❌ Erreur fatale:', error);
+  process.exit(1);
+});
@@ -0,0 +1,187 @@
+#!/usr/bin/env node
+
+import https from 'https';
+import http from 'http';
+import fs from 'fs';
+import path from 'path';
+import { URL } from 'url';
+
+const BASE_URL = 'https://corvanis.wiki'; // Origin prepended to every page path before requesting it
+const START_PAGE = '/Prism+(Testing)/Prism'; // Path the download queue is seeded with
+const OUTPUT_DIR = './wiki-downloads'; // Directory receiving one .html file per downloaded page
+
+// Known Prism pages structure - will be expanded as we find more. NOTE(review): not referenced anywhere else in this script.
+const KNOWN_SECTIONS = [
+  '',
+  '/Rules',
+  '/Character+Creation',
+  '/Combat',
+  '/Equipment',
+  '/Magic',
+  '/Spells',
+  '/Miracles',
+  '/Monsters',
+  '/Bestiary',
+];
+
+// Crawl state shared by downloadPage() and main()
+const visitedPages = new Set(); // page paths downloadPage() has already attempted
+const pagesToVisit = [START_PAGE]; // queue of page paths, consumed from the front by main()
+
+// Create the output directory (including missing parents) when it does not exist yet
+if (!fs.existsSync(OUTPUT_DIR)) {
+  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
+}
+
+/**
+ * Make an HTTP(S) request
+ * @param {string} url The URL to request
+ * @returns {Promise<{data: string, statusCode: number}>} The response data and status code
+ */
+function makeRequest(url) {
+  return new Promise((resolve, reject) => {
+    const urlObj = new URL(url);
+    const protocol = urlObj.protocol === 'https:' ? https : http;
+
+    protocol.get(url, (res) => {
+      let data = '';
+
+      res.on('data', (chunk) => {
+        data += chunk;
+      });
+
+      res.on('end', () => {
+        resolve({ data, statusCode: res.statusCode });
+      });
+    }).on('error', (err) => {
+      reject(err);
+    });
+  });
+}
+
+/**
+ * Extract links from HTML content
+ * @param {string} html The HTML content to parse
+ * @param {string} currentPath The current page path
+ * @returns {string[]} Array of extracted links
+ */
+function extractLinks(html, currentPath) {
+  const links = new Set();
+
+  // Match links that contain Prism+(Testing)/Prism in the path
+  const linkRegex = /href=["']([^"']*Prism\+\(Testing\)\/Prism[^"']*)["']/gi;
+  let match;
+
+  while ((match = linkRegex.exec(html)) !== null) {
+    let link = match[1];
+
+    // Clean up the link
+    link = link.split('#')[0]; // Remove anchors
+    link = link.split('?')[0]; // Remove query params
+
+    // Make sure it starts with /
+    if (!link.startsWith('/')) {
+      if (link.startsWith('http')) {
+        try {
+          const urlObj = new URL(link);
+          link = urlObj.pathname;
+        } catch (e) {
+          continue;
+        }
+      } else {
+        continue;
+      }
+    }
+
+    // Only keep links under Prism+(Testing)/Prism
+    if (link.includes('Prism+(Testing)/Prism')) {
+      links.add(link);
+    }
+  }
+
+  return Array.from(links);
+}
+
+/**
+ * Sanitize filename
+ * @param {string} str The string to sanitize
+ * @returns {string} Sanitized filename
+ */
+function sanitizeFilename(str) {
+  return str
+    .replace(/[^a-z0-9_\-+()]/gi, '_')
+    .replace(/_+/g, '_')
+    .replace(/^_|_$/g, '');
+}
+
+/**
+ * Download a page
+ * @param {string} pagePath The path of the page to download
+ * @returns {Promise<void>}
+ */
+async function downloadPage(pagePath) {
+  if (visitedPages.has(pagePath)) {
+    return;
+  }
+
+  visitedPages.add(pagePath);
+  console.log(`Téléchargement: ${pagePath}`);
+
+  try {
+    const url = BASE_URL + pagePath;
+    const { data, statusCode } = await makeRequest(url);
+
+    if (statusCode !== 200) {
+      console.error(`  ❌ Erreur ${statusCode} pour ${pagePath}`);
+      return;
+    }
+
+    // Save the HTML file
+    const filename = `${sanitizeFilename(pagePath)}.html`;
+    const filepath = path.join(OUTPUT_DIR, filename);
+    fs.writeFileSync(filepath, data, 'utf-8');
+    console.log(`  ✓ Sauvegardé: ${filename}`);
+
+    // Extract and queue new links
+    const links = extractLinks(data, pagePath);
+    for (const link of links) {
+      if (!visitedPages.has(link)) {
+        pagesToVisit.push(link);
+      }
+    }
+
+    // Wait a bit to avoid overwhelming the server
+    await new Promise((resolve) => {
+      setTimeout(resolve, 500);
+    });
+
+  } catch (error) {
+    console.error(`  ❌ Erreur lors du téléchargement de ${pagePath}:`, error.message);
+  }
+}
+
+/**
+ * Main function
+ */
+async function main() {
+  console.log('🚀 Début du téléchargement des pages wiki...');
+  console.log(`   Base URL: ${BASE_URL}`);
+  console.log(`   Page de départ: ${START_PAGE}`);
+  console.log(`   Répertoire de sortie: ${OUTPUT_DIR}\n`);
+
+  while (pagesToVisit.length > 0) {
+    const page = pagesToVisit.shift();
+    await downloadPage(page);
+  }
+
+  console.log(`\n✅ Téléchargement terminé!`);
+  console.log(`   Pages téléchargées: ${visitedPages.size}`);
+  console.log(`   Fichiers sauvegardés dans: ${OUTPUT_DIR}`);
+
+  // Save a list of downloaded pages
+  const listPath = path.join(OUTPUT_DIR, '_pages-list.txt');
+  fs.writeFileSync(listPath, Array.from(visitedPages).sort().join('\n'));
+  console.log(`   Liste des pages: ${listPath}`);
+}
+
+main().catch(console.error);
@@ -0,0 +1,131 @@
+#!/usr/bin/env node
+
+import https from 'https';
+import http from 'http';
+import fs from 'fs';
+import path from 'path';
+import { URL } from 'url';
+
+const BASE_URL = 'https://corvanis.wiki'; // Origin prepended to every page path before requesting it
+const PAGES_LIST_FILE = './tools/wikiPagesList.txt'; // Text file read by main(): one page path per line
+const OUTPUT_DIR = './wiki-downloads'; // Directory receiving one .html file per downloaded page
+
+// Create the output directory (including missing parents) when it does not exist yet
+if (!fs.existsSync(OUTPUT_DIR)) {
+  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
+}
+
+/**
+ * Make an HTTP(S) request
+ * @param {string} url The URL to request
+ * @returns {Promise<{data: string, statusCode: number}>} The response data and status code
+ */
+function makeRequest(url) {
+  return new Promise((resolve, reject) => {
+    const urlObj = new URL(url);
+    const protocol = urlObj.protocol === 'https:' ? https : http;
+
+    protocol
+      .get(url, (res) => {
+        let data = '';
+
+        res.on('data', (chunk) => {
+          data += chunk;
+        });
+
+        res.on('end', () => {
+          resolve({ data, statusCode: res.statusCode });
+        });
+      })
+      .on('error', (err) => {
+        reject(err);
+      });
+  });
+}
+
+/**
+ * Sanitize filename
+ * @param {string} str The string to sanitize
+ * @returns {string} Sanitized filename
+ */
+function sanitizeFilename(str) {
+  return str
+    .replace(/^\//, '') // Remove leading slash
+    .replace(/[^a-z0-9_\-+()]/gi, '_')
+    .replace(/_+/g, '_')
+    .replace(/^_|_$/g, '');
+}
+
+/**
+ * Download a page
+ * @param {string} pagePath The path of the page to download
+ * @returns {Promise<boolean>} Success status
+ */
+async function downloadPage(pagePath) {
+  console.log(`Téléchargement: ${pagePath}`);
+
+  try {
+    const url = BASE_URL + pagePath;
+    const { data, statusCode } = await makeRequest(url);
+
+    if (statusCode !== 200) {
+      console.error(`  ❌ Erreur ${statusCode} pour ${pagePath}`);
+      return false;
+    }
+
+    // Save the HTML file
+    const filename = `${sanitizeFilename(pagePath)}.html`;
+    const filepath = path.join(OUTPUT_DIR, filename);
+    fs.writeFileSync(filepath, data, 'utf-8');
+    console.log(`  ✓ Sauvegardé: ${filename}`);
+
+    // Wait a bit to avoid overwhelming the server
+    await new Promise((resolve) => {
+      setTimeout(resolve, 500);
+    });
+
+    return true;
+  } catch (error) {
+    console.error(`  ❌ Erreur lors du téléchargement de ${pagePath}:`, error.message);
+    return false;
+  }
+}
+
+/**
+ * Main function
+ */
+async function main() {
+  console.log('🚀 Début du téléchargement des pages wiki...');
+  console.log(`   Base URL: ${BASE_URL}`);
+  console.log(`   Liste des pages: ${PAGES_LIST_FILE}`);
+  console.log(`   Répertoire de sortie: ${OUTPUT_DIR}\n`);
+
+  // Read the list of pages
+  let pages = [];
+  if (fs.existsSync(PAGES_LIST_FILE)) {
+    const content = fs.readFileSync(PAGES_LIST_FILE, 'utf-8');
+    pages = content
+      .split('\n')
+      .map((line) => line.trim())
+      .filter((line) => line && !line.startsWith('#'));
+  } else {
+    console.error(`❌ Fichier de liste introuvable: ${PAGES_LIST_FILE}`);
+    process.exit(1);
+  }
+
+  console.log(`📄 ${pages.length} page(s) à télécharger\n`);
+
+  let successCount = 0;
+  for (const page of pages) {
+    const success = await downloadPage(page);
+    if (success) {
+      successCount += 1;
+    }
+  }
+
+  console.log(`\n✅ Téléchargement terminé!`);
+  console.log(`   Pages réussies: ${successCount}/${pages.length}`);
+  console.log(`   Fichiers sauvegardés dans: ${OUTPUT_DIR}`);
+}
+
+main().catch(console.error);
@@ -0,0 +1,171 @@
+#!/usr/bin/env node
+
+import puppeteer from 'puppeteer';
+import TurndownService from 'turndown';
+import fs from 'fs';
+import path from 'path';
+
+const BASE_URL = 'https://corvanis.wiki'; // Origin prepended to every page path before navigation
+const PAGES_LIST_FILE = './tools/wikiPagesList.txt'; // Text file read by main(): one page path per line
+const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md'; // Combined Markdown document written by main()
+
+// Create the output directory (including missing parents) when it does not exist yet
+const outputDir = path.dirname(OUTPUT_FILE);
+if (!fs.existsSync(outputDir)) {
+  fs.mkdirSync(outputDir, { recursive: true });
+}
+
+// Initialize Turndown for HTML to Markdown conversion
+const turndownService = new TurndownService({
+  headingStyle: 'atx', // ATX heading style
+  codeBlockStyle: 'fenced', // fenced code block style
+});
+
+/**
+ * Wait for a specific time
+ * @param {number} ms Milliseconds to wait
+ * @returns {Promise<void>}
+ */
+function wait(ms) {
+  return new Promise((resolve) => {
+    setTimeout(resolve, ms);
+  });
+}
+
+/**
+ * Sanitize page title for heading
+ * @param {string} path Page path
+ * @returns {string} Clean title
+ */
+function getPageTitle(path) {
+  return path
+    .replace(/^\/Prism\+\(Testing\)\/Prism\/?/, '')
+    .replace(/\+/g, ' ')
+    .trim() || 'Prism Home';
+}
+
+/**
+ * Scrape a page and convert to Markdown
+ * @param {object} page Puppeteer page object
+ * @param {string} pagePath Page path to scrape
+ * @returns {Promise<string>} Markdown content
+ */
+async function scrapePage(page, pagePath) {
+  const url = BASE_URL + pagePath;
+  console.log(`📄 Scraping: ${pagePath}`);
+
+  try {
+    // Navigate to the page
+    await page.goto(url, {
+      waitUntil: 'networkidle0',
+      timeout: 30000,
+    });
+
+    // Wait for the main content to load
+    await page.waitForSelector('.published-container', { timeout: 10000 });
+
+    // Wait a bit more for dynamic content
+    await wait(2000);
+
+    // Extract the main content
+    const content = await page.evaluate(() => {
+      const container = document.querySelector('.published-container');
+      if (!container) {
+        return null;
+      }
+
+      // Remove navigation elements
+      const nav = container.querySelector('.site-body-left-column');
+      if (nav) {
+        nav.remove();
+      }
+
+      // Get the main content area
+      const mainContent = container.querySelector('.site-body-center-column');
+      return mainContent ? mainContent.innerHTML : container.innerHTML;
+    });
+
+    if (!content) {
+      console.log(`  ⚠️  Pas de contenu trouvé pour ${pagePath}`);
+      return '';
+    }
+
+    // Convert HTML to Markdown
+    const markdown = turndownService.turndown(content);
+    console.log(`  ✓ Converti (${markdown.length} caractères)`);
+
+    return markdown;
+  } catch (error) {
+    console.error(`  ❌ Erreur: ${error.message}`);
+    return '';
+  }
+}
+
+/**
+ * Main function
+ */
+async function main() {
+  console.log('🚀 Début du scraping du wiki Prism...\n');
+
+  // Read the list of pages
+  let pages = [];
+  if (fs.existsSync(PAGES_LIST_FILE)) {
+    const content = fs.readFileSync(PAGES_LIST_FILE, 'utf-8');
+    pages = content
+      .split('\n')
+      .map((line) => line.trim())
+      .filter((line) => line && !line.startsWith('#'));
+  } else {
+    console.error(`❌ Fichier de liste introuvable: ${PAGES_LIST_FILE}`);
+    process.exit(1);
+  }
+
+  console.log(`📋 ${pages.length} page(s) à scraper\n`);
+
+  // Launch browser
+  console.log('🌐 Lancement du navigateur...\n');
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: ['--no-sandbox', '--disable-setuid-sandbox'],
+  });
+
+  const page = await browser.newPage();
+  await page.setViewport({ width: 1280, height: 800 });
+
+  // Scrape all pages
+  let allMarkdown = '# Prism RPG - Wiki Complete\n\n';
+  allMarkdown += `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`;
+  allMarkdown += '---\n\n';
+
+  for (const pagePath of pages) {
+    const title = getPageTitle(pagePath);
+    const level = pagePath === '/Prism+(Testing)/Prism' ? '##' : '###';
+
+    allMarkdown += `${level} ${title}\n\n`;
+
+    const markdown = await scrapePage(page, pagePath);
+    if (markdown) {
+      allMarkdown += `${markdown}\n\n`;
+      allMarkdown += '---\n\n';
+    }
+
+    // Wait between requests
+    await wait(1000);
+  }
+
+  // Close browser
+  await browser.close();
+
+  // Save the combined Markdown file
+  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');
+
+  console.log(`\n✅ Scraping terminé!`);
+  console.log(`   Pages scrapées: ${pages.length}`);
+  console.log(`   Fichier généré: ${OUTPUT_FILE}`);
+  console.log(`   Taille: ${(allMarkdown.length / 1024).toFixed(2)} KB`);
+}
+
+main().catch((error) => {
+  console.error('❌ Erreur fatale:', error);
+  process.exit(1);
+});
@@ -0,0 +1,12 @@
+/Prism+(Testing)/Prism
+/Prism+(Testing)/Prism/Rules
+/Prism+(Testing)/Prism/Character+Creation
+/Prism+(Testing)/Prism/Combat
+/Prism+(Testing)/Prism/Equipment
+/Prism+(Testing)/Prism/Armor
+/Prism+(Testing)/Prism/Weapons
+/Prism+(Testing)/Prism/Magic
+/Prism+(Testing)/Prism/Spells
+/Prism+(Testing)/Prism/Miracles
+/Prism+(Testing)/Prism/Monsters
+/Prism+(Testing)/Prism/Bestiary