#!/usr/bin/env node

/**
 * Crawls the "Prism (Testing)" pages of the wiki at BASE_URL with Puppeteer,
 * converts each page to Markdown with Turndown, and writes one combined
 * Markdown document plus a plain-text list of every path that was visited.
 */

import puppeteer from 'puppeteer';
import TurndownService from 'turndown';
import fs from 'fs';
import path from 'path';

const BASE_URL = 'https://corvanis.wiki';
const START_PAGE = '/Prism+(Testing)/Prism';
const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md';
const MAX_PAGES = 500; // Upper bound on the number of pages visited in one run

// Path fragment a link must contain (after normalization) to be crawled.
const PRISM_SECTION = 'Prism+(Testing)';

// Create the output directory up front so the final writes cannot fail on a
// missing folder.
const outputDir = path.dirname(OUTPUT_FILE);
if (!fs.existsSync(outputDir)) {
  fs.mkdirSync(outputDir, { recursive: true });
}

// HTML -> Markdown converter shared by every page.
const turndownService = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
});

// Crawl state: every path handed to scrapePage, and the converted pages.
const visitedUrls = new Set();
const pagesContent = [];

/**
 * Wait for a specific time.
 * @param {number} ms Milliseconds to wait
 * @returns {Promise<void>} Resolves once the delay has elapsed
 */
function wait(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Rewrite percent-encoded parentheses to their literal form so that the same
 * page is not tracked under two spellings ("%28Testing%29" vs "(Testing)").
 * START_PAGE uses the literal form, so that is the canonical one.
 * @param {string} urlPath Page path
 * @returns {string} Path with literal parentheses
 */
function normalizeWikiPath(urlPath) {
  return urlPath.replace(/%28/g, '(').replace(/%29/g, ')');
}

/**
 * Extract Prism-related links from the page.
 * @param {object} page Puppeteer page object
 * @returns {Promise<string[]>} Unique, normalized paths inside the Prism
 *   (Testing) section, in first-seen order
 */
async function extractPrismLinks(page) {
  // This callback runs inside the browser; only plain data comes back to Node.
  const pathnames = await page.evaluate(() =>
    Array.from(document.querySelectorAll('a[href*="Prism"]'), (anchor) => {
      try {
        // Resolve relative hrefs against the current page and keep the path only.
        return new URL(anchor.getAttribute('href'), window.location.href).pathname;
      } catch {
        // A single malformed href must not discard every other link on the page.
        return null;
      }
    }),
  );

  const prismPaths = pathnames
    .filter((pathname) => pathname !== null)
    .map(normalizeWikiPath)
    .filter((pathname) => pathname.includes(PRISM_SECTION));

  return [...new Set(prismPaths)];
}

/**
 * Build a human-readable title from the last segment of a page path.
 * @param {string} urlPath Page path
 * @returns {string} Clean title
 */
function getPageTitle(urlPath) {
  // Keep only the last path segment; "+" stands for a space in these paths and
  // must be replaced before decoding so an encoded "%2B" stays a literal plus.
  const lastSegment = urlPath.replace(/.*\/([^/]+)$/, '$1').replace(/\+/g, ' ');

  let decoded;
  try {
    decoded = decodeURIComponent(lastSegment);
  } catch (error) {
    if (!(error instanceof URIError)) {
      throw error;
    }
    // Malformed escape sequence: decode only the parentheses and keep the rest.
    decoded = lastSegment.replace(/%28/g, '(').replace(/%29/g, ')');
  }

  return decoded.replace(/_/g, ' ').trim();
}

/**
 * Get heading level based on URL depth.
 * @param {string} urlPath Page path
 * @returns {string} Markdown heading prefix: one "#" more than the number of
 *   non-empty path segments, capped at six
 */
function getHeadingLevel(urlPath) {
  const depth = urlPath.split('/').filter((segment) => segment).length;
  return '#'.repeat(Math.min(depth + 1, 6));
}

/**
 * Scrape a page and convert it to Markdown.
 *
 * Returns empty content and no links when the path was already visited, when
 * the server answers 404, or when navigation/extraction throws. Returns empty
 * content but the discovered links when the extracted HTML is missing or
 * shorter than 50 characters.
 *
 * @param {object} page Puppeteer page object
 * @param {string} pagePath Page path to scrape
 * @returns {Promise<{content: string, links: string[]}>} Markdown content and found links
 */
async function scrapePage(page, pagePath) {
  if (visitedUrls.has(pagePath)) {
    return { content: '', links: [] };
  }
  visitedUrls.add(pagePath);

  const url = BASE_URL + pagePath;
  console.log(`📄 Scraping [${visitedUrls.size}]: ${pagePath}`);

  try {
    const response = await page.goto(url, {
      waitUntil: 'networkidle0',
      timeout: 30000,
    });

    // page.goto may resolve to null, so guard before reading the status.
    if (response && response.status() === 404) {
      console.log(` ⚠ Page non trouvée (404)`);
      return { content: '', links: [] };
    }

    // Wait for the main container, then a little longer for dynamic content.
    await page.waitForSelector('.published-container', { timeout: 10000 });
    await wait(2000);

    const links = await extractPrismLinks(page);

    const html = await page.evaluate(() => {
      // Content containers, tried in order; the first match wins.
      const contentSelectors = [
        '.markdown-preview-view',
        '.site-body-center-column',
        '.published-container',
      ];
      let mainContent = null;
      for (const selector of contentSelectors) {
        mainContent = document.querySelector(selector);
        if (mainContent) {
          break;
        }
      }
      if (!mainContent) {
        return null;
      }

      // Work on a clone so the live page is left untouched.
      const clone = mainContent.cloneNode(true);

      // Navigation and UI elements that must not end up in the Markdown.
      const removeSelectors = [
        '.site-body-left-column',
        '.site-body-right-column',
        '.tree-item-self',
        '.search-input-container',
        '.graph-view',
        'nav',
        '.frontmatter-container',
      ];
      removeSelectors.forEach((selector) => {
        clone.querySelectorAll(selector).forEach((element) => {
          element.remove();
        });
      });

      return clone.innerHTML;
    });

    if (!html || html.trim().length < 50) {
      console.log(` ⚠ Contenu vide ou trop court`);
      return { content: '', links };
    }

    const markdown = turndownService.turndown(html);
    console.log(` ✓ Converti (${markdown.length} caractères, ${links.length} liens)`);

    return { content: markdown, links };
  } catch (error) {
    // Best effort: a failing page is logged and skipped, the crawl goes on.
    console.error(` ❌ Erreur: ${error.message}`);
    return { content: '', links: [] };
  }
}

/**
 * Main function: crawl breadth-first from START_PAGE, then write the combined
 * Markdown file and the list of visited paths.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du scraping automatique du wiki Prism...\n');

  console.log('🌐 Lancement du navigateur...\n');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });

    // FIFO queue of paths still to visit, plus a set of every path ever queued
    // (visited or pending) so membership checks stay O(1).
    const toVisit = [START_PAGE];
    const queued = new Set(toVisit);

    while (toVisit.length > 0 && visitedUrls.size < MAX_PAGES) {
      const currentPath = toVisit.shift();
      const { content, links } = await scrapePage(page, currentPath);

      if (content) {
        pagesContent.push({
          path: currentPath,
          title: getPageTitle(currentPath),
          level: getHeadingLevel(currentPath),
          content,
        });
      }

      for (const link of links) {
        if (!queued.has(link)) {
          queued.add(link);
          toVisit.push(link);
        }
      }

      // Pause between requests.
      await wait(1000);
    }
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }

  // Order pages by path depth, then alphabetically, for a readable document.
  const sortedPages = [...pagesContent].sort((a, b) => {
    const depthA = a.path.split('/').length;
    const depthB = b.path.split('/').length;
    if (depthA !== depthB) {
      return depthA - depthB;
    }
    return a.path.localeCompare(b.path);
  });

  const header = [
    '# Prism RPG - Wiki Complete\n\n',
    `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`,
    `*${sortedPages.length} pages crawled*\n\n`,
    '---\n\n',
  ].join('');

  const sections = sortedPages.map(
    (pageData) =>
      `${pageData.level} ${pageData.title}\n\n` +
      `*Source: ${BASE_URL}${pageData.path}*\n\n` +
      `${pageData.content}\n\n` +
      '---\n\n',
  );

  const allMarkdown = header + sections.join('');

  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');

  // Report the size in bytes as written to disk, not in UTF-16 code units.
  const sizeInKb = (Buffer.byteLength(allMarkdown, 'utf-8') / 1024).toFixed(2);

  console.log(`\n✅ Scraping terminé!`);
  console.log(` Pages visitées: ${visitedUrls.size}`);
  console.log(` Pages avec contenu: ${pagesContent.length}`);
  console.log(` Fichier généré: ${OUTPUT_FILE}`);
  console.log(` Taille: ${sizeInKb} KB`);

  // Save a list of crawled pages.
  const listPath = path.join(outputDir, 'pages-crawled.txt');
  fs.writeFileSync(listPath, Array.from(visitedUrls).sort().join('\n'));
  console.log(` Liste des pages: ${listPath}`);
}

main().catch((error) => {
  console.error('❌ Erreur fatale:', error);
  process.exit(1);
});