278 lines
7.5 KiB
JavaScript
278 lines
7.5 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
import puppeteer from 'puppeteer';
|
|
import TurndownService from 'turndown';
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
|
|
// --- Crawl configuration ---------------------------------------------------

// Origin of the wiki; page paths are appended to it to build absolute URLs.
const BASE_URL = 'https://corvanis.wiki';

// Path (relative to BASE_URL) of the first page put in the crawl queue.
const START_PAGE = '/Prism+(Testing)/Prism';

// Destination of the combined Markdown document.
const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md';

// Upper bound on the number of pages visited in a single run.
const MAX_PAGES = 500;

// Make sure the output directory exists before anything is written to it.
// With `recursive: true`, mkdirSync does nothing when the directory already
// exists, so a separate existence check is unnecessary and would only add a
// check-then-create race.
const outputDir = path.dirname(OUTPUT_FILE);
fs.mkdirSync(outputDir, { recursive: true });
|
|
|
|
// Crawl state shared by the functions below: the page paths that have
// already been requested, and the records of pages that produced content.
const pagesContent = [];
const visitedUrls = new Set();

// HTML -> Markdown converter, configured for ATX ("#") headings and fenced
// code blocks.
const turndownService = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
|
|
|
/**
 * Pause for a given duration.
 * @param {number} ms Delay in milliseconds
 * @returns {Promise<void>} Promise that resolves once the timer fires
 */
function wait(ms) {
  return new Promise((done) => setTimeout(done, ms));
}
|
|
|
|
/**
 * Collect the "Prism (Testing)" page paths linked from the current page.
 *
 * The callback is handed to `page.evaluate`, so it must stay self-contained:
 * it may only use the page's own globals (`document`, `window`), not
 * anything from this module.
 *
 * Returned paths are canonicalised to the literal-parenthesis spelling
 * (`Prism+(Testing)`), so the `%28`/`%29` spelling of the same page is not
 * reported as a second, distinct link.
 *
 * @param {object} page Puppeteer page object
 * @returns {Promise<string[]>} De-duplicated URL pathnames
 */
async function extractPrismLinks(page) {
  return page.evaluate(() => {
    const found = new Set();

    for (const anchor of document.querySelectorAll('a[href*="Prism"]')) {
      const href = anchor.getAttribute('href');
      if (!href || !href.includes('Prism')) {
        continue;
      }

      // A single malformed href must not abort the extraction for the
      // whole page, so unparsable links are skipped.
      let url;
      try {
        url = new URL(href, window.location.href);
      } catch {
        continue;
      }

      // Canonicalise percent-encoded parentheses before filtering.
      const pathname = url.pathname.replace(/%28/g, '(').replace(/%29/g, ')');

      // Only keep Prism Testing paths.
      if (pathname.includes('Prism+(Testing)')) {
        found.add(pathname);
      }
    }

    return [...found];
  });
}
|
|
|
|
/**
 * Derive a human-readable title from a page path.
 *
 * Takes the last non-empty path segment (so a trailing slash does not hide
 * the page name), turns `+` and `_` into spaces and decodes percent-escapes.
 * If the segment contains a malformed escape sequence, only the `%28`/`%29`
 * parentheses are decoded and the rest is left untouched.
 *
 * @param {string} urlPath Page path
 * @returns {string} Clean title
 */
function getPageTitle(urlPath) {
  const segments = urlPath.split('/').filter(Boolean);
  const lastSegment = segments.length > 0 ? segments[segments.length - 1] : '';

  // `+` must become a space before decoding, otherwise an escaped literal
  // plus sign (%2B) would be turned into a space as well.
  const spaced = lastSegment.replace(/\+/g, ' ');

  let decoded;
  try {
    decoded = decodeURIComponent(spaced);
  } catch {
    // decodeURIComponent throws URIError on malformed escapes (e.g. "100%").
    decoded = spaced.replace(/%28/g, '(').replace(/%29/g, ')');
  }

  return decoded.replace(/_/g, ' ').trim();
}
|
|
|
|
/**
 * Map a page path to a Markdown heading prefix.
 *
 * The level is one more than the number of non-empty path segments,
 * capped at 6.
 *
 * @param {string} urlPath Page path
 * @returns {string} Run of `#` characters for the heading
 */
function getHeadingLevel(urlPath) {
  const MAX_LEVEL = 6;

  let segmentCount = 0;
  for (const segment of urlPath.split('/')) {
    if (segment) {
      segmentCount += 1;
    }
  }

  const level = segmentCount + 1 > MAX_LEVEL ? MAX_LEVEL : segmentCount + 1;
  return '#'.repeat(level);
}
|
|
|
|
/**
 * Load one wiki page, extract its main content and convert it to Markdown.
 *
 * A path already recorded in `visitedUrls` is skipped; otherwise it is
 * recorded before the request is made. Errors raised while loading or
 * extracting are logged and turned into an empty result instead of being
 * rethrown.
 *
 * @param {object} page Puppeteer page object
 * @param {string} pagePath Page path to scrape
 * @returns {Promise<{content: string, links: string[]}>} Markdown content and found links
 */
async function scrapePage(page, pagePath) {
  const emptyResult = () => ({ content: '', links: [] });

  if (visitedUrls.has(pagePath)) {
    return emptyResult();
  }
  visitedUrls.add(pagePath);

  const targetUrl = `${BASE_URL}${pagePath}`;
  console.log(`📄 Scraping [${visitedUrls.size}]: ${pagePath}`);

  try {
    const response = await page.goto(targetUrl, { waitUntil: 'networkidle0', timeout: 30000 });

    // Missing pages are skipped.
    if (response.status() === 404) {
      console.log(` ⚠️ Page non trouvée (404)`);
      return emptyResult();
    }

    // Wait for the main container, then pause 2 s for dynamic content.
    await page.waitForSelector('.published-container', { timeout: 10000 });
    await wait(2000);

    // Links are gathered before the content check so that a page with no
    // usable content still contributes its outgoing links.
    const links = await extractPrismLinks(page);

    const html = await page.evaluate(() => {
      // Content containers, tried in this order.
      const candidateSelectors = [
        '.markdown-preview-view',
        '.site-body-center-column',
        '.published-container',
      ];

      let root = null;
      for (const selector of candidateSelectors) {
        root = document.querySelector(selector);
        if (root) {
          break;
        }
      }
      if (!root) {
        return null;
      }

      // Work on a copy so the live page is left untouched.
      const copy = root.cloneNode(true);

      // Navigation and UI elements that are removed from the copy.
      const chromeSelectors = [
        '.site-body-left-column',
        '.site-body-right-column',
        '.tree-item-self',
        '.search-input-container',
        '.graph-view',
        'nav',
        '.frontmatter-container',
      ];
      for (const selector of chromeSelectors) {
        for (const node of copy.querySelectorAll(selector)) {
          node.remove();
        }
      }

      return copy.innerHTML;
    });

    const isTooShort = !html || html.trim().length < 50;
    if (isTooShort) {
      console.log(` ⚠️ Contenu vide ou trop court`);
      return { content: '', links };
    }

    const markdown = turndownService.turndown(html);
    console.log(` ✓ Converti (${markdown.length} caractères, ${links.length} liens)`);
    return { content: markdown, links };
  } catch (error) {
    console.error(` ❌ Erreur: ${error.message}`);
    return emptyResult();
  }
}
|
|
|
|
/**
 * Crawl the wiki starting at START_PAGE and write the combined Markdown
 * document plus a list of the visited paths.
 *
 * Pages are taken from a first-in-first-out queue until it is empty or
 * MAX_PAGES paths have been visited. The browser is closed in a `finally`
 * block so it is not left running when the crawl throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du scraping automatique du wiki Prism...\n');

  // Launch browser
  console.log('🌐 Lancement du navigateur...\n');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });

    // Queue of pages to visit, plus a set of everything ever queued so the
    // duplicate check is O(1) instead of a scan of the queue.
    const toVisit = [START_PAGE];
    const queued = new Set(toVisit);

    while (toVisit.length > 0 && visitedUrls.size < MAX_PAGES) {
      const currentPath = toVisit.shift();
      const { content, links } = await scrapePage(page, currentPath);

      if (content) {
        pagesContent.push({
          path: currentPath,
          title: getPageTitle(currentPath),
          level: getHeadingLevel(currentPath),
          content,
        });
      }

      // Add newly discovered links to the queue.
      for (const link of links) {
        if (!visitedUrls.has(link) && !queued.has(link)) {
          queued.add(link);
          toVisit.push(link);
        }
      }

      // Pause between requests.
      await wait(1000);
    }
  } finally {
    // Always release the browser, even when the crawl failed.
    await browser.close();
  }

  // Build the final Markdown document
  let allMarkdown = '# Prism RPG - Wiki Complete\n\n';
  allMarkdown += `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`;
  allMarkdown += `*${pagesContent.length} pages crawled*\n\n`;
  allMarkdown += '---\n\n';

  // Shallower pages first; ties are broken alphabetically by path.
  pagesContent.sort((a, b) => {
    const depthA = a.path.split('/').length;
    const depthB = b.path.split('/').length;
    if (depthA !== depthB) {
      return depthA - depthB;
    }
    return a.path.localeCompare(b.path);
  });

  for (const pageData of pagesContent) {
    allMarkdown += `${pageData.level} ${pageData.title}\n\n`;
    allMarkdown += `*Source: ${BASE_URL}${pageData.path}*\n\n`;
    allMarkdown += `${pageData.content}\n\n`;
    allMarkdown += '---\n\n';
  }

  // Save the combined Markdown file
  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');

  // Report the encoded size: string length counts UTF-16 code units, which
  // differs from the number of bytes written for non-ASCII text.
  const sizeInBytes = Buffer.byteLength(allMarkdown, 'utf-8');

  console.log(`\n✅ Scraping terminé!`);
  console.log(` Pages visitées: ${visitedUrls.size}`);
  console.log(` Pages avec contenu: ${pagesContent.length}`);
  console.log(` Fichier généré: ${OUTPUT_FILE}`);
  console.log(` Taille: ${(sizeInBytes / 1024).toFixed(2)} KB`);

  // Save a list of crawled pages
  const listPath = path.join(outputDir, 'pages-crawled.txt');
  fs.writeFileSync(listPath, Array.from(visitedUrls).sort().join('\n'));
  console.log(` Liste des pages: ${listPath}`);
}
|
|
|
|
// Script entry point: run the crawl; if it rejects, log the error and exit
// with status 1.
function reportFatalError(error) {
  console.error('❌ Erreur fatale:', error);
  process.exit(1);
}

main().catch(reportFatalError);
|