First adaptation pass
This commit is contained in:
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import puppeteer from 'puppeteer';
|
||||
import TurndownService from 'turndown';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
// Origin of the wiki; it is prepended to every page path before navigation.
const BASE_URL = 'https://corvanis.wiki';
// Text file listing the page paths to scrape, one per line.
const PAGES_LIST_FILE = './tools/wikiPagesList.txt';
// Destination of the combined Markdown document.
const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md';

// Create output directory.
// `recursive: true` makes mkdirSync succeed when the directory already exists,
// so a separate existsSync() check (and its check-then-create race) is unnecessary.
const outputDir = path.dirname(OUTPUT_FILE);
fs.mkdirSync(outputDir, { recursive: true });
|
||||
|
||||
// Shared HTML-to-Markdown converter, configured for ATX headings and fenced code blocks.
const turndownOptions = {
  codeBlockStyle: 'fenced',
  headingStyle: 'atx',
};
const turndownService = new TurndownService(turndownOptions);
|
||||
|
||||
/**
 * Pause asynchronously for the given duration.
 * @param {number} ms Delay in milliseconds
 * @returns {Promise<void>} Promise that resolves once the timer fires
 */
function wait(ms) {
  const sleeper = new Promise((done) => {
    setTimeout(done, ms);
  });
  return sleeper;
}
|
||||
|
||||
/**
 * Build a human-readable heading title from a wiki page path.
 *
 * Strips the leading `/Prism+(Testing)/Prism` prefix (with an optional
 * trailing slash), turns every `+` into a space, decodes percent-escapes
 * and trims the result. An empty result becomes 'Prism Home'.
 *
 * @param {string} pagePath Page path
 * @returns {string} Clean title
 */
function getPageTitle(pagePath) {
  // The parameter is not named `path` so it does not shadow the imported `path` module.
  const spaced = pagePath
    .replace(/^\/Prism\+\(Testing\)\/Prism\/?/, '')
    .replace(/\+/g, ' ');

  // `+` is converted before decoding so an escaped plus (%2B) survives as a literal '+'.
  let decoded = spaced;
  try {
    decoded = decodeURIComponent(spaced);
  } catch {
    // Malformed escape sequence (e.g. a bare '%'): keep the undecoded text.
  }

  return decoded.trim() || 'Prism Home';
}
|
||||
|
||||
/**
 * Scrape a page and convert to Markdown.
 *
 * Best-effort: any failure (invalid URL, navigation error, selector timeout,
 * conversion error) is logged and reported as an empty string instead of
 * being thrown.
 *
 * @param {object} page Puppeteer page object
 * @param {string} pagePath Page path to scrape, resolved against BASE_URL
 * @returns {Promise<string>} Markdown content, or '' when nothing could be extracted
 */
async function scrapePage(page, pagePath) {
  console.log(`📄 Scraping: ${pagePath}`);

  try {
    // Resolve with the URL parser rather than string concatenation: it copes
    // with a missing leading slash and percent-encodes unsafe characters.
    // Built inside the try so a malformed path is handled like any other failure.
    const url = new URL(pagePath, BASE_URL).href;

    // Navigate to the page
    await page.goto(url, {
      waitUntil: 'networkidle0',
      timeout: 30000,
    });

    // Wait for the main content to load
    await page.waitForSelector('.published-container', { timeout: 10000 });

    // Wait a bit more for dynamic content
    await wait(2000);

    // Extract the main content. This callback is evaluated in the browser
    // page, so it must not reference anything from this module's scope.
    const content = await page.evaluate(() => {
      const container = document.querySelector('.published-container');
      if (!container) {
        return null;
      }

      // Remove navigation elements so they cannot end up in the fallback HTML
      const nav = container.querySelector('.site-body-left-column');
      if (nav) {
        nav.remove();
      }

      // Prefer the center column; fall back to the whole container
      const mainContent = container.querySelector('.site-body-center-column');
      return mainContent ? mainContent.innerHTML : container.innerHTML;
    });

    if (!content) {
      console.log(` ⚠️ Pas de contenu trouvé pour ${pagePath}`);
      return '';
    }

    // Convert HTML to Markdown
    const markdown = turndownService.turndown(content);
    console.log(` ✓ Converti (${markdown.length} caractères)`);

    return markdown;
  } catch (error) {
    console.error(` ❌ Erreur: ${error.message}`);
    return '';
  }
}
|
||||
|
||||
/**
 * Main function: scrape every page listed in PAGES_LIST_FILE and write the
 * combined Markdown document to OUTPUT_FILE.
 *
 * Exits the process with status 1 when the list file does not exist.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du scraping du wiki Prism...\n');

  // Read the list of pages
  if (!fs.existsSync(PAGES_LIST_FILE)) {
    console.error(`❌ Fichier de liste introuvable: ${PAGES_LIST_FILE}`);
    process.exit(1);
  }

  // One path per line; blank lines and lines starting with '#' are skipped.
  const pages = fs
    .readFileSync(PAGES_LIST_FILE, 'utf-8')
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line && !line.startsWith('#'));

  console.log(`📋 ${pages.length} page(s) à scraper\n`);

  // Launch browser
  console.log('🌐 Lancement du navigateur...\n');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  let allMarkdown = '# Prism RPG - Wiki Complete\n\n';
  allMarkdown += `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`;
  allMarkdown += '---\n\n';

  // Number of pages that actually produced Markdown.
  let scrapedCount = 0;

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });

    // Scrape all pages sequentially, reusing the same tab
    for (const pagePath of pages) {
      const title = getPageTitle(pagePath);
      // The wiki root gets a level-2 heading, every other page a level-3 one.
      const level = pagePath === '/Prism+(Testing)/Prism' ? '##' : '###';

      // The heading is written before scraping, so it is present even when
      // the page yields no content.
      allMarkdown += `${level} ${title}\n\n`;

      const markdown = await scrapePage(page, pagePath);
      if (markdown) {
        allMarkdown += `${markdown}\n\n`;
        allMarkdown += '---\n\n';
        scrapedCount += 1;
      }

      // Wait between requests
      await wait(1000);
    }
  } finally {
    // Close browser even when something above throws, so no browser
    // process is left running.
    await browser.close();
  }

  // Save the combined Markdown file
  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');

  console.log(`\n✅ Scraping terminé!`);
  // Report successes out of the total rather than the list length alone.
  console.log(` Pages scrapées: ${scrapedCount}/${pages.length}`);
  console.log(` Fichier généré: ${OUTPUT_FILE}`);
  console.log(` Taille: ${(allMarkdown.length / 1024).toFixed(2)} KB`);
}
|
||||
|
||||
// Script entry point: run main() and, if it rejects, log the error and exit with status 1.
function reportFatalError(fatalError) {
  console.error('❌ Erreur fatale:', fatalError);
  process.exit(1);
}

main().catch(reportFatalError);
|
||||
Reference in New Issue
Block a user