First adaptation pass
This commit is contained in:
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import puppeteer from 'puppeteer';
|
||||
import TurndownService from 'turndown';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
// --- Crawl settings ---------------------------------------------------------
const BASE_URL = 'https://corvanis.wiki';
const START_PAGE = '/Prism+(Testing)/Prism';
const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md';
const MAX_PAGES = 500; // Hard cap on how many pages a single run may visit

// The combined Markdown file is written at the end of the run; create its
// parent directory up front when it is missing.
const outputDir = path.dirname(OUTPUT_FILE);
if (fs.existsSync(outputDir) === false) {
  fs.mkdirSync(outputDir, { recursive: true });
}

// Crawl state shared by the functions below.
const visitedUrls = new Set(); // page paths already handed to the scraper
const pagesContent = []; // one record per page that produced Markdown

// HTML -> Markdown converter: "#"-style headings and fenced code blocks.
const turndownService = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
||||
|
||||
/**
 * Pause for the given duration.
 * @param {number} ms Delay in milliseconds
 * @returns {Promise<void>} Promise that resolves once the timer has fired
 */
function wait(ms) {
  return new Promise((done) => setTimeout(done, ms));
}
|
||||
|
||||
/**
 * Collect the paths of all links on the current page that point into the
 * "Prism (Testing)" section of the wiki.
 *
 * The callback handed to `page.evaluate` only uses `document`, `window` and
 * `URL`; it deliberately references nothing from this module's scope.
 *
 * @param {object} page Puppeteer page object
 * @returns {Promise<string[]>} De-duplicated URL paths (pathname only, parentheses always literal)
 */
async function extractPrismLinks(page) {
  return page.evaluate(() => {
    const found = new Set();

    for (const anchor of document.querySelectorAll('a[href*="Prism"]')) {
      const href = anchor.getAttribute('href');
      if (!href || !href.includes('Prism')) {
        continue;
      }

      let target;
      try {
        target = new URL(href, window.location.href);
      } catch {
        // A single malformed href must not discard every other link on the page
        continue;
      }

      // "%28Testing%29" and "(Testing)" name the same page; keep one spelling so
      // the crawler does not visit (and output) the page twice.
      const normalizedPath = target.pathname.replace(/%28/gi, '(').replace(/%29/gi, ')');

      // Only keep Prism Testing paths
      if (normalizedPath.includes('Prism+(Testing)')) {
        found.add(normalizedPath);
      }
    }

    return [...found];
  });
}
|
||||
|
||||
/**
 * Derive a human-readable title from a page path.
 *
 * Uses the last non-empty path segment, turns "+" and "_" into spaces and
 * decodes percent-escapes. If the segment contains a malformed escape, only
 * the encoded parentheses are decoded and the rest is left untouched.
 *
 * @param {string} urlPath Page path
 * @returns {string} Clean title
 */
function getPageTitle(urlPath) {
  const segments = urlPath.split('/').filter((segment) => segment !== '');
  // Paths without any usable segment (e.g. "" or "/") are passed through as-is
  const lastSegment = segments.length > 0 ? segments[segments.length - 1] : urlPath;

  // "+" stands for a space in these paths; convert it before decoding so an
  // encoded plus sign ("%2B") still ends up as a literal "+".
  const spaced = lastSegment.replace(/\+/g, ' ');

  let decoded;
  try {
    decoded = decodeURIComponent(spaced);
  } catch {
    // decodeURIComponent throws on malformed escapes; fall back to the minimal decoding
    decoded = spaced.replace(/%28/gi, '(').replace(/%29/gi, ')');
  }

  return decoded.replace(/_/g, ' ').trim();
}
|
||||
|
||||
/**
 * Pick the Markdown heading prefix for a page from how deep its path is.
 * A path with N non-empty segments gets N + 1 "#" characters, capped at 6.
 * @param {string} urlPath Page path
 * @returns {string} Markdown heading level
 */
function getHeadingLevel(urlPath) {
  let depth = 0;
  for (const segment of urlPath.split('/')) {
    if (segment) {
      depth += 1;
    }
  }

  const hashCount = depth + 1 > 6 ? 6 : depth + 1;
  return '#'.repeat(hashCount);
}
|
||||
|
||||
/**
 * Load one wiki page in the browser and convert its main content to Markdown.
 *
 * A path is recorded in `visitedUrls` before navigation, so each path is
 * attempted at most once per run. Every failure (navigation error, HTTP error
 * status, missing or too-short content) is logged and reported as empty
 * content instead of being thrown.
 *
 * @param {object} page Puppeteer page object
 * @param {string} pagePath Page path to scrape (appended to BASE_URL)
 * @returns {Promise<{content: string, links: string[]}>} Markdown content ('' when nothing usable was found) and found links
 */
async function scrapePage(page, pagePath) {
  if (visitedUrls.has(pagePath)) {
    return { content: '', links: [] };
  }

  visitedUrls.add(pagePath);
  const url = BASE_URL + pagePath;
  console.log(`📄 Scraping [${visitedUrls.size}]: ${pagePath}`);

  try {
    // Navigate to the page
    const response = await page.goto(url, {
      waitUntil: 'networkidle0',
      timeout: 30000,
    });

    // goto() may resolve with null instead of a response; only inspect the
    // status when there is one rather than crashing on `null.status()`.
    const status = response ? response.status() : null;
    if (status === 404) {
      console.log(` ⚠️ Page non trouvée (404)`);
      return { content: '', links: [] };
    }
    if (status !== null && status >= 400) {
      // Any other HTTP error page is not wiki content either
      console.log(` ⚠️ Erreur HTTP ${status}`);
      return { content: '', links: [] };
    }

    // Wait for the main content to load
    await page.waitForSelector('.published-container', { timeout: 10000 });

    // Wait a bit more for dynamic content
    await wait(2000);

    // Extract links for crawling
    const links = await extractPrismLinks(page);

    // Extract the main content (this callback runs inside the page)
    const content = await page.evaluate(() => {
      // Try the candidate containers in order and use the first one present
      const mainContent =
        document.querySelector('.markdown-preview-view') ||
        document.querySelector('.site-body-center-column') ||
        document.querySelector('.published-container');

      if (!mainContent) {
        return null;
      }

      // Clone to avoid modifying the page
      const clone = mainContent.cloneNode(true);

      // Remove navigation and UI elements
      const removeSelectors = [
        '.site-body-left-column',
        '.site-body-right-column',
        '.tree-item-self',
        '.search-input-container',
        '.graph-view',
        'nav',
        '.frontmatter-container',
      ];

      for (const selector of removeSelectors) {
        clone.querySelectorAll(selector).forEach((el) => {
          el.remove();
        });
      }

      return clone.innerHTML;
    });

    // Fewer than 50 characters of HTML is treated as "no real content"
    if (!content || content.trim().length < 50) {
      console.log(` ⚠️ Contenu vide ou trop court`);
      return { content: '', links };
    }

    // Convert HTML to Markdown
    const markdown = turndownService.turndown(content);
    console.log(` ✓ Converti (${markdown.length} caractères, ${links.length} liens)`);

    return { content: markdown, links };
  } catch (error) {
    console.error(` ❌ Erreur: ${error.message}`);
    return { content: '', links: [] };
  }
}
|
||||
|
||||
/**
 * Crawl the wiki breadth-first from START_PAGE, then write the combined
 * Markdown document and a list of every visited path.
 *
 * The crawl stops when the queue is empty or MAX_PAGES paths have been
 * visited. The browser is always closed, even when the crawl throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du scraping automatique du wiki Prism...\n');

  // Launch browser
  console.log('🌐 Lancement du navigateur...\n');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });

    // Queue of pages to visit, plus a set of everything ever queued so the
    // duplicate check does not have to scan the queue for every link.
    const toVisit = [START_PAGE];
    const queued = new Set(toVisit);

    // Crawl pages
    while (toVisit.length > 0 && visitedUrls.size < MAX_PAGES) {
      const currentPath = toVisit.shift();

      const { content, links } = await scrapePage(page, currentPath);

      if (content) {
        pagesContent.push({
          path: currentPath,
          title: getPageTitle(currentPath),
          level: getHeadingLevel(currentPath),
          content,
        });
      }

      // Add new links to queue
      for (const link of links) {
        if (!queued.has(link) && !visitedUrls.has(link)) {
          queued.add(link);
          toVisit.push(link);
        }
      }

      // Wait between requests
      await wait(1000);
    }
  } finally {
    // Do not leave a headless browser process behind when the crawl fails
    await browser.close();
  }

  // Build the final Markdown document
  let allMarkdown = '# Prism RPG - Wiki Complete\n\n';
  allMarkdown += `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`;
  allMarkdown += `*${pagesContent.length} pages crawled*\n\n`;
  allMarkdown += '---\n\n';

  // Sort pages by path depth (shallow first), then alphabetically within a depth
  pagesContent.sort((a, b) => {
    const depthA = a.path.split('/').length;
    const depthB = b.path.split('/').length;
    if (depthA !== depthB) {
      return depthA - depthB;
    }
    return a.path.localeCompare(b.path);
  });

  for (const pageData of pagesContent) {
    allMarkdown += `${pageData.level} ${pageData.title}\n\n`;
    allMarkdown += `*Source: ${BASE_URL}${pageData.path}*\n\n`;
    allMarkdown += `${pageData.content}\n\n`;
    allMarkdown += '---\n\n';
  }

  // Save the combined Markdown file
  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');

  console.log(`\n✅ Scraping terminé!`);
  console.log(` Pages visitées: ${visitedUrls.size}`);
  console.log(` Pages avec contenu: ${pagesContent.length}`);
  console.log(` Fichier généré: ${OUTPUT_FILE}`);
  console.log(` Taille: ${(allMarkdown.length / 1024).toFixed(2)} KB`);

  // Save a list of crawled pages (every visited path, including pages without content)
  const listPath = path.join(outputDir, 'pages-crawled.txt');
  fs.writeFileSync(listPath, Array.from(visitedUrls).sort().join('\n'));
  console.log(` Liste des pages: ${listPath}`);
}
|
||||
|
||||
// Entry point: run the crawl; report any unhandled failure and exit non-zero.
main().catch((fatalError) => {
  console.error('❌ Erreur fatale:', fatalError);
  process.exit(1);
});
|
||||
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import https from 'https';
|
||||
import http from 'http';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { URL } from 'url';
|
||||
|
||||
// --- Download settings ------------------------------------------------------
const BASE_URL = 'https://corvanis.wiki';
const START_PAGE = '/Prism+(Testing)/Prism';
const OUTPUT_DIR = './wiki-downloads';

// Known Prism pages structure - will be expanded as we find more.
// NOTE(review): nothing in the visible script reads this list yet.
const KNOWN_SECTIONS = [
  '',
  '/Rules',
  '/Character+Creation',
  '/Combat',
  '/Equipment',
  '/Magic',
  '/Spells',
  '/Miracles',
  '/Monsters',
  '/Bestiary',
];

// Downloaded HTML files are written here; create the directory when missing.
if (fs.existsSync(OUTPUT_DIR) === false) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}

// Crawl state: paths already attempted, and the queue seeded with the start page.
const visitedPages = new Set();
const pagesToVisit = [START_PAGE];
|
||||
|
||||
/**
 * Perform a GET request over HTTP or HTTPS and buffer the whole response body.
 *
 * The status code is handed back untouched; this function does not itself
 * treat any status as an error, and it contains no redirect handling.
 *
 * @param {string} url The URL to request
 * @returns {Promise<{data: string, statusCode: number}>} The response body (decoded as UTF-8) and status code
 */
function makeRequest(url) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url);
    const protocol = urlObj.protocol === 'https:' ? https : http;

    protocol
      .get(url, (res) => {
        // Let the stream decode UTF-8: appending raw Buffer chunks to a string
        // corrupts any multi-byte character that straddles two chunks.
        res.setEncoding('utf8');

        let data = '';

        res.on('data', (chunk) => {
          data += chunk;
        });

        res.on('end', () => {
          resolve({ data, statusCode: res.statusCode });
        });

        // A failure while reading the body must reject instead of hanging forever
        res.on('error', reject);
      })
      .on('error', reject);
  });
}
|
||||
|
||||
/**
 * Extract links to pages under "Prism+(Testing)/Prism" from raw HTML.
 *
 * Fragments and query strings are dropped. Percent-encoded parentheses
 * ("%28"/"%29") are accepted and rewritten to literal ones so both spellings
 * of a page yield the same path. Absolute http(s) links are reduced to their
 * pathname; other links that do not start with "/" are skipped.
 *
 * @param {string} html The HTML content to parse
 * @param {string} currentPath The current page path (currently unused; kept for the existing call signature)
 * @returns {string[]} Array of unique extracted link paths, in order of first appearance
 */
function extractLinks(html, currentPath) {
  const links = new Set();

  // Match href values containing Prism+(Testing)/Prism, with either literal
  // or percent-encoded parentheses.
  const linkRegex = /href=["']([^"']*Prism\+(?:\(|%28)Testing(?:\)|%29)\/Prism[^"']*)["']/gi;

  for (const match of html.matchAll(linkRegex)) {
    // Clean up the link: remove anchor, then query params, then unify the parentheses
    let link = match[1]
      .split('#')[0]
      .split('?')[0]
      .replace(/%28/gi, '(')
      .replace(/%29/gi, ')');

    // Make sure it starts with /
    if (!link.startsWith('/')) {
      if (!link.startsWith('http')) {
        continue;
      }
      try {
        link = new URL(link).pathname;
      } catch {
        continue;
      }
    }

    // Only keep links under Prism+(Testing)/Prism (the regex is case-insensitive, this check is not)
    if (link.includes('Prism+(Testing)/Prism')) {
      links.add(link);
    }
  }

  return Array.from(links);
}
|
||||
|
||||
/**
 * Turn an arbitrary string (here: a page path) into a safe file name.
 * Every character outside letters, digits, "_", "-", "+", "(" and ")" becomes
 * "_", runs of "_" collapse to one, and a leading/trailing "_" is removed.
 * @param {string} str The string to sanitize
 * @returns {string} Sanitized filename
 */
function sanitizeFilename(str) {
  const replaced = str.replace(/[^a-z0-9_\-+()]/gi, '_');
  const collapsed = replaced.replace(/_+/g, '_');
  return collapsed.replace(/^_|_$/g, '');
}
|
||||
|
||||
/**
 * Download one page, save its HTML under OUTPUT_DIR and queue the Prism links
 * it contains.
 *
 * A path is marked as visited before the request, so it is attempted only
 * once. Non-200 responses and request errors are logged, not thrown.
 *
 * @param {string} pagePath The path of the page to download (appended to BASE_URL)
 * @returns {Promise<void>}
 */
async function downloadPage(pagePath) {
  if (visitedPages.has(pagePath)) {
    return;
  }

  visitedPages.add(pagePath);
  console.log(`Téléchargement: ${pagePath}`);

  try {
    const url = BASE_URL + pagePath;
    const { data, statusCode } = await makeRequest(url);

    if (statusCode !== 200) {
      console.error(` ❌ Erreur ${statusCode} pour ${pagePath}`);
      return;
    }

    // Save the HTML file
    const filename = `${sanitizeFilename(pagePath)}.html`;
    const filepath = path.join(OUTPUT_DIR, filename);
    fs.writeFileSync(filepath, data, 'utf-8');
    console.log(` ✓ Sauvegardé: ${filename}`);

    // Extract and queue new links. Skip links that are already waiting in the
    // queue: otherwise every page re-queues the same links and the queue
    // fills up with duplicates.
    const links = extractLinks(data, pagePath);
    for (const link of links) {
      if (!visitedPages.has(link) && !pagesToVisit.includes(link)) {
        pagesToVisit.push(link);
      }
    }

    // Wait a bit to avoid overwhelming the server
    await new Promise((resolve) => {
      setTimeout(resolve, 500);
    });
  } catch (error) {
    console.error(` ❌ Erreur lors du téléchargement de ${pagePath}:`, error.message);
  }
}
|
||||
|
||||
/**
 * Drain the download queue one page at a time, then write the sorted list of
 * every attempted path next to the downloaded files.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du téléchargement des pages wiki...');
  console.log(` Base URL: ${BASE_URL}`);
  console.log(` Page de départ: ${START_PAGE}`);
  console.log(` Répertoire de sortie: ${OUTPUT_DIR}\n`);

  // downloadPage() may append to pagesToVisit, so re-check the length each round
  while (pagesToVisit.length !== 0) {
    const nextPage = pagesToVisit.shift();
    await downloadPage(nextPage);
  }

  console.log(`\n✅ Téléchargement terminé!`);
  console.log(` Pages téléchargées: ${visitedPages.size}`);
  console.log(` Fichiers sauvegardés dans: ${OUTPUT_DIR}`);

  // Persist the attempted paths, sorted, one per line
  const sortedPaths = [...visitedPages].sort();
  const listPath = path.join(OUTPUT_DIR, '_pages-list.txt');
  fs.writeFileSync(listPath, sortedPaths.join('\n'));
  console.log(` Liste des pages: ${listPath}`);
}
|
||||
|
||||
// Entry point. A fatal error is printed and also reflected in the exit code,
// so callers (shell scripts, CI) can detect the failure.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import https from 'https';
|
||||
import http from 'http';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { URL } from 'url';
|
||||
|
||||
// --- Download settings ------------------------------------------------------
const BASE_URL = 'https://corvanis.wiki';
const PAGES_LIST_FILE = './tools/wikiPagesList.txt'; // one page path per line
const OUTPUT_DIR = './wiki-downloads';

// Downloaded HTML files are written here; create the directory when missing.
if (fs.existsSync(OUTPUT_DIR) === false) {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
|
||||
|
||||
/**
 * Perform a GET request over HTTP or HTTPS and collect the full response body.
 *
 * The status code is returned as received; deciding what counts as a failure
 * is left to the caller. No redirect handling is done here.
 *
 * @param {string} url The URL to request
 * @returns {Promise<{data: string, statusCode: number}>} The response body (decoded as UTF-8) and status code
 */
function makeRequest(url) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url);
    const protocol = urlObj.protocol === 'https:' ? https : http;

    const request = protocol.get(url, (res) => {
      // Decode at stream level: concatenating raw Buffer chunks onto a string
      // breaks multi-byte UTF-8 characters that span a chunk boundary.
      res.setEncoding('utf8');

      let data = '';

      res.on('data', (chunk) => {
        data += chunk;
      });

      res.on('end', () => {
        resolve({ data, statusCode: res.statusCode });
      });

      // Surface body-read failures instead of leaving the promise pending
      res.on('error', reject);
    });

    request.on('error', reject);
  });
}
|
||||
|
||||
/**
 * Turn a page path into a safe file name.
 * A leading "/" is dropped, every character outside letters, digits, "_",
 * "-", "+", "(" and ")" becomes "_", runs of "_" collapse to one, and a
 * leading/trailing "_" is removed.
 * @param {string} str The string to sanitize
 * @returns {string} Sanitized filename
 */
function sanitizeFilename(str) {
  const withoutLeadingSlash = str.startsWith('/') ? str.slice(1) : str;
  const underscored = withoutLeadingSlash.replace(/[^a-z0-9_\-+()]/gi, '_');
  const collapsed = underscored.replace(/_+/g, '_');
  return collapsed.replace(/^_|_$/g, '');
}
|
||||
|
||||
/**
 * Fetch a single page and store its HTML in OUTPUT_DIR.
 * Failures (non-200 status or a request/write error) are logged and reported
 * through the return value rather than thrown.
 * @param {string} pagePath The path of the page to download (appended to BASE_URL)
 * @returns {Promise<boolean>} true when the page was saved, false otherwise
 */
async function downloadPage(pagePath) {
  console.log(`Téléchargement: ${pagePath}`);

  try {
    const response = await makeRequest(BASE_URL + pagePath);

    if (response.statusCode !== 200) {
      console.error(` ❌ Erreur ${response.statusCode} pour ${pagePath}`);
      return false;
    }

    // Write the body to "<sanitized path>.html"
    const filename = `${sanitizeFilename(pagePath)}.html`;
    fs.writeFileSync(path.join(OUTPUT_DIR, filename), response.data, 'utf-8');
    console.log(` ✓ Sauvegardé: ${filename}`);

    // Short pause after a successful download to go easy on the server
    await new Promise((resolve) => setTimeout(resolve, 500));

    return true;
  } catch (error) {
    console.error(` ❌ Erreur lors du téléchargement de ${pagePath}:`, error.message);
    return false;
  }
}
|
||||
|
||||
/**
 * Read the page list, download every listed page in order and print a summary.
 * Exits the process with status 1 when the list file does not exist.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du téléchargement des pages wiki...');
  console.log(` Base URL: ${BASE_URL}`);
  console.log(` Liste des pages: ${PAGES_LIST_FILE}`);
  console.log(` Répertoire de sortie: ${OUTPUT_DIR}\n`);

  // Load the list: one path per line, blank lines and "#" comments ignored
  const listExists = fs.existsSync(PAGES_LIST_FILE);
  if (!listExists) {
    console.error(`❌ Fichier de liste introuvable: ${PAGES_LIST_FILE}`);
    process.exit(1);
  }

  const pages = listExists
    ? fs
        .readFileSync(PAGES_LIST_FILE, 'utf-8')
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line && !line.startsWith('#'))
    : [];

  console.log(`📄 ${pages.length} page(s) à télécharger\n`);

  // Download sequentially, counting the pages that were actually saved
  let successCount = 0;
  for (const pagePath of pages) {
    if (await downloadPage(pagePath)) {
      successCount += 1;
    }
  }

  console.log(`\n✅ Téléchargement terminé!`);
  console.log(` Pages réussies: ${successCount}/${pages.length}`);
  console.log(` Fichiers sauvegardés dans: ${OUTPUT_DIR}`);
}
|
||||
|
||||
// Entry point. A fatal error is printed and also reflected in the exit code,
// so callers (shell scripts, CI) can detect the failure.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import puppeteer from 'puppeteer';
|
||||
import TurndownService from 'turndown';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
// --- Scrape settings --------------------------------------------------------
const BASE_URL = 'https://corvanis.wiki';
const PAGES_LIST_FILE = './tools/wikiPagesList.txt'; // one page path per line
const OUTPUT_FILE = './prism-rules/prism-wiki-complete.md';

// The combined Markdown file is written at the end of the run; create its
// parent directory up front when it is missing.
const outputDir = path.dirname(OUTPUT_FILE);
if (fs.existsSync(outputDir) === false) {
  fs.mkdirSync(outputDir, { recursive: true });
}

// HTML -> Markdown converter: "#"-style headings and fenced code blocks.
const turndownService = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
||||
|
||||
/**
 * Sleep helper: resolves after the requested delay.
 * @param {number} ms Milliseconds to wait
 * @returns {Promise<void>} Promise that resolves once the timer has fired
 */
function wait(ms) {
  const startTimer = (resolve) => {
    setTimeout(resolve, ms);
  };
  return new Promise(startTimer);
}
|
||||
|
||||
/**
 * Build a heading title from a page path: strip the
 * "/Prism+(Testing)/Prism" prefix, turn "+" into spaces and trim.
 * Falls back to "Prism Home" when nothing is left (the section root).
 * @param {string} path Page path
 * @returns {string} Clean title
 */
function getPageTitle(path) {
  const relativePart = path.replace(/^\/Prism\+\(Testing\)\/Prism\/?/, '');
  const title = relativePart.split('+').join(' ').trim();

  if (title) {
    return title;
  }
  return 'Prism Home';
}
|
||||
|
||||
/**
 * Load one wiki page in the browser and convert its main content to Markdown.
 *
 * Every failure (navigation error, HTTP error status, missing content) is
 * logged and reported as an empty string instead of being thrown.
 *
 * @param {object} page Puppeteer page object
 * @param {string} pagePath Page path to scrape (appended to BASE_URL)
 * @returns {Promise<string>} Markdown content, or '' when the page could not be scraped
 */
async function scrapePage(page, pagePath) {
  const url = BASE_URL + pagePath;
  console.log(`📄 Scraping: ${pagePath}`);

  try {
    // Navigate to the page
    const response = await page.goto(url, {
      waitUntil: 'networkidle0',
      timeout: 30000,
    });

    // Do not convert an HTTP error page into wiki content. goto() may resolve
    // with null instead of a response, in which case there is no status to check.
    const status = response ? response.status() : null;
    if (status !== null && status >= 400) {
      console.log(` ⚠️ Erreur HTTP ${status} pour ${pagePath}`);
      return '';
    }

    // Wait for the main content to load
    await page.waitForSelector('.published-container', { timeout: 10000 });

    // Wait a bit more for dynamic content
    await wait(2000);

    // Extract the main content (this callback runs inside the page)
    const content = await page.evaluate(() => {
      const container = document.querySelector('.published-container');
      if (!container) {
        return null;
      }

      // Remove navigation elements (this edits the live page, not a copy)
      const nav = container.querySelector('.site-body-left-column');
      if (nav) {
        nav.remove();
      }

      // Prefer the center column; fall back to the whole container
      const mainContent = container.querySelector('.site-body-center-column');
      return mainContent ? mainContent.innerHTML : container.innerHTML;
    });

    if (!content) {
      console.log(` ⚠️ Pas de contenu trouvé pour ${pagePath}`);
      return '';
    }

    // Convert HTML to Markdown
    const markdown = turndownService.turndown(content);
    console.log(` ✓ Converti (${markdown.length} caractères)`);

    return markdown;
  } catch (error) {
    console.error(` ❌ Erreur: ${error.message}`);
    return '';
  }
}
|
||||
|
||||
/**
 * Scrape every page named in the list file and write them into one Markdown
 * document.
 *
 * Exits the process with status 1 when the list file does not exist. A page
 * only gets a section (heading, content, separator) when scraping produced
 * content. The browser is always closed, even when the loop throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du scraping du wiki Prism...\n');

  // Read the list of pages: one path per line, blank lines and "#" comments ignored
  let pages = [];
  if (fs.existsSync(PAGES_LIST_FILE)) {
    const content = fs.readFileSync(PAGES_LIST_FILE, 'utf-8');
    pages = content
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line && !line.startsWith('#'));
  } else {
    console.error(`❌ Fichier de liste introuvable: ${PAGES_LIST_FILE}`);
    process.exit(1);
  }

  console.log(`📋 ${pages.length} page(s) à scraper\n`);

  // Launch browser
  console.log('🌐 Lancement du navigateur...\n');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  let allMarkdown = '# Prism RPG - Wiki Complete\n\n';
  allMarkdown += `*Scraped from ${BASE_URL} on ${new Date().toLocaleDateString('fr-FR')}*\n\n`;
  allMarkdown += '---\n\n';

  let scrapedCount = 0;

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1280, height: 800 });

    // Scrape all pages
    for (const pagePath of pages) {
      const markdown = await scrapePage(page, pagePath);

      // Emit the heading together with the content, so a page that failed to
      // scrape does not leave an empty heading in the document.
      if (markdown) {
        const title = getPageTitle(pagePath);
        // The section root gets a level-2 heading, every other page level 3
        const level = pagePath === '/Prism+(Testing)/Prism' ? '##' : '###';

        allMarkdown += `${level} ${title}\n\n`;
        allMarkdown += `${markdown}\n\n`;
        allMarkdown += '---\n\n';
        scrapedCount += 1;
      }

      // Wait between requests
      await wait(1000);
    }
  } finally {
    // Do not leave a headless browser process behind when scraping fails
    await browser.close();
  }

  // Save the combined Markdown file
  fs.writeFileSync(OUTPUT_FILE, allMarkdown, 'utf-8');

  console.log(`\n✅ Scraping terminé!`);
  // Report successes out of the total rather than counting failed pages as scraped
  console.log(` Pages scrapées: ${scrapedCount}/${pages.length}`);
  console.log(` Fichier généré: ${OUTPUT_FILE}`);
  console.log(` Taille: ${(allMarkdown.length / 1024).toFixed(2)} KB`);
}
|
||||
|
||||
// Entry point: run the scrape; report any unhandled failure and exit non-zero.
main().catch((fatalError) => {
  console.error('❌ Erreur fatale:', fatalError);
  process.exit(1);
});
|
||||
@@ -0,0 +1,12 @@
|
||||
/Prism+(Testing)/Prism
|
||||
/Prism+(Testing)/Prism/Rules
|
||||
/Prism+(Testing)/Prism/Character+Creation
|
||||
/Prism+(Testing)/Prism/Combat
|
||||
/Prism+(Testing)/Prism/Equipment
|
||||
/Prism+(Testing)/Prism/Armor
|
||||
/Prism+(Testing)/Prism/Weapons
|
||||
/Prism+(Testing)/Prism/Magic
|
||||
/Prism+(Testing)/Prism/Spells
|
||||
/Prism+(Testing)/Prism/Miracles
|
||||
/Prism+(Testing)/Prism/Monsters
|
||||
/Prism+(Testing)/Prism/Bestiary
|
||||
Reference in New Issue
Block a user