First adaptation pass
This commit is contained in:
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import https from 'https';
|
||||
import http from 'http';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { URL } from 'url';
|
||||
|
||||
// Origin (scheme + host) that every crawled page path is appended to.
const BASE_URL = 'https://corvanis.wiki';

// Path of the first page placed in the crawl queue.
const START_PAGE = '/Prism+(Testing)/Prism';

// Directory that receives the saved HTML files and the list of visited pages.
const OUTPUT_DIR = './wiki-downloads';
|
||||
|
||||
// Known Prism page suffixes; the list is meant to grow as more pages are found.
// NOTE(review): nothing in the code visible here reads this constant; the
// crawl discovers pages by following links instead. Confirm whether it is
// still needed.
const KNOWN_SECTIONS = [
  '',
  '/Rules',
  '/Character+Creation',
  '/Combat',
  '/Equipment',
  '/Magic',
  '/Spells',
  '/Miracles',
  '/Monsters',
  '/Bestiary',
];
|
||||
|
||||
// Paths that have already been handed to downloadPage (successful or not),
// used to make sure each page is requested at most once.
const visitedPages = new Set();

// FIFO queue of paths still to fetch, seeded with the start page.
const pagesToVisit = [START_PAGE];
|
||||
|
||||
// Create the output directory up front. With `recursive: true` the call is a
// no-op when the directory already exists, so no prior existence check is needed.
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
|
||||
/**
 * Make an HTTP(S) GET request and buffer the whole response body as text.
 *
 * The transport (http or https) is chosen from the URL's protocol. Every
 * status code resolves the promise; callers are expected to inspect
 * `statusCode` themselves.
 *
 * @param {string} url The absolute URL to request
 * @returns {Promise<{data: string, statusCode: number}>} The response body
 *   decoded as UTF-8 and the HTTP status code. The promise rejects when the
 *   URL cannot be parsed or when the request or response stream errors.
 */
function makeRequest(url) {
  return new Promise((resolve, reject) => {
    // A malformed URL throws here, which rejects the promise.
    const urlObj = new URL(url);
    const protocol = urlObj.protocol === 'https:' ? https : http;

    protocol
      .get(url, (res) => {
        // Decode through the stream's own decoder so that a multi-byte UTF-8
        // character split across two chunks is not turned into U+FFFD.
        res.setEncoding('utf8');

        let data = '';

        res.on('data', (chunk) => {
          data += chunk;
        });

        // Without this, a response that breaks mid-body would leave the
        // promise pending forever.
        res.on('error', reject);

        res.on('end', () => {
          resolve({ data, statusCode: res.statusCode });
        });
      })
      .on('error', reject);
  });
}
|
||||
|
||||
/**
 * Extract the in-scope wiki links from a page's HTML.
 *
 * Only `href` values containing `Prism+(Testing)/Prism` are considered.
 * Fragments and query strings are stripped. Absolute (`http...`) and
 * protocol-relative (`//host/...`) URLs are reduced to their pathname; the
 * host itself is not compared against anything. Relative hrefs that do not
 * start with `/` are skipped.
 *
 * @param {string} html The HTML content to parse
 * @param {string} currentPath The current page path (currently unused; kept
 *   so existing callers do not need to change)
 * @returns {string[]} Unique site-absolute paths, in order of first appearance
 */
function extractLinks(html, currentPath) {
  const SECTION_MARKER = 'Prism+(Testing)/Prism';
  const hrefPattern = /href=["']([^"']*Prism\+\(Testing\)\/Prism[^"']*)["']/gi;

  // Returns the pathname of an absolute URL, or null when it cannot be parsed.
  const pathnameOf = (absoluteUrl) => {
    try {
      return new URL(absoluteUrl).pathname;
    } catch {
      return null;
    }
  };

  const links = new Set();

  for (const [, href] of String(html).matchAll(hrefPattern)) {
    // Drop the fragment first, then the query string.
    let link = href.split('#')[0].split('?')[0];

    if (link.startsWith('//')) {
      // Protocol-relative URL: it starts with '/' but names a host, so it
      // must be parsed as a URL rather than used as a site path.
      link = pathnameOf(`https:${link}`);
    } else if (!link.startsWith('/')) {
      // Anything else that is not site-absolute must be a full URL to be kept.
      link = link.startsWith('http') ? pathnameOf(link) : null;
    }

    if (link !== null && link.includes(SECTION_MARKER)) {
      links.add(link);
    }
  }

  return Array.from(links);
}
|
||||
|
||||
/**
 * Turn an arbitrary string (here: a page path) into a flat filename stem.
 *
 * Every character other than ASCII letters, digits, `_`, `-`, `+`, `(` and
 * `)` becomes `_`; runs of `_` are then collapsed to a single one and a
 * leading or trailing `_` is removed. Distinct inputs can map to the same
 * result (for example `/a/b` and `/a_b`).
 *
 * @param {string} str The string to sanitize
 * @returns {string} Sanitized filename (may be empty)
 */
function sanitizeFilename(str) {
  return str
    .replace(/[^a-z0-9_\-+()]/gi, '_') // replace disallowed characters
    .replace(/_+/g, '_') // collapse consecutive underscores
    .replace(/^_|_$/g, ''); // trim an underscore at either end
}
|
||||
|
||||
/**
 * Download a page, save it to the output directory and queue the in-scope
 * links it contains.
 *
 * A path that was already visited is ignored. Otherwise the path is marked
 * visited before the request is made, so a failed page is not retried.
 * Non-200 responses and thrown errors are logged and do not propagate.
 *
 * @param {string} pagePath The path of the page to download
 * @returns {Promise<void>}
 */
async function downloadPage(pagePath) {
  if (visitedPages.has(pagePath)) {
    return;
  }

  visitedPages.add(pagePath);
  console.log(`Téléchargement: ${pagePath}`);

  try {
    const url = BASE_URL + pagePath;
    const { data, statusCode } = await makeRequest(url);

    if (statusCode !== 200) {
      console.error(` ❌ Erreur ${statusCode} pour ${pagePath}`);
      return;
    }

    // Save the HTML file
    const filename = `${sanitizeFilename(pagePath)}.html`;
    const filepath = path.join(OUTPUT_DIR, filename);
    fs.writeFileSync(filepath, data, 'utf-8');
    console.log(` ✓ Sauvegardé: ${filename}`);

    // Queue newly discovered links, skipping those already visited or
    // already waiting in the queue.
    for (const link of extractLinks(data, pagePath)) {
      if (!visitedPages.has(link) && !pagesToVisit.includes(link)) {
        pagesToVisit.push(link);
      }
    }
  } catch (error) {
    console.error(` ❌ Erreur lors du téléchargement de ${pagePath}:`, error.message);
  } finally {
    // Pause after every attempt, including failed ones, so a run of broken
    // pages does not hit the server back-to-back.
    await new Promise((resolve) => {
      setTimeout(resolve, 500);
    });
  }
}
|
||||
|
||||
/**
 * Main function: drain the crawl queue, then write a summary.
 *
 * Pages are processed one at a time in FIFO order; downloadPage may append
 * further pages to the queue while it is being drained. When the queue is
 * empty, the sorted list of visited paths is written to `_pages-list.txt`
 * in the output directory.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Début du téléchargement des pages wiki...');
  console.log(` Base URL: ${BASE_URL}`);
  console.log(` Page de départ: ${START_PAGE}`);
  console.log(` Répertoire de sortie: ${OUTPUT_DIR}\n`);

  while (pagesToVisit.length > 0) {
    const page = pagesToVisit.shift();
    await downloadPage(page);
  }

  console.log(`\n✅ Téléchargement terminé!`);
  // visitedPages holds every path that was attempted, including failures.
  console.log(` Pages téléchargées: ${visitedPages.size}`);
  console.log(` Fichiers sauvegardés dans: ${OUTPUT_DIR}`);

  // Save a list of downloaded pages
  const listPath = path.join(OUTPUT_DIR, '_pages-list.txt');
  fs.writeFileSync(listPath, Array.from(visitedPages).sort().join('\n'));
  console.log(` Liste des pages: ${listPath}`);
}
|
||||
|
||||
// Run the crawler. An unhandled failure is logged and reported through a
// non-zero exit code so shells and CI can detect it.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user