Last active
May 17, 2025 03:49
-
-
Save weskerty/ea2c9ff1f918991eb16ff7c19610bda5 to your computer and use it in GitHub Desktop.
WhatsApp web-page browser/downloader plugin. Requires the Linux plugin in order to install Playwright: .linux npm install playwright@latest --no-save --force
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs').promises; | |
const path = require('path'); | |
const { chromium } = require('playwright'); | |
const { bot, isUrl } = require('../lib'); | |
require('dotenv').config(); | |
/**
 * Loads a web page in headless Chromium (via Playwright) and sends the result
 * back through the bot `message` object as an MHTML snapshot, a PDF, or the
 * page's larger images.
 *
 * A single browser (or persistent context, when `config.userDataDir` is set)
 * is shared by all in-flight requests and closed once `activeRequests` drops
 * back to zero.
 */
class WebDownloader {
  /**
   * Reads the configuration from environment variables, falling back to the
   * defaults below when a variable is unset (or, for numbers, parses to 0/NaN).
   */
  constructor() {
    this.config = {
      tempDir: process.env.TEMP_DOWNLOAD_DIR || path.join(process.cwd(), 'tmp'),
      // Compared against naturalWidth * naturalHeight, i.e. a pixel count.
      maxImageSize: parseInt(process.env.MAX_IMAGE_SIZE, 10) || 11240,
      maxImages: parseInt(process.env.MAX_IMAGES, 10) || 10,
      browserTimeout: parseInt(process.env.BROWSER_TIMEOUT, 10) || 99000,
      browserWaitTime: parseInt(process.env.BROWSER_WAIT_TIME, 10) || 9000,
      userAgent: process.env.USER_AGENT || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      chromiumPath: process.env.CHROMIUM_PATH || '',
      scrollPages: parseInt(process.env.SCROLL_PAGES, 10) || 5,
      userDataDir: process.env.USER_DATA_DIR || '/root/.config/chromium/Bot'
    };
    this.browserInstance = null;
    this.persistentContext = null;
    // In-flight launch / close operations, so concurrent callers can wait on
    // them instead of starting a second browser on the same profile.
    this.launchPromise = null;
    this.closePromise = null;
    this.activeRequests = 0;
    // Disambiguates session directories created within the same millisecond.
    this.sessionCounter = 0;
  }

  /**
   * Builds a timestamp-based file name that keeps only the extension of the
   * supplied name.
   *
   * @param {string} originalName - Name whose extension should be preserved.
   * @returns {string} `web_<timestamp><ext>`.
   */
  generateSafeFileName(originalName) {
    const ext = path.extname(originalName);
    const timestamp = Date.now();
    return `web_${timestamp}${ext}`;
  }

  /**
   * Creates (recursively) the working directory for one request.
   *
   * @param {string} sessionId - Sub-directory name under `config.tempDir`.
   * @returns {Promise<string>} Absolute or cwd-relative path of the directory.
   */
  async ensureDirectories(sessionId) {
    const sessionDir = path.join(this.config.tempDir, sessionId);
    await fs.mkdir(sessionDir, { recursive: true });
    return sessionDir;
  }

  /**
   * Returns the shared browser handle, launching it on first use.
   *
   * Concurrent callers share one launch, and an already-open persistent
   * context is reused rather than relaunched on the same profile directory.
   *
   * @returns {Promise<object>} The persistent context when `config.userDataDir`
   *   is set, otherwise the Playwright browser instance.
   * @throws {Error} `Browser initialization failed: ...` when the launch fails.
   */
  async initBrowser() {
    // Never launch while a previous instance is still shutting down.
    while (this.closePromise) {
      await this.closePromise;
    }

    const existing = this.persistentContext || this.browserInstance;
    if (existing) {
      return existing;
    }

    if (!this.launchPromise) {
      this.launchPromise = this.launchBrowser().finally(() => {
        this.launchPromise = null;
      });
    }
    return this.launchPromise;
  }

  /**
   * Launches Chromium and stores the resulting handle on the instance.
   * Helper for `initBrowser`; call that method instead.
   *
   * @returns {Promise<object>} Persistent context or browser instance.
   * @throws {Error} `Browser initialization failed: ...`
   */
  async launchBrowser() {
    // NOTE(review): --no-sandbox and --disable-web-security weaken browser
    // isolation while loading arbitrary user-supplied URLs; confirm this is an
    // accepted trade-off for the deployment environment.
    const launchOptions = {
      headless: true,
      args: [
        '--disable-blink-features=AutomationControlled',
        '--disable-web-security',
        '--disable-features=IsolateOrigins',
        '--disable-site-isolation-trials',
        '--disable-features=BlockInsecurePrivateNetworkRequests',
        '--proxy-server=\'direct://\'',
        '--proxy-bypass-list=*',
        '--headless',
        '--hide-scrollbars',
        '--mute-audio',
        '--disable-logging',
        '--disable-infobars',
        '--disable-breakpad',
        '--disable-gl-drawing-for-tests',
        '--disable-canvas-aa',
        '--disable-2d-canvas-clip-aa',
        '--no-sandbox',
        // Flags intended to force loading of lazily-loaded resources.
        '--blink-settings=imagesEnabled=true',
        '--run-all-compositor-stages-before-draw',
        '--disable-lazy-loading'
      ]
    };
    if (this.config.chromiumPath) {
      launchOptions.executablePath = this.config.chromiumPath;
    }

    try {
      if (this.config.userDataDir) {
        // Best effort: if the directory cannot be created the launch below
        // reports the real problem.
        await fs.mkdir(this.config.userDataDir, { recursive: true }).catch(() => {});
        // A persistent context manages the user data dir itself, so no
        // separate browser instance is created in this mode.
        this.persistentContext = await chromium.launchPersistentContext(
          this.config.userDataDir,
          {
            ...launchOptions,
            userAgent: this.config.userAgent,
            bypassCSP: true
          }
        );
        return this.persistentContext;
      }

      this.browserInstance = await chromium.launch(launchOptions);
      return this.browserInstance;
    } catch (error) {
      throw new Error(`Browser initialization failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Closes the shared browser/context when no request is in flight.
   * Close errors are ignored.
   *
   * @returns {Promise<void>}
   */
  async closeBrowserIfIdle() {
    if (this.activeRequests !== 0) {
      return;
    }
    const target = this.persistentContext || this.browserInstance;
    if (!target) {
      return;
    }
    // Detach the handles before awaiting so a request that starts while the
    // close is pending is never handed a half-closed browser.
    this.persistentContext = null;
    this.browserInstance = null;
    this.closePromise = target
      .close()
      .catch(() => {})
      .finally(() => {
        this.closePromise = null;
      });
    await this.closePromise;
  }

  /**
   * Sends a file from disk to the chat and always deletes it afterwards.
   *
   * @param {object} message - Bot message; `send` and `data` are used.
   * @param {string} filePath - File to read, send and then unlink.
   * @param {string} type - One of `pdf`, `mhtml`, `jpg`, `png`; selects the
   *   MIME type. `pdf`/`mhtml` are sent as documents, anything else as image.
   * @returns {Promise<void>}
   */
  async processDownloadedFile(message, filePath, type) {
    const safeFileName = this.generateSafeFileName(`web.${type}`);
    const mimeTypes = {
      'pdf': 'application/pdf',
      'mhtml': 'text/html',
      'jpg': 'image/jpeg',
      'png': 'image/png'
    };
    const sendAs = type === 'pdf' || type === 'mhtml' ? 'document' : 'image';

    try {
      const content = await fs.readFile(filePath);
      await message.send(
        content,
        {
          fileName: safeFileName,
          mimetype: mimeTypes[type],
          quoted: message.data
        },
        sendAs
      );
    } finally {
      await fs.unlink(filePath).catch(() => {});
    }
  }

  /**
   * Scrolls down one viewport height at a time to trigger lazy content, then
   * returns to the top. Errors are logged and swallowed so a scroll failure
   * does not abort the capture.
   *
   * @param {object} page - Playwright page.
   * @param {number} [scrollCount=5] - Number of viewport-sized scroll steps.
   * @returns {Promise<void>}
   */
  async scrollPageDown(page, scrollCount = 5) {
    try {
      const viewportHeight = await page.evaluate(() => window.innerHeight);

      for (let i = 0; i < scrollCount; i++) {
        await page.evaluate((height) => {
          window.scrollBy(0, height);
        }, viewportHeight);
        // Give newly revealed resources a moment to load.
        await page.waitForTimeout(1500);
        await this.forceLazyLoad(page);
      }

      // Back to the top so the capture starts from the beginning of the page.
      await page.evaluate(() => {
        window.scrollTo(0, 0);
      });
      // Let the page settle after the jump.
      await page.waitForTimeout(1000);
    } catch (error) {
      console.error('Error durante el scroll:', error);
    }
  }

  /**
   * Rewrites common lazy-loading attributes into real `src`/`srcset` values,
   * switches the images to eager loading and dispatches a `scroll` event for
   * event-driven lazy loaders.
   *
   * @param {object} page - Playwright page.
   * @returns {Promise<void>}
   */
  async forceLazyLoad(page) {
    await page.evaluate(() => {
      const lazyImages = document.querySelectorAll('img[loading="lazy"], img[data-src], img[data-srcset], img[data-original], img[data-lazy-src]');
      lazyImages.forEach(img => {
        // Later attributes win when several are present, as each overwrites src.
        if (img.getAttribute('data-src')) {
          img.setAttribute('src', img.getAttribute('data-src'));
        }
        if (img.getAttribute('data-srcset')) {
          img.setAttribute('srcset', img.getAttribute('data-srcset'));
        }
        if (img.getAttribute('data-original')) {
          img.setAttribute('src', img.getAttribute('data-original'));
        }
        if (img.getAttribute('data-lazy-src')) {
          img.setAttribute('src', img.getAttribute('data-lazy-src'));
        }
        img.loading = 'eager';
        // Marker attribute only; nothing in this class reads it back.
        if (img.dataset) {
          img.dataset.wasProcessed = 'true';
        }
      });
      window.dispatchEvent(new Event('scroll'));
    });
  }

  /**
   * Shared request lifecycle: counts the request, creates its working
   * directory, reports failures to the chat, then releases the browser (if
   * idle) and removes the directory.
   *
   * @param {object} message - Bot message used for error reporting.
   * @param {string} errorLabel - Text placed between `❌ ` and the error message.
   * @param {(outputDir: string) => Promise<void>} job - The actual work.
   * @returns {Promise<void>}
   */
  async runRequest(message, errorLabel, job) {
    this.activeRequests++;
    let outputDir = null;
    try {
      // Created inside the try so a mkdir failure still decrements the counter.
      const sessionId = `web_${Date.now()}_${this.sessionCounter++}`;
      outputDir = await this.ensureDirectories(sessionId);
      await job(outputDir);
    } catch (error) {
      await message.send(`❌ ${errorLabel}: ${error.message}`, { quoted: message.data });
    } finally {
      this.activeRequests--;
      await this.closeBrowserIfIdle();
      if (outputDir) {
        await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {});
      }
    }
  }

  /**
   * Opens a page, loads `url`, scrolls to trigger lazy content, runs `task`
   * and always closes the page (plus its context when that context was
   * created only for this page).
   *
   * @param {string} url - Address to load.
   * @param {(page: object) => Promise<*>} task - Work to run on the loaded page.
   * @returns {Promise<*>} Whatever `task` resolves to.
   */
  async withPreparedPage(url, task) {
    const browserOrContext = await this.initBrowser();
    const sharedContext = this.persistentContext;
    // Without a persistent context each request gets its own throwaway context.
    const ownedContext = sharedContext
      ? null
      : await browserOrContext.newContext({
        userAgent: this.config.userAgent,
        bypassCSP: true
      });

    let page = null;
    try {
      page = await (sharedContext || ownedContext).newPage();
      page.setDefaultTimeout(this.config.browserTimeout);

      try {
        // 'load' rather than 'networkidle': pages with endless background
        // traffic would otherwise always hit the timeout.
        await page.goto(url, {
          timeout: this.config.browserTimeout,
          waitUntil: 'load'
        });
      } catch (navigationError) {
        // Continue anyway: the page may be partially loaded and still usable.
        console.log(`Advertencia de navegación: ${navigationError.message}`);
      }

      // Fixed grace period for additional resources.
      await page.waitForTimeout(5000);
      await this.scrollPageDown(page, this.config.scrollPages);
      await this.forceLazyLoad(page);
      await page.waitForTimeout(this.config.browserWaitTime);

      return await task(page);
    } finally {
      if (page) {
        await page.close().catch(() => {});
      }
      if (ownedContext) {
        await ownedContext.close().catch(() => {});
      }
    }
  }

  /**
   * Picks the image URLs worth downloading: both dimensions above 100px,
   * pixel count above `config.maxImageSize`, duplicates removed, capped at
   * `config.maxImages`.
   *
   * @param {Array<{src: string, width: number, height: number, size: number}>} images
   * @returns {string[]} Unique image URLs in page order.
   */
  selectImageUrls(images) {
    const largeEnough = images
      .filter(img => img.width > 100 && img.height > 100 && img.size > this.config.maxImageSize)
      .map(img => img.src);
    return [...new Set(largeEnough)].slice(0, this.config.maxImages);
  }

  /**
   * Captures the page as an MHTML snapshot (CDP `Page.captureSnapshot`) and
   * sends it as a document. Failures are reported to the chat, not thrown.
   *
   * @param {object} message - Bot message to reply through.
   * @param {string} url - Page to capture.
   * @returns {Promise<void>}
   */
  async downloadAsMHTML(message, url) {
    await this.runRequest(message, 'Error al capturar MHTML', (outputDir) =>
      this.withPreparedPage(url, async (page) => {
        const client = await page.context().newCDPSession(page);
        const snapshot = await client.send('Page.captureSnapshot', { format: 'mhtml' });
        const filePath = path.join(outputDir, this.generateSafeFileName('web.mhtml'));
        await fs.writeFile(filePath, snapshot.data);
        await this.processDownloadedFile(message, filePath, 'mhtml');
      })
    );
  }

  /**
   * Renders the page to an A4 PDF (with backgrounds) and sends it as a
   * document. Failures are reported to the chat, not thrown.
   *
   * @param {object} message - Bot message to reply through.
   * @param {string} url - Page to render.
   * @returns {Promise<void>}
   */
  async downloadAsPDF(message, url) {
    await this.runRequest(message, 'Error al generar PDF', (outputDir) =>
      this.withPreparedPage(url, async (page) => {
        const filePath = path.join(outputDir, this.generateSafeFileName('web.pdf'));
        await page.pdf({
          path: filePath,
          format: 'A4',
          printBackground: true
        });
        await this.processDownloadedFile(message, filePath, 'pdf');
      })
    );
  }

  /**
   * Downloads the page's larger images and sends each one as an image.
   * A failure on a single image is logged and skipped; other failures are
   * reported to the chat, not thrown.
   *
   * @param {object} message - Bot message to reply through.
   * @param {string} url - Page whose images should be collected.
   * @returns {Promise<void>}
   */
  async downloadImages(message, url) {
    await this.runRequest(message, 'Error al descargar imágenes', (outputDir) =>
      this.withPreparedPage(url, async (page) => {
        const images = await page.evaluate(() => {
          const imgElements = Array.from(document.images);
          return imgElements.map(img => ({
            src: img.src,
            size: img.naturalWidth * img.naturalHeight,
            width: img.naturalWidth,
            height: img.naturalHeight
          })).filter(img => img.src && img.src.startsWith('http'));
        });

        const validImages = this.selectImageUrls(images);
        if (validImages.length === 0) {
          await message.send('❌ No se encontraron imágenes mayores a 10KB', { quoted: message.data });
          return;
        }

        // Loaded lazily so the dependency is only needed when images exist.
        const fetch = (await import('node-fetch')).default;
        for (const imageUrl of validImages) {
          try {
            const response = await fetch(imageUrl);
            if (!response.ok) continue;
            const buffer = await response.buffer();
            // NOTE(review): every image is stored and sent as JPEG regardless
            // of the response content type.
            const filePath = path.join(outputDir, this.generateSafeFileName('image.jpg'));
            await fs.writeFile(filePath, buffer);
            await this.processDownloadedFile(message, filePath, 'jpg');
          } catch (error) {
            console.error(`Error downloading image ${imageUrl}:`, error);
          }
        }
      })
    );
  }
}
// Single shared downloader used by the command handler and exported below.
const webDownloader = new WebDownloader();

/**
 * `web [img|pdf] <url>` command.
 *
 * The first whitespace-separated token selects the output: `img` sends the
 * page's images, `pdf` sends a PDF, anything else produces an MHTML snapshot.
 * The URL is the first token accepted by `isUrl`.
 */
bot(
  {
    pattern: 'web ?(.*)',
    fromMe: true,
    desc: 'Descarga páginas web en formatos MHTML, PDF o imágenes.',
    type: 'downloads',
  },
  async (message, match) => {
    // Normalise first: a missing match would throw on .split, and leading
    // whitespace would make the first token empty, silently ignoring img/pdf.
    const args = String(match || '').trim().split(/\s+/);
    const url = args.find((arg) => isUrl(arg));
    const command = args[0].toLowerCase();

    if (!url) {
      await message.send('❌ Proporciona una URL válida', { quoted: message.data });
      return;
    }

    await message.send('⏳ Procesando página web...\nEsto puede tomar un momento...', { quoted: message.data });

    try {
      if (command === 'img') {
        await webDownloader.downloadImages(message, url);
      } else if (command === 'pdf') {
        await webDownloader.downloadAsPDF(message, url);
      } else {
        await webDownloader.downloadAsMHTML(message, url);
      }
    } catch (error) {
      await message.send(`❌ Error: ${error.message}`, { quoted: message.data });
    }
  }
);

module.exports = { webDownloader };
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment