Created
February 13, 2025 16:17
-
-
Save dmitry/c63f7c62883ebc824b6eb19f5265cb93 to your computer and use it in GitHub Desktop.
Convert Telegram HTML files and combine them into one JSON file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const path = require('path'); | |
const { JSDOM } = require('jsdom'); | |
// Function to convert HTML content to JSON

/**
 * Extracts the target message id from a reply link's href.
 *
 * Everything up to and including the `#go_to_message` marker is dropped, so
 * `#go_to_message42` and `messages2.html#go_to_message42` both yield `42`.
 * An href that does not contain the marker is returned unchanged.
 *
 * @param {string} href - Raw `href` attribute of the reply link.
 * @returns {string} The text following the marker, or `href` itself.
 */
function replyTargetFromHref(href) {
  const marker = '#go_to_message';
  const markerStart = href.indexOf(marker);
  if (markerStart === -1) {
    return href;
  }
  // Slicing (rather than deleting the marker) keeps a leading prefix such as
  // a file name from ending up glued to the id.
  return href.slice(markerStart + marker.length);
}

/**
 * Parses one HTML page and returns every `.message` element as a plain object.
 *
 * Each object always has `id` (the element's id attribute) and `type`
 * (`'service'` when the element has the `service` class, else `'message'`).
 * The remaining fields are only present when the matching child element exists:
 *   - `from`: trimmed text of `.from_name`
 *   - `timestamp` / `time`: `title` attribute and trimmed text of `.date`
 *   - `text`: trimmed text of `.text`
 *   - `reply_to`: id taken from the href of the first link inside `.reply_to`
 *   - `media`: `{ type: 'photo', src }` for a `.photo` inside `.media_wrap`
 *
 * @param {string} htmlContent - HTML markup of a single page.
 * @returns {Array<Object>} Messages in document order.
 */
function htmlToJson(htmlContent) {
  const { document } = new JSDOM(htmlContent).window;

  return Array.from(document.querySelectorAll('.message'), (messageEl) => {
    const message = {
      id: messageEl.id,
      type: messageEl.classList.contains('service') ? 'service' : 'message',
    };

    // Sender name, when the element carries one.
    const fromNameEl = messageEl.querySelector('.from_name');
    if (fromNameEl) {
      message.from = fromNameEl.textContent.trim();
    }

    // The title attribute and the visible text of `.date` are kept separately.
    const dateEl = messageEl.querySelector('.date');
    if (dateEl) {
      message.timestamp = dateEl.getAttribute('title');
      message.time = dateEl.textContent.trim();
    }

    const textEl = messageEl.querySelector('.text');
    if (textEl) {
      message.text = textEl.textContent.trim();
    }

    // A reply link without an href is skipped instead of throwing, so one
    // malformed message cannot abort the conversion of the whole page.
    const replyEl = messageEl.querySelector('.reply_to');
    const replyLink = replyEl ? replyEl.querySelector('a') : null;
    if (replyLink) {
      const href = replyLink.getAttribute('href');
      if (href !== null) {
        message.reply_to = replyTargetFromHref(href);
      }
    }

    // Only photos are extracted; other media inside `.media_wrap` is ignored.
    const mediaEl = messageEl.querySelector('.media_wrap');
    const photoEl = mediaEl ? mediaEl.querySelector('.photo') : null;
    if (photoEl) {
      message.media = {
        type: 'photo',
        src: photoEl.getAttribute('src'),
      };
    }

    return message;
  });
}
// Function to process all HTML files in current directory
/**
 * Converts every `.html` file in the current working directory with
 * `htmlToJson`, merges the results, sorts them by the digits found in each
 * message id, and writes the merged list to `messages.json`.
 *
 * Any error is logged with `console.error` and not rethrown, so the returned
 * promise still resolves.
 *
 * @returns {Promise<void>} Resolves once the output has been written (or the
 *   failure has been logged). All file access is synchronous.
 */
async function processHtmlFiles() {
  // Sort key: the digits of the id read as a base-10 integer. Ids without any
  // digit map to +Infinity so they sort last instead of yielding NaN, which
  // would make the comparator inconsistent.
  const numericId = (message) => {
    const digits = String(message.id).replace(/\D/g, '');
    return digits === '' ? Number.POSITIVE_INFINITY : Number.parseInt(digits, 10);
  };

  try {
    // Read all files in current directory
    const files = fs.readdirSync('.').filter((file) => file.endsWith('.html'));
    const allMessages = [];

    // Process each HTML file
    for (const file of files) {
      console.log(`Processing ${file}...`);
      const htmlContent = fs.readFileSync(file, 'utf8');
      // Append in place; re-concatenating would copy the whole list per file.
      for (const message of htmlToJson(htmlContent)) {
        allMessages.push(message);
      }
    }

    // Sort messages by numeric ID; equal keys keep their read order.
    allMessages.sort((a, b) => {
      const idA = numericId(a);
      const idB = numericId(b);
      if (idA === idB) {
        // Also covers two missing ids, where Infinity - Infinity would be NaN.
        return 0;
      }
      return idA < idB ? -1 : 1;
    });

    // Write merged JSON to file
    const outputFile = 'messages.json';
    fs.writeFileSync(outputFile, JSON.stringify(allMessages, null, 2));
    console.log(`Conversion complete! Output saved to ${outputFile}`);
  } catch (error) {
    console.error('Error processing files:', error);
  }
}
// Run the processor. processHtmlFiles() logs the errors it catches itself;
// this handler is a last resort so the returned promise is never left
// floating and an unexpected rejection results in a non-zero exit code.
processHtmlFiles().catch((error) => {
  console.error('Unexpected failure:', error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment