Skip to content

Instantly share code, notes, and snippets.

@dmitry
Created February 13, 2025 16:17
Show Gist options
  • Save dmitry/c63f7c62883ebc824b6eb19f5265cb93 to your computer and use it in GitHub Desktop.
Save dmitry/c63f7c62883ebc824b6eb19f5265cb93 to your computer and use it in GitHub Desktop.
Convert Telegram HTML export files and combine them into a single JSON file.
const fs = require('fs');
const path = require('path');
const { JSDOM } = require('jsdom');
/**
 * Converts the HTML of one exported chat page into an array of plain
 * message objects.
 *
 * One object is produced per element matching `.message`, with these fields:
 *   - `id`        the element's `id`
 *   - `type`      'service' when the element has the `service` class,
 *                 otherwise 'message'
 *   - `from`      trimmed text of `.from_name` (only when present)
 *   - `timestamp` `title` attribute of `.date` (only when `.date` is present)
 *   - `time`      trimmed text of `.date` (only when `.date` is present)
 *   - `text`      trimmed text of `.text` (only when present)
 *   - `reply_to`  what follows `#go_to_message` in the reply link's href
 *                 (only when a `.reply_to a` with an href is present)
 *   - `media`     `{ type: 'photo', src }` for a `.photo` inside `.media_wrap`
 *
 * @param {string} htmlContent - Raw HTML markup of a single page.
 * @returns {Array<Object>} Messages in document order.
 */
function htmlToJson(htmlContent) {
  const REPLY_MARKER = '#go_to_message';

  const dom = new JSDOM(htmlContent);
  const document = dom.window.document;

  const messages = [];

  for (const messageEl of document.querySelectorAll('.message')) {
    const message = {
      id: messageEl.id,
      type: messageEl.classList.contains('service') ? 'service' : 'message',
    };

    // Sender name, when the element carries one.
    const fromNameEl = messageEl.querySelector('.from_name');
    if (fromNameEl) {
      message.from = fromNameEl.textContent.trim();
    }

    // Full timestamp lives in the title attribute; the visible text is the short time.
    const dateEl = messageEl.querySelector('.date');
    if (dateEl) {
      message.timestamp = dateEl.getAttribute('title');
      message.time = dateEl.textContent.trim();
    }

    const textEl = messageEl.querySelector('.text');
    if (textEl) {
      message.text = textEl.textContent.trim();
    }

    // Reply target. The href may be missing (getAttribute returns null), and
    // it may carry a prefix before the fragment (e.g. another page's file
    // name), so take whatever follows the marker instead of blindly stripping
    // the marker out of the middle of the string.
    const replyEl = messageEl.querySelector('.reply_to');
    const replyLink = replyEl ? replyEl.querySelector('a') : null;
    const href = replyLink ? replyLink.getAttribute('href') : null;
    if (href !== null && href !== undefined) {
      const markerAt = href.indexOf(REPLY_MARKER);
      message.reply_to =
        markerAt === -1 ? href : href.slice(markerAt + REPLY_MARKER.length);
    }

    // Only photos are recognised as media.
    const mediaEl = messageEl.querySelector('.media_wrap');
    const photoEl = mediaEl ? mediaEl.querySelector('.photo') : null;
    if (photoEl) {
      message.media = {
        type: 'photo',
        src: photoEl.getAttribute('src'),
      };
    }

    messages.push(message);
  }

  return messages;
}
/**
 * Derives a numeric sort key from a message's `id` by keeping only its digits.
 * Ids that contain no digits sort after every numbered message.
 *
 * @param {{ id?: string }} message
 * @returns {number}
 */
function messageSortKey(message) {
  const rawId = message.id == null ? '' : String(message.id);
  const value = Number.parseInt(rawId.replace(/\D/g, ''), 10);
  // parseInt('') is NaN; a NaN comparator result makes the sort order
  // unspecified, so map it to a fixed "last" position instead.
  return Number.isNaN(value) ? Number.MAX_SAFE_INTEGER : value;
}

/**
 * Converts every `.html` file in the current working directory with
 * `htmlToJson`, merges the results, sorts them by the numeric part of the
 * message id, and writes them to `messages.json` (pretty-printed, 2 spaces).
 *
 * Errors are logged rather than thrown; in addition the process exit code is
 * set to 1 so that a failed conversion is visible to the calling shell.
 *
 * @returns {Promise<void>} Resolves once the output file has been written
 *   (or the error has been reported). All work is done synchronously; the
 *   function stays `async` so existing callers keep receiving a promise.
 */
async function processHtmlFiles() {
  try {
    // Read all HTML files in the current directory.
    const files = fs.readdirSync('.').filter((file) => file.endsWith('.html'));

    // Append in place rather than re-copying the accumulated array per file.
    const allMessages = [];
    for (const file of files) {
      console.log(`Processing ${file}...`);
      const htmlContent = fs.readFileSync(file, 'utf8');
      for (const message of htmlToJson(htmlContent)) {
        allMessages.push(message);
      }
    }

    // Array.prototype.sort is stable, so messages with equal keys keep
    // the order in which they were read.
    allMessages.sort((a, b) => messageSortKey(a) - messageSortKey(b));

    // Write merged JSON to file.
    const outputFile = 'messages.json';
    fs.writeFileSync(outputFile, JSON.stringify(allMessages, null, 2));
    console.log(`Conversion complete! Output saved to ${outputFile}`);
  } catch (error) {
    console.error('Error processing files:', error);
    // Signal failure to the caller's shell without aborting abruptly.
    process.exitCode = 1;
  }
}
// Entry point: start the conversion. The returned promise is intentionally
// not awaited, because processHtmlFiles catches and reports its own errors.
void processHtmlFiles();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment