thiagosanches · February 5, 2023 12:35
diff --git a/createMarkdownFromTelegramExportedChat.js b/createMarkdownFromTelegramExportedChat.js
 const fs = require('fs')
 const axios = require('axios')
 const myData = require('./result.json');
 const regexTitleTag = /<title>(.*)<\/title>/g;
 const regexRedditPost = /<meta property="og:title" content\=\"(.*?")/gm;

 /**
  * Downloads the page at `url` and returns its body as text.
  *
  * The request is sent with a Firefox-on-Linux User-Agent header. This is a
  * best-effort helper: a failed request is logged and reported as `null`
  * instead of being thrown, so the caller can simply skip that URL.
  *
  * @param {string} url - Address to fetch.
  * @returns {Promise<string|null>} The response body when it is a non-empty
  *   string; `null` when the request fails or the body is empty or not text.
  */
 async function getContentFromURL(url) {
    try {
        const result = await axios.get(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'
            }
        });

        // axios can hand back an already-parsed object (e.g. for a JSON
        // response). The caller runs a regex over the result, so only a
        // real, non-empty string is worth returning.
        const body = result?.data;
        return typeof body === 'string' && body.length > 0 ? body : null;
    }
    catch (e) {
        // Keep the "never throw" contract, but include the reason so a
        // failed fetch can actually be diagnosed from the log.
        console.log("I was not able to fetch: ", url, e?.message ?? e);
        return null;
    }
 }

 // Entry point: walks every text entity of every message in myData.messages,
 // fetches each https link once, extracts the page title and rewrites
 // markdown.md with the bullets collected so far.
 //
 // The handler is attached with .catch() on purpose: a synchronous try/catch
 // wrapped around an async IIFE never sees errors thrown inside it, so the
 // error report and the non-zero exit code would otherwise never happen.
 (async () => {
    let markdown = "";
    // Trimmed URLs that were already fetched, so no URL is processed twice.
    const processedUrls = new Set();
    // For now, block some URLs, due to limiting calls from their servers.
    const blockedPrefixes = [
        "https://news.ycombinator.com",
        "https://api.whatsapp.com",
    ];

    for (const message of myData.messages) {
        if (!message.text_entities) continue;

        for (const entity of message.text_entities) {
            let rawUrl = null;
            if (entity.type === 'link') rawUrl = entity.text;
            if (entity.type === 'text_link') rawUrl = entity.href;

            // Normalise once so the duplicate check, the prefix filters, the
            // fetch and the generated markdown all use the same string.
            const url = typeof rawUrl === 'string' ? rawUrl.trim() : null;
            if (!url) continue;

            if (processedUrls.has(url)) {
                console.log("Already processed url: ", url);
                continue;
            }

            if (blockedPrefixes.some((prefix) => url.startsWith(prefix))) continue;
            if (!url.startsWith('https://')) continue;

            // Reddit posts are matched on their og:title meta tag instead of <title>.
            const currentRegex = url.startsWith("https://www.reddit.com")
                ? regexRedditPost
                : regexTitleTag;

            processedUrls.add(url);
            console.log("Fetching: ", url);
            const html = await getContentFromURL(url);
            if (!html) continue;

            // The shared patterns carry the /g flag, which makes exec()
            // stateful through lastIndex; a fresh copy always starts at 0.
            const matchRegex = new RegExp(currentRegex).exec(html);
            if (!matchRegex) continue;

            // Strip stray title tags and double quotes left in the capture.
            const text = matchRegex[1].replaceAll("<title>", "").replaceAll("</title>", "").replaceAll('"', "");
            markdown += `- ${text} [[Bulk]]\n`;
            markdown += `   - ${url} #learning\n`;
            // Rewrite the file after every hit so partial progress survives
            // an interrupted run.
            fs.writeFileSync('markdown.md', markdown);
        }
    }
 })().catch((e) => {
    console.error(e?.message ?? e);
    process.exit(1);
 });
	const fs = require('fs')
	const axios = require('axios')
	const myData = require('./result.json');
	const regexTitleTag = /<title>(.*)<\/title>/g;
	const regexRedditPost = /<meta property="og:title" content\=\"(.*?")/gm;

	/**
	 * Downloads the page at `url` and returns its body as text.
	 *
	 * The request is sent with a Firefox-on-Linux User-Agent header. This is a
	 * best-effort helper: a failed request is logged and reported as `null`
	 * instead of being thrown, so the caller can simply skip that URL.
	 *
	 * @param {string} url - Address to fetch.
	 * @returns {Promise<string|null>} The response body when it is a non-empty
	 *   string; `null` when the request fails or the body is empty or not text.
	 */
	async function getContentFromURL(url) {
	    try {
	        const result = await axios.get(url, {
	            headers: {
	                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'
	            }
	        });

	        // axios can hand back an already-parsed object (e.g. for a JSON
	        // response). The caller runs a regex over the result, so only a
	        // real, non-empty string is worth returning.
	        const body = result?.data;
	        return typeof body === 'string' && body.length > 0 ? body : null;
	    }
	    catch (e) {
	        // Keep the "never throw" contract, but include the reason so a
	        // failed fetch can actually be diagnosed from the log.
	        console.log("I was not able to fetch: ", url, e?.message ?? e);
	        return null;
	    }
	}

	// Entry point: walks every text entity of every message in myData.messages,
	// fetches each https link once, extracts the page title and rewrites
	// markdown.md with the bullets collected so far.
	//
	// The handler is attached with .catch() on purpose: a synchronous try/catch
	// wrapped around an async IIFE never sees errors thrown inside it, so the
	// error report and the non-zero exit code would otherwise never happen.
	(async () => {
	    let markdown = "";
	    // Trimmed URLs that were already fetched, so no URL is processed twice.
	    const processedUrls = new Set();
	    // For now, block some URLs, due to limiting calls from their servers.
	    const blockedPrefixes = [
	        "https://news.ycombinator.com",
	        "https://api.whatsapp.com",
	    ];

	    for (const message of myData.messages) {
	        if (!message.text_entities) continue;

	        for (const entity of message.text_entities) {
	            let rawUrl = null;
	            if (entity.type === 'link') rawUrl = entity.text;
	            if (entity.type === 'text_link') rawUrl = entity.href;

	            // Normalise once so the duplicate check, the prefix filters, the
	            // fetch and the generated markdown all use the same string.
	            const url = typeof rawUrl === 'string' ? rawUrl.trim() : null;
	            if (!url) continue;

	            if (processedUrls.has(url)) {
	                console.log("Already processed url: ", url);
	                continue;
	            }

	            if (blockedPrefixes.some((prefix) => url.startsWith(prefix))) continue;
	            if (!url.startsWith('https://')) continue;

	            // Reddit posts are matched on their og:title meta tag instead of <title>.
	            const currentRegex = url.startsWith("https://www.reddit.com")
	                ? regexRedditPost
	                : regexTitleTag;

	            processedUrls.add(url);
	            console.log("Fetching: ", url);
	            const html = await getContentFromURL(url);
	            if (!html) continue;

	            // The shared patterns carry the /g flag, which makes exec()
	            // stateful through lastIndex; a fresh copy always starts at 0.
	            const matchRegex = new RegExp(currentRegex).exec(html);
	            if (!matchRegex) continue;

	            // Strip stray title tags and double quotes left in the capture.
	            const text = matchRegex[1].replaceAll("<title>", "").replaceAll("</title>", "").replaceAll('"', "");
	            markdown += `- ${text} [[Bulk]]\n`;
	            markdown += ` - ${url} #learning\n`;
	            // Rewrite the file after every hit so partial progress survives
	            // an interrupted run.
	            fs.writeFileSync('markdown.md', markdown);
	        }
	    }
	})().catch((e) => {
	    console.error(e?.message ?? e);
	    process.exit(1);
	});