Last active
February 5, 2023 12:35
-
-
Save thiagosanches/dfaa0018d05f3f934f2c1539aa48aa2e to your computer and use it in GitHub Desktop.
A small Node.js script that loads a chat exported from Telegram as JSON, extracts the links, and creates a Markdown file with them, so I can use them in Logseq.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs') | |
const axios = require('axios') | |
const myData = require('./result.json'); | |
// Captures the text of the first <title> element that sits on a single line.
// - Lazy `(.*?)` stops at the first closing tag, so a second <title> on the
//   same line (e.g. inside an inline SVG) is not swallowed into the capture.
// - `[^>]*` tolerates attributes on the opening tag; `i` tolerates <TITLE>.
// - No `g` flag: only the first match is ever used, and a global regex keeps
//   `lastIndex` state between `exec` calls.
const regexTitleTag = /<title[^>]*>(.*?)<\/title>/i;

// Captures the value of the Open Graph title meta tag (used for Reddit posts),
// without the surrounding double quotes.
const regexRedditPost = /<meta property="og:title" content="(.*?)"/;
/**
 * Downloads the body of `url` and returns it as text.
 *
 * Fetching is best-effort: any failure (network error, non-2xx status,
 * timeout) is logged and reported as `null` instead of being thrown, so a
 * single bad link does not abort the caller's loop.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<string|null>} The response body, or `null` when the
 *   request failed or did not yield a non-empty string.
 */
async function getContentFromURL(url) {
    try {
        const result = await axios.get(url, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'
            },
            // axios has no timeout by default; without one a single
            // unresponsive host would stall the whole run.
            timeout: 15000,
            // The caller runs a regex over the body, so ask for raw text.
            responseType: 'text'
        });
        const body = result?.data;
        // axios may still hand back a parsed object for JSON responses;
        // only a non-empty string is usable as HTML.
        return typeof body === 'string' && body.length > 0 ? body : null;
    } catch (e) {
        console.log("I was not able to fetch: ", url, `(${e?.message ?? e})`);
        return null;
    }
}
// For now, block some URLs, due to limiting calls from their servers.
const blockedUrlPrefixes = [
    "https://news.ycombinator.com",
    "https://api.whatsapp.com"
];

/**
 * Returns the trimmed URL carried by a message text entity, or `null` when
 * the entity is neither a `link` (URL in `text`) nor a `text_link` (URL in
 * `href`), or when that field is not a string.
 */
function getEntityUrl(entity) {
    let url = null;
    if (entity.type === 'link') url = entity.text;
    if (entity.type === 'text_link') url = entity.href;
    return typeof url === 'string' ? url.trim() : null;
}

/** Picks the title-extraction regex for a URL: Reddit pages use the og:title meta tag. */
function getTitleRegexFor(url) {
    return url.startsWith("https://www.reddit.com") ? regexRedditPost : regexTitleTag;
}

/**
 * Walks every message of the exported chat, fetches each distinct https link
 * once, extracts the page title and writes the accumulated Logseq-style
 * bullet list to `markdown.md`.
 */
(async () => {
    let markdown = "";
    // URLs are stored trimmed, so lookups must use the trimmed form as well.
    const processedUrls = new Set();

    for (const message of myData.messages) {
        for (const entity of message.text_entities ?? []) {
            const url = getEntityUrl(entity);
            if (!url) continue;

            // I don't need to process an URL twice...
            if (processedUrls.has(url)) {
                console.log("Already processed url: ", url);
                continue;
            }

            if (blockedUrlPrefixes.some((prefix) => url.startsWith(prefix))) continue;
            if (!url.startsWith('https://')) continue;

            processedUrls.add(url);
            console.log("Fetching: ", url);
            const html = await getContentFromURL(url);
            if (!html) continue;

            // Work on a copy so no `lastIndex` state leaks between pages.
            const matchRegex = new RegExp(getTitleRegexFor(url)).exec(html);
            if (!matchRegex) continue;

            const text = matchRegex[1]
                .replaceAll("<title>", "")
                .replaceAll("</title>", "")
                .replaceAll('"', "")
                .trim();
            markdown += `- ${text} [[Bulk]]\n`;
            markdown += ` - ${url} #learning\n`;
            // Rewritten after every entry so partial progress survives a crash.
            fs.writeFileSync('markdown.md', markdown);
        }
    }
})().catch((e) => {
    // A synchronous try/catch around the async function would never see its
    // rejections; the handler has to be attached to the returned promise.
    console.error(e?.message ?? e);
    process.exit(1);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment